From f0ad4f458735e8f843cc4f058c8cfec9206f4f0f Mon Sep 17 00:00:00 2001 From: Triang3l Date: Sun, 12 May 2024 17:23:40 +0300 Subject: [PATCH 1/8] [Base] Add aliasing-safe xe::memory::Reinterpret Accessing the same memory as different types (other than char) using reinterpret_cast or a union is undefined behavior that has already caused issues like #1971. Also adds a XE_RESTRICT_VAR definition for declaring non-aliasing pointers in performance-critical areas in the future. --- src/xenia/base/memory.h | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/src/xenia/base/memory.h b/src/xenia/base/memory.h index 14fb65968..3ed4dc3ab 100644 --- a/src/xenia/base/memory.h +++ b/src/xenia/base/memory.h @@ -16,6 +16,7 @@ #include #include #include +#include #include "xenia/base/assert.h" #include "xenia/base/byte_order.h" @@ -24,6 +25,30 @@ namespace xe { namespace memory { +// For variable declarations (not return values or `this` pointer). +// Not propagated. +#define XE_RESTRICT_VAR __restrict + +// Aliasing-safe bit reinterpretation. +// For more complex cases such as non-trivially-copyable types, write copying +// code respecting the requirements for them externally instead of using these +// functions. + +template <typename Dst, typename Src> +void Reinterpret(Dst& XE_RESTRICT_VAR dst, const Src& XE_RESTRICT_VAR src) { + static_assert(sizeof(Dst) == sizeof(Src)); + static_assert(std::is_trivially_copyable_v<Dst>); + static_assert(std::is_trivially_copyable_v<Src>); + std::memcpy(&dst, &src, sizeof(Dst)); +} + +template <typename Dst, typename Src> +Dst Reinterpret(const Src& XE_RESTRICT_VAR src) { + Dst dst; + Reinterpret(dst, src); + return dst; +} + #if XE_PLATFORM_ANDROID void AndroidInitialize(); void AndroidShutdown(); From 376bad5056f3b8456b48c5f9ad61c85005127418 Mon Sep 17 00:00:00 2001 From: Triang3l Date: Sun, 12 May 2024 17:17:30 +0300 Subject: [PATCH 2/8] [GPU] Remove register reinterpret_casts + WAIT_REG_MEM volatility Hopefully prevents some potential #1971-like situations. 
WAIT_REG_MEM's implementation also allowed the compiler to load the value only once, which caused an infinite loop with the other changes in the commit (even in debug builds), so it's now accessed as volatile. Possibly it would be even better to replace it with some (acquire/release?) atomic load/store some day at least for the registers actually seen as participating in those waits. Also fixes the endianness being handled only on the first wait iteration in WAIT_REG_MEM. --- src/xenia/gpu/command_processor.cc | 89 ++++++----- .../gpu/d3d12/d3d12_command_processor.cc | 81 +++++----- src/xenia/gpu/d3d12/d3d12_texture_cache.cc | 7 +- src/xenia/gpu/draw_extent_estimator.cc | 44 +++--- src/xenia/gpu/draw_util.cc | 53 ++++--- src/xenia/gpu/dxbc.h | 15 +- src/xenia/gpu/graphics_system.cc | 4 +- src/xenia/gpu/packet_disassembler.h | 4 +- src/xenia/gpu/primitive_processor.cc | 8 +- src/xenia/gpu/register_file.h | 66 +++++--- src/xenia/gpu/shader_interpreter.cc | 45 +++--- src/xenia/gpu/shader_interpreter.h | 5 +- src/xenia/gpu/texture_cache.cc | 3 +- src/xenia/gpu/trace_viewer.cc | 144 ++++++++---------- src/xenia/gpu/trace_viewer.h | 2 +- .../gpu/vulkan/vulkan_command_processor.cc | 64 ++++---- src/xenia/gpu/vulkan/vulkan_pipeline_cache.cc | 2 +- src/xenia/gpu/vulkan/vulkan_texture_cache.cc | 7 +- src/xenia/gpu/xenos.cc | 11 +- 19 files changed, 336 insertions(+), 318 deletions(-) diff --git a/src/xenia/gpu/command_processor.cc b/src/xenia/gpu/command_processor.cc index 8e9bc6067..cc8e80690 100644 --- a/src/xenia/gpu/command_processor.cc +++ b/src/xenia/gpu/command_processor.cc @@ -18,6 +18,7 @@ #include "xenia/base/byte_stream.h" #include "xenia/base/logging.h" #include "xenia/base/math.h" +#include "xenia/base/memory.h" #include "xenia/base/profiling.h" #include "xenia/base/ring_buffer.h" #include "xenia/gpu/gpu_flags.h" @@ -334,7 +335,8 @@ void CommandProcessor::WriteRegister(uint32_t index, uint32_t value) { return; } - regs.values[index].u32 = value; + // Volatile 
for the WAIT_REG_MEM loop. + const_cast(regs.values[index]) = value; if (!regs.GetRegisterInfo(index)) { XELOGW("GPU: Write to unknown register ({:04X} = {:08X})", index, value); } @@ -342,19 +344,20 @@ void CommandProcessor::WriteRegister(uint32_t index, uint32_t value) { // Scratch register writeback. if (index >= XE_GPU_REG_SCRATCH_REG0 && index <= XE_GPU_REG_SCRATCH_REG7) { uint32_t scratch_reg = index - XE_GPU_REG_SCRATCH_REG0; - if ((1 << scratch_reg) & regs.values[XE_GPU_REG_SCRATCH_UMSK].u32) { + if ((1 << scratch_reg) & regs.values[XE_GPU_REG_SCRATCH_UMSK]) { // Enabled - write to address. - uint32_t scratch_addr = regs.values[XE_GPU_REG_SCRATCH_ADDR].u32; + uint32_t scratch_addr = regs.values[XE_GPU_REG_SCRATCH_ADDR]; uint32_t mem_addr = scratch_addr + (scratch_reg * 4); xe::store_and_swap(memory_->TranslatePhysical(mem_addr), value); } } else { switch (index) { // If this is a COHER register, set the dirty flag. - // This will block the command processor the next time it WAIT_MEM_REGs + // This will block the command processor the next time it WAIT_REG_MEMs // and allow us to synchronize the memory. case XE_GPU_REG_COHER_STATUS_HOST: { - regs.values[index].u32 |= UINT32_C(0x80000000); + const_cast(regs.values[index]) |= + UINT32_C(0x80000000); } break; case XE_GPU_REG_DC_LUT_RW_INDEX: { @@ -365,12 +368,12 @@ void CommandProcessor::WriteRegister(uint32_t index, uint32_t value) { case XE_GPU_REG_DC_LUT_SEQ_COLOR: { // Should be in the 256-entry table writing mode. - assert_zero(regs[XE_GPU_REG_DC_LUT_RW_MODE].u32 & 0b1); + assert_zero(regs[XE_GPU_REG_DC_LUT_RW_MODE] & 0b1); auto& gamma_ramp_rw_index = regs.Get(); // DC_LUT_SEQ_COLOR is in the red, green, blue order, but the write // enable mask is blue, green, red. 
bool write_gamma_ramp_component = - (regs[XE_GPU_REG_DC_LUT_WRITE_EN_MASK].u32 & + (regs[XE_GPU_REG_DC_LUT_WRITE_EN_MASK] & (UINT32_C(1) << (2 - gamma_ramp_rw_component_))) != 0; if (write_gamma_ramp_component) { reg::DC_LUT_30_COLOR& gamma_ramp_entry = @@ -401,14 +404,14 @@ void CommandProcessor::WriteRegister(uint32_t index, uint32_t value) { case XE_GPU_REG_DC_LUT_PWL_DATA: { // Should be in the PWL writing mode. - assert_not_zero(regs[XE_GPU_REG_DC_LUT_RW_MODE].u32 & 0b1); + assert_not_zero(regs[XE_GPU_REG_DC_LUT_RW_MODE] & 0b1); auto& gamma_ramp_rw_index = regs.Get(); // Bit 7 of the index is ignored for PWL. uint32_t gamma_ramp_rw_index_pwl = gamma_ramp_rw_index.rw_index & 0x7F; // DC_LUT_PWL_DATA is likely in the red, green, blue order because // DC_LUT_SEQ_COLOR is, but the write enable mask is blue, green, red. bool write_gamma_ramp_component = - (regs[XE_GPU_REG_DC_LUT_WRITE_EN_MASK].u32 & + (regs[XE_GPU_REG_DC_LUT_WRITE_EN_MASK] & (UINT32_C(1) << (2 - gamma_ramp_rw_component_))) != 0; if (write_gamma_ramp_component) { reg::DC_LUT_PWL_DATA& gamma_ramp_entry = @@ -436,10 +439,10 @@ void CommandProcessor::WriteRegister(uint32_t index, uint32_t value) { case XE_GPU_REG_DC_LUT_30_COLOR: { // Should be in the 256-entry table writing mode. 
- assert_zero(regs[XE_GPU_REG_DC_LUT_RW_MODE].u32 & 0b1); + assert_zero(regs[XE_GPU_REG_DC_LUT_RW_MODE] & 0b1); auto& gamma_ramp_rw_index = regs.Get(); uint32_t gamma_ramp_write_enable_mask = - regs[XE_GPU_REG_DC_LUT_WRITE_EN_MASK].u32 & 0b111; + regs[XE_GPU_REG_DC_LUT_WRITE_EN_MASK] & 0b111; if (gamma_ramp_write_enable_mask) { reg::DC_LUT_30_COLOR& gamma_ramp_entry = gamma_ramp_256_entry_table_[gamma_ramp_rw_index.rw_index]; @@ -479,10 +482,12 @@ void CommandProcessor::MakeCoherent() { // https://web.archive.org/web/20160711162346/https://amd-dev.wpengine.netdna-cdn.com/wordpress/media/2013/10/R6xx_R7xx_3D.pdf // https://cgit.freedesktop.org/xorg/driver/xf86-video-radeonhd/tree/src/r6xx_accel.c?id=3f8b6eccd9dba116cc4801e7f80ce21a879c67d2#n454 - RegisterFile* regs = register_file_; - auto& status_host = regs->Get(); - auto base_host = regs->values[XE_GPU_REG_COHER_BASE_HOST].u32; - auto size_host = regs->values[XE_GPU_REG_COHER_SIZE_HOST].u32; + // Volatile because this may be called from the WAIT_REG_MEM loop. + volatile uint32_t* regs_volatile = register_file_->values; + auto status_host = xe::memory::Reinterpret( + uint32_t(regs_volatile[XE_GPU_REG_COHER_STATUS_HOST])); + uint32_t base_host = regs_volatile[XE_GPU_REG_COHER_BASE_HOST]; + uint32_t size_host = regs_volatile[XE_GPU_REG_COHER_SIZE_HOST]; if (!status_host.status) { return; @@ -502,7 +507,7 @@ void CommandProcessor::MakeCoherent() { base_host + size_host, size_host, action); // Mark coherent. 
- status_host.status = 0; + regs_volatile[XE_GPU_REG_COHER_STATUS_HOST] = 0; } void CommandProcessor::PrepareForWait() { trace_writer_.Flush(); } @@ -940,28 +945,33 @@ bool CommandProcessor::ExecutePacketType3_WAIT_REG_MEM(RingBuffer* reader, SCOPE_profile_cpu_f("gpu"); // wait until a register or memory location is a specific value + uint32_t wait_info = reader->ReadAndSwap(); uint32_t poll_reg_addr = reader->ReadAndSwap(); uint32_t ref = reader->ReadAndSwap(); uint32_t mask = reader->ReadAndSwap(); uint32_t wait = reader->ReadAndSwap(); + + bool is_memory = (wait_info & 0x10) != 0; + + assert_true(is_memory || poll_reg_addr < RegisterFile::kRegisterCount); + const volatile uint32_t& value_ref = + is_memory ? *reinterpret_cast(memory_->TranslatePhysical( + poll_reg_addr & ~uint32_t(0x3))) + : register_file_->values[poll_reg_addr]; + bool matched = false; do { - uint32_t value; - if (wait_info & 0x10) { - // Memory. - auto endianness = static_cast(poll_reg_addr & 0x3); - poll_reg_addr &= ~0x3; - value = xe::load(memory_->TranslatePhysical(poll_reg_addr)); - value = GpuSwap(value, endianness); - trace_writer_.WriteMemoryRead(CpuToGpu(poll_reg_addr), 4); + uint32_t value = value_ref; + if (is_memory) { + trace_writer_.WriteMemoryRead(CpuToGpu(poll_reg_addr & ~uint32_t(0x3)), + sizeof(uint32_t)); + value = xenos::GpuSwap(value, + static_cast(poll_reg_addr & 0x3)); } else { - // Register. 
- assert_true(poll_reg_addr < RegisterFile::kRegisterCount); - value = register_file_->values[poll_reg_addr].u32; if (poll_reg_addr == XE_GPU_REG_COHER_STATUS_HOST) { MakeCoherent(); - value = register_file_->values[poll_reg_addr].u32; + value = value_ref; } } switch (wait_info & 0x7) { @@ -1024,17 +1034,17 @@ bool CommandProcessor::ExecutePacketType3_REG_RMW(RingBuffer* reader, uint32_t rmw_info = reader->ReadAndSwap(); uint32_t and_mask = reader->ReadAndSwap(); uint32_t or_mask = reader->ReadAndSwap(); - uint32_t value = register_file_->values[rmw_info & 0x1FFF].u32; + uint32_t value = register_file_->values[rmw_info & 0x1FFF]; if ((rmw_info >> 31) & 0x1) { // & reg - value &= register_file_->values[and_mask & 0x1FFF].u32; + value &= register_file_->values[and_mask & 0x1FFF]; } else { // & imm value &= and_mask; } if ((rmw_info >> 30) & 0x1) { // | reg - value |= register_file_->values[or_mask & 0x1FFF].u32; + value |= register_file_->values[or_mask & 0x1FFF]; } else { // | imm value |= or_mask; @@ -1055,7 +1065,7 @@ bool CommandProcessor::ExecutePacketType3_REG_TO_MEM(RingBuffer* reader, uint32_t reg_val; assert_true(reg_addr < RegisterFile::kRegisterCount); - reg_val = register_file_->values[reg_addr].u32; + reg_val = register_file_->values[reg_addr]; auto endianness = static_cast(mem_addr & 0x3); mem_addr &= ~0x3; @@ -1105,7 +1115,7 @@ bool CommandProcessor::ExecutePacketType3_COND_WRITE(RingBuffer* reader, } else { // Register. 
assert_true(poll_reg_addr < RegisterFile::kRegisterCount); - value = register_file_->values[poll_reg_addr].u32; + value = register_file_->values[poll_reg_addr]; } bool matched = false; switch (wait_info & 0x7) { @@ -1240,7 +1250,7 @@ bool CommandProcessor::ExecutePacketType3_EVENT_WRITE_ZPD(RingBuffer* reader, if (fake_sample_count >= 0) { auto* pSampleCounts = memory_->TranslatePhysical( - register_file_->values[XE_GPU_REG_RB_SAMPLE_COUNT_ADDR].u32); + register_file_->values[XE_GPU_REG_RB_SAMPLE_COUNT_ADDR]); // 0xFFFFFEED is written to this two locations by D3D only on D3DISSUE_END // and used to detect a finished query. bool is_end_via_z_pass = pSampleCounts->ZPass_A == kQueryFinished && @@ -1599,10 +1609,10 @@ bool CommandProcessor::ExecutePacketType3_VIZ_QUERY(RingBuffer* reader, // The scan converter writes the internal result back to the register here. // We just fake it and say it was visible in case it is read back. if (id < 32) { - register_file_->values[XE_GPU_REG_PA_SC_VIZ_QUERY_STATUS_0].u32 |= - uint32_t(1) << id; + register_file_->values[XE_GPU_REG_PA_SC_VIZ_QUERY_STATUS_0] |= uint32_t(1) + << id; } else { - register_file_->values[XE_GPU_REG_PA_SC_VIZ_QUERY_STATUS_1].u32 |= + register_file_->values[XE_GPU_REG_PA_SC_VIZ_QUERY_STATUS_1] |= uint32_t(1) << (id - 32); } } @@ -1614,9 +1624,8 @@ void CommandProcessor::InitializeTrace() { // Write the initial register values, to be loaded directly into the // RegisterFile since all registers, including those that may have side // effects on setting, will be saved. 
- trace_writer_.WriteRegisters( - 0, reinterpret_cast(register_file_->values), - RegisterFile::kRegisterCount, false); + trace_writer_.WriteRegisters(0, register_file_->values, + RegisterFile::kRegisterCount, false); trace_writer_.WriteGammaRamp(gamma_ramp_256_entry_table(), gamma_ramp_pwl_rgb(), gamma_ramp_rw_component_); diff --git a/src/xenia/gpu/d3d12/d3d12_command_processor.cc b/src/xenia/gpu/d3d12/d3d12_command_processor.cc index 814a74a7c..93589cf7b 100644 --- a/src/xenia/gpu/d3d12/d3d12_command_processor.cc +++ b/src/xenia/gpu/d3d12/d3d12_command_processor.cc @@ -17,6 +17,7 @@ #include "xenia/base/cvar.h" #include "xenia/base/logging.h" #include "xenia/base/math.h" +#include "xenia/base/memory.h" #include "xenia/base/profiling.h" #include "xenia/gpu/d3d12/d3d12_command_processor.h" #include "xenia/gpu/d3d12/d3d12_graphics_system.h" @@ -2306,8 +2307,8 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type, while (xe::bit_scan_forward(vfetch_bits_remaining, &j)) { vfetch_bits_remaining &= ~(uint32_t(1) << j); uint32_t vfetch_index = i * 32 + j; - const auto& vfetch_constant = regs.Get( - XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0 + vfetch_index * 2); + xenos::xe_gpu_vertex_fetch_t vfetch_constant = + regs.GetVertexFetch(vfetch_index); switch (vfetch_constant.type) { case xenos::FetchConstantType::kVertex: break; @@ -3050,10 +3051,10 @@ void D3D12CommandProcessor::UpdateFixedFunctionState( // Blend factor. float blend_factor[] = { - regs[XE_GPU_REG_RB_BLEND_RED].f32, - regs[XE_GPU_REG_RB_BLEND_GREEN].f32, - regs[XE_GPU_REG_RB_BLEND_BLUE].f32, - regs[XE_GPU_REG_RB_BLEND_ALPHA].f32, + regs.Get(XE_GPU_REG_RB_BLEND_RED), + regs.Get(XE_GPU_REG_RB_BLEND_GREEN), + regs.Get(XE_GPU_REG_RB_BLEND_BLUE), + regs.Get(XE_GPU_REG_RB_BLEND_ALPHA), }; // std::memcmp instead of != so in case of NaN, every draw won't be // invalidating it. 
@@ -3100,7 +3101,7 @@ void D3D12CommandProcessor::UpdateSystemConstantValues( auto pa_cl_clip_cntl = regs.Get(); auto pa_cl_vte_cntl = regs.Get(); auto pa_su_sc_mode_cntl = regs.Get(); - float rb_alpha_ref = regs[XE_GPU_REG_RB_ALPHA_REF].f32; + auto rb_alpha_ref = regs.Get<float>(XE_GPU_REG_RB_ALPHA_REF); auto rb_colorcontrol = regs.Get(); auto rb_depth_info = regs.Get(); auto rb_stencilrefmask = regs.Get(); @@ -3241,9 +3242,9 @@ void D3D12CommandProcessor::UpdateSystemConstantValues( // Tessellation factor range, plus 1.0 according to the images in // https://www.slideshare.net/blackdevilvikas/next-generation-graphics-programming-on-xbox-360 float tessellation_factor_min = - regs[XE_GPU_REG_VGT_HOS_MIN_TESS_LEVEL].f32 + 1.0f; + regs.Get<float>(XE_GPU_REG_VGT_HOS_MIN_TESS_LEVEL) + 1.0f; float tessellation_factor_max = - regs[XE_GPU_REG_VGT_HOS_MAX_TESS_LEVEL].f32 + 1.0f; + regs.Get<float>(XE_GPU_REG_VGT_HOS_MAX_TESS_LEVEL) + 1.0f; dirty |= system_constants_.tessellation_factor_range_min != tessellation_factor_min; system_constants_.tessellation_factor_range_min = tessellation_factor_min; @@ -3280,12 +3281,12 @@ void D3D12CommandProcessor::UpdateSystemConstantValues( while (xe::bit_scan_forward(user_clip_planes_remaining, &user_clip_plane_index)) { user_clip_planes_remaining &= ~(UINT32_C(1) << user_clip_plane_index); - const float* user_clip_plane = - &regs[XE_GPU_REG_PA_CL_UCP_0_X + user_clip_plane_index * 4].f32; - if (std::memcmp(user_clip_plane_write_ptr, user_clip_plane, + const void* user_clip_plane_regs = + &regs[XE_GPU_REG_PA_CL_UCP_0_X + user_clip_plane_index * 4]; + if (std::memcmp(user_clip_plane_write_ptr, user_clip_plane_regs, 4 * sizeof(float))) { dirty = true; - std::memcpy(user_clip_plane_write_ptr, user_clip_plane, + std::memcpy(user_clip_plane_write_ptr, user_clip_plane_regs, 4 * sizeof(float)); } user_clip_plane_write_ptr += 4; @@ -3423,9 +3424,8 @@ void D3D12CommandProcessor::UpdateSystemConstantValues( color_exp_bias -= 5; } } - float color_exp_bias_scale; - 
*reinterpret_cast(&color_exp_bias_scale) = - 0x3F800000 + (color_exp_bias << 23); + auto color_exp_bias_scale = xe::memory::Reinterpret( + int32_t(0x3F800000 + (color_exp_bias << 23))); dirty |= system_constants_.color_exp_bias[i] != color_exp_bias_scale; system_constants_.color_exp_bias[i] = color_exp_bias_scale; if (edram_rov_used) { @@ -3454,7 +3454,7 @@ void D3D12CommandProcessor::UpdateSystemConstantValues( std::memcpy(system_constants_.edram_rt_clamp[i], rt_clamp[i], 4 * sizeof(float)); uint32_t blend_factors_ops = - regs[reg::RB_BLENDCONTROL::rt_register_indices[i]].u32 & 0x1FFF1FFF; + regs[reg::RB_BLENDCONTROL::rt_register_indices[i]] & 0x1FFF1FFF; dirty |= system_constants_.edram_rt_blend_factors_ops[i] != blend_factors_ops; system_constants_.edram_rt_blend_factors_ops[i] = blend_factors_ops; @@ -3477,22 +3477,22 @@ void D3D12CommandProcessor::UpdateSystemConstantValues( if (primitive_polygonal) { if (pa_su_sc_mode_cntl.poly_offset_front_enable) { poly_offset_front_scale = - regs[XE_GPU_REG_PA_SU_POLY_OFFSET_FRONT_SCALE].f32; + regs.Get(XE_GPU_REG_PA_SU_POLY_OFFSET_FRONT_SCALE); poly_offset_front_offset = - regs[XE_GPU_REG_PA_SU_POLY_OFFSET_FRONT_OFFSET].f32; + regs.Get(XE_GPU_REG_PA_SU_POLY_OFFSET_FRONT_OFFSET); } if (pa_su_sc_mode_cntl.poly_offset_back_enable) { poly_offset_back_scale = - regs[XE_GPU_REG_PA_SU_POLY_OFFSET_BACK_SCALE].f32; + regs.Get(XE_GPU_REG_PA_SU_POLY_OFFSET_BACK_SCALE); poly_offset_back_offset = - regs[XE_GPU_REG_PA_SU_POLY_OFFSET_BACK_OFFSET].f32; + regs.Get(XE_GPU_REG_PA_SU_POLY_OFFSET_BACK_OFFSET); } } else { if (pa_su_sc_mode_cntl.poly_offset_para_enable) { poly_offset_front_scale = - regs[XE_GPU_REG_PA_SU_POLY_OFFSET_FRONT_SCALE].f32; + regs.Get(XE_GPU_REG_PA_SU_POLY_OFFSET_FRONT_SCALE); poly_offset_front_offset = - regs[XE_GPU_REG_PA_SU_POLY_OFFSET_FRONT_OFFSET].f32; + regs.Get(XE_GPU_REG_PA_SU_POLY_OFFSET_FRONT_OFFSET); poly_offset_back_scale = poly_offset_front_scale; poly_offset_back_offset = poly_offset_front_offset; } @@ 
-3567,21 +3567,21 @@ void D3D12CommandProcessor::UpdateSystemConstantValues( } dirty |= system_constants_.edram_blend_constant[0] != - regs[XE_GPU_REG_RB_BLEND_RED].f32; + regs.Get(XE_GPU_REG_RB_BLEND_RED); system_constants_.edram_blend_constant[0] = - regs[XE_GPU_REG_RB_BLEND_RED].f32; + regs.Get(XE_GPU_REG_RB_BLEND_RED); dirty |= system_constants_.edram_blend_constant[1] != - regs[XE_GPU_REG_RB_BLEND_GREEN].f32; + regs.Get(XE_GPU_REG_RB_BLEND_GREEN); system_constants_.edram_blend_constant[1] = - regs[XE_GPU_REG_RB_BLEND_GREEN].f32; + regs.Get(XE_GPU_REG_RB_BLEND_GREEN); dirty |= system_constants_.edram_blend_constant[2] != - regs[XE_GPU_REG_RB_BLEND_BLUE].f32; + regs.Get(XE_GPU_REG_RB_BLEND_BLUE); system_constants_.edram_blend_constant[2] = - regs[XE_GPU_REG_RB_BLEND_BLUE].f32; + regs.Get(XE_GPU_REG_RB_BLEND_BLUE); dirty |= system_constants_.edram_blend_constant[3] != - regs[XE_GPU_REG_RB_BLEND_ALPHA].f32; + regs.Get(XE_GPU_REG_RB_BLEND_ALPHA); system_constants_.edram_blend_constant[3] = - regs[XE_GPU_REG_RB_BLEND_ALPHA].f32; + regs.Get(XE_GPU_REG_RB_BLEND_ALPHA); } cbuffer_binding_system_.up_to_date &= !dirty; @@ -3638,10 +3638,10 @@ bool D3D12CommandProcessor::UpdateBindings(const D3D12Shader* vertex_shader, // These are the constant base addresses/ranges for shaders. // We have these hardcoded right now cause nothing seems to differ on the Xbox // 360 (however, OpenGL ES on Adreno 200 on Android has different ranges). - assert_true(regs[XE_GPU_REG_SQ_VS_CONST].u32 == 0x000FF000 || - regs[XE_GPU_REG_SQ_VS_CONST].u32 == 0x00000000); - assert_true(regs[XE_GPU_REG_SQ_PS_CONST].u32 == 0x000FF100 || - regs[XE_GPU_REG_SQ_PS_CONST].u32 == 0x00000000); + assert_true(regs[XE_GPU_REG_SQ_VS_CONST] == 0x000FF000 || + regs[XE_GPU_REG_SQ_VS_CONST] == 0x00000000); + assert_true(regs[XE_GPU_REG_SQ_PS_CONST] == 0x000FF100 || + regs[XE_GPU_REG_SQ_PS_CONST] == 0x00000000); // Check if the float constant layout is still the same and get the counts. 
const Shader::ConstantRegisterMap& float_constant_map_vertex = vertex_shader->constant_register_map(); @@ -3715,8 +3715,7 @@ bool D3D12CommandProcessor::UpdateBindings(const D3D12Shader* vertex_shader, float_constant_map_entry &= ~(1ull << float_constant_index); std::memcpy(float_constants, &regs[XE_GPU_REG_SHADER_CONSTANT_000_X + (i << 8) + - (float_constant_index << 2)] - .f32, + (float_constant_index << 2)], 4 * sizeof(float)); float_constants += 4 * sizeof(float); } @@ -3746,8 +3745,7 @@ bool D3D12CommandProcessor::UpdateBindings(const D3D12Shader* vertex_shader, float_constant_map_entry &= ~(1ull << float_constant_index); std::memcpy(float_constants, &regs[XE_GPU_REG_SHADER_CONSTANT_256_X + (i << 8) + - (float_constant_index << 2)] - .f32, + (float_constant_index << 2)], 4 * sizeof(float)); float_constants += 4 * sizeof(float); } @@ -3767,7 +3765,7 @@ bool D3D12CommandProcessor::UpdateBindings(const D3D12Shader* vertex_shader, return false; } std::memcpy(bool_loop_constants, - &regs[XE_GPU_REG_SHADER_CONSTANT_BOOL_000_031].u32, + &regs[XE_GPU_REG_SHADER_CONSTANT_BOOL_000_031], kBoolLoopConstantsSize); cbuffer_binding_bool_loop_.up_to_date = true; current_graphics_root_up_to_date_ &= @@ -3782,8 +3780,7 @@ bool D3D12CommandProcessor::UpdateBindings(const D3D12Shader* vertex_shader, if (fetch_constants == nullptr) { return false; } - std::memcpy(fetch_constants, - &regs[XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0].u32, + std::memcpy(fetch_constants, &regs[XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0], kFetchConstantsSize); cbuffer_binding_fetch_.up_to_date = true; current_graphics_root_up_to_date_ &= diff --git a/src/xenia/gpu/d3d12/d3d12_texture_cache.cc b/src/xenia/gpu/d3d12/d3d12_texture_cache.cc index 24904c7e8..9e3b794d1 100644 --- a/src/xenia/gpu/d3d12/d3d12_texture_cache.cc +++ b/src/xenia/gpu/d3d12/d3d12_texture_cache.cc @@ -960,8 +960,8 @@ uint32_t D3D12TextureCache::GetActiveTextureBindlessSRVIndex( D3D12TextureCache::SamplerParameters D3D12TextureCache::GetSamplerParameters( const 
D3D12Shader::SamplerBinding& binding) const { const auto& regs = register_file(); - const auto& fetch = regs.Get( - XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0 + binding.fetch_constant * 6); + xenos::xe_gpu_texture_fetch_t fetch = + regs.GetTextureFetch(binding.fetch_constant); SamplerParameters parameters; @@ -1441,8 +1441,7 @@ ID3D12Resource* D3D12TextureCache::RequestSwapTexture( D3D12_SHADER_RESOURCE_VIEW_DESC& srv_desc_out, xenos::TextureFormat& format_out) { const auto& regs = register_file(); - const auto& fetch = regs.Get( - XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0); + xenos::xe_gpu_texture_fetch_t fetch = regs.GetTextureFetch(0); TextureKey key; BindingInfoFromFetchConstant(fetch, key, nullptr); if (!key.is_valid || key.base_page == 0 || diff --git a/src/xenia/gpu/draw_extent_estimator.cc b/src/xenia/gpu/draw_extent_estimator.cc index fb65fb96b..20c6086ee 100644 --- a/src/xenia/gpu/draw_extent_estimator.cc +++ b/src/xenia/gpu/draw_extent_estimator.cc @@ -15,6 +15,7 @@ #include "xenia/base/assert.h" #include "xenia/base/cvar.h" +#include "xenia/base/memory.h" #include "xenia/base/profiling.h" #include "xenia/gpu/registers.h" #include "xenia/gpu/ucode.h" @@ -67,7 +68,7 @@ void DrawExtentEstimator::PositionYExportSink::Export( point_size_ = value[0]; } if (value_mask & 0b0100) { - vertex_kill_ = *reinterpret_cast(&value[2]); + vertex_kill_ = xe::memory::Reinterpret(value[2]); } } } @@ -110,7 +111,7 @@ uint32_t DrawExtentEstimator::EstimateVertexMaxY(const Shader& vertex_shader) { xenos::Endian index_endian = vgt_dma_size.swap_mode; if (vgt_draw_initiator.source_select == xenos::SourceSelect::kDMA) { xenos::IndexFormat index_format = vgt_draw_initiator.index_size; - uint32_t index_buffer_base = regs[XE_GPU_REG_VGT_DMA_BASE].u32; + uint32_t index_buffer_base = regs[XE_GPU_REG_VGT_DMA_BASE]; uint32_t index_buffer_read_count = std::min(uint32_t(vgt_draw_initiator.num_indices), uint32_t(vgt_dma_size.num_words)); @@ -145,21 +146,22 @@ uint32_t 
DrawExtentEstimator::EstimateVertexMaxY(const Shader& vertex_shader) { auto pa_cl_vte_cntl = regs.Get(); float viewport_y_scale = pa_cl_vte_cntl.vport_y_scale_ena - ? regs[XE_GPU_REG_PA_CL_VPORT_YSCALE].f32 + ? regs.Get(XE_GPU_REG_PA_CL_VPORT_YSCALE) : 1.0f; - float viewport_y_offset = pa_cl_vte_cntl.vport_y_offset_ena - ? regs[XE_GPU_REG_PA_CL_VPORT_YOFFSET].f32 - : 0.0f; + float viewport_y_offset = + pa_cl_vte_cntl.vport_y_offset_ena + ? regs.Get(XE_GPU_REG_PA_CL_VPORT_YOFFSET) + : 0.0f; int32_t point_vertex_min_diameter_float = 0; int32_t point_vertex_max_diameter_float = 0; float point_constant_radius_y = 0.0f; if (vgt_draw_initiator.prim_type == xenos::PrimitiveType::kPointList) { auto pa_su_point_minmax = regs.Get(); - *reinterpret_cast(&point_vertex_min_diameter_float) = - float(pa_su_point_minmax.min_size) * (2.0f / 16.0f); - *reinterpret_cast(&point_vertex_max_diameter_float) = - float(pa_su_point_minmax.max_size) * (2.0f / 16.0f); + point_vertex_min_diameter_float = xe::memory::Reinterpret( + float(pa_su_point_minmax.min_size) * (2.0f / 16.0f)); + point_vertex_max_diameter_float = xe::memory::Reinterpret( + float(pa_su_point_minmax.max_size) * (2.0f / 16.0f)); point_constant_radius_y = float(regs.Get().height) * (1.0f / 16.0f); } @@ -224,12 +226,13 @@ uint32_t DrawExtentEstimator::EstimateVertexMaxY(const Shader& vertex_shader) { // Vertex-specified diameter. Clamped effectively as a signed integer in // the hardware, -NaN, -Infinity ... -0 to the minimum, +Infinity, +NaN // to the maximum. 
- point_radius_y = position_y_export_sink.point_size().value(); - *reinterpret_cast(&point_radius_y) = std::min( - point_vertex_max_diameter_float, - std::max(point_vertex_min_diameter_float, - *reinterpret_cast(&point_radius_y))); - point_radius_y *= 0.5f; + point_radius_y = + 0.5f * + xe::memory::Reinterpret(std::min( + point_vertex_max_diameter_float, + std::max(point_vertex_min_diameter_float, + xe::memory::Reinterpret( + position_y_export_sink.point_size().value())))); } else { // Constant radius. point_radius_y = point_constant_radius_y; @@ -331,11 +334,12 @@ uint32_t DrawExtentEstimator::EstimateMaxY(bool try_to_estimate_vertex_max_y, } // Then apply the floating-point viewport offset. if (pa_cl_vte_cntl.vport_y_offset_ena) { - viewport_bottom += regs[XE_GPU_REG_PA_CL_VPORT_YOFFSET].f32; + viewport_bottom += regs.Get(XE_GPU_REG_PA_CL_VPORT_YOFFSET); } - viewport_bottom += pa_cl_vte_cntl.vport_y_scale_ena - ? std::abs(regs[XE_GPU_REG_PA_CL_VPORT_YSCALE].f32) - : 1.0f; + viewport_bottom += + pa_cl_vte_cntl.vport_y_scale_ena + ? std::abs(regs.Get(XE_GPU_REG_PA_CL_VPORT_YSCALE)) + : 1.0f; // Using floor, or, rather, truncation (because maxing with zero anyway) // similar to how viewport scissoring behaves on real AMD, Intel and Nvidia // GPUs on Direct3D 12 (but not WARP), also like in diff --git a/src/xenia/gpu/draw_util.cc b/src/xenia/gpu/draw_util.cc index eb61c39cb..73494a7f2 100644 --- a/src/xenia/gpu/draw_util.cc +++ b/src/xenia/gpu/draw_util.cc @@ -11,7 +11,6 @@ #include #include -#include #include "xenia/base/assert.h" #include "xenia/base/cvar.h" @@ -100,20 +99,20 @@ void GetPreferredFacePolygonOffset(const RegisterFile& regs, // ones that are rendered (except for shadow volumes). 
if (pa_su_sc_mode_cntl.poly_offset_front_enable && !pa_su_sc_mode_cntl.cull_front) { - scale = regs[XE_GPU_REG_PA_SU_POLY_OFFSET_FRONT_SCALE].f32; - offset = regs[XE_GPU_REG_PA_SU_POLY_OFFSET_FRONT_OFFSET].f32; + scale = regs.Get(XE_GPU_REG_PA_SU_POLY_OFFSET_FRONT_SCALE); + offset = regs.Get(XE_GPU_REG_PA_SU_POLY_OFFSET_FRONT_OFFSET); } if (pa_su_sc_mode_cntl.poly_offset_back_enable && !pa_su_sc_mode_cntl.cull_back && !scale && !offset) { - scale = regs[XE_GPU_REG_PA_SU_POLY_OFFSET_BACK_SCALE].f32; - offset = regs[XE_GPU_REG_PA_SU_POLY_OFFSET_BACK_OFFSET].f32; + scale = regs.Get(XE_GPU_REG_PA_SU_POLY_OFFSET_BACK_SCALE); + offset = regs.Get(XE_GPU_REG_PA_SU_POLY_OFFSET_BACK_OFFSET); } } else { // Non-triangle primitives use the front offset, but it's toggled via // poly_offset_para_enable. if (pa_su_sc_mode_cntl.poly_offset_para_enable) { - scale = regs[XE_GPU_REG_PA_SU_POLY_OFFSET_FRONT_SCALE].f32; - offset = regs[XE_GPU_REG_PA_SU_POLY_OFFSET_FRONT_OFFSET].f32; + scale = regs.Get(XE_GPU_REG_PA_SU_POLY_OFFSET_FRONT_SCALE); + offset = regs.Get(XE_GPU_REG_PA_SU_POLY_OFFSET_FRONT_OFFSET); } } scale_out = scale; @@ -148,7 +147,7 @@ bool IsPixelShaderNeededWithRasterization(const Shader& shader, } // Check if a color target is actually written. - uint32_t rb_color_mask = regs[XE_GPU_REG_RB_COLOR_MASK].u32; + uint32_t rb_color_mask = regs[XE_GPU_REG_RB_COLOR_MASK]; uint32_t rts_remaining = shader.writes_color_targets(); uint32_t rt_index; while (xe::bit_scan_forward(rts_remaining, &rt_index)) { @@ -311,24 +310,26 @@ void GetHostViewportInfo(const RegisterFile& regs, // Obtain the original viewport values in a normalized way. float scale_xy[] = { - pa_cl_vte_cntl.vport_x_scale_ena ? regs[XE_GPU_REG_PA_CL_VPORT_XSCALE].f32 - : 1.0f, - pa_cl_vte_cntl.vport_y_scale_ena ? regs[XE_GPU_REG_PA_CL_VPORT_YSCALE].f32 - : 1.0f, + pa_cl_vte_cntl.vport_x_scale_ena + ? regs.Get(XE_GPU_REG_PA_CL_VPORT_XSCALE) + : 1.0f, + pa_cl_vte_cntl.vport_y_scale_ena + ? 
regs.Get(XE_GPU_REG_PA_CL_VPORT_YSCALE) + : 1.0f, }; float scale_z = pa_cl_vte_cntl.vport_z_scale_ena - ? regs[XE_GPU_REG_PA_CL_VPORT_ZSCALE].f32 + ? regs.Get(XE_GPU_REG_PA_CL_VPORT_ZSCALE) : 1.0f; float offset_base_xy[] = { pa_cl_vte_cntl.vport_x_offset_ena - ? regs[XE_GPU_REG_PA_CL_VPORT_XOFFSET].f32 + ? regs.Get(XE_GPU_REG_PA_CL_VPORT_XOFFSET) : 0.0f, pa_cl_vte_cntl.vport_y_offset_ena - ? regs[XE_GPU_REG_PA_CL_VPORT_YOFFSET].f32 + ? regs.Get(XE_GPU_REG_PA_CL_VPORT_YOFFSET) : 0.0f, }; float offset_z = pa_cl_vte_cntl.vport_z_offset_ena - ? regs[XE_GPU_REG_PA_CL_VPORT_ZOFFSET].f32 + ? regs.Get(XE_GPU_REG_PA_CL_VPORT_ZOFFSET) : 0.0f; // Calculate all the integer.0 or integer.5 offsetting exactly at full // precision, separately so it can be used in other integer calculations @@ -615,7 +616,7 @@ uint32_t GetNormalizedColorMask(const RegisterFile& regs, return 0; } uint32_t normalized_color_mask = 0; - uint32_t rb_color_mask = regs[XE_GPU_REG_RB_COLOR_MASK].u32; + uint32_t rb_color_mask = regs[XE_GPU_REG_RB_COLOR_MASK]; for (uint32_t i = 0; i < xenos::kMaxColorRenderTargets; ++i) { // Exclude the render targets not statically written to by the pixel shader. // If the shader doesn't write to a render target, it shouldn't be written @@ -661,9 +662,8 @@ void AddMemExportRanges(const RegisterFile& regs, const Shader& shader, ? regs.Get().base : regs.Get().base; for (uint32_t constant_index : shader.memexport_stream_constants()) { - const auto& stream = regs.Get( - XE_GPU_REG_SHADER_CONSTANT_000_X + - (float_constants_base + constant_index) * 4); + xenos::xe_gpu_memexport_stream_t stream = + regs.GetMemExportStream(float_constants_base + constant_index); if (!stream.index_count) { continue; } @@ -705,7 +705,7 @@ void AddMemExportRanges(const RegisterFile& regs, const Shader& shader, } // Add a new range if haven't expanded an existing one. 
if (!range_reused) { - ranges_out.emplace_back(stream.base_address, stream_size_bytes); + ranges_out.emplace_back(uint32_t(stream.base_address), stream_size_bytes); } } } @@ -824,8 +824,7 @@ bool GetResolveInfo(const RegisterFile& regs, const Memory& memory, // Get the extent of pixels covered by the resolve rectangle, according to the // top-left rasterization rule. // D3D9 HACK: Vertices to use are always in vf0, and are written by the CPU. - auto fetch = regs.Get( - XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0); + xenos::xe_gpu_vertex_fetch_t fetch = regs.GetVertexFetch(0); if (fetch.type != xenos::FetchConstantType::kVertex || fetch.size != 3 * 2) { XELOGE("Unsupported resolve vertex buffer format"); assert_always(); @@ -994,7 +993,7 @@ bool GetResolveInfo(const RegisterFile& regs, const Memory& memory, } // Calculate the destination memory extent. - uint32_t rb_copy_dest_base = regs[XE_GPU_REG_RB_COPY_DEST_BASE].u32; + uint32_t rb_copy_dest_base = regs[XE_GPU_REG_RB_COPY_DEST_BASE]; uint32_t copy_dest_base_adjusted = rb_copy_dest_base; uint32_t copy_dest_extent_start, copy_dest_extent_end; auto rb_copy_dest_pitch = regs.Get(); @@ -1164,9 +1163,9 @@ bool GetResolveInfo(const RegisterFile& regs, const Memory& memory, info_out.copy_dest_info.copy_dest_swap = false; } - info_out.rb_depth_clear = regs[XE_GPU_REG_RB_DEPTH_CLEAR].u32; - info_out.rb_color_clear = regs[XE_GPU_REG_RB_COLOR_CLEAR].u32; - info_out.rb_color_clear_lo = regs[XE_GPU_REG_RB_COLOR_CLEAR_LO].u32; + info_out.rb_depth_clear = regs[XE_GPU_REG_RB_DEPTH_CLEAR]; + info_out.rb_color_clear = regs[XE_GPU_REG_RB_COLOR_CLEAR]; + info_out.rb_color_clear_lo = regs[XE_GPU_REG_RB_COLOR_CLEAR_LO]; XELOGD( "Resolve: {},{} <= x,y < {},{}, {} -> {} at 0x{:08X} (potentially " diff --git a/src/xenia/gpu/dxbc.h b/src/xenia/gpu/dxbc.h index 57b8511c6..d2bef4458 100644 --- a/src/xenia/gpu/dxbc.h +++ b/src/xenia/gpu/dxbc.h @@ -17,6 +17,7 @@ #include "xenia/base/assert.h" #include "xenia/base/math.h" +#include 
"xenia/base/memory.h" namespace xe { namespace gpu { @@ -1102,10 +1103,10 @@ struct Src : OperandAddress { } static Src LI(int32_t x) { return LI(x, x, x, x); } static Src LF(float x, float y, float z, float w) { - return LU(*reinterpret_cast(&x), - *reinterpret_cast(&y), - *reinterpret_cast(&z), - *reinterpret_cast(&w)); + return LU(xe::memory::Reinterpret(x), + xe::memory::Reinterpret(y), + xe::memory::Reinterpret(z), + xe::memory::Reinterpret(w)); } static Src LF(float x) { return LF(x, x, x, x); } static Src LP(const uint32_t* xyzw) { @@ -1222,12 +1223,10 @@ struct Src : OperandAddress { bool negate) { if (is_integer) { if (absolute) { - *reinterpret_cast(&value) = - std::abs(*reinterpret_cast(&value)); + value = uint32_t(std::abs(int32_t(value))); } if (negate) { - *reinterpret_cast(&value) = - -*reinterpret_cast(&value); + value = uint32_t(-int32_t(value)); } } else { if (absolute) { diff --git a/src/xenia/gpu/graphics_system.cc b/src/xenia/gpu/graphics_system.cc index b5470fd0a..3c04e0fff 100644 --- a/src/xenia/gpu/graphics_system.cc +++ b/src/xenia/gpu/graphics_system.cc @@ -201,7 +201,7 @@ uint32_t GraphicsSystem::ReadRegister(uint32_t addr) { } assert_true(r < RegisterFile::kRegisterCount); - return register_file_.values[r].u32; + return register_file_.values[r]; } void GraphicsSystem::WriteRegister(uint32_t addr, uint32_t value) { @@ -219,7 +219,7 @@ void GraphicsSystem::WriteRegister(uint32_t addr, uint32_t value) { } assert_true(r < RegisterFile::kRegisterCount); - register_file_.values[r].u32 = value; + register_file_.values[r] = value; } void GraphicsSystem::InitializeRingBuffer(uint32_t ptr, uint32_t size_log2) { diff --git a/src/xenia/gpu/packet_disassembler.h b/src/xenia/gpu/packet_disassembler.h index 942a88409..c4572b928 100644 --- a/src/xenia/gpu/packet_disassembler.h +++ b/src/xenia/gpu/packet_disassembler.h @@ -42,7 +42,7 @@ struct PacketAction { union { struct { uint32_t index; - RegisterFile::RegisterValue value; + uint32_t value; } 
register_write; struct { uint64_t value; @@ -56,7 +56,7 @@ struct PacketAction { PacketAction action; action.type = Type::kRegisterWrite; action.register_write.index = index; - action.register_write.value.u32 = value; + action.register_write.value = value; return action; } diff --git a/src/xenia/gpu/primitive_processor.cc b/src/xenia/gpu/primitive_processor.cc index 827fb7b4e..9e20be2c4 100644 --- a/src/xenia/gpu/primitive_processor.cc +++ b/src/xenia/gpu/primitive_processor.cc @@ -498,8 +498,8 @@ bool PrimitiveProcessor::Process(ProcessingResult& result_out) { uint32_t index_size_log2 = guest_index_format == xenos::IndexFormat::kInt16 ? 1 : 2; // The base should already be aligned, but aligning here too for safety. - guest_index_base = regs[XE_GPU_REG_VGT_DMA_BASE].u32 & - ~uint32_t((1 << index_size_log2) - 1); + guest_index_base = + regs[XE_GPU_REG_VGT_DMA_BASE] & ~uint32_t((1 << index_size_log2) - 1); guest_index_buffer_needed_bytes = guest_draw_vertex_count << index_size_log2; if (guest_index_base > SharedMemory::kBufferSize || @@ -652,8 +652,8 @@ bool PrimitiveProcessor::Process(ProcessingResult& result_out) { uint32_t index_size_log2 = guest_index_format == xenos::IndexFormat::kInt16 ? 1 : 2; // The base should already be aligned, but aligning here too for safety. 
- guest_index_base = regs[XE_GPU_REG_VGT_DMA_BASE].u32 & - ~uint32_t((1 << index_size_log2) - 1); + guest_index_base = + regs[XE_GPU_REG_VGT_DMA_BASE] & ~uint32_t((1 << index_size_log2) - 1); guest_index_buffer_needed_bytes = guest_draw_vertex_count << index_size_log2; if (guest_index_base > SharedMemory::kBufferSize || diff --git a/src/xenia/gpu/register_file.h b/src/xenia/gpu/register_file.h index e9a4f1137..40870810f 100644 --- a/src/xenia/gpu/register_file.h +++ b/src/xenia/gpu/register_file.h @@ -12,8 +12,12 @@ #include #include +#include +#include "xenia/base/assert.h" +#include "xenia/base/memory.h" #include "xenia/gpu/registers.h" +#include "xenia/gpu/xenos.h" namespace xe { namespace gpu { @@ -34,39 +38,53 @@ class RegisterFile { static const RegisterInfo* GetRegisterInfo(uint32_t index); static constexpr size_t kRegisterCount = 0x5003; - union RegisterValue { - uint32_t u32; - float f32; - }; - RegisterValue values[kRegisterCount]; + uint32_t values[kRegisterCount]; + + const uint32_t& operator[](uint32_t reg) const { return values[reg]; } + uint32_t& operator[](uint32_t reg) { return values[reg]; } - const RegisterValue& operator[](uint32_t reg) const { return values[reg]; } - RegisterValue& operator[](uint32_t reg) { return values[reg]; } - const RegisterValue& operator[](Register reg) const { return values[reg]; } - RegisterValue& operator[](Register reg) { return values[reg]; } template - const T& Get(uint32_t reg) const { - return *reinterpret_cast(&values[reg]); + T Get(uint32_t reg) const { + return xe::memory::Reinterpret(values[reg]); } template - T& Get(uint32_t reg) { - return *reinterpret_cast(&values[reg]); + T Get(Register reg) const { + return Get(static_cast(reg)); } template - const T& Get(Register reg) const { - return *reinterpret_cast(&values[reg]); + T Get() const { + return Get(T::register_index); } - template - T& Get(Register reg) { - return *reinterpret_cast(&values[reg]); + + xenos::xe_gpu_vertex_fetch_t GetVertexFetch(uint32_t 
index) const { + assert_true(index < 96); + xenos::xe_gpu_vertex_fetch_t fetch; + std::memcpy(&fetch, + &values[XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0 + + (sizeof(fetch) / sizeof(uint32_t)) * index], + sizeof(fetch)); + return fetch; } - template - const T& Get() const { - return *reinterpret_cast(&values[T::register_index]); + + xenos::xe_gpu_texture_fetch_t GetTextureFetch(uint32_t index) const { + assert_true(index < 32); + xenos::xe_gpu_texture_fetch_t fetch; + std::memcpy(&fetch, + &values[XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0 + + (sizeof(fetch) / sizeof(uint32_t)) * index], + sizeof(fetch)); + return fetch; } - template - T& Get() { - return *reinterpret_cast(&values[T::register_index]); + + xenos::xe_gpu_memexport_stream_t GetMemExportStream( + uint32_t float_constant_index) const { + assert_true(float_constant_index < 512); + xenos::xe_gpu_memexport_stream_t stream; + std::memcpy( + &stream, + &values[XE_GPU_REG_SHADER_CONSTANT_000_X + 4 * float_constant_index], + sizeof(stream)); + return stream; } }; diff --git a/src/xenia/gpu/shader_interpreter.cc b/src/xenia/gpu/shader_interpreter.cc index 9e1084397..9a1342aca 100644 --- a/src/xenia/gpu/shader_interpreter.cc +++ b/src/xenia/gpu/shader_interpreter.cc @@ -28,10 +28,7 @@ void ShaderInterpreter::Execute() { state_.Reset(); const uint32_t* bool_constants = - ®ister_file_[XE_GPU_REG_SHADER_CONSTANT_BOOL_000_031].u32; - const xenos::LoopConstant* loop_constants = - reinterpret_cast( - ®ister_file_[XE_GPU_REG_SHADER_CONSTANT_LOOP_00].u32); + ®ister_file_[XE_GPU_REG_SHADER_CONSTANT_BOOL_000_031]; bool exec_ended = false; uint32_t cf_index_next = 1; @@ -140,8 +137,8 @@ void ShaderInterpreter::Execute() { cf_index_next = cf_loop_start.address(); continue; } - xenos::LoopConstant loop_constant = - loop_constants[cf_loop_start.loop_id()]; + auto loop_constant = register_file_.Get( + XE_GPU_REG_SHADER_CONSTANT_LOOP_00 + cf_loop_start.loop_id()); state_.loop_constants[state_.loop_stack_depth] = loop_constant; uint32_t& 
loop_iterator_ref = state_.loop_iterators[state_.loop_stack_depth]; @@ -170,8 +167,11 @@ void ShaderInterpreter::Execute() { &cf_instr); xenos::LoopConstant loop_constant = state_.loop_constants[state_.loop_stack_depth - 1]; - assert_true(loop_constant.value == - loop_constants[cf_loop_end.loop_id()].value); + assert_zero( + std::memcmp(&loop_constant, + ®ister_file_[XE_GPU_REG_SHADER_CONSTANT_LOOP_00 + + cf_loop_end.loop_id()], + sizeof(loop_constant))); uint32_t loop_iterator = ++state_.loop_iterators[state_.loop_stack_depth - 1]; if (loop_iterator < loop_constant.count && @@ -257,28 +257,31 @@ void ShaderInterpreter::Execute() { } } -const float* ShaderInterpreter::GetFloatConstant( +const std::array ShaderInterpreter::GetFloatConstant( uint32_t address, bool is_relative, bool relative_address_is_a0) const { - static const float zero[4] = {}; int32_t index = int32_t(address); if (is_relative) { index += relative_address_is_a0 ? state_.address_register : state_.GetLoopAddress(); } if (index < 0) { - return zero; + return std::array(); } auto base_and_size_minus_1 = register_file_.Get( shader_type_ == xenos::ShaderType::kVertex ? 
XE_GPU_REG_SQ_VS_CONST : XE_GPU_REG_SQ_PS_CONST); if (uint32_t(index) > base_and_size_minus_1.size) { - return zero; + return std::array(); } index += base_and_size_minus_1.base; if (index >= 512) { - return zero; + return std::array(); } - return ®ister_file_[XE_GPU_REG_SHADER_CONSTANT_000_X + 4 * index].f32; + std::array value; + std::memcpy(value.data(), + ®ister_file_[XE_GPU_REG_SHADER_CONSTANT_000_X + 4 * index], + sizeof(float) * 4); + return value; } void ShaderInterpreter::ExecuteAluInstruction(ucode::AluInstruction instr) { @@ -297,6 +300,7 @@ void ShaderInterpreter::ExecuteAluInstruction(ucode::AluInstruction instr) { const float* vector_src_ptr; uint32_t vector_src_register = instr.src_reg(1 + i); bool vector_src_absolute = false; + std::array vector_src_float_constant; if (instr.src_is_temp(1 + i)) { vector_src_ptr = GetTempRegister( ucode::AluInstruction::src_temp_reg(vector_src_register), @@ -304,9 +308,10 @@ void ShaderInterpreter::ExecuteAluInstruction(ucode::AluInstruction instr) { vector_src_absolute = ucode::AluInstruction::is_src_temp_value_absolute( vector_src_register); } else { - vector_src_ptr = GetFloatConstant( + vector_src_float_constant = GetFloatConstant( vector_src_register, instr.src_const_is_addressed(1 + i), instr.is_const_address_register_relative()); + vector_src_ptr = vector_src_float_constant.data(); } uint32_t vector_src_absolute_mask = ~(uint32_t(vector_src_absolute) << 31); @@ -618,6 +623,7 @@ void ShaderInterpreter::ExecuteAluInstruction(ucode::AluInstruction instr) { // r#/c#.w or r#/c#.wx. 
const float* scalar_src_ptr; uint32_t scalar_src_register = instr.src_reg(3); + std::array scalar_src_float_constant; if (instr.src_is_temp(3)) { scalar_src_ptr = GetTempRegister( ucode::AluInstruction::src_temp_reg(scalar_src_register), @@ -625,9 +631,10 @@ void ShaderInterpreter::ExecuteAluInstruction(ucode::AluInstruction instr) { scalar_src_absolute = ucode::AluInstruction::is_src_temp_value_absolute( scalar_src_register); } else { - scalar_src_ptr = GetFloatConstant( + scalar_src_float_constant = GetFloatConstant( scalar_src_register, instr.src_const_is_addressed(3), instr.is_const_address_register_relative()); + scalar_src_ptr = scalar_src_float_constant.data(); } uint32_t scalar_src_swizzle = instr.src_swizzle(3); scalar_operand_component_count = @@ -984,10 +991,8 @@ void ShaderInterpreter::ExecuteVertexFetchInstruction( state_.vfetch_full_last = instr; } - xenos::xe_gpu_vertex_fetch_t fetch_constant = - *reinterpret_cast( - ®ister_file_[XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0 + - state_.vfetch_full_last.fetch_constant_index()]); + xenos::xe_gpu_vertex_fetch_t fetch_constant = register_file_.GetVertexFetch( + state_.vfetch_full_last.fetch_constant_index()); if (!instr.is_mini_fetch()) { // Get the part of the address that depends on vfetch_full data. 
diff --git a/src/xenia/gpu/shader_interpreter.h b/src/xenia/gpu/shader_interpreter.h index dca530221..47b3d957a 100644 --- a/src/xenia/gpu/shader_interpreter.h +++ b/src/xenia/gpu/shader_interpreter.h @@ -11,6 +11,7 @@ #define XENIA_GPU_SHADER_INTERPRETER_H_ #include +#include #include #include @@ -120,8 +121,8 @@ class ShaderInterpreter { float* GetTempRegister(uint32_t address, bool is_relative) { return temp_registers_[GetTempRegisterIndex(address, is_relative)]; } - const float* GetFloatConstant(uint32_t address, bool is_relative, - bool relative_address_is_a0) const; + const std::array GetFloatConstant( + uint32_t address, bool is_relative, bool relative_address_is_a0) const; void ExecuteAluInstruction(ucode::AluInstruction instr); void StoreFetchResult(uint32_t dest, bool is_dest_relative, uint32_t swizzle, diff --git a/src/xenia/gpu/texture_cache.cc b/src/xenia/gpu/texture_cache.cc index 18fac01d9..7ba729b89 100644 --- a/src/xenia/gpu/texture_cache.cc +++ b/src/xenia/gpu/texture_cache.cc @@ -333,8 +333,7 @@ void TextureCache::RequestTextures(uint32_t used_texture_mask) { uint32_t index_bit = UINT32_C(1) << index; textures_remaining &= ~index_bit; TextureBinding& binding = texture_bindings_[index]; - const auto& fetch = regs.Get( - XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0 + index * 6); + xenos::xe_gpu_texture_fetch_t fetch = regs.GetTextureFetch(index); TextureKey old_key = binding.key; uint8_t old_swizzled_signs = binding.swizzled_signs; BindingInfoFromFetchConstant(fetch, binding.key, &binding.swizzled_signs); diff --git a/src/xenia/gpu/trace_viewer.cc b/src/xenia/gpu/trace_viewer.cc index be614eda3..178c30fc9 100644 --- a/src/xenia/gpu/trace_viewer.cc +++ b/src/xenia/gpu/trace_viewer.cc @@ -19,6 +19,7 @@ #include "xenia/base/filesystem.h" #include "xenia/base/logging.h" #include "xenia/base/math.h" +#include "xenia/base/memory.h" #include "xenia/base/platform.h" #include "xenia/base/string.h" #include "xenia/base/system.h" @@ -357,9 +358,10 @@ void 
TraceViewer::DrawPacketDisassemblerUI() { ImGui::NextColumn(); if (!register_info || register_info->type == RegisterInfo::Type::kDword) { - ImGui::Text("%.8X", action.register_write.value.u32); + ImGui::Text("%.8X", action.register_write.value); } else { - ImGui::Text("%8f", action.register_write.value.f32); + ImGui::Text("%8f", xe::memory::Reinterpret( + action.register_write.value)); } ImGui::Columns(1); break; @@ -709,10 +711,8 @@ void TraceViewer::DrawTextureInfo( const Shader::TextureBinding& texture_binding) { auto& regs = *graphics_system_->register_file(); - int r = XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0 + - texture_binding.fetch_constant * 6; - auto group = reinterpret_cast(®s.values[r]); - auto& fetch = group->texture_fetch; + xenos::xe_gpu_texture_fetch_t fetch = + regs.GetTextureFetch(texture_binding.fetch_constant); if (fetch.type != xenos::FetchConstantType::kTexture && (!cvars::gpu_allow_invalid_fetch_constants || fetch.type != xenos::FetchConstantType::kInvalidTexture)) { @@ -780,9 +780,9 @@ void TraceViewer::DrawFailedTextureInfo( void TraceViewer::DrawVertexFetcher(Shader* shader, const Shader::VertexBinding& vertex_binding, - const xe_gpu_vertex_fetch_t* fetch) { - const uint8_t* addr = memory_->TranslatePhysical(fetch->address << 2); - uint32_t vertex_count = fetch->size / vertex_binding.stride_words; + const xe_gpu_vertex_fetch_t& fetch) { + const uint8_t* addr = memory_->TranslatePhysical(fetch.address << 2); + uint32_t vertex_count = fetch.size / vertex_binding.stride_words; int column_count = 0; for (const auto& attrib : vertex_binding.attributes) { switch (attrib.fetch_instr.attributes.data_format) { @@ -883,7 +883,7 @@ void TraceViewer::DrawVertexFetcher(Shader* shader, #define LOADEL(type, wo) \ GpuSwap(xe::load(vstart + \ (attrib.fetch_instr.attributes.offset + wo) * 4), \ - fetch->endian) + fetch.endian) switch (attrib.fetch_instr.attributes.data_format) { case xenos::VertexFormat::k_32: ImGui::Text("%.8X", LOADEL(uint32_t, 0)); @@ 
-1187,7 +1187,7 @@ void TraceViewer::DrawStateUI() { } auto enable_mode = - static_cast(regs[XE_GPU_REG_RB_MODECONTROL].u32 & 0x7); + static_cast(regs[XE_GPU_REG_RB_MODECONTROL] & 0x7); const char* mode_name = "Unknown"; switch (enable_mode) { @@ -1210,7 +1210,7 @@ void TraceViewer::DrawStateUI() { break; } case ModeControl::kCopy: { - uint32_t copy_dest_base = regs[XE_GPU_REG_RB_COPY_DEST_BASE].u32; + uint32_t copy_dest_base = regs[XE_GPU_REG_RB_COPY_DEST_BASE]; ImGui::Text("Copy Command %d (to %.8X)", player_->current_command_index(), copy_dest_base); break; @@ -1221,9 +1221,9 @@ void TraceViewer::DrawStateUI() { ImGui::BulletText("Viewport State:"); if (true) { ImGui::TreePush((const void*)0); - uint32_t pa_su_sc_mode_cntl = regs[XE_GPU_REG_PA_SU_SC_MODE_CNTL].u32; + uint32_t pa_su_sc_mode_cntl = regs[XE_GPU_REG_PA_SU_SC_MODE_CNTL]; if ((pa_su_sc_mode_cntl >> 16) & 1) { - uint32_t window_offset = regs[XE_GPU_REG_PA_SC_WINDOW_OFFSET].u32; + uint32_t window_offset = regs[XE_GPU_REG_PA_SC_WINDOW_OFFSET]; int16_t window_offset_x = window_offset & 0x7FFF; int16_t window_offset_y = (window_offset >> 16) & 0x7FFF; if (window_offset_x & 0x4000) { @@ -1237,8 +1237,8 @@ void TraceViewer::DrawStateUI() { } else { ImGui::BulletText("Window Offset: disabled"); } - uint32_t window_scissor_tl = regs[XE_GPU_REG_PA_SC_WINDOW_SCISSOR_TL].u32; - uint32_t window_scissor_br = regs[XE_GPU_REG_PA_SC_WINDOW_SCISSOR_BR].u32; + uint32_t window_scissor_tl = regs[XE_GPU_REG_PA_SC_WINDOW_SCISSOR_TL]; + uint32_t window_scissor_br = regs[XE_GPU_REG_PA_SC_WINDOW_SCISSOR_BR]; ImGui::BulletText( "Window Scissor: %d,%d to %d,%d (%d x %d)", window_scissor_tl & 0x7FFF, (window_scissor_tl >> 16) & 0x7FFF, window_scissor_br & 0x7FFF, @@ -1246,7 +1246,7 @@ void TraceViewer::DrawStateUI() { (window_scissor_br & 0x7FFF) - (window_scissor_tl & 0x7FFF), ((window_scissor_br >> 16) & 0x7FFF) - ((window_scissor_tl >> 16) & 0x7FFF)); - uint32_t surface_info = regs[XE_GPU_REG_RB_SURFACE_INFO].u32; + uint32_t 
surface_info = regs[XE_GPU_REG_RB_SURFACE_INFO]; uint32_t surface_hiz = (surface_info >> 18) & 0x3FFF; uint32_t surface_pitch = surface_info & 0x3FFF; auto surface_msaa = (surface_info >> 16) & 0x3; @@ -1258,7 +1258,7 @@ void TraceViewer::DrawStateUI() { ImGui::BulletText("Surface Pitch: %d", surface_pitch); ImGui::BulletText("Surface HI-Z Pitch: %d", surface_hiz); ImGui::BulletText("Surface MSAA: %s", kMsaaNames[surface_msaa]); - uint32_t vte_control = regs[XE_GPU_REG_PA_CL_VTE_CNTL].u32; + uint32_t vte_control = regs[XE_GPU_REG_PA_CL_VTE_CNTL]; bool vport_xscale_enable = (vte_control & (1 << 0)) > 0; bool vport_xoffset_enable = (vte_control & (1 << 1)) > 0; bool vport_yscale_enable = (vte_control & (1 << 2)) > 0; @@ -1273,14 +1273,20 @@ void TraceViewer::DrawStateUI() { } ImGui::BulletText( "Viewport Offset: %f, %f, %f", - vport_xoffset_enable ? regs[XE_GPU_REG_PA_CL_VPORT_XOFFSET].f32 : 0, - vport_yoffset_enable ? regs[XE_GPU_REG_PA_CL_VPORT_YOFFSET].f32 : 0, - vport_zoffset_enable ? regs[XE_GPU_REG_PA_CL_VPORT_ZOFFSET].f32 : 0); + vport_xoffset_enable ? regs.Get(XE_GPU_REG_PA_CL_VPORT_XOFFSET) + : 0.0f, + vport_yoffset_enable ? regs.Get(XE_GPU_REG_PA_CL_VPORT_YOFFSET) + : 0.0f, + vport_zoffset_enable ? regs.Get(XE_GPU_REG_PA_CL_VPORT_ZOFFSET) + : 0.0f); ImGui::BulletText( "Viewport Scale: %f, %f, %f", - vport_xscale_enable ? regs[XE_GPU_REG_PA_CL_VPORT_XSCALE].f32 : 1, - vport_yscale_enable ? regs[XE_GPU_REG_PA_CL_VPORT_YSCALE].f32 : 1, - vport_zscale_enable ? regs[XE_GPU_REG_PA_CL_VPORT_ZSCALE].f32 : 1); + vport_xscale_enable ? regs.Get(XE_GPU_REG_PA_CL_VPORT_XSCALE) + : 1.0f, + vport_yscale_enable ? regs.Get(XE_GPU_REG_PA_CL_VPORT_YSCALE) + : 1.0f, + vport_zscale_enable ? regs.Get(XE_GPU_REG_PA_CL_VPORT_ZSCALE) + : 1.0f); if (!vport_xscale_enable) { ImGui::PopStyleColor(); } @@ -1290,7 +1296,7 @@ void TraceViewer::DrawStateUI() { ((vte_control >> 8) & 0x1) ? "y/w0" : "y", ((vte_control >> 9) & 0x1) ? "z/w0" : "z", ((vte_control >> 10) & 0x1) ? 
"w0" : "1/w0"); - uint32_t clip_control = regs[XE_GPU_REG_PA_CL_CLIP_CNTL].u32; + uint32_t clip_control = regs[XE_GPU_REG_PA_CL_CLIP_CNTL]; bool clip_enabled = ((clip_control >> 17) & 0x1) == 0; bool dx_clip = ((clip_control >> 20) & 0x1) == 0x1; ImGui::BulletText("Clip Enabled: %s, DX Clip: %s", @@ -1302,11 +1308,9 @@ void TraceViewer::DrawStateUI() { ImGui::BulletText("Rasterizer State:"); if (true) { ImGui::TreePush((const void*)0); - uint32_t pa_su_sc_mode_cntl = regs[XE_GPU_REG_PA_SU_SC_MODE_CNTL].u32; - uint32_t pa_sc_screen_scissor_tl = - regs[XE_GPU_REG_PA_SC_SCREEN_SCISSOR_TL].u32; - uint32_t pa_sc_screen_scissor_br = - regs[XE_GPU_REG_PA_SC_SCREEN_SCISSOR_BR].u32; + uint32_t pa_su_sc_mode_cntl = regs[XE_GPU_REG_PA_SU_SC_MODE_CNTL]; + uint32_t pa_sc_screen_scissor_tl = regs[XE_GPU_REG_PA_SC_SCREEN_SCISSOR_TL]; + uint32_t pa_sc_screen_scissor_br = regs[XE_GPU_REG_PA_SC_SCREEN_SCISSOR_BR]; if (pa_sc_screen_scissor_tl != 0 && pa_sc_screen_scissor_br != 0x20002000) { int32_t screen_scissor_x = pa_sc_screen_scissor_tl & 0x7FFF; int32_t screen_scissor_y = (pa_sc_screen_scissor_tl >> 16) & 0x7FFF; @@ -1361,7 +1365,7 @@ void TraceViewer::DrawStateUI() { } ImGui::Columns(1); - auto rb_surface_info = regs[XE_GPU_REG_RB_SURFACE_INFO].u32; + auto rb_surface_info = regs[XE_GPU_REG_RB_SURFACE_INFO]; uint32_t surface_pitch = rb_surface_info & 0x3FFF; auto surface_msaa = static_cast((rb_surface_info >> 16) & 0x3); @@ -1370,39 +1374,39 @@ void TraceViewer::DrawStateUI() { if (enable_mode != ModeControl::kDepth) { // Alpha testing -- ALPHAREF, ALPHAFUNC, ALPHATESTENABLE // if(ALPHATESTENABLE && frag_out.a [<=/ALPHAFUNC] ALPHAREF) discard; - uint32_t color_control = regs[XE_GPU_REG_RB_COLORCONTROL].u32; + uint32_t color_control = regs[XE_GPU_REG_RB_COLORCONTROL]; if ((color_control & 0x8) != 0) { ImGui::BulletText("Alpha Test: %s %.2f", kCompareFuncNames[color_control & 0x7], - regs[XE_GPU_REG_RB_ALPHA_REF].f32); + regs.Get(XE_GPU_REG_RB_ALPHA_REF)); } else { 
ImGui::PushStyleColor(ImGuiCol_Text, kColorIgnored); ImGui::BulletText("Alpha Test: disabled"); ImGui::PopStyleColor(); } - auto blend_color = ImVec4(regs[XE_GPU_REG_RB_BLEND_RED].f32, - regs[XE_GPU_REG_RB_BLEND_GREEN].f32, - regs[XE_GPU_REG_RB_BLEND_BLUE].f32, - regs[XE_GPU_REG_RB_BLEND_ALPHA].f32); + auto blend_color = ImVec4(regs.Get(XE_GPU_REG_RB_BLEND_RED), + regs.Get(XE_GPU_REG_RB_BLEND_GREEN), + regs.Get(XE_GPU_REG_RB_BLEND_BLUE), + regs.Get(XE_GPU_REG_RB_BLEND_ALPHA)); ImGui::BulletText("Blend Color: (%.2f,%.2f,%.2f,%.2f)", blend_color.x, blend_color.y, blend_color.z, blend_color.w); ImGui::SameLine(); // TODO small_height (was true) parameter was removed ImGui::ColorButton(nullptr, blend_color); - uint32_t rb_color_mask = regs[XE_GPU_REG_RB_COLOR_MASK].u32; + uint32_t rb_color_mask = regs[XE_GPU_REG_RB_COLOR_MASK]; uint32_t color_info[4] = { - regs[XE_GPU_REG_RB_COLOR_INFO].u32, - regs[XE_GPU_REG_RB_COLOR1_INFO].u32, - regs[XE_GPU_REG_RB_COLOR2_INFO].u32, - regs[XE_GPU_REG_RB_COLOR3_INFO].u32, + regs[XE_GPU_REG_RB_COLOR_INFO], + regs[XE_GPU_REG_RB_COLOR1_INFO], + regs[XE_GPU_REG_RB_COLOR2_INFO], + regs[XE_GPU_REG_RB_COLOR3_INFO], }; uint32_t rb_blendcontrol[4] = { - regs[XE_GPU_REG_RB_BLENDCONTROL0].u32, - regs[XE_GPU_REG_RB_BLENDCONTROL1].u32, - regs[XE_GPU_REG_RB_BLENDCONTROL2].u32, - regs[XE_GPU_REG_RB_BLENDCONTROL3].u32, + regs[XE_GPU_REG_RB_BLENDCONTROL0], + regs[XE_GPU_REG_RB_BLENDCONTROL1], + regs[XE_GPU_REG_RB_BLENDCONTROL2], + regs[XE_GPU_REG_RB_BLENDCONTROL3], }; ImGui::Columns(2); for (int i = 0; i < xe::countof(color_info); ++i) { @@ -1511,9 +1515,9 @@ void TraceViewer::DrawStateUI() { } if (ImGui::CollapsingHeader("Depth/Stencil Target")) { - auto rb_depthcontrol = regs[XE_GPU_REG_RB_DEPTHCONTROL].u32; - auto rb_stencilrefmask = regs[XE_GPU_REG_RB_STENCILREFMASK].u32; - auto rb_depth_info = regs[XE_GPU_REG_RB_DEPTH_INFO].u32; + auto rb_depthcontrol = regs[XE_GPU_REG_RB_DEPTHCONTROL]; + auto rb_stencilrefmask = 
regs[XE_GPU_REG_RB_STENCILREFMASK]; + auto rb_depth_info = regs[XE_GPU_REG_RB_DEPTH_INFO]; bool uses_depth = (rb_depthcontrol & 0x00000002) || (rb_depthcontrol & 0x00000004); uint32_t stencil_ref = (rb_stencilrefmask & 0xFF); @@ -1697,10 +1701,9 @@ void TraceViewer::DrawStateUI() { draw_info.index_buffer_size, kIndexFormatNames[int(draw_info.index_format)], kEndiannessNames[int(draw_info.index_endianness)]); - uint32_t pa_su_sc_mode_cntl = regs[XE_GPU_REG_PA_SU_SC_MODE_CNTL].u32; + uint32_t pa_su_sc_mode_cntl = regs[XE_GPU_REG_PA_SU_SC_MODE_CNTL]; if (pa_su_sc_mode_cntl & (1 << 21)) { - uint32_t reset_index = - regs[XE_GPU_REG_VGT_MULTI_PRIM_IB_RESET_INDX].u32; + uint32_t reset_index = regs[XE_GPU_REG_VGT_MULTI_PRIM_IB_RESET_INDX]; if (draw_info.index_format == xenos::IndexFormat::kInt16) { ImGui::Text("Reset Index: %.4X", reset_index & 0xFFFF); } else { @@ -1760,30 +1763,16 @@ void TraceViewer::DrawStateUI() { auto shader = command_processor->active_vertex_shader(); if (shader) { for (const auto& vertex_binding : shader->vertex_bindings()) { - int r = XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0 + - (vertex_binding.fetch_constant / 3) * 6; - const auto group = - reinterpret_cast(®s.values[r]); - const xe_gpu_vertex_fetch_t* fetch = nullptr; - switch (vertex_binding.fetch_constant % 3) { - case 0: - fetch = &group->vertex_fetch_0; - break; - case 1: - fetch = &group->vertex_fetch_1; - break; - case 2: - fetch = &group->vertex_fetch_2; - break; - } - assert_true(fetch->endian == xenos::Endian::k8in32); + xe_gpu_vertex_fetch_t fetch = + regs.GetVertexFetch(vertex_binding.fetch_constant); + assert_true(fetch.endian == xenos::Endian::k8in32); char tree_root_id[32]; sprintf(tree_root_id, "#vertices_root_%d", vertex_binding.fetch_constant); if (ImGui::TreeNode(tree_root_id, "vf%d: 0x%.8X (%db), %s", - vertex_binding.fetch_constant, fetch->address << 2, - fetch->size * 4, - kEndiannessNames[int(fetch->endian)])) { + vertex_binding.fetch_constant, fetch.address << 2, + fetch.size 
* 4, + kEndiannessNames[int(fetch.endian)])) { ImGui::BeginChild("#vertices", ImVec2(0, 300)); DrawVertexFetcher(shader, vertex_binding, fetch); ImGui::EndChild(); @@ -1831,7 +1820,7 @@ void TraceViewer::DrawStateUI() { ImGui::Text("f%02d_%d", (i - XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0) / 6, (i - XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0) % 6); ImGui::NextColumn(); - ImGui::Text("%.8X", regs[i].u32); + ImGui::Text("%.8X", regs[i]); ImGui::NextColumn(); } ImGui::Columns(1); @@ -1842,8 +1831,9 @@ void TraceViewer::DrawStateUI() { i <= XE_GPU_REG_SHADER_CONSTANT_511_X; i += 4) { ImGui::Text("c%d", (i - XE_GPU_REG_SHADER_CONSTANT_000_X) / 4); ImGui::NextColumn(); - ImGui::Text("%f, %f, %f, %f", regs[i + 0].f32, regs[i + 1].f32, - regs[i + 2].f32, regs[i + 3].f32); + ImGui::Text("%f, %f, %f, %f", regs.Get(i + 0), + regs.Get(i + 1), regs.Get(i + 2), + regs.Get(i + 3)); ImGui::NextColumn(); } ImGui::Columns(1); @@ -1856,7 +1846,7 @@ void TraceViewer::DrawStateUI() { (i - XE_GPU_REG_SHADER_CONSTANT_BOOL_000_031) * 32, (i - XE_GPU_REG_SHADER_CONSTANT_BOOL_000_031) * 32 + 31); ImGui::NextColumn(); - ImGui::Text("%.8X", regs[i].u32); + ImGui::Text("%.8X", regs[i]); ImGui::NextColumn(); } ImGui::Columns(1); @@ -1867,7 +1857,7 @@ void TraceViewer::DrawStateUI() { i <= XE_GPU_REG_SHADER_CONSTANT_LOOP_31; ++i) { ImGui::Text("l%d", i - XE_GPU_REG_SHADER_CONSTANT_LOOP_00); ImGui::NextColumn(); - ImGui::Text("%.8X", regs[i].u32); + ImGui::Text("%.8X", regs[i]); ImGui::NextColumn(); } ImGui::Columns(1); diff --git a/src/xenia/gpu/trace_viewer.h b/src/xenia/gpu/trace_viewer.h index 58ab16e4e..b9a988eef 100644 --- a/src/xenia/gpu/trace_viewer.h +++ b/src/xenia/gpu/trace_viewer.h @@ -122,7 +122,7 @@ class TraceViewer : public xe::ui::WindowedApp { void DrawVertexFetcher(Shader* shader, const Shader::VertexBinding& vertex_binding, - const xenos::xe_gpu_vertex_fetch_t* fetch); + const xenos::xe_gpu_vertex_fetch_t& fetch); TraceViewerWindowListener window_listener_; diff --git 
a/src/xenia/gpu/vulkan/vulkan_command_processor.cc b/src/xenia/gpu/vulkan/vulkan_command_processor.cc index 58336c901..806382e00 100644 --- a/src/xenia/gpu/vulkan/vulkan_command_processor.cc +++ b/src/xenia/gpu/vulkan/vulkan_command_processor.cc @@ -2486,8 +2486,8 @@ bool VulkanCommandProcessor::IssueDraw(xenos::PrimitiveType prim_type, (uint64_t(1) << (vfetch_index & 63))) { continue; } - const auto& vfetch_constant = regs.Get( - XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0 + vfetch_index * 2); + xenos::xe_gpu_vertex_fetch_t vfetch_constant = + regs.GetVertexFetch(vfetch_index); switch (vfetch_constant.type) { case xenos::FetchConstantType::kVertex: break; @@ -3285,10 +3285,10 @@ void VulkanCommandProcessor::UpdateDynamicState( // Blend constants. float blend_constants[] = { - regs[XE_GPU_REG_RB_BLEND_RED].f32, - regs[XE_GPU_REG_RB_BLEND_GREEN].f32, - regs[XE_GPU_REG_RB_BLEND_BLUE].f32, - regs[XE_GPU_REG_RB_BLEND_ALPHA].f32, + regs.Get(XE_GPU_REG_RB_BLEND_RED), + regs.Get(XE_GPU_REG_RB_BLEND_GREEN), + regs.Get(XE_GPU_REG_RB_BLEND_BLUE), + regs.Get(XE_GPU_REG_RB_BLEND_ALPHA), }; dynamic_blend_constants_update_needed_ |= std::memcmp(dynamic_blend_constants_, blend_constants, @@ -3434,7 +3434,7 @@ void VulkanCommandProcessor::UpdateSystemConstantValues( const RegisterFile& regs = *register_file_; auto pa_cl_vte_cntl = regs.Get(); auto pa_su_sc_mode_cntl = regs.Get(); - float rb_alpha_ref = regs[XE_GPU_REG_RB_ALPHA_REF].f32; + auto rb_alpha_ref = regs.Get(XE_GPU_REG_RB_ALPHA_REF); auto rb_colorcontrol = regs.Get(); auto rb_depth_info = regs.Get(); auto rb_stencilrefmask = regs.Get(); @@ -3442,7 +3442,7 @@ void VulkanCommandProcessor::UpdateSystemConstantValues( regs.Get(XE_GPU_REG_RB_STENCILREFMASK_BF); auto rb_surface_info = regs.Get(); auto vgt_draw_initiator = regs.Get(); - int32_t vgt_indx_offset = int32_t(regs[XE_GPU_REG_VGT_INDX_OFFSET].u32); + auto vgt_indx_offset = regs.Get(XE_GPU_REG_VGT_INDX_OFFSET); bool edram_fragment_shader_interlock = 
render_target_cache_->GetPath() == @@ -3755,7 +3755,7 @@ void VulkanCommandProcessor::UpdateSystemConstantValues( dirty |= system_constants_.edram_rt_format_flags[i] != format_flags; system_constants_.edram_rt_format_flags[i] = format_flags; uint32_t blend_factors_ops = - regs[reg::RB_BLENDCONTROL::rt_register_indices[i]].u32 & 0x1FFF1FFF; + regs[reg::RB_BLENDCONTROL::rt_register_indices[i]] & 0x1FFF1FFF; dirty |= system_constants_.edram_rt_blend_factors_ops[i] != blend_factors_ops; system_constants_.edram_rt_blend_factors_ops[i] = blend_factors_ops; @@ -3784,22 +3784,22 @@ void VulkanCommandProcessor::UpdateSystemConstantValues( if (primitive_polygonal) { if (pa_su_sc_mode_cntl.poly_offset_front_enable) { poly_offset_front_scale = - regs[XE_GPU_REG_PA_SU_POLY_OFFSET_FRONT_SCALE].f32; + regs.Get(XE_GPU_REG_PA_SU_POLY_OFFSET_FRONT_SCALE); poly_offset_front_offset = - regs[XE_GPU_REG_PA_SU_POLY_OFFSET_FRONT_OFFSET].f32; + regs.Get(XE_GPU_REG_PA_SU_POLY_OFFSET_FRONT_OFFSET); } if (pa_su_sc_mode_cntl.poly_offset_back_enable) { poly_offset_back_scale = - regs[XE_GPU_REG_PA_SU_POLY_OFFSET_BACK_SCALE].f32; + regs.Get(XE_GPU_REG_PA_SU_POLY_OFFSET_BACK_SCALE); poly_offset_back_offset = - regs[XE_GPU_REG_PA_SU_POLY_OFFSET_BACK_OFFSET].f32; + regs.Get(XE_GPU_REG_PA_SU_POLY_OFFSET_BACK_OFFSET); } } else { if (pa_su_sc_mode_cntl.poly_offset_para_enable) { poly_offset_front_scale = - regs[XE_GPU_REG_PA_SU_POLY_OFFSET_FRONT_SCALE].f32; + regs.Get(XE_GPU_REG_PA_SU_POLY_OFFSET_FRONT_SCALE); poly_offset_front_offset = - regs[XE_GPU_REG_PA_SU_POLY_OFFSET_FRONT_OFFSET].f32; + regs.Get(XE_GPU_REG_PA_SU_POLY_OFFSET_FRONT_OFFSET); poly_offset_back_scale = poly_offset_front_scale; poly_offset_back_offset = poly_offset_front_offset; } @@ -3862,21 +3862,21 @@ void VulkanCommandProcessor::UpdateSystemConstantValues( } dirty |= system_constants_.edram_blend_constant[0] != - regs[XE_GPU_REG_RB_BLEND_RED].f32; + regs.Get(XE_GPU_REG_RB_BLEND_RED); system_constants_.edram_blend_constant[0] = - 
regs[XE_GPU_REG_RB_BLEND_RED].f32; + regs.Get(XE_GPU_REG_RB_BLEND_RED); dirty |= system_constants_.edram_blend_constant[1] != - regs[XE_GPU_REG_RB_BLEND_GREEN].f32; + regs.Get(XE_GPU_REG_RB_BLEND_GREEN); system_constants_.edram_blend_constant[1] = - regs[XE_GPU_REG_RB_BLEND_GREEN].f32; + regs.Get(XE_GPU_REG_RB_BLEND_GREEN); dirty |= system_constants_.edram_blend_constant[2] != - regs[XE_GPU_REG_RB_BLEND_BLUE].f32; + regs.Get(XE_GPU_REG_RB_BLEND_BLUE); system_constants_.edram_blend_constant[2] = - regs[XE_GPU_REG_RB_BLEND_BLUE].f32; + regs.Get(XE_GPU_REG_RB_BLEND_BLUE); dirty |= system_constants_.edram_blend_constant[3] != - regs[XE_GPU_REG_RB_BLEND_ALPHA].f32; + regs.Get(XE_GPU_REG_RB_BLEND_ALPHA); system_constants_.edram_blend_constant[3] = - regs[XE_GPU_REG_RB_BLEND_ALPHA].f32; + regs.Get(XE_GPU_REG_RB_BLEND_ALPHA); } if (dirty) { @@ -3903,10 +3903,10 @@ bool VulkanCommandProcessor::UpdateBindings(const VulkanShader* vertex_shader, // These are the constant base addresses/ranges for shaders. // We have these hardcoded right now cause nothing seems to differ on the Xbox // 360 (however, OpenGL ES on Adreno 200 on Android has different ranges). - assert_true(regs[XE_GPU_REG_SQ_VS_CONST].u32 == 0x000FF000 || - regs[XE_GPU_REG_SQ_VS_CONST].u32 == 0x00000000); - assert_true(regs[XE_GPU_REG_SQ_PS_CONST].u32 == 0x000FF100 || - regs[XE_GPU_REG_SQ_PS_CONST].u32 == 0x00000000); + assert_true(regs[XE_GPU_REG_SQ_VS_CONST] == 0x000FF000 || + regs[XE_GPU_REG_SQ_VS_CONST] == 0x00000000); + assert_true(regs[XE_GPU_REG_SQ_PS_CONST] == 0x000FF100 || + regs[XE_GPU_REG_SQ_PS_CONST] == 0x00000000); // Check if the float constant layout is still the same and get the counts. 
const Shader::ConstantRegisterMap& float_constant_map_vertex = vertex_shader->constant_register_map(); @@ -4001,8 +4001,7 @@ bool VulkanCommandProcessor::UpdateBindings(const VulkanShader* vertex_shader, float_constant_map_entry &= ~(1ull << float_constant_index); std::memcpy(mapping, &regs[XE_GPU_REG_SHADER_CONSTANT_000_X + (i << 8) + - (float_constant_index << 2)] - .f32, + (float_constant_index << 2)], sizeof(float) * 4); mapping += sizeof(float) * 4; } @@ -4033,8 +4032,7 @@ bool VulkanCommandProcessor::UpdateBindings(const VulkanShader* vertex_shader, float_constant_map_entry &= ~(1ull << float_constant_index); std::memcpy(mapping, &regs[XE_GPU_REG_SHADER_CONSTANT_256_X + (i << 8) + - (float_constant_index << 2)] - .f32, + (float_constant_index << 2)], sizeof(float) * 4); mapping += sizeof(float) * 4; } @@ -4055,7 +4053,7 @@ bool VulkanCommandProcessor::UpdateBindings(const VulkanShader* vertex_shader, return false; } buffer_info.range = VkDeviceSize(kBoolLoopConstantsSize); - std::memcpy(mapping, &regs[XE_GPU_REG_SHADER_CONSTANT_BOOL_000_031].u32, + std::memcpy(mapping, &regs[XE_GPU_REG_SHADER_CONSTANT_BOOL_000_031], kBoolLoopConstantsSize); current_constant_buffers_up_to_date_ |= UINT32_C(1) << SpirvShaderTranslator::kConstantBufferBoolLoop; @@ -4073,7 +4071,7 @@ bool VulkanCommandProcessor::UpdateBindings(const VulkanShader* vertex_shader, return false; } buffer_info.range = VkDeviceSize(kFetchConstantsSize); - std::memcpy(mapping, &regs[XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0].u32, + std::memcpy(mapping, &regs[XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0], kFetchConstantsSize); current_constant_buffers_up_to_date_ |= UINT32_C(1) << SpirvShaderTranslator::kConstantBufferFetch; diff --git a/src/xenia/gpu/vulkan/vulkan_pipeline_cache.cc b/src/xenia/gpu/vulkan/vulkan_pipeline_cache.cc index f91cc4e6b..eb2ee9b21 100644 --- a/src/xenia/gpu/vulkan/vulkan_pipeline_cache.cc +++ b/src/xenia/gpu/vulkan/vulkan_pipeline_cache.cc @@ -718,7 +718,7 @@ bool
VulkanPipelineCache::GetCurrentStateDescription( [common_blend_rt_index]), (((normalized_color_mask & ~(uint32_t(0b1111) << (4 * common_blend_rt_index))) - ? regs[XE_GPU_REG_RB_COLOR_MASK].u32 + ? regs[XE_GPU_REG_RB_COLOR_MASK] : normalized_color_mask) >> (4 * common_blend_rt_index)) & 0b1111, diff --git a/src/xenia/gpu/vulkan/vulkan_texture_cache.cc b/src/xenia/gpu/vulkan/vulkan_texture_cache.cc index bff490b9d..1f3ccaf24 100644 --- a/src/xenia/gpu/vulkan/vulkan_texture_cache.cc +++ b/src/xenia/gpu/vulkan/vulkan_texture_cache.cc @@ -612,8 +612,8 @@ VkImageView VulkanTextureCache::GetActiveBindingOrNullImageView( VulkanTextureCache::SamplerParameters VulkanTextureCache::GetSamplerParameters( const VulkanShader::SamplerBinding& binding) const { const auto& regs = register_file(); - const auto& fetch = regs.Get( - XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0 + binding.fetch_constant * 6); + xenos::xe_gpu_texture_fetch_t fetch = + regs.GetTextureFetch(binding.fetch_constant); SamplerParameters parameters; @@ -875,8 +875,7 @@ VkImageView VulkanTextureCache::RequestSwapTexture( uint32_t& width_scaled_out, uint32_t& height_scaled_out, xenos::TextureFormat& format_out) { const auto& regs = register_file(); - const auto& fetch = regs.Get( - XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0); + xenos::xe_gpu_texture_fetch_t fetch = regs.GetTextureFetch(0); TextureKey key; BindingInfoFromFetchConstant(fetch, key, nullptr); if (!key.is_valid || key.base_page == 0 || diff --git a/src/xenia/gpu/xenos.cc b/src/xenia/gpu/xenos.cc index f15c621cd..ce7f6177d 100644 --- a/src/xenia/gpu/xenos.cc +++ b/src/xenia/gpu/xenos.cc @@ -12,6 +12,7 @@ #include #include "xenia/base/math.h" +#include "xenia/base/memory.h" namespace xe { namespace gpu { @@ -118,8 +119,8 @@ float Float7e3To32(uint32_t f10) { exponent = uint32_t(1 - int32_t(mantissa_lzcnt)); mantissa = (mantissa << mantissa_lzcnt) & 0x7F; } - uint32_t f32 = ((exponent + 124) << 23) | (mantissa << 3); - return *reinterpret_cast(&f32); + return 
xe::memory::Reinterpret( + uint32_t(((exponent + 124) << 23) | (mantissa << 3))); } // Based on CFloat24 from d3dref9.dll and the 6e4 code from: @@ -131,7 +132,7 @@ uint32_t Float32To20e4(float f32, bool round_to_nearest_even) { // Positive only, and not -0 or NaN. return 0; } - uint32_t f32u32 = *reinterpret_cast(&f32); + auto f32u32 = xe::memory::Reinterpret(f32); if (f32u32 >= 0x3FFFFFF8) { // Saturate. return 0xFFFFFF; @@ -165,8 +166,8 @@ float Float20e4To32(uint32_t f24) { exponent = uint32_t(1 - int32_t(mantissa_lzcnt)); mantissa = (mantissa << mantissa_lzcnt) & 0xFFFFF; } - uint32_t f32 = ((exponent + 112) << 23) | (mantissa << 3); - return *reinterpret_cast(&f32); + return xe::memory::Reinterpret( + uint32_t(((exponent + 112) << 23) | (mantissa << 3))); } const char* GetColorRenderTargetFormatName(ColorRenderTargetFormat format) { From f964290ea803bfc4b8eca1fcd3b870379bdaffaa Mon Sep 17 00:00:00 2001 From: Triang3l Date: Sun, 12 May 2024 17:44:52 +0300 Subject: [PATCH 3/8] [Base] Relax the system clock difference allowance in the test Hopefully should reduce the CI failure rate, although this testing approach is fundamentally flawed as it depends on OS scheduling. --- src/xenia/base/testing/chrono_test.cc | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/xenia/base/testing/chrono_test.cc b/src/xenia/base/testing/chrono_test.cc index a63aac53c..f35f17ed8 100644 --- a/src/xenia/base/testing/chrono_test.cc +++ b/src/xenia/base/testing/chrono_test.cc @@ -107,10 +107,11 @@ TEST_CASE("WinSystemClock <-> XSystemClock", "[clock_cast]") { auto error2 = xsys.time_since_epoch() - wxsys.time_since_epoch(); auto error3 = wsys - wxsys; - REQUIRE(error1 < 10ms); - REQUIRE(error1 > -10ms); - REQUIRE(error2 < 10ms); - REQUIRE(error2 > -10ms); + // In AppVeyor, the difference often can be as large as roughly 16ms. 
+ REQUIRE(error1 < 20ms); + REQUIRE(error1 > -20ms); + REQUIRE(error2 < 20ms); + REQUIRE(error2 > -20ms); REQUIRE(error3 < duration); REQUIRE(error3 > -duration); } From a3304d252fac72f951ec3ce8b56a010edae05f63 Mon Sep 17 00:00:00 2001 From: Triang3l Date: Sun, 12 May 2024 19:21:37 +0300 Subject: [PATCH 4/8] [Base/GPU] Cleanup float comparisons and NaN and -0 in clamping C++ relational operators are supposed to raise FE_INVALID if an argument is NaN, use std::isless/greater[equal] instead where they were easy to locate (though there are other places possibly, mostly min/max and clamp usage was checked). Also fixes a copy-paste error making the CPU shader interpreter execute MINs as MAXs instead. --- src/xenia/apu/xma_context.cc | 2 +- src/xenia/base/math.h | 28 ++--- src/xenia/debug/ui/debug_window.cc | 12 +- src/xenia/gpu/draw_util.cc | 25 ++-- src/xenia/gpu/shader_interpreter.cc | 170 +++++++++++++++------------- src/xenia/gpu/trace_viewer.cc | 2 +- src/xenia/gpu/xenos.cc | 4 +- src/xenia/ui/immediate_drawer.cc | 24 ++-- src/xenia/ui/window_android.cc | 20 ++-- 9 files changed, 143 insertions(+), 144 deletions(-) diff --git a/src/xenia/apu/xma_context.cc b/src/xenia/apu/xma_context.cc index e9047654a..27b07ba28 100644 --- a/src/xenia/apu/xma_context.cc +++ b/src/xenia/apu/xma_context.cc @@ -921,7 +921,7 @@ void XmaContext::ConvertFrame(const uint8_t** samples, bool is_two_channel, auto in = reinterpret_cast(samples[j]); // Raw samples sometimes aren't within [-1, 1] - float scaled_sample = xe::saturate_signed(in[i]) * scale; + float scaled_sample = xe::clamp_float(in[i], -1.0f, 1.0f) * scale; // Convert the sample and output it in big endian. 
auto sample = static_cast(scaled_sample); diff --git a/src/xenia/base/math.h b/src/xenia/base/math.h index 889cf03ed..55dce4b45 100644 --- a/src/xenia/base/math.h +++ b/src/xenia/base/math.h @@ -60,20 +60,22 @@ constexpr T round_up(T value, V multiple, bool force_non_zero = true) { return (value + multiple - 1) / multiple * multiple; } -// Using the same conventions as in shading languages, returning 0 for NaN. -// std::max is `a < b ? b : a`, thus in case of NaN, the first argument is -// always returned. Also -0 is not < +0, so +0 is also chosen for it. +// For NaN, returns min_value (or, if it's NaN too, max_value). +// If either of the boundaries is zero, and if the value is at that boundary or +// exceeds it, the result will have the sign of that boundary. If both +// boundaries are zero, which sign is selected among the argument signs is not +// explicitly defined. template -constexpr T saturate_unsigned(T value) { - return std::min(static_cast(1.0f), std::max(static_cast(0.0f), value)); +T clamp_float(T value, T min_value, T max_value) { + float clamped_to_min = std::isgreater(value, min_value) ? value : min_value; + return std::isless(clamped_to_min, max_value) ? clamped_to_min : max_value; } -// This diverges from the GPU NaN rules for signed normalized formats (NaN -// should be converted to 0, not to -1), but this expectation is not needed most -// of time, and cannot be met for free (unlike for 0...1 clamping). +// Using the same conventions as in shading languages, returning 0 for NaN. +// 0 is always returned as positive. 
template -constexpr T saturate_signed(T value) { - return std::min(static_cast(1.0f), std::max(static_cast(-1.0f), value)); +T saturate(T value) { + return clamp_float(value, static_cast(0.0f), static_cast(1.0f)); } // Gets the next power of two value that is greater than or equal to the given @@ -330,12 +332,6 @@ inline uint64_t rotate_left(uint64_t v, uint8_t sh) { } #endif // XE_PLATFORM_WIN32 -template -T clamp(T value, T min_value, T max_value) { - const T t = value < min_value ? min_value : value; - return t > max_value ? max_value : t; -} - #if XE_ARCH_AMD64 // Utilities for SSE values. template diff --git a/src/xenia/debug/ui/debug_window.cc b/src/xenia/debug/ui/debug_window.cc index d56bcfd03..07d4404db 100644 --- a/src/xenia/debug/ui/debug_window.cc +++ b/src/xenia/debug/ui/debug_window.cc @@ -182,7 +182,7 @@ void DebugWindow::DrawFrame(ImGuiIO& io) { ImVec2(kSplitterWidth, top_panes_height)); if (ImGui::IsItemActive()) { function_pane_width += io.MouseDelta.x; - function_pane_width = xe::clamp(function_pane_width, 30.0f, FLT_MAX); + function_pane_width = xe::clamp_float(function_pane_width, 30.0f, FLT_MAX); } ImGui::SameLine(); ImGui::BeginChild("##source_pane", @@ -194,7 +194,7 @@ void DebugWindow::DrawFrame(ImGuiIO& io) { ImVec2(kSplitterWidth, top_panes_height)); if (ImGui::IsItemActive()) { source_pane_width += io.MouseDelta.x; - source_pane_width = xe::clamp(source_pane_width, 30.0f, FLT_MAX); + source_pane_width = xe::clamp_float(source_pane_width, 30.0f, FLT_MAX); } ImGui::SameLine(); ImGui::BeginChild("##registers_pane", @@ -206,7 +206,8 @@ void DebugWindow::DrawFrame(ImGuiIO& io) { ImVec2(kSplitterWidth, top_panes_height)); if (ImGui::IsItemActive()) { registers_pane_width += io.MouseDelta.x; - registers_pane_width = xe::clamp(registers_pane_width, 30.0f, FLT_MAX); + registers_pane_width = + xe::clamp_float(registers_pane_width, 30.0f, FLT_MAX); } ImGui::SameLine(); ImGui::BeginChild("##right_pane", ImVec2(0, top_panes_height), true); @@ -234,7 
+235,7 @@ void DebugWindow::DrawFrame(ImGuiIO& io) { ImGui::InvisibleButton("##hsplitter0", ImVec2(-1, kSplitterWidth)); if (ImGui::IsItemActive()) { bottom_panes_height -= io.MouseDelta.y; - bottom_panes_height = xe::clamp(bottom_panes_height, 30.0f, FLT_MAX); + bottom_panes_height = xe::clamp_float(bottom_panes_height, 30.0f, FLT_MAX); } ImGui::BeginChild("##log_pane", ImVec2(log_pane_width, bottom_panes_height), true); @@ -245,7 +246,8 @@ void DebugWindow::DrawFrame(ImGuiIO& io) { ImVec2(kSplitterWidth, bottom_panes_height)); if (ImGui::IsItemActive()) { breakpoints_pane_width -= io.MouseDelta.x; - breakpoints_pane_width = xe::clamp(breakpoints_pane_width, 30.0f, FLT_MAX); + breakpoints_pane_width = + xe::clamp_float(breakpoints_pane_width, 30.0f, FLT_MAX); } ImGui::SameLine(); ImGui::BeginChild("##breakpoints_pane", ImVec2(0, 0), true); diff --git a/src/xenia/gpu/draw_util.cc b/src/xenia/gpu/draw_util.cc index 73494a7f2..1b5671fcb 100644 --- a/src/xenia/gpu/draw_util.cc +++ b/src/xenia/gpu/draw_util.cc @@ -399,16 +399,11 @@ void GetHostViewportInfo(const RegisterFile& regs, float offset_axis = offset_base_xy[i] + offset_add_xy[i]; float scale_axis = scale_xy[i]; float scale_axis_abs = std::abs(scale_xy[i]); - float axis_0 = offset_axis - scale_axis_abs; - float axis_1 = offset_axis + scale_axis_abs; float axis_max_unscaled_float = float(xy_max_unscaled[i]); - // max(0.0f, xy) drops NaN and < 0 - max picks the first argument in the - // !(a < b) case (always for NaN), min as float (axis_max_unscaled_float - // is well below 2^24) to safely drop very large values. 
- uint32_t axis_0_int = - uint32_t(std::min(axis_max_unscaled_float, std::max(0.0f, axis_0))); - uint32_t axis_1_int = - uint32_t(std::min(axis_max_unscaled_float, std::max(0.0f, axis_1))); + uint32_t axis_0_int = uint32_t(xe::clamp_float( + offset_axis - scale_axis_abs, 0.0f, axis_max_unscaled_float)); + uint32_t axis_1_int = uint32_t(xe::clamp_float( + offset_axis + scale_axis_abs, 0.0f, axis_max_unscaled_float)); uint32_t axis_extent_int = axis_1_int - axis_0_int; viewport_info_out.xy_offset[i] = axis_0_int * axis_resolution_scale; viewport_info_out.xy_extent[i] = axis_extent_int * axis_resolution_scale; @@ -511,8 +506,8 @@ void GetHostViewportInfo(const RegisterFile& regs, // extension. But cases when this really matters are yet to be found - // trying to fix this will result in more correct depth values, but // incorrect clipping. - z_min = xe::saturate_unsigned(host_clip_offset_z); - z_max = xe::saturate_unsigned(host_clip_offset_z + host_clip_scale_z); + z_min = xe::saturate(host_clip_offset_z); + z_max = xe::saturate(host_clip_offset_z + host_clip_scale_z); // Direct3D 12 doesn't allow reverse depth range - on some drivers it // works, on some drivers it doesn't, actually, but it was never // explicitly allowed by the specification. 
@@ -877,10 +872,10 @@ bool GetResolveInfo(const RegisterFile& regs, const Memory& memory, GetScissor(regs, scissor, false); int32_t scissor_right = int32_t(scissor.offset[0] + scissor.extent[0]); int32_t scissor_bottom = int32_t(scissor.offset[1] + scissor.extent[1]); - x0 = xe::clamp(x0, int32_t(scissor.offset[0]), scissor_right); - y0 = xe::clamp(y0, int32_t(scissor.offset[1]), scissor_bottom); - x1 = xe::clamp(x1, int32_t(scissor.offset[0]), scissor_right); - y1 = xe::clamp(y1, int32_t(scissor.offset[1]), scissor_bottom); + x0 = std::clamp(x0, int32_t(scissor.offset[0]), scissor_right); + y0 = std::clamp(y0, int32_t(scissor.offset[1]), scissor_bottom); + x1 = std::clamp(x1, int32_t(scissor.offset[0]), scissor_right); + y1 = std::clamp(y1, int32_t(scissor.offset[1]), scissor_bottom); assert_true(x0 <= x1 && y0 <= y1); diff --git a/src/xenia/gpu/shader_interpreter.cc b/src/xenia/gpu/shader_interpreter.cc index 9a1342aca..aaf7b2ebd 100644 --- a/src/xenia/gpu/shader_interpreter.cc +++ b/src/xenia/gpu/shader_interpreter.cc @@ -346,16 +346,18 @@ void ShaderInterpreter::ExecuteAluInstruction(ucode::AluInstruction instr) { } break; case ucode::AluVectorOpcode::kMax: { for (uint32_t i = 0; i < 4; ++i) { - vector_result[i] = vector_operands[0][i] >= vector_operands[1][i] - ? vector_operands[0][i] - : vector_operands[1][i]; + vector_result[i] = + std::isgreaterequal(vector_operands[0][i], vector_operands[1][i]) + ? vector_operands[0][i] + : vector_operands[1][i]; } } break; case ucode::AluVectorOpcode::kMin: { for (uint32_t i = 0; i < 4; ++i) { - vector_result[i] = vector_operands[0][i] < vector_operands[1][i] - ? vector_operands[0][i] - : vector_operands[1][i]; + vector_result[i] = + std::isless(vector_operands[0][i], vector_operands[1][i]) + ? 
vector_operands[0][i] + : vector_operands[1][i]; } } break; case ucode::AluVectorOpcode::kSeq: { @@ -366,14 +368,14 @@ void ShaderInterpreter::ExecuteAluInstruction(ucode::AluInstruction instr) { } break; case ucode::AluVectorOpcode::kSgt: { for (uint32_t i = 0; i < 4; ++i) { - vector_result[i] = - float(vector_operands[0][i] > vector_operands[1][i]); + vector_result[i] = float( + std::isgreater(vector_operands[0][i], vector_operands[1][i])); } } break; case ucode::AluVectorOpcode::kSge: { for (uint32_t i = 0; i < 4; ++i) { - vector_result[i] = - float(vector_operands[0][i] >= vector_operands[1][i]); + vector_result[i] = float(std::isgreaterequal(vector_operands[0][i], + vector_operands[1][i])); } } break; case ucode::AluVectorOpcode::kSne: { @@ -419,14 +421,14 @@ void ShaderInterpreter::ExecuteAluInstruction(ucode::AluInstruction instr) { } break; case ucode::AluVectorOpcode::kCndGe: { for (uint32_t i = 0; i < 4; ++i) { - vector_result[i] = vector_operands[0][i] >= 0.0f + vector_result[i] = std::isgreaterequal(vector_operands[0][i], 0.0f) ? vector_operands[1][i] : vector_operands[2][i]; } } break; case ucode::AluVectorOpcode::kCndGt: { for (uint32_t i = 0; i < 4; ++i) { - vector_result[i] = vector_operands[0][i] > 0.0f + vector_result[i] = std::isgreater(vector_operands[0][i], 0.0f) ? vector_operands[1][i] : vector_operands[2][i]; } @@ -478,32 +480,38 @@ void ShaderInterpreter::ExecuteAluInstruction(ucode::AluInstruction instr) { float x_abs = std::abs(x), y_abs = std::abs(y), z_abs = std::abs(z); // Result is T coordinate, S coordinate, 2 * major axis, face ID. if (z_abs >= x_abs && z_abs >= y_abs) { + bool z_negative = std::isless(z, 0.0f); vector_result[0] = -y; - vector_result[1] = z < 0.0f ? -x : x; + vector_result[1] = z_negative ? -x : x; vector_result[2] = z; - vector_result[3] = z < 0.0f ? 5.0f : 4.0f; + vector_result[3] = z_negative ? 5.0f : 4.0f; } else if (y_abs >= x_abs) { - vector_result[0] = y < 0.0f ? 
-z : z; + bool y_negative = std::isless(y, 0.0f); + vector_result[0] = y_negative ? -z : z; vector_result[1] = x; vector_result[2] = y; - vector_result[3] = y < 0.0f ? 3.0f : 2.0f; + vector_result[3] = y_negative ? 3.0f : 2.0f; } else { + bool x_negative = std::isless(x, 0.0f); vector_result[0] = -y; - vector_result[1] = x < 0.0f ? z : -z; + vector_result[1] = x_negative ? z : -z; vector_result[2] = x; - vector_result[3] = x < 0.0f ? 1.0f : 0.0f; + vector_result[3] = x_negative ? 1.0f : 0.0f; } vector_result[2] *= 2.0f; } break; case ucode::AluVectorOpcode::kMax4: { - if (vector_operands[0][0] >= vector_operands[0][1] && - vector_operands[0][0] >= vector_operands[0][2] && - vector_operands[0][0] >= vector_operands[0][3]) { + if (std::isgreaterequal(vector_operands[0][0], vector_operands[0][1]) && + std::isgreaterequal(vector_operands[0][0], vector_operands[0][2]) && + std::isgreaterequal(vector_operands[0][0], vector_operands[0][3])) { vector_result[0] = vector_operands[0][0]; - } else if (vector_operands[0][1] >= vector_operands[0][2] && - vector_operands[0][1] >= vector_operands[0][3]) { + } else if (std::isgreaterequal(vector_operands[0][1], + vector_operands[0][2]) && + std::isgreaterequal(vector_operands[0][1], + vector_operands[0][3])) { vector_result[0] = vector_operands[0][1]; - } else if (vector_operands[0][2] >= vector_operands[0][3]) { + } else if (std::isgreaterequal(vector_operands[0][2], + vector_operands[0][3])) { vector_result[0] = vector_operands[0][2]; } else { vector_result[0] = vector_operands[0][3]; @@ -529,21 +537,21 @@ void ShaderInterpreter::ExecuteAluInstruction(ucode::AluInstruction instr) { replicate_vector_result_x = true; } break; case ucode::AluVectorOpcode::kSetpGtPush: { - state_.predicate = - vector_operands[0][3] == 0.0f && vector_operands[1][3] > 0.0f; - vector_result[0] = - (vector_operands[0][0] == 0.0f && vector_operands[1][0] > 0.0f) - ? 
0.0f - : vector_operands[0][0] + 1.0f; + state_.predicate = vector_operands[0][3] == 0.0f && + std::isgreater(vector_operands[1][3], 0.0f); + vector_result[0] = (vector_operands[0][0] == 0.0f && + std::isgreater(vector_operands[1][0], 0.0f)) + ? 0.0f + : vector_operands[0][0] + 1.0f; replicate_vector_result_x = true; } break; case ucode::AluVectorOpcode::kSetpGePush: { - state_.predicate = - vector_operands[0][3] == 0.0f && vector_operands[1][3] >= 0.0f; - vector_result[0] = - (vector_operands[0][0] == 0.0f && vector_operands[1][0] >= 0.0f) - ? 0.0f - : vector_operands[0][0] + 1.0f; + state_.predicate = vector_operands[0][3] == 0.0f && + std::isgreaterequal(vector_operands[1][3], 0.0f); + vector_result[0] = (vector_operands[0][0] == 0.0f && + std::isgreaterequal(vector_operands[1][0], 0.0f)) + ? 0.0f + : vector_operands[0][0] + 1.0f; replicate_vector_result_x = true; } break; // Not implementing pixel kill currently, the interpreter is currently @@ -557,19 +565,19 @@ void ShaderInterpreter::ExecuteAluInstruction(ucode::AluInstruction instr) { replicate_vector_result_x = true; } break; case ucode::AluVectorOpcode::kKillGt: { - vector_result[0] = - float(vector_operands[0][0] > vector_operands[1][0] || - vector_operands[0][1] > vector_operands[1][1] || - vector_operands[0][2] > vector_operands[1][2] || - vector_operands[0][3] > vector_operands[1][3]); + vector_result[0] = float( + std::isgreater(vector_operands[0][0], vector_operands[1][0]) || + std::isgreater(vector_operands[0][1], vector_operands[1][1]) || + std::isgreater(vector_operands[0][2], vector_operands[1][2]) || + std::isgreater(vector_operands[0][3], vector_operands[1][3])); replicate_vector_result_x = true; } break; case ucode::AluVectorOpcode::kKillGe: { - vector_result[0] = - float(vector_operands[0][0] >= vector_operands[1][0] || - vector_operands[0][1] >= vector_operands[1][1] || - vector_operands[0][2] >= vector_operands[1][2] || - vector_operands[0][3] >= vector_operands[1][3]); + vector_result[0] 
= float( + std::isgreaterequal(vector_operands[0][0], vector_operands[1][0]) || + std::isgreaterequal(vector_operands[0][1], vector_operands[1][1]) || + std::isgreaterequal(vector_operands[0][2], vector_operands[1][2]) || + std::isgreaterequal(vector_operands[0][3], vector_operands[1][3])); replicate_vector_result_x = true; } break; case ucode::AluVectorOpcode::kKillNe: { @@ -590,14 +598,13 @@ void ShaderInterpreter::ExecuteAluInstruction(ucode::AluInstruction instr) { vector_result[3] = vector_operands[1][3]; } break; case ucode::AluVectorOpcode::kMaxA: { - // std::max is `a < b ? b : a`, thus in case of NaN, the first argument - // (-256.0f) is always the result. state_.address_register = int32_t(std::floor( - std::min(255.0f, std::max(-256.0f, vector_operands[0][3])) + 0.5f)); + xe::clamp_float(vector_operands[0][3], -256.0f, 255.0f) + 0.5f)); for (uint32_t i = 0; i < 4; ++i) { - vector_result[i] = vector_operands[0][i] >= vector_operands[1][i] - ? vector_operands[0][i] - : vector_operands[1][i]; + vector_result[i] = + std::isgreaterequal(vector_operands[0][i], vector_operands[1][i]) + ? vector_operands[0][i] + : vector_operands[1][i]; } } break; default: { @@ -702,7 +709,8 @@ void ShaderInterpreter::ExecuteAluInstruction(ucode::AluInstruction instr) { case ucode::AluScalarOpcode::kMulsPrev2: { if (state_.previous_scalar == -FLT_MAX || !std::isfinite(state_.previous_scalar) || - !std::isfinite(scalar_operands[1]) || scalar_operands[1] <= 0.0f) { + !std::isfinite(scalar_operands[1]) || + std::islessequal(scalar_operands[1], 0.0f)) { state_.previous_scalar = -FLT_MAX; } else { // Direct3D 9 behavior (0 or denormal * anything = +0). @@ -713,23 +721,26 @@ void ShaderInterpreter::ExecuteAluInstruction(ucode::AluInstruction instr) { } } break; case ucode::AluScalarOpcode::kMaxs: { - state_.previous_scalar = scalar_operands[0] >= scalar_operands[1] - ? 
scalar_operands[0] - : scalar_operands[1]; + state_.previous_scalar = + std::isgreaterequal(scalar_operands[0], scalar_operands[1]) + ? scalar_operands[0] + : scalar_operands[1]; } break; case ucode::AluScalarOpcode::kMins: { - state_.previous_scalar = scalar_operands[0] >= scalar_operands[1] - ? scalar_operands[0] - : scalar_operands[1]; + state_.previous_scalar = + std::isless(scalar_operands[0], scalar_operands[1]) + ? scalar_operands[0] + : scalar_operands[1]; } break; case ucode::AluScalarOpcode::kSeqs: { state_.previous_scalar = float(scalar_operands[0] == 0.0f); } break; case ucode::AluScalarOpcode::kSgts: { - state_.previous_scalar = float(scalar_operands[0] > 0.0f); + state_.previous_scalar = float(std::isgreater(scalar_operands[0], 0.0f)); } break; case ucode::AluScalarOpcode::kSges: { - state_.previous_scalar = float(scalar_operands[0] >= 0.0f); + state_.previous_scalar = + float(std::isgreaterequal(scalar_operands[0], 0.0f)); } break; case ucode::AluScalarOpcode::kSnes: { state_.previous_scalar = float(scalar_operands[0] != 0.0f); @@ -795,22 +806,20 @@ void ShaderInterpreter::ExecuteAluInstruction(ucode::AluInstruction instr) { state_.previous_scalar = 1.0f / std::sqrt(scalar_operands[0]); } break; case ucode::AluScalarOpcode::kMaxAs: { - // std::max is `a < b ? b : a`, thus in case of NaN, the first argument - // (-256.0f) is always the result. state_.address_register = int32_t(std::floor( - std::min(255.0f, std::max(-256.0f, scalar_operands[0])) + 0.5f)); - state_.previous_scalar = scalar_operands[0] >= scalar_operands[1] - ? scalar_operands[0] - : scalar_operands[1]; + xe::clamp_float(scalar_operands[0], -256.0f, 255.0f) + 0.5f)); + state_.previous_scalar = + std::isgreaterequal(scalar_operands[0], scalar_operands[1]) + ? scalar_operands[0] + : scalar_operands[1]; } break; case ucode::AluScalarOpcode::kMaxAsf: { - // std::max is `a < b ? b : a`, thus in case of NaN, the first argument - // (-256.0f) is always the result. 
state_.address_register = int32_t( - std::floor(std::min(255.0f, std::max(-256.0f, scalar_operands[0])))); - state_.previous_scalar = scalar_operands[0] >= scalar_operands[1] - ? scalar_operands[0] - : scalar_operands[1]; + std::floor(xe::clamp_float(scalar_operands[0], -256.0f, 255.0f))); + state_.previous_scalar = + std::isgreaterequal(scalar_operands[0], scalar_operands[1]) + ? scalar_operands[0] + : scalar_operands[1]; } break; case ucode::AluScalarOpcode::kSubs: case ucode::AluScalarOpcode::kSubsc0: @@ -829,11 +838,11 @@ void ShaderInterpreter::ExecuteAluInstruction(ucode::AluInstruction instr) { state_.previous_scalar = float(!state_.predicate); } break; case ucode::AluScalarOpcode::kSetpGt: { - state_.predicate = scalar_operands[0] > 0.0f; + state_.predicate = std::isgreater(scalar_operands[0], 0.0f); state_.previous_scalar = float(!state_.predicate); } break; case ucode::AluScalarOpcode::kSetpGe: { - state_.predicate = scalar_operands[0] >= 0.0f; + state_.predicate = std::isgreaterequal(scalar_operands[0], 0.0f); state_.previous_scalar = float(!state_.predicate); } break; case ucode::AluScalarOpcode::kSetpInv: { @@ -845,7 +854,7 @@ void ShaderInterpreter::ExecuteAluInstruction(ucode::AluInstruction instr) { } break; case ucode::AluScalarOpcode::kSetpPop: { float new_counter = scalar_operands[0] - 1.0f; - state_.predicate = new_counter <= 0.0f; + state_.predicate = std::islessequal(new_counter, 0.0f); state_.previous_scalar = state_.predicate ? 
0.0f : new_counter; } break; case ucode::AluScalarOpcode::kSetpClr: { @@ -862,10 +871,11 @@ void ShaderInterpreter::ExecuteAluInstruction(ucode::AluInstruction instr) { state_.previous_scalar = float(scalar_operands[0] == 0.0f); } break; case ucode::AluScalarOpcode::kKillsGt: { - state_.previous_scalar = float(scalar_operands[0] > 0.0f); + state_.previous_scalar = float(std::isgreater(scalar_operands[0], 0.0f)); } break; case ucode::AluScalarOpcode::kKillsGe: { - state_.previous_scalar = float(scalar_operands[0] >= 0.0f); + state_.previous_scalar = + float(std::isgreaterequal(scalar_operands[0], 0.0f)); } break; case ucode::AluScalarOpcode::kKillsNe: { state_.previous_scalar = float(scalar_operands[0] != 0.0f); @@ -891,11 +901,11 @@ void ShaderInterpreter::ExecuteAluInstruction(ucode::AluInstruction instr) { if (instr.vector_clamp()) { for (uint32_t i = 0; i < 4; ++i) { - vector_result[i] = xe::saturate_unsigned(vector_result[i]); + vector_result[i] = xe::saturate(vector_result[i]); } } float scalar_result = instr.scalar_clamp() - ? xe::saturate_unsigned(state_.previous_scalar) + ? 
xe::saturate(state_.previous_scalar) : state_.previous_scalar; uint32_t scalar_result_write_mask = instr.GetScalarOpResultWriteMask(); diff --git a/src/xenia/gpu/trace_viewer.cc b/src/xenia/gpu/trace_viewer.cc index 178c30fc9..754942411 100644 --- a/src/xenia/gpu/trace_viewer.cc +++ b/src/xenia/gpu/trace_viewer.cc @@ -1066,7 +1066,7 @@ void ProgressBar(float frac, float width, float height = 0, if (height == 0) { height = ImGui::GetTextLineHeightWithSpacing(); } - frac = xe::saturate_unsigned(frac); + frac = xe::saturate(frac); const auto fontAtlas = ImGui::GetIO().Fonts; diff --git a/src/xenia/gpu/xenos.cc b/src/xenia/gpu/xenos.cc index ce7f6177d..a2ab6cea5 100644 --- a/src/xenia/gpu/xenos.cc +++ b/src/xenia/gpu/xenos.cc @@ -27,7 +27,7 @@ namespace xenos { float PWLGammaToLinear(float gamma) { // Not found in game executables, so just using the logic similar to that in // the Source Engine. - gamma = xe::saturate_unsigned(gamma); + gamma = xe::saturate(gamma); float scale, offset; // While the compiled code for linear to gamma conversion uses `vcmpgtfp // constant, value` comparison (constant > value, or value < constant), it's @@ -68,7 +68,7 @@ float PWLGammaToLinear(float gamma) { } float LinearToPWLGamma(float linear) { - linear = xe::saturate_unsigned(linear); + linear = xe::saturate(linear); float scale, offset; // While the compiled code uses `vcmpgtfp constant, value` comparison // (constant > value, or value < constant), it's preferable to use `value >= diff --git a/src/xenia/ui/immediate_drawer.cc b/src/xenia/ui/immediate_drawer.cc index fb00be77f..4d3c6bb4e 100644 --- a/src/xenia/ui/immediate_drawer.cc +++ b/src/xenia/ui/immediate_drawer.cc @@ -12,6 +12,7 @@ #include #include "xenia/base/assert.h" +#include "xenia/base/math.h" #include "xenia/ui/graphics_util.h" #include "xenia/ui/presenter.h" @@ -67,24 +68,19 @@ bool ImmediateDrawer::ScissorToRenderTarget(const ImmediateDraw& immediate_draw, } float render_target_width_float = 
float(render_target_width); float render_target_height_float = float(render_target_height); - // Scale to render target coordinates, drop NaNs (by doing - // std::max(0.0f, variable) in this argument order), and clamp to the render + // Scale to render target coordinates, drop NaNs, and clamp to the render // target size, below which the values are representable as 16p8 fixed-point. float scale_x = render_target_width / coordinate_space_width(); float scale_y = render_target_height / coordinate_space_height(); - float x0_float = - std::min(render_target_width_float, - std::max(0.0f, immediate_draw.scissor_left * scale_x)); - float y0_float = - std::min(render_target_height_float, - std::max(0.0f, immediate_draw.scissor_top * scale_y)); + float x0_float = xe::clamp_float(immediate_draw.scissor_left * scale_x, 0.0f, + render_target_width_float); + float y0_float = xe::clamp_float(immediate_draw.scissor_top * scale_y, 0.0f, + render_target_height_float); // Also make sure the size is non-negative. - float x1_float = - std::min(render_target_width_float, - std::max(x0_float, immediate_draw.scissor_right * scale_x)); - float y1_float = - std::min(render_target_height_float, - std::max(y0_float, immediate_draw.scissor_bottom * scale_y)); + float x1_float = xe::clamp_float(immediate_draw.scissor_right * scale_x, + x0_float, render_target_width_float); + float y1_float = xe::clamp_float(immediate_draw.scissor_bottom * scale_y, + y0_float, render_target_height_float); // Top-left - include .5 (0.128 treated as 0 covered, 0.129 as 0 not covered). 
int32_t x0 = (FloatToD3D11Fixed16p8(x0_float) + 127) >> 8; int32_t y0 = (FloatToD3D11Fixed16p8(y0_float) + 127) >> 8; diff --git a/src/xenia/ui/window_android.cc b/src/xenia/ui/window_android.cc index d67d478d1..8de82f400 100644 --- a/src/xenia/ui/window_android.cc +++ b/src/xenia/ui/window_android.cc @@ -153,16 +153,16 @@ bool AndroidWindow::OnActivitySurfaceMotionEvent(jobject event) { // with out-of-bounds coordinates), when moving the mouse outside the // View, or when starting moving the mouse when the pointer was previously // outside the View in some cases. - int32_t mouse_x = int32_t( - std::min(float(GetActualPhysicalWidth()), - std::max(0.0f, jni_env->CallFloatMethod( - event, jni_ids.motion_event_get_x, 0))) + - 0.5f); - int32_t mouse_y = int32_t( - std::min(float(GetActualPhysicalHeight()), - std::max(0.0f, jni_env->CallFloatMethod( - event, jni_ids.motion_event_get_y, 0))) + - 0.5f); + int32_t mouse_x = + int32_t(xe::clamp_float(jni_env->CallFloatMethod( + event, jni_ids.motion_event_get_x, 0), + 0.0f, float(GetActualPhysicalWidth())) + + 0.5f); + int32_t mouse_y = + int32_t(xe::clamp_float(jni_env->CallFloatMethod( + event, jni_ids.motion_event_get_y, 0), + 0.0f, float(GetActualPhysicalHeight())) + + 0.5f); static const MouseEvent::Button kMouseEventButtons[] = { MouseEvent::Button::kLeft, MouseEvent::Button::kRight, MouseEvent::Button::kMiddle, MouseEvent::Button::kX1, From 3189a0e259938fc3dbedc16a63a0b860e772dc7d Mon Sep 17 00:00:00 2001 From: Triang3l Date: Sun, 12 May 2024 20:26:14 +0300 Subject: [PATCH 5/8] [GPU] Check memexport stream constant upper bits in range gathering --- src/xenia/gpu/draw_util.cc | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/xenia/gpu/draw_util.cc b/src/xenia/gpu/draw_util.cc index 1b5671fcb..b9f70ef1d 100644 --- a/src/xenia/gpu/draw_util.cc +++ b/src/xenia/gpu/draw_util.cc @@ -659,7 +659,14 @@ void AddMemExportRanges(const RegisterFile& regs, const Shader& shader, for (uint32_t 
constant_index : shader.memexport_stream_constants()) { xenos::xe_gpu_memexport_stream_t stream = regs.GetMemExportStream(float_constants_base + constant_index); - if (!stream.index_count) { + // Safety checks for stream constants potentially not set up if the export + // isn't done on the control flow path taken by the shader (not checking the + // Y component because the index is more likely to be constructed + // arbitrarily). + // The hardware validates the upper bits of eA according to the + // IPR2015-00325 sequencer specification. + if (stream.const_0x1 != 0x1 || stream.const_0x4b0 != 0x4B0 || + stream.const_0x96 != 0x96 || !stream.index_count) { continue; } const FormatInfo& format_info = From 8e7301f4d80e97eb03a21a16d0801e12a5c91c48 Mon Sep 17 00:00:00 2001 From: Triang3l Date: Thu, 16 May 2024 23:04:48 +0300 Subject: [PATCH 6/8] [SPIR-V] Use a helper class for most if/else branching Simplifies emission of the blocks themselves (including inserting blocks into the function's block list in the correct order), as well as phi after the branching. Also fixes 64bpp storing with blending in the fragment shader interlock render backend implementation (had a typo that caused the high 32 bits to overwrite the low ones). 
--- src/xenia/gpu/spirv_builder.cc | 102 ++ src/xenia/gpu/spirv_builder.h | 57 + src/xenia/gpu/spirv_shader_translator.cc | 298 +++--- src/xenia/gpu/spirv_shader_translator.h | 2 +- src/xenia/gpu/spirv_shader_translator_alu.cc | 83 +- .../gpu/spirv_shader_translator_fetch.cc | 774 ++++++-------- src/xenia/gpu/spirv_shader_translator_rb.cc | 983 +++++++----------- .../gpu/vulkan/vulkan_render_target_cache.cc | 23 +- 8 files changed, 998 insertions(+), 1324 deletions(-) diff --git a/src/xenia/gpu/spirv_builder.cc b/src/xenia/gpu/spirv_builder.cc index 2ba9446bc..2ed78bd65 100644 --- a/src/xenia/gpu/spirv_builder.cc +++ b/src/xenia/gpu/spirv_builder.cc @@ -13,6 +13,8 @@ #include #include +#include "xenia/base/assert.h" + namespace xe { namespace gpu { @@ -101,5 +103,105 @@ spv::Id SpirvBuilder::createTriBuiltinCall(spv::Id result_type, return result; } +SpirvBuilder::IfBuilder::IfBuilder(spv::Id condition, unsigned int control, + SpirvBuilder& builder, + unsigned int thenWeight, + unsigned int elseWeight) + : builder(builder), + condition(condition), + control(control), + thenWeight(thenWeight), + elseWeight(elseWeight), + function(builder.getBuildPoint()->getParent()) { + // Make the blocks, but only put the then-block into the function, the + // else-block and merge-block will be added later, in order, after earlier + // code is emitted. + thenBlock = new spv::Block(builder.getUniqueId(), function); + elseBlock = nullptr; + mergeBlock = new spv::Block(builder.getUniqueId(), function); + + // Save the current block, so that we can add in the flow control split when + // makeEndIf is called. 
+ headerBlock = builder.getBuildPoint(); + + spv::Id headerBlockId = headerBlock->getId(); + thenPhiParent = headerBlockId; + elsePhiParent = headerBlockId; + + function.addBlock(thenBlock); + builder.setBuildPoint(thenBlock); +} + +void SpirvBuilder::IfBuilder::makeBeginElse(bool branchToMerge) { +#ifndef NDEBUG + assert_true(currentBranch == Branch::kThen); +#endif + + if (branchToMerge) { + // Close out the "then" by having it jump to the mergeBlock. + thenPhiParent = builder.getBuildPoint()->getId(); + builder.createBranch(mergeBlock); + } + + // Make the first else block and add it to the function. + elseBlock = new spv::Block(builder.getUniqueId(), function); + function.addBlock(elseBlock); + + // Start building the else block. + builder.setBuildPoint(elseBlock); + +#ifndef NDEBUG + currentBranch = Branch::kElse; +#endif +} + +void SpirvBuilder::IfBuilder::makeEndIf(bool branchToMerge) { +#ifndef NDEBUG + assert_true(currentBranch == Branch::kThen || currentBranch == Branch::kElse); +#endif + + if (branchToMerge) { + // Jump to the merge block. + (elseBlock ? elsePhiParent : thenPhiParent) = + builder.getBuildPoint()->getId(); + builder.createBranch(mergeBlock); + } + + // Go back to the headerBlock and make the flow control split. + builder.setBuildPoint(headerBlock); + builder.createSelectionMerge(mergeBlock, control); + { + spv::Block* falseBlock = elseBlock ? elseBlock : mergeBlock; + std::unique_ptr<spv::Instruction> branch = + std::make_unique<spv::Instruction>(spv::OpBranchConditional); + branch->addIdOperand(condition); + branch->addIdOperand(thenBlock->getId()); + branch->addIdOperand(falseBlock->getId()); + if (thenWeight || elseWeight) { + branch->addImmediateOperand(thenWeight); + branch->addImmediateOperand(elseWeight); + } + builder.getBuildPoint()->addInstruction(std::move(branch)); + thenBlock->addPredecessor(builder.getBuildPoint()); + falseBlock->addPredecessor(builder.getBuildPoint()); + } + + // Add the merge block to the function.
+ function.addBlock(mergeBlock); + builder.setBuildPoint(mergeBlock); + +#ifndef NDEBUG + currentBranch = Branch::kMerge; +#endif +} + +spv::Id SpirvBuilder::IfBuilder::createMergePhi(spv::Id then_variable, + spv::Id else_variable) const { + assert_true(builder.getBuildPoint() == mergeBlock); + return builder.createQuadOp(spv::OpPhi, builder.getTypeId(then_variable), + then_variable, getThenPhiParent(), else_variable, + getElsePhiParent()); +} + } // namespace gpu } // namespace xe diff --git a/src/xenia/gpu/spirv_builder.h b/src/xenia/gpu/spirv_builder.h index 0496aa7c4..1bb2e6851 100644 --- a/src/xenia/gpu/spirv_builder.h +++ b/src/xenia/gpu/spirv_builder.h @@ -10,7 +10,10 @@ #ifndef XENIA_GPU_SPIRV_BUILDER_H_ #define XENIA_GPU_SPIRV_BUILDER_H_ +#include + #include "third_party/glslang/SPIRV/SpvBuilder.h" +#include "xenia/base/assert.h" namespace xe { namespace gpu { @@ -42,6 +45,60 @@ class SpirvBuilder : public spv::Builder { spv::Id createTriBuiltinCall(spv::Id result_type, spv::Id builtins, int entry_point, spv::Id operand1, spv::Id operand2, spv::Id operand3); + + // Helper to use for building nested control flow with if-then-else with + // additions over SpvBuilder::If. + class IfBuilder { + public: + IfBuilder(spv::Id condition, unsigned int control, SpirvBuilder& builder, + unsigned int thenWeight = 0, unsigned int elseWeight = 0); + + ~IfBuilder() { +#ifndef NDEBUG + assert_true(currentBranch == Branch::kMerge); +#endif + } + + void makeBeginElse(bool branchToMerge = true); + void makeEndIf(bool branchToMerge = true); + + // If there's no then/else block that branches to the merge block, the phi + // parent is the header block - this simplifies then-only usage. 
+ spv::Id getThenPhiParent() const { return thenPhiParent; } + spv::Id getElsePhiParent() const { return elsePhiParent; } + + spv::Id createMergePhi(spv::Id then_variable, spv::Id else_variable) const; + + private: + enum class Branch { + kThen, + kElse, + kMerge, + }; + + IfBuilder(const IfBuilder& ifBuilder) = delete; + IfBuilder& operator=(const IfBuilder& ifBuilder) = delete; + + SpirvBuilder& builder; + spv::Id condition; + unsigned int control; + unsigned int thenWeight; + unsigned int elseWeight; + + spv::Function& function; + + spv::Block* headerBlock; + spv::Block* thenBlock; + spv::Block* elseBlock; + spv::Block* mergeBlock; + + spv::Id thenPhiParent; + spv::Id elsePhiParent; + +#ifndef NDEBUG + Branch currentBranch = Branch::kThen; +#endif + }; }; } // namespace gpu diff --git a/src/xenia/gpu/spirv_shader_translator.cc b/src/xenia/gpu/spirv_shader_translator.cc index 8bcaa19fd..e34193219 100644 --- a/src/xenia/gpu/spirv_shader_translator.cc +++ b/src/xenia/gpu/spirv_shader_translator.cc @@ -1272,89 +1272,70 @@ void SpirvShaderTranslator::StartVertexOrTessEvalShaderInMain() { builder_->makeUintConstant(static_cast( kSysFlag_ComputeOrPrimitiveVertexIndexLoad))), const_uint_0_); - spv::Block& block_load_vertex_index_pre = *builder_->getBuildPoint(); - spv::Block& block_load_vertex_index_start = builder_->makeNewBlock(); - spv::Block& block_load_vertex_index_merge = builder_->makeNewBlock(); - builder_->createSelectionMerge(&block_load_vertex_index_merge, - spv::SelectionControlDontFlattenMask); - builder_->createConditionalBranch(load_vertex_index, - &block_load_vertex_index_start, - &block_load_vertex_index_merge); - builder_->setBuildPoint(&block_load_vertex_index_start); - // Check if the index is 32-bit. 
- spv::Id vertex_index_is_32bit = builder_->createBinOp( - spv::OpINotEqual, type_bool_, - builder_->createBinOp( - spv::OpBitwiseAnd, type_uint_, main_system_constant_flags_, - builder_->makeUintConstant(static_cast( - kSysFlag_ComputeOrPrimitiveVertexIndexLoad32Bit))), - const_uint_0_); - // Calculate the vertex index address in the shared memory. - id_vector_temp_.clear(); - id_vector_temp_.push_back( - builder_->makeIntConstant(kSystemConstantVertexIndexLoadAddress)); - spv::Id vertex_index_address = builder_->createBinOp( - spv::OpIAdd, type_uint_, - builder_->createLoad( - builder_->createAccessChain(spv::StorageClassUniform, - uniform_system_constants_, - id_vector_temp_), - spv::NoPrecision), - builder_->createBinOp( - spv::OpShiftLeftLogical, type_uint_, vertex_index, - builder_->createTriOp(spv::OpSelect, type_uint_, - vertex_index_is_32bit, const_uint_2, - builder_->makeUintConstant(1)))); - // Load the 32 bits containing the whole vertex index or two 16-bit - // vertex indices. - // TODO(Triang3l): Bounds checking. - spv::Id loaded_vertex_index = - LoadUint32FromSharedMemory(builder_->createUnaryOp( - spv::OpBitcast, type_int_, - builder_->createBinOp(spv::OpShiftRightLogical, type_uint_, - vertex_index_address, const_uint_2))); - // Extract the 16-bit index from the loaded 32 bits if needed. - loaded_vertex_index = builder_->createTriOp( - spv::OpSelect, type_uint_, vertex_index_is_32bit, - loaded_vertex_index, - builder_->createTriOp( - spv::OpBitFieldUExtract, type_uint_, loaded_vertex_index, - builder_->createBinOp( - spv::OpShiftLeftLogical, type_uint_, - builder_->createBinOp(spv::OpBitwiseAnd, type_uint_, - vertex_index_address, const_uint_2), - builder_->makeUintConstant(4 - 1)), - builder_->makeUintConstant(16))); - // Endian-swap the loaded index. 
- id_vector_temp_.clear(); - id_vector_temp_.push_back( - builder_->makeIntConstant(kSystemConstantVertexIndexEndian)); - loaded_vertex_index = EndianSwap32Uint( - loaded_vertex_index, - builder_->createLoad( - builder_->createAccessChain(spv::StorageClassUniform, - uniform_system_constants_, - id_vector_temp_), - spv::NoPrecision)); - // Get the actual build point for phi. - spv::Block& block_load_vertex_index_end = *builder_->getBuildPoint(); - builder_->createBranch(&block_load_vertex_index_merge); - // Select between the loaded index and the original index from Vulkan. - builder_->setBuildPoint(&block_load_vertex_index_merge); + SpirvBuilder::IfBuilder load_vertex_index_if( + load_vertex_index, spv::SelectionControlDontFlattenMask, *builder_); + spv::Id loaded_vertex_index; { - std::unique_ptr loaded_vertex_index_phi_op = - std::make_unique(builder_->getUniqueId(), - type_uint_, spv::OpPhi); - loaded_vertex_index_phi_op->addIdOperand(loaded_vertex_index); - loaded_vertex_index_phi_op->addIdOperand( - block_load_vertex_index_end.getId()); - loaded_vertex_index_phi_op->addIdOperand(vertex_index); - loaded_vertex_index_phi_op->addIdOperand( - block_load_vertex_index_pre.getId()); - vertex_index = loaded_vertex_index_phi_op->getResultId(); - builder_->getBuildPoint()->addInstruction( - std::move(loaded_vertex_index_phi_op)); + // Check if the index is 32-bit. + spv::Id vertex_index_is_32bit = builder_->createBinOp( + spv::OpINotEqual, type_bool_, + builder_->createBinOp( + spv::OpBitwiseAnd, type_uint_, main_system_constant_flags_, + builder_->makeUintConstant(static_cast( + kSysFlag_ComputeOrPrimitiveVertexIndexLoad32Bit))), + const_uint_0_); + // Calculate the vertex index address in the shared memory. 
+ id_vector_temp_.clear(); + id_vector_temp_.push_back( + builder_->makeIntConstant(kSystemConstantVertexIndexLoadAddress)); + spv::Id vertex_index_address = builder_->createBinOp( + spv::OpIAdd, type_uint_, + builder_->createLoad( + builder_->createAccessChain(spv::StorageClassUniform, + uniform_system_constants_, + id_vector_temp_), + spv::NoPrecision), + builder_->createBinOp( + spv::OpShiftLeftLogical, type_uint_, vertex_index, + builder_->createTriOp(spv::OpSelect, type_uint_, + vertex_index_is_32bit, const_uint_2, + builder_->makeUintConstant(1)))); + // Load the 32 bits containing the whole vertex index or two 16-bit + // vertex indices. + // TODO(Triang3l): Bounds checking. + loaded_vertex_index = + LoadUint32FromSharedMemory(builder_->createUnaryOp( + spv::OpBitcast, type_int_, + builder_->createBinOp(spv::OpShiftRightLogical, type_uint_, + vertex_index_address, const_uint_2))); + // Extract the 16-bit index from the loaded 32 bits if needed. + loaded_vertex_index = builder_->createTriOp( + spv::OpSelect, type_uint_, vertex_index_is_32bit, + loaded_vertex_index, + builder_->createTriOp( + spv::OpBitFieldUExtract, type_uint_, loaded_vertex_index, + builder_->createBinOp( + spv::OpShiftLeftLogical, type_uint_, + builder_->createBinOp(spv::OpBitwiseAnd, type_uint_, + vertex_index_address, const_uint_2), + builder_->makeUintConstant(4 - 1)), + builder_->makeUintConstant(16))); + // Endian-swap the loaded index. + id_vector_temp_.clear(); + id_vector_temp_.push_back( + builder_->makeIntConstant(kSystemConstantVertexIndexEndian)); + loaded_vertex_index = EndianSwap32Uint( + loaded_vertex_index, + builder_->createLoad( + builder_->createAccessChain(spv::StorageClassUniform, + uniform_system_constants_, + id_vector_temp_), + spv::NoPrecision)); } + load_vertex_index_if.makeEndIf(); + // Select between the loaded index and the original index from Vulkan. 
+ vertex_index = load_vertex_index_if.createMergePhi(loaded_vertex_index, + vertex_index); } else { // TODO(Triang3l): Close line loop primitive. // Load the unswapped index as uint for swapping, or for indirect @@ -1368,53 +1349,35 @@ void SpirvShaderTranslator::StartVertexOrTessEvalShaderInMain() { builder_->makeUintConstant( static_cast(kSysFlag_VertexIndexLoad))), const_uint_0_); - spv::Block& block_load_vertex_index_pre = *builder_->getBuildPoint(); - spv::Block& block_load_vertex_index_start = builder_->makeNewBlock(); - spv::Block& block_load_vertex_index_merge = builder_->makeNewBlock(); - builder_->createSelectionMerge(&block_load_vertex_index_merge, - spv::SelectionControlDontFlattenMask); - builder_->createConditionalBranch(load_vertex_index, - &block_load_vertex_index_start, - &block_load_vertex_index_merge); - builder_->setBuildPoint(&block_load_vertex_index_start); - // Load the 32-bit index. - // TODO(Triang3l): Bounds checking. - id_vector_temp_.clear(); - id_vector_temp_.push_back( - builder_->makeIntConstant(kSystemConstantVertexIndexLoadAddress)); - spv::Id loaded_vertex_index = - LoadUint32FromSharedMemory(builder_->createUnaryOp( - spv::OpBitcast, type_int_, - builder_->createBinOp( - spv::OpIAdd, type_uint_, - builder_->createBinOp( - spv::OpShiftRightLogical, type_uint_, - builder_->createLoad( - builder_->createAccessChain( - spv::StorageClassUniform, - uniform_system_constants_, id_vector_temp_), - spv::NoPrecision), - builder_->makeUintConstant(2)), - vertex_index))); - // Get the actual build point for phi. - spv::Block& block_load_vertex_index_end = *builder_->getBuildPoint(); - builder_->createBranch(&block_load_vertex_index_merge); - // Select between the loaded index and the original index from Vulkan. 
- builder_->setBuildPoint(&block_load_vertex_index_merge); + SpirvBuilder::IfBuilder load_vertex_index_if( + load_vertex_index, spv::SelectionControlDontFlattenMask, + *builder_); + spv::Id loaded_vertex_index; { - std::unique_ptr loaded_vertex_index_phi_op = - std::make_unique(builder_->getUniqueId(), - type_uint_, spv::OpPhi); - loaded_vertex_index_phi_op->addIdOperand(loaded_vertex_index); - loaded_vertex_index_phi_op->addIdOperand( - block_load_vertex_index_end.getId()); - loaded_vertex_index_phi_op->addIdOperand(vertex_index); - loaded_vertex_index_phi_op->addIdOperand( - block_load_vertex_index_pre.getId()); - vertex_index = loaded_vertex_index_phi_op->getResultId(); - builder_->getBuildPoint()->addInstruction( - std::move(loaded_vertex_index_phi_op)); + // Load the 32-bit index. + // TODO(Triang3l): Bounds checking. + id_vector_temp_.clear(); + id_vector_temp_.push_back(builder_->makeIntConstant( + kSystemConstantVertexIndexLoadAddress)); + loaded_vertex_index = + LoadUint32FromSharedMemory(builder_->createUnaryOp( + spv::OpBitcast, type_int_, + builder_->createBinOp( + spv::OpIAdd, type_uint_, + builder_->createBinOp( + spv::OpShiftRightLogical, type_uint_, + builder_->createLoad( + builder_->createAccessChain( + spv::StorageClassUniform, + uniform_system_constants_, id_vector_temp_), + spv::NoPrecision), + builder_->makeUintConstant(2)), + vertex_index))); } + load_vertex_index_if.makeEndIf(); + // Select between the loaded index and the original index from Vulkan. + vertex_index = load_vertex_index_if.createMergePhi( + loaded_vertex_index, vertex_index); } // Endian-swap the index. 
id_vector_temp_.clear(); @@ -2808,40 +2771,25 @@ spv::Id SpirvShaderTranslator::EndianSwap32Uint(spv::Id value, spv::Id endian) { static_cast(xenos::Endian::k8in32))); spv::Id is_8in16_or_8in32 = builder_->createBinOp(spv::OpLogicalOr, type_bool_, is_8in16, is_8in32); - spv::Block& block_pre_8in16 = *builder_->getBuildPoint(); - assert_false(block_pre_8in16.isTerminated()); - spv::Block& block_8in16 = builder_->makeNewBlock(); - spv::Block& block_8in16_merge = builder_->makeNewBlock(); - builder_->createSelectionMerge(&block_8in16_merge, - spv::SelectionControlMaskNone); - builder_->createConditionalBranch(is_8in16_or_8in32, &block_8in16, - &block_8in16_merge); - builder_->setBuildPoint(&block_8in16); - spv::Id swapped_8in16 = builder_->createBinOp( - spv::OpBitwiseOr, type, - builder_->createBinOp( - spv::OpBitwiseAnd, type, - builder_->createBinOp(spv::OpShiftRightLogical, type, value, - const_uint_8_typed), - const_uint_00ff00ff_typed), - builder_->createBinOp( - spv::OpShiftLeftLogical, type, - builder_->createBinOp(spv::OpBitwiseAnd, type, value, - const_uint_00ff00ff_typed), - const_uint_8_typed)); - builder_->createBranch(&block_8in16_merge); - builder_->setBuildPoint(&block_8in16_merge); + SpirvBuilder::IfBuilder if_8in16(is_8in16_or_8in32, + spv::SelectionControlMaskNone, *builder_); + spv::Id swapped_8in16; { - std::unique_ptr phi_op = - std::make_unique(builder_->getUniqueId(), type, - spv::OpPhi); - phi_op->addIdOperand(swapped_8in16); - phi_op->addIdOperand(block_8in16.getId()); - phi_op->addIdOperand(value); - phi_op->addIdOperand(block_pre_8in16.getId()); - value = phi_op->getResultId(); - builder_->getBuildPoint()->addInstruction(std::move(phi_op)); + swapped_8in16 = builder_->createBinOp( + spv::OpBitwiseOr, type, + builder_->createBinOp( + spv::OpBitwiseAnd, type, + builder_->createBinOp(spv::OpShiftRightLogical, type, value, + const_uint_8_typed), + const_uint_00ff00ff_typed), + builder_->createBinOp( + spv::OpShiftLeftLogical, type, + 
builder_->createBinOp(spv::OpBitwiseAnd, type, value, + const_uint_00ff00ff_typed), + const_uint_8_typed)); } + if_8in16.makeEndIf(); + value = if_8in16.createMergePhi(swapped_8in16, value); // 16-in-32 or another half of 8-in-32 (doing 16-in-32 swap). spv::Id is_16in32 = builder_->createBinOp( @@ -2850,32 +2798,18 @@ spv::Id SpirvShaderTranslator::EndianSwap32Uint(spv::Id value, spv::Id endian) { static_cast(xenos::Endian::k16in32))); spv::Id is_8in32_or_16in32 = builder_->createBinOp(spv::OpLogicalOr, type_bool_, is_8in32, is_16in32); - spv::Block& block_pre_16in32 = *builder_->getBuildPoint(); - spv::Block& block_16in32 = builder_->makeNewBlock(); - spv::Block& block_16in32_merge = builder_->makeNewBlock(); - builder_->createSelectionMerge(&block_16in32_merge, - spv::SelectionControlMaskNone); - builder_->createConditionalBranch(is_8in32_or_16in32, &block_16in32, - &block_16in32_merge); - builder_->setBuildPoint(&block_16in32); - spv::Id swapped_16in32 = builder_->createQuadOp( - spv::OpBitFieldInsert, type, - builder_->createBinOp(spv::OpShiftRightLogical, type, value, - const_uint_16_typed), - value, builder_->makeIntConstant(16), builder_->makeIntConstant(16)); - builder_->createBranch(&block_16in32_merge); - builder_->setBuildPoint(&block_16in32_merge); + SpirvBuilder::IfBuilder if_16in32(is_8in32_or_16in32, + spv::SelectionControlMaskNone, *builder_); + spv::Id swapped_16in32; { - std::unique_ptr phi_op = - std::make_unique(builder_->getUniqueId(), type, - spv::OpPhi); - phi_op->addIdOperand(swapped_16in32); - phi_op->addIdOperand(block_16in32.getId()); - phi_op->addIdOperand(value); - phi_op->addIdOperand(block_pre_16in32.getId()); - value = phi_op->getResultId(); - builder_->getBuildPoint()->addInstruction(std::move(phi_op)); + swapped_16in32 = builder_->createQuadOp( + spv::OpBitFieldInsert, type, + builder_->createBinOp(spv::OpShiftRightLogical, type, value, + const_uint_16_typed), + value, builder_->makeIntConstant(16), builder_->makeIntConstant(16)); 
} + if_16in32.makeEndIf(); + value = if_16in32.createMergePhi(swapped_16in32, value); return value; } diff --git a/src/xenia/gpu/spirv_shader_translator.h b/src/xenia/gpu/spirv_shader_translator.h index 0ed368ae4..8c4942156 100644 --- a/src/xenia/gpu/spirv_shader_translator.h +++ b/src/xenia/gpu/spirv_shader_translator.h @@ -605,7 +605,7 @@ class SpirvShaderTranslator : public ShaderTranslator { void SampleTexture(spv::Builder::TextureParameters& texture_parameters, spv::ImageOperandsMask image_operands_mask, spv::Id image_unsigned, spv::Id image_signed, - spv::Id sampler, spv::Id is_all_signed, + spv::Id sampler, spv::Id is_any_unsigned, spv::Id is_any_signed, spv::Id& result_unsigned_out, spv::Id& result_signed_out, spv::Id lerp_factor = spv::NoResult, diff --git a/src/xenia/gpu/spirv_shader_translator_alu.cc b/src/xenia/gpu/spirv_shader_translator_alu.cc index 05e41d5ab..ecc88f57b 100644 --- a/src/xenia/gpu/spirv_shader_translator_alu.cc +++ b/src/xenia/gpu/spirv_shader_translator_alu.cc @@ -40,30 +40,18 @@ spv::Id SpirvShaderTranslator::ZeroIfAnyOperandIsZero(spv::Id value, } void SpirvShaderTranslator::KillPixel(spv::Id condition) { - // Same calls as in spv::Builder::If. - spv::Function& function = builder_->getBuildPoint()->getParent(); - spv::Block* kill_block = new spv::Block(builder_->getUniqueId(), function); - spv::Block* merge_block = new spv::Block(builder_->getUniqueId(), function); - spv::Block& header_block = *builder_->getBuildPoint(); - - function.addBlock(kill_block); - builder_->setBuildPoint(kill_block); - // Kill without influencing the control flow in the translated shader. 
- if (var_main_kill_pixel_ != spv::NoResult) { - builder_->createStore(builder_->makeBoolConstant(true), - var_main_kill_pixel_); + SpirvBuilder::IfBuilder kill_if(condition, spv::SelectionControlMaskNone, + *builder_); + { + if (var_main_kill_pixel_ != spv::NoResult) { + builder_->createStore(builder_->makeBoolConstant(true), + var_main_kill_pixel_); + } + if (features_.demote_to_helper_invocation) { + builder_->createNoResultOp(spv::OpDemoteToHelperInvocationEXT); + } } - if (features_.demote_to_helper_invocation) { - builder_->createNoResultOp(spv::OpDemoteToHelperInvocationEXT); - } - builder_->createBranch(merge_block); - - builder_->setBuildPoint(&header_block); - builder_->createSelectionMerge(merge_block, spv::SelectionControlMaskNone); - builder_->createConditionalBranch(condition, kill_block, merge_block); - - function.addBlock(merge_block); - builder_->setBuildPoint(merge_block); + kill_if.makeEndIf(); } void SpirvShaderTranslator::ProcessAluInstruction( @@ -564,7 +552,7 @@ spv::Id SpirvShaderTranslator::ProcessVectorAluOperation( spv::Id ma_z_result[4] = {}, ma_yx_result[4] = {}; // Check if the major axis is Z (abs(z) >= abs(x) && abs(z) >= abs(y)). - spv::Builder::If ma_z_if( + SpirvBuilder::IfBuilder ma_z_if( builder_->createBinOp( spv::OpLogicalAnd, type_bool_, builder_->createBinOp(spv::OpFOrdGreaterThanEqual, type_bool_, @@ -596,14 +584,13 @@ spv::Id SpirvShaderTranslator::ProcessVectorAluOperation( } } } - spv::Block& ma_z_end_block = *builder_->getBuildPoint(); ma_z_if.makeBeginElse(); { spv::Id ma_y_result[4] = {}, ma_x_result[4] = {}; // The major axis is not Z - create an inner conditional to check if the // major axis is Y (abs(y) >= abs(x)). 
- spv::Builder::If ma_y_if( + SpirvBuilder::IfBuilder ma_y_if( builder_->createBinOp(spv::OpFOrdGreaterThanEqual, type_bool_, operand_abs[1], operand_abs[0]), spv::SelectionControlMaskNone, *builder_); @@ -629,7 +616,6 @@ spv::Id SpirvShaderTranslator::ProcessVectorAluOperation( } } } - spv::Block& ma_y_end_block = *builder_->getBuildPoint(); ma_y_if.makeBeginElse(); { // The major axis is X. @@ -654,7 +640,6 @@ spv::Id SpirvShaderTranslator::ProcessVectorAluOperation( } } } - spv::Block& ma_x_end_block = *builder_->getBuildPoint(); ma_y_if.makeEndIf(); // The major axis is Y or X - choose the options of the result from Y @@ -663,18 +648,10 @@ spv::Id SpirvShaderTranslator::ProcessVectorAluOperation( if (!(used_result_components & (1 << i))) { continue; } - std::unique_ptr phi_op = - std::make_unique(builder_->getUniqueId(), - type_float_, spv::OpPhi); - phi_op->addIdOperand(ma_y_result[i]); - phi_op->addIdOperand(ma_y_end_block.getId()); - phi_op->addIdOperand(ma_x_result[i]); - phi_op->addIdOperand(ma_x_end_block.getId()); - ma_yx_result[i] = phi_op->getResultId(); - builder_->getBuildPoint()->addInstruction(std::move(phi_op)); + ma_yx_result[i] = + ma_y_if.createMergePhi(ma_y_result[i], ma_x_result[i]); } } - spv::Block& ma_yx_end_block = *builder_->getBuildPoint(); ma_z_if.makeEndIf(); // Choose the result options from Z and YX cases. 
@@ -683,15 +660,8 @@ spv::Id SpirvShaderTranslator::ProcessVectorAluOperation( if (!(used_result_components & (1 << i))) { continue; } - std::unique_ptr phi_op = - std::make_unique(builder_->getUniqueId(), - type_float_, spv::OpPhi); - phi_op->addIdOperand(ma_z_result[i]); - phi_op->addIdOperand(ma_z_end_block.getId()); - phi_op->addIdOperand(ma_yx_result[i]); - phi_op->addIdOperand(ma_yx_end_block.getId()); - id_vector_temp_.push_back(phi_op->getResultId()); - builder_->getBuildPoint()->addInstruction(std::move(phi_op)); + id_vector_temp_.push_back( + ma_z_if.createMergePhi(ma_z_result[i], ma_yx_result[i])); } assert_true(id_vector_temp_.size() == used_result_component_count); if (used_result_components & 0b0100) { @@ -1044,10 +1014,9 @@ spv::Id SpirvShaderTranslator::ProcessScalarAluOperation( spv::OpLogicalAnd, type_bool_, condition, builder_->createBinOp(spv::OpFOrdGreaterThan, type_bool_, b, const_float_0_)); - spv::Block& pre_multiply_if_block = *builder_->getBuildPoint(); + SpirvBuilder::IfBuilder multiply_if( + condition, spv::SelectionControlMaskNone, *builder_); spv::Id product; - spv::Builder::If multiply_if(condition, spv::SelectionControlMaskNone, - *builder_); { // Multiplication case. spv::Id a = instr.scalar_operands[0].GetComponent(0) != @@ -1061,21 +1030,9 @@ spv::Id SpirvShaderTranslator::ProcessScalarAluOperation( product = ZeroIfAnyOperandIsZero( product, GetAbsoluteOperand(a, instr.scalar_operands[0]), ps_abs); } - spv::Block& multiply_end_block = *builder_->getBuildPoint(); multiply_if.makeEndIf(); // Merge - choose between the product and -FLT_MAX. 
- { - std::unique_ptr phi_op = - std::make_unique(builder_->getUniqueId(), - type_float_, spv::OpPhi); - phi_op->addIdOperand(product); - phi_op->addIdOperand(multiply_end_block.getId()); - phi_op->addIdOperand(const_float_max_neg); - phi_op->addIdOperand(pre_multiply_if_block.getId()); - spv::Id phi_result = phi_op->getResultId(); - builder_->getBuildPoint()->addInstruction(std::move(phi_op)); - return phi_result; - } + return multiply_if.createMergePhi(product, const_float_max_neg); } case ucode::AluScalarOpcode::kMaxs: diff --git a/src/xenia/gpu/spirv_shader_translator_fetch.cc b/src/xenia/gpu/spirv_shader_translator_fetch.cc index 265082ba1..8f5a74690 100644 --- a/src/xenia/gpu/spirv_shader_translator_fetch.cc +++ b/src/xenia/gpu/spirv_shader_translator_fetch.cc @@ -1145,31 +1145,18 @@ void SpirvShaderTranslator::ProcessTextureFetchInstruction( z_coordinate_ref = builder_->createNoContractionBinOp( spv::OpFAdd, type_float_, z_coordinate_ref, z_offset); } - spv::Block& block_dimension_head = *builder_->getBuildPoint(); - spv::Block& block_dimension_merge = builder_->makeNewBlock(); - spv::Block& block_dimension_3d = builder_->makeNewBlock(); - builder_->createSelectionMerge(&block_dimension_merge, - spv::SelectionControlDontFlattenMask); assert_true(data_is_3d != spv::NoResult); - builder_->createConditionalBranch(data_is_3d, &block_dimension_3d, - &block_dimension_merge); - builder_->setBuildPoint(&block_dimension_3d); - assert_true(z_size != spv::NoResult); - spv::Id z_3d = builder_->createNoContractionBinOp( - spv::OpFDiv, type_float_, z_coordinate_ref, z_size); - builder_->createBranch(&block_dimension_merge); - builder_->setBuildPoint(&block_dimension_merge); + SpirvBuilder::IfBuilder if_data_is_3d( + data_is_3d, spv::SelectionControlDontFlattenMask, *builder_); + spv::Id z_3d; { - std::unique_ptr z_phi_op = - std::make_unique(builder_->getUniqueId(), - type_float_, spv::OpPhi); - z_phi_op->addIdOperand(z_3d); - 
z_phi_op->addIdOperand(block_dimension_3d.getId()); - z_phi_op->addIdOperand(z_coordinate_ref); - z_phi_op->addIdOperand(block_dimension_head.getId()); - z_coordinate_ref = z_phi_op->getResultId(); - builder_->getBuildPoint()->addInstruction(std::move(z_phi_op)); + assert_true(z_size != spv::NoResult); + z_3d = builder_->createNoContractionBinOp(spv::OpFDiv, type_float_, + z_coordinate_ref, z_size); } + if_data_is_3d.makeEndIf(); + z_coordinate_ref = + if_data_is_3d.createMergePhi(z_3d, z_coordinate_ref); } else { // Denormalize the Z coordinate for a stacked texture, and apply the // offset. @@ -1394,63 +1381,39 @@ void SpirvShaderTranslator::ProcessTextureFetchInstruction( // OpSampledImage must be in the same block as where its result is used. if (instr.dimension == xenos::FetchOpDimension::k3DOrStacked) { // Check if the texture is 3D or stacked. - spv::Block& block_dimension_head = *builder_->getBuildPoint(); - spv::Block& block_dimension_3d_start = builder_->makeNewBlock(); - spv::Block& block_dimension_stacked_start = builder_->makeNewBlock(); - spv::Block& block_dimension_merge = builder_->makeNewBlock(); - builder_->createSelectionMerge(&block_dimension_merge, - spv::SelectionControlDontFlattenMask); assert_true(data_is_3d != spv::NoResult); - builder_->createConditionalBranch(data_is_3d, - &block_dimension_3d_start, - &block_dimension_stacked_start); - - // 3D. - builder_->setBuildPoint(&block_dimension_3d_start); - id_vector_temp_.clear(); - for (uint32_t i = 0; i < 3; ++i) { - id_vector_temp_.push_back(coordinates[i]); - } - texture_parameters.coords = - builder_->createCompositeConstruct(type_float3_, id_vector_temp_); - spv::Id lod_3d = QueryTextureLod(texture_parameters, - image_3d_unsigned, image_3d_signed, - sampler, swizzled_signs_all_signed); - // Get the actual build point for phi. - spv::Block& block_dimension_3d_end = *builder_->getBuildPoint(); - builder_->createBranch(&block_dimension_merge); - - // 2D stacked. 
- builder_->setBuildPoint(&block_dimension_stacked_start); - id_vector_temp_.clear(); - for (uint32_t i = 0; i < 2; ++i) { - id_vector_temp_.push_back(coordinates[i]); - } - texture_parameters.coords = - builder_->createCompositeConstruct(type_float2_, id_vector_temp_); - spv::Id lod_stacked = QueryTextureLod( - texture_parameters, image_2d_array_or_cube_unsigned, - image_2d_array_or_cube_signed, sampler, - swizzled_signs_all_signed); - // Get the actual build point for phi. - spv::Block& block_dimension_stacked_end = *builder_->getBuildPoint(); - builder_->createBranch(&block_dimension_merge); - - // Choose between the 3D and the stacked result based on the actual - // data dimensionality. - builder_->setBuildPoint(&block_dimension_merge); + SpirvBuilder::IfBuilder if_data_is_3d( + data_is_3d, spv::SelectionControlDontFlattenMask, *builder_); + spv::Id lod_3d; { - std::unique_ptr dimension_phi_op = - std::make_unique(builder_->getUniqueId(), - type_float_, spv::OpPhi); - dimension_phi_op->addIdOperand(lod_3d); - dimension_phi_op->addIdOperand(block_dimension_3d_end.getId()); - dimension_phi_op->addIdOperand(lod_stacked); - dimension_phi_op->addIdOperand(block_dimension_stacked_end.getId()); - result[0] = dimension_phi_op->getResultId(); - builder_->getBuildPoint()->addInstruction( - std::move(dimension_phi_op)); + // 3D. + id_vector_temp_.clear(); + for (uint32_t i = 0; i < 3; ++i) { + id_vector_temp_.push_back(coordinates[i]); + } + texture_parameters.coords = builder_->createCompositeConstruct( + type_float3_, id_vector_temp_); + lod_3d = QueryTextureLod(texture_parameters, image_3d_unsigned, + image_3d_signed, sampler, + swizzled_signs_all_signed); } + if_data_is_3d.makeBeginElse(); + spv::Id lod_stacked; + { + // 2D stacked. 
+ id_vector_temp_.clear(); + for (uint32_t i = 0; i < 2; ++i) { + id_vector_temp_.push_back(coordinates[i]); + } + texture_parameters.coords = builder_->createCompositeConstruct( + type_float2_, id_vector_temp_); + lod_stacked = QueryTextureLod(texture_parameters, + image_2d_array_or_cube_unsigned, + image_2d_array_or_cube_signed, + sampler, swizzled_signs_all_signed); + } + if_data_is_3d.makeEndIf(); + result[0] = if_data_is_3d.createMergePhi(lod_3d, lod_stacked); } else { uint32_t lod_query_coordinate_component_count = instr.dimension == xenos::FetchOpDimension::kCube ? 3 : 2; @@ -1512,6 +1475,8 @@ void SpirvShaderTranslator::ProcessTextureFetchInstruction( } } } + spv::Id is_any_unsigned = builder_->createUnaryOp( + spv::OpLogicalNot, type_bool_, is_all_signed); // Load the fetch constant word 4, needed unconditionally for LOD // biasing, for result exponent biasing, and conditionally for stacked @@ -1765,273 +1730,247 @@ void SpirvShaderTranslator::ProcessTextureFetchInstruction( // component, 2 gradient components, two fetches if the Z axis is // linear-filtered). - spv::Block& block_dimension_head = *builder_->getBuildPoint(); - spv::Block& block_dimension_3d_start = builder_->makeNewBlock(); - spv::Block& block_dimension_stacked_start = builder_->makeNewBlock(); - spv::Block& block_dimension_merge = builder_->makeNewBlock(); - builder_->createSelectionMerge(&block_dimension_merge, - spv::SelectionControlDontFlattenMask); assert_true(data_is_3d != spv::NoResult); - builder_->createConditionalBranch(data_is_3d, - &block_dimension_3d_start, - &block_dimension_stacked_start); - - // 3D. 
- builder_->setBuildPoint(&block_dimension_3d_start); - if (use_computed_lod) { - texture_parameters.gradX = gradients_h; - texture_parameters.gradY = gradients_v; - } - id_vector_temp_.clear(); - for (uint32_t i = 0; i < 3; ++i) { - id_vector_temp_.push_back(coordinates[i]); - } - texture_parameters.coords = - builder_->createCompositeConstruct(type_float3_, id_vector_temp_); + SpirvBuilder::IfBuilder if_data_is_3d( + data_is_3d, spv::SelectionControlDontFlattenMask, *builder_); spv::Id sample_result_unsigned_3d, sample_result_signed_3d; - SampleTexture(texture_parameters, image_operands_mask, - image_3d_unsigned, image_3d_signed, sampler, - is_all_signed, is_any_signed, sample_result_unsigned_3d, - sample_result_signed_3d); - // Get the actual build point after the SampleTexture call for phi. - spv::Block& block_dimension_3d_end = *builder_->getBuildPoint(); - builder_->createBranch(&block_dimension_merge); - - // 2D stacked. - builder_->setBuildPoint(&block_dimension_stacked_start); - if (use_computed_lod) { - // Extract 2D gradients for stacked textures which are 2D arrays. - uint_vector_temp_.clear(); - uint_vector_temp_.push_back(0); - uint_vector_temp_.push_back(1); - texture_parameters.gradX = builder_->createRvalueSwizzle( - spv::NoPrecision, type_float2_, gradients_h, uint_vector_temp_); - texture_parameters.gradY = builder_->createRvalueSwizzle( - spv::NoPrecision, type_float2_, gradients_v, uint_vector_temp_); - } - // Check if linear filtering is needed. 
- bool vol_mag_filter_is_fetch_const = - instr.attributes.vol_mag_filter == - xenos::TextureFilter::kUseFetchConst; - bool vol_min_filter_is_fetch_const = - instr.attributes.vol_min_filter == - xenos::TextureFilter::kUseFetchConst; - bool vol_mag_filter_is_linear = - instr.attributes.vol_mag_filter == xenos::TextureFilter::kLinear; - bool vol_min_filter_is_linear = - instr.attributes.vol_min_filter == xenos::TextureFilter::kLinear; - spv::Id vol_filter_is_linear = spv::NoResult; - if (use_computed_lod && - (vol_mag_filter_is_fetch_const || vol_min_filter_is_fetch_const || - vol_mag_filter_is_linear != vol_min_filter_is_linear)) { - // Check if minifying along layers (derivative > 1 along any axis). - spv::Id layer_max_gradient = builder_->createBinBuiltinCall( - type_float_, ext_inst_glsl_std_450_, GLSLstd450NMax, - builder_->createCompositeExtract(gradients_h, type_float_, 2), - builder_->createCompositeExtract(gradients_v, type_float_, 2)); - if (!instr.attributes.unnormalized_coordinates) { - // Denormalize the gradient if provided as normalized. - assert_true(size[2] != spv::NoResult); - layer_max_gradient = builder_->createNoContractionBinOp( - spv::OpFMul, type_float_, layer_max_gradient, size[2]); + { + // 3D. + if (use_computed_lod) { + texture_parameters.gradX = gradients_h; + texture_parameters.gradY = gradients_v; } - // For NaN, considering that magnification is being done. - spv::Id is_minifying_z = builder_->createBinOp( - spv::OpFOrdLessThan, type_bool_, layer_max_gradient, - builder_->makeFloatConstant(1.0f)); - // Choose what filter is actually used, the minification or the - // magnification one. - spv::Id vol_mag_filter_is_linear_loaded = - vol_mag_filter_is_fetch_const - ? 
builder_->createBinOp( - spv::OpINotEqual, type_bool_, - builder_->createBinOp( - spv::OpBitwiseAnd, type_uint_, - fetch_constant_word_4, - builder_->makeUintConstant(UINT32_C(1) << 0)), - const_uint_0_) - : builder_->makeBoolConstant(vol_mag_filter_is_linear); - spv::Id vol_min_filter_is_linear_loaded = - vol_min_filter_is_fetch_const - ? builder_->createBinOp( - spv::OpINotEqual, type_bool_, - builder_->createBinOp( - spv::OpBitwiseAnd, type_uint_, - fetch_constant_word_4, - builder_->makeUintConstant(UINT32_C(1) << 1)), - const_uint_0_) - : builder_->makeBoolConstant(vol_min_filter_is_linear); - vol_filter_is_linear = - builder_->createTriOp(spv::OpSelect, type_bool_, is_minifying_z, - vol_min_filter_is_linear_loaded, - vol_mag_filter_is_linear_loaded); - } else { - // No gradients, or using the same filter overrides for magnifying - // and minifying. Assume always magnifying if no gradients (LOD 0, - // always <= 0). LOD is within 2D layers, not between them (unlike - // in 3D textures, which have mips with depth reduced), so it - // shouldn't have effect on filtering between layers. 
- if (vol_mag_filter_is_fetch_const) { - vol_filter_is_linear = builder_->createBinOp( - spv::OpINotEqual, type_bool_, - builder_->createBinOp( - spv::OpBitwiseAnd, type_uint_, fetch_constant_word_4, - builder_->makeUintConstant(UINT32_C(1) << 0)), - const_uint_0_); + id_vector_temp_.clear(); + for (uint32_t i = 0; i < 3; ++i) { + id_vector_temp_.push_back(coordinates[i]); } + texture_parameters.coords = builder_->createCompositeConstruct( + type_float3_, id_vector_temp_); + SampleTexture(texture_parameters, image_operands_mask, + image_3d_unsigned, image_3d_signed, sampler, + is_any_unsigned, is_any_signed, + sample_result_unsigned_3d, sample_result_signed_3d); } - spv::Id layer_coordinate = coordinates[2]; - // Linear filtering may be needed either based on a dynamic condition - // (the filtering mode is taken from the fetch constant, or it's - // different for magnification and minification), or on a static one - // (with gradients - specified in the instruction for both - // magnification and minification as linear, without gradients - - // specified for magnification as linear). - // If the filter is linear, subtract 0.5 from the Z coordinate of the - // first layer in filtering because 0.5 is in the middle of it. - if (vol_filter_is_linear != spv::NoResult) { - layer_coordinate = builder_->createTriOp( - spv::OpSelect, type_float_, vol_filter_is_linear, - builder_->createNoContractionBinOp( - spv::OpFSub, type_float_, layer_coordinate, - builder_->makeFloatConstant(0.5f)), - layer_coordinate); - } else if (vol_mag_filter_is_linear) { - layer_coordinate = builder_->createNoContractionBinOp( - spv::OpFSub, type_float_, layer_coordinate, - builder_->makeFloatConstant(0.5f)); - } - // Sample the first layer, needed regardless of whether filtering is - // needed. - // Floor the array layer (Vulkan does rounding to nearest or + 0.5 and - // floor even for the layer index, but on the Xenos, addressing is - // similar to that of 3D textures). 
This is needed for both point and - // linear filtering (with linear, 0.5 was subtracted previously). - spv::Id layer_0_coordinate = builder_->createUnaryBuiltinCall( - type_float_, ext_inst_glsl_std_450_, GLSLstd450Floor, - layer_coordinate); - id_vector_temp_.clear(); - id_vector_temp_.push_back(coordinates[0]); - id_vector_temp_.push_back(coordinates[1]); - id_vector_temp_.push_back(layer_0_coordinate); - texture_parameters.coords = - builder_->createCompositeConstruct(type_float3_, id_vector_temp_); + if_data_is_3d.makeBeginElse(); spv::Id sample_result_unsigned_stacked, sample_result_signed_stacked; - SampleTexture(texture_parameters, image_operands_mask, - image_2d_array_or_cube_unsigned, - image_2d_array_or_cube_signed, sampler, is_all_signed, - is_any_signed, sample_result_unsigned_stacked, - sample_result_signed_stacked); - // Sample the second layer if linear filtering is potentially needed - // (conditionally or unconditionally, depending on whether the filter - // needs to be chosen at runtime), and filter. - if (vol_filter_is_linear != spv::NoResult || - vol_mag_filter_is_linear) { - spv::Block& block_z_head = *builder_->getBuildPoint(); - spv::Block& block_z_linear = (vol_filter_is_linear != spv::NoResult) - ? builder_->makeNewBlock() - : block_z_head; - spv::Block& block_z_merge = (vol_filter_is_linear != spv::NoResult) - ? builder_->makeNewBlock() - : block_z_head; - if (vol_filter_is_linear != spv::NoResult) { - builder_->createSelectionMerge( - &block_z_merge, spv::SelectionControlDontFlattenMask); - builder_->createConditionalBranch( - vol_filter_is_linear, &block_z_linear, &block_z_merge); - builder_->setBuildPoint(&block_z_linear); + { + // 2D stacked. + if (use_computed_lod) { + // Extract 2D gradients for stacked textures which are 2D arrays. 
+ uint_vector_temp_.clear(); + uint_vector_temp_.push_back(0); + uint_vector_temp_.push_back(1); + texture_parameters.gradX = + builder_->createRvalueSwizzle(spv::NoPrecision, type_float2_, + gradients_h, uint_vector_temp_); + texture_parameters.gradY = + builder_->createRvalueSwizzle(spv::NoPrecision, type_float2_, + gradients_v, uint_vector_temp_); } - spv::Id layer_1_coordinate = builder_->createBinOp( - spv::OpFAdd, type_float_, layer_0_coordinate, - builder_->makeFloatConstant(1.0f)); + // Check if linear filtering is needed. + bool vol_mag_filter_is_fetch_const = + instr.attributes.vol_mag_filter == + xenos::TextureFilter::kUseFetchConst; + bool vol_min_filter_is_fetch_const = + instr.attributes.vol_min_filter == + xenos::TextureFilter::kUseFetchConst; + bool vol_mag_filter_is_linear = instr.attributes.vol_mag_filter == + xenos::TextureFilter::kLinear; + bool vol_min_filter_is_linear = instr.attributes.vol_min_filter == + xenos::TextureFilter::kLinear; + spv::Id vol_filter_is_linear = spv::NoResult; + if (use_computed_lod && + (vol_mag_filter_is_fetch_const || + vol_min_filter_is_fetch_const || + vol_mag_filter_is_linear != vol_min_filter_is_linear)) { + // Check if minifying along layers (derivative > 1 along any + // axis). + spv::Id layer_max_gradient = builder_->createBinBuiltinCall( + type_float_, ext_inst_glsl_std_450_, GLSLstd450NMax, + builder_->createCompositeExtract(gradients_h, type_float_, 2), + builder_->createCompositeExtract(gradients_v, type_float_, + 2)); + if (!instr.attributes.unnormalized_coordinates) { + // Denormalize the gradient if provided as normalized. + assert_true(size[2] != spv::NoResult); + layer_max_gradient = builder_->createNoContractionBinOp( + spv::OpFMul, type_float_, layer_max_gradient, size[2]); + } + // For NaN, considering that magnification is being done. 
+ spv::Id is_minifying_z = builder_->createBinOp( + spv::OpFOrdLessThan, type_bool_, layer_max_gradient, + builder_->makeFloatConstant(1.0f)); + // Choose what filter is actually used, the minification or the + // magnification one. + spv::Id vol_mag_filter_is_linear_loaded = + vol_mag_filter_is_fetch_const + ? builder_->createBinOp( + spv::OpINotEqual, type_bool_, + builder_->createBinOp( + spv::OpBitwiseAnd, type_uint_, + fetch_constant_word_4, + builder_->makeUintConstant(UINT32_C(1) << 0)), + const_uint_0_) + : builder_->makeBoolConstant(vol_mag_filter_is_linear); + spv::Id vol_min_filter_is_linear_loaded = + vol_min_filter_is_fetch_const + ? builder_->createBinOp( + spv::OpINotEqual, type_bool_, + builder_->createBinOp( + spv::OpBitwiseAnd, type_uint_, + fetch_constant_word_4, + builder_->makeUintConstant(UINT32_C(1) << 1)), + const_uint_0_) + : builder_->makeBoolConstant(vol_min_filter_is_linear); + vol_filter_is_linear = builder_->createTriOp( + spv::OpSelect, type_bool_, is_minifying_z, + vol_min_filter_is_linear_loaded, + vol_mag_filter_is_linear_loaded); + } else { + // No gradients, or using the same filter overrides for magnifying + // and minifying. Assume always magnifying if no gradients (LOD 0, + // always <= 0). LOD is within 2D layers, not between them (unlike + // in 3D textures, which have mips with depth reduced), so it + // shouldn't have effect on filtering between layers. 
+ if (vol_mag_filter_is_fetch_const) { + vol_filter_is_linear = builder_->createBinOp( + spv::OpINotEqual, type_bool_, + builder_->createBinOp( + spv::OpBitwiseAnd, type_uint_, fetch_constant_word_4, + builder_->makeUintConstant(UINT32_C(1) << 0)), + const_uint_0_); + } + } + spv::Id layer_coordinate = coordinates[2]; + // Linear filtering may be needed either based on a dynamic + // condition (the filtering mode is taken from the fetch constant, + // or it's different for magnification and minification), or on a + // static one (with gradients - specified in the instruction for + // both magnification and minification as linear, without + // gradients - specified for magnification as linear). + // If the filter is linear, subtract 0.5 from the Z coordinate of + // the first layer in filtering because 0.5 is in the middle of it. + if (vol_filter_is_linear != spv::NoResult) { + layer_coordinate = builder_->createTriOp( + spv::OpSelect, type_float_, vol_filter_is_linear, + builder_->createNoContractionBinOp( + spv::OpFSub, type_float_, layer_coordinate, + builder_->makeFloatConstant(0.5f)), + layer_coordinate); + } else if (vol_mag_filter_is_linear) { + layer_coordinate = builder_->createNoContractionBinOp( + spv::OpFSub, type_float_, layer_coordinate, + builder_->makeFloatConstant(0.5f)); + } + // Sample the first layer, needed regardless of whether filtering is + // needed. + // Floor the array layer (Vulkan does rounding to nearest or + 0.5 + // and floor even for the layer index, but on the Xenos, addressing + // is similar to that of 3D textures). This is needed for both point + // and linear filtering (with linear, 0.5 was subtracted + // previously). 
+ spv::Id layer_0_coordinate = builder_->createUnaryBuiltinCall( + type_float_, ext_inst_glsl_std_450_, GLSLstd450Floor, + layer_coordinate); id_vector_temp_.clear(); id_vector_temp_.push_back(coordinates[0]); id_vector_temp_.push_back(coordinates[1]); - id_vector_temp_.push_back(layer_1_coordinate); + id_vector_temp_.push_back(layer_0_coordinate); texture_parameters.coords = builder_->createCompositeConstruct( type_float3_, id_vector_temp_); - spv::Id layer_lerp_factor = builder_->createUnaryBuiltinCall( - type_float_, ext_inst_glsl_std_450_, GLSLstd450Fract, - layer_coordinate); - spv::Id sample_result_unsigned_stacked_filtered; - spv::Id sample_result_signed_stacked_filtered; SampleTexture( texture_parameters, image_operands_mask, image_2d_array_or_cube_unsigned, image_2d_array_or_cube_signed, - sampler, is_all_signed, is_any_signed, - sample_result_unsigned_stacked_filtered, - sample_result_signed_stacked_filtered, layer_lerp_factor, + sampler, is_any_unsigned, is_any_signed, sample_result_unsigned_stacked, sample_result_signed_stacked); - if (vol_filter_is_linear != spv::NoResult) { - // Get the actual build point after the SampleTexture call for - // phi. 
- spv::Block& block_z_linear_end = *builder_->getBuildPoint(); - builder_->createBranch(&block_z_merge); - builder_->setBuildPoint(&block_z_merge); - { - std::unique_ptr filter_phi_op = - std::make_unique( - builder_->getUniqueId(), type_float4_, spv::OpPhi); - filter_phi_op->addIdOperand( - sample_result_unsigned_stacked_filtered); - filter_phi_op->addIdOperand(block_z_linear_end.getId()); - filter_phi_op->addIdOperand(sample_result_unsigned_stacked); - filter_phi_op->addIdOperand(block_z_head.getId()); - sample_result_unsigned_stacked = filter_phi_op->getResultId(); - builder_->getBuildPoint()->addInstruction( - std::move(filter_phi_op)); + // Sample the second layer if linear filtering is potentially needed + // (conditionally or unconditionally, depending on whether the + // filter needs to be chosen at runtime), and filter. + if (vol_filter_is_linear != spv::NoResult || + vol_mag_filter_is_linear) { + spv::Block& block_z_head = *builder_->getBuildPoint(); + spv::Block& block_z_linear = + (vol_filter_is_linear != spv::NoResult) + ? builder_->makeNewBlock() + : block_z_head; + spv::Block& block_z_merge = + (vol_filter_is_linear != spv::NoResult) + ? 
builder_->makeNewBlock() + : block_z_head; + if (vol_filter_is_linear != spv::NoResult) { + builder_->createSelectionMerge( + &block_z_merge, spv::SelectionControlDontFlattenMask); + builder_->createConditionalBranch( + vol_filter_is_linear, &block_z_linear, &block_z_merge); + builder_->setBuildPoint(&block_z_linear); } - { - std::unique_ptr filter_phi_op = - std::make_unique( - builder_->getUniqueId(), type_float4_, spv::OpPhi); - filter_phi_op->addIdOperand( - sample_result_signed_stacked_filtered); - filter_phi_op->addIdOperand(block_z_linear_end.getId()); - filter_phi_op->addIdOperand(sample_result_signed_stacked); - filter_phi_op->addIdOperand(block_z_head.getId()); - sample_result_signed_stacked = filter_phi_op->getResultId(); - builder_->getBuildPoint()->addInstruction( - std::move(filter_phi_op)); + spv::Id layer_1_coordinate = builder_->createBinOp( + spv::OpFAdd, type_float_, layer_0_coordinate, + builder_->makeFloatConstant(1.0f)); + id_vector_temp_.clear(); + id_vector_temp_.push_back(coordinates[0]); + id_vector_temp_.push_back(coordinates[1]); + id_vector_temp_.push_back(layer_1_coordinate); + texture_parameters.coords = builder_->createCompositeConstruct( + type_float3_, id_vector_temp_); + spv::Id layer_lerp_factor = builder_->createUnaryBuiltinCall( + type_float_, ext_inst_glsl_std_450_, GLSLstd450Fract, + layer_coordinate); + spv::Id sample_result_unsigned_stacked_filtered; + spv::Id sample_result_signed_stacked_filtered; + SampleTexture( + texture_parameters, image_operands_mask, + image_2d_array_or_cube_unsigned, + image_2d_array_or_cube_signed, sampler, is_any_unsigned, + is_any_signed, sample_result_unsigned_stacked_filtered, + sample_result_signed_stacked_filtered, layer_lerp_factor, + sample_result_unsigned_stacked, sample_result_signed_stacked); + if (vol_filter_is_linear != spv::NoResult) { + // Get the actual build point after the SampleTexture call for + // phi. 
+ spv::Block& block_z_linear_end = *builder_->getBuildPoint(); + builder_->createBranch(&block_z_merge); + builder_->setBuildPoint(&block_z_merge); + { + std::unique_ptr filter_phi_op = + std::make_unique( + builder_->getUniqueId(), type_float4_, spv::OpPhi); + filter_phi_op->addIdOperand( + sample_result_unsigned_stacked_filtered); + filter_phi_op->addIdOperand(block_z_linear_end.getId()); + filter_phi_op->addIdOperand(sample_result_unsigned_stacked); + filter_phi_op->addIdOperand(block_z_head.getId()); + sample_result_unsigned_stacked = filter_phi_op->getResultId(); + builder_->getBuildPoint()->addInstruction( + std::move(filter_phi_op)); + } + { + std::unique_ptr filter_phi_op = + std::make_unique( + builder_->getUniqueId(), type_float4_, spv::OpPhi); + filter_phi_op->addIdOperand( + sample_result_signed_stacked_filtered); + filter_phi_op->addIdOperand(block_z_linear_end.getId()); + filter_phi_op->addIdOperand(sample_result_signed_stacked); + filter_phi_op->addIdOperand(block_z_head.getId()); + sample_result_signed_stacked = filter_phi_op->getResultId(); + builder_->getBuildPoint()->addInstruction( + std::move(filter_phi_op)); + } + } else { + sample_result_unsigned_stacked = + sample_result_unsigned_stacked_filtered; + sample_result_signed_stacked = + sample_result_signed_stacked_filtered; } - } else { - sample_result_unsigned_stacked = - sample_result_unsigned_stacked_filtered; - sample_result_signed_stacked = - sample_result_signed_stacked_filtered; } } - // Get the actual build point for phi. - spv::Block& block_dimension_stacked_end = *builder_->getBuildPoint(); - builder_->createBranch(&block_dimension_merge); + if_data_is_3d.makeEndIf(); - // Choose between the 3D and the stacked result based on the actual - // data dimensionality. 
- builder_->setBuildPoint(&block_dimension_merge); - { - std::unique_ptr dimension_phi_op = - std::make_unique(builder_->getUniqueId(), - type_float4_, spv::OpPhi); - dimension_phi_op->addIdOperand(sample_result_unsigned_3d); - dimension_phi_op->addIdOperand(block_dimension_3d_end.getId()); - dimension_phi_op->addIdOperand(sample_result_unsigned_stacked); - dimension_phi_op->addIdOperand(block_dimension_stacked_end.getId()); - sample_result_unsigned = dimension_phi_op->getResultId(); - builder_->getBuildPoint()->addInstruction( - std::move(dimension_phi_op)); - } - { - std::unique_ptr dimension_phi_op = - std::make_unique(builder_->getUniqueId(), - type_float4_, spv::OpPhi); - dimension_phi_op->addIdOperand(sample_result_signed_3d); - dimension_phi_op->addIdOperand(block_dimension_3d_end.getId()); - dimension_phi_op->addIdOperand(sample_result_signed_stacked); - dimension_phi_op->addIdOperand(block_dimension_stacked_end.getId()); - sample_result_signed = dimension_phi_op->getResultId(); - builder_->getBuildPoint()->addInstruction( - std::move(dimension_phi_op)); - } + sample_result_unsigned = if_data_is_3d.createMergePhi( + sample_result_unsigned_3d, sample_result_unsigned_stacked); + sample_result_signed = if_data_is_3d.createMergePhi( + sample_result_signed_3d, sample_result_signed_stacked); } else { if (use_computed_lod) { texture_parameters.gradX = gradients_h; @@ -2045,7 +1984,7 @@ void SpirvShaderTranslator::ProcessTextureFetchInstruction( builder_->createCompositeConstruct(type_float3_, id_vector_temp_); SampleTexture(texture_parameters, image_operands_mask, image_2d_array_or_cube_unsigned, - image_2d_array_or_cube_signed, sampler, is_all_signed, + image_2d_array_or_cube_signed, sampler, is_any_unsigned, is_any_signed, sample_result_unsigned, sample_result_signed); } @@ -2095,26 +2034,20 @@ void SpirvShaderTranslator::ProcessTextureFetchInstruction( spv::OpBitwiseAnd, type_uint_, swizzle_word, builder_->makeUintConstant(swizzle_bit_0_value << 2)), 
const_uint_0_); - spv::Block& block_swizzle_head = *builder_->getBuildPoint(); - spv::Block& block_swizzle_constant = builder_->makeNewBlock(); - spv::Block& block_swizzle_component = builder_->makeNewBlock(); - spv::Block& block_swizzle_merge = builder_->makeNewBlock(); - builder_->createSelectionMerge( - &block_swizzle_merge, spv::SelectionControlDontFlattenMask); - builder_->createConditionalBranch(swizzle_bit_2, - &block_swizzle_constant, - &block_swizzle_component); - // Constant values. - builder_->setBuildPoint(&block_swizzle_constant); - // Bit 0 - 0 or 1. - spv::Id swizzle_result_constant = - builder_->createTriOp(spv::OpSelect, type_float_, swizzle_bit_0, - const_float_1, const_float_0_); - builder_->createBranch(&block_swizzle_merge); - // Fetched components. + SpirvBuilder::IfBuilder if_swizzle_constant( + swizzle_bit_2, spv::SelectionControlDontFlattenMask, *builder_); + spv::Id swizzle_result_constant; + { + // Constant values. + // Bit 0 - 0 or 1. + swizzle_result_constant = builder_->createTriOp( + spv::OpSelect, type_float_, swizzle_bit_0, const_float_1, + const_float_0_); + } + if_swizzle_constant.makeBeginElse(); spv::Id swizzle_result_component; { - builder_->setBuildPoint(&block_swizzle_component); + // Fetched components. // Select whether the result is signed or unsigned (or biased or // gamma-corrected) based on the post-swizzle signedness. spv::Id swizzle_sample_result = builder_->createTriOp( @@ -2146,22 +2079,11 @@ void SpirvShaderTranslator::ProcessTextureFetchInstruction( swizzle_result_component = builder_->createTriOp( spv::OpSelect, type_float_, swizzle_bit_1, swizzle_z_or_w, swizzle_x_or_y); - builder_->createBranch(&block_swizzle_merge); } + if_swizzle_constant.makeEndIf(); // Select between the constants and the fetched components. 
- builder_->setBuildPoint(&block_swizzle_merge); - { - std::unique_ptr swizzle_phi_op = - std::make_unique(builder_->getUniqueId(), - type_float_, spv::OpPhi); - swizzle_phi_op->addIdOperand(swizzle_result_constant); - swizzle_phi_op->addIdOperand(block_swizzle_constant.getId()); - swizzle_phi_op->addIdOperand(swizzle_result_component); - swizzle_phi_op->addIdOperand(block_swizzle_component.getId()); - result[result_component_index] = swizzle_phi_op->getResultId(); - builder_->getBuildPoint()->addInstruction( - std::move(swizzle_phi_op)); - } + result[result_component_index] = if_swizzle_constant.createMergePhi( + swizzle_result_constant, swizzle_result_component); } } @@ -2441,58 +2363,43 @@ size_t SpirvShaderTranslator::FindOrAddSamplerBinding( void SpirvShaderTranslator::SampleTexture( spv::Builder::TextureParameters& texture_parameters, spv::ImageOperandsMask image_operands_mask, spv::Id image_unsigned, - spv::Id image_signed, spv::Id sampler, spv::Id is_all_signed, + spv::Id image_signed, spv::Id sampler, spv::Id is_any_unsigned, spv::Id is_any_signed, spv::Id& result_unsigned_out, spv::Id& result_signed_out, spv::Id lerp_factor, spv::Id lerp_first_unsigned, spv::Id lerp_first_signed) { for (uint32_t i = 0; i < 2; ++i) { - spv::Block& block_sign_head = *builder_->getBuildPoint(); - spv::Block& block_sign = builder_->makeNewBlock(); - spv::Block& block_sign_merge = builder_->makeNewBlock(); - builder_->createSelectionMerge(&block_sign_merge, - spv::SelectionControlDontFlattenMask); - // Unsigned (i == 0) - if there are any non-signed components. - // Signed (i == 1) - if there are any signed components. - builder_->createConditionalBranch(i ? is_any_signed : is_all_signed, - i ? &block_sign : &block_sign_merge, - i ? &block_sign_merge : &block_sign); - builder_->setBuildPoint(&block_sign); - spv::Id image = i ? image_signed : image_unsigned; - // OpSampledImage must be in the same block as where its result is used. 
- texture_parameters.sampler = builder_->createBinOp( - spv::OpSampledImage, - builder_->makeSampledImageType(builder_->getTypeId(image)), image, - sampler); - spv::Id result = builder_->createTextureCall( - spv::NoPrecision, type_float4_, false, false, false, false, false, - texture_parameters, image_operands_mask); - if (lerp_factor != spv::NoResult) { - spv::Id lerp_first = i ? lerp_first_signed : lerp_first_unsigned; - if (lerp_first != spv::NoResult) { - spv::Id lerp_difference = builder_->createNoContractionBinOp( - spv::OpVectorTimesScalar, type_float4_, - builder_->createNoContractionBinOp(spv::OpFSub, type_float4_, - result, lerp_first), - lerp_factor); - result = builder_->createNoContractionBinOp(spv::OpFAdd, type_float4_, - result, lerp_difference); + SpirvBuilder::IfBuilder sign_if(i ? is_any_signed : is_any_unsigned, + spv::SelectionControlDontFlattenMask, + *builder_); + spv::Id sign_result; + { + spv::Id image = i ? image_signed : image_unsigned; + // OpSampledImage must be in the same block as where its result is used. + texture_parameters.sampler = builder_->createBinOp( + spv::OpSampledImage, + builder_->makeSampledImageType(builder_->getTypeId(image)), image, + sampler); + sign_result = builder_->createTextureCall( + spv::NoPrecision, type_float4_, false, false, false, false, false, + texture_parameters, image_operands_mask); + if (lerp_factor != spv::NoResult) { + spv::Id lerp_first = i ? 
lerp_first_signed : lerp_first_unsigned; + if (lerp_first != spv::NoResult) { + spv::Id lerp_difference = builder_->createNoContractionBinOp( + spv::OpVectorTimesScalar, type_float4_, + builder_->createNoContractionBinOp(spv::OpFSub, type_float4_, + sign_result, lerp_first), + lerp_factor); + sign_result = builder_->createNoContractionBinOp( + spv::OpFAdd, type_float4_, sign_result, lerp_difference); + } } } - builder_->createBranch(&block_sign_merge); - builder_->setBuildPoint(&block_sign_merge); - { - std::unique_ptr phi_op = - std::make_unique(builder_->getUniqueId(), - type_float4_, spv::OpPhi); - phi_op->addIdOperand(result); - phi_op->addIdOperand(block_sign.getId()); - phi_op->addIdOperand(const_float4_0_); - phi_op->addIdOperand(block_sign_head.getId()); - // This may overwrite the first lerp endpoint for the sign (such usage of - // this function is allowed). - (i ? result_signed_out : result_unsigned_out) = phi_op->getResultId(); - builder_->getBuildPoint()->addInstruction(std::move(phi_op)); - } + sign_if.makeEndIf(); + // This may overwrite the first lerp endpoint for the sign (such usage of + // this function is allowed). + (i ? result_signed_out : result_unsigned_out) = + sign_if.createMergePhi(sign_result, const_float4_0_); } } @@ -2500,48 +2407,33 @@ spv::Id SpirvShaderTranslator::QueryTextureLod( spv::Builder::TextureParameters& texture_parameters, spv::Id image_unsigned, spv::Id image_signed, spv::Id sampler, spv::Id is_all_signed) { // OpSampledImage must be in the same block as where its result is used. 
- spv::Block& block_sign_head = *builder_->getBuildPoint(); - spv::Block& block_sign_signed = builder_->makeNewBlock(); - spv::Block& block_sign_unsigned = builder_->makeNewBlock(); - spv::Block& block_sign_merge = builder_->makeNewBlock(); - builder_->createSelectionMerge(&block_sign_merge, - spv::SelectionControlDontFlattenMask); - builder_->createConditionalBranch(is_all_signed, &block_sign_signed, - &block_sign_unsigned); - builder_->setBuildPoint(&block_sign_signed); - texture_parameters.sampler = builder_->createBinOp( - spv::OpSampledImage, - builder_->makeSampledImageType(builder_->getTypeId(image_signed)), - image_signed, sampler); - spv::Id lod_signed = builder_->createCompositeExtract( - builder_->createTextureQueryCall(spv::OpImageQueryLod, texture_parameters, - false), - type_float_, 1); - builder_->createBranch(&block_sign_merge); - builder_->setBuildPoint(&block_sign_unsigned); - texture_parameters.sampler = builder_->createBinOp( - spv::OpSampledImage, - builder_->makeSampledImageType(builder_->getTypeId(image_unsigned)), - image_unsigned, sampler); - spv::Id lod_unsigned = builder_->createCompositeExtract( - builder_->createTextureQueryCall(spv::OpImageQueryLod, texture_parameters, - false), - type_float_, 1); - builder_->createBranch(&block_sign_merge); - builder_->setBuildPoint(&block_sign_merge); - spv::Id result; + SpirvBuilder::IfBuilder if_signed( + is_all_signed, spv::SelectionControlDontFlattenMask, *builder_); + spv::Id lod_signed; { - std::unique_ptr sign_phi_op = - std::make_unique(builder_->getUniqueId(), type_float_, - spv::OpPhi); - sign_phi_op->addIdOperand(lod_signed); - sign_phi_op->addIdOperand(block_sign_signed.getId()); - sign_phi_op->addIdOperand(lod_unsigned); - sign_phi_op->addIdOperand(block_sign_unsigned.getId()); - result = sign_phi_op->getResultId(); - builder_->getBuildPoint()->addInstruction(std::move(sign_phi_op)); + texture_parameters.sampler = builder_->createBinOp( + spv::OpSampledImage, + 
builder_->makeSampledImageType(builder_->getTypeId(image_signed)), + image_signed, sampler); + lod_signed = builder_->createCompositeExtract( + builder_->createTextureQueryCall(spv::OpImageQueryLod, + texture_parameters, false), + type_float_, 1); } - return result; + if_signed.makeBeginElse(); + spv::Id lod_unsigned; + { + texture_parameters.sampler = builder_->createBinOp( + spv::OpSampledImage, + builder_->makeSampledImageType(builder_->getTypeId(image_unsigned)), + image_unsigned, sampler); + lod_unsigned = builder_->createCompositeExtract( + builder_->createTextureQueryCall(spv::OpImageQueryLod, + texture_parameters, false), + type_float_, 1); + } + if_signed.makeEndIf(); + return if_signed.createMergePhi(lod_signed, lod_unsigned); } } // namespace gpu diff --git a/src/xenia/gpu/spirv_shader_translator_rb.cc b/src/xenia/gpu/spirv_shader_translator_rb.cc index 65a01209d..e19fdd540 100644 --- a/src/xenia/gpu/spirv_shader_translator_rb.cc +++ b/src/xenia/gpu/spirv_shader_translator_rb.cc @@ -457,22 +457,14 @@ void SpirvShaderTranslator::CompleteFragmentShaderInMain() { // Kill the pixel once the guest control flow and derivatives are not // needed anymore. assert_true(var_main_kill_pixel_ != spv::NoResult); - // Load the condition before the OpSelectionMerge, which must be the - // penultimate instruction. - spv::Id kill_pixel = - builder_->createLoad(var_main_kill_pixel_, spv::NoPrecision); - spv::Block& block_kill = builder_->makeNewBlock(); - spv::Block& block_kill_merge = builder_->makeNewBlock(); - builder_->createSelectionMerge(&block_kill_merge, - spv::SelectionControlDontFlattenMask); - builder_->createConditionalBranch(kill_pixel, &block_kill, - &block_kill_merge); - builder_->setBuildPoint(&block_kill); + SpirvBuilder::IfBuilder kill_pixel_if( + builder_->createLoad(var_main_kill_pixel_, spv::NoPrecision), + spv::SelectionControlMaskNone, *builder_); // TODO(Triang3l): Use OpTerminateInvocation when SPIR-V 1.6 is // targeted. 
builder_->createNoResultOp(spv::OpKill); // OpKill terminates the block. - builder_->setBuildPoint(&block_kill_merge); + kill_pixel_if.makeEndIf(false); } } } @@ -533,17 +525,11 @@ void SpirvShaderTranslator::CompleteFragmentShaderInMain() { builder_->makeUintConstant(3)); // Check if the comparison function is not "always" - that should pass even // for NaN likely, unlike "less, equal or greater". - spv::Id alpha_test_function_is_non_always = builder_->createBinOp( - spv::OpINotEqual, type_bool_, alpha_test_function, - builder_->makeUintConstant(uint32_t(xenos::CompareFunction::kAlways))); - spv::Block& block_alpha_test = builder_->makeNewBlock(); - spv::Block& block_alpha_test_merge = builder_->makeNewBlock(); - builder_->createSelectionMerge(&block_alpha_test_merge, - spv::SelectionControlDontFlattenMask); - builder_->createConditionalBranch(alpha_test_function_is_non_always, - &block_alpha_test, - &block_alpha_test_merge); - builder_->setBuildPoint(&block_alpha_test); + SpirvBuilder::IfBuilder if_alpha_test_function_is_non_always( + builder_->createBinOp(spv::OpINotEqual, type_bool_, alpha_test_function, + builder_->makeUintConstant( + uint32_t(xenos::CompareFunction::kAlways))), + spv::SelectionControlDontFlattenMask, *builder_); { id_vector_temp_.clear(); id_vector_temp_.push_back(builder_->makeIntConstant(3)); @@ -564,28 +550,20 @@ void SpirvShaderTranslator::CompleteFragmentShaderInMain() { // The comparison function is not "always" - perform the alpha test. // Handle "not equal" specially (specifically as "not equal" so it's true // for NaN, not "less or greater" which is false for NaN). 
- spv::Id alpha_test_function_is_not_equal = builder_->createBinOp( - spv::OpIEqual, type_bool_, alpha_test_function, - builder_->makeUintConstant( - uint32_t(xenos::CompareFunction::kNotEqual))); - spv::Block& block_alpha_test_not_equal = builder_->makeNewBlock(); - spv::Block& block_alpha_test_non_not_equal = builder_->makeNewBlock(); - spv::Block& block_alpha_test_not_equal_merge = builder_->makeNewBlock(); - builder_->createSelectionMerge(&block_alpha_test_not_equal_merge, - spv::SelectionControlDontFlattenMask); - builder_->createConditionalBranch(alpha_test_function_is_not_equal, - &block_alpha_test_not_equal, - &block_alpha_test_non_not_equal); - spv::Id alpha_test_result_not_equal, alpha_test_result_non_not_equal; - builder_->setBuildPoint(&block_alpha_test_not_equal); + SpirvBuilder::IfBuilder if_alpha_test_function_is_not_equal( + builder_->createBinOp(spv::OpIEqual, type_bool_, alpha_test_function, + builder_->makeUintConstant(uint32_t( + xenos::CompareFunction::kNotEqual))), + spv::SelectionControlDontFlattenMask, *builder_, 1, 2); + spv::Id alpha_test_result_not_equal; { // "Not equal" function. alpha_test_result_not_equal = builder_->createBinOp(spv::OpFUnordNotEqual, type_bool_, alpha_test_alpha, alpha_test_reference); - builder_->createBranch(&block_alpha_test_not_equal_merge); } - builder_->setBuildPoint(&block_alpha_test_non_not_equal); + if_alpha_test_function_is_not_equal.makeBeginElse(); + spv::Id alpha_test_result_non_not_equal; { // Function other than "not equal". 
static const spv::Op kAlphaTestOps[] = { @@ -609,16 +587,11 @@ void SpirvShaderTranslator::CompleteFragmentShaderInMain() { alpha_test_result_non_not_equal = alpha_test_comparison_result; } } - builder_->createBranch(&block_alpha_test_not_equal_merge); } - builder_->setBuildPoint(&block_alpha_test_not_equal_merge); - id_vector_temp_.clear(); - id_vector_temp_.push_back(alpha_test_result_not_equal); - id_vector_temp_.push_back(block_alpha_test_not_equal.getId()); - id_vector_temp_.push_back(alpha_test_result_non_not_equal); - id_vector_temp_.push_back(block_alpha_test_non_not_equal.getId()); + if_alpha_test_function_is_not_equal.makeEndIf(); spv::Id alpha_test_result = - builder_->createOp(spv::OpPhi, type_bool_, id_vector_temp_); + if_alpha_test_function_is_not_equal.createMergePhi( + alpha_test_result_not_equal, alpha_test_result_non_not_equal); // Discard the pixel if the alpha test has failed. if (edram_fragment_shader_interlock_ && !features_.demote_to_helper_invocation) { @@ -627,16 +600,11 @@ void SpirvShaderTranslator::CompleteFragmentShaderInMain() { spv::OpSelect, type_uint_, alpha_test_result, fsi_sample_mask_in_rt_0_alpha_tests, const_uint_0_); } else { - // Creating a merge block even though it will contain just one OpBranch - // since SPIR-V requires structured control flow in shaders. 
- spv::Block& block_alpha_test_kill = builder_->makeNewBlock(); - spv::Block& block_alpha_test_kill_merge = builder_->makeNewBlock(); - builder_->createSelectionMerge(&block_alpha_test_kill_merge, - spv::SelectionControlDontFlattenMask); - builder_->createConditionalBranch(alpha_test_result, - &block_alpha_test_kill_merge, - &block_alpha_test_kill); - builder_->setBuildPoint(&block_alpha_test_kill); + SpirvBuilder::IfBuilder alpha_test_kill_if( + builder_->createUnaryOp(spv::OpLogicalNot, type_bool_, + alpha_test_result), + spv::SelectionControlDontFlattenMask, *builder_); + bool branch_to_alpha_test_kill_merge = true; if (edram_fragment_shader_interlock_) { assert_true(features_.demote_to_helper_invocation); fsi_pixel_potentially_killed = true; @@ -645,18 +613,17 @@ void SpirvShaderTranslator::CompleteFragmentShaderInMain() { builder_->addExtension("SPV_EXT_demote_to_helper_invocation"); builder_->addCapability(spv::CapabilityDemoteToHelperInvocationEXT); builder_->createNoResultOp(spv::OpDemoteToHelperInvocationEXT); - builder_->createBranch(&block_alpha_test_kill_merge); } else { // TODO(Triang3l): Use OpTerminateInvocation when SPIR-V 1.6 is // targeted. builder_->createNoResultOp(spv::OpKill); // OpKill terminates the block. + branch_to_alpha_test_kill_merge = false; } - builder_->setBuildPoint(&block_alpha_test_kill_merge); - builder_->createBranch(&block_alpha_test_merge); + alpha_test_kill_if.makeEndIf(branch_to_alpha_test_kill_merge); } } - builder_->setBuildPoint(&block_alpha_test_merge); + if_alpha_test_function_is_non_always.makeEndIf(); // TODO(Triang3l): Alpha to coverage. 
@@ -725,18 +692,9 @@ void SpirvShaderTranslator::CompleteFragmentShaderInMain() { spv::OpBitwiseAnd, type_uint_, main_fsi_sample_mask_, builder_->makeUintConstant(uint32_t(1) << (4 + i))), const_uint_0_); - spv::Block& block_sample_late_depth_stencil_write = - builder_->makeNewBlock(); - spv::Block& block_sample_late_depth_stencil_write_merge = - builder_->makeNewBlock(); - builder_->createSelectionMerge( - &block_sample_late_depth_stencil_write_merge, - spv::SelectionControlDontFlattenMask); - builder_->createConditionalBranch( + SpirvBuilder::IfBuilder if_sample_late_depth_stencil_write_needed( sample_late_depth_stencil_write_needed, - &block_sample_late_depth_stencil_write, - &block_sample_late_depth_stencil_write_merge); - builder_->setBuildPoint(&block_sample_late_depth_stencil_write); + spv::SelectionControlDontFlattenMask, *builder_); spv::Id depth_stencil_sample_address = FSI_AddSampleOffset(main_fsi_address_depth_, i); id_vector_temp_.clear(); @@ -749,8 +707,7 @@ void SpirvShaderTranslator::CompleteFragmentShaderInMain() { ? 
spv::StorageClassStorageBuffer : spv::StorageClassUniform, buffer_edram_, id_vector_temp_)); - builder_->createBranch(&block_sample_late_depth_stencil_write_merge); - builder_->setBuildPoint(&block_sample_late_depth_stencil_write_merge); + if_sample_late_depth_stencil_write_needed.makeEndIf(); } if (color_targets_written) { // Only take the remaining coverage bits, not the late depth / stencil @@ -852,28 +809,10 @@ void SpirvShaderTranslator::CompleteFragmentShaderInMain() { spv::OpBitwiseAnd, type_uint_, fsi_color_targets_written, builder_->makeUintConstant(uint32_t(1) << color_target_index)), const_uint_0_); - spv::Block& fsi_color_written_if_head = *builder_->getBuildPoint(); - spv::Block& fsi_color_written_if = builder_->makeNewBlock(); - spv::Block& fsi_color_written_if_merge = builder_->makeNewBlock(); - builder_->createSelectionMerge(&fsi_color_written_if_merge, - spv::SelectionControlDontFlattenMask); - { - std::unique_ptr rt_written_branch_conditional_op = - std::make_unique(spv::OpBranchConditional); - rt_written_branch_conditional_op->addIdOperand(fsi_color_written); - rt_written_branch_conditional_op->addIdOperand( - fsi_color_written_if.getId()); - rt_written_branch_conditional_op->addIdOperand( - fsi_color_written_if_merge.getId()); - // More likely to write to the render target than not. - rt_written_branch_conditional_op->addImmediateOperand(2); - rt_written_branch_conditional_op->addImmediateOperand(1); - builder_->getBuildPoint()->addInstruction( - std::move(rt_written_branch_conditional_op)); - } - fsi_color_written_if.addPredecessor(&fsi_color_written_if_head); - fsi_color_written_if_merge.addPredecessor(&fsi_color_written_if_head); - builder_->setBuildPoint(&fsi_color_written_if); + // More likely to write to the render target than not. 
+ SpirvBuilder::IfBuilder if_fsi_color_written( + fsi_color_written, spv::SelectionControlDontFlattenMask, *builder_, + 2, 1); // For accessing uint2 arrays of per-render-target data which are passed // as uint4 arrays due to std140 array element alignment. @@ -914,14 +853,9 @@ void SpirvShaderTranslator::CompleteFragmentShaderInMain() { const_uint32_max), builder_->createBinOp(spv::OpINotEqual, type_bool_, rt_keep_mask[1], const_uint32_max)); - spv::Block& rt_write_mask_not_empty_if = builder_->makeNewBlock(); - spv::Block& rt_write_mask_not_empty_if_merge = builder_->makeNewBlock(); - builder_->createSelectionMerge(&rt_write_mask_not_empty_if_merge, - spv::SelectionControlDontFlattenMask); - builder_->createConditionalBranch(rt_write_mask_not_empty, - &rt_write_mask_not_empty_if, - &rt_write_mask_not_empty_if_merge); - builder_->setBuildPoint(&rt_write_mask_not_empty_if); + SpirvBuilder::IfBuilder if_rt_write_mask_not_empty( + rt_write_mask_not_empty, spv::SelectionControlDontFlattenMask, + *builder_); spv::Id const_int_rt_index = builder_->makeIntConstant(color_target_index); @@ -982,17 +916,10 @@ void SpirvShaderTranslator::CompleteFragmentShaderInMain() { spv::Id rt_blend_enabled = builder_->createBinOp( spv::OpINotEqual, type_bool_, rt_blend_factors_equations, builder_->makeUintConstant(0x00010001)); - spv::Block& rt_blend_enabled_if = builder_->makeNewBlock(); - spv::Block& rt_blend_enabled_else = builder_->makeNewBlock(); - spv::Block& rt_blend_enabled_merge = builder_->makeNewBlock(); - builder_->createSelectionMerge(&rt_blend_enabled_merge, - spv::SelectionControlDontFlattenMask); - builder_->createConditionalBranch( - rt_blend_enabled, &rt_blend_enabled_if, &rt_blend_enabled_else); - - // Blending path. + SpirvBuilder::IfBuilder if_rt_blend_enabled( + rt_blend_enabled, spv::SelectionControlDontFlattenMask, *builder_); { - builder_->setBuildPoint(&rt_blend_enabled_if); + // Blending path. // Get various parameters used in blending. 
spv::Id rt_color_is_fixed_point = builder_->createBinOp( @@ -1097,15 +1024,9 @@ void SpirvShaderTranslator::CompleteFragmentShaderInMain() { // Blend and mask each sample. for (uint32_t i = 0; i < 4; ++i) { - spv::Block& block_sample_covered = builder_->makeNewBlock(); - spv::Block& block_sample_covered_merge = builder_->makeNewBlock(); - builder_->createSelectionMerge( - &block_sample_covered_merge, - spv::SelectionControlDontFlattenMask); - builder_->createConditionalBranch(fsi_samples_covered[i], - &block_sample_covered, - &block_sample_covered_merge); - builder_->setBuildPoint(&block_sample_covered); + SpirvBuilder::IfBuilder if_sample_covered( + fsi_samples_covered[i], spv::SelectionControlDontFlattenMask, + *builder_); spv::Id rt_sample_address = FSI_AddSampleOffset(rt_sample_0_address, i, rt_is_64bpp); @@ -1131,26 +1052,13 @@ void SpirvShaderTranslator::CompleteFragmentShaderInMain() { dest_packed[0] = builder_->createLoad(rt_access_chain_0, spv::NoPrecision); { - spv::Block& block_load_64bpp_head = *builder_->getBuildPoint(); - spv::Block& block_load_64bpp = builder_->makeNewBlock(); - spv::Block& block_load_64bpp_merge = builder_->makeNewBlock(); - builder_->createSelectionMerge( - &block_load_64bpp_merge, - spv::SelectionControlDontFlattenMask); - builder_->createConditionalBranch(rt_is_64bpp, &block_load_64bpp, - &block_load_64bpp_merge); - builder_->setBuildPoint(&block_load_64bpp); + SpirvBuilder::IfBuilder if_64bpp( + rt_is_64bpp, spv::SelectionControlDontFlattenMask, *builder_); spv::Id dest_packed_64bpp_high = builder_->createLoad(rt_access_chain_1, spv::NoPrecision); - builder_->createBranch(&block_load_64bpp_merge); - builder_->setBuildPoint(&block_load_64bpp_merge); - id_vector_temp_.clear(); - id_vector_temp_.push_back(dest_packed_64bpp_high); - id_vector_temp_.push_back(block_load_64bpp.getId()); - id_vector_temp_.push_back(const_uint_0_); - id_vector_temp_.push_back(block_load_64bpp_head.getId()); - dest_packed[1] = - 
builder_->createOp(spv::OpPhi, type_uint_, id_vector_temp_); + if_64bpp.makeEndIf(); + dest_packed[1] = if_64bpp.createMergePhi(dest_packed_64bpp_high, + const_uint_0_); } std::array dest_unpacked = FSI_UnpackColor(dest_packed, rt_format_with_flags); @@ -1203,35 +1111,27 @@ void SpirvShaderTranslator::CompleteFragmentShaderInMain() { result_packed[0], rt_replace_mask[0])), rt_access_chain_0); - spv::Block& block_store_64bpp = builder_->makeNewBlock(); - spv::Block& block_store_64bpp_merge = builder_->makeNewBlock(); - builder_->createSelectionMerge( - &block_store_64bpp_merge, spv::SelectionControlDontFlattenMask); - builder_->createConditionalBranch(rt_is_64bpp, &block_store_64bpp, - &block_store_64bpp_merge); - builder_->setBuildPoint(&block_store_64bpp); - builder_->createStore( - builder_->createBinOp( - spv::OpBitwiseOr, type_uint_, - builder_->createBinOp(spv::OpBitwiseAnd, type_uint_, - dest_packed[1], rt_keep_mask[1]), - builder_->createBinOp(spv::OpBitwiseAnd, type_uint_, - result_packed[1], - rt_replace_mask[1])), - rt_access_chain_0); - builder_->createBranch(&block_store_64bpp_merge); - builder_->setBuildPoint(&block_store_64bpp_merge); + SpirvBuilder::IfBuilder if_64bpp( + rt_is_64bpp, spv::SelectionControlDontFlattenMask, *builder_); + { + builder_->createStore( + builder_->createBinOp( + spv::OpBitwiseOr, type_uint_, + builder_->createBinOp(spv::OpBitwiseAnd, type_uint_, + dest_packed[1], rt_keep_mask[1]), + builder_->createBinOp(spv::OpBitwiseAnd, type_uint_, + result_packed[1], + rt_replace_mask[1])), + rt_access_chain_1); + } + if_64bpp.makeEndIf(); - builder_->createBranch(&block_sample_covered_merge); - builder_->setBuildPoint(&block_sample_covered_merge); + if_sample_covered.makeEndIf(); } - - builder_->createBranch(&rt_blend_enabled_merge); } - - // Non-blending paths. + if_rt_blend_enabled.makeBeginElse(); { - builder_->setBuildPoint(&rt_blend_enabled_else); + // Non-blending paths. // Pack the new color for all samples. 
std::array color_packed = @@ -1244,19 +1144,12 @@ void SpirvShaderTranslator::CompleteFragmentShaderInMain() { rt_keep_mask[0], const_uint_0_), builder_->createBinOp(spv::OpINotEqual, type_bool_, rt_keep_mask[1], const_uint_0_)); - spv::Block& rt_keep_mask_not_empty_if = builder_->makeNewBlock(); - spv::Block& rt_keep_mask_not_empty_if_else = builder_->makeNewBlock(); - spv::Block& rt_keep_mask_not_empty_if_merge = - builder_->makeNewBlock(); - builder_->createSelectionMerge(&rt_keep_mask_not_empty_if_merge, - spv::SelectionControlDontFlattenMask); - builder_->createConditionalBranch(rt_keep_mask_not_empty, - &rt_keep_mask_not_empty_if, - &rt_keep_mask_not_empty_if_else); - // Loading and masking path. + SpirvBuilder::IfBuilder if_rt_keep_mask_not_empty( + rt_keep_mask_not_empty, spv::SelectionControlDontFlattenMask, + *builder_); { - builder_->setBuildPoint(&rt_keep_mask_not_empty_if); + // Loading and masking path. std::array color_packed_masked; for (uint32_t i = 0; i < 2; ++i) { color_packed_masked[i] = builder_->createBinOp( @@ -1265,15 +1158,9 @@ void SpirvShaderTranslator::CompleteFragmentShaderInMain() { rt_keep_mask[i])); } for (uint32_t i = 0; i < 4; ++i) { - spv::Block& block_sample_covered = builder_->makeNewBlock(); - spv::Block& block_sample_covered_merge = builder_->makeNewBlock(); - builder_->createSelectionMerge( - &block_sample_covered_merge, - spv::SelectionControlDontFlattenMask); - builder_->createConditionalBranch(fsi_samples_covered[i], - &block_sample_covered, - &block_sample_covered_merge); - builder_->setBuildPoint(&block_sample_covered); + SpirvBuilder::IfBuilder if_sample_covered( + fsi_samples_covered[i], spv::SelectionControlDontFlattenMask, + *builder_); spv::Id rt_sample_address = FSI_AddSampleOffset(rt_sample_0_address, i, rt_is_64bpp); id_vector_temp_.clear(); @@ -1295,52 +1182,38 @@ void SpirvShaderTranslator::CompleteFragmentShaderInMain() { rt_keep_mask[0]), color_packed_masked[0]), rt_access_chain_0); - spv::Block& 
block_store_64bpp = builder_->makeNewBlock(); - spv::Block& block_store_64bpp_merge = builder_->makeNewBlock(); - builder_->createSelectionMerge( - &block_store_64bpp_merge, - spv::SelectionControlDontFlattenMask); - builder_->createConditionalBranch(rt_is_64bpp, &block_store_64bpp, - &block_store_64bpp_merge); - builder_->setBuildPoint(&block_store_64bpp); - id_vector_temp_.back() = builder_->createBinOp( - spv::OpIAdd, type_int_, rt_sample_address, fsi_const_int_1); - spv::Id rt_access_chain_1 = builder_->createAccessChain( - features_.spirv_version >= spv::Spv_1_3 - ? spv::StorageClassStorageBuffer - : spv::StorageClassUniform, - buffer_edram_, id_vector_temp_); - builder_->createStore( - builder_->createBinOp( - spv::OpBitwiseOr, type_uint_, - builder_->createBinOp( - spv::OpBitwiseAnd, type_uint_, - builder_->createLoad(rt_access_chain_1, - spv::NoPrecision), - rt_keep_mask[1]), - color_packed_masked[1]), - rt_access_chain_1); - builder_->createBranch(&block_store_64bpp_merge); - builder_->setBuildPoint(&block_store_64bpp_merge); - builder_->createBranch(&block_sample_covered_merge); - builder_->setBuildPoint(&block_sample_covered_merge); + SpirvBuilder::IfBuilder if_64bpp( + rt_is_64bpp, spv::SelectionControlDontFlattenMask, *builder_); + { + id_vector_temp_.back() = builder_->createBinOp( + spv::OpIAdd, type_int_, rt_sample_address, fsi_const_int_1); + spv::Id rt_access_chain_1 = builder_->createAccessChain( + features_.spirv_version >= spv::Spv_1_3 + ? 
spv::StorageClassStorageBuffer + : spv::StorageClassUniform, + buffer_edram_, id_vector_temp_); + builder_->createStore( + builder_->createBinOp( + spv::OpBitwiseOr, type_uint_, + builder_->createBinOp( + spv::OpBitwiseAnd, type_uint_, + builder_->createLoad(rt_access_chain_1, + spv::NoPrecision), + rt_keep_mask[1]), + color_packed_masked[1]), + rt_access_chain_1); + } + if_64bpp.makeEndIf(); + if_sample_covered.makeEndIf(); } - builder_->createBranch(&rt_keep_mask_not_empty_if_merge); } - - // Fully overwriting path. + if_rt_keep_mask_not_empty.makeBeginElse(); { - builder_->setBuildPoint(&rt_keep_mask_not_empty_if_else); + // Fully overwriting path. for (uint32_t i = 0; i < 4; ++i) { - spv::Block& block_sample_covered = builder_->makeNewBlock(); - spv::Block& block_sample_covered_merge = builder_->makeNewBlock(); - builder_->createSelectionMerge( - &block_sample_covered_merge, - spv::SelectionControlDontFlattenMask); - builder_->createConditionalBranch(fsi_samples_covered[i], - &block_sample_covered, - &block_sample_covered_merge); - builder_->setBuildPoint(&block_sample_covered); + SpirvBuilder::IfBuilder if_sample_covered( + fsi_samples_covered[i], spv::SelectionControlDontFlattenMask, + *builder_); spv::Id rt_sample_address = FSI_AddSampleOffset(rt_sample_0_address, i, rt_is_64bpp); id_vector_temp_.clear(); @@ -1353,40 +1226,29 @@ void SpirvShaderTranslator::CompleteFragmentShaderInMain() { ? 
spv::StorageClassStorageBuffer : spv::StorageClassUniform, buffer_edram_, id_vector_temp_)); - spv::Block& block_store_64bpp = builder_->makeNewBlock(); - spv::Block& block_store_64bpp_merge = builder_->makeNewBlock(); - builder_->createSelectionMerge( - &block_store_64bpp_merge, - spv::SelectionControlDontFlattenMask); - builder_->createConditionalBranch(rt_is_64bpp, &block_store_64bpp, - &block_store_64bpp_merge); - builder_->setBuildPoint(&block_store_64bpp); - id_vector_temp_.back() = builder_->createBinOp( - spv::OpIAdd, type_int_, id_vector_temp_.back(), - fsi_const_int_1); - builder_->createStore(color_packed[1], - builder_->createAccessChain( - features_.spirv_version >= spv::Spv_1_3 - ? spv::StorageClassStorageBuffer - : spv::StorageClassUniform, - buffer_edram_, id_vector_temp_)); - builder_->createBranch(&block_store_64bpp_merge); - builder_->setBuildPoint(&block_store_64bpp_merge); - builder_->createBranch(&block_sample_covered_merge); - builder_->setBuildPoint(&block_sample_covered_merge); + SpirvBuilder::IfBuilder if_64bpp( + rt_is_64bpp, spv::SelectionControlDontFlattenMask, *builder_); + { + id_vector_temp_.back() = builder_->createBinOp( + spv::OpIAdd, type_int_, id_vector_temp_.back(), + fsi_const_int_1); + builder_->createStore( + color_packed[1], builder_->createAccessChain( + features_.spirv_version >= spv::Spv_1_3 + ? 
spv::StorageClassStorageBuffer + : spv::StorageClassUniform, + buffer_edram_, id_vector_temp_)); + } + if_64bpp.makeEndIf(); + if_sample_covered.makeEndIf(); } - builder_->createBranch(&rt_keep_mask_not_empty_if_merge); } - - builder_->setBuildPoint(&rt_keep_mask_not_empty_if_merge); - builder_->createBranch(&rt_blend_enabled_merge); + if_rt_keep_mask_not_empty.makeEndIf(); } + if_rt_blend_enabled.makeEndIf(); - builder_->setBuildPoint(&rt_blend_enabled_merge); - builder_->createBranch(&rt_write_mask_not_empty_if_merge); - builder_->setBuildPoint(&rt_write_mask_not_empty_if_merge); - builder_->createBranch(&fsi_color_written_if_merge); - builder_->setBuildPoint(&fsi_color_written_if_merge); + if_rt_write_mask_not_empty.makeEndIf(); + if_fsi_color_written.makeEndIf(); } else { // Convert to gamma space - this is incorrect, since it must be done // after blending on the Xbox 360, but this is just one of many blending @@ -1405,24 +1267,11 @@ void SpirvShaderTranslator::CompleteFragmentShaderInMain() { builder_->makeUintConstant(kSysFlag_ConvertColor0ToGamma << color_target_index)), const_uint_0_); - spv::Block& block_gamma_head = *builder_->getBuildPoint(); - spv::Block& block_gamma = builder_->makeNewBlock(); - spv::Block& block_gamma_merge = builder_->makeNewBlock(); - builder_->createSelectionMerge(&block_gamma_merge, - spv::SelectionControlDontFlattenMask); - builder_->createConditionalBranch(is_gamma, &block_gamma, - &block_gamma_merge); - builder_->setBuildPoint(&block_gamma); + SpirvBuilder::IfBuilder if_gamma( + is_gamma, spv::SelectionControlDontFlattenMask, *builder_); spv::Id color_rgb_gamma = LinearToPWLGamma(color_rgb, false); - builder_->createBranch(&block_gamma_merge); - builder_->setBuildPoint(&block_gamma_merge); - id_vector_temp_.clear(); - id_vector_temp_.push_back(color_rgb_gamma); - id_vector_temp_.push_back(block_gamma.getId()); - id_vector_temp_.push_back(color_rgb); - id_vector_temp_.push_back(block_gamma_head.getId()); - color_rgb = - 
builder_->createOp(spv::OpPhi, type_float3_, id_vector_temp_); + if_gamma.makeEndIf(); + color_rgb = if_gamma.createMergePhi(color_rgb_gamma, color_rgb); { std::unique_ptr color_rgba_shuffle_op = std::make_unique( @@ -1752,15 +1601,8 @@ void SpirvShaderTranslator::FSI_DepthStencilTest( spv::OpBitwiseAnd, type_uint_, main_system_constant_flags_, builder_->makeUintConstant(kSysFlag_FSIDepthStencil)), const_uint_0_); - spv::Block& block_depth_stencil_enabled_head = *builder_->getBuildPoint(); - spv::Block& block_depth_stencil_enabled = builder_->makeNewBlock(); - spv::Block& block_depth_stencil_enabled_merge = builder_->makeNewBlock(); - builder_->createSelectionMerge(&block_depth_stencil_enabled_merge, - spv::SelectionControlDontFlattenMask); - builder_->createConditionalBranch(depth_stencil_enabled, - &block_depth_stencil_enabled, - &block_depth_stencil_enabled_merge); - builder_->setBuildPoint(&block_depth_stencil_enabled); + SpirvBuilder::IfBuilder if_depth_stencil_enabled( + depth_stencil_enabled, spv::SelectionControlDontFlattenMask, *builder_); // Load the depth in the center of the pixel and calculate the derivatives of // the depth outside non-uniform control flow. 
@@ -1976,14 +1818,8 @@ void SpirvShaderTranslator::FSI_DepthStencilTest( builder_->createBinOp(spv::OpBitwiseAnd, type_uint_, new_sample_mask, builder_->makeUintConstant(uint32_t(1) << i)), const_uint_0_); - spv::Block& block_sample_covered_head = *builder_->getBuildPoint(); - spv::Block& block_sample_covered = builder_->makeNewBlock(); - spv::Block& block_sample_covered_merge = builder_->makeNewBlock(); - builder_->createSelectionMerge(&block_sample_covered_merge, - spv::SelectionControlDontFlattenMask); - builder_->createConditionalBranch(sample_covered, &block_sample_covered, - &block_sample_covered_merge); - builder_->setBuildPoint(&block_sample_covered); + SpirvBuilder::IfBuilder if_sample_covered( + sample_covered, spv::SelectionControlDontFlattenMask, *builder_); // Load the original depth and stencil for the sample. spv::Id sample_address = FSI_AddSampleOffset(main_fsi_address_depth_, i); @@ -2074,21 +1910,11 @@ void SpirvShaderTranslator::FSI_DepthStencilTest( const_float_0_, const_float_1_); // Convert the new depth to 24-bit. - spv::Block& block_depth_format_float = builder_->makeNewBlock(); - spv::Block& block_depth_format_unorm = builder_->makeNewBlock(); - spv::Block& block_depth_format_merge = builder_->makeNewBlock(); - builder_->createSelectionMerge(&block_depth_format_merge, - spv::SelectionControlDontFlattenMask); - builder_->createConditionalBranch( - depth_is_float24, &block_depth_format_float, &block_depth_format_unorm); - // Float24 case. - builder_->setBuildPoint(&block_depth_format_float); + SpirvBuilder::IfBuilder depth_format_if( + depth_is_float24, spv::SelectionControlDontFlattenMask, *builder_); spv::Id sample_depth_float24 = SpirvShaderTranslator::PreClampedDepthTo20e4( *builder_, sample_depth32, true, false, ext_inst_glsl_std_450_); - builder_->createBranch(&block_depth_format_merge); - spv::Block& block_depth_format_float_end = *builder_->getBuildPoint(); - // Unorm24 case. 
- builder_->setBuildPoint(&block_depth_format_unorm); + depth_format_if.makeBeginElse(); // Round to the nearest even integer. This seems to be the correct // conversion, adding +0.5 and rounding towards zero results in red instead // of black in the 4D5307E6 clear shader. @@ -2099,17 +1925,10 @@ void SpirvShaderTranslator::FSI_DepthStencilTest( builder_->createNoContractionBinOp( spv::OpFMul, type_float_, sample_depth32, builder_->makeFloatConstant(float(0xFFFFFF))))); - builder_->createBranch(&block_depth_format_merge); - spv::Block& block_depth_format_unorm_end = *builder_->getBuildPoint(); + depth_format_if.makeEndIf(); // Merge between the two formats. - builder_->setBuildPoint(&block_depth_format_merge); - id_vector_temp_.clear(); - id_vector_temp_.push_back(sample_depth_float24); - id_vector_temp_.push_back(block_depth_format_float_end.getId()); - id_vector_temp_.push_back(sample_depth_unorm24); - id_vector_temp_.push_back(block_depth_format_unorm_end.getId()); - spv::Id sample_depth24 = - builder_->createOp(spv::OpPhi, type_uint_, id_vector_temp_); + spv::Id sample_depth24 = depth_format_if.createMergePhi( + sample_depth_float24, sample_depth_unorm24); // Perform the depth test. spv::Id old_depth = builder_->createBinOp( @@ -2131,206 +1950,188 @@ void SpirvShaderTranslator::FSI_DepthStencilTest( builder_->createBinOp(spv::OpUGreaterThan, type_bool_, sample_depth24, old_depth))); - // Begin the stencil test. - spv::Block& block_stencil_enabled_head = *builder_->getBuildPoint(); - spv::Block& block_stencil_enabled = builder_->makeNewBlock(); - spv::Block& block_stencil_enabled_merge = builder_->makeNewBlock(); - builder_->createSelectionMerge(&block_stencil_enabled_merge, - spv::SelectionControlDontFlattenMask); - builder_->createConditionalBranch(stencil_enabled, &block_stencil_enabled, - &block_stencil_enabled_merge); - builder_->setBuildPoint(&block_stencil_enabled); - - // Perform the stencil test. 
- // The read mask has zeros in the upper bits, applying it to the combined - // stencil and depth will remove the depth part. - spv::Id old_stencil_read_masked = builder_->createBinOp( - spv::OpBitwiseAnd, type_uint_, old_depth_stencil, stencil_read_mask); - spv::Id stencil_passed_if_enabled = builder_->createBinOp( - spv::OpLogicalAnd, type_bool_, stencil_pass_if_less, - builder_->createBinOp(spv::OpULessThan, type_bool_, - stencil_reference_read_masked, - old_stencil_read_masked)); - stencil_passed_if_enabled = builder_->createBinOp( - spv::OpLogicalOr, type_bool_, stencil_passed_if_enabled, - builder_->createBinOp( - spv::OpLogicalAnd, type_bool_, stencil_pass_if_equal, - builder_->createBinOp(spv::OpIEqual, type_bool_, - stencil_reference_read_masked, - old_stencil_read_masked))); - stencil_passed_if_enabled = builder_->createBinOp( - spv::OpLogicalOr, type_bool_, stencil_passed_if_enabled, - builder_->createBinOp( - spv::OpLogicalAnd, type_bool_, stencil_pass_if_greater, - builder_->createBinOp(spv::OpUGreaterThan, type_bool_, - stencil_reference_read_masked, - old_stencil_read_masked))); - spv::Id stencil_op = builder_->createTriOp( - spv::OpBitFieldUExtract, type_uint_, stencil_func_ops, - builder_->createTriOp( - spv::OpSelect, type_uint_, stencil_passed_if_enabled, - builder_->createTriOp(spv::OpSelect, type_uint_, depth_passed, - builder_->makeUintConstant(6), - builder_->makeUintConstant(9)), - builder_->makeUintConstant(3)), - builder_->makeUintConstant(3)); - spv::Block& block_stencil_op_head = *builder_->getBuildPoint(); - spv::Block& block_stencil_op_keep = builder_->makeNewBlock(); - spv::Block& block_stencil_op_zero = builder_->makeNewBlock(); - spv::Block& block_stencil_op_replace = builder_->makeNewBlock(); - spv::Block& block_stencil_op_increment_clamp = builder_->makeNewBlock(); - spv::Block& block_stencil_op_decrement_clamp = builder_->makeNewBlock(); - spv::Block& block_stencil_op_invert = builder_->makeNewBlock(); - spv::Block& 
block_stencil_op_increment_wrap = builder_->makeNewBlock(); - spv::Block& block_stencil_op_decrement_wrap = builder_->makeNewBlock(); - spv::Block& block_stencil_op_merge = builder_->makeNewBlock(); - builder_->createSelectionMerge(&block_stencil_op_merge, - spv::SelectionControlDontFlattenMask); + // Perform the stencil test if enabled. + SpirvBuilder::IfBuilder stencil_if( + stencil_enabled, spv::SelectionControlDontFlattenMask, *builder_); + spv::Id stencil_passed_if_enabled; + spv::Id new_stencil_and_old_depth_if_stencil_enabled; { - std::unique_ptr stencil_op_switch_op = - std::make_unique(spv::OpSwitch); - stencil_op_switch_op->addIdOperand(stencil_op); - // Make keep the default. - stencil_op_switch_op->addIdOperand(block_stencil_op_keep.getId()); - stencil_op_switch_op->addImmediateOperand( - int32_t(xenos::StencilOp::kZero)); - stencil_op_switch_op->addIdOperand(block_stencil_op_zero.getId()); - stencil_op_switch_op->addImmediateOperand( - int32_t(xenos::StencilOp::kReplace)); - stencil_op_switch_op->addIdOperand(block_stencil_op_replace.getId()); - stencil_op_switch_op->addImmediateOperand( - int32_t(xenos::StencilOp::kIncrementClamp)); - stencil_op_switch_op->addIdOperand( - block_stencil_op_increment_clamp.getId()); - stencil_op_switch_op->addImmediateOperand( - int32_t(xenos::StencilOp::kDecrementClamp)); - stencil_op_switch_op->addIdOperand( - block_stencil_op_decrement_clamp.getId()); - stencil_op_switch_op->addImmediateOperand( - int32_t(xenos::StencilOp::kInvert)); - stencil_op_switch_op->addIdOperand(block_stencil_op_invert.getId()); - stencil_op_switch_op->addImmediateOperand( - int32_t(xenos::StencilOp::kIncrementWrap)); - stencil_op_switch_op->addIdOperand( - block_stencil_op_increment_wrap.getId()); - stencil_op_switch_op->addImmediateOperand( - int32_t(xenos::StencilOp::kDecrementWrap)); - stencil_op_switch_op->addIdOperand( - block_stencil_op_decrement_wrap.getId()); - builder_->getBuildPoint()->addInstruction( - 
std::move(stencil_op_switch_op)); + // The read mask has zeros in the upper bits, applying it to the combined + // stencil and depth will remove the depth part. + spv::Id old_stencil_read_masked = builder_->createBinOp( + spv::OpBitwiseAnd, type_uint_, old_depth_stencil, stencil_read_mask); + stencil_passed_if_enabled = builder_->createBinOp( + spv::OpLogicalAnd, type_bool_, stencil_pass_if_less, + builder_->createBinOp(spv::OpULessThan, type_bool_, + stencil_reference_read_masked, + old_stencil_read_masked)); + stencil_passed_if_enabled = builder_->createBinOp( + spv::OpLogicalOr, type_bool_, stencil_passed_if_enabled, + builder_->createBinOp( + spv::OpLogicalAnd, type_bool_, stencil_pass_if_equal, + builder_->createBinOp(spv::OpIEqual, type_bool_, + stencil_reference_read_masked, + old_stencil_read_masked))); + stencil_passed_if_enabled = builder_->createBinOp( + spv::OpLogicalOr, type_bool_, stencil_passed_if_enabled, + builder_->createBinOp( + spv::OpLogicalAnd, type_bool_, stencil_pass_if_greater, + builder_->createBinOp(spv::OpUGreaterThan, type_bool_, + stencil_reference_read_masked, + old_stencil_read_masked))); + spv::Id stencil_op = builder_->createTriOp( + spv::OpBitFieldUExtract, type_uint_, stencil_func_ops, + builder_->createTriOp( + spv::OpSelect, type_uint_, stencil_passed_if_enabled, + builder_->createTriOp(spv::OpSelect, type_uint_, depth_passed, + builder_->makeUintConstant(6), + builder_->makeUintConstant(9)), + builder_->makeUintConstant(3)), + builder_->makeUintConstant(3)); + spv::Block& block_stencil_op_head = *builder_->getBuildPoint(); + spv::Block& block_stencil_op_keep = builder_->makeNewBlock(); + spv::Block& block_stencil_op_zero = builder_->makeNewBlock(); + spv::Block& block_stencil_op_replace = builder_->makeNewBlock(); + spv::Block& block_stencil_op_increment_clamp = builder_->makeNewBlock(); + spv::Block& block_stencil_op_decrement_clamp = builder_->makeNewBlock(); + spv::Block& block_stencil_op_invert = builder_->makeNewBlock(); 
+ spv::Block& block_stencil_op_increment_wrap = builder_->makeNewBlock(); + spv::Block& block_stencil_op_decrement_wrap = builder_->makeNewBlock(); + spv::Block& block_stencil_op_merge = builder_->makeNewBlock(); + builder_->createSelectionMerge(&block_stencil_op_merge, + spv::SelectionControlDontFlattenMask); + { + std::unique_ptr stencil_op_switch_op = + std::make_unique(spv::OpSwitch); + stencil_op_switch_op->addIdOperand(stencil_op); + // Make keep the default. + stencil_op_switch_op->addIdOperand(block_stencil_op_keep.getId()); + stencil_op_switch_op->addImmediateOperand( + int32_t(xenos::StencilOp::kZero)); + stencil_op_switch_op->addIdOperand(block_stencil_op_zero.getId()); + stencil_op_switch_op->addImmediateOperand( + int32_t(xenos::StencilOp::kReplace)); + stencil_op_switch_op->addIdOperand(block_stencil_op_replace.getId()); + stencil_op_switch_op->addImmediateOperand( + int32_t(xenos::StencilOp::kIncrementClamp)); + stencil_op_switch_op->addIdOperand( + block_stencil_op_increment_clamp.getId()); + stencil_op_switch_op->addImmediateOperand( + int32_t(xenos::StencilOp::kDecrementClamp)); + stencil_op_switch_op->addIdOperand( + block_stencil_op_decrement_clamp.getId()); + stencil_op_switch_op->addImmediateOperand( + int32_t(xenos::StencilOp::kInvert)); + stencil_op_switch_op->addIdOperand(block_stencil_op_invert.getId()); + stencil_op_switch_op->addImmediateOperand( + int32_t(xenos::StencilOp::kIncrementWrap)); + stencil_op_switch_op->addIdOperand( + block_stencil_op_increment_wrap.getId()); + stencil_op_switch_op->addImmediateOperand( + int32_t(xenos::StencilOp::kDecrementWrap)); + stencil_op_switch_op->addIdOperand( + block_stencil_op_decrement_wrap.getId()); + builder_->getBuildPoint()->addInstruction( + std::move(stencil_op_switch_op)); + } + block_stencil_op_keep.addPredecessor(&block_stencil_op_head); + block_stencil_op_zero.addPredecessor(&block_stencil_op_head); + block_stencil_op_replace.addPredecessor(&block_stencil_op_head); + 
block_stencil_op_increment_clamp.addPredecessor(&block_stencil_op_head); + block_stencil_op_decrement_clamp.addPredecessor(&block_stencil_op_head); + block_stencil_op_invert.addPredecessor(&block_stencil_op_head); + block_stencil_op_increment_wrap.addPredecessor(&block_stencil_op_head); + block_stencil_op_decrement_wrap.addPredecessor(&block_stencil_op_head); + // Keep - will use the old stencil in the phi. + builder_->setBuildPoint(&block_stencil_op_keep); + builder_->createBranch(&block_stencil_op_merge); + // Zero - will use the zero constant in the phi. + builder_->setBuildPoint(&block_stencil_op_zero); + builder_->createBranch(&block_stencil_op_merge); + // Replace - will use the stencil reference in the phi. + builder_->setBuildPoint(&block_stencil_op_replace); + builder_->createBranch(&block_stencil_op_merge); + // Increment and clamp. + builder_->setBuildPoint(&block_stencil_op_increment_clamp); + spv::Id new_stencil_in_low_bits_increment_clamp = builder_->createBinOp( + spv::OpIAdd, type_uint_, + builder_->createBinBuiltinCall( + type_uint_, ext_inst_glsl_std_450_, GLSLstd450UMin, + builder_->makeUintConstant(UINT8_MAX - 1), + builder_->createBinOp(spv::OpBitwiseAnd, type_uint_, + old_depth_stencil, + builder_->makeUintConstant(UINT8_MAX))), + const_uint_1); + builder_->createBranch(&block_stencil_op_merge); + // Decrement and clamp. + builder_->setBuildPoint(&block_stencil_op_decrement_clamp); + spv::Id new_stencil_in_low_bits_decrement_clamp = builder_->createBinOp( + spv::OpISub, type_uint_, + builder_->createBinBuiltinCall( + type_uint_, ext_inst_glsl_std_450_, GLSLstd450UMax, const_uint_1, + builder_->createBinOp(spv::OpBitwiseAnd, type_uint_, + old_depth_stencil, + builder_->makeUintConstant(UINT8_MAX))), + const_uint_1); + builder_->createBranch(&block_stencil_op_merge); + // Invert. 
+ builder_->setBuildPoint(&block_stencil_op_invert); + spv::Id new_stencil_in_low_bits_invert = + builder_->createUnaryOp(spv::OpNot, type_uint_, old_depth_stencil); + builder_->createBranch(&block_stencil_op_merge); + // Increment and wrap. + // The upper bits containing the old depth have no effect on the behavior. + builder_->setBuildPoint(&block_stencil_op_increment_wrap); + spv::Id new_stencil_in_low_bits_increment_wrap = builder_->createBinOp( + spv::OpIAdd, type_uint_, old_depth_stencil, const_uint_1); + builder_->createBranch(&block_stencil_op_merge); + // Decrement and wrap. + // The upper bits containing the old depth have no effect on the behavior. + builder_->setBuildPoint(&block_stencil_op_decrement_wrap); + spv::Id new_stencil_in_low_bits_decrement_wrap = builder_->createBinOp( + spv::OpISub, type_uint_, old_depth_stencil, const_uint_1); + builder_->createBranch(&block_stencil_op_merge); + // Select the new stencil (with undefined data in bits starting from 8) + // based on the stencil operation. 
+ builder_->setBuildPoint(&block_stencil_op_merge); + id_vector_temp_.clear(); + id_vector_temp_.reserve(2 * 8); + id_vector_temp_.push_back(old_depth_stencil); + id_vector_temp_.push_back(block_stencil_op_keep.getId()); + id_vector_temp_.push_back(const_uint_0_); + id_vector_temp_.push_back(block_stencil_op_zero.getId()); + id_vector_temp_.push_back(stencil_reference); + id_vector_temp_.push_back(block_stencil_op_replace.getId()); + id_vector_temp_.push_back(new_stencil_in_low_bits_increment_clamp); + id_vector_temp_.push_back(block_stencil_op_increment_clamp.getId()); + id_vector_temp_.push_back(new_stencil_in_low_bits_decrement_clamp); + id_vector_temp_.push_back(block_stencil_op_decrement_clamp.getId()); + id_vector_temp_.push_back(new_stencil_in_low_bits_invert); + id_vector_temp_.push_back(block_stencil_op_invert.getId()); + id_vector_temp_.push_back(new_stencil_in_low_bits_increment_wrap); + id_vector_temp_.push_back(block_stencil_op_increment_wrap.getId()); + id_vector_temp_.push_back(new_stencil_in_low_bits_decrement_wrap); + id_vector_temp_.push_back(block_stencil_op_decrement_wrap.getId()); + spv::Id new_stencil_in_low_bits_if_enabled = + builder_->createOp(spv::OpPhi, type_uint_, id_vector_temp_); + // Merge the old depth / stencil (old depth kept from the old depth / + // stencil so the separate old depth register is not needed anymore after + // the depth test) and the new stencil based on the write mask. 
+ new_stencil_and_old_depth_if_stencil_enabled = builder_->createBinOp( + spv::OpBitwiseOr, type_uint_, + builder_->createBinOp(spv::OpBitwiseAnd, type_uint_, + old_depth_stencil, stencil_write_keep_mask), + builder_->createBinOp(spv::OpBitwiseAnd, type_uint_, + new_stencil_in_low_bits_if_enabled, + stencil_write_mask)); } - block_stencil_op_keep.addPredecessor(&block_stencil_op_head); - block_stencil_op_zero.addPredecessor(&block_stencil_op_head); - block_stencil_op_replace.addPredecessor(&block_stencil_op_head); - block_stencil_op_increment_clamp.addPredecessor(&block_stencil_op_head); - block_stencil_op_decrement_clamp.addPredecessor(&block_stencil_op_head); - block_stencil_op_invert.addPredecessor(&block_stencil_op_head); - block_stencil_op_increment_wrap.addPredecessor(&block_stencil_op_head); - block_stencil_op_decrement_wrap.addPredecessor(&block_stencil_op_head); - // Keep - will use the old stencil in the phi. - builder_->setBuildPoint(&block_stencil_op_keep); - builder_->createBranch(&block_stencil_op_merge); - // Zero - will use the zero constant in the phi. - builder_->setBuildPoint(&block_stencil_op_zero); - builder_->createBranch(&block_stencil_op_merge); - // Replace - will use the stencil reference in the phi. - builder_->setBuildPoint(&block_stencil_op_replace); - builder_->createBranch(&block_stencil_op_merge); - // Increment and clamp. - builder_->setBuildPoint(&block_stencil_op_increment_clamp); - spv::Id new_stencil_in_low_bits_increment_clamp = builder_->createBinOp( - spv::OpIAdd, type_uint_, - builder_->createBinBuiltinCall( - type_uint_, ext_inst_glsl_std_450_, GLSLstd450UMin, - builder_->makeUintConstant(UINT8_MAX - 1), - builder_->createBinOp(spv::OpBitwiseAnd, type_uint_, - old_depth_stencil, - builder_->makeUintConstant(UINT8_MAX))), - const_uint_1); - builder_->createBranch(&block_stencil_op_merge); - // Decrement and clamp. 
- builder_->setBuildPoint(&block_stencil_op_decrement_clamp); - spv::Id new_stencil_in_low_bits_decrement_clamp = builder_->createBinOp( - spv::OpISub, type_uint_, - builder_->createBinBuiltinCall( - type_uint_, ext_inst_glsl_std_450_, GLSLstd450UMax, const_uint_1, - builder_->createBinOp(spv::OpBitwiseAnd, type_uint_, - old_depth_stencil, - builder_->makeUintConstant(UINT8_MAX))), - const_uint_1); - builder_->createBranch(&block_stencil_op_merge); - // Invert. - builder_->setBuildPoint(&block_stencil_op_invert); - spv::Id new_stencil_in_low_bits_invert = - builder_->createUnaryOp(spv::OpNot, type_uint_, old_depth_stencil); - builder_->createBranch(&block_stencil_op_merge); - // Increment and wrap. - // The upper bits containing the old depth have no effect on the behavior. - builder_->setBuildPoint(&block_stencil_op_increment_wrap); - spv::Id new_stencil_in_low_bits_increment_wrap = builder_->createBinOp( - spv::OpIAdd, type_uint_, old_depth_stencil, const_uint_1); - builder_->createBranch(&block_stencil_op_merge); - // Decrement and wrap. - // The upper bits containing the old depth have no effect on the behavior. - builder_->setBuildPoint(&block_stencil_op_decrement_wrap); - spv::Id new_stencil_in_low_bits_decrement_wrap = builder_->createBinOp( - spv::OpISub, type_uint_, old_depth_stencil, const_uint_1); - builder_->createBranch(&block_stencil_op_merge); - // Select the new stencil (with undefined data in bits starting from 8) - // based on the stencil operation. 
- builder_->setBuildPoint(&block_stencil_op_merge); - id_vector_temp_.clear(); - id_vector_temp_.reserve(2 * 8); - id_vector_temp_.push_back(old_depth_stencil); - id_vector_temp_.push_back(block_stencil_op_keep.getId()); - id_vector_temp_.push_back(const_uint_0_); - id_vector_temp_.push_back(block_stencil_op_zero.getId()); - id_vector_temp_.push_back(stencil_reference); - id_vector_temp_.push_back(block_stencil_op_replace.getId()); - id_vector_temp_.push_back(new_stencil_in_low_bits_increment_clamp); - id_vector_temp_.push_back(block_stencil_op_increment_clamp.getId()); - id_vector_temp_.push_back(new_stencil_in_low_bits_decrement_clamp); - id_vector_temp_.push_back(block_stencil_op_decrement_clamp.getId()); - id_vector_temp_.push_back(new_stencil_in_low_bits_invert); - id_vector_temp_.push_back(block_stencil_op_invert.getId()); - id_vector_temp_.push_back(new_stencil_in_low_bits_increment_wrap); - id_vector_temp_.push_back(block_stencil_op_increment_wrap.getId()); - id_vector_temp_.push_back(new_stencil_in_low_bits_decrement_wrap); - id_vector_temp_.push_back(block_stencil_op_decrement_wrap.getId()); - spv::Id new_stencil_in_low_bits_if_enabled = - builder_->createOp(spv::OpPhi, type_uint_, id_vector_temp_); - // Merge the old depth / stencil (old depth kept from the old depth / - // stencil so the separate old depth register is not needed anymore after - // the depth test) and the new stencil based on the write mask. - spv::Id new_stencil_and_old_depth_if_stencil_enabled = - builder_->createBinOp( - spv::OpBitwiseOr, type_uint_, - builder_->createBinOp(spv::OpBitwiseAnd, type_uint_, - old_depth_stencil, stencil_write_keep_mask), - builder_->createBinOp(spv::OpBitwiseAnd, type_uint_, - new_stencil_in_low_bits_if_enabled, - stencil_write_mask)); - + stencil_if.makeEndIf(); // Choose the result based on whether the stencil test was done. // All phi operations must be the first in the block. 
- builder_->createBranch(&block_stencil_enabled_merge); - spv::Block& block_stencil_enabled_end = *builder_->getBuildPoint(); - builder_->setBuildPoint(&block_stencil_enabled_merge); - id_vector_temp_.clear(); - id_vector_temp_.push_back(stencil_passed_if_enabled); - id_vector_temp_.push_back(block_stencil_enabled_end.getId()); - id_vector_temp_.push_back(builder_->makeBoolConstant(true)); - id_vector_temp_.push_back(block_stencil_enabled_head.getId()); - spv::Id stencil_passed = - builder_->createOp(spv::OpPhi, type_bool_, id_vector_temp_); - id_vector_temp_.clear(); - id_vector_temp_.push_back(new_stencil_and_old_depth_if_stencil_enabled); - id_vector_temp_.push_back(block_stencil_enabled_end.getId()); - id_vector_temp_.push_back(old_depth_stencil); - id_vector_temp_.push_back(block_stencil_enabled_head.getId()); - spv::Id new_stencil_and_old_depth = - builder_->createOp(spv::OpPhi, type_uint_, id_vector_temp_); + spv::Id stencil_passed = stencil_if.createMergePhi( + stencil_passed_if_enabled, builder_->makeBoolConstant(true)); + spv::Id new_stencil_and_old_depth = stencil_if.createMergePhi( + new_stencil_and_old_depth_if_stencil_enabled, old_depth_stencil); // Check whether the tests have passed, and exclude the bit from the // coverage if not. 
@@ -2384,37 +2185,19 @@ void SpirvShaderTranslator::FSI_DepthStencilTest( new_depth_stencil_write_condition = new_depth_stencil_different; } if (new_depth_stencil_write_condition != spv::NoResult) { - spv::Block& block_depth_stencil_write = builder_->makeNewBlock(); - spv::Block& block_depth_stencil_write_merge = builder_->makeNewBlock(); - builder_->createSelectionMerge(&block_depth_stencil_write_merge, - spv::SelectionControlDontFlattenMask); - builder_->createConditionalBranch(new_depth_stencil_write_condition, - &block_depth_stencil_write, - &block_depth_stencil_write_merge); - builder_->setBuildPoint(&block_depth_stencil_write); + SpirvBuilder::IfBuilder new_depth_stencil_write_if( + new_depth_stencil_write_condition, + spv::SelectionControlDontFlattenMask, *builder_); builder_->createStore(new_depth_stencil, sample_access_chain); - builder_->createBranch(&block_depth_stencil_write_merge); - builder_->setBuildPoint(&block_depth_stencil_write_merge); + new_depth_stencil_write_if.makeEndIf(); } - builder_->createBranch(&block_sample_covered_merge); - spv::Block& block_sample_covered_end = *builder_->getBuildPoint(); - builder_->setBuildPoint(&block_sample_covered_merge); - id_vector_temp_.clear(); - id_vector_temp_.push_back(new_sample_mask_after_sample); - id_vector_temp_.push_back(block_sample_covered_end.getId()); - id_vector_temp_.push_back(new_sample_mask); - id_vector_temp_.push_back(block_sample_covered_head.getId()); - new_sample_mask = - builder_->createOp(spv::OpPhi, type_uint_, id_vector_temp_); + if_sample_covered.makeEndIf(); + new_sample_mask = if_sample_covered.createMergePhi( + new_sample_mask_after_sample, new_sample_mask); if (is_early) { - id_vector_temp_.clear(); - id_vector_temp_.push_back(new_depth_stencil); - id_vector_temp_.push_back(block_sample_covered_end.getId()); - id_vector_temp_.push_back(const_uint_0_); - id_vector_temp_.push_back(block_sample_covered_head.getId()); late_write_depth_stencil[i] = - builder_->createOp(spv::OpPhi, 
type_uint_, id_vector_temp_); + if_sample_covered.createMergePhi(new_depth_stencil, const_uint_0_); } } @@ -2442,25 +2225,14 @@ void SpirvShaderTranslator::FSI_DepthStencilTest( } } } - builder_->createBranch(&block_depth_stencil_enabled_merge); - spv::Block& block_depth_stencil_enabled_end = *builder_->getBuildPoint(); - builder_->setBuildPoint(&block_depth_stencil_enabled_merge); - id_vector_temp_.clear(); - id_vector_temp_.push_back(new_sample_mask); - id_vector_temp_.push_back(block_depth_stencil_enabled_end.getId()); - id_vector_temp_.push_back(main_fsi_sample_mask_); - id_vector_temp_.push_back(block_depth_stencil_enabled_head.getId()); - main_fsi_sample_mask_ = - builder_->createOp(spv::OpPhi, type_uint_, id_vector_temp_); + if_depth_stencil_enabled.makeEndIf(); + main_fsi_sample_mask_ = if_depth_stencil_enabled.createMergePhi( + new_sample_mask, main_fsi_sample_mask_); if (is_early) { for (uint32_t i = 0; i < 4; ++i) { - id_vector_temp_.clear(); - id_vector_temp_.push_back(late_write_depth_stencil[i]); - id_vector_temp_.push_back(block_depth_stencil_enabled_end.getId()); - id_vector_temp_.push_back(const_uint_0_); - id_vector_temp_.push_back(block_depth_stencil_enabled_head.getId()); main_fsi_late_write_depth_stencil_[i] = - builder_->createOp(spv::OpPhi, type_uint_, id_vector_temp_); + if_depth_stencil_enabled.createMergePhi(late_write_depth_stencil[i], + const_uint_0_); } } } @@ -3160,32 +2932,25 @@ spv::Id SpirvShaderTranslator::FSI_FlushNaNClampAndInBlending( assert_true(builder_->getTypeId(min_value) == color_or_alpha_type); assert_true(builder_->getTypeId(max_value) == color_or_alpha_type); - spv::Block& block_is_fixed_point_head = *builder_->getBuildPoint(); - spv::Block& block_is_fixed_point_if = builder_->makeNewBlock(); - spv::Block& block_is_fixed_point_merge = builder_->makeNewBlock(); - builder_->createSelectionMerge(&block_is_fixed_point_merge, - spv::SelectionControlDontFlattenMask); - builder_->createConditionalBranch(is_fixed_point, 
&block_is_fixed_point_if, - &block_is_fixed_point_merge); - builder_->setBuildPoint(&block_is_fixed_point_if); - // Flush NaN to 0 even for signed (NMax would flush it to the minimum value). - spv::Id color_or_alpha_clamped = builder_->createTriBuiltinCall( - color_or_alpha_type, ext_inst_glsl_std_450_, GLSLstd450FClamp, - builder_->createTriOp( - spv::OpSelect, color_or_alpha_type, - builder_->createUnaryOp(spv::OpIsNan, - type_bool_vectors_[component_count - 1], - color_or_alpha), - const_float_vectors_0_[component_count - 1], color_or_alpha), - min_value, max_value); - builder_->createBranch(&block_is_fixed_point_merge); - builder_->setBuildPoint(&block_is_fixed_point_merge); - id_vector_temp_.clear(); - id_vector_temp_.push_back(color_or_alpha_clamped); - id_vector_temp_.push_back(block_is_fixed_point_if.getId()); - id_vector_temp_.push_back(color_or_alpha); - id_vector_temp_.push_back(block_is_fixed_point_head.getId()); - return builder_->createOp(spv::OpPhi, color_or_alpha_type, id_vector_temp_); + SpirvBuilder::IfBuilder if_fixed_point( + is_fixed_point, spv::SelectionControlDontFlattenMask, *builder_); + spv::Id color_or_alpha_clamped; + { + // Flush NaN to 0 even for signed (NMax would flush it to the minimum + // value). + color_or_alpha_clamped = builder_->createTriBuiltinCall( + color_or_alpha_type, ext_inst_glsl_std_450_, GLSLstd450FClamp, + builder_->createTriOp( + spv::OpSelect, color_or_alpha_type, + builder_->createUnaryOp(spv::OpIsNan, + type_bool_vectors_[component_count - 1], + color_or_alpha), + const_float_vectors_0_[component_count - 1], color_or_alpha), + min_value, max_value); + } + if_fixed_point.makeEndIf(); + + return if_fixed_point.createMergePhi(color_or_alpha_clamped, color_or_alpha); } spv::Id SpirvShaderTranslator::FSI_ApplyColorBlendFactor( @@ -3197,21 +2962,14 @@ spv::Id SpirvShaderTranslator::FSI_ApplyColorBlendFactor( // infinity and NaN are not potentially involved in the multiplication. 
// Calculate the condition before the selection merge, which must be the // penultimate instruction in the block. - spv::Id factor_not_zero = builder_->createBinOp( - spv::OpINotEqual, type_bool_, factor, - builder_->makeUintConstant(uint32_t(xenos::BlendFactor::kZero))); - spv::Block& block_not_zero_head = *builder_->getBuildPoint(); - spv::Block& block_not_zero_if = builder_->makeNewBlock(); - spv::Block& block_not_zero_merge = builder_->makeNewBlock(); - builder_->createSelectionMerge(&block_not_zero_merge, - spv::SelectionControlDontFlattenMask); - builder_->createConditionalBranch(factor_not_zero, &block_not_zero_if, - &block_not_zero_merge); + SpirvBuilder::IfBuilder factor_not_zero_if( + builder_->createBinOp( + spv::OpINotEqual, type_bool_, factor, + builder_->makeUintConstant(uint32_t(xenos::BlendFactor::kZero))), + spv::SelectionControlDontFlattenMask, *builder_); // Non-zero factor case. - builder_->setBuildPoint(&block_not_zero_if); - spv::Block& block_factor_head = *builder_->getBuildPoint(); spv::Block& block_factor_one = builder_->makeNewBlock(); std::array color_factor_blocks; @@ -3386,18 +3144,11 @@ spv::Id SpirvShaderTranslator::FSI_ApplyColorBlendFactor( builder_->createOp(spv::OpPhi, type_float3_, id_vector_temp_); spv::Id result = FSI_FlushNaNClampAndInBlending( result_unclamped, is_fixed_point, clamp_min_value, clamp_max_value); - builder_->createBranch(&block_not_zero_merge); - // Get the latest block for a non-zero factor after all the control flow. - spv::Block& block_not_zero_if_end = *builder_->getBuildPoint(); + + factor_not_zero_if.makeEndIf(); // Make the result zero if the factor is zero. 
- builder_->setBuildPoint(&block_not_zero_merge); - id_vector_temp_.clear(); - id_vector_temp_.push_back(result); - id_vector_temp_.push_back(block_not_zero_if_end.getId()); - id_vector_temp_.push_back(const_float3_0_); - id_vector_temp_.push_back(block_not_zero_head.getId()); - return builder_->createOp(spv::OpPhi, type_float3_, id_vector_temp_); + return factor_not_zero_if.createMergePhi(result, const_float3_0_); } spv::Id SpirvShaderTranslator::FSI_ApplyAlphaBlendFactor( @@ -3408,21 +3159,14 @@ spv::Id SpirvShaderTranslator::FSI_ApplyAlphaBlendFactor( // infinity and NaN are not potentially involved in the multiplication. // Calculate the condition before the selection merge, which must be the // penultimate instruction in the block. - spv::Id factor_not_zero = builder_->createBinOp( - spv::OpINotEqual, type_bool_, factor, - builder_->makeUintConstant(uint32_t(xenos::BlendFactor::kZero))); - spv::Block& block_not_zero_head = *builder_->getBuildPoint(); - spv::Block& block_not_zero_if = builder_->makeNewBlock(); - spv::Block& block_not_zero_merge = builder_->makeNewBlock(); - builder_->createSelectionMerge(&block_not_zero_merge, - spv::SelectionControlDontFlattenMask); - builder_->createConditionalBranch(factor_not_zero, &block_not_zero_if, - &block_not_zero_merge); + SpirvBuilder::IfBuilder factor_not_zero_if( + builder_->createBinOp( + spv::OpINotEqual, type_bool_, factor, + builder_->makeUintConstant(uint32_t(xenos::BlendFactor::kZero))), + spv::SelectionControlDontFlattenMask, *builder_); // Non-zero factor case. 
- builder_->setBuildPoint(&block_not_zero_if); - spv::Block& block_factor_head = *builder_->getBuildPoint(); spv::Block& block_factor_one = builder_->makeNewBlock(); std::array alpha_factor_blocks; @@ -3557,18 +3301,11 @@ spv::Id SpirvShaderTranslator::FSI_ApplyAlphaBlendFactor( builder_->createOp(spv::OpPhi, type_float_, id_vector_temp_); spv::Id result = FSI_FlushNaNClampAndInBlending( result_unclamped, is_fixed_point, clamp_min_value, clamp_max_value); - builder_->createBranch(&block_not_zero_merge); - // Get the latest block for a non-zero factor after all the control flow. - spv::Block& block_not_zero_if_end = *builder_->getBuildPoint(); + + factor_not_zero_if.makeEndIf(); // Make the result zero if the factor is zero. - builder_->setBuildPoint(&block_not_zero_merge); - id_vector_temp_.clear(); - id_vector_temp_.push_back(result); - id_vector_temp_.push_back(block_not_zero_if_end.getId()); - id_vector_temp_.push_back(const_float_0_); - id_vector_temp_.push_back(block_not_zero_head.getId()); - return builder_->createOp(spv::OpPhi, type_float_, id_vector_temp_); + return factor_not_zero_if.createMergePhi(result, const_float_0_); } spv::Id SpirvShaderTranslator::FSI_BlendColorOrAlphaWithUnclampedResult( diff --git a/src/xenia/gpu/vulkan/vulkan_render_target_cache.cc b/src/xenia/gpu/vulkan/vulkan_render_target_cache.cc index bf1cda68d..8f7887b4e 100644 --- a/src/xenia/gpu/vulkan/vulkan_render_target_cache.cc +++ b/src/xenia/gpu/vulkan/vulkan_render_target_cache.cc @@ -4156,21 +4156,16 @@ VkShaderModule VulkanRenderTargetCache::GetTransferShader( builder.createAccessChain(spv::StorageClassPushConstant, push_constants, id_vector_temp), spv::NoPrecision); - spv::Id stencil_sample_passed = builder.createBinOp( - spv::OpINotEqual, type_bool, - builder.createBinOp(spv::OpBitwiseAnd, type_uint, packed, - stencil_mask_constant), - builder.makeUintConstant(0)); - spv::Block& stencil_bit_kill_block = builder.makeNewBlock(); - spv::Block& stencil_bit_merge_block = 
builder.makeNewBlock(); - builder.createSelectionMerge(&stencil_bit_merge_block, - spv::SelectionControlMaskNone); - builder.createConditionalBranch(stencil_sample_passed, - &stencil_bit_merge_block, - &stencil_bit_kill_block); - builder.setBuildPoint(&stencil_bit_kill_block); + SpirvBuilder::IfBuilder stencil_kill_if( + builder.createBinOp( + spv::OpIEqual, type_bool, + builder.createBinOp(spv::OpBitwiseAnd, type_uint, packed, + stencil_mask_constant), + builder.makeUintConstant(0)), + spv::SelectionControlMaskNone, builder); builder.createNoResultOp(spv::OpKill); - builder.setBuildPoint(&stencil_bit_merge_block); + // OpKill terminates the block. + stencil_kill_if.makeEndIf(false); } } break; } From 210ac4b2d26c4b1f238a28aaa8e2984cb0dd20c3 Mon Sep 17 00:00:00 2001 From: Triang3l Date: Sat, 18 May 2024 23:53:09 +0300 Subject: [PATCH 7/8] [GPU] Fix gamma ramp writing after RegisterFile API change (#2262) --- src/xenia/gpu/command_processor.cc | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/src/xenia/gpu/command_processor.cc b/src/xenia/gpu/command_processor.cc index cc8e80690..ce08d19bd 100644 --- a/src/xenia/gpu/command_processor.cc +++ b/src/xenia/gpu/command_processor.cc @@ -369,7 +369,7 @@ void CommandProcessor::WriteRegister(uint32_t index, uint32_t value) { case XE_GPU_REG_DC_LUT_SEQ_COLOR: { // Should be in the 256-entry table writing mode. assert_zero(regs[XE_GPU_REG_DC_LUT_RW_MODE] & 0b1); - auto& gamma_ramp_rw_index = regs.Get(); + auto gamma_ramp_rw_index = regs.Get(); // DC_LUT_SEQ_COLOR is in the red, green, blue order, but the write // enable mask is blue, green, red. 
bool write_gamma_ramp_component = @@ -395,7 +395,11 @@ void CommandProcessor::WriteRegister(uint32_t index, uint32_t value) { } if (++gamma_ramp_rw_component_ >= 3) { gamma_ramp_rw_component_ = 0; - ++gamma_ramp_rw_index.rw_index; + reg::DC_LUT_RW_INDEX new_gamma_ramp_rw_index = gamma_ramp_rw_index; + ++new_gamma_ramp_rw_index.rw_index; + WriteRegister( + XE_GPU_REG_DC_LUT_RW_INDEX, + xe::memory::Reinterpret(new_gamma_ramp_rw_index)); } if (write_gamma_ramp_component) { OnGammaRamp256EntryTableValueWritten(); @@ -405,7 +409,7 @@ void CommandProcessor::WriteRegister(uint32_t index, uint32_t value) { case XE_GPU_REG_DC_LUT_PWL_DATA: { // Should be in the PWL writing mode. assert_not_zero(regs[XE_GPU_REG_DC_LUT_RW_MODE] & 0b1); - auto& gamma_ramp_rw_index = regs.Get(); + auto gamma_ramp_rw_index = regs.Get(); // Bit 7 of the index is ignored for PWL. uint32_t gamma_ramp_rw_index_pwl = gamma_ramp_rw_index.rw_index & 0x7F; // DC_LUT_PWL_DATA is likely in the red, green, blue order because @@ -424,13 +428,17 @@ void CommandProcessor::WriteRegister(uint32_t index, uint32_t value) { } if (++gamma_ramp_rw_component_ >= 3) { gamma_ramp_rw_component_ = 0; + reg::DC_LUT_RW_INDEX new_gamma_ramp_rw_index = gamma_ramp_rw_index; // TODO(Triang3l): Should this increase beyond 7 bits for PWL? // Direct3D 9 explicitly sets rw_index to 0x80 after writing the last // PWL entry. However, the DC_LUT_RW_INDEX documentation says that for // PWL, the bit 7 is ignored. - gamma_ramp_rw_index.rw_index = + new_gamma_ramp_rw_index.rw_index = (gamma_ramp_rw_index.rw_index & ~UINT32_C(0x7F)) | ((gamma_ramp_rw_index_pwl + 1) & 0x7F); + WriteRegister( + XE_GPU_REG_DC_LUT_RW_INDEX, + xe::memory::Reinterpret(new_gamma_ramp_rw_index)); } if (write_gamma_ramp_component) { OnGammaRampPWLValueWritten(); @@ -440,7 +448,7 @@ void CommandProcessor::WriteRegister(uint32_t index, uint32_t value) { case XE_GPU_REG_DC_LUT_30_COLOR: { // Should be in the 256-entry table writing mode. 
assert_zero(regs[XE_GPU_REG_DC_LUT_RW_MODE] & 0b1); - auto& gamma_ramp_rw_index = regs.Get(); + auto gamma_ramp_rw_index = regs.Get(); uint32_t gamma_ramp_write_enable_mask = regs[XE_GPU_REG_DC_LUT_WRITE_EN_MASK] & 0b111; if (gamma_ramp_write_enable_mask) { @@ -457,11 +465,16 @@ void CommandProcessor::WriteRegister(uint32_t index, uint32_t value) { gamma_ramp_entry.color_10_red = gamma_ramp_value.color_10_red; } } - ++gamma_ramp_rw_index.rw_index; // TODO(Triang3l): Should this reset the component write index? If this // increase is assumed to behave like a full DC_LUT_RW_INDEX write, it - // probably should. + // probably should. Currently this also calls WriteRegister for + // DC_LUT_RW_INDEX, which resets gamma_ramp_rw_component_ as well. gamma_ramp_rw_component_ = 0; + reg::DC_LUT_RW_INDEX new_gamma_ramp_rw_index = gamma_ramp_rw_index; + ++new_gamma_ramp_rw_index.rw_index; + WriteRegister( + XE_GPU_REG_DC_LUT_RW_INDEX, + xe::memory::Reinterpret(new_gamma_ramp_rw_index)); if (gamma_ramp_write_enable_mask) { OnGammaRamp256EntryTableValueWritten(); } From 3d30b2eec3ab1f83140b09745bee881fb5d5dde2 Mon Sep 17 00:00:00 2001 From: Triang3l Date: Sat, 25 May 2024 16:00:21 +0300 Subject: [PATCH 8/8] [Vulkan] Shader memory export (#145) --- src/xenia/gpu/spirv_builder.cc | 90 ++ src/xenia/gpu/spirv_builder.h | 47 + src/xenia/gpu/spirv_shader_translator.cc | 351 +++++-- src/xenia/gpu/spirv_shader_translator.h | 73 +- src/xenia/gpu/spirv_shader_translator_alu.cc | 51 +- .../gpu/spirv_shader_translator_memexport.cc | 950 ++++++++++++++++++ .../gpu/vulkan/vulkan_command_processor.cc | 64 +- .../gpu/vulkan/vulkan_command_processor.h | 3 + 8 files changed, 1535 insertions(+), 94 deletions(-) create mode 100644 src/xenia/gpu/spirv_shader_translator_memexport.cc diff --git a/src/xenia/gpu/spirv_builder.cc b/src/xenia/gpu/spirv_builder.cc index 2ed78bd65..fc2e92850 100644 --- a/src/xenia/gpu/spirv_builder.cc +++ b/src/xenia/gpu/spirv_builder.cc @@ -203,5 +203,95 @@ spv::Id 
SpirvBuilder::IfBuilder::createMergePhi(spv::Id then_variable, getElsePhiParent()); } +SpirvBuilder::SwitchBuilder::SwitchBuilder(spv::Id selector, + unsigned int selection_control, + SpirvBuilder& builder) + : builder_(builder), + selector_(selector), + selection_control_(selection_control), + function_(builder.getBuildPoint()->getParent()), + header_block_(builder.getBuildPoint()), + default_phi_parent_(builder.getBuildPoint()->getId()) { + merge_block_ = new spv::Block(builder_.getUniqueId(), function_); +} + +void SpirvBuilder::SwitchBuilder::makeBeginDefault() { + assert_null(default_block_); + + endSegment(); + + default_block_ = new spv::Block(builder_.getUniqueId(), function_); + function_.addBlock(default_block_); + default_block_->addPredecessor(header_block_); + builder_.setBuildPoint(default_block_); + + current_branch_ = Branch::kDefault; +} + +void SpirvBuilder::SwitchBuilder::makeBeginCase(unsigned int literal) { + endSegment(); + + auto case_block = new spv::Block(builder_.getUniqueId(), function_); + function_.addBlock(case_block); + cases_.emplace_back(literal, case_block->getId()); + case_block->addPredecessor(header_block_); + builder_.setBuildPoint(case_block); + + current_branch_ = Branch::kCase; +} + +void SpirvBuilder::SwitchBuilder::addCurrentCaseLiteral(unsigned int literal) { + assert_true(current_branch_ == Branch::kCase); + + cases_.emplace_back(literal, cases_.back().second); +} + +void SpirvBuilder::SwitchBuilder::makeEndSwitch() { + endSegment(); + + builder_.setBuildPoint(header_block_); + + builder_.createSelectionMerge(merge_block_, selection_control_); + + std::unique_ptr switch_instruction = + std::make_unique(spv::OpSwitch); + switch_instruction->addIdOperand(selector_); + if (default_block_) { + switch_instruction->addIdOperand(default_block_->getId()); + } else { + switch_instruction->addIdOperand(merge_block_->getId()); + merge_block_->addPredecessor(header_block_); + } + for (const std::pair& case_pair : cases_) { + 
switch_instruction->addImmediateOperand(case_pair.first); + switch_instruction->addIdOperand(case_pair.second); + } + builder_.getBuildPoint()->addInstruction(std::move(switch_instruction)); + + function_.addBlock(merge_block_); + builder_.setBuildPoint(merge_block_); + + current_branch_ = Branch::kMerge; +} + +void SpirvBuilder::SwitchBuilder::endSegment() { + assert_true(current_branch_ == Branch::kSelection || + current_branch_ == Branch::kDefault || + current_branch_ == Branch::kCase); + + if (current_branch_ == Branch::kSelection) { + return; + } + + if (!builder_.getBuildPoint()->isTerminated()) { + builder_.createBranch(merge_block_); + if (current_branch_ == Branch::kDefault) { + default_phi_parent_ = builder_.getBuildPoint()->getId(); + } + } + + current_branch_ = Branch::kSelection; +} + } // namespace gpu } // namespace xe diff --git a/src/xenia/gpu/spirv_builder.h b/src/xenia/gpu/spirv_builder.h index 1bb2e6851..7422d7c63 100644 --- a/src/xenia/gpu/spirv_builder.h +++ b/src/xenia/gpu/spirv_builder.h @@ -10,7 +10,10 @@ #ifndef XENIA_GPU_SPIRV_BUILDER_H_ #define XENIA_GPU_SPIRV_BUILDER_H_ +#include #include +#include +#include #include "third_party/glslang/SPIRV/SpvBuilder.h" #include "xenia/base/assert.h" @@ -99,6 +102,50 @@ class SpirvBuilder : public spv::Builder { Branch currentBranch = Branch::kThen; #endif }; + + // Simpler and more flexible (such as multiple cases pointing to the same + // block) compared to makeSwitch. + class SwitchBuilder { + public: + SwitchBuilder(spv::Id selector, unsigned int selection_control, + SpirvBuilder& builder); + ~SwitchBuilder() { assert_true(current_branch_ == Branch::kMerge); } + + void makeBeginDefault(); + void makeBeginCase(unsigned int literal); + void addCurrentCaseLiteral(unsigned int literal); + void makeEndSwitch(); + + // If there's no default block that branches to the merge block, the phi + // parent is the header block - this simplifies case-only usage. 
+ spv::Id getDefaultPhiParent() const { return default_phi_parent_; } + + private: + enum class Branch { + kSelection, + kDefault, + kCase, + kMerge, + }; + + void endSegment(); + + SpirvBuilder& builder_; + spv::Id selector_; + unsigned int selection_control_; + + spv::Function& function_; + + spv::Block* header_block_; + spv::Block* merge_block_; + spv::Block* default_block_ = nullptr; + + std::vector> cases_; + + spv::Id default_phi_parent_; + + Branch current_branch_ = Branch::kSelection; + }; }; } // namespace gpu diff --git a/src/xenia/gpu/spirv_shader_translator.cc b/src/xenia/gpu/spirv_shader_translator.cc index e34193219..399b7079f 100644 --- a/src/xenia/gpu/spirv_shader_translator.cc +++ b/src/xenia/gpu/spirv_shader_translator.cc @@ -30,30 +30,35 @@ namespace gpu { SpirvShaderTranslator::Features::Features(bool all) : spirv_version(all ? spv::Spv_1_5 : spv::Spv_1_0), max_storage_buffer_range(all ? UINT32_MAX : (128 * 1024 * 1024)), + full_draw_index_uint32(all), + vertex_pipeline_stores_and_atomics(all), + fragment_stores_and_atomics(all), clip_distance(all), cull_distance(all), - demote_to_helper_invocation(all), - fragment_shader_sample_interlock(all), - full_draw_index_uint32(all), image_view_format_swizzle(all), signed_zero_inf_nan_preserve_float32(all), denorm_flush_to_zero_float32(all), - rounding_mode_rte_float32(all) {} + rounding_mode_rte_float32(all), + fragment_shader_sample_interlock(all), + demote_to_helper_invocation(all) {} SpirvShaderTranslator::Features::Features( const ui::vulkan::VulkanProvider::DeviceInfo& device_info) : max_storage_buffer_range(device_info.maxStorageBufferRange), + full_draw_index_uint32(device_info.fullDrawIndexUint32), + vertex_pipeline_stores_and_atomics( + device_info.vertexPipelineStoresAndAtomics), + fragment_stores_and_atomics(device_info.fragmentStoresAndAtomics), clip_distance(device_info.shaderClipDistance), cull_distance(device_info.shaderCullDistance), - 
demote_to_helper_invocation(device_info.shaderDemoteToHelperInvocation), - fragment_shader_sample_interlock( - device_info.fragmentShaderSampleInterlock), - full_draw_index_uint32(device_info.fullDrawIndexUint32), image_view_format_swizzle(device_info.imageViewFormatSwizzle), signed_zero_inf_nan_preserve_float32( device_info.shaderSignedZeroInfNanPreserveFloat32), denorm_flush_to_zero_float32(device_info.shaderDenormFlushToZeroFloat32), - rounding_mode_rte_float32(device_info.shaderRoundingModeRTEFloat32) { + rounding_mode_rte_float32(device_info.shaderRoundingModeRTEFloat32), + fragment_shader_sample_interlock( + device_info.fragmentShaderSampleInterlock), + demote_to_helper_invocation(device_info.shaderDemoteToHelperInvocation) { if (device_info.apiVersion >= VK_MAKE_API_VERSION(0, 1, 2, 0)) { spirv_version = spv::Spv_1_5; } else if (device_info.ext_1_2_VK_KHR_spirv_1_4) { @@ -117,6 +122,14 @@ void SpirvShaderTranslator::Reset() { main_interface_.clear(); var_main_registers_ = spv::NoResult; + var_main_memexport_address_ = spv::NoResult; + for (size_t memexport_eM_index = 0; + memexport_eM_index < xe::countof(var_main_memexport_data_); + ++memexport_eM_index) { + var_main_memexport_data_[memexport_eM_index] = spv::NoResult; + } + var_main_memexport_data_written_ = spv::NoResult; + main_memexport_allowed_ = spv::NoResult; var_main_point_size_edge_flag_kill_vertex_ = spv::NoResult; var_main_kill_pixel_ = spv::NoResult; var_main_fsi_color_written_ = spv::NoResult; @@ -310,6 +323,8 @@ void SpirvShaderTranslator::StartTranslation() { main_interface_.push_back(uniform_system_constants_); } + bool memexport_used = IsMemoryExportUsed(); + if (!is_depth_only_fragment_shader_) { // Common uniform buffer - float constants. 
uint32_t float_constant_count = @@ -420,9 +435,10 @@ void SpirvShaderTranslator::StartTranslation() { builder_->addMemberName(type_shared_memory, 0, "shared_memory"); builder_->addMemberDecoration(type_shared_memory, 0, spv::DecorationRestrict); - // TODO(Triang3l): Make writable when memexport is implemented. - builder_->addMemberDecoration(type_shared_memory, 0, - spv::DecorationNonWritable); + if (!memexport_used) { + builder_->addMemberDecoration(type_shared_memory, 0, + spv::DecorationNonWritable); + } builder_->addMemberDecoration(type_shared_memory, 0, spv::DecorationOffset, 0); builder_->addDecoration(type_shared_memory, @@ -509,6 +525,24 @@ void SpirvShaderTranslator::StartTranslation() { builder_->createVariable(spv::NoPrecision, spv::StorageClassFunction, type_register_array, "xe_var_registers"); } + if (memexport_used) { + var_main_memexport_address_ = builder_->createVariable( + spv::NoPrecision, spv::StorageClassFunction, type_float4_, + "xe_var_memexport_address", const_float4_0_); + uint8_t memexport_eM_remaining = current_shader().memexport_eM_written(); + uint32_t memexport_eM_index; + while ( + xe::bit_scan_forward(memexport_eM_remaining, &memexport_eM_index)) { + memexport_eM_remaining &= ~(uint8_t(1) << memexport_eM_index); + var_main_memexport_data_[memexport_eM_index] = builder_->createVariable( + spv::NoPrecision, spv::StorageClassFunction, type_float4_, + fmt::format("xe_var_memexport_data_{}", memexport_eM_index).c_str(), + const_float4_0_); + } + var_main_memexport_data_written_ = builder_->createVariable( + spv::NoPrecision, spv::StorageClassFunction, type_uint_, + "xe_var_memexport_data_written", const_uint_0_); + } } // Write the execution model-specific prologue with access to variables in the @@ -647,6 +681,10 @@ std::vector SpirvShaderTranslator::CompleteTranslation() { builder_->setBuildPoint(main_loop_merge_); } + // Write data for the last memexport. 
+ ExportToMemory( + current_shader().memexport_eM_potentially_written_before_end()); + if (is_vertex_shader()) { CompleteVertexOrTessEvalShaderInMain(); } else if (is_pixel_shader()) { @@ -1077,6 +1115,34 @@ void SpirvShaderTranslator::ProcessJumpInstruction( builder_->createBranch(main_loop_continue_); } +void SpirvShaderTranslator::ProcessAllocInstruction( + const ParsedAllocInstruction& instr, uint8_t export_eM) { + bool start_memexport = instr.type == ucode::AllocType::kMemory && + current_shader().memexport_eM_written(); + if (export_eM || start_memexport) { + CloseExecConditionals(); + } + + if (export_eM) { + ExportToMemory(export_eM); + // Reset which eM# elements have been written. + builder_->createStore(const_uint_0_, var_main_memexport_data_written_); + // Break dependencies from the previous memexport. + uint8_t export_eM_remaining = export_eM; + uint32_t eM_index; + while (xe::bit_scan_forward(export_eM_remaining, &eM_index)) { + export_eM_remaining &= ~(uint8_t(1) << eM_index); + builder_->createStore(const_float4_0_, + var_main_memexport_data_[eM_index]); + } + } + + if (start_memexport) { + // Initialize eA to an invalid address. + builder_->createStore(const_float4_0_, var_main_memexport_address_); + } +} + spv::Id SpirvShaderTranslator::SpirvSmearScalarResultOrConstant( spv::Id scalar, spv::Id vector_type) { bool is_constant = builder_->isConstant(scalar); @@ -1205,6 +1271,8 @@ void SpirvShaderTranslator::StartVertexOrTessEvalShaderBeforeMain() { } void SpirvShaderTranslator::StartVertexOrTessEvalShaderInMain() { + Modification shader_modification = GetSpirvShaderModification(); + // The edge flag isn't used for any purpose by the translator. 
if (current_shader().writes_point_size_edge_flag_kill_vertex() & 0b101) { id_vector_temp_.clear(); @@ -1244,11 +1312,40 @@ void SpirvShaderTranslator::StartVertexOrTessEvalShaderInMain() { } } - Modification shader_modification = GetSpirvShaderModification(); - // TODO(Triang3l): For HostVertexShaderType::kRectangeListAsTriangleStrip, // start the vertex loop, and load the index there. + // Check if memory export should be allowed for this host vertex of the guest + // primitive to make sure export is done only once for each guest vertex. + if (IsMemoryExportUsed()) { + spv::Id memexport_allowed_for_host_vertex_of_guest_primitive = + spv::NoResult; + if (shader_modification.vertex.host_vertex_shader_type == + Shader::HostVertexShaderType::kPointListAsTriangleStrip) { + // Only for one host vertex for the point. + memexport_allowed_for_host_vertex_of_guest_primitive = + builder_->createBinOp( + spv::OpIEqual, type_bool_, + builder_->createBinOp( + spv::OpBitwiseAnd, type_uint_, + builder_->createUnaryOp( + spv::OpBitcast, type_uint_, + builder_->createLoad(input_vertex_index_, + spv::NoPrecision)), + builder_->makeUintConstant(3)), + const_uint_0_); + } + + if (memexport_allowed_for_host_vertex_of_guest_primitive != spv::NoResult) { + main_memexport_allowed_ = + main_memexport_allowed_ != spv::NoResult + ? builder_->createBinOp( + spv::OpLogicalAnd, type_bool_, main_memexport_allowed_, + memexport_allowed_for_host_vertex_of_guest_primitive) + : memexport_allowed_for_host_vertex_of_guest_primitive; + } + } + // Load the vertex index or the tessellation parameters. if (register_count()) { // TODO(Triang3l): Barycentric coordinates and patch index. 
@@ -1827,6 +1924,13 @@ void SpirvShaderTranslator::StartFragmentShaderBeforeMain() { } void SpirvShaderTranslator::StartFragmentShaderInMain() { + // TODO(Triang3l): Allow memory export with resolution scaling only for the + // center host pixel, with sample shading (for depth format conversion) only + // for the bottom-right sample (unlike in Direct3D, the sample mask input + // doesn't include covered samples of the primitive that correspond to other + // invocations, so use the sample that's the most friendly to the half-pixel + // offset). + // Set up pixel killing from within the translated shader without affecting // the control flow (unlike with OpKill), similarly to how pixel killing works // on the Xenos, and also keeping a single critical section exit and return @@ -2460,6 +2564,26 @@ void SpirvShaderTranslator::StoreResult(const InstructionResult& result, var_main_fsi_color_written_); } } break; + case InstructionStorageTarget::kExportAddress: { + // spv::NoResult if memory export usage is unsupported or invalid. + target_pointer = var_main_memexport_address_; + } break; + case InstructionStorageTarget::kExportData: { + // spv::NoResult if memory export usage is unsupported or invalid. + target_pointer = var_main_memexport_data_[result.storage_index]; + if (target_pointer != spv::NoResult) { + // Mark that the eM# has been written to and needs to be exported. + assert_true(var_main_memexport_data_written_ != spv::NoResult); + builder_->createStore( + builder_->createBinOp( + spv::OpBitwiseOr, type_uint_, + builder_->createLoad(var_main_memexport_data_written_, + spv::NoPrecision), + builder_->makeUintConstant(uint32_t(1) + << result.storage_index)), + var_main_memexport_data_written_); + } + } break; default: // TODO(Triang3l): All storage targets. 
break; @@ -2814,16 +2938,59 @@ spv::Id SpirvShaderTranslator::EndianSwap32Uint(spv::Id value, spv::Id endian) { return value; } +spv::Id SpirvShaderTranslator::EndianSwap128Uint4(spv::Id value, + spv::Id endian) { + // Change 8-in-64 and 8-in-128 to 8-in-32, and then swap within 32 bits. + + spv::Id is_8in64 = builder_->createBinOp( + spv::OpIEqual, type_bool_, endian, + builder_->makeUintConstant( + static_cast(xenos::Endian128::k8in64))); + uint_vector_temp_.clear(); + uint_vector_temp_.push_back(1); + uint_vector_temp_.push_back(0); + uint_vector_temp_.push_back(3); + uint_vector_temp_.push_back(2); + value = builder_->createTriOp( + spv::OpSelect, type_uint4_, is_8in64, + builder_->createRvalueSwizzle(spv::NoPrecision, type_uint4_, value, + uint_vector_temp_), + value); + + spv::Id is_8in128 = builder_->createBinOp( + spv::OpIEqual, type_bool_, endian, + builder_->makeUintConstant( + static_cast(xenos::Endian128::k8in128))); + uint_vector_temp_.clear(); + uint_vector_temp_.push_back(3); + uint_vector_temp_.push_back(2); + uint_vector_temp_.push_back(1); + uint_vector_temp_.push_back(0); + value = builder_->createTriOp( + spv::OpSelect, type_uint4_, is_8in128, + builder_->createRvalueSwizzle(spv::NoPrecision, type_uint4_, value, + uint_vector_temp_), + value); + + endian = builder_->createTriOp( + spv::OpSelect, type_uint_, + builder_->createBinOp(spv::OpLogicalOr, type_bool_, is_8in64, is_8in128), + builder_->makeUintConstant( + static_cast(xenos::Endian128::k8in32)), + endian); + + return EndianSwap32Uint(value, endian); +} + spv::Id SpirvShaderTranslator::LoadUint32FromSharedMemory( spv::Id address_dwords_int) { - spv::Block& head_block = *builder_->getBuildPoint(); - assert_false(head_block.isTerminated()); - spv::StorageClass storage_class = features_.spirv_version >= spv::Spv_1_3 ? 
spv::StorageClassStorageBuffer : spv::StorageClassUniform; - uint32_t buffer_count_log2 = GetSharedMemoryStorageBufferCountLog2(); - if (!buffer_count_log2) { + + uint32_t binding_count_log2 = GetSharedMemoryStorageBufferCountLog2(); + + if (!binding_count_log2) { // Single binding - load directly. id_vector_temp_.clear(); // The only SSBO struct member. @@ -2837,8 +3004,10 @@ spv::Id SpirvShaderTranslator::LoadUint32FromSharedMemory( // The memory is split into multiple bindings - check which binding to load // from. 29 is log2(512 MB), but addressing in dwords (4 B). Not indexing the - // array with the variable itself because it needs VK_EXT_descriptor_indexing. - uint32_t binding_address_bits = (29 - 2) - buffer_count_log2; + // array with the variable itself because it needs non-uniform storage buffer + // indexing. + + uint32_t binding_address_bits = (29 - 2) - binding_count_log2; spv::Id binding_index = builder_->createBinOp( spv::OpShiftRightLogical, type_uint_, builder_->createUnaryOp(spv::OpBitcast, type_uint_, address_dwords_int), @@ -2847,51 +3016,119 @@ spv::Id SpirvShaderTranslator::LoadUint32FromSharedMemory( spv::OpBitwiseAnd, type_int_, address_dwords_int, builder_->makeIntConstant( int((uint32_t(1) << binding_address_bits) - 1))); - uint32_t buffer_count = 1 << buffer_count_log2; - spv::Block* switch_case_blocks[512 / 128]; - for (uint32_t i = 0; i < buffer_count; ++i) { - switch_case_blocks[i] = &builder_->makeNewBlock(); - } - spv::Block& switch_merge_block = builder_->makeNewBlock(); - spv::Id value_phi_result = builder_->getUniqueId(); - std::unique_ptr value_phi_op = - std::make_unique(value_phi_result, type_uint_, - spv::OpPhi); - builder_->createSelectionMerge(&switch_merge_block, - spv::SelectionControlDontFlattenMask); - { - std::unique_ptr switch_op = - std::make_unique(spv::OpSwitch); - switch_op->addIdOperand(binding_index); - // Highest binding index is the default case. 
- switch_op->addIdOperand(switch_case_blocks[buffer_count - 1]->getId()); - switch_case_blocks[buffer_count - 1]->addPredecessor(&head_block); - for (uint32_t i = 0; i < buffer_count - 1; ++i) { - switch_op->addImmediateOperand(int(i)); - switch_op->addIdOperand(switch_case_blocks[i]->getId()); - switch_case_blocks[i]->addPredecessor(&head_block); - } - builder_->getBuildPoint()->addInstruction(std::move(switch_op)); - } - for (uint32_t i = 0; i < buffer_count; ++i) { - builder_->setBuildPoint(switch_case_blocks[i]); - id_vector_temp_.clear(); - id_vector_temp_.push_back(builder_->makeIntConstant(int(i))); - // The only SSBO struct member. - id_vector_temp_.push_back(const_int_0_); - id_vector_temp_.push_back(binding_address); + + auto value_phi_op = std::make_unique( + builder_->getUniqueId(), type_uint_, spv::OpPhi); + // Zero if out of bounds. + value_phi_op->addIdOperand(const_uint_0_); + value_phi_op->addIdOperand(builder_->getBuildPoint()->getId()); + + SpirvBuilder::SwitchBuilder binding_switch( + binding_index, spv::SelectionControlDontFlattenMask, *builder_); + uint32_t binding_count = uint32_t(1) << binding_count_log2; + + id_vector_temp_.clear(); + id_vector_temp_.push_back(spv::NoResult); + // The only SSBO struct member. 
+ id_vector_temp_.push_back(const_int_0_); + id_vector_temp_.push_back(binding_address); + + for (uint32_t i = 0; i < binding_count; ++i) { + binding_switch.makeBeginCase(i); + id_vector_temp_[0] = builder_->makeIntConstant(int(i)); value_phi_op->addIdOperand(builder_->createLoad( builder_->createAccessChain(storage_class, buffers_shared_memory_, id_vector_temp_), spv::NoPrecision)); - value_phi_op->addIdOperand(switch_case_blocks[i]->getId()); - builder_->createBranch(&switch_merge_block); + value_phi_op->addIdOperand(builder_->getBuildPoint()->getId()); } - builder_->setBuildPoint(&switch_merge_block); + + binding_switch.makeEndSwitch(); + + spv::Id value_phi_result = value_phi_op->getResultId(); builder_->getBuildPoint()->addInstruction(std::move(value_phi_op)); return value_phi_result; } +void SpirvShaderTranslator::StoreUint32ToSharedMemory( + spv::Id value, spv::Id address_dwords_int, spv::Id replace_mask) { + spv::StorageClass storage_class = features_.spirv_version >= spv::Spv_1_3 + ? spv::StorageClassStorageBuffer + : spv::StorageClassUniform; + + spv::Id keep_mask = spv::NoResult; + if (replace_mask != spv::NoResult) { + keep_mask = builder_->createUnaryOp(spv::OpNot, type_uint_, replace_mask); + value = builder_->createBinOp(spv::OpBitwiseAnd, type_uint_, value, + replace_mask); + } + + auto store = [&](spv::Id pointer) { + if (replace_mask != spv::NoResult) { + // Don't touch the other bits in the buffer, just modify the needed bits + // in the most up to date uint32 at the address. 
+ spv::Id const_scope_device = builder_->makeUintConstant( + static_cast(spv::ScopeDevice)); + spv::Id const_semantics_relaxed = const_uint_0_; + builder_->createQuadOp(spv::OpAtomicAnd, type_uint_, pointer, + const_scope_device, const_semantics_relaxed, + keep_mask); + builder_->createQuadOp(spv::OpAtomicOr, type_uint_, pointer, + const_scope_device, const_semantics_relaxed, + value); + } else { + builder_->createStore(value, pointer); + } + }; + + uint32_t binding_count_log2 = GetSharedMemoryStorageBufferCountLog2(); + + if (!binding_count_log2) { + // Single binding - store directly. + id_vector_temp_.clear(); + // The only SSBO struct member. + id_vector_temp_.push_back(const_int_0_); + id_vector_temp_.push_back(address_dwords_int); + store(builder_->createAccessChain(storage_class, buffers_shared_memory_, + id_vector_temp_)); + return; + } + + // The memory is split into multiple bindings - check which binding to store + // to. 29 is log2(512 MB), but addressing in dwords (4 B). Not indexing the + // array with the variable itself because it needs non-uniform storage buffer + // indexing. + + uint32_t binding_address_bits = (29 - 2) - binding_count_log2; + spv::Id binding_index = builder_->createBinOp( + spv::OpShiftRightLogical, type_uint_, + builder_->createUnaryOp(spv::OpBitcast, type_uint_, address_dwords_int), + builder_->makeUintConstant(binding_address_bits)); + spv::Id binding_address = builder_->createBinOp( + spv::OpBitwiseAnd, type_int_, address_dwords_int, + builder_->makeIntConstant( + int((uint32_t(1) << binding_address_bits) - 1))); + + SpirvBuilder::SwitchBuilder binding_switch( + binding_index, spv::SelectionControlDontFlattenMask, *builder_); + uint32_t binding_count = uint32_t(1) << binding_count_log2; + + id_vector_temp_.clear(); + id_vector_temp_.push_back(spv::NoResult); + // The only SSBO struct member. 
+ id_vector_temp_.push_back(const_int_0_); + id_vector_temp_.push_back(binding_address); + + for (uint32_t i = 0; i < binding_count; ++i) { + binding_switch.makeBeginCase(i); + id_vector_temp_[0] = builder_->makeIntConstant(int(i)); + store(builder_->createAccessChain(storage_class, buffers_shared_memory_, + id_vector_temp_)); + } + + binding_switch.makeEndSwitch(); +} + spv::Id SpirvShaderTranslator::PWLGammaToLinear(spv::Id gamma, bool gamma_pre_saturated) { spv::Id value_type = builder_->getTypeId(gamma); diff --git a/src/xenia/gpu/spirv_shader_translator.h b/src/xenia/gpu/spirv_shader_translator.h index 8c4942156..aefb00bf6 100644 --- a/src/xenia/gpu/spirv_shader_translator.h +++ b/src/xenia/gpu/spirv_shader_translator.h @@ -323,17 +323,28 @@ class SpirvShaderTranslator : public ShaderTranslator { explicit Features( const ui::vulkan::VulkanProvider::DeviceInfo& device_info); explicit Features(bool all = false); + unsigned int spirv_version; + uint32_t max_storage_buffer_range; + + bool full_draw_index_uint32; + + bool vertex_pipeline_stores_and_atomics; + bool fragment_stores_and_atomics; + bool clip_distance; bool cull_distance; - bool demote_to_helper_invocation; - bool fragment_shader_sample_interlock; - bool full_draw_index_uint32; + bool image_view_format_swizzle; + bool signed_zero_inf_nan_preserve_float32; bool denorm_flush_to_zero_float32; bool rounding_mode_rte_float32; + + bool fragment_shader_sample_interlock; + + bool demote_to_helper_invocation; }; SpirvShaderTranslator(const Features& features, @@ -424,6 +435,8 @@ class SpirvShaderTranslator : public ShaderTranslator { void ProcessLoopEndInstruction( const ParsedLoopEndInstruction& instr) override; void ProcessJumpInstruction(const ParsedJumpInstruction& instr) override; + void ProcessAllocInstruction(const ParsedAllocInstruction& instr, + uint8_t export_eM) override; void ProcessVertexFetchInstruction( const ParsedVertexFetchInstruction& instr) override; @@ -470,6 +483,11 @@ class 
SpirvShaderTranslator : public ShaderTranslator { Shader::IsHostVertexShaderTypeDomain( GetSpirvShaderModification().vertex.host_vertex_shader_type); } + bool IsSpirvComputeShader() const { + return is_vertex_shader() && + GetSpirvShaderModification().vertex.host_vertex_shader_type == + Shader::HostVertexShaderType::kMemExportCompute; + } bool IsExecutionModeEarlyFragmentTests() const { return is_pixel_shader() && @@ -567,24 +585,48 @@ class SpirvShaderTranslator : public ShaderTranslator { spv::Id ZeroIfAnyOperandIsZero(spv::Id value, spv::Id operand_0_abs, spv::Id operand_1_abs); // Conditionally discard the current fragment. Changes the build point. - void KillPixel(spv::Id condition); + void KillPixel(spv::Id condition, + uint8_t memexport_eM_potentially_written_before); // Return type is a xe::bit_count(result.GetUsedResultComponents())-component // float vector or a single float, depending on whether it's a reduction // instruction (check getTypeId of the result), or returns spv::NoResult if // nothing to store. - spv::Id ProcessVectorAluOperation(const ParsedAluInstruction& instr, - bool& predicate_written); + spv::Id ProcessVectorAluOperation( + const ParsedAluInstruction& instr, + uint8_t memexport_eM_potentially_written_before, bool& predicate_written); // Returns a float value to write to the previous scalar register and to the // destination. If the return value is ps itself (in the retain_prev case), // returns spv::NoResult (handled as a special case, so if it's retain_prev, // but don't need to write to anywhere, no OpLoad(ps) will be done). - spv::Id ProcessScalarAluOperation(const ParsedAluInstruction& instr, - bool& predicate_written); + spv::Id ProcessScalarAluOperation( + const ParsedAluInstruction& instr, + uint8_t memexport_eM_potentially_written_before, bool& predicate_written); // Perform endian swap of a uint scalar or vector. spv::Id EndianSwap32Uint(spv::Id value, spv::Id endian); + // Perform endian swap of a uint4 vector. 
+ spv::Id EndianSwap128Uint4(spv::Id value, spv::Id endian); spv::Id LoadUint32FromSharedMemory(spv::Id address_dwords_int); + // If `replace_mask` is provided, the bits specified in the mask will be + // replaced with those from the value via OpAtomicAnd/Or. + // Bits of `value` not in `replace_mask` will be ignored. + void StoreUint32ToSharedMemory(spv::Id value, spv::Id address_dwords_int, + spv::Id replace_mask = spv::NoResult); + + bool IsMemoryExportSupported() const { + if (is_pixel_shader()) { + return features_.fragment_stores_and_atomics; + } + return features_.vertex_pipeline_stores_and_atomics || + IsSpirvComputeShader(); + } + + bool IsMemoryExportUsed() const { + return current_shader().memexport_eM_written() && IsMemoryExportSupported(); + } + + void ExportToMemory(uint8_t export_eM); // The source may be a floating-point scalar or a vector. spv::Id PWLGammaToLinear(spv::Id gamma, bool gamma_pre_saturated); @@ -872,6 +914,21 @@ class SpirvShaderTranslator : public ShaderTranslator { spv::Id var_main_tfetch_gradients_v_; // float4[register_count()]. spv::Id var_main_registers_; + // Memory export variables are created only when needed. + // float4. + spv::Id var_main_memexport_address_; + // Each is float4. + spv::Id var_main_memexport_data_[ucode::kMaxMemExportElementCount]; + // Bit field of which eM# elements have been written so far by the invocation + // since the last memory write - uint. + spv::Id var_main_memexport_data_written_; + // If memory export is disabled in certain invocations or (if emulating some + // primitive types without a geometry shader) at specific guest vertex loop + // iterations because the translated shader is executed multiple times for the + // same guest vertex or pixel, this contains whether memory export is allowed + // in the current execution of the translated code. + // bool. + spv::Id main_memexport_allowed_; // VS only - float3 (special exports). 
spv::Id var_main_point_size_edge_flag_kill_vertex_; // PS, only when needed - bool. diff --git a/src/xenia/gpu/spirv_shader_translator_alu.cc b/src/xenia/gpu/spirv_shader_translator_alu.cc index ecc88f57b..1e7580e34 100644 --- a/src/xenia/gpu/spirv_shader_translator_alu.cc +++ b/src/xenia/gpu/spirv_shader_translator_alu.cc @@ -39,10 +39,14 @@ spv::Id SpirvShaderTranslator::ZeroIfAnyOperandIsZero(spv::Id value, const_float_vectors_0_[num_components - 1], value); } -void SpirvShaderTranslator::KillPixel(spv::Id condition) { +void SpirvShaderTranslator::KillPixel( + spv::Id condition, uint8_t memexport_eM_potentially_written_before) { SpirvBuilder::IfBuilder kill_if(condition, spv::SelectionControlMaskNone, *builder_); { + // Perform outstanding memory exports before the invocation becomes inactive + // and storage writes are disabled. + ExportToMemory(memexport_eM_potentially_written_before); if (var_main_kill_pixel_ != spv::NoResult) { builder_->createStore(builder_->makeBoolConstant(true), var_main_kill_pixel_); @@ -77,12 +81,12 @@ void SpirvShaderTranslator::ProcessAluInstruction( // Whether the instruction has changed the predicate, and it needs to be // checked again later. 
bool predicate_written_vector = false; - spv::Id vector_result = - ProcessVectorAluOperation(instr, predicate_written_vector); + spv::Id vector_result = ProcessVectorAluOperation( + instr, memexport_eM_potentially_written_before, predicate_written_vector); bool predicate_written_scalar = false; - spv::Id scalar_result = - ProcessScalarAluOperation(instr, predicate_written_scalar); + spv::Id scalar_result = ProcessScalarAluOperation( + instr, memexport_eM_potentially_written_before, predicate_written_scalar); if (scalar_result != spv::NoResult) { EnsureBuildPointAvailable(); builder_->createStore(scalar_result, var_main_previous_scalar_); @@ -106,7 +110,8 @@ void SpirvShaderTranslator::ProcessAluInstruction( } spv::Id SpirvShaderTranslator::ProcessVectorAluOperation( - const ParsedAluInstruction& instr, bool& predicate_written) { + const ParsedAluInstruction& instr, + uint8_t memexport_eM_potentially_written_before, bool& predicate_written) { predicate_written = false; uint32_t used_result_components = @@ -769,14 +774,16 @@ spv::Id SpirvShaderTranslator::ProcessVectorAluOperation( case ucode::AluVectorOpcode::kKillGt: case ucode::AluVectorOpcode::kKillGe: case ucode::AluVectorOpcode::kKillNe: { - KillPixel(builder_->createUnaryOp( - spv::OpAny, type_bool_, - builder_->createBinOp( - spv::Op(kOps[size_t(instr.vector_opcode)]), type_bool4_, - GetOperandComponents(operand_storage[0], instr.vector_operands[0], - 0b1111), - GetOperandComponents(operand_storage[1], instr.vector_operands[1], - 0b1111)))); + KillPixel( + builder_->createUnaryOp( + spv::OpAny, type_bool_, + builder_->createBinOp( + spv::Op(kOps[size_t(instr.vector_opcode)]), type_bool4_, + GetOperandComponents(operand_storage[0], + instr.vector_operands[0], 0b1111), + GetOperandComponents(operand_storage[1], + instr.vector_operands[1], 0b1111))), + memexport_eM_potentially_written_before); return const_float_0_; } @@ -862,7 +869,8 @@ spv::Id SpirvShaderTranslator::ProcessVectorAluOperation( } spv::Id 
SpirvShaderTranslator::ProcessScalarAluOperation( - const ParsedAluInstruction& instr, bool& predicate_written) { + const ParsedAluInstruction& instr, + uint8_t memexport_eM_potentially_written_before, bool& predicate_written) { predicate_written = false; spv::Id operand_storage[2] = {}; @@ -1257,12 +1265,13 @@ spv::Id SpirvShaderTranslator::ProcessScalarAluOperation( case ucode::AluScalarOpcode::kKillsNe: case ucode::AluScalarOpcode::kKillsOne: { KillPixel(builder_->createBinOp( - spv::Op(kOps[size_t(instr.scalar_opcode)]), type_bool_, - GetOperandComponents(operand_storage[0], instr.scalar_operands[0], - 0b0001), - instr.scalar_opcode == ucode::AluScalarOpcode::kKillsOne - ? const_float_1_ - : const_float_0_)); + spv::Op(kOps[size_t(instr.scalar_opcode)]), type_bool_, + GetOperandComponents(operand_storage[0], + instr.scalar_operands[0], 0b0001), + instr.scalar_opcode == ucode::AluScalarOpcode::kKillsOne + ? const_float_1_ + : const_float_0_), + memexport_eM_potentially_written_before); return const_float_0_; } diff --git a/src/xenia/gpu/spirv_shader_translator_memexport.cc b/src/xenia/gpu/spirv_shader_translator_memexport.cc new file mode 100644 index 000000000..94c0adf54 --- /dev/null +++ b/src/xenia/gpu/spirv_shader_translator_memexport.cc @@ -0,0 +1,950 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2024 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. 
* + ****************************************************************************** + */ + +#include "xenia/gpu/spirv_shader_translator.h" + +#include +#include +#include +#include +#include +#include + +#include "third_party/glslang/SPIRV/GLSL.std.450.h" +#include "xenia/base/assert.h" +#include "xenia/base/math.h" +#include "xenia/gpu/ucode.h" + +namespace xe { +namespace gpu { + +void SpirvShaderTranslator::ExportToMemory(uint8_t export_eM) { + if (!export_eM) { + return; + } + + assert_zero(export_eM & ~current_shader().memexport_eM_written()); + + if (!IsMemoryExportSupported()) { + return; + } + + // Check if memory export is allowed in this guest shader invocation. + std::optional if_memexport_allowed; + if (main_memexport_allowed_ != spv::NoResult) { + if_memexport_allowed.emplace(main_memexport_allowed_, + spv::SelectionControlDontFlattenMask, + *builder_); + } + + // If the pixel was killed (but the actual killing on the SPIR-V side has not + // been performed yet because the device doesn't support demotion to helper + // invocation that doesn't interfere with control flow), the current + // invocation is not considered active anymore. + std::optional if_pixel_not_killed; + if (var_main_kill_pixel_ != spv::NoResult) { + if_pixel_not_killed.emplace( + builder_->createUnaryOp( + spv::OpLogicalNot, type_bool_, + builder_->createLoad(var_main_kill_pixel_, spv::NoPrecision)), + spv::SelectionControlDontFlattenMask, *builder_); + } + + // Check if the address with the correct sign and exponent was written, and + // that the index doesn't overflow the mantissa bits. 
+ // all((eA_vector >> uvec4(30, 23, 23, 23)) == uvec4(0x1, 0x96, 0x96, 0x96)) + spv::Id eA_vector = builder_->createUnaryOp( + spv::OpBitcast, type_uint4_, + builder_->createLoad(var_main_memexport_address_, spv::NoPrecision)); + id_vector_temp_.clear(); + id_vector_temp_.push_back(builder_->makeUintConstant(30)); + id_vector_temp_.push_back(builder_->makeUintConstant(23)); + id_vector_temp_.push_back(id_vector_temp_.back()); + id_vector_temp_.push_back(id_vector_temp_.back()); + spv::Id address_validation_shift = + builder_->makeCompositeConstant(type_uint4_, id_vector_temp_); + id_vector_temp_.clear(); + id_vector_temp_.push_back(builder_->makeUintConstant(0x1)); + id_vector_temp_.push_back(builder_->makeUintConstant(0x96)); + id_vector_temp_.push_back(id_vector_temp_.back()); + id_vector_temp_.push_back(id_vector_temp_.back()); + spv::Id address_validation_value = + builder_->makeCompositeConstant(type_uint4_, id_vector_temp_); + SpirvBuilder::IfBuilder if_address_valid( + builder_->createUnaryOp( + spv::OpAll, type_bool_, + builder_->createBinOp( + spv::OpIEqual, type_bool4_, + builder_->createBinOp(spv::OpShiftRightLogical, type_uint4_, + eA_vector, address_validation_shift), + address_validation_value)), + spv::SelectionControlDontFlattenMask, *builder_, 2, 1); + + using EMIdArray = std::array; + + auto for_each_eM = [&](std::function fn) { + uint8_t eM_remaining = export_eM; + uint32_t eM_index; + while (xe::bit_scan_forward(eM_remaining, &eM_index)) { + eM_remaining &= ~(uint8_t(1) << eM_index); + fn(eM_index); + } + }; + + // Load the original eM. + EMIdArray eM_original; + for_each_eM([&](uint32_t eM_index) { + eM_original[eM_index] = builder_->createLoad( + var_main_memexport_data_[eM_index], spv::NoPrecision); + }); + + // Swap red and blue if needed. 
+ spv::Id format_info = + builder_->createCompositeExtract(eA_vector, type_uint_, 2); + spv::Id swap_red_blue = builder_->createBinOp( + spv::OpINotEqual, type_bool_, + builder_->createBinOp(spv::OpBitwiseAnd, type_uint_, format_info, + builder_->makeUintConstant(uint32_t(1) << 19)), + const_uint_0_); + EMIdArray eM_swapped; + uint_vector_temp_.clear(); + uint_vector_temp_.push_back(2); + uint_vector_temp_.push_back(1); + uint_vector_temp_.push_back(0); + uint_vector_temp_.push_back(3); + for_each_eM([&](uint32_t eM_index) { + eM_swapped[eM_index] = builder_->createTriOp( + spv::OpSelect, type_float4_, swap_red_blue, + builder_->createRvalueSwizzle(spv::NoPrecision, type_float4_, + eM_original[eM_index], uint_vector_temp_), + eM_original[eM_index]); + }); + + // Extract the numeric format. + spv::Id is_signed = builder_->createBinOp( + spv::OpINotEqual, type_bool_, + builder_->createBinOp(spv::OpBitwiseAnd, type_uint_, format_info, + builder_->makeUintConstant(uint32_t(1) << 16)), + const_uint_0_); + spv::Id is_norm = builder_->createBinOp( + spv::OpIEqual, type_bool_, + builder_->createBinOp(spv::OpBitwiseAnd, type_uint_, format_info, + builder_->makeUintConstant(uint32_t(1) << 17)), + const_uint_0_); + + // Perform format packing. 
+ + auto flush_nan = [&](const EMIdArray& eM) -> EMIdArray { + EMIdArray eM_flushed; + for_each_eM([&](uint32_t eM_index) { + spv::Id element_unflushed = eM[eM_index]; + unsigned int component_count = + builder_->getNumComponents(element_unflushed); + eM_flushed[eM_index] = builder_->createTriOp( + spv::OpSelect, type_float_vectors_[component_count - 1], + builder_->createUnaryOp(spv::OpIsNan, + type_bool_vectors_[component_count - 1], + element_unflushed), + const_float_vectors_0_[component_count - 1], element_unflushed); + }); + return eM_flushed; + }; + + auto make_float_constant_vectors = + [&](float value) -> std::array { + std::array const_vectors; + const_vectors[0] = builder_->makeFloatConstant(value); + id_vector_temp_.clear(); + id_vector_temp_.push_back(const_vectors[0]); + for (unsigned int component_count_minus_1 = 1; component_count_minus_1 < 4; + ++component_count_minus_1) { + id_vector_temp_.push_back(const_vectors[0]); + const_vectors[component_count_minus_1] = builder_->makeCompositeConstant( + type_float_vectors_[component_count_minus_1], id_vector_temp_); + } + return const_vectors; + }; + std::array const_float_vectors_minus_1 = + make_float_constant_vectors(-1.0f); + std::array const_float_vectors_minus_0_5 = + make_float_constant_vectors(-0.5f); + std::array const_float_vectors_0_5 = + make_float_constant_vectors(0.5f); + + // The widths must be without holes (R, RG, RGB, RGBA), and expecting the + // widths to add up to the size of the stored texel (8, 16 or 32 bits), as the + // unused upper bits will contain junk from the sign extension of X if the + // number is signed. + auto pack_8_16_32 = [&](std::array widths) -> EMIdArray { + unsigned int component_count; + std::array offsets{}; + for (component_count = 0; component_count < widths.size(); + ++component_count) { + if (!widths[component_count]) { + break; + } + // Only formats for which max + 0.5 can be represented exactly. 
+ assert(widths[component_count] <= 23); + if (component_count) { + offsets[component_count] = + offsets[component_count - 1] + widths[component_count - 1]; + } + } + assert_not_zero(component_count); + + // Extract the needed components. + EMIdArray eM_unflushed = eM_swapped; + if (component_count < 4) { + if (component_count == 1) { + for_each_eM([&](uint32_t eM_index) { + eM_unflushed[eM_index] = builder_->createCompositeExtract( + eM_unflushed[eM_index], type_float_, 0); + }); + } else { + uint_vector_temp_.clear(); + for (unsigned int component_index = 0; + component_index < component_count; ++component_index) { + uint_vector_temp_.push_back(component_index); + } + for_each_eM([&](uint32_t eM_index) { + eM_unflushed[eM_index] = builder_->createRvalueSwizzle( + spv::NoPrecision, type_float_vectors_[component_count - 1], + eM_unflushed[eM_index], uint_vector_temp_); + }); + } + } + + // Flush NaNs. + EMIdArray eM_flushed = flush_nan(eM_unflushed); + + // Convert to integers. + SpirvBuilder::IfBuilder if_signed( + is_signed, spv::SelectionControlDontFlattenMask, *builder_); + EMIdArray eM_signed; + { + // Signed. + SpirvBuilder::IfBuilder if_norm( + is_norm, spv::SelectionControlDontFlattenMask, *builder_); + EMIdArray eM_norm; + { + // Signed normalized. + id_vector_temp_.clear(); + for (unsigned int component_index = 0; + component_index < component_count; ++component_index) { + id_vector_temp_.push_back(builder_->makeFloatConstant( + float((uint32_t(1) << (widths[component_index] - 1)) - 1))); + } + spv::Id const_max_value = + component_count > 1 + ? 
builder_->makeCompositeConstant( + type_float_vectors_[component_count - 1], id_vector_temp_) + : id_vector_temp_.front(); + for_each_eM([&](uint32_t eM_index) { + eM_norm[eM_index] = builder_->createNoContractionBinOp( + spv::OpFMul, type_float_vectors_[component_count - 1], + builder_->createTriBuiltinCall( + type_float_vectors_[component_count - 1], + ext_inst_glsl_std_450_, GLSLstd450FClamp, + eM_flushed[eM_index], + const_float_vectors_minus_1[component_count - 1], + const_float_vectors_1_[component_count - 1]), + const_max_value); + }); + } + if_norm.makeEndIf(); + // All phi instructions must be in the beginning of the block. + for_each_eM([&](uint32_t eM_index) { + eM_signed[eM_index] = + if_norm.createMergePhi(eM_norm[eM_index], eM_flushed[eM_index]); + }); + // Convert to signed integer, adding plus/minus 0.5 before truncating + // according to the Direct3D format conversion rules. + for_each_eM([&](uint32_t eM_index) { + eM_signed[eM_index] = builder_->createUnaryOp( + spv::OpBitcast, type_uint_vectors_[component_count - 1], + builder_->createUnaryOp( + spv::OpConvertFToS, type_int_vectors_[component_count - 1], + builder_->createNoContractionBinOp( + spv::OpFAdd, type_float_vectors_[component_count - 1], + eM_signed[eM_index], + builder_->createTriOp( + spv::OpSelect, type_float_vectors_[component_count - 1], + builder_->createBinOp( + spv::OpFOrdLessThan, + type_bool_vectors_[component_count - 1], + eM_signed[eM_index], + const_float_vectors_0_[component_count - 1]), + const_float_vectors_minus_0_5[component_count - 1], + const_float_vectors_0_5[component_count - 1])))); + }); + } + if_signed.makeBeginElse(); + EMIdArray eM_unsigned; + { + SpirvBuilder::IfBuilder if_norm( + is_norm, spv::SelectionControlDontFlattenMask, *builder_); + EMIdArray eM_norm; + { + // Unsigned normalized. 
+ id_vector_temp_.clear(); + for (unsigned int component_index = 0; + component_index < component_count; ++component_index) { + id_vector_temp_.push_back(builder_->makeFloatConstant( + float((uint32_t(1) << widths[component_index]) - 1))); + } + spv::Id const_max_value = + component_count > 1 + ? builder_->makeCompositeConstant( + type_float_vectors_[component_count - 1], id_vector_temp_) + : id_vector_temp_.front(); + for_each_eM([&](uint32_t eM_index) { + eM_norm[eM_index] = builder_->createNoContractionBinOp( + spv::OpFMul, type_float_vectors_[component_count - 1], + builder_->createTriBuiltinCall( + type_float_vectors_[component_count - 1], + ext_inst_glsl_std_450_, GLSLstd450FClamp, + eM_flushed[eM_index], + const_float_vectors_0_[component_count - 1], + const_float_vectors_1_[component_count - 1]), + const_max_value); + }); + } + if_norm.makeEndIf(); + // All phi instructions must be in the beginning of the block. + for_each_eM([&](uint32_t eM_index) { + eM_unsigned[eM_index] = + if_norm.createMergePhi(eM_norm[eM_index], eM_flushed[eM_index]); + }); + // Convert to unsigned integer, adding 0.5 before truncating according to + // the Direct3D format conversion rules. + for_each_eM([&](uint32_t eM_index) { + eM_unsigned[eM_index] = builder_->createUnaryOp( + spv::OpConvertFToU, type_uint_vectors_[component_count - 1], + builder_->createNoContractionBinOp( + spv::OpFAdd, type_float_vectors_[component_count - 1], + eM_unsigned[eM_index], + const_float_vectors_0_5[component_count - 1])); + }); + } + if_signed.makeEndIf(); + EMIdArray eM_unpacked; + for_each_eM([&](uint32_t eM_index) { + eM_unpacked[eM_index] = + if_signed.createMergePhi(eM_signed[eM_index], eM_unsigned[eM_index]); + }); + + // Pack into a 32-bit value, and pad to a 4-component vector for the phi. + EMIdArray eM_packed; + for_each_eM([&](uint32_t eM_index) { + spv::Id element_unpacked = eM_unpacked[eM_index]; + eM_packed[eM_index] = component_count > 1 + ? 
builder_->createCompositeExtract( + element_unpacked, type_uint_, 0) + : element_unpacked; + for (unsigned int component_index = 1; component_index < component_count; + ++component_index) { + eM_packed[eM_index] = builder_->createQuadOp( + spv::OpBitFieldInsert, type_uint_, eM_packed[eM_index], + builder_->createCompositeExtract(element_unpacked, type_uint_, + component_index), + builder_->makeUintConstant(offsets[component_index]), + builder_->makeUintConstant(widths[component_index])); + } + id_vector_temp_.clear(); + id_vector_temp_.resize(4, const_uint_0_); + id_vector_temp_.front() = eM_packed[eM_index]; + eM_packed[eM_index] = + builder_->createCompositeConstruct(type_uint4_, id_vector_temp_); + }); + + return eM_packed; + }; + + SpirvBuilder::SwitchBuilder format_switch( + builder_->createTriOp(spv::OpBitFieldUExtract, type_uint_, format_info, + builder_->makeUintConstant(8), + builder_->makeUintConstant(6)), + spv::SelectionControlDontFlattenMask, *builder_); + + struct FormatCase { + EMIdArray eM_packed; + uint32_t element_bytes_log2; + spv::Id phi_parent; + }; + std::vector format_cases; + // Must be called at the end of the switch case segment for the correct phi + // parent. + auto add_format_case = [&](const EMIdArray& eM_packed, + uint32_t element_bytes_log2) { + FormatCase& format_case = format_cases.emplace_back(); + format_case.eM_packed = eM_packed; + format_case.element_bytes_log2 = element_bytes_log2; + format_case.phi_parent = builder_->getBuildPoint()->getId(); + }; + + // k_8, k_8_A, k_8_B + format_switch.makeBeginCase( + static_cast(xenos::ColorFormat::k_8)); + // TODO(Triang3l): Investigate how input should be treated for k_8_A, k_8_B. 
+ format_switch.addCurrentCaseLiteral( + static_cast(xenos::ColorFormat::k_8_A)); + format_switch.addCurrentCaseLiteral( + static_cast(xenos::ColorFormat::k_8_B)); + add_format_case(pack_8_16_32({8}), 0); + + // k_1_5_5_5 + format_switch.makeBeginCase( + static_cast(xenos::ColorFormat::k_1_5_5_5)); + add_format_case(pack_8_16_32({5, 5, 5, 1}), 1); + + // k_5_6_5 + format_switch.makeBeginCase( + static_cast(xenos::ColorFormat::k_5_6_5)); + add_format_case(pack_8_16_32({5, 6, 5}), 1); + + // k_6_5_5 + format_switch.makeBeginCase( + static_cast(xenos::ColorFormat::k_6_5_5)); + add_format_case(pack_8_16_32({5, 5, 6}), 1); + + // k_8_8_8_8, k_8_8_8_8_A, k_8_8_8_8_AS_16_16_16_16 + format_switch.makeBeginCase( + static_cast(xenos::ColorFormat::k_8_8_8_8)); + // TODO(Triang3l): Investigate how input should be treated for k_8_8_8_8_A. + format_switch.addCurrentCaseLiteral( + static_cast(xenos::ColorFormat::k_8_8_8_8_A)); + format_switch.addCurrentCaseLiteral( + static_cast(xenos::ColorFormat::k_8_8_8_8_AS_16_16_16_16)); + add_format_case(pack_8_16_32({8, 8, 8, 8}), 2); + + // k_2_10_10_10, k_2_10_10_10_AS_16_16_16_16 + format_switch.makeBeginCase( + static_cast(xenos::ColorFormat::k_2_10_10_10)); + format_switch.addCurrentCaseLiteral(static_cast( + xenos::ColorFormat::k_2_10_10_10_AS_16_16_16_16)); + add_format_case(pack_8_16_32({10, 10, 10, 2}), 2); + + // k_8_8 + format_switch.makeBeginCase( + static_cast(xenos::ColorFormat::k_8_8)); + add_format_case(pack_8_16_32({8, 8}), 1); + + // k_4_4_4_4 + format_switch.makeBeginCase( + static_cast(xenos::ColorFormat::k_4_4_4_4)); + add_format_case(pack_8_16_32({4, 4, 4, 4}), 1); + + // k_10_11_11, k_10_11_11_AS_16_16_16_16 + format_switch.makeBeginCase( + static_cast(xenos::ColorFormat::k_10_11_11)); + format_switch.addCurrentCaseLiteral( + static_cast(xenos::ColorFormat::k_10_11_11_AS_16_16_16_16)); + add_format_case(pack_8_16_32({11, 11, 10}), 2); + + // k_11_11_10, k_11_11_10_AS_16_16_16_16 + format_switch.makeBeginCase( + 
static_cast(xenos::ColorFormat::k_11_11_10)); + format_switch.addCurrentCaseLiteral( + static_cast(xenos::ColorFormat::k_11_11_10_AS_16_16_16_16)); + add_format_case(pack_8_16_32({10, 11, 11}), 2); + + // k_16 + format_switch.makeBeginCase( + static_cast(xenos::ColorFormat::k_16)); + add_format_case(pack_8_16_32({16}), 1); + + // k_16_16 + format_switch.makeBeginCase( + static_cast(xenos::ColorFormat::k_16_16)); + add_format_case(pack_8_16_32({16, 16}), 2); + + // k_16_16_16_16 + format_switch.makeBeginCase( + static_cast(xenos::ColorFormat::k_16_16_16_16)); + { + // Flush NaNs. + EMIdArray fixed16_flushed = flush_nan(eM_swapped); + + // Convert to integers. + SpirvBuilder::IfBuilder if_signed( + is_signed, spv::SelectionControlDontFlattenMask, *builder_); + EMIdArray fixed16_signed; + { + // Signed. + SpirvBuilder::IfBuilder if_norm( + is_norm, spv::SelectionControlDontFlattenMask, *builder_); + EMIdArray fixed16_norm; + { + // Signed normalized. + id_vector_temp_.clear(); + id_vector_temp_.resize(4, builder_->makeFloatConstant( + float((uint32_t(1) << (16 - 1)) - 1))); + spv::Id const_snorm16_max_value = + builder_->makeCompositeConstant(type_float4_, id_vector_temp_); + for_each_eM([&](uint32_t eM_index) { + fixed16_norm[eM_index] = builder_->createNoContractionBinOp( + spv::OpFMul, type_float4_, + builder_->createTriBuiltinCall( + type_float4_, ext_inst_glsl_std_450_, GLSLstd450FClamp, + fixed16_flushed[eM_index], const_float_vectors_minus_1[3], + const_float4_1_), + const_snorm16_max_value); + }); + } + if_norm.makeEndIf(); + // All phi instructions must be in the beginning of the block. + for_each_eM([&](uint32_t eM_index) { + fixed16_signed[eM_index] = if_norm.createMergePhi( + fixed16_norm[eM_index], fixed16_flushed[eM_index]); + }); + // Convert to signed integer, adding plus/minus 0.5 before truncating + // according to the Direct3D format conversion rules. 
+ for_each_eM([&](uint32_t eM_index) { + fixed16_signed[eM_index] = builder_->createUnaryOp( + spv::OpBitcast, type_uint4_, + builder_->createUnaryOp( + spv::OpConvertFToS, type_int4_, + builder_->createNoContractionBinOp( + spv::OpFAdd, type_float4_, fixed16_signed[eM_index], + builder_->createTriOp( + spv::OpSelect, type_float4_, + builder_->createBinOp(spv::OpFOrdLessThan, type_bool4_, + fixed16_signed[eM_index], + const_float4_0_), + const_float_vectors_minus_0_5[3], + const_float_vectors_0_5[3])))); + }); + } + if_signed.makeBeginElse(); + EMIdArray fixed16_unsigned; + { + // Unsigned. + SpirvBuilder::IfBuilder if_norm( + is_norm, spv::SelectionControlDontFlattenMask, *builder_); + EMIdArray fixed16_norm; + { + // Unsigned normalized. + id_vector_temp_.clear(); + id_vector_temp_.resize( + 4, builder_->makeFloatConstant(float((uint32_t(1) << 16) - 1))); + spv::Id const_unorm16_max_value = + builder_->makeCompositeConstant(type_float4_, id_vector_temp_); + for_each_eM([&](uint32_t eM_index) { + fixed16_norm[eM_index] = builder_->createNoContractionBinOp( + spv::OpFMul, type_float4_, + builder_->createTriBuiltinCall( + type_float4_, ext_inst_glsl_std_450_, GLSLstd450FClamp, + fixed16_flushed[eM_index], const_float4_0_, const_float4_1_), + const_unorm16_max_value); + }); + } + if_norm.makeEndIf(); + // All phi instructions must be in the beginning of the block. + for_each_eM([&](uint32_t eM_index) { + fixed16_unsigned[eM_index] = if_norm.createMergePhi( + fixed16_norm[eM_index], fixed16_flushed[eM_index]); + }); + // Convert to unsigned integer, adding 0.5 before truncating according to + // the Direct3D format conversion rules. 
+ for_each_eM([&](uint32_t eM_index) { + fixed16_unsigned[eM_index] = builder_->createUnaryOp( + spv::OpConvertFToU, type_uint4_, + builder_->createNoContractionBinOp(spv::OpFAdd, type_float4_, + fixed16_unsigned[eM_index], + const_float_vectors_0_5[3])); + }); + } + if_signed.makeEndIf(); + EMIdArray fixed16_unpacked; + for_each_eM([&](uint32_t eM_index) { + fixed16_unpacked[eM_index] = if_signed.createMergePhi( + fixed16_signed[eM_index], fixed16_unsigned[eM_index]); + }); + + // Pack into two 32-bit values, and pad to a 4-component vector for the phi. + EMIdArray fixed16_packed; + spv::Id const_uint_16 = builder_->makeUintConstant(16); + for_each_eM([&](uint32_t eM_index) { + spv::Id fixed16_element_unpacked = fixed16_unpacked[eM_index]; + id_vector_temp_.clear(); + for (uint32_t component_index = 0; component_index < 2; + ++component_index) { + id_vector_temp_.push_back(builder_->createQuadOp( + spv::OpBitFieldInsert, type_uint_, + builder_->createCompositeExtract(fixed16_element_unpacked, + type_uint_, 2 * component_index), + builder_->createCompositeExtract( + fixed16_element_unpacked, type_uint_, 2 * component_index + 1), + const_uint_16, const_uint_16)); + } + for (uint32_t component_index = 2; component_index < 4; + ++component_index) { + id_vector_temp_.push_back(const_uint_0_); + } + fixed16_packed[eM_index] = + builder_->createCompositeConstruct(type_uint4_, id_vector_temp_); + }); + + add_format_case(fixed16_packed, 3); + } + + // TODO(Triang3l): Use the extended range float16 conversion. 
+ + // k_16_FLOAT + format_switch.makeBeginCase( + static_cast(xenos::ColorFormat::k_16_FLOAT)); + { + EMIdArray format_packed_16_float; + for_each_eM([&](uint32_t eM_index) { + id_vector_temp_.clear(); + id_vector_temp_.push_back(builder_->createCompositeExtract( + eM_swapped[eM_index], type_float_, 0)); + id_vector_temp_.push_back(const_float_0_); + spv::Id format_packed_16_float_x = builder_->createUnaryBuiltinCall( + type_uint_, ext_inst_glsl_std_450_, GLSLstd450PackHalf2x16, + builder_->createCompositeConstruct(type_float2_, id_vector_temp_)); + id_vector_temp_.clear(); + id_vector_temp_.resize(4, const_uint_0_); + id_vector_temp_.front() = format_packed_16_float_x; + format_packed_16_float[eM_index] = + builder_->createCompositeConstruct(type_uint4_, id_vector_temp_); + }); + add_format_case(format_packed_16_float, 1); + } + + // k_16_16_FLOAT + format_switch.makeBeginCase( + static_cast(xenos::ColorFormat::k_16_16_FLOAT)); + { + EMIdArray format_packed_16_16_float; + for_each_eM([&](uint32_t eM_index) { + uint_vector_temp_.clear(); + uint_vector_temp_.push_back(0); + uint_vector_temp_.push_back(1); + spv::Id format_packed_16_16_float_xy = builder_->createUnaryBuiltinCall( + type_uint_, ext_inst_glsl_std_450_, GLSLstd450PackHalf2x16, + builder_->createRvalueSwizzle(spv::NoPrecision, type_float2_, + eM_swapped[eM_index], + uint_vector_temp_)); + id_vector_temp_.clear(); + id_vector_temp_.resize(4, const_uint_0_); + id_vector_temp_.front() = format_packed_16_16_float_xy; + format_packed_16_16_float[eM_index] = + builder_->createCompositeConstruct(type_uint4_, id_vector_temp_); + }); + add_format_case(format_packed_16_16_float, 2); + } + + // k_16_16_16_16_FLOAT + format_switch.makeBeginCase( + static_cast(xenos::ColorFormat::k_16_16_16_16_FLOAT)); + { + EMIdArray format_packed_16_16_16_16_float; + for_each_eM([&](uint32_t eM_index) { + spv::Id format_packed_16_16_16_16_float_xy_zw[2]; + for (uint32_t component_index = 0; component_index < 2; + 
++component_index) { + uint_vector_temp_.clear(); + uint_vector_temp_.push_back(2 * component_index); + uint_vector_temp_.push_back(2 * component_index + 1); + format_packed_16_16_16_16_float_xy_zw[component_index] = + builder_->createUnaryBuiltinCall( + type_uint_, ext_inst_glsl_std_450_, GLSLstd450PackHalf2x16, + builder_->createRvalueSwizzle(spv::NoPrecision, type_float2_, + eM_swapped[eM_index], + uint_vector_temp_)); + } + id_vector_temp_.clear(); + id_vector_temp_.push_back(format_packed_16_16_16_16_float_xy_zw[0]); + id_vector_temp_.push_back(format_packed_16_16_16_16_float_xy_zw[1]); + id_vector_temp_.push_back(const_uint_0_); + id_vector_temp_.push_back(const_uint_0_); + format_packed_16_16_16_16_float[eM_index] = + builder_->createCompositeConstruct(type_uint4_, id_vector_temp_); + }); + add_format_case(format_packed_16_16_16_16_float, 3); + } + + // k_32_FLOAT + format_switch.makeBeginCase( + static_cast(xenos::ColorFormat::k_32_FLOAT)); + { + EMIdArray format_packed_32_float; + for_each_eM([&](uint32_t eM_index) { + format_packed_32_float[eM_index] = builder_->createUnaryOp( + spv::OpBitcast, type_uint4_, eM_swapped[eM_index]); + }); + add_format_case(format_packed_32_float, 2); + } + + // k_32_32_FLOAT + format_switch.makeBeginCase( + static_cast(xenos::ColorFormat::k_32_32_FLOAT)); + { + EMIdArray format_packed_32_32_float; + for_each_eM([&](uint32_t eM_index) { + format_packed_32_32_float[eM_index] = builder_->createUnaryOp( + spv::OpBitcast, type_uint4_, eM_swapped[eM_index]); + }); + add_format_case(format_packed_32_32_float, 3); + } + + // k_32_32_32_32_FLOAT + format_switch.makeBeginCase( + static_cast(xenos::ColorFormat::k_32_32_32_32_FLOAT)); + { + EMIdArray format_packed_32_32_32_32_float; + for_each_eM([&](uint32_t eM_index) { + format_packed_32_32_32_32_float[eM_index] = builder_->createUnaryOp( + spv::OpBitcast, type_uint4_, eM_swapped[eM_index]); + }); + add_format_case(format_packed_32_32_32_32_float, 4); + } + + 
format_switch.makeEndSwitch(); + + // Select the result and the element size based on the format. + // Phi must be the first instructions in a block. + EMIdArray eM_packed; + for_each_eM([&](uint32_t eM_index) { + auto eM_packed_phi = std::make_unique( + builder_->getUniqueId(), type_uint4_, spv::OpPhi); + // Default case for an invalid format. + eM_packed_phi->addIdOperand(const_uint4_0_); + eM_packed_phi->addIdOperand(format_switch.getDefaultPhiParent()); + for (const FormatCase& format_case : format_cases) { + eM_packed_phi->addIdOperand(format_case.eM_packed[eM_index]); + eM_packed_phi->addIdOperand(format_case.phi_parent); + } + eM_packed[eM_index] = eM_packed_phi->getResultId(); + builder_->getBuildPoint()->addInstruction(std::move(eM_packed_phi)); + }); + spv::Id element_bytes_log2; + { + auto element_bytes_log2_phi = std::make_unique( + builder_->getUniqueId(), type_uint_, spv::OpPhi); + // Default case for an invalid format (doesn't enter any element size + // conditional, skipped). + element_bytes_log2_phi->addIdOperand(builder_->makeUintConstant(5)); + element_bytes_log2_phi->addIdOperand(format_switch.getDefaultPhiParent()); + for (const FormatCase& format_case : format_cases) { + element_bytes_log2_phi->addIdOperand( + builder_->makeUintConstant(format_case.element_bytes_log2)); + element_bytes_log2_phi->addIdOperand(format_case.phi_parent); + } + element_bytes_log2 = element_bytes_log2_phi->getResultId(); + builder_->getBuildPoint()->addInstruction( + std::move(element_bytes_log2_phi)); + } + + // Endian-swap. + spv::Id endian = + builder_->createTriOp(spv::OpBitFieldUExtract, type_uint_, format_info, + const_uint_0_, builder_->makeUintConstant(3)); + for_each_eM([&](uint32_t eM_index) { + eM_packed[eM_index] = EndianSwap128Uint4(eM_packed[eM_index], endian); + }); + + // Load the index of eM0 in the stream. 
+ spv::Id eM0_index = builder_->createTriOp( + spv::OpBitFieldUExtract, type_uint_, + builder_->createCompositeExtract(eA_vector, type_uint_, 1), const_uint_0_, + builder_->makeUintConstant(23)); + + // Check how many elements starting from eM0 are within the bounds of the + // stream, and from the eM# that were written, exclude the out-of-bounds ones. + // The index can't be negative, and the index and the count are limited to 23 + // bits, so it's safe to use 32-bit signed subtraction and clamping to get the + // remaining eM# count. + spv::Id eM_indices_to_store = builder_->createTriOp( + spv::OpBitFieldUExtract, type_uint_, + builder_->createLoad(var_main_memexport_data_written_, spv::NoPrecision), + const_uint_0_, + builder_->createUnaryOp( + spv::OpBitcast, type_uint_, + builder_->createTriBuiltinCall( + type_int_, ext_inst_glsl_std_450_, GLSLstd450SClamp, + builder_->createBinOp( + spv::OpISub, type_int_, + builder_->createUnaryOp( + spv::OpBitcast, type_int_, + builder_->createTriOp(spv::OpBitFieldUExtract, type_uint_, + builder_->createCompositeExtract( + eA_vector, type_uint_, 3), + const_uint_0_, + builder_->makeUintConstant(23))), + builder_->createUnaryOp(spv::OpBitcast, type_int_, + eM0_index)), + const_int_0_, + builder_->makeIntConstant(ucode::kMaxMemExportElementCount)))); + + // Get the eM0 address in bytes. + // Left-shift the stream base address by 2 to both convert it from dwords to + // bytes and drop the upper bits. + spv::Id const_uint_2 = builder_->makeUintConstant(2); + spv::Id eM0_address_bytes = builder_->createBinOp( + spv::OpIAdd, type_uint_, + builder_->createBinOp( + spv::OpShiftLeftLogical, type_uint_, + builder_->createCompositeExtract(eA_vector, type_uint_, 0), + const_uint_2), + builder_->createBinOp(spv::OpShiftLeftLogical, type_uint_, eM0_index, + element_bytes_log2)); + + // Store based on the element size. 
+ auto store_needed_eM = [&](std::function fn) { + for_each_eM([&](uint32_t eM_index) { + SpirvBuilder::IfBuilder if_eM_needed( + builder_->createBinOp( + spv::OpINotEqual, type_bool_, + builder_->createBinOp(spv::OpBitwiseAnd, type_uint_, + eM_indices_to_store, + builder_->makeUintConstant(1u << eM_index)), + const_uint_0_), + spv::SelectionControlDontFlattenMask, *builder_, 2, 1); + fn(eM_index); + if_eM_needed.makeEndIf(); + }); + }; + SpirvBuilder::SwitchBuilder element_size_switch( + element_bytes_log2, spv::SelectionControlDontFlattenMask, *builder_); + element_size_switch.makeBeginCase(0); + { + store_needed_eM([&](uint32_t eM_index) { + spv::Id element_address_bytes = + eM_index != 0 ? builder_->createBinOp( + spv::OpIAdd, type_uint_, eM0_address_bytes, + builder_->makeUintConstant(eM_index)) + : eM0_address_bytes; + // replace_shift = 8 * (element_address_bytes & 3) + spv::Id replace_shift = builder_->createQuadOp( + spv::OpBitFieldInsert, type_uint_, const_uint_0_, + element_address_bytes, builder_->makeUintConstant(3), const_uint_2); + StoreUint32ToSharedMemory( + builder_->createBinOp(spv::OpShiftLeftLogical, type_uint_, + builder_->createCompositeExtract( + eM_packed[eM_index], type_uint_, 0), + replace_shift), + builder_->createUnaryOp( + spv::OpBitcast, type_int_, + builder_->createBinOp(spv::OpShiftRightLogical, type_uint_, + element_address_bytes, const_uint_2)), + builder_->createBinOp(spv::OpShiftLeftLogical, type_uint_, + builder_->makeUintConstant(0xFFu), + replace_shift)); + }); + } + element_size_switch.makeBeginCase(1); + { + spv::Id const_uint_1 = builder_->makeUintConstant(1); + spv::Id eM0_address_words = builder_->createBinOp( + spv::OpShiftRightLogical, type_uint_, eM0_address_bytes, const_uint_1); + store_needed_eM([&](uint32_t eM_index) { + spv::Id element_address_words = + eM_index != 0 ? 
builder_->createBinOp( + spv::OpIAdd, type_uint_, eM0_address_words, + builder_->makeUintConstant(eM_index)) + : eM0_address_words; + // replace_shift = 16 * (element_address_words & 1) + spv::Id replace_shift = builder_->createQuadOp( + spv::OpBitFieldInsert, type_uint_, const_uint_0_, + element_address_words, builder_->makeUintConstant(4), const_uint_1); + StoreUint32ToSharedMemory( + builder_->createBinOp(spv::OpShiftLeftLogical, type_uint_, + builder_->createCompositeExtract( + eM_packed[eM_index], type_uint_, 0), + replace_shift), + builder_->createUnaryOp( + spv::OpBitcast, type_int_, + builder_->createBinOp(spv::OpShiftRightLogical, type_uint_, + element_address_words, const_uint_1)), + builder_->createBinOp(spv::OpShiftLeftLogical, type_uint_, + builder_->makeUintConstant(0xFFFFu), + replace_shift)); + }); + } + element_size_switch.makeBeginCase(2); + { + spv::Id eM0_address_dwords = builder_->createBinOp( + spv::OpShiftRightLogical, type_uint_, eM0_address_bytes, const_uint_2); + store_needed_eM([&](uint32_t eM_index) { + StoreUint32ToSharedMemory( + builder_->createCompositeExtract(eM_packed[eM_index], type_uint_, 0), + builder_->createUnaryOp( + spv::OpBitcast, type_int_, + eM_index != 0 ? builder_->createBinOp( + spv::OpIAdd, type_uint_, eM0_address_dwords, + builder_->makeUintConstant(eM_index)) + : eM0_address_dwords)); + }); + } + element_size_switch.makeBeginCase(3); + { + spv::Id eM0_address_dwords = builder_->createBinOp( + spv::OpShiftRightLogical, type_uint_, eM0_address_bytes, const_uint_2); + store_needed_eM([&](uint32_t eM_index) { + spv::Id element_value = eM_packed[eM_index]; + spv::Id element_address_dwords_int = builder_->createUnaryOp( + spv::OpBitcast, type_int_, + eM_index != 0 ? 
builder_->createBinOp( + spv::OpIAdd, type_uint_, eM0_address_dwords, + builder_->makeUintConstant(2 * eM_index)) + : eM0_address_dwords); + StoreUint32ToSharedMemory( + builder_->createCompositeExtract(element_value, type_uint_, 0), + element_address_dwords_int); + StoreUint32ToSharedMemory( + builder_->createCompositeExtract(element_value, type_uint_, 1), + builder_->createBinOp(spv::OpIAdd, type_int_, + element_address_dwords_int, + builder_->makeIntConstant(1))); + }); + } + element_size_switch.makeBeginCase(4); + { + spv::Id eM0_address_dwords = builder_->createBinOp( + spv::OpShiftRightLogical, type_uint_, eM0_address_bytes, const_uint_2); + store_needed_eM([&](uint32_t eM_index) { + spv::Id element_value = eM_packed[eM_index]; + spv::Id element_address_dwords_int = builder_->createUnaryOp( + spv::OpBitcast, type_int_, + eM_index != 0 ? builder_->createBinOp( + spv::OpIAdd, type_uint_, eM0_address_dwords, + builder_->makeUintConstant(4 * eM_index)) + : eM0_address_dwords); + StoreUint32ToSharedMemory( + builder_->createCompositeExtract(element_value, type_uint_, 0), + element_address_dwords_int); + for (uint32_t element_dword_index = 1; element_dword_index < 4; + ++element_dword_index) { + StoreUint32ToSharedMemory( + builder_->createCompositeExtract(element_value, type_uint_, + element_dword_index), + builder_->createBinOp(spv::OpIAdd, type_int_, + element_address_dwords_int, + builder_->makeIntConstant( + static_cast(element_dword_index)))); + } + }); + } + element_size_switch.makeEndSwitch(); + + // Close the conditionals for whether memory export is allowed in this + // invocation. 
+ if_address_valid.makeEndIf(); + if (if_pixel_not_killed.has_value()) { + if_pixel_not_killed->makeEndIf(); + } + if (if_memexport_allowed.has_value()) { + if_memexport_allowed->makeEndIf(); + } +} + +} // namespace gpu +} // namespace xe diff --git a/src/xenia/gpu/vulkan/vulkan_command_processor.cc b/src/xenia/gpu/vulkan/vulkan_command_processor.cc index 806382e00..b2af47f30 100644 --- a/src/xenia/gpu/vulkan/vulkan_command_processor.cc +++ b/src/xenia/gpu/vulkan/vulkan_command_processor.cc @@ -2165,6 +2165,11 @@ bool VulkanCommandProcessor::IssueDraw(xenos::PrimitiveType prim_type, return IssueCopy(); } + const ui::vulkan::VulkanProvider::DeviceInfo& device_info = + GetVulkanProvider().device_info(); + + memexport_ranges_.clear(); + // Vertex shader analysis. auto vertex_shader = static_cast(active_vertex_shader()); if (!vertex_shader) { @@ -2172,7 +2177,14 @@ bool VulkanCommandProcessor::IssueDraw(xenos::PrimitiveType prim_type, return false; } pipeline_cache_->AnalyzeShaderUcode(*vertex_shader); - bool memexport_used_vertex = vertex_shader->memexport_eM_written() != 0; + // TODO(Triang3l): If the shader uses memory export, but + // vertexPipelineStoresAndAtomics is not supported, convert the vertex shader + // to a compute shader and dispatch it after the draw if the draw doesn't use + // tessellation. + if (vertex_shader->memexport_eM_written() != 0 && + device_info.vertexPipelineStoresAndAtomics) { + draw_util::AddMemExportRanges(regs, *vertex_shader, memexport_ranges_); + } // Pixel shader analysis. bool primitive_polygonal = draw_util::IsPrimitivePolygonal(regs); @@ -2195,12 +2207,15 @@ bool VulkanCommandProcessor::IssueDraw(xenos::PrimitiveType prim_type, } else { // Disabling pixel shader for this case is also required by the pipeline // cache. - if (!memexport_used_vertex) { + if (memexport_ranges_.empty()) { // This draw has no effect. return true; } } - // TODO(Triang3l): Memory export. 
+ if (pixel_shader && pixel_shader->memexport_eM_written() != 0 && + device_info.fragmentStoresAndAtomics) { + draw_util::AddMemExportRanges(regs, *pixel_shader, memexport_ranges_); + } uint32_t ps_param_gen_pos = UINT32_MAX; uint32_t interpolator_mask = @@ -2416,9 +2431,6 @@ bool VulkanCommandProcessor::IssueDraw(xenos::PrimitiveType prim_type, current_guest_graphics_pipeline_layout_ = pipeline_layout; } - const ui::vulkan::VulkanProvider::DeviceInfo& device_info = - GetVulkanProvider().device_info(); - bool host_render_targets_used = render_target_cache_->GetPath() == RenderTargetCache::Path::kHostRenderTargets; @@ -2520,9 +2532,39 @@ bool VulkanCommandProcessor::IssueDraw(xenos::PrimitiveType prim_type, << (vfetch_index & 63); } + // Synchronize the memory pages backing memory scatter export streams, and + // calculate the range that includes the streams for the buffer barrier. + uint32_t memexport_extent_start = UINT32_MAX, memexport_extent_end = 0; + for (const draw_util::MemExportRange& memexport_range : memexport_ranges_) { + uint32_t memexport_range_base_bytes = memexport_range.base_address_dwords + << 2; + if (!shared_memory_->RequestRange(memexport_range_base_bytes, + memexport_range.size_bytes)) { + XELOGE( + "Failed to request memexport stream at 0x{:08X} (size {}) in the " + "shared memory", + memexport_range_base_bytes, memexport_range.size_bytes); + return false; + } + memexport_extent_start = + std::min(memexport_extent_start, memexport_range_base_bytes); + memexport_extent_end = + std::max(memexport_extent_end, + memexport_range_base_bytes + memexport_range.size_bytes); + } + // Insert the shared memory barrier if needed. - // TODO(Triang3l): Memory export. - shared_memory_->Use(VulkanSharedMemory::Usage::kRead); + // TODO(Triang3l): Find some PM4 command that can be used for indication of + // when memexports should be awaited instead of inserting the barrier in Use + // every time if memory export was done in the previous draw? 
+ if (memexport_extent_start < memexport_extent_end) { + shared_memory_->Use( + VulkanSharedMemory::Usage::kGuestDrawReadWrite, + std::make_pair(memexport_extent_start, + memexport_extent_end - memexport_extent_start)); + } else { + shared_memory_->Use(VulkanSharedMemory::Usage::kRead); + } // After all commands that may dispatch, copy or insert barriers, submit the // barriers (may end the render pass), and (re)enter the render pass before @@ -2567,6 +2609,12 @@ bool VulkanCommandProcessor::IssueDraw(xenos::PrimitiveType prim_type, primitive_processing_result.host_draw_vertex_count, 1, 0, 0, 0); } + // Invalidate textures in memexported memory and watch for changes. + for (const draw_util::MemExportRange& memexport_range : memexport_ranges_) { + shared_memory_->RangeWrittenByGpu(memexport_range.base_address_dwords << 2, + memexport_range.size_bytes, false); + } + return true; } diff --git a/src/xenia/gpu/vulkan/vulkan_command_processor.h b/src/xenia/gpu/vulkan/vulkan_command_processor.h index 8e1df02ef..022fb37b2 100644 --- a/src/xenia/gpu/vulkan/vulkan_command_processor.h +++ b/src/xenia/gpu/vulkan/vulkan_command_processor.h @@ -737,6 +737,9 @@ class VulkanCommandProcessor : public CommandProcessor { // System shader constants. SpirvShaderTranslator::SystemConstants system_constants_; + + // Temporary storage for memexport stream constants used in the draw. + std::vector memexport_ranges_; }; } // namespace vulkan