From 0fd578cafd0465134214e5fd38c53cc3f888ace7 Mon Sep 17 00:00:00 2001 From: Triang3l Date: Thu, 28 Apr 2022 22:25:25 +0300 Subject: [PATCH] [GPU] Get unclipped draw height by running VS on the CPU --- .../gpu/d3d12/d3d12_command_processor.cc | 5 +- .../gpu/d3d12/d3d12_render_target_cache.cc | 4 +- .../gpu/d3d12/d3d12_render_target_cache.h | 7 +- src/xenia/gpu/draw_extent_estimator.cc | 350 +++++ src/xenia/gpu/draw_extent_estimator.h | 76 ++ src/xenia/gpu/registers.h | 25 + src/xenia/gpu/render_target_cache.cc | 65 +- src/xenia/gpu/render_target_cache.h | 13 +- src/xenia/gpu/shader.h | 7 + src/xenia/gpu/shader_interpreter.cc | 1214 +++++++++++++++++ src/xenia/gpu/shader_interpreter.h | 149 ++ src/xenia/gpu/shader_translator.cc | 4 + 12 files changed, 1866 insertions(+), 53 deletions(-) create mode 100644 src/xenia/gpu/draw_extent_estimator.cc create mode 100644 src/xenia/gpu/draw_extent_estimator.h create mode 100644 src/xenia/gpu/shader_interpreter.cc create mode 100644 src/xenia/gpu/shader_interpreter.h diff --git a/src/xenia/gpu/d3d12/d3d12_command_processor.cc b/src/xenia/gpu/d3d12/d3d12_command_processor.cc index 2f06cfe54..694529f04 100644 --- a/src/xenia/gpu/d3d12/d3d12_command_processor.cc +++ b/src/xenia/gpu/d3d12/d3d12_command_processor.cc @@ -847,7 +847,8 @@ bool D3D12CommandProcessor::SetupContext() { // Initialize the render target cache before configuring binding - need to // know if using rasterizer-ordered views for the bindless root signature. render_target_cache_ = std::make_unique( - *register_file_, *this, trace_writer_, bindless_resources_used_); + *register_file_, *memory_, trace_writer_, *this, + bindless_resources_used_); if (!render_target_cache_->Initialize()) { XELOGE("Failed to initialize the render target cache"); return false; @@ -2147,7 +2148,7 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type, : 0; if (!render_target_cache_->Update(is_rasterization_done, normalized_depth_control, - normalized_color_mask)) { + normalized_color_mask, *vertex_shader)) { return false; } diff --git a/src/xenia/gpu/d3d12/d3d12_render_target_cache.cc b/src/xenia/gpu/d3d12/d3d12_render_target_cache.cc index b13f8bda1..6ccdf76e2 100644 --- a/src/xenia/gpu/d3d12/d3d12_render_target_cache.cc +++ b/src/xenia/gpu/d3d12/d3d12_render_target_cache.cc @@ -1251,10 +1251,10 @@ void D3D12RenderTargetCache::BeginSubmission() { bool D3D12RenderTargetCache::Update( bool is_rasterization_done, reg::RB_DEPTHCONTROL normalized_depth_control, - uint32_t shader_writes_color_targets) { + uint32_t shader_writes_color_targets, const Shader& vertex_shader) { if (!RenderTargetCache::Update(is_rasterization_done, normalized_depth_control, - shader_writes_color_targets)) { + shader_writes_color_targets, vertex_shader)) { return false; } switch (GetPath()) { diff --git a/src/xenia/gpu/d3d12/d3d12_render_target_cache.h b/src/xenia/gpu/d3d12/d3d12_render_target_cache.h index 09512838d..8eecb450f 100644 --- a/src/xenia/gpu/d3d12/d3d12_render_target_cache.h +++ b/src/xenia/gpu/d3d12/d3d12_render_target_cache.h @@ -43,10 +43,10 @@ class D3D12CommandProcessor; class D3D12RenderTargetCache final : public RenderTargetCache { public: D3D12RenderTargetCache(const RegisterFile& register_file, + const Memory& memory, TraceWriter& trace_writer, D3D12CommandProcessor& command_processor, - TraceWriter& trace_writer, bool bindless_resources_used) - : RenderTargetCache(register_file), + : RenderTargetCache(register_file, memory, &trace_writer), command_processor_(command_processor), 
        trace_writer_(trace_writer),
        bindless_resources_used_(bindless_resources_used) {}
@@ -65,7 +65,8 @@ class D3D12RenderTargetCache final : public RenderTargetCache {
 
   bool Update(bool is_rasterization_done,
               reg::RB_DEPTHCONTROL normalized_depth_control,
-              uint32_t shader_writes_color_targets) override;
+              uint32_t shader_writes_color_targets,
+              const Shader& vertex_shader) override;
 
   void InvalidateCommandListRenderTargets() {
     are_current_command_list_render_targets_valid_ = false;
diff --git a/src/xenia/gpu/draw_extent_estimator.cc b/src/xenia/gpu/draw_extent_estimator.cc
new file mode 100644
index 000000000..0ab9ed7f2
--- /dev/null
+++ b/src/xenia/gpu/draw_extent_estimator.cc
@@ -0,0 +1,350 @@
+/**
+ ******************************************************************************
+ * Xenia : Xbox 360 Emulator Research Project                                 *
+ ******************************************************************************
+ * Copyright 2022 Ben Vanik. All rights reserved.                             *
+ * Released under the BSD license - see LICENSE in the root for more details. *
+ ******************************************************************************
+ */
+
+#include "xenia/gpu/draw_extent_estimator.h"
+
+#include <algorithm>
+#include <cfloat>
+#include <cstdint>
+
+#include "xenia/base/assert.h"
+#include "xenia/base/cvar.h"
+#include "xenia/base/profiling.h"
+#include "xenia/gpu/registers.h"
+#include "xenia/gpu/ucode.h"
+#include "xenia/gpu/xenos.h"
+#include "xenia/ui/graphics_util.h"
+
+DEFINE_bool(
+    execute_unclipped_draw_vs_on_cpu, true,
+    "Execute the vertex shader for draws with clipping disabled, primarily "
+    "screen-space draws (such as clears), on the CPU when possible to "
+    "estimate the extent of the EDRAM involved in the draw.\n"
+    "Enabling this may significantly improve GPU performance as otherwise up "
+    "to the entire EDRAM may be considered used in draws without clipping, "
+    "potentially resulting in spurious EDRAM range ownership transfer round "
+    "trips between host render targets.\n"
+    "Also, on hosts where certain render target formats have to be emulated "
+    "in a lossy way (for instance, 16-bit fixed-point via 16-bit "
+    "floating-point), this prevents corruption of other render targets "
+    "located after the current ones in the EDRAM by lossy range ownership "
+    "transfers done for those draws.",
+    "GPU");
+DEFINE_bool(
+    execute_unclipped_draw_vs_on_cpu_with_scissor, false,
+    "Don't restrict the usage of execute_unclipped_draw_vs_on_cpu to only "
+    "non-scissored draws (with the right and the bottom sides of the scissor "
+    "rectangle at 8192 or beyond), even though, if the scissor rectangle is "
+    "present, it's usually sufficient for estimating the height of the "
+    "render target.\n"
+    "Enabling this may cause excessive processing of vertices on the CPU, as "
+    "some games draw rectangles (for their UI, for instance) without "
+    "clipping, but with a proper scissor rectangle.",
+    "GPU");
+
+namespace xe {
+namespace gpu {
+
+void DrawExtentEstimator::PositionYExportSink::Export(
+    ucode::ExportRegister export_register, const float* value,
+    uint32_t value_mask) {
+  if (export_register == ucode::ExportRegister::kVSPosition) {
+    if (value_mask & 0b0010) {
+      position_y_ = value[1];
+    }
+    if (value_mask & 0b1000) {
+      position_w_ = value[3];
+    }
+  } else if (export_register ==
+             ucode::ExportRegister::kVSPointSizeEdgeFlagKillVertex) {
+    if (value_mask & 0b0001) {
+      point_size_ = value[0];
+    }
+    if (value_mask & 0b0100) {
+      vertex_kill_ = *reinterpret_cast<const uint32_t*>(&value[2]);
+    }
+  }
+}
+
+uint32_t
DrawExtentEstimator::EstimateVertexMaxY(const Shader& vertex_shader) { + SCOPE_profile_cpu_f("gpu"); + + const RegisterFile& regs = register_file_; + + auto vgt_draw_initiator = regs.Get(); + if (!vgt_draw_initiator.num_indices) { + return 0; + } + if (vgt_draw_initiator.source_select != xenos::SourceSelect::kDMA && + vgt_draw_initiator.source_select != xenos::SourceSelect::kAutoIndex) { + // TODO(Triang3l): Support immediate indices. + return xenos::kTexture2DCubeMaxWidthHeight; + } + + // Not reproducing tessellation. + if (xenos::IsMajorModeExplicit(vgt_draw_initiator.major_mode, + vgt_draw_initiator.prim_type) && + regs.Get().path_select == + xenos::VGTOutputPath::kTessellationEnable) { + return xenos::kTexture2DCubeMaxWidthHeight; + } + + assert_true(vertex_shader.type() == xenos::ShaderType::kVertex); + assert_true(vertex_shader.is_ucode_analyzed()); + if (!ShaderInterpreter::CanInterpretShader(vertex_shader)) { + return xenos::kTexture2DCubeMaxWidthHeight; + } + + auto vgt_dma_size = regs.Get(); + union { + const void* index_buffer; + const uint16_t* index_buffer_16; + const uint32_t* index_buffer_32; + }; + xenos::Endian index_endian = vgt_dma_size.swap_mode; + if (vgt_draw_initiator.source_select == xenos::SourceSelect::kDMA) { + xenos::IndexFormat index_format = vgt_draw_initiator.index_size; + uint32_t index_buffer_base = regs[XE_GPU_REG_VGT_DMA_BASE].u32; + uint32_t index_buffer_read_count = + std::min(vgt_draw_initiator.num_indices, vgt_dma_size.num_words); + if (vgt_draw_initiator.index_size == xenos::IndexFormat::kInt16) { + // Handle the index endianness to same way as the PrimitiveProcessor. + if (index_endian == xenos::Endian::k8in32) { + index_endian = xenos::Endian::k8in16; + } else if (index_endian == xenos::Endian::k16in32) { + index_endian = xenos::Endian::kNone; + } + index_buffer_base &= ~uint32_t(sizeof(uint16_t) - 1); + if (trace_writer_) { + trace_writer_->WriteMemoryRead( + index_buffer_base, sizeof(uint16_t) * index_buffer_read_count); + } + } else { + assert_true(vgt_draw_initiator.index_size == xenos::IndexFormat::kInt32); + index_buffer_base &= ~uint32_t(sizeof(uint32_t) - 1); + if (trace_writer_) { + trace_writer_->WriteMemoryRead( + index_buffer_base, sizeof(uint32_t) * index_buffer_read_count); + } + } + index_buffer = memory_.TranslatePhysical(index_buffer_base); + } + auto pa_su_sc_mode_cntl = regs.Get(); + uint32_t reset_index = + regs.Get().reset_indx; + uint32_t index_offset = regs.Get().indx_offset; + uint32_t min_index = regs.Get().min_indx; + uint32_t max_index = regs.Get().max_indx; + + auto pa_cl_vte_cntl = regs.Get(); + float viewport_y_scale = pa_cl_vte_cntl.vport_y_scale_ena + ? regs[XE_GPU_REG_PA_CL_VPORT_YSCALE].f32 + : 1.0f; + float viewport_y_offset = pa_cl_vte_cntl.vport_y_offset_ena + ? 
regs[XE_GPU_REG_PA_CL_VPORT_YOFFSET].f32 + : 0.0f; + + int32_t point_vertex_min_diameter_float = 0; + int32_t point_vertex_max_diameter_float = 0; + float point_constant_radius_y = 0.0f; + if (vgt_draw_initiator.prim_type == xenos::PrimitiveType::kPointList) { + auto pa_su_point_minmax = regs.Get(); + *reinterpret_cast(&point_vertex_min_diameter_float) = + float(pa_su_point_minmax.min_size) * (2.0f / 16.0f); + *reinterpret_cast(&point_vertex_max_diameter_float) = + float(pa_su_point_minmax.max_size) * (2.0f / 16.0f); + point_constant_radius_y = + float(regs.Get().height) * (1.0f / 16.0f); + } + + float max_y = -FLT_MAX; + + shader_interpreter_.SetShader(vertex_shader); + + PositionYExportSink position_y_export_sink; + shader_interpreter_.SetExportSink(&position_y_export_sink); + for (uint32_t i = 0; i < vgt_draw_initiator.num_indices; ++i) { + uint32_t vertex_index; + if (vgt_draw_initiator.source_select == xenos::SourceSelect::kDMA) { + if (i < vgt_dma_size.num_words) { + if (vgt_draw_initiator.index_size == xenos::IndexFormat::kInt16) { + vertex_index = index_buffer_16[i]; + } else { + vertex_index = index_buffer_32[i]; + } + // The Xenos only uses 24 bits of the index (reset_indx is 24-bit). + vertex_index = xenos::GpuSwap(vertex_index, index_endian) & 0xFFFFFF; + } else { + vertex_index = 0; + } + if (pa_su_sc_mode_cntl.multi_prim_ib_ena && vertex_index == reset_index) { + continue; + } + } else { + assert_true(vgt_draw_initiator.source_select == + xenos::SourceSelect::kAutoIndex); + vertex_index = i; + } + vertex_index = + std::min(max_index, + std::max(min_index, (vertex_index + index_offset) & 0xFFFFFF)); + + position_y_export_sink.Reset(); + + shader_interpreter_.temp_registers()[0] = float(vertex_index); + shader_interpreter_.Execute(); + + if (position_y_export_sink.vertex_kill().has_value() && + (position_y_export_sink.vertex_kill().value() & ~(UINT32_C(1) << 31))) { + continue; + } + if (!position_y_export_sink.position_y().has_value()) { + continue; + } + float vertex_y = position_y_export_sink.position_y().value(); + if (!pa_cl_vte_cntl.vtx_xy_fmt) { + if (!position_y_export_sink.position_w().has_value()) { + continue; + } + vertex_y /= position_y_export_sink.position_w().value(); + } + + vertex_y = vertex_y * viewport_y_scale + viewport_y_offset; + + if (vgt_draw_initiator.prim_type == xenos::PrimitiveType::kPointList) { + float point_radius_y; + if (position_y_export_sink.point_size().has_value()) { + // Vertex-specified diameter. Clamped effectively as a signed integer in + // the hardware, -NaN, -Infinity ... -0 to the minimum, +Infinity, +NaN + // to the maximum. + point_radius_y = position_y_export_sink.point_size().value(); + *reinterpret_cast(&point_radius_y) = std::min( + point_vertex_max_diameter_float, + std::max(point_vertex_min_diameter_float, + *reinterpret_cast(&point_radius_y))); + point_radius_y *= 0.5f; + } else { + // Constant radius. + point_radius_y = point_constant_radius_y; + } + vertex_y += point_radius_y; + } + + // std::max is `a < b ? b : a`, thus in case of NaN, the first argument is + // always returned - max_y, which is initialized to a normalized value. + max_y = std::max(max_y, vertex_y); + } + shader_interpreter_.SetExportSink(nullptr); + + int32_t max_y_24p8 = ui::FloatToD3D11Fixed16p8(max_y); + // 16p8 range is -32768 to 32767+255/256, but it's stored as uint32_t here, + // as 24p8, so overflowing up to -8388608 to 8388608+255/256 is safe. 
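A condensed standalone sketch of the 24.8 arithmetic described in this comment and carried out by the lines that follow; the helper name and parameters are made up, and the input is assumed to already be the 16.8 subpixel Y produced by the FloatToD3D11Fixed16p8 conversion above (window_y_offset_px is 0 when vertex window offsetting is disabled):

#include <algorithm>
#include <cstdint>

// Sketch (hypothetical helper, not part of the patch): turn an estimated
// subpixel max Y, in 256ths of a pixel, into a row count. Widening to 24.8
// makes the half-pixel and window offset additions overflow-safe.
uint32_t MaxYRowsFrom16p8(int32_t max_y_16p8, int32_t window_y_offset_px,
                          bool half_pixel_offset, bool msaa) {
  int32_t max_y_24p8 = max_y_16p8;
  if (half_pixel_offset) {
    max_y_24p8 += 128;  // + 0.5 pixels.
  }
  max_y_24p8 += window_y_offset_px * 256;  // Whole pixels to 256ths.
  // Top-left rule: .5 exclusive without MSAA, 1.0 exclusive with MSAA.
  return (uint32_t(std::max(int32_t(0), max_y_24p8)) + (msaa ? 255 : 127)) >> 8;
}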
The + // range of the window offset plus the half-pixel offset is -16384 to 16384.5, + // so it's safe to add both - adding it will neither move the 16p8 clamping + // bounds -32768 and 32767+255/256 into the 0...8192 screen space range, nor + // cause 24p8 overflow. + if (!regs.Get().pix_center) { + max_y_24p8 += 128; + } + if (pa_su_sc_mode_cntl.vtx_window_offset_enable) { + max_y_24p8 += regs.Get().window_y_offset * 256; + } + // Top-left rule - .5 exclusive without MSAA, 1. exclusive with MSAA. + auto rb_surface_info = regs.Get(); + return (uint32_t(std::max(int32_t(0), max_y_24p8)) + + ((rb_surface_info.msaa_samples == xenos::MsaaSamples::k1X) ? 127 + : 255)) >> + 8; +} + +uint32_t DrawExtentEstimator::EstimateMaxY(bool try_to_estimate_vertex_max_y, + const Shader& vertex_shader) { + SCOPE_profile_cpu_f("gpu"); + + const RegisterFile& regs = register_file_; + + auto pa_sc_window_offset = regs.Get(); + int32_t window_y_offset = pa_sc_window_offset.window_y_offset; + + // Scissor. + auto pa_sc_window_scissor_br = regs.Get(); + int32_t scissor_bottom = int32_t(pa_sc_window_scissor_br.br_y); + bool scissor_window_offset = + !regs.Get().window_offset_disable; + if (scissor_window_offset) { + scissor_bottom += window_y_offset; + } + auto pa_sc_screen_scissor_br = regs.Get(); + scissor_bottom = std::min(scissor_bottom, pa_sc_screen_scissor_br.br_y); + uint32_t max_y = uint32_t(std::max(scissor_bottom, int32_t(0))); + + if (regs.Get().clip_disable) { + // Actual extent from the vertices. + if (try_to_estimate_vertex_max_y && + cvars::execute_unclipped_draw_vs_on_cpu) { + bool estimate_vertex_max_y; + if (cvars::execute_unclipped_draw_vs_on_cpu_with_scissor) { + estimate_vertex_max_y = true; + } else { + estimate_vertex_max_y = false; + if (scissor_bottom >= xenos::kTexture2DCubeMaxWidthHeight) { + // Handle just the usual special 8192x8192 case in Direct3D 9 - 8192 + // may be a normal render target height (80x8192 is well within the + // EDRAM size, for instance), no need to process the vertices on the + // CPU in this case. + int32_t scissor_right = int32_t(pa_sc_window_scissor_br.br_x); + if (scissor_window_offset) { + scissor_right += pa_sc_window_offset.window_x_offset; + } + scissor_right = std::min(scissor_right, pa_sc_screen_scissor_br.br_x); + if (scissor_right >= xenos::kTexture2DCubeMaxWidthHeight) { + estimate_vertex_max_y = true; + } + } + } + if (estimate_vertex_max_y) { + max_y = std::min(max_y, EstimateVertexMaxY(vertex_shader)); + } + } + } else { + // Viewport. Though the Xenos itself doesn't have an implicit viewport + // scissor (it's set by Direct3D 9 when a viewport is used), on hosts, it + // usually exists and can't be disabled. + auto pa_cl_vte_cntl = regs.Get(); + float viewport_bottom = 0.0f; + // First calculate all the integer.0 or integer.5 offsetting exactly at full + // precision. + if (regs.Get().vtx_window_offset_enable) { + viewport_bottom += float(window_y_offset); + } + if (!regs.Get().pix_center) { + viewport_bottom += 0.5f; + } + // Then apply the floating-point viewport offset. + if (pa_cl_vte_cntl.vport_y_offset_ena) { + viewport_bottom += regs[XE_GPU_REG_PA_CL_VPORT_YOFFSET].f32; + } + viewport_bottom += pa_cl_vte_cntl.vport_y_scale_ena + ? std::abs(regs[XE_GPU_REG_PA_CL_VPORT_YSCALE].f32) + : 1.0f; + // Using floor, or, rather, truncation (because maxing with zero anyway) + // similar to how viewport scissoring behaves on real AMD, Intel and Nvidia + // GPUs on Direct3D 12 (but not WARP), also like in + // draw_util::GetHostViewportInfo. 
+ // max(0.0f, viewport_bottom) to drop NaN and < 0 - max picks the first + // argument in the !(a < b) case (always for NaN), min as float (max_y is + // well below 2^24) to safely drop very large values. + max_y = uint32_t(std::min(float(max_y), std::max(0.0f, viewport_bottom))); + } + + return max_y; +} + +} // namespace gpu +} // namespace xe diff --git a/src/xenia/gpu/draw_extent_estimator.h b/src/xenia/gpu/draw_extent_estimator.h new file mode 100644 index 000000000..3e360489e --- /dev/null +++ b/src/xenia/gpu/draw_extent_estimator.h @@ -0,0 +1,76 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2022 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#ifndef XENIA_GPU_DRAW_EXTENT_ESTIMATOR_H_ +#define XENIA_GPU_DRAW_EXTENT_ESTIMATOR_H_ + +#include +#include + +#include "xenia/gpu/register_file.h" +#include "xenia/gpu/shader.h" +#include "xenia/gpu/shader_interpreter.h" +#include "xenia/gpu/trace_writer.h" +#include "xenia/memory.h" + +namespace xe { +namespace gpu { + +class DrawExtentEstimator { + public: + DrawExtentEstimator(const RegisterFile& register_file, const Memory& memory, + TraceWriter* trace_writer) + : register_file_(register_file), + memory_(memory), + trace_writer_(trace_writer), + shader_interpreter_(register_file, memory) { + shader_interpreter_.SetTraceWriter(trace_writer); + } + + // The shader must have its ucode analyzed. + uint32_t EstimateVertexMaxY(const Shader& vertex_shader); + uint32_t EstimateMaxY(bool try_to_estimate_vertex_max_y, + const Shader& vertex_shader); + + private: + class PositionYExportSink : public ShaderInterpreter::ExportSink { + public: + void Export(ucode::ExportRegister export_register, const float* value, + uint32_t value_mask) override; + + void Reset() { + position_y_.reset(); + position_w_.reset(); + point_size_.reset(); + vertex_kill_.reset(); + } + + const std::optional& position_y() const { return position_y_; } + const std::optional& position_w() const { return position_w_; } + const std::optional& point_size() const { return point_size_; } + const std::optional& vertex_kill() const { return vertex_kill_; } + + private: + std::optional position_y_; + std::optional position_w_; + std::optional point_size_; + std::optional vertex_kill_; + }; + + const RegisterFile& register_file_; + const Memory& memory_; + TraceWriter* trace_writer_; + + ShaderInterpreter shader_interpreter_; +}; + +} // namespace gpu +} // namespace xe + +#endif // XENIA_GPU_DRAW_EXTENT_ESTIMATOR_H_ diff --git a/src/xenia/gpu/registers.h b/src/xenia/gpu/registers.h index f6425c277..1a7e721ce 100644 --- a/src/xenia/gpu/registers.h +++ b/src/xenia/gpu/registers.h @@ -215,6 +215,31 @@ union alignas(uint32_t) SQ_INTERPOLATOR_CNTL { }; static_assert_size(SQ_INTERPOLATOR_CNTL, sizeof(uint32_t)); +union alignas(uint32_t) SQ_VS_CONST { + uint32_t value; + struct { + uint32_t base : 9; // +0 + uint32_t : 3; // +9 + // Vec4 count minus one. + uint32_t size : 9; // 12 + }; + static constexpr Register register_index = XE_GPU_REG_SQ_VS_CONST; +}; +static_assert_size(SQ_VS_CONST, sizeof(uint32_t)); + +// Same as SQ_VS_CONST. 
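For context on how these base/size fields get used, the float-constant remapping that ShaderInterpreter::GetFloatConstant performs later in this patch can be sketched as follows (hypothetical helper, written as if inside namespace xe::gpu, and assuming the RegisterFile accessors the interpreter itself uses):

#include "xenia/gpu/register_file.h"
#include "xenia/gpu/registers.h"
#include "xenia/gpu/xenos.h"

// Sketch (hypothetical helper): the index is rejected if it exceeds `size`
// (vec4 count minus one), rebased by `base`, and clamped to the 512-vec4
// shader constant file, mirroring ShaderInterpreter::GetFloatConstant below.
inline const float* LookupFloatConstant(const RegisterFile& regs,
                                        xenos::ShaderType shader_type,
                                        uint32_t index) {
  static const float zero[4] = {};
  auto base_and_size_minus_1 = regs.Get<reg::SQ_VS_CONST>(
      shader_type == xenos::ShaderType::kVertex ? XE_GPU_REG_SQ_VS_CONST
                                                : XE_GPU_REG_SQ_PS_CONST);
  if (index > base_and_size_minus_1.size) {
    return zero;
  }
  index += base_and_size_minus_1.base;
  if (index >= 512) {
    return zero;
  }
  return &regs[XE_GPU_REG_SHADER_CONSTANT_000_X + 4 * index].f32;
}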
+union alignas(uint32_t) SQ_PS_CONST { + uint32_t value; + struct { + uint32_t base : 9; // +0 + uint32_t : 3; // +9 + // Vec4 count minus one. + uint32_t size : 9; // 12 + }; + static constexpr Register register_index = XE_GPU_REG_SQ_PS_CONST; +}; +static_assert_size(SQ_PS_CONST, sizeof(uint32_t)); + /******************************************************************************* __ _____ ___ _____ _____ __ \ \ / / __| _ \_ _| __\ \/ / diff --git a/src/xenia/gpu/render_target_cache.cc b/src/xenia/gpu/render_target_cache.cc index d840a3550..5b5e9f613 100644 --- a/src/xenia/gpu/render_target_cache.cc +++ b/src/xenia/gpu/render_target_cache.cc @@ -22,7 +22,6 @@ #include "xenia/base/logging.h" #include "xenia/base/math.h" #include "xenia/gpu/draw_util.h" -#include "xenia/gpu/gpu_flags.h" #include "xenia/gpu/register_file.h" #include "xenia/gpu/registers.h" #include "xenia/gpu/xenos.h" @@ -143,6 +142,19 @@ DEFINE_bool( "-1...1, remap -32...32 to -1...1 to use the full possible range of " "values, at the expense of multiplicative blending correctness.", "GPU"); +// Enabled by default as the GPU is overall usually the bottleneck when the +// pixel shader interlock render backend implementation is used, anything that +// may improve GPU performance is favorable. +DEFINE_bool( + execute_unclipped_draw_vs_on_cpu_for_psi_render_backend, true, + "If execute_unclipped_draw_vs_on_cpu is enabled, execute the vertex shader " + "for unclipped draws on the CPU even when using the pixel shader interlock " + "(rasterizer-ordered view) implementation of the render backend on the " + "host, for which no expensive copying between host render targets is " + "needed when the ownership of a EDRAM range is changed.\n" + "If this is enabled, excessive barriers may be eliminated when switching " + "between different render targets in separate EDRAM locations.", + "GPU"); namespace xe { namespace gpu { @@ -367,7 +379,8 @@ void RenderTargetCache::BeginFrame() { ResetAccumulatedRenderTargets(); } bool RenderTargetCache::Update(bool is_rasterization_done, reg::RB_DEPTHCONTROL normalized_depth_control, - uint32_t normalized_color_mask) { + uint32_t normalized_color_mask, + const Shader& vertex_shader) { const RegisterFile& regs = register_file(); bool interlock_barrier_only = GetPath() == Path::kPixelShaderInterlock; @@ -556,47 +569,13 @@ bool RenderTargetCache::Update(bool is_rasterization_done, // Estimate height used by render targets (for color for writes, for depth / // stencil for both reads and writes) from various sources. - uint32_t height_used = - GetRenderTargetHeight(pitch_tiles_at_32bpp, msaa_samples); - int32_t window_y_offset = - regs.Get().window_y_offset; - if (!regs.Get().clip_disable) { - auto pa_cl_vte_cntl = regs.Get(); - float viewport_bottom = 0.0f; - // First calculate all the integer.0 or integer.5 offsetting exactly at full - // precision. - if (regs.Get().vtx_window_offset_enable) { - viewport_bottom += float(window_y_offset); - } - if (cvars::half_pixel_offset && - !regs.Get().pix_center) { - viewport_bottom += 0.5f; - } - // Then apply the floating-point viewport offset. - if (pa_cl_vte_cntl.vport_y_offset_ena) { - viewport_bottom += regs[XE_GPU_REG_PA_CL_VPORT_YOFFSET].f32; - } - viewport_bottom += pa_cl_vte_cntl.vport_y_scale_ena - ? 
std::abs(regs[XE_GPU_REG_PA_CL_VPORT_YSCALE].f32) - : 1.0f; - // Using floor, or, rather, truncation (because maxing with zero anyway) - // similar to how viewport scissoring behaves on real AMD, Intel and Nvidia - // GPUs on Direct3D 12, also like in draw_util::GetHostViewportInfo. - // max(0.0f, viewport_bottom) to drop NaN and < 0 - max picks the first - // argument in the !(a < b) case (always for NaN), min as float (height_used - // is well below 2^24) to safely drop very large values. - height_used = - uint32_t(std::min(float(height_used), std::max(0.0f, viewport_bottom))); - } - int32_t scissor_bottom = - int32_t(regs.Get().br_y); - if (!regs.Get().window_offset_disable) { - scissor_bottom += window_y_offset; - } - scissor_bottom = - std::min(scissor_bottom, regs.Get().br_y); - height_used = - std::min(height_used, uint32_t(std::max(scissor_bottom, int32_t(0)))); + uint32_t height_used = std::min( + GetRenderTargetHeight(pitch_tiles_at_32bpp, msaa_samples), + draw_extent_estimator_.EstimateMaxY( + interlock_barrier_only + ? cvars::execute_unclipped_draw_vs_on_cpu_for_psi_render_backend + : true, + vertex_shader)); // Sorted by EDRAM base and then by index in the pipeline - for simplicity, // treat render targets placed closer to the end of the EDRAM as truncating diff --git a/src/xenia/gpu/render_target_cache.h b/src/xenia/gpu/render_target_cache.h index 2a04df75a..6a1aa3ea7 100644 --- a/src/xenia/gpu/render_target_cache.h +++ b/src/xenia/gpu/render_target_cache.h @@ -21,9 +21,11 @@ #include "third_party/fmt/include/fmt/format.h" #include "xenia/base/assert.h" #include "xenia/base/cvar.h" +#include "xenia/gpu/draw_extent_estimator.h" #include "xenia/gpu/draw_util.h" #include "xenia/gpu/register_file.h" #include "xenia/gpu/registers.h" +#include "xenia/gpu/shader.h" #include "xenia/gpu/xenos.h" DECLARE_bool(depth_transfer_not_equal_test); @@ -217,7 +219,8 @@ class RenderTargetCache { virtual bool Update(bool is_rasterization_done, reg::RB_DEPTHCONTROL normalized_depth_control, - uint32_t normalized_color_mask); + uint32_t normalized_color_mask, + const Shader& vertex_shader); // Returns bits where 0 is whether a depth render target is currently bound on // the host and 1... are whether the same applies to color render targets, and @@ -228,8 +231,10 @@ class RenderTargetCache { uint32_t* depth_and_color_formats_out = nullptr) const; protected: - RenderTargetCache(const RegisterFile& register_file) - : register_file_(register_file) {} + RenderTargetCache(const RegisterFile& register_file, const Memory& memory, + TraceWriter* trace_writer) + : register_file_(register_file), + draw_extent_estimator_(register_file, memory, trace_writer) {} const RegisterFile& register_file() const { return register_file_; } @@ -606,6 +611,8 @@ class RenderTargetCache { private: const RegisterFile& register_file_; + DrawExtentEstimator draw_extent_estimator_; + // For host render targets. struct OwnershipRange { diff --git a/src/xenia/gpu/shader.h b/src/xenia/gpu/shader.h index 91964e332..99ad84b8a 100644 --- a/src/xenia/gpu/shader.h +++ b/src/xenia/gpu/shader.h @@ -914,6 +914,12 @@ class Shader { // True if the current shader has any `kill` instructions. bool kills_pixels() const { return kills_pixels_; } + // True if the shader has any texture-related instructions (any fetch + // instructions other than vertex fetch) writing any non-constant components. 
+ bool uses_texture_fetch_instruction_results() const { + return uses_texture_fetch_instruction_results_; + } + // True if the shader overrides the pixel depth. bool writes_depth() const { return writes_depth_; } @@ -1002,6 +1008,7 @@ class Shader { uint32_t register_static_address_bound_ = 0; bool uses_register_dynamic_addressing_ = false; bool kills_pixels_ = false; + bool uses_texture_fetch_instruction_results_ = false; bool writes_depth_ = false; uint32_t writes_color_targets_ = 0b0000; diff --git a/src/xenia/gpu/shader_interpreter.cc b/src/xenia/gpu/shader_interpreter.cc new file mode 100644 index 000000000..566bade43 --- /dev/null +++ b/src/xenia/gpu/shader_interpreter.cc @@ -0,0 +1,1214 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2022 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#include "xenia/gpu/shader_interpreter.h" + +#include +#include + +#include "xenia/base/assert.h" +#include "xenia/base/byte_order.h" +#include "xenia/base/math.h" +#include "xenia/gpu/registers.h" +#include "xenia/gpu/trace_writer.h" +#include "xenia/gpu/xenos.h" + +namespace xe { +namespace gpu { + +void ShaderInterpreter::Execute() { + // For more consistency between invocations in case of a malformed shader. + state_.Reset(); + + const uint32_t* bool_constants = + ®ister_file_[XE_GPU_REG_SHADER_CONSTANT_BOOL_000_031].u32; + const xenos::LoopConstant* loop_constants = + reinterpret_cast( + ®ister_file_[XE_GPU_REG_SHADER_CONSTANT_LOOP_00].u32); + + bool exec_ended = false; + uint32_t cf_index_next = 1; + for (uint32_t cf_index = 0; !exec_ended; cf_index = cf_index_next) { + cf_index_next = cf_index + 1; + + const uint32_t* cf_pair = &ucode_[3 * (cf_index >> 1)]; + ucode::ControlFlowInstruction cf_instr; + if (cf_index & 1) { + cf_instr.dword_0 = (cf_pair[1] >> 16) | (cf_pair[2] << 16); + cf_instr.dword_1 = cf_pair[2] >> 16; + } else { + cf_instr.dword_0 = cf_pair[0]; + cf_instr.dword_1 = cf_pair[1] & 0xFFFF; + } + + ucode::ControlFlowOpcode cf_opcode = cf_instr.opcode(); + switch (cf_opcode) { + case ucode::ControlFlowOpcode::kNop: { + } break; + + case ucode::ControlFlowOpcode::kExec: + case ucode::ControlFlowOpcode::kExecEnd: + case ucode::ControlFlowOpcode::kCondExec: + case ucode::ControlFlowOpcode::kCondExecEnd: + case ucode::ControlFlowOpcode::kCondExecPred: + case ucode::ControlFlowOpcode::kCondExecPredEnd: + case ucode::ControlFlowOpcode::kCondExecPredClean: + case ucode::ControlFlowOpcode::kCondExecPredCleanEnd: { + ucode::ControlFlowExecInstruction cf_exec = + *reinterpret_cast( + &cf_instr); + + switch (cf_opcode) { + case ucode::ControlFlowOpcode::kCondExec: + case ucode::ControlFlowOpcode::kCondExecEnd: + case ucode::ControlFlowOpcode::kCondExecPredClean: + case ucode::ControlFlowOpcode::kCondExecPredCleanEnd: { + const ucode::ControlFlowCondExecInstruction cf_cond_exec = + *reinterpret_cast( + &cf_exec); + uint32_t bool_address = cf_cond_exec.bool_address(); + if (cf_cond_exec.condition() != + ((bool_constants[bool_address >> 5] & + (UINT32_C(1) << (bool_address & 31))) != 0)) { + continue; + } + } break; + case ucode::ControlFlowOpcode::kCondExecPred: + case ucode::ControlFlowOpcode::kCondExecPredEnd: { + const 
ucode::ControlFlowCondExecPredInstruction cf_cond_exec_pred = + *reinterpret_cast< + const ucode::ControlFlowCondExecPredInstruction*>(&cf_exec); + if (cf_cond_exec_pred.condition() != state_.predicate) { + continue; + } + } break; + default: + break; + } + + for (uint32_t exec_index = 0; exec_index < cf_exec.count(); + ++exec_index) { + const uint32_t* exec_instruction = + &ucode_[3 * (cf_exec.address() + exec_index)]; + if ((cf_exec.sequence() >> (exec_index << 1)) & 0b01) { + const ucode::FetchInstruction& fetch_instr = + *reinterpret_cast( + exec_instruction); + if (fetch_instr.is_predicated() && + fetch_instr.predicate_condition() != state_.predicate) { + continue; + } + if (fetch_instr.opcode() == ucode::FetchOpcode::kVertexFetch) { + ExecuteVertexFetchInstruction(fetch_instr.vertex_fetch()); + } else { + // Not supporting texture fetching (very complex). + float zero_result[4] = {}; + StoreFetchResult(fetch_instr.dest(), + fetch_instr.is_dest_relative(), + fetch_instr.dest_swizzle(), zero_result); + } + } else { + const ucode::AluInstruction& alu_instr = + *reinterpret_cast( + exec_instruction); + if (alu_instr.is_predicated() && + alu_instr.predicate_condition() != state_.predicate) { + continue; + } + ExecuteAluInstruction(alu_instr); + } + } + + if (ucode::DoesControlFlowOpcodeEndShader(cf_opcode)) { + exec_ended = true; + } + } break; + + case ucode::ControlFlowOpcode::kLoopStart: { + ucode::ControlFlowLoopStartInstruction cf_loop_start = + *reinterpret_cast( + &cf_instr); + assert_true(state_.loop_stack_depth < 4); + if (++state_.loop_stack_depth > 4) { + cf_index_next = cf_loop_start.address(); + continue; + } + xenos::LoopConstant loop_constant = + loop_constants[cf_loop_start.loop_id()]; + state_.loop_constants[state_.loop_stack_depth] = loop_constant; + uint32_t& loop_iterator_ref = + state_.loop_iterators[state_.loop_stack_depth]; + if (!cf_loop_start.is_repeat()) { + loop_iterator_ref = 0; + } + if (loop_iterator_ref >= loop_constant.count) { + cf_index_next = cf_loop_start.address(); + continue; + } + ++state_.loop_stack_depth; + } break; + + case ucode::ControlFlowOpcode::kLoopEnd: { + assert_not_zero(state_.loop_stack_depth); + if (!state_.loop_stack_depth) { + continue; + } + assert_true(state_.loop_stack_depth <= 4); + if (state_.loop_stack_depth > 4) { + --state_.loop_stack_depth; + continue; + } + ucode::ControlFlowLoopEndInstruction cf_loop_end = + *reinterpret_cast( + &cf_instr); + xenos::LoopConstant loop_constant = + state_.loop_constants[state_.loop_stack_depth - 1]; + assert_true(loop_constant.value == + loop_constants[cf_loop_end.loop_id()].value); + uint32_t loop_iterator = + ++state_.loop_iterators[state_.loop_stack_depth - 1]; + if (loop_iterator < loop_constant.count && + (!cf_loop_end.is_predicated_break() || + cf_loop_end.condition() != state_.predicate)) { + cf_index_next = cf_loop_end.address(); + continue; + } + --state_.loop_stack_depth; + } break; + + case ucode::ControlFlowOpcode::kCondCall: { + assert_true(state_.call_stack_depth < 4); + if (state_.call_stack_depth >= 4) { + continue; + } + const ucode::ControlFlowCondCallInstruction cf_cond_call = + *reinterpret_cast( + &cf_instr); + if (!cf_cond_call.is_unconditional()) { + if (cf_cond_call.is_predicated()) { + if (cf_cond_call.condition() != state_.predicate) { + continue; + } + } else { + uint32_t bool_address = cf_cond_call.bool_address(); + if (cf_cond_call.condition() != + ((bool_constants[bool_address >> 5] & + (UINT32_C(1) << (bool_address & 31))) != 0)) { + continue; + } + } + } + 
state_.call_return_addresses[state_.call_stack_depth++] = cf_index + 1; + cf_index_next = cf_cond_call.address(); + } break; + + case ucode::ControlFlowOpcode::kReturn: { + // No stack depth assertion - skipping the return is a well-defined + // behavior for `return` outside a function call. + if (!state_.call_stack_depth) { + continue; + } + cf_index_next = state_.call_return_addresses[--state_.call_stack_depth]; + } break; + + case ucode::ControlFlowOpcode::kCondJmp: { + const ucode::ControlFlowCondJmpInstruction cf_cond_jmp = + *reinterpret_cast( + &cf_instr); + if (!cf_cond_jmp.is_unconditional()) { + if (cf_cond_jmp.is_predicated()) { + if (cf_cond_jmp.condition() != state_.predicate) { + continue; + } + } else { + uint32_t bool_address = cf_cond_jmp.bool_address(); + if (cf_cond_jmp.condition() != + ((bool_constants[bool_address >> 5] & + (UINT32_C(1) << (bool_address & 31))) != 0)) { + continue; + } + } + } + cf_index_next = cf_cond_jmp.address(); + } break; + + case ucode::ControlFlowOpcode::kAlloc: { + if (export_sink_) { + const ucode::ControlFlowAllocInstruction& cf_alloc = + *reinterpret_cast( + &cf_instr); + export_sink_->AllocExport(cf_alloc.alloc_type(), cf_alloc.size()); + } + } break; + + case ucode::ControlFlowOpcode::kMarkVsFetchDone: { + } break; + + default: + assert_unhandled_case(cf_opcode); + } + } +} + +const float* ShaderInterpreter::GetFloatConstant( + uint32_t address, bool is_relative, bool relative_address_is_a0) const { + static const float zero[4] = {}; + int32_t index = int32_t(address); + if (is_relative) { + index += relative_address_is_a0 ? state_.address_register + : state_.GetLoopAddress(); + } + if (index < 0) { + return zero; + } + auto base_and_size_minus_1 = register_file_.Get( + shader_type_ == xenos::ShaderType::kVertex ? XE_GPU_REG_SQ_VS_CONST + : XE_GPU_REG_SQ_PS_CONST); + if (uint32_t(index) > base_and_size_minus_1.size) { + return zero; + } + index += base_and_size_minus_1.base; + if (index >= 512) { + return zero; + } + return ®ister_file_[XE_GPU_REG_SHADER_CONSTANT_000_X + 4 * index].f32; +} + +void ShaderInterpreter::ExecuteAluInstruction(ucode::AluInstruction instr) { + // Vector operation. 
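Before the operand-fetching loops below, the per-component source modifier handling can be seen in isolation. This is a hedged standalone sketch; FlushDenormalAssumed is a stand-in whose behavior (denormals treated as signed zero) is an assumption, since the interpreter's own FlushDenormal helper is declared in the header rather than in this hunk:

#include <cstdint>
#include <cstring>

// Assumed stand-in for the interpreter's FlushDenormal: denormals become
// (signed) zero, everything else is passed through unchanged.
inline float FlushDenormalAssumed(float value) {
  uint32_t bits;
  std::memcpy(&bits, &value, sizeof(bits));
  if (!(bits & 0x7F800000u)) {
    bits &= 0x80000000u;
  }
  std::memcpy(&value, &bits, sizeof(value));
  return value;
}

// Sketch: reading one ALU source component the way the loops below do -
// flush denormals, clear the sign bit for |abs|, then XOR in the negate bit.
inline float ReadAluSrcComponent(float value, bool abs, bool negate) {
  value = FlushDenormalAssumed(value);
  uint32_t bits;
  std::memcpy(&bits, &value, sizeof(bits));
  bits &= ~(uint32_t(abs) << 31);
  bits ^= uint32_t(negate) << 31;
  std::memcpy(&value, &bits, sizeof(value));
  return value;
}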
+ float vector_result[4] = {}; + ucode::AluVectorOpcode vector_opcode = instr.vector_opcode(); + const ucode::AluVectorOpcodeInfo& vector_opcode_info = + ucode::GetAluVectorOpcodeInfo(vector_opcode); + uint32_t vector_result_write_mask = instr.GetVectorOpResultWriteMask(); + if (vector_result_write_mask || vector_opcode_info.changed_state) { + float vector_operands[3][4]; + for (uint32_t i = 0; i < 3; ++i) { + if (!vector_opcode_info.operand_components_used[i]) { + continue; + } + const float* vector_src_ptr; + uint32_t vector_src_register = instr.src_reg(1 + i); + bool vector_src_absolute = false; + if (instr.src_is_temp(1 + i)) { + vector_src_ptr = GetTempRegister( + ucode::AluInstruction::src_temp_reg(vector_src_register), + ucode::AluInstruction::is_src_temp_relative(vector_src_register)); + vector_src_absolute = ucode::AluInstruction::is_src_temp_value_absolute( + vector_src_register); + } else { + vector_src_ptr = GetFloatConstant( + vector_src_register, instr.src_const_is_addressed(1 + i), + instr.is_const_address_register_relative()); + } + uint32_t vector_src_absolute_mask = + ~(uint32_t(vector_src_absolute) << 31); + uint32_t vector_src_negate_bit = uint32_t(instr.src_negate(1 + i)) << 31; + uint32_t vector_src_swizzle = instr.src_swizzle(1 + i); + for (uint32_t j = 0; j < 4; ++j) { + float vector_src_component = FlushDenormal( + vector_src_ptr[ucode::AluInstruction::GetSwizzledComponentIndex( + vector_src_swizzle, j)]); + *reinterpret_cast(&vector_src_component) = + (*reinterpret_cast(&vector_src_component) & + vector_src_absolute_mask) ^ + vector_src_negate_bit; + vector_operands[i][j] = vector_src_component; + } + } + + bool replicate_vector_result_x = false; + switch (vector_opcode) { + case ucode::AluVectorOpcode::kAdd: { + for (uint32_t i = 0; i < 4; ++i) { + vector_result[i] = vector_operands[0][i] + vector_operands[1][i]; + } + } break; + case ucode::AluVectorOpcode::kMul: { + for (uint32_t i = 0; i < 4; ++i) { + // Direct3D 9 behavior (0 or denormal * anything = +0). + vector_result[i] = (vector_operands[0][i] && vector_operands[1][i]) + ? vector_operands[0][i] * vector_operands[1][i] + : 0.0f; + } + } break; + case ucode::AluVectorOpcode::kMax: { + for (uint32_t i = 0; i < 4; ++i) { + vector_result[i] = vector_operands[0][i] >= vector_operands[1][i] + ? vector_operands[0][i] + : vector_operands[1][i]; + } + } break; + case ucode::AluVectorOpcode::kMin: { + for (uint32_t i = 0; i < 4; ++i) { + vector_result[i] = vector_operands[0][i] < vector_operands[1][i] + ? 
vector_operands[0][i] + : vector_operands[1][i]; + } + } break; + case ucode::AluVectorOpcode::kSeq: { + for (uint32_t i = 0; i < 4; ++i) { + vector_result[i] = + float(vector_operands[0][i] == vector_operands[1][i]); + } + } break; + case ucode::AluVectorOpcode::kSgt: { + for (uint32_t i = 0; i < 4; ++i) { + vector_result[i] = + float(vector_operands[0][i] > vector_operands[1][i]); + } + } break; + case ucode::AluVectorOpcode::kSge: { + for (uint32_t i = 0; i < 4; ++i) { + vector_result[i] = + float(vector_operands[0][i] >= vector_operands[1][i]); + } + } break; + case ucode::AluVectorOpcode::kSne: { + for (uint32_t i = 0; i < 4; ++i) { + vector_result[i] = + float(vector_operands[0][i] != vector_operands[1][i]); + } + } break; + case ucode::AluVectorOpcode::kFrc: { + for (uint32_t i = 0; i < 4; ++i) { + vector_result[i] = + vector_operands[0][i] - std::floor(vector_operands[0][i]); + } + } break; + case ucode::AluVectorOpcode::kTrunc: { + for (uint32_t i = 0; i < 4; ++i) { + vector_result[i] = std::trunc(vector_operands[0][i]); + } + } break; + case ucode::AluVectorOpcode::kFloor: { + for (uint32_t i = 0; i < 4; ++i) { + vector_result[i] = std::floor(vector_operands[0][i]); + } + } break; + case ucode::AluVectorOpcode::kMad: { + for (uint32_t i = 0; i < 4; ++i) { + // Direct3D 9 behavior (0 or denormal * anything = +0). + // Doing the addition rather than conditional assignment even for zero + // operands because +0 + -0 must be +0. + vector_result[i] = + ((vector_operands[0][i] && vector_operands[1][i]) + ? vector_operands[0][i] * vector_operands[1][i] + : 0.0f) + + vector_operands[2][i]; + } + } break; + case ucode::AluVectorOpcode::kCndEq: { + for (uint32_t i = 0; i < 4; ++i) { + vector_result[i] = vector_operands[0][i] == 0.0f + ? vector_operands[1][i] + : vector_operands[2][i]; + } + } break; + case ucode::AluVectorOpcode::kCndGe: { + for (uint32_t i = 0; i < 4; ++i) { + vector_result[i] = vector_operands[0][i] >= 0.0f + ? vector_operands[1][i] + : vector_operands[2][i]; + } + } break; + case ucode::AluVectorOpcode::kCndGt: { + for (uint32_t i = 0; i < 4; ++i) { + vector_result[i] = vector_operands[0][i] > 0.0f + ? vector_operands[1][i] + : vector_operands[2][i]; + } + } break; + case ucode::AluVectorOpcode::kDp4: { + vector_result[0] = 0.0f; + for (uint32_t i = 0; i < 4; ++i) { + // Direct3D 9 behavior (0 or denormal * anything = +0). + // Doing the addition even for zero operands because +0 + -0 must be + // +0. + vector_result[0] += + (vector_operands[0][i] && vector_operands[1][i]) + ? vector_operands[0][i] * vector_operands[1][i] + : 0.0f; + } + replicate_vector_result_x = true; + } break; + case ucode::AluVectorOpcode::kDp3: { + vector_result[0] = 0.0f; + for (uint32_t i = 0; i < 3; ++i) { + // Direct3D 9 behavior (0 or denormal * anything = +0). + // Doing the addition even for zero operands because +0 + -0 must be + // +0. + vector_result[0] += + (vector_operands[0][i] && vector_operands[1][i]) + ? vector_operands[0][i] * vector_operands[1][i] + : 0.0f; + } + replicate_vector_result_x = true; + } break; + case ucode::AluVectorOpcode::kDp2Add: { + // Doing the addition even for zero operands because +0 + -0 must be +0. + vector_result[0] = 0.0f; + for (uint32_t i = 0; i < 2; ++i) { + // Direct3D 9 behavior (0 or denormal * anything = +0). + vector_result[0] += + (vector_operands[0][i] && vector_operands[1][i]) + ? 
vector_operands[0][i] * vector_operands[1][i] + : 0.0f; + } + vector_result[0] += vector_operands[2][0]; + replicate_vector_result_x = true; + } break; + case ucode::AluVectorOpcode::kCube: { + // Operand [0] is .z_xy. + float x = vector_operands[0][2]; + float y = vector_operands[0][3]; + float z = vector_operands[0][0]; + float x_abs = std::abs(x), y_abs = std::abs(y), z_abs = std::abs(z); + // Result is T coordinate, S coordinate, 2 * major axis, face ID. + if (z_abs >= x_abs && z_abs >= y_abs) { + vector_result[0] = -y; + vector_result[1] = z < 0.0f ? -x : x; + vector_result[2] = z; + vector_result[3] = z < 0.0f ? 5.0f : 4.0f; + } else if (y_abs >= x_abs) { + vector_result[0] = y < 0.0f ? -z : z; + vector_result[1] = x; + vector_result[2] = y; + vector_result[3] = y < 0.0f ? 3.0f : 2.0f; + } else { + vector_result[0] = -y; + vector_result[1] = x < 0.0f ? z : -z; + vector_result[2] = x; + vector_result[3] = x < 0.0f ? 1.0f : 0.0f; + } + vector_result[2] *= 2.0f; + } break; + case ucode::AluVectorOpcode::kMax4: { + if (vector_operands[0][0] >= vector_operands[0][1] && + vector_operands[0][0] >= vector_operands[0][2] && + vector_operands[0][0] >= vector_operands[0][3]) { + vector_result[0] = vector_operands[0][0]; + } else if (vector_operands[0][1] >= vector_operands[0][2] && + vector_operands[0][1] >= vector_operands[0][3]) { + vector_result[0] = vector_operands[0][1]; + } else if (vector_operands[0][2] >= vector_operands[0][3]) { + vector_result[0] = vector_operands[0][2]; + } else { + vector_result[0] = vector_operands[0][3]; + } + replicate_vector_result_x = true; + } break; + case ucode::AluVectorOpcode::kSetpEqPush: { + state_.predicate = + vector_operands[0][3] == 0.0f && vector_operands[1][3] == 0.0f; + vector_result[0] = + (vector_operands[0][0] == 0.0f && vector_operands[1][0] == 0.0f) + ? 0.0f + : vector_operands[0][0] + 1.0f; + replicate_vector_result_x = true; + } break; + case ucode::AluVectorOpcode::kSetpNePush: { + state_.predicate = + vector_operands[0][3] == 0.0f && vector_operands[1][3] != 0.0f; + vector_result[0] = + (vector_operands[0][0] == 0.0f && vector_operands[1][0] != 0.0f) + ? 0.0f + : vector_operands[0][0] + 1.0f; + replicate_vector_result_x = true; + } break; + case ucode::AluVectorOpcode::kSetpGtPush: { + state_.predicate = + vector_operands[0][3] == 0.0f && vector_operands[1][3] > 0.0f; + vector_result[0] = + (vector_operands[0][0] == 0.0f && vector_operands[1][0] > 0.0f) + ? 0.0f + : vector_operands[0][0] + 1.0f; + replicate_vector_result_x = true; + } break; + case ucode::AluVectorOpcode::kSetpGePush: { + state_.predicate = + vector_operands[0][3] == 0.0f && vector_operands[1][3] >= 0.0f; + vector_result[0] = + (vector_operands[0][0] == 0.0f && vector_operands[1][0] >= 0.0f) + ? 0.0f + : vector_operands[0][0] + 1.0f; + replicate_vector_result_x = true; + } break; + // Not implementing pixel kill currently, the interpreter is currently + // used only for vertex shaders. 
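The "Direct3D 9 behavior" multiply referenced by the comments above reduces to one rule, shown here as a standalone sketch (hypothetical helper; operands are assumed to have had denormals flushed already):

// Sketch: zero (or a flushed denormal) times anything - including infinity
// and NaN - yields +0. In the mad/dp handlers above the addition is still
// performed unconditionally afterwards so that +0 + -0 produces +0.
inline float MulDirect3D9(float a, float b) {
  return (a && b) ? a * b : 0.0f;
}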
+ case ucode::AluVectorOpcode::kKillEq: { + vector_result[0] = + float(vector_operands[0][0] == vector_operands[1][0] || + vector_operands[0][1] == vector_operands[1][1] || + vector_operands[0][2] == vector_operands[1][2] || + vector_operands[0][3] == vector_operands[1][3]); + replicate_vector_result_x = true; + } break; + case ucode::AluVectorOpcode::kKillGt: { + vector_result[0] = + float(vector_operands[0][0] > vector_operands[1][0] || + vector_operands[0][1] > vector_operands[1][1] || + vector_operands[0][2] > vector_operands[1][2] || + vector_operands[0][3] > vector_operands[1][3]); + replicate_vector_result_x = true; + } break; + case ucode::AluVectorOpcode::kKillGe: { + vector_result[0] = + float(vector_operands[0][0] >= vector_operands[1][0] || + vector_operands[0][1] >= vector_operands[1][1] || + vector_operands[0][2] >= vector_operands[1][2] || + vector_operands[0][3] >= vector_operands[1][3]); + replicate_vector_result_x = true; + } break; + case ucode::AluVectorOpcode::kKillNe: { + vector_result[0] = + float(vector_operands[0][0] != vector_operands[1][0] || + vector_operands[0][1] != vector_operands[1][1] || + vector_operands[0][2] != vector_operands[1][2] || + vector_operands[0][3] != vector_operands[1][3]); + replicate_vector_result_x = true; + } break; + case ucode::AluVectorOpcode::kDst: { + vector_result[0] = 1.0f; + // Direct3D 9 behavior (0 or denormal * anything = +0). + vector_result[1] = (vector_operands[0][1] && vector_operands[1][1]) + ? vector_operands[0][1] * vector_operands[1][1] + : 0.0f; + vector_result[2] = vector_operands[0][2]; + vector_result[3] = vector_operands[1][3]; + } break; + case ucode::AluVectorOpcode::kMaxA: { + // std::max is `a < b ? b : a`, thus in case of NaN, the first argument + // (-256.0f) is always the result. + state_.address_register = int32_t(std::floor( + std::min(255.0f, std::max(-256.0f, vector_operands[0][3])) + 0.5f)); + for (uint32_t i = 0; i < 4; ++i) { + vector_result[i] = vector_operands[0][i] >= vector_operands[1][i] + ? vector_operands[0][i] + : vector_operands[1][i]; + } + } break; + default: { + assert_unhandled_case(vector_opcode); + } + } + if (replicate_vector_result_x) { + for (uint32_t i = 1; i < 4; ++i) { + vector_result[i] = vector_result[0]; + } + } + } + + // Scalar operation. + ucode::AluScalarOpcode scalar_opcode = instr.scalar_opcode(); + const ucode::AluScalarOpcodeInfo& scalar_opcode_info = + ucode::GetAluScalarOpcodeInfo(scalar_opcode); + float scalar_operands[2]; + uint32_t scalar_operand_component_count = 0; + bool scalar_src_absolute = false; + switch (scalar_opcode_info.operand_count) { + case 1: { + // r#/c#.w or r#/c#.wx. + const float* scalar_src_ptr; + uint32_t scalar_src_register = instr.src_reg(3); + if (instr.src_is_temp(3)) { + scalar_src_ptr = GetTempRegister( + ucode::AluInstruction::src_temp_reg(scalar_src_register), + ucode::AluInstruction::is_src_temp_relative(scalar_src_register)); + scalar_src_absolute = ucode::AluInstruction::is_src_temp_value_absolute( + scalar_src_register); + } else { + scalar_src_ptr = GetFloatConstant( + scalar_src_register, instr.src_const_is_addressed(3), + instr.is_const_address_register_relative()); + } + uint32_t scalar_src_swizzle = instr.src_swizzle(3); + scalar_operand_component_count = + scalar_opcode_info.single_operand_is_two_component ? 
2 : 1; + for (uint32_t i = 0; i < scalar_operand_component_count; ++i) { + scalar_operands[i] = + scalar_src_ptr[ucode::AluInstruction::GetSwizzledComponentIndex( + scalar_src_swizzle, (3 + i) & 3)]; + } + } break; + case 2: { + scalar_operand_component_count = 2; + uint32_t scalar_src_absolute_mask = + ~(uint32_t(instr.abs_constants()) << 31); + uint32_t scalar_src_negate_bit = uint32_t(instr.src_negate(3)) << 31; + uint32_t scalar_src_swizzle = instr.src_swizzle(3); + // c#.w. + scalar_operands[0] = + GetFloatConstant(instr.src_reg(3), instr.src_const_is_addressed(3), + instr.is_const_address_register_relative()) + [ucode::AluInstruction::GetSwizzledComponentIndex( + scalar_src_swizzle, 3)]; + // r#.x. + scalar_operands[1] = GetTempRegister( + instr.scalar_const_reg_op_src_temp_reg(), + false)[ucode::AluInstruction::GetSwizzledComponentIndex( + scalar_src_swizzle, 0)]; + } break; + } + if (scalar_operand_component_count) { + uint32_t scalar_src_absolute_mask = ~(uint32_t(scalar_src_absolute) << 31); + uint32_t scalar_src_negate_bit = uint32_t(instr.src_negate(3)) << 31; + for (uint32_t i = 0; i < scalar_operand_component_count; ++i) { + float scalar_operand = FlushDenormal(scalar_operands[i]); + *reinterpret_cast(&scalar_operand) = + (*reinterpret_cast(&scalar_operand) & + scalar_src_absolute_mask) ^ + scalar_src_negate_bit; + scalar_operands[i] = scalar_operand; + } + } + switch (scalar_opcode) { + case ucode::AluScalarOpcode::kAdds: + case ucode::AluScalarOpcode::kAddsc0: + case ucode::AluScalarOpcode::kAddsc1: { + state_.previous_scalar = scalar_operands[0] + scalar_operands[1]; + } break; + case ucode::AluScalarOpcode::kAddsPrev: { + state_.previous_scalar = scalar_operands[0] + state_.previous_scalar; + } break; + case ucode::AluScalarOpcode::kMuls: + case ucode::AluScalarOpcode::kMulsc0: + case ucode::AluScalarOpcode::kMulsc1: { + // Direct3D 9 behavior (0 or denormal * anything = +0). + state_.previous_scalar = (scalar_operands[0] && scalar_operands[1]) + ? scalar_operands[0] * scalar_operands[1] + : 0.0f; + } break; + case ucode::AluScalarOpcode::kMulsPrev: { + // Direct3D 9 behavior (0 or denormal * anything = +0). + state_.previous_scalar = (scalar_operands[0] && state_.previous_scalar) + ? scalar_operands[0] * state_.previous_scalar + : 0.0f; + } break; + case ucode::AluScalarOpcode::kMulsPrev2: { + if (state_.previous_scalar == -FLT_MAX || + !std::isfinite(state_.previous_scalar) || + !std::isfinite(scalar_operands[1]) || scalar_operands[1] <= 0.0f) { + state_.previous_scalar = -FLT_MAX; + } else { + // Direct3D 9 behavior (0 or denormal * anything = +0). + state_.previous_scalar = + (scalar_operands[0] && state_.previous_scalar) + ? scalar_operands[0] * state_.previous_scalar + : 0.0f; + } + } break; + case ucode::AluScalarOpcode::kMaxs: { + state_.previous_scalar = scalar_operands[0] >= scalar_operands[1] + ? scalar_operands[0] + : scalar_operands[1]; + } break; + case ucode::AluScalarOpcode::kMins: { + state_.previous_scalar = scalar_operands[0] >= scalar_operands[1] + ? 
scalar_operands[0] + : scalar_operands[1]; + } break; + case ucode::AluScalarOpcode::kSeqs: { + state_.previous_scalar = float(scalar_operands[0] == 0.0f); + } break; + case ucode::AluScalarOpcode::kSgts: { + state_.previous_scalar = float(scalar_operands[0] > 0.0f); + } break; + case ucode::AluScalarOpcode::kSges: { + state_.previous_scalar = float(scalar_operands[0] >= 0.0f); + } break; + case ucode::AluScalarOpcode::kSnes: { + state_.previous_scalar = float(scalar_operands[0] != 0.0f); + } break; + case ucode::AluScalarOpcode::kFrcs: { + state_.previous_scalar = + scalar_operands[0] - std::floor(scalar_operands[0]); + } break; + case ucode::AluScalarOpcode::kTruncs: { + state_.previous_scalar = std::trunc(scalar_operands[0]); + } break; + case ucode::AluScalarOpcode::kFloors: { + state_.previous_scalar = std::floor(scalar_operands[0]); + } break; + case ucode::AluScalarOpcode::kExp: { + state_.previous_scalar = std::exp2(scalar_operands[0]); + } break; + case ucode::AluScalarOpcode::kLogc: { + state_.previous_scalar = std::log2(scalar_operands[0]); + if (state_.previous_scalar == -INFINITY) { + state_.previous_scalar = -FLT_MAX; + } + } break; + case ucode::AluScalarOpcode::kLog: { + state_.previous_scalar = std::log2(scalar_operands[0]); + } break; + case ucode::AluScalarOpcode::kRcpc: { + state_.previous_scalar = 1.0f / scalar_operands[0]; + if (state_.previous_scalar == -INFINITY) { + state_.previous_scalar = -FLT_MAX; + } else if (state_.previous_scalar == INFINITY) { + state_.previous_scalar = FLT_MAX; + } + } break; + case ucode::AluScalarOpcode::kRcpf: { + state_.previous_scalar = 1.0f / scalar_operands[0]; + if (state_.previous_scalar == -INFINITY) { + state_.previous_scalar = -0.0f; + } else if (state_.previous_scalar == INFINITY) { + state_.previous_scalar = 0.0f; + } + } break; + case ucode::AluScalarOpcode::kRcp: { + state_.previous_scalar = 1.0f / scalar_operands[0]; + } break; + case ucode::AluScalarOpcode::kRsqc: { + state_.previous_scalar = 1.0f / std::sqrt(scalar_operands[0]); + if (state_.previous_scalar == -INFINITY) { + state_.previous_scalar = -FLT_MAX; + } else if (state_.previous_scalar == INFINITY) { + state_.previous_scalar = FLT_MAX; + } + } break; + case ucode::AluScalarOpcode::kRsqf: { + state_.previous_scalar = 1.0f / std::sqrt(scalar_operands[0]); + if (state_.previous_scalar == -INFINITY) { + state_.previous_scalar = -0.0f; + } else if (state_.previous_scalar == INFINITY) { + state_.previous_scalar = 0.0f; + } + } break; + case ucode::AluScalarOpcode::kRsq: { + state_.previous_scalar = 1.0f / std::sqrt(scalar_operands[0]); + } break; + case ucode::AluScalarOpcode::kMaxAs: { + // std::max is `a < b ? b : a`, thus in case of NaN, the first argument + // (-256.0f) is always the result. + state_.address_register = int32_t(std::floor( + std::min(255.0f, std::max(-256.0f, scalar_operands[0])) + 0.5f)); + state_.previous_scalar = scalar_operands[0] >= scalar_operands[1] + ? scalar_operands[0] + : scalar_operands[1]; + } break; + case ucode::AluScalarOpcode::kMaxAsf: { + // std::max is `a < b ? b : a`, thus in case of NaN, the first argument + // (-256.0f) is always the result. + state_.address_register = int32_t( + std::floor(std::min(255.0f, std::max(-256.0f, scalar_operands[0])))); + state_.previous_scalar = scalar_operands[0] >= scalar_operands[1] + ? 
scalar_operands[0] + : scalar_operands[1]; + } break; + case ucode::AluScalarOpcode::kSubs: + case ucode::AluScalarOpcode::kSubsc0: + case ucode::AluScalarOpcode::kSubsc1: { + state_.previous_scalar = scalar_operands[0] - scalar_operands[1]; + } break; + case ucode::AluScalarOpcode::kSubsPrev: { + state_.previous_scalar = scalar_operands[0] - state_.previous_scalar; + } break; + case ucode::AluScalarOpcode::kSetpEq: { + state_.predicate = scalar_operands[0] == 0.0f; + state_.previous_scalar = float(!state_.predicate); + } break; + case ucode::AluScalarOpcode::kSetpNe: { + state_.predicate = scalar_operands[0] != 0.0f; + state_.previous_scalar = float(!state_.predicate); + } break; + case ucode::AluScalarOpcode::kSetpGt: { + state_.predicate = scalar_operands[0] > 0.0f; + state_.previous_scalar = float(!state_.predicate); + } break; + case ucode::AluScalarOpcode::kSetpGe: { + state_.predicate = scalar_operands[0] >= 0.0f; + state_.previous_scalar = float(!state_.predicate); + } break; + case ucode::AluScalarOpcode::kSetpInv: { + state_.predicate = scalar_operands[0] == 1.0f; + state_.previous_scalar = + state_.predicate + ? 0.0f + : (scalar_operands[0] == 0.0f ? 1.0f : scalar_operands[0]); + } break; + case ucode::AluScalarOpcode::kSetpPop: { + float new_counter = scalar_operands[0] - 1.0f; + state_.predicate = new_counter <= 0.0f; + state_.previous_scalar = state_.predicate ? 0.0f : new_counter; + } break; + case ucode::AluScalarOpcode::kSetpClr: { + state_.predicate = false; + state_.previous_scalar = FLT_MAX; + } break; + case ucode::AluScalarOpcode::kSetpRstr: { + state_.predicate = scalar_operands[0] == 0.0f; + state_.previous_scalar = state_.predicate ? 0.0f : scalar_operands[0]; + } break; + // Not implementing pixel kill currently, the interpreter is currently used + // only for vertex shaders. + case ucode::AluScalarOpcode::kKillsEq: { + state_.previous_scalar = float(scalar_operands[0] == 0.0f); + } break; + case ucode::AluScalarOpcode::kKillsGt: { + state_.previous_scalar = float(scalar_operands[0] > 0.0f); + } break; + case ucode::AluScalarOpcode::kKillsGe: { + state_.previous_scalar = float(scalar_operands[0] >= 0.0f); + } break; + case ucode::AluScalarOpcode::kKillsNe: { + state_.previous_scalar = float(scalar_operands[0] != 0.0f); + } break; + case ucode::AluScalarOpcode::kKillsOne: { + state_.previous_scalar = float(scalar_operands[0] == 1.0f); + } break; + case ucode::AluScalarOpcode::kSqrt: { + state_.previous_scalar = std::sqrt(scalar_operands[0]); + } break; + case ucode::AluScalarOpcode::kSin: { + state_.previous_scalar = std::sin(scalar_operands[0]); + } break; + case ucode::AluScalarOpcode::kCos: { + state_.previous_scalar = std::cos(scalar_operands[0]); + } break; + case ucode::AluScalarOpcode::kRetainPrev: { + } break; + default: { + assert_unhandled_case(scalar_opcode); + } + } + + if (instr.vector_clamp()) { + for (uint32_t i = 0; i < 4; ++i) { + vector_result[i] = xe::saturate_unsigned(vector_result[i]); + } + } + float scalar_result = instr.scalar_clamp() + ? 
xe::saturate_unsigned(state_.previous_scalar)
+                            : state_.previous_scalar;
+
+  uint32_t scalar_result_write_mask = instr.GetScalarOpResultWriteMask();
+  if (instr.is_export()) {
+    if (export_sink_) {
+      float export_value[4];
+      uint32_t export_constant_1_mask = instr.GetConstant1WriteMask();
+      uint32_t export_mask =
+          vector_result_write_mask | scalar_result_write_mask |
+          instr.GetConstant0WriteMask() | export_constant_1_mask;
+      for (uint32_t i = 0; i < 4; ++i) {
+        uint32_t export_component_bit = UINT32_C(1) << i;
+        float export_component = 0.0f;
+        if (vector_result_write_mask & export_component_bit) {
+          export_component = vector_result[i];
+        } else if (scalar_result_write_mask & export_component_bit) {
+          export_component = scalar_result;
+        } else if (export_constant_1_mask & export_component_bit) {
+          export_component = 1.0f;
+        }
+        export_value[i] = export_component;
+      }
+      export_sink_->Export(ucode::ExportRegister(instr.vector_dest()),
+                           export_value, export_mask);
+    }
+  } else {
+    if (vector_result_write_mask) {
+      float* vector_dest =
+          GetTempRegister(instr.vector_dest(), instr.is_vector_dest_relative());
+      for (uint32_t i = 0; i < 4; ++i) {
+        if (vector_result_write_mask & (UINT32_C(1) << i)) {
+          vector_dest[i] = vector_result[i];
+        }
+      }
+    }
+    if (scalar_result_write_mask) {
+      float* scalar_dest =
+          GetTempRegister(instr.scalar_dest(), instr.is_scalar_dest_relative());
+      for (uint32_t i = 0; i < 4; ++i) {
+        if (scalar_result_write_mask & (UINT32_C(1) << i)) {
+          scalar_dest[i] = scalar_result;
+        }
+      }
+    }
+  }
+}
+
+void ShaderInterpreter::StoreFetchResult(uint32_t dest, bool is_dest_relative,
+                                         uint32_t swizzle, const float* value) {
+  float* dest_data = GetTempRegister(dest, is_dest_relative);
+  for (uint32_t i = 0; i < 4; ++i) {
+    ucode::FetchDestinationSwizzle component_swizzle =
+        ucode::GetFetchDestinationComponentSwizzle(swizzle, i);
+    switch (component_swizzle) {
+      case ucode::FetchDestinationSwizzle::kX:
+        dest_data[i] = value[0];
+        break;
+      case ucode::FetchDestinationSwizzle::kY:
+        dest_data[i] = value[1];
+        break;
+      case ucode::FetchDestinationSwizzle::kZ:
+        dest_data[i] = value[2];
+        break;
+      case ucode::FetchDestinationSwizzle::kW:
+        dest_data[i] = value[3];
+        break;
+      case ucode::FetchDestinationSwizzle::k1:
+        dest_data[i] = 1.0f;
+        break;
+      case ucode::FetchDestinationSwizzle::kKeep:
+        break;
+      default:
+        // ucode::FetchDestinationSwizzle::k0 or the invalid swizzle 6.
+        // TODO(Triang3l): Find the correct handling of the invalid swizzle 6.
+        assert_true(component_swizzle == ucode::FetchDestinationSwizzle::k0);
+        dest_data[i] = 0.0f;
+        break;
+    }
+  }
+}
+
+void ShaderInterpreter::ExecuteVertexFetchInstruction(
+    ucode::VertexFetchInstruction instr) {
+  // FIXME(Triang3l): Bit scan loops over components cause a link-time
+  // optimization internal error in Visual Studio 2019, mainly in the format
+  // unpacking. Using loops with up to 4 iterations here instead.
+
+  if (!instr.is_mini_fetch()) {
+    state_.vfetch_full_last = instr;
+  }
+
+  xenos::xe_gpu_vertex_fetch_t fetch_constant =
+      *reinterpret_cast<const xenos::xe_gpu_vertex_fetch_t*>(
+          &register_file_[XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0 +
+                          state_.vfetch_full_last.fetch_constant_index()]);
+
+  if (!instr.is_mini_fetch()) {
+    // Get the part of the address that depends on vfetch_full data.
+    uint32_t vertex_index = uint32_t(std::floor(
+        GetTempRegister(instr.src(),
+                        instr.is_src_relative())[instr.src_swizzle()] +
+        (instr.is_index_rounded() ? 0.5f : 0.0f)));
+    state_.vfetch_address_dwords =
+        instr.stride() * vertex_index + fetch_constant.address;
+  }
+
+  // TODO(Triang3l): Find the default values for unused components.
+  float result[4] = {};
+  uint32_t dest_swizzle = instr.dest_swizzle();
+  uint32_t used_result_components = 0b0000;
+  for (uint32_t i = 0; i < 4; ++i) {
+    uint32_t dest_component_swizzle = (dest_swizzle >> (3 * i)) & 0b111;
+    if (dest_component_swizzle <= 3) {
+      used_result_components |= UINT32_C(1) << dest_component_swizzle;
+    }
+  }
+  uint32_t needed_dwords = xenos::GetVertexFormatNeededWords(
+      instr.data_format(), used_result_components);
+  if (needed_dwords) {
+    uint32_t data[4] = {};
+    const uint32_t* memory_dwords =
+        reinterpret_cast<const uint32_t*>(memory_.physical_membase());
+    uint32_t buffer_end_dwords = fetch_constant.address + fetch_constant.size;
+    uint32_t dword_0_address_dwords =
+        uint32_t(int32_t(state_.vfetch_address_dwords) + instr.offset());
+    for (uint32_t i = 0; i < 4; ++i) {
+      if (!(needed_dwords & (UINT32_C(1) << i))) {
+        continue;
+      }
+      uint32_t dword_value = 0;
+      uint32_t dword_address_dwords = dword_0_address_dwords + i;
+      if (dword_address_dwords >= fetch_constant.address &&
+          dword_address_dwords < buffer_end_dwords) {
+        if (trace_writer_) {
+          trace_writer_->WriteMemoryRead(
+              sizeof(uint32_t) * dword_address_dwords, sizeof(uint32_t));
+        }
+        dword_value = xenos::GpuSwap(memory_dwords[dword_address_dwords],
+                                     fetch_constant.endian);
+      }
+      data[i] = dword_value;
+    }
+
+    uint32_t packed_components = 0b0000;
+    uint32_t packed_widths[4] = {}, packed_offsets[4] = {};
+    uint32_t packed_dwords[] = {data[0], data[0]};
+    switch (instr.data_format()) {
+      case xenos::VertexFormat::k_8_8_8_8: {
+        packed_components = 0b1111;
+        packed_widths[0] = packed_widths[1] = packed_widths[2] =
+            packed_widths[3] = 8;
+        packed_offsets[1] = 8;
+        packed_offsets[2] = 16;
+        packed_offsets[3] = 24;
+      } break;
+      case xenos::VertexFormat::k_2_10_10_10: {
+        packed_components = 0b1111;
+        packed_widths[0] = packed_widths[1] = packed_widths[2] = 10;
+        packed_widths[3] = 2;
+        packed_offsets[1] = 10;
+        packed_offsets[2] = 20;
+        packed_offsets[3] = 30;
+      } break;
+      case xenos::VertexFormat::k_10_11_11: {
+        packed_components = 0b0111;
+        packed_widths[0] = packed_widths[1] = 11;
+        packed_widths[2] = 10;
+        packed_offsets[1] = 11;
+        packed_offsets[2] = 22;
+      } break;
+      case xenos::VertexFormat::k_11_11_10: {
+        packed_components = 0b0111;
+        packed_widths[0] = 10;
+        packed_widths[1] = packed_widths[2] = 11;
+        packed_offsets[1] = 10;
+        packed_offsets[2] = 21;
+      } break;
+      case xenos::VertexFormat::k_16_16: {
+        packed_components = 0b0011;
+        packed_widths[0] = packed_widths[1] = 16;
+        packed_offsets[1] = 16;
+      } break;
+      case xenos::VertexFormat::k_16_16_16_16: {
+        packed_components = 0b1111;
+        packed_widths[0] = packed_widths[1] = packed_widths[2] =
+            packed_widths[3] = 16;
+        packed_offsets[1] = packed_offsets[3] = 16;
+        packed_dwords[1] = data[1];
+      } break;
+      case xenos::VertexFormat::k_16_16_16_16_FLOAT: {
+        if (used_result_components & 0b1000) {
+          result[3] = xe::xenos_half_to_float(uint16_t(data[1] >> 16));
+        }
+        if (used_result_components & 0b0100) {
+          result[2] = xe::xenos_half_to_float(uint16_t(data[1]));
+        }
+      }
+        [[fallthrough]];
+      case xenos::VertexFormat::k_16_16_FLOAT: {
+        if (used_result_components & 0b0010) {
+          result[1] = xe::xenos_half_to_float(uint16_t(data[0] >> 16));
+        }
+        if (used_result_components & 0b0001) {
+          result[0] = xe::xenos_half_to_float(uint16_t(data[0]));
+        }
+      } break;
+      case xenos::VertexFormat::k_32:
+      case xenos::VertexFormat::k_32_32:
+      case xenos::VertexFormat::k_32_32_32_32: {
+        if (instr.is_signed()) {
+          for (uint32_t i = 0; i < 4; ++i) {
+            result[i] = float(int32_t(data[i]));
+          }
+          if (instr.is_normalized()) {
+            if (instr.signed_rf_mode() ==
+                xenos::SignedRepeatingFractionMode::kNoZero) {
+              for (uint32_t i = 0; i < 4; ++i) {
+                result[i] = (result[i] + 0.5f) / 2147483647.5f;
+              }
+            } else {
+              for (uint32_t i = 0; i < 4; ++i) {
+                result[i] /= 2147483647.0f;
+                // No need to clamp to -1 if signed - the smallest value will
+                // be -2^31 / 2^31 due to rounding of the divisor to a float.
+              }
+            }
+          }
+        } else {
+          for (uint32_t i = 0; i < 4; ++i) {
+            result[i] = float(data[i]);
+          }
+          if (instr.is_normalized()) {
+            for (uint32_t i = 0; i < 4; ++i) {
+              result[i] /= 4294967295.0f;
+            }
+          }
+        }
+      } break;
+      case xenos::VertexFormat::k_32_FLOAT:
+      case xenos::VertexFormat::k_32_32_FLOAT:
+      case xenos::VertexFormat::k_32_32_32_32_FLOAT:
+      case xenos::VertexFormat::k_32_32_32_FLOAT: {
+        for (uint32_t i = 0; i < 4; ++i) {
+          result[i] = *reinterpret_cast<const float*>(&data[i]);
+        }
+      } break;
+      default:
+        assert_unhandled_case(instr.data_format());
+        break;
+    }
+
+    packed_components &= used_result_components;
+    if (packed_components) {
+      if (instr.is_signed()) {
+        for (uint32_t i = 0; i < 4; ++i) {
+          if (!(packed_components & (UINT32_C(1) << i))) {
+            continue;
+          }
+          uint32_t packed_width = packed_widths[i];
+          result[i] = float(
+              int32_t(packed_dwords[i >> 1]
+                      << (32 - (packed_width + packed_offsets[i]))) >>
+              (32 - packed_width));
+        }
+        if (instr.is_normalized()) {
+          if (instr.signed_rf_mode() ==
+              xenos::SignedRepeatingFractionMode::kNoZero) {
+            for (uint32_t i = 0; i < 4; ++i) {
+              if (!(packed_components & (UINT32_C(1) << i))) {
+                continue;
+              }
+              result[i] = (result[i] + 0.5f) * 2.0f /
+                          float((UINT32_C(1) << packed_widths[i]) - 1);
+            }
+          } else {
+            for (uint32_t i = 0; i < 4; ++i) {
+              if (!(packed_components & (UINT32_C(1) << i))) {
+                continue;
+              }
+              result[i] = std::max(
+                  -1.0f,
+                  result[i] /
+                      float((UINT32_C(1) << (packed_widths[i] - 1)) - 1));
+            }
+          }
+        }
+      } else {
+        for (uint32_t i = 0; i < 4; ++i) {
+          if (!(packed_components & (UINT32_C(1) << i))) {
+            continue;
+          }
+          result[i] = float((packed_dwords[i >> 1] >> packed_offsets[i]) &
+                            ((UINT32_C(1) << packed_widths[i]) - 1));
+        }
+        if (instr.is_normalized()) {
+          for (uint32_t i = 0; i < 4; ++i) {
+            if (!(packed_components & (UINT32_C(1) << i))) {
+              continue;
+            }
+            result[i] /= float((UINT32_C(1) << packed_widths[i]) - 1);
+          }
+        }
+      }
+    }
+  }
+
+  int32_t exp_adjust = instr.exp_adjust();
+  if (exp_adjust) {
+    float exp_adjust_factor = std::ldexp(1.0f, exp_adjust);
+    for (uint32_t i = 0; i < 4; ++i) {
+      result[i] *= exp_adjust_factor;
+    }
+  }
+
+  StoreFetchResult(instr.dest(), instr.is_dest_relative(), instr.dest_swizzle(),
+                   result);
+}
+
+}  // namespace gpu
+}  // namespace xe
diff --git a/src/xenia/gpu/shader_interpreter.h b/src/xenia/gpu/shader_interpreter.h
new file mode 100644
index 000000000..6182acecf
--- /dev/null
+++ b/src/xenia/gpu/shader_interpreter.h
@@ -0,0 +1,149 @@
+/**
+ ******************************************************************************
+ * Xenia : Xbox 360 Emulator Research Project *
+ ******************************************************************************
+ * Copyright 2022 Ben Vanik. All rights reserved. *
+ * Released under the BSD license - see LICENSE in the root for more details. *
+ ******************************************************************************
+ */
+
+#ifndef XENIA_GPU_SHADER_INTERPRETER_H_
+#define XENIA_GPU_SHADER_INTERPRETER_H_
+
+#include <algorithm>
+#include <cstdint>
+#include <cstring>
+
+#include "xenia/base/assert.h"
+#include "xenia/gpu/register_file.h"
+#include "xenia/gpu/shader.h"
+#include "xenia/gpu/trace_writer.h"
+#include "xenia/gpu/ucode.h"
+#include "xenia/gpu/xenos.h"
+#include "xenia/memory.h"
+
+namespace xe {
+namespace gpu {
+
+class ShaderInterpreter {
+ public:
+  ShaderInterpreter(const RegisterFile& register_file, const Memory& memory)
+      : register_file_(register_file), memory_(memory) {}
+
+  class ExportSink {
+   public:
+    virtual ~ExportSink() = default;
+    virtual void AllocExport(ucode::AllocType type, uint32_t size) {}
+    virtual void Export(ucode::ExportRegister export_register,
+                        const float* value, uint32_t value_mask) {}
+  };
+
+  void SetTraceWriter(TraceWriter* new_trace_writer) {
+    trace_writer_ = new_trace_writer;
+  }
+
+  ExportSink* GetExportSink() const { return export_sink_; }
+  void SetExportSink(ExportSink* new_export_sink) {
+    export_sink_ = new_export_sink;
+  }
+
+  const float* temp_registers() const { return &temp_registers_[0][0]; }
+  float* temp_registers() { return &temp_registers_[0][0]; }
+
+  static bool CanInterpretShader(const Shader& shader) {
+    assert_true(shader.is_ucode_analyzed());
+    // Texture instructions are not very common in vertex shaders (and are not
+    // used in Direct3D 9's internal rectangles such as clears), and they are
+    // extremely complex, so they are not implemented.
+    if (shader.uses_texture_fetch_instruction_results()) {
+      return false;
+    }
+    return true;
+  }
+  void SetShader(xenos::ShaderType shader_type, const uint32_t* ucode) {
+    shader_type_ = shader_type;
+    ucode_ = ucode;
+  }
+  void SetShader(const Shader& shader) {
+    assert_true(CanInterpretShader(shader));
+    SetShader(shader.type(), shader.ucode_dwords());
+  }
+
+  void Execute();
+
+ private:
+  struct State {
+    ucode::VertexFetchInstruction vfetch_full_last;
+    uint32_t vfetch_address_dwords;
+    float previous_scalar;
+    uint32_t call_stack_depth;
+    uint32_t call_return_addresses[4];
+    uint32_t loop_stack_depth;
+    xenos::LoopConstant loop_constants[4];
+    uint32_t loop_iterators[4];
+    int32_t address_register;
+    bool predicate;
+
+    void Reset() { std::memset(this, 0, sizeof(*this)); }
+
+    int32_t GetLoopAddress() const {
+      assert_true(loop_stack_depth && loop_stack_depth < 4);
+      if (!loop_stack_depth || loop_stack_depth >= 4) {
+        return 0;
+      }
+      xenos::LoopConstant loop_constant = loop_constants[loop_stack_depth];
+      // Clamp to the real range specified in the IPR2015-00325 sequencer
+      // specification.
+      // https://portal.unifiedpatents.com/ptab/case/IPR2015-00325
+      return std::min(
+          INT32_C(256),
+          std::max(INT32_C(-256),
+                   int32_t(int32_t(loop_iterators[loop_stack_depth]) *
+                               loop_constant.step +
+                           loop_constant.start)));
+    }
+  };
+
+  static float FlushDenormal(float value) {
+    uint32_t bits = *reinterpret_cast<const uint32_t*>(&value);
+    bits &= (bits & UINT32_C(0x7F800000)) ? ~UINT32_C(0) : (UINT32_C(1) << 31);
+    return *reinterpret_cast<const float*>(&bits);
+  }
+
+  const float* GetTempRegister(uint32_t address, bool is_relative) const {
+    return temp_registers_[(
+        int32_t(address) + (is_relative ? state_.GetLoopAddress() : 0) & 63)];
+  }
+  // For simplicity (due to writability), not bounds-checking.
+  float* GetTempRegister(uint32_t address, bool is_relative) {
+    return temp_registers_[(
+        int32_t(address) + (is_relative ?
state_.GetLoopAddress() : 0) & 63)]; + } + const float* GetFloatConstant(uint32_t address, bool is_relative, + bool relative_address_is_a0) const; + + void ExecuteAluInstruction(ucode::AluInstruction instr); + void StoreFetchResult(uint32_t dest, bool is_dest_relative, uint32_t swizzle, + const float* value); + void ExecuteVertexFetchInstruction(ucode::VertexFetchInstruction instr); + + const RegisterFile& register_file_; + const Memory& memory_; + + TraceWriter* trace_writer_ = nullptr; + + ExportSink* export_sink_ = nullptr; + + xenos::ShaderType shader_type_ = xenos::ShaderType::kVertex; + const uint32_t* ucode_ = nullptr; + + // For both inputs and locals. + float temp_registers_[64][4]; + + State state_; +}; + +} // namespace gpu +} // namespace xe + +#endif // XENIA_GPU_SHADER_INTERPRETER_H_ diff --git a/src/xenia/gpu/shader_translator.cc b/src/xenia/gpu/shader_translator.cc index 4e4bce854..4f4c1736c 100644 --- a/src/xenia/gpu/shader_translator.cc +++ b/src/xenia/gpu/shader_translator.cc @@ -334,6 +334,10 @@ void Shader::GatherTextureFetchInformation(const TextureFetchInstruction& op, GatherOperandInformation(binding.fetch_instr.operands[i]); } + if (binding.fetch_instr.result.GetUsedResultComponents()) { + uses_texture_fetch_instruction_results_ = true; + } + switch (op.opcode()) { case FetchOpcode::kSetTextureLod: case FetchOpcode::kSetTextureGradientsHorz: