[GPU] Get unclipped draw height by running VS on the CPU

This commit is contained in:
Triang3l 2022-04-28 22:25:25 +03:00
parent b2b1d7b518
commit 0fd578cafd
12 changed files with 1866 additions and 53 deletions

View File

@ -847,7 +847,8 @@ bool D3D12CommandProcessor::SetupContext() {
// Initialize the render target cache before configuring binding - need to
// know if using rasterizer-ordered views for the bindless root signature.
render_target_cache_ = std::make_unique<D3D12RenderTargetCache>(
*register_file_, *this, trace_writer_, bindless_resources_used_);
*register_file_, *memory_, trace_writer_, *this,
bindless_resources_used_);
if (!render_target_cache_->Initialize()) {
XELOGE("Failed to initialize the render target cache");
return false;
@ -2147,7 +2148,7 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type,
: 0;
if (!render_target_cache_->Update(is_rasterization_done,
normalized_depth_control,
normalized_color_mask)) {
normalized_color_mask, *vertex_shader)) {
return false;
}

View File

@ -1251,10 +1251,10 @@ void D3D12RenderTargetCache::BeginSubmission() {
bool D3D12RenderTargetCache::Update(
bool is_rasterization_done, reg::RB_DEPTHCONTROL normalized_depth_control,
uint32_t shader_writes_color_targets) {
uint32_t shader_writes_color_targets, const Shader& vertex_shader) {
if (!RenderTargetCache::Update(is_rasterization_done,
normalized_depth_control,
shader_writes_color_targets)) {
shader_writes_color_targets, vertex_shader)) {
return false;
}
switch (GetPath()) {

View File

@ -43,10 +43,10 @@ class D3D12CommandProcessor;
class D3D12RenderTargetCache final : public RenderTargetCache {
public:
D3D12RenderTargetCache(const RegisterFile& register_file,
const Memory& memory, TraceWriter& trace_writer,
D3D12CommandProcessor& command_processor,
TraceWriter& trace_writer,
bool bindless_resources_used)
: RenderTargetCache(register_file),
: RenderTargetCache(register_file, memory, &trace_writer),
command_processor_(command_processor),
trace_writer_(trace_writer),
bindless_resources_used_(bindless_resources_used) {}
@ -65,7 +65,8 @@ class D3D12RenderTargetCache final : public RenderTargetCache {
bool Update(bool is_rasterization_done,
reg::RB_DEPTHCONTROL normalized_depth_control,
uint32_t shader_writes_color_targets) override;
uint32_t shader_writes_color_targets,
const Shader& vertex_shader) override;
void InvalidateCommandListRenderTargets() {
are_current_command_list_render_targets_valid_ = false;

View File

@ -0,0 +1,350 @@
/**
******************************************************************************
* Xenia : Xbox 360 Emulator Research Project *
******************************************************************************
* Copyright 2022 Ben Vanik. All rights reserved. *
* Released under the BSD license - see LICENSE in the root for more details. *
******************************************************************************
*/
#include "xenia/gpu/draw_extent_estimator.h"
#include <algorithm>
#include <cfloat>
#include <cstdint>
#include "xenia/base/assert.h"
#include "xenia/base/cvar.h"
#include "xenia/base/profiling.h"
#include "xenia/gpu/registers.h"
#include "xenia/gpu/ucode.h"
#include "xenia/gpu/xenos.h"
#include "xenia/ui/graphics_util.h"
// Master switch for CPU-side vertex shader execution for unclipped draws -
// consumed by DrawExtentEstimator::EstimateMaxY to decide whether
// EstimateVertexMaxY may be invoked at all.
DEFINE_bool(
    execute_unclipped_draw_vs_on_cpu, true,
    "Execute the vertex shader for draws with clipping disabled, primarily "
    "screen-space draws (such as clears), on the CPU when possible to estimate "
    "the extent of the EDRAM involved in the draw.\n"
    "Enabling this may significantly improve GPU performance as otherwise up "
    "to the entire EDRAM may be considered used in draws without clipping, "
    "potentially resulting in spurious EDRAM range ownership transfer round "
    "trips between host render targets.\n"
    "Also, on hosts where certain render target formats have to be emulated in "
    "a lossy way (for instance, 16-bit fixed-point via 16-bit floating-point), "
    "this prevents corruption of other render targets located after the "
    "current ones in the EDRAM by lossy range ownership transfers done for "
    "those draws.",
    "GPU");
// Widens execute_unclipped_draw_vs_on_cpu to scissored draws as well -
// consumed by DrawExtentEstimator::EstimateMaxY.
// Fixed a typo in the user-visible help text: "esimating" -> "estimating".
DEFINE_bool(
    execute_unclipped_draw_vs_on_cpu_with_scissor, false,
    "Don't restrict the usage of execute_unclipped_draw_vs_on_cpu to only "
    "non-scissored draws (with the right and the bottom sides of the scissor "
    "rectangle at 8192 or beyond) even though if the scissor rectangle is "
    "present, it's usually sufficient for estimating the height of the render "
    "target.\n"
    "Enabling this may cause excessive processing of vertices on the CPU, as "
    "some games draw rectangles (for their UI, for instance) without clipping, "
    "but with a proper scissor rectangle.",
    "GPU");
namespace xe {
namespace gpu {
// Captures only the export lanes relevant to Y extent estimation: position
// Y (lane 1) and W (lane 3), the point size (lane 0 of the point size /
// edge flag / kill vertex export), and the kill vertex mask (lane 2,
// reinterpreted as its raw 32-bit pattern).
void DrawExtentEstimator::PositionYExportSink::Export(
    ucode::ExportRegister export_register, const float* value,
    uint32_t value_mask) {
  if (export_register == ucode::ExportRegister::kVSPosition) {
    if (value_mask & 0b0010) {
      position_y_ = value[1];
    }
    if (value_mask & 0b1000) {
      position_w_ = value[3];
    }
    return;
  }
  if (export_register != ucode::ExportRegister::kVSPointSizeEdgeFlagKillVertex) {
    // Exports other than position and point size are irrelevant here.
    return;
  }
  if (value_mask & 0b0001) {
    point_size_ = value[0];
  }
  if (value_mask & 0b0100) {
    vertex_kill_ = *reinterpret_cast<const uint32_t*>(&value[2]);
  }
}
// Interprets the draw's vertex shader on the CPU to find the maximum
// post-viewport Y coordinate, in pixels (top-left rasterization rule
// applied), that the draw may cover. Returns
// xenos::kTexture2DCubeMaxWidthHeight (no useful bound) when the draw can't
// be reproduced on the CPU: immediate indices, tessellation, or a shader the
// interpreter can't execute.
// Change from the original: removed the unused local `index_format`
// (assigned from vgt_draw_initiator.index_size but never read - the checks
// below use vgt_draw_initiator.index_size directly).
uint32_t DrawExtentEstimator::EstimateVertexMaxY(const Shader& vertex_shader) {
  SCOPE_profile_cpu_f("gpu");
  const RegisterFile& regs = register_file_;
  auto vgt_draw_initiator = regs.Get<reg::VGT_DRAW_INITIATOR>();
  if (!vgt_draw_initiator.num_indices) {
    // Empty draw - nothing to rasterize.
    return 0;
  }
  if (vgt_draw_initiator.source_select != xenos::SourceSelect::kDMA &&
      vgt_draw_initiator.source_select != xenos::SourceSelect::kAutoIndex) {
    // TODO(Triang3l): Support immediate indices.
    return xenos::kTexture2DCubeMaxWidthHeight;
  }
  // Not reproducing tessellation.
  if (xenos::IsMajorModeExplicit(vgt_draw_initiator.major_mode,
                                 vgt_draw_initiator.prim_type) &&
      regs.Get<reg::VGT_OUTPUT_PATH_CNTL>().path_select ==
          xenos::VGTOutputPath::kTessellationEnable) {
    return xenos::kTexture2DCubeMaxWidthHeight;
  }
  assert_true(vertex_shader.type() == xenos::ShaderType::kVertex);
  assert_true(vertex_shader.is_ucode_analyzed());
  if (!ShaderInterpreter::CanInterpretShader(vertex_shader)) {
    return xenos::kTexture2DCubeMaxWidthHeight;
  }
  auto vgt_dma_size = regs.Get<reg::VGT_DMA_SIZE>();
  // Index buffer views - initialized (and read) only for kDMA draws.
  union {
    const void* index_buffer;
    const uint16_t* index_buffer_16;
    const uint32_t* index_buffer_32;
  };
  xenos::Endian index_endian = vgt_dma_size.swap_mode;
  if (vgt_draw_initiator.source_select == xenos::SourceSelect::kDMA) {
    uint32_t index_buffer_base = regs[XE_GPU_REG_VGT_DMA_BASE].u32;
    uint32_t index_buffer_read_count =
        std::min(vgt_draw_initiator.num_indices, vgt_dma_size.num_words);
    if (vgt_draw_initiator.index_size == xenos::IndexFormat::kInt16) {
      // Handle the index endianness the same way as the PrimitiveProcessor.
      if (index_endian == xenos::Endian::k8in32) {
        index_endian = xenos::Endian::k8in16;
      } else if (index_endian == xenos::Endian::k16in32) {
        index_endian = xenos::Endian::kNone;
      }
      // Align the base to the index size.
      index_buffer_base &= ~uint32_t(sizeof(uint16_t) - 1);
      if (trace_writer_) {
        trace_writer_->WriteMemoryRead(
            index_buffer_base, sizeof(uint16_t) * index_buffer_read_count);
      }
    } else {
      assert_true(vgt_draw_initiator.index_size == xenos::IndexFormat::kInt32);
      index_buffer_base &= ~uint32_t(sizeof(uint32_t) - 1);
      if (trace_writer_) {
        trace_writer_->WriteMemoryRead(
            index_buffer_base, sizeof(uint32_t) * index_buffer_read_count);
      }
    }
    index_buffer = memory_.TranslatePhysical(index_buffer_base);
  }
  auto pa_su_sc_mode_cntl = regs.Get<reg::PA_SU_SC_MODE_CNTL>();
  uint32_t reset_index =
      regs.Get<reg::VGT_MULTI_PRIM_IB_RESET_INDX>().reset_indx;
  uint32_t index_offset = regs.Get<reg::VGT_INDX_OFFSET>().indx_offset;
  uint32_t min_index = regs.Get<reg::VGT_MIN_VTX_INDX>().min_indx;
  uint32_t max_index = regs.Get<reg::VGT_MAX_VTX_INDX>().max_indx;
  auto pa_cl_vte_cntl = regs.Get<reg::PA_CL_VTE_CNTL>();
  float viewport_y_scale = pa_cl_vte_cntl.vport_y_scale_ena
                               ? regs[XE_GPU_REG_PA_CL_VPORT_YSCALE].f32
                               : 1.0f;
  float viewport_y_offset = pa_cl_vte_cntl.vport_y_offset_ena
                                ? regs[XE_GPU_REG_PA_CL_VPORT_YOFFSET].f32
                                : 0.0f;
  // The point diameter clamp is done on the raw signed integer bit patterns of
  // the floats (see the loop below), so the bounds are stored as int32_t.
  int32_t point_vertex_min_diameter_float = 0;
  int32_t point_vertex_max_diameter_float = 0;
  float point_constant_radius_y = 0.0f;
  if (vgt_draw_initiator.prim_type == xenos::PrimitiveType::kPointList) {
    auto pa_su_point_minmax = regs.Get<reg::PA_SU_POINT_MINMAX>();
    // Registers store sizes in 12.4 fixed point; convert to float diameters /
    // radius.
    *reinterpret_cast<float*>(&point_vertex_min_diameter_float) =
        float(pa_su_point_minmax.min_size) * (2.0f / 16.0f);
    *reinterpret_cast<float*>(&point_vertex_max_diameter_float) =
        float(pa_su_point_minmax.max_size) * (2.0f / 16.0f);
    point_constant_radius_y =
        float(regs.Get<reg::PA_SU_POINT_SIZE>().height) * (1.0f / 16.0f);
  }
  float max_y = -FLT_MAX;
  shader_interpreter_.SetShader(vertex_shader);
  PositionYExportSink position_y_export_sink;
  shader_interpreter_.SetExportSink(&position_y_export_sink);
  for (uint32_t i = 0; i < vgt_draw_initiator.num_indices; ++i) {
    uint32_t vertex_index;
    if (vgt_draw_initiator.source_select == xenos::SourceSelect::kDMA) {
      if (i < vgt_dma_size.num_words) {
        if (vgt_draw_initiator.index_size == xenos::IndexFormat::kInt16) {
          vertex_index = index_buffer_16[i];
        } else {
          vertex_index = index_buffer_32[i];
        }
        // The Xenos only uses 24 bits of the index (reset_indx is 24-bit).
        vertex_index = xenos::GpuSwap(vertex_index, index_endian) & 0xFFFFFF;
      } else {
        // Reading past the end of the DMA buffer - treat as index 0.
        vertex_index = 0;
      }
      if (pa_su_sc_mode_cntl.multi_prim_ib_ena && vertex_index == reset_index) {
        // Primitive restart index - not an actual vertex.
        continue;
      }
    } else {
      assert_true(vgt_draw_initiator.source_select ==
                  xenos::SourceSelect::kAutoIndex);
      vertex_index = i;
    }
    vertex_index =
        std::min(max_index,
                 std::max(min_index, (vertex_index + index_offset) & 0xFFFFFF));
    position_y_export_sink.Reset();
    // The vertex index is passed to the shader in r0.x as a float.
    shader_interpreter_.temp_registers()[0] = float(vertex_index);
    shader_interpreter_.Execute();
    if (position_y_export_sink.vertex_kill().has_value() &&
        (position_y_export_sink.vertex_kill().value() & ~(UINT32_C(1) << 31))) {
      // Vertex killed (any kill bit other than the sign bit set).
      continue;
    }
    if (!position_y_export_sink.position_y().has_value()) {
      continue;
    }
    float vertex_y = position_y_export_sink.position_y().value();
    if (!pa_cl_vte_cntl.vtx_xy_fmt) {
      // XY not already divided by W in the shader output - perspective-divide.
      if (!position_y_export_sink.position_w().has_value()) {
        continue;
      }
      vertex_y /= position_y_export_sink.position_w().value();
    }
    vertex_y = vertex_y * viewport_y_scale + viewport_y_offset;
    if (vgt_draw_initiator.prim_type == xenos::PrimitiveType::kPointList) {
      // Points extend downwards from the center by their Y radius.
      float point_radius_y;
      if (position_y_export_sink.point_size().has_value()) {
        // Vertex-specified diameter. Clamped effectively as a signed integer in
        // the hardware, -NaN, -Infinity ... -0 to the minimum, +Infinity, +NaN
        // to the maximum.
        point_radius_y = position_y_export_sink.point_size().value();
        *reinterpret_cast<int32_t*>(&point_radius_y) = std::min(
            point_vertex_max_diameter_float,
            std::max(point_vertex_min_diameter_float,
                     *reinterpret_cast<const int32_t*>(&point_radius_y)));
        point_radius_y *= 0.5f;
      } else {
        // Constant radius.
        point_radius_y = point_constant_radius_y;
      }
      vertex_y += point_radius_y;
    }
    // std::max is `a < b ? b : a`, thus in case of NaN, the first argument is
    // always returned - max_y, which is initialized to a normalized value.
    max_y = std::max(max_y, vertex_y);
  }
  shader_interpreter_.SetExportSink(nullptr);
  int32_t max_y_24p8 = ui::FloatToD3D11Fixed16p8(max_y);
  // 16p8 range is -32768 to 32767+255/256, but it's stored as uint32_t here,
  // as 24p8, so overflowing up to -8388608 to 8388608+255/256 is safe. The
  // range of the window offset plus the half-pixel offset is -16384 to 16384.5,
  // so it's safe to add both - adding it will neither move the 16p8 clamping
  // bounds -32768 and 32767+255/256 into the 0...8192 screen space range, nor
  // cause 24p8 overflow.
  if (!regs.Get<reg::PA_SU_VTX_CNTL>().pix_center) {
    // Half-pixel offset: +0.5 pixels = +128 in .8 fixed point.
    max_y_24p8 += 128;
  }
  if (pa_su_sc_mode_cntl.vtx_window_offset_enable) {
    max_y_24p8 += regs.Get<reg::PA_SC_WINDOW_OFFSET>().window_y_offset * 256;
  }
  // Top-left rule - .5 exclusive without MSAA, 1. exclusive with MSAA.
  auto rb_surface_info = regs.Get<reg::RB_SURFACE_INFO>();
  return (uint32_t(std::max(int32_t(0), max_y_24p8)) +
          ((rb_surface_info.msaa_samples == xenos::MsaaSamples::k1X) ? 127
                                                                     : 255)) >>
         8;
}
// Returns the maximum Y, in pixels, that the current draw may cover: the
// scissor bottom (window scissor clamped to the screen scissor), further
// bounded for unclipped draws by the CPU-estimated vertex extent (when
// allowed by the cvars and try_to_estimate_vertex_max_y), or, for clipped
// draws, by the implicit host viewport bottom.
uint32_t DrawExtentEstimator::EstimateMaxY(bool try_to_estimate_vertex_max_y,
                                           const Shader& vertex_shader) {
  SCOPE_profile_cpu_f("gpu");
  const RegisterFile& regs = register_file_;
  auto pa_sc_window_offset = regs.Get<reg::PA_SC_WINDOW_OFFSET>();
  int32_t window_y_offset = pa_sc_window_offset.window_y_offset;
  // Scissor.
  auto pa_sc_window_scissor_br = regs.Get<reg::PA_SC_WINDOW_SCISSOR_BR>();
  int32_t scissor_bottom = int32_t(pa_sc_window_scissor_br.br_y);
  bool scissor_window_offset =
      !regs.Get<reg::PA_SC_WINDOW_SCISSOR_TL>().window_offset_disable;
  if (scissor_window_offset) {
    scissor_bottom += window_y_offset;
  }
  auto pa_sc_screen_scissor_br = regs.Get<reg::PA_SC_SCREEN_SCISSOR_BR>();
  scissor_bottom = std::min(scissor_bottom, pa_sc_screen_scissor_br.br_y);
  uint32_t max_y = uint32_t(std::max(scissor_bottom, int32_t(0)));
  if (regs.Get<reg::PA_CL_CLIP_CNTL>().clip_disable) {
    // Actual extent from the vertices.
    if (try_to_estimate_vertex_max_y &&
        cvars::execute_unclipped_draw_vs_on_cpu) {
      bool estimate_vertex_max_y;
      if (cvars::execute_unclipped_draw_vs_on_cpu_with_scissor) {
        estimate_vertex_max_y = true;
      } else {
        estimate_vertex_max_y = false;
        if (scissor_bottom >= xenos::kTexture2DCubeMaxWidthHeight) {
          // Handle just the usual special 8192x8192 case in Direct3D 9 - 8192
          // may be a normal render target height (80x8192 is well within the
          // EDRAM size, for instance), no need to process the vertices on the
          // CPU in this case.
          int32_t scissor_right = int32_t(pa_sc_window_scissor_br.br_x);
          if (scissor_window_offset) {
            scissor_right += pa_sc_window_offset.window_x_offset;
          }
          scissor_right = std::min(scissor_right, pa_sc_screen_scissor_br.br_x);
          if (scissor_right >= xenos::kTexture2DCubeMaxWidthHeight) {
            estimate_vertex_max_y = true;
          }
        }
      }
      if (estimate_vertex_max_y) {
        max_y = std::min(max_y, EstimateVertexMaxY(vertex_shader));
      }
    }
  } else {
    // Viewport. Though the Xenos itself doesn't have an implicit viewport
    // scissor (it's set by Direct3D 9 when a viewport is used), on hosts, it
    // usually exists and can't be disabled.
    auto pa_cl_vte_cntl = regs.Get<reg::PA_CL_VTE_CNTL>();
    float viewport_bottom = 0.0f;
    // First calculate all the integer.0 or integer.5 offsetting exactly at full
    // precision.
    if (regs.Get<reg::PA_SU_SC_MODE_CNTL>().vtx_window_offset_enable) {
      viewport_bottom += float(window_y_offset);
    }
    if (!regs.Get<reg::PA_SU_VTX_CNTL>().pix_center) {
      viewport_bottom += 0.5f;
    }
    // Then apply the floating-point viewport offset.
    if (pa_cl_vte_cntl.vport_y_offset_ena) {
      viewport_bottom += regs[XE_GPU_REG_PA_CL_VPORT_YOFFSET].f32;
    }
    viewport_bottom += pa_cl_vte_cntl.vport_y_scale_ena
                           ? std::abs(regs[XE_GPU_REG_PA_CL_VPORT_YSCALE].f32)
                           : 1.0f;
    // Using floor, or, rather, truncation (because maxing with zero anyway)
    // similar to how viewport scissoring behaves on real AMD, Intel and Nvidia
    // GPUs on Direct3D 12 (but not WARP), also like in
    // draw_util::GetHostViewportInfo.
    // max(0.0f, viewport_bottom) to drop NaN and < 0 - max picks the first
    // argument in the !(a < b) case (always for NaN), min as float (max_y is
    // well below 2^24) to safely drop very large values.
    max_y = uint32_t(std::min(float(max_y), std::max(0.0f, viewport_bottom)));
  }
  return max_y;
}
} // namespace gpu
} // namespace xe

View File

@ -0,0 +1,76 @@
/**
******************************************************************************
* Xenia : Xbox 360 Emulator Research Project *
******************************************************************************
* Copyright 2022 Ben Vanik. All rights reserved. *
* Released under the BSD license - see LICENSE in the root for more details. *
******************************************************************************
*/
#ifndef XENIA_GPU_DRAW_EXTENT_ESTIMATOR_H_
#define XENIA_GPU_DRAW_EXTENT_ESTIMATOR_H_
#include <cstdint>
#include <optional>
#include "xenia/gpu/register_file.h"
#include "xenia/gpu/shader.h"
#include "xenia/gpu/shader_interpreter.h"
#include "xenia/gpu/trace_writer.h"
#include "xenia/memory.h"
namespace xe {
namespace gpu {
// Estimates the vertical extent of a draw (to bound the EDRAM range it may
// involve), optionally by executing the draw's vertex shader on the CPU via
// the ShaderInterpreter for draws with clipping disabled.
class DrawExtentEstimator {
 public:
  // trace_writer may be null; it's forwarded to the shader interpreter.
  DrawExtentEstimator(const RegisterFile& register_file, const Memory& memory,
                      TraceWriter* trace_writer)
      : register_file_(register_file),
        memory_(memory),
        trace_writer_(trace_writer),
        shader_interpreter_(register_file, memory) {
    shader_interpreter_.SetTraceWriter(trace_writer);
  }

  // The shader must have its ucode analyzed.
  uint32_t EstimateVertexMaxY(const Shader& vertex_shader);
  uint32_t EstimateMaxY(bool try_to_estimate_vertex_max_y,
                        const Shader& vertex_shader);

 private:
  // Export sink capturing only what's needed for Y extent estimation:
  // position Y and W, the point size, and the kill vertex flags.
  class PositionYExportSink : public ShaderInterpreter::ExportSink {
   public:
    void Export(ucode::ExportRegister export_register, const float* value,
                uint32_t value_mask) override;

    // Clears all the captured exports before interpreting another vertex.
    void Reset() {
      position_y_.reset();
      position_w_.reset();
      point_size_.reset();
      vertex_kill_.reset();
    }

    // Each accessor is empty if the corresponding component wasn't exported.
    const std::optional<float>& position_y() const { return position_y_; }
    const std::optional<float>& position_w() const { return position_w_; }
    const std::optional<float>& point_size() const { return point_size_; }
    const std::optional<uint32_t>& vertex_kill() const { return vertex_kill_; }

   private:
    std::optional<float> position_y_;
    std::optional<float> position_w_;
    std::optional<float> point_size_;
    std::optional<uint32_t> vertex_kill_;
  };

  const RegisterFile& register_file_;
  const Memory& memory_;
  // May be null.
  TraceWriter* trace_writer_;

  ShaderInterpreter shader_interpreter_;
};
} // namespace gpu
} // namespace xe
#endif // XENIA_GPU_DRAW_EXTENT_ESTIMATOR_H_

View File

@ -215,6 +215,31 @@ union alignas(uint32_t) SQ_INTERPOLATOR_CNTL {
};
static_assert_size(SQ_INTERPOLATOR_CNTL, sizeof(uint32_t));
// Vertex shader constant range: base register index and vec4 count.
union alignas(uint32_t) SQ_VS_CONST {
  uint32_t value;
  struct {
    uint32_t base : 9;  // +0
    uint32_t : 3;       // +9
    // Vec4 count minus one.
    uint32_t size : 9;  // +12
  };
  static constexpr Register register_index = XE_GPU_REG_SQ_VS_CONST;
};
static_assert_size(SQ_VS_CONST, sizeof(uint32_t));
// Same as SQ_VS_CONST.
// Same as SQ_VS_CONST, for the pixel shader constant range.
union alignas(uint32_t) SQ_PS_CONST {
  uint32_t value;
  struct {
    uint32_t base : 9;  // +0
    uint32_t : 3;       // +9
    // Vec4 count minus one.
    uint32_t size : 9;  // +12
  };
  static constexpr Register register_index = XE_GPU_REG_SQ_PS_CONST;
};
static_assert_size(SQ_PS_CONST, sizeof(uint32_t));
/*******************************************************************************
__ _____ ___ _____ _____ __
\ \ / / __| _ \_ _| __\ \/ /

View File

@ -22,7 +22,6 @@
#include "xenia/base/logging.h"
#include "xenia/base/math.h"
#include "xenia/gpu/draw_util.h"
#include "xenia/gpu/gpu_flags.h"
#include "xenia/gpu/register_file.h"
#include "xenia/gpu/registers.h"
#include "xenia/gpu/xenos.h"
@ -143,6 +142,19 @@ DEFINE_bool(
"-1...1, remap -32...32 to -1...1 to use the full possible range of "
"values, at the expense of multiplicative blending correctness.",
"GPU");
// Enabled by default as the GPU is overall usually the bottleneck when the
// pixel shader interlock render backend implementation is used, anything that
// may improve GPU performance is favorable.
DEFINE_bool(
execute_unclipped_draw_vs_on_cpu_for_psi_render_backend, true,
"If execute_unclipped_draw_vs_on_cpu is enabled, execute the vertex shader "
"for unclipped draws on the CPU even when using the pixel shader interlock "
"(rasterizer-ordered view) implementation of the render backend on the "
"host, for which no expensive copying between host render targets is "
"needed when the ownership of a EDRAM range is changed.\n"
"If this is enabled, excessive barriers may be eliminated when switching "
"between different render targets in separate EDRAM locations.",
"GPU");
namespace xe {
namespace gpu {
@ -367,7 +379,8 @@ void RenderTargetCache::BeginFrame() { ResetAccumulatedRenderTargets(); }
bool RenderTargetCache::Update(bool is_rasterization_done,
reg::RB_DEPTHCONTROL normalized_depth_control,
uint32_t normalized_color_mask) {
uint32_t normalized_color_mask,
const Shader& vertex_shader) {
const RegisterFile& regs = register_file();
bool interlock_barrier_only = GetPath() == Path::kPixelShaderInterlock;
@ -556,47 +569,13 @@ bool RenderTargetCache::Update(bool is_rasterization_done,
// Estimate height used by render targets (for color for writes, for depth /
// stencil for both reads and writes) from various sources.
uint32_t height_used =
GetRenderTargetHeight(pitch_tiles_at_32bpp, msaa_samples);
int32_t window_y_offset =
regs.Get<reg::PA_SC_WINDOW_OFFSET>().window_y_offset;
if (!regs.Get<reg::PA_CL_CLIP_CNTL>().clip_disable) {
auto pa_cl_vte_cntl = regs.Get<reg::PA_CL_VTE_CNTL>();
float viewport_bottom = 0.0f;
// First calculate all the integer.0 or integer.5 offsetting exactly at full
// precision.
if (regs.Get<reg::PA_SU_SC_MODE_CNTL>().vtx_window_offset_enable) {
viewport_bottom += float(window_y_offset);
}
if (cvars::half_pixel_offset &&
!regs.Get<reg::PA_SU_VTX_CNTL>().pix_center) {
viewport_bottom += 0.5f;
}
// Then apply the floating-point viewport offset.
if (pa_cl_vte_cntl.vport_y_offset_ena) {
viewport_bottom += regs[XE_GPU_REG_PA_CL_VPORT_YOFFSET].f32;
}
viewport_bottom += pa_cl_vte_cntl.vport_y_scale_ena
? std::abs(regs[XE_GPU_REG_PA_CL_VPORT_YSCALE].f32)
: 1.0f;
// Using floor, or, rather, truncation (because maxing with zero anyway)
// similar to how viewport scissoring behaves on real AMD, Intel and Nvidia
// GPUs on Direct3D 12, also like in draw_util::GetHostViewportInfo.
// max(0.0f, viewport_bottom) to drop NaN and < 0 - max picks the first
// argument in the !(a < b) case (always for NaN), min as float (height_used
// is well below 2^24) to safely drop very large values.
height_used =
uint32_t(std::min(float(height_used), std::max(0.0f, viewport_bottom)));
}
int32_t scissor_bottom =
int32_t(regs.Get<reg::PA_SC_WINDOW_SCISSOR_BR>().br_y);
if (!regs.Get<reg::PA_SC_WINDOW_SCISSOR_TL>().window_offset_disable) {
scissor_bottom += window_y_offset;
}
scissor_bottom =
std::min(scissor_bottom, regs.Get<reg::PA_SC_SCREEN_SCISSOR_BR>().br_y);
height_used =
std::min(height_used, uint32_t(std::max(scissor_bottom, int32_t(0))));
uint32_t height_used = std::min(
GetRenderTargetHeight(pitch_tiles_at_32bpp, msaa_samples),
draw_extent_estimator_.EstimateMaxY(
interlock_barrier_only
? cvars::execute_unclipped_draw_vs_on_cpu_for_psi_render_backend
: true,
vertex_shader));
// Sorted by EDRAM base and then by index in the pipeline - for simplicity,
// treat render targets placed closer to the end of the EDRAM as truncating

View File

@ -21,9 +21,11 @@
#include "third_party/fmt/include/fmt/format.h"
#include "xenia/base/assert.h"
#include "xenia/base/cvar.h"
#include "xenia/gpu/draw_extent_estimator.h"
#include "xenia/gpu/draw_util.h"
#include "xenia/gpu/register_file.h"
#include "xenia/gpu/registers.h"
#include "xenia/gpu/shader.h"
#include "xenia/gpu/xenos.h"
DECLARE_bool(depth_transfer_not_equal_test);
@ -217,7 +219,8 @@ class RenderTargetCache {
virtual bool Update(bool is_rasterization_done,
reg::RB_DEPTHCONTROL normalized_depth_control,
uint32_t normalized_color_mask);
uint32_t normalized_color_mask,
const Shader& vertex_shader);
// Returns bits where 0 is whether a depth render target is currently bound on
// the host and 1... are whether the same applies to color render targets, and
@ -228,8 +231,10 @@ class RenderTargetCache {
uint32_t* depth_and_color_formats_out = nullptr) const;
protected:
RenderTargetCache(const RegisterFile& register_file)
: register_file_(register_file) {}
RenderTargetCache(const RegisterFile& register_file, const Memory& memory,
TraceWriter* trace_writer)
: register_file_(register_file),
draw_extent_estimator_(register_file, memory, trace_writer) {}
const RegisterFile& register_file() const { return register_file_; }
@ -606,6 +611,8 @@ class RenderTargetCache {
private:
const RegisterFile& register_file_;
DrawExtentEstimator draw_extent_estimator_;
// For host render targets.
struct OwnershipRange {

View File

@ -914,6 +914,12 @@ class Shader {
// True if the current shader has any `kill` instructions.
bool kills_pixels() const { return kills_pixels_; }
// True if the shader has any texture-related instructions (any fetch
// instructions other than vertex fetch) writing any non-constant components.
bool uses_texture_fetch_instruction_results() const {
return uses_texture_fetch_instruction_results_;
}
// True if the shader overrides the pixel depth.
bool writes_depth() const { return writes_depth_; }
@ -1002,6 +1008,7 @@ class Shader {
uint32_t register_static_address_bound_ = 0;
bool uses_register_dynamic_addressing_ = false;
bool kills_pixels_ = false;
bool uses_texture_fetch_instruction_results_ = false;
bool writes_depth_ = false;
uint32_t writes_color_targets_ = 0b0000;

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,149 @@
/**
******************************************************************************
* Xenia : Xbox 360 Emulator Research Project *
******************************************************************************
* Copyright 2022 Ben Vanik. All rights reserved. *
* Released under the BSD license - see LICENSE in the root for more details. *
******************************************************************************
*/
#ifndef XENIA_GPU_SHADER_INTERPRETER_H_
#define XENIA_GPU_SHADER_INTERPRETER_H_
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <cstring>

#include "xenia/base/assert.h"
#include "xenia/gpu/register_file.h"
#include "xenia/gpu/shader.h"
#include "xenia/gpu/trace_writer.h"
#include "xenia/gpu/ucode.h"
#include "xenia/gpu/xenos.h"
#include "xenia/memory.h"
namespace xe {
namespace gpu {
// CPU interpreter for Xenos shader microcode. Texture fetch instructions are
// not supported - see CanInterpretShader.
class ShaderInterpreter {
 public:
  ShaderInterpreter(const RegisterFile& register_file, const Memory& memory)
      : register_file_(register_file), memory_(memory) {}

  // Receives the exports produced while executing the shader.
  class ExportSink {
   public:
    virtual ~ExportSink() = default;
    virtual void AllocExport(ucode::AllocType type, uint32_t size) {}
    virtual void Export(ucode::ExportRegister export_register,
                        const float* value, uint32_t value_mask) {}
  };

  void SetTraceWriter(TraceWriter* new_trace_writer) {
    trace_writer_ = new_trace_writer;
  }

  ExportSink* GetExportSink() const { return export_sink_; }
  void SetExportSink(ExportSink* new_export_sink) {
    export_sink_ = new_export_sink;
  }

  // 64 vec4 registers, used both to pass the inputs and as execution-time
  // locals - flattened as floats, 4 per register.
  const float* temp_registers() const { return &temp_registers_[0][0]; }
  float* temp_registers() { return &temp_registers_[0][0]; }

  static bool CanInterpretShader(const Shader& shader) {
    assert_true(shader.is_ucode_analyzed());
    // Texture instructions are not very common in vertex shaders (and not used
    // in Direct3D 9's internal rectangles such as clears) and are extremely
    // complex, not implemented.
    if (shader.uses_texture_fetch_instruction_results()) {
      return false;
    }
    return true;
  }
  void SetShader(xenos::ShaderType shader_type, const uint32_t* ucode) {
    shader_type_ = shader_type;
    ucode_ = ucode;
  }
  void SetShader(const Shader& shader) {
    assert_true(CanInterpretShader(shader));
    SetShader(shader.type(), shader.ucode_dwords());
  }

  void Execute();

 private:
  // Per-execution control flow and fetch state.
  struct State {
    ucode::VertexFetchInstruction vfetch_full_last;
    uint32_t vfetch_address_dwords;
    float previous_scalar;
    uint32_t call_stack_depth;
    uint32_t call_return_addresses[4];
    uint32_t loop_stack_depth;
    xenos::LoopConstant loop_constants[4];
    uint32_t loop_iterators[4];
    int32_t address_register;
    bool predicate;

    // Zero-fills the whole state; all members are plain scalars / aggregates.
    void Reset() { std::memset(this, 0, sizeof(*this)); }

    // Returns the current relative addressing value aL, clamped to the real
    // hardware range.
    // NOTE(review): indexes loop_constants / loop_iterators with
    // loop_stack_depth rather than loop_stack_depth - 1 - presumably matches
    // how Execute() maintains the stack; confirm against
    // shader_interpreter.cc.
    int32_t GetLoopAddress() const {
      assert_true(loop_stack_depth && loop_stack_depth < 4);
      if (!loop_stack_depth || loop_stack_depth >= 4) {
        return 0;
      }
      xenos::LoopConstant loop_constant = loop_constants[loop_stack_depth];
      // Clamp to the real range specified in the IPR2015-00325 sequencer
      // specification.
      // https://portal.unifiedpatents.com/ptab/case/IPR2015-00325
      return std::min(
          INT32_C(256),
          std::max(INT32_C(-256),
                   int32_t(int32_t(loop_iterators[loop_stack_depth]) *
                               loop_constant.step +
                           loop_constant.start)));
    }
  };

  // Flushes denormals to zero, preserving only the sign bit when the exponent
  // field is all zeros.
  static float FlushDenormal(float value) {
    uint32_t bits = *reinterpret_cast<const uint32_t*>(&value);
    bits &= (bits & UINT32_C(0x7F800000)) ? ~UINT32_C(0) : (UINT32_C(1) << 31);
    return *reinterpret_cast<const float*>(&bits);
  }

  // The register index (plus the loop-relative offset aL if is_relative) is
  // wrapped into the 64 temp registers - note the `+` binds before the `& 63`.
  const float* GetTempRegister(uint32_t address, bool is_relative) const {
    return temp_registers_[(
        int32_t(address) + (is_relative ? state_.GetLoopAddress() : 0) & 63)];
  }
  // For simplicity (due to writability), not bounds-checking.
  float* GetTempRegister(uint32_t address, bool is_relative) {
    return temp_registers_[(
        int32_t(address) + (is_relative ? state_.GetLoopAddress() : 0) & 63)];
  }

  const float* GetFloatConstant(uint32_t address, bool is_relative,
                                bool relative_address_is_a0) const;

  void ExecuteAluInstruction(ucode::AluInstruction instr);
  void StoreFetchResult(uint32_t dest, bool is_dest_relative, uint32_t swizzle,
                        const float* value);
  void ExecuteVertexFetchInstruction(ucode::VertexFetchInstruction instr);

  const RegisterFile& register_file_;
  const Memory& memory_;

  // May be null.
  TraceWriter* trace_writer_ = nullptr;

  ExportSink* export_sink_ = nullptr;

  xenos::ShaderType shader_type_ = xenos::ShaderType::kVertex;
  const uint32_t* ucode_ = nullptr;

  // For both inputs and locals.
  float temp_registers_[64][4];

  State state_;
};
} // namespace gpu
} // namespace xe
#endif // XENIA_GPU_SHADER_INTERPRETER_H_

View File

@ -334,6 +334,10 @@ void Shader::GatherTextureFetchInformation(const TextureFetchInstruction& op,
GatherOperandInformation(binding.fetch_instr.operands[i]);
}
if (binding.fetch_instr.result.GetUsedResultComponents()) {
uses_texture_fetch_instruction_results_ = true;
}
switch (op.opcode()) {
case FetchOpcode::kSetTextureLod:
case FetchOpcode::kSetTextureGradientsHorz: