[GPU] Get unclipped draw height by running VS on the CPU
parent b2b1d7b518
commit 0fd578cafd

@@ -847,7 +847,8 @@ bool D3D12CommandProcessor::SetupContext() {
   // Initialize the render target cache before configuring binding - need to
   // know if using rasterizer-ordered views for the bindless root signature.
   render_target_cache_ = std::make_unique<D3D12RenderTargetCache>(
-      *register_file_, *this, trace_writer_, bindless_resources_used_);
+      *register_file_, *memory_, trace_writer_, *this,
+      bindless_resources_used_);
   if (!render_target_cache_->Initialize()) {
     XELOGE("Failed to initialize the render target cache");
     return false;

@@ -2147,7 +2148,7 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type,
                    : 0;
   if (!render_target_cache_->Update(is_rasterization_done,
                                     normalized_depth_control,
-                                    normalized_color_mask)) {
+                                    normalized_color_mask, *vertex_shader)) {
     return false;
   }
 
@@ -1251,10 +1251,10 @@ void D3D12RenderTargetCache::BeginSubmission() {
 
 bool D3D12RenderTargetCache::Update(
     bool is_rasterization_done, reg::RB_DEPTHCONTROL normalized_depth_control,
-    uint32_t shader_writes_color_targets) {
+    uint32_t shader_writes_color_targets, const Shader& vertex_shader) {
   if (!RenderTargetCache::Update(is_rasterization_done,
                                  normalized_depth_control,
-                                 shader_writes_color_targets)) {
+                                 shader_writes_color_targets, vertex_shader)) {
     return false;
   }
   switch (GetPath()) {

@@ -43,10 +43,10 @@ class D3D12CommandProcessor;
 class D3D12RenderTargetCache final : public RenderTargetCache {
  public:
   D3D12RenderTargetCache(const RegisterFile& register_file,
+                         const Memory& memory, TraceWriter& trace_writer,
                          D3D12CommandProcessor& command_processor,
-                         TraceWriter& trace_writer,
                          bool bindless_resources_used)
-      : RenderTargetCache(register_file),
+      : RenderTargetCache(register_file, memory, &trace_writer),
         command_processor_(command_processor),
         trace_writer_(trace_writer),
         bindless_resources_used_(bindless_resources_used) {}

@@ -65,7 +65,8 @@ class D3D12RenderTargetCache final : public RenderTargetCache {
 
   bool Update(bool is_rasterization_done,
               reg::RB_DEPTHCONTROL normalized_depth_control,
-              uint32_t shader_writes_color_targets) override;
+              uint32_t shader_writes_color_targets,
+              const Shader& vertex_shader) override;
 
   void InvalidateCommandListRenderTargets() {
     are_current_command_list_render_targets_valid_ = false;

@@ -0,0 +1,350 @@
/**
 ******************************************************************************
 * Xenia : Xbox 360 Emulator Research Project                                 *
 ******************************************************************************
 * Copyright 2022 Ben Vanik. All rights reserved.                             *
 * Released under the BSD license - see LICENSE in the root for more details. *
 ******************************************************************************
 */

#include "xenia/gpu/draw_extent_estimator.h"

#include <algorithm>
#include <cfloat>
#include <cstdint>

#include "xenia/base/assert.h"
#include "xenia/base/cvar.h"
#include "xenia/base/profiling.h"
#include "xenia/gpu/registers.h"
#include "xenia/gpu/ucode.h"
#include "xenia/gpu/xenos.h"
#include "xenia/ui/graphics_util.h"

DEFINE_bool(
    execute_unclipped_draw_vs_on_cpu, true,
    "Execute the vertex shader for draws with clipping disabled, primarily "
    "screen-space draws (such as clears), on the CPU when possible to estimate "
    "the extent of the EDRAM involved in the draw.\n"
    "Enabling this may significantly improve GPU performance as otherwise up "
    "to the entire EDRAM may be considered used in draws without clipping, "
    "potentially resulting in spurious EDRAM range ownership transfer round "
    "trips between host render targets.\n"
    "Also, on hosts where certain render target formats have to be emulated in "
    "a lossy way (for instance, 16-bit fixed-point via 16-bit floating-point), "
    "this prevents corruption of other render targets located after the "
    "current ones in the EDRAM by lossy range ownership transfers done for "
    "those draws.",
    "GPU");
DEFINE_bool(
    execute_unclipped_draw_vs_on_cpu_with_scissor, false,
    "Don't restrict the usage of execute_unclipped_draw_vs_on_cpu to only "
    "non-scissored draws (with the right and the bottom sides of the scissor "
    "rectangle at 8192 or beyond), even though, if the scissor rectangle is "
    "present, it's usually sufficient for estimating the height of the render "
    "target.\n"
    "Enabling this may cause excessive processing of vertices on the CPU, as "
    "some games draw rectangles (for their UI, for instance) without clipping, "
    "but with a proper scissor rectangle.",
    "GPU");

namespace xe {
namespace gpu {

void DrawExtentEstimator::PositionYExportSink::Export(
    ucode::ExportRegister export_register, const float* value,
    uint32_t value_mask) {
  if (export_register == ucode::ExportRegister::kVSPosition) {
    if (value_mask & 0b0010) {
      position_y_ = value[1];
    }
    if (value_mask & 0b1000) {
      position_w_ = value[3];
    }
  } else if (export_register ==
             ucode::ExportRegister::kVSPointSizeEdgeFlagKillVertex) {
    if (value_mask & 0b0001) {
      point_size_ = value[0];
    }
    if (value_mask & 0b0100) {
      vertex_kill_ = *reinterpret_cast<const uint32_t*>(&value[2]);
    }
  }
}

uint32_t DrawExtentEstimator::EstimateVertexMaxY(const Shader& vertex_shader) {
  SCOPE_profile_cpu_f("gpu");

  const RegisterFile& regs = register_file_;

  auto vgt_draw_initiator = regs.Get<reg::VGT_DRAW_INITIATOR>();
  if (!vgt_draw_initiator.num_indices) {
    return 0;
  }
  if (vgt_draw_initiator.source_select != xenos::SourceSelect::kDMA &&
      vgt_draw_initiator.source_select != xenos::SourceSelect::kAutoIndex) {
    // TODO(Triang3l): Support immediate indices.
    return xenos::kTexture2DCubeMaxWidthHeight;
  }

  // Not reproducing tessellation.
  if (xenos::IsMajorModeExplicit(vgt_draw_initiator.major_mode,
                                 vgt_draw_initiator.prim_type) &&
      regs.Get<reg::VGT_OUTPUT_PATH_CNTL>().path_select ==
          xenos::VGTOutputPath::kTessellationEnable) {
    return xenos::kTexture2DCubeMaxWidthHeight;
  }

  assert_true(vertex_shader.type() == xenos::ShaderType::kVertex);
  assert_true(vertex_shader.is_ucode_analyzed());
  if (!ShaderInterpreter::CanInterpretShader(vertex_shader)) {
    return xenos::kTexture2DCubeMaxWidthHeight;
  }

  auto vgt_dma_size = regs.Get<reg::VGT_DMA_SIZE>();
  union {
    const void* index_buffer;
    const uint16_t* index_buffer_16;
    const uint32_t* index_buffer_32;
  };
  xenos::Endian index_endian = vgt_dma_size.swap_mode;
  if (vgt_draw_initiator.source_select == xenos::SourceSelect::kDMA) {
    xenos::IndexFormat index_format = vgt_draw_initiator.index_size;
    uint32_t index_buffer_base = regs[XE_GPU_REG_VGT_DMA_BASE].u32;
    uint32_t index_buffer_read_count =
        std::min(vgt_draw_initiator.num_indices, vgt_dma_size.num_words);
    if (vgt_draw_initiator.index_size == xenos::IndexFormat::kInt16) {
      // Handle the index endianness the same way as the PrimitiveProcessor.
      if (index_endian == xenos::Endian::k8in32) {
        index_endian = xenos::Endian::k8in16;
      } else if (index_endian == xenos::Endian::k16in32) {
        index_endian = xenos::Endian::kNone;
      }
      index_buffer_base &= ~uint32_t(sizeof(uint16_t) - 1);
      if (trace_writer_) {
        trace_writer_->WriteMemoryRead(
            index_buffer_base, sizeof(uint16_t) * index_buffer_read_count);
      }
    } else {
      assert_true(vgt_draw_initiator.index_size == xenos::IndexFormat::kInt32);
      index_buffer_base &= ~uint32_t(sizeof(uint32_t) - 1);
      if (trace_writer_) {
        trace_writer_->WriteMemoryRead(
            index_buffer_base, sizeof(uint32_t) * index_buffer_read_count);
      }
    }
    index_buffer = memory_.TranslatePhysical(index_buffer_base);
  }
  auto pa_su_sc_mode_cntl = regs.Get<reg::PA_SU_SC_MODE_CNTL>();
  uint32_t reset_index =
      regs.Get<reg::VGT_MULTI_PRIM_IB_RESET_INDX>().reset_indx;
  uint32_t index_offset = regs.Get<reg::VGT_INDX_OFFSET>().indx_offset;
  uint32_t min_index = regs.Get<reg::VGT_MIN_VTX_INDX>().min_indx;
  uint32_t max_index = regs.Get<reg::VGT_MAX_VTX_INDX>().max_indx;

  auto pa_cl_vte_cntl = regs.Get<reg::PA_CL_VTE_CNTL>();
  float viewport_y_scale = pa_cl_vte_cntl.vport_y_scale_ena
                               ? regs[XE_GPU_REG_PA_CL_VPORT_YSCALE].f32
                               : 1.0f;
  float viewport_y_offset = pa_cl_vte_cntl.vport_y_offset_ena
                                ? regs[XE_GPU_REG_PA_CL_VPORT_YOFFSET].f32
                                : 0.0f;

  int32_t point_vertex_min_diameter_float = 0;
  int32_t point_vertex_max_diameter_float = 0;
  float point_constant_radius_y = 0.0f;
  if (vgt_draw_initiator.prim_type == xenos::PrimitiveType::kPointList) {
    auto pa_su_point_minmax = regs.Get<reg::PA_SU_POINT_MINMAX>();
    *reinterpret_cast<float*>(&point_vertex_min_diameter_float) =
        float(pa_su_point_minmax.min_size) * (2.0f / 16.0f);
    *reinterpret_cast<float*>(&point_vertex_max_diameter_float) =
        float(pa_su_point_minmax.max_size) * (2.0f / 16.0f);
    point_constant_radius_y =
        float(regs.Get<reg::PA_SU_POINT_SIZE>().height) * (1.0f / 16.0f);
  }

  float max_y = -FLT_MAX;

  shader_interpreter_.SetShader(vertex_shader);

  PositionYExportSink position_y_export_sink;
  shader_interpreter_.SetExportSink(&position_y_export_sink);
  for (uint32_t i = 0; i < vgt_draw_initiator.num_indices; ++i) {
    uint32_t vertex_index;
    if (vgt_draw_initiator.source_select == xenos::SourceSelect::kDMA) {
      if (i < vgt_dma_size.num_words) {
        if (vgt_draw_initiator.index_size == xenos::IndexFormat::kInt16) {
          vertex_index = index_buffer_16[i];
        } else {
          vertex_index = index_buffer_32[i];
        }
        // The Xenos only uses 24 bits of the index (reset_indx is 24-bit).
        vertex_index = xenos::GpuSwap(vertex_index, index_endian) & 0xFFFFFF;
      } else {
        vertex_index = 0;
      }
      if (pa_su_sc_mode_cntl.multi_prim_ib_ena && vertex_index == reset_index) {
        continue;
      }
    } else {
      assert_true(vgt_draw_initiator.source_select ==
                  xenos::SourceSelect::kAutoIndex);
      vertex_index = i;
    }
    vertex_index =
        std::min(max_index,
                 std::max(min_index, (vertex_index + index_offset) & 0xFFFFFF));

    position_y_export_sink.Reset();

    shader_interpreter_.temp_registers()[0] = float(vertex_index);
    shader_interpreter_.Execute();

    if (position_y_export_sink.vertex_kill().has_value() &&
        (position_y_export_sink.vertex_kill().value() & ~(UINT32_C(1) << 31))) {
      continue;
    }
    if (!position_y_export_sink.position_y().has_value()) {
      continue;
    }
    float vertex_y = position_y_export_sink.position_y().value();
    if (!pa_cl_vte_cntl.vtx_xy_fmt) {
      if (!position_y_export_sink.position_w().has_value()) {
        continue;
      }
      vertex_y /= position_y_export_sink.position_w().value();
    }

    vertex_y = vertex_y * viewport_y_scale + viewport_y_offset;

    if (vgt_draw_initiator.prim_type == xenos::PrimitiveType::kPointList) {
      float point_radius_y;
      if (position_y_export_sink.point_size().has_value()) {
        // Vertex-specified diameter. Clamped effectively as a signed integer in
        // the hardware, -NaN, -Infinity ... -0 to the minimum, +Infinity, +NaN
        // to the maximum.
        point_radius_y = position_y_export_sink.point_size().value();
        *reinterpret_cast<int32_t*>(&point_radius_y) = std::min(
            point_vertex_max_diameter_float,
            std::max(point_vertex_min_diameter_float,
                     *reinterpret_cast<const int32_t*>(&point_radius_y)));
        point_radius_y *= 0.5f;
      } else {
        // Constant radius.
        point_radius_y = point_constant_radius_y;
      }
      vertex_y += point_radius_y;
    }

    // std::max is `a < b ? b : a`, thus in case of NaN, the first argument is
    // always returned - max_y, which is initialized to a normalized value.
    max_y = std::max(max_y, vertex_y);
  }
  shader_interpreter_.SetExportSink(nullptr);

  int32_t max_y_24p8 = ui::FloatToD3D11Fixed16p8(max_y);
  // 16p8 range is -32768 to 32767+255/256, but it's stored as uint32_t here,
  // as 24p8, so overflowing up to -8388608 to 8388608+255/256 is safe. The
  // range of the window offset plus the half-pixel offset is -16384 to 16384.5,
  // so it's safe to add both - adding it will neither move the 16p8 clamping
  // bounds -32768 and 32767+255/256 into the 0...8192 screen space range, nor
  // cause 24p8 overflow.
  if (!regs.Get<reg::PA_SU_VTX_CNTL>().pix_center) {
    max_y_24p8 += 128;
  }
  if (pa_su_sc_mode_cntl.vtx_window_offset_enable) {
    max_y_24p8 += regs.Get<reg::PA_SC_WINDOW_OFFSET>().window_y_offset * 256;
  }
  // Top-left rule - .5 exclusive without MSAA, 1. exclusive with MSAA.
  auto rb_surface_info = regs.Get<reg::RB_SURFACE_INFO>();
  return (uint32_t(std::max(int32_t(0), max_y_24p8)) +
          ((rb_surface_info.msaa_samples == xenos::MsaaSamples::k1X) ? 127
                                                                     : 255)) >>
         8;
}

uint32_t DrawExtentEstimator::EstimateMaxY(bool try_to_estimate_vertex_max_y,
                                           const Shader& vertex_shader) {
  SCOPE_profile_cpu_f("gpu");

  const RegisterFile& regs = register_file_;

  auto pa_sc_window_offset = regs.Get<reg::PA_SC_WINDOW_OFFSET>();
  int32_t window_y_offset = pa_sc_window_offset.window_y_offset;

  // Scissor.
  auto pa_sc_window_scissor_br = regs.Get<reg::PA_SC_WINDOW_SCISSOR_BR>();
  int32_t scissor_bottom = int32_t(pa_sc_window_scissor_br.br_y);
  bool scissor_window_offset =
      !regs.Get<reg::PA_SC_WINDOW_SCISSOR_TL>().window_offset_disable;
  if (scissor_window_offset) {
    scissor_bottom += window_y_offset;
  }
  auto pa_sc_screen_scissor_br = regs.Get<reg::PA_SC_SCREEN_SCISSOR_BR>();
  scissor_bottom = std::min(scissor_bottom, pa_sc_screen_scissor_br.br_y);
  uint32_t max_y = uint32_t(std::max(scissor_bottom, int32_t(0)));

  if (regs.Get<reg::PA_CL_CLIP_CNTL>().clip_disable) {
    // Actual extent from the vertices.
    if (try_to_estimate_vertex_max_y &&
        cvars::execute_unclipped_draw_vs_on_cpu) {
      bool estimate_vertex_max_y;
      if (cvars::execute_unclipped_draw_vs_on_cpu_with_scissor) {
        estimate_vertex_max_y = true;
      } else {
        estimate_vertex_max_y = false;
        if (scissor_bottom >= xenos::kTexture2DCubeMaxWidthHeight) {
          // Handle just the usual special 8192x8192 case in Direct3D 9 - 8192
          // may be a normal render target height (80x8192 is well within the
          // EDRAM size, for instance), no need to process the vertices on the
          // CPU in this case.
          int32_t scissor_right = int32_t(pa_sc_window_scissor_br.br_x);
          if (scissor_window_offset) {
            scissor_right += pa_sc_window_offset.window_x_offset;
          }
          scissor_right = std::min(scissor_right, pa_sc_screen_scissor_br.br_x);
          if (scissor_right >= xenos::kTexture2DCubeMaxWidthHeight) {
            estimate_vertex_max_y = true;
          }
        }
      }
      if (estimate_vertex_max_y) {
        max_y = std::min(max_y, EstimateVertexMaxY(vertex_shader));
      }
    }
  } else {
    // Viewport. Though the Xenos itself doesn't have an implicit viewport
    // scissor (it's set by Direct3D 9 when a viewport is used), on hosts, it
    // usually exists and can't be disabled.
    auto pa_cl_vte_cntl = regs.Get<reg::PA_CL_VTE_CNTL>();
    float viewport_bottom = 0.0f;
    // First calculate all the integer.0 or integer.5 offsetting exactly at full
    // precision.
    if (regs.Get<reg::PA_SU_SC_MODE_CNTL>().vtx_window_offset_enable) {
      viewport_bottom += float(window_y_offset);
    }
    if (!regs.Get<reg::PA_SU_VTX_CNTL>().pix_center) {
      viewport_bottom += 0.5f;
    }
    // Then apply the floating-point viewport offset.
    if (pa_cl_vte_cntl.vport_y_offset_ena) {
      viewport_bottom += regs[XE_GPU_REG_PA_CL_VPORT_YOFFSET].f32;
    }
    viewport_bottom += pa_cl_vte_cntl.vport_y_scale_ena
                           ? std::abs(regs[XE_GPU_REG_PA_CL_VPORT_YSCALE].f32)
                           : 1.0f;
    // Using floor, or, rather, truncation (because maxing with zero anyway)
    // similar to how viewport scissoring behaves on real AMD, Intel and Nvidia
    // GPUs on Direct3D 12 (but not WARP), also like in
    // draw_util::GetHostViewportInfo.
    // max(0.0f, viewport_bottom) to drop NaN and < 0 - max picks the first
    // argument in the !(a < b) case (always for NaN), min as float (max_y is
    // well below 2^24) to safely drop very large values.
    max_y = uint32_t(std::min(float(max_y), std::max(0.0f, viewport_bottom)));
  }

  return max_y;
}

}  // namespace gpu
}  // namespace xe

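To sanity-check the fixed-point conversion at the end of EstimateVertexMaxY above, here is a small illustrative walkthrough. It is not part of the commit, and it assumes ui::FloatToD3D11Fixed16p8 converts the exactly representable value 719.5 without rounding error:

// A vertex lands at Y = 719.5 in screen space, no window offset, D3D9
// half-pixel centers (pix_center == 0), no MSAA.
int32_t max_y_24p8 = 719 * 256 + 128;  // FloatToD3D11Fixed16p8(719.5f) == 184192
max_y_24p8 += 128;                     // half-pixel offset -> 184320 (720.0)
// Top-left rule, .5 exclusive without MSAA: add 127/256, then truncate to
// whole pixel rows.
uint32_t vertex_max_y = (uint32_t(max_y_24p8) + 127) >> 8;  // == 720
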
@@ -0,0 +1,76 @@
/**
 ******************************************************************************
 * Xenia : Xbox 360 Emulator Research Project                                 *
 ******************************************************************************
 * Copyright 2022 Ben Vanik. All rights reserved.                             *
 * Released under the BSD license - see LICENSE in the root for more details. *
 ******************************************************************************
 */

#ifndef XENIA_GPU_DRAW_EXTENT_ESTIMATOR_H_
#define XENIA_GPU_DRAW_EXTENT_ESTIMATOR_H_

#include <cstdint>
#include <optional>

#include "xenia/gpu/register_file.h"
#include "xenia/gpu/shader.h"
#include "xenia/gpu/shader_interpreter.h"
#include "xenia/gpu/trace_writer.h"
#include "xenia/memory.h"

namespace xe {
namespace gpu {

class DrawExtentEstimator {
 public:
  DrawExtentEstimator(const RegisterFile& register_file, const Memory& memory,
                      TraceWriter* trace_writer)
      : register_file_(register_file),
        memory_(memory),
        trace_writer_(trace_writer),
        shader_interpreter_(register_file, memory) {
    shader_interpreter_.SetTraceWriter(trace_writer);
  }

  // The shader must have its ucode analyzed.
  uint32_t EstimateVertexMaxY(const Shader& vertex_shader);
  uint32_t EstimateMaxY(bool try_to_estimate_vertex_max_y,
                        const Shader& vertex_shader);

 private:
  class PositionYExportSink : public ShaderInterpreter::ExportSink {
   public:
    void Export(ucode::ExportRegister export_register, const float* value,
                uint32_t value_mask) override;

    void Reset() {
      position_y_.reset();
      position_w_.reset();
      point_size_.reset();
      vertex_kill_.reset();
    }

    const std::optional<float>& position_y() const { return position_y_; }
    const std::optional<float>& position_w() const { return position_w_; }
    const std::optional<float>& point_size() const { return point_size_; }
    const std::optional<uint32_t>& vertex_kill() const { return vertex_kill_; }

   private:
    std::optional<float> position_y_;
    std::optional<float> position_w_;
    std::optional<float> point_size_;
    std::optional<uint32_t> vertex_kill_;
  };

  const RegisterFile& register_file_;
  const Memory& memory_;
  TraceWriter* trace_writer_;

  ShaderInterpreter shader_interpreter_;
};

}  // namespace gpu
}  // namespace xe

#endif  // XENIA_GPU_DRAW_EXTENT_ESTIMATOR_H_

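A caller would drive the estimator roughly as follows. This is only a usage sketch mirroring how RenderTargetCache wires it up later in this commit; `register_file`, `memory`, `trace_writer` and `vertex_shader` are assumed to already exist, and the vertex shader must have had its ucode analyzed:

xe::gpu::DrawExtentEstimator draw_extent_estimator(register_file, memory,
                                                   &trace_writer);
// Estimated maximum Y extent in pixels: clamped by the scissor, refined by
// running the vertex shader on the CPU for unclipped draws, or derived from
// the viewport when clipping is enabled.
uint32_t max_y = draw_extent_estimator.EstimateMaxY(
    /* try_to_estimate_vertex_max_y */ true, vertex_shader);
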
@@ -215,6 +215,31 @@ union alignas(uint32_t) SQ_INTERPOLATOR_CNTL {
 };
 static_assert_size(SQ_INTERPOLATOR_CNTL, sizeof(uint32_t));
 
+union alignas(uint32_t) SQ_VS_CONST {
+  uint32_t value;
+  struct {
+    uint32_t base : 9;  // +0
+    uint32_t : 3;       // +9
+    // Vec4 count minus one.
+    uint32_t size : 9;  // 12
+  };
+  static constexpr Register register_index = XE_GPU_REG_SQ_VS_CONST;
+};
+static_assert_size(SQ_VS_CONST, sizeof(uint32_t));
+
+// Same as SQ_VS_CONST.
+union alignas(uint32_t) SQ_PS_CONST {
+  uint32_t value;
+  struct {
+    uint32_t base : 9;  // +0
+    uint32_t : 3;       // +9
+    // Vec4 count minus one.
+    uint32_t size : 9;  // 12
+  };
+  static constexpr Register register_index = XE_GPU_REG_SQ_PS_CONST;
+};
+static_assert_size(SQ_PS_CONST, sizeof(uint32_t));
+
 /*******************************************************************************
    __ _____ ___ _____ _____ __
    \ \ / / __| _ \_ _| __\ \/ /

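As an aside (not part of the commit), a packed register like SQ_VS_CONST above would typically be decoded through the same RegisterFile::Get pattern used elsewhere in this diff via its register_index. The snippet below is a hypothetical sketch assuming a RegisterFile reference `regs` is available:

auto sq_vs_const = regs.Get<reg::SQ_VS_CONST>();
uint32_t first_vec4 = sq_vs_const.base;      // first float constant (vec4) index
uint32_t vec4_count = sq_vs_const.size + 1;  // `size` stores the count minus one
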
@@ -22,7 +22,6 @@
#include "xenia/base/logging.h"
#include "xenia/base/math.h"
#include "xenia/gpu/draw_util.h"
#include "xenia/gpu/gpu_flags.h"
#include "xenia/gpu/register_file.h"
#include "xenia/gpu/registers.h"
#include "xenia/gpu/xenos.h"

@@ -143,6 +142,19 @@ DEFINE_bool(
     "-1...1, remap -32...32 to -1...1 to use the full possible range of "
     "values, at the expense of multiplicative blending correctness.",
     "GPU");
+// Enabled by default as the GPU is overall usually the bottleneck when the
+// pixel shader interlock render backend implementation is used, so anything
+// that may improve GPU performance is favorable.
+DEFINE_bool(
+    execute_unclipped_draw_vs_on_cpu_for_psi_render_backend, true,
+    "If execute_unclipped_draw_vs_on_cpu is enabled, execute the vertex shader "
+    "for unclipped draws on the CPU even when using the pixel shader interlock "
+    "(rasterizer-ordered view) implementation of the render backend on the "
+    "host, for which no expensive copying between host render targets is "
+    "needed when the ownership of an EDRAM range is changed.\n"
+    "If this is enabled, excessive barriers may be eliminated when switching "
+    "between different render targets in separate EDRAM locations.",
+    "GPU");
 
 namespace xe {
 namespace gpu {

@@ -367,7 +379,8 @@ void RenderTargetCache::BeginFrame() { ResetAccumulatedRenderTargets(); }
 
 bool RenderTargetCache::Update(bool is_rasterization_done,
                                reg::RB_DEPTHCONTROL normalized_depth_control,
-                               uint32_t normalized_color_mask) {
+                               uint32_t normalized_color_mask,
+                               const Shader& vertex_shader) {
   const RegisterFile& regs = register_file();
   bool interlock_barrier_only = GetPath() == Path::kPixelShaderInterlock;
 
@@ -556,47 +569,13 @@ bool RenderTargetCache::Update(bool is_rasterization_done,
 
   // Estimate height used by render targets (for color for writes, for depth /
   // stencil for both reads and writes) from various sources.
-  uint32_t height_used =
-      GetRenderTargetHeight(pitch_tiles_at_32bpp, msaa_samples);
-  int32_t window_y_offset =
-      regs.Get<reg::PA_SC_WINDOW_OFFSET>().window_y_offset;
-  if (!regs.Get<reg::PA_CL_CLIP_CNTL>().clip_disable) {
-    auto pa_cl_vte_cntl = regs.Get<reg::PA_CL_VTE_CNTL>();
-    float viewport_bottom = 0.0f;
-    // First calculate all the integer.0 or integer.5 offsetting exactly at full
-    // precision.
-    if (regs.Get<reg::PA_SU_SC_MODE_CNTL>().vtx_window_offset_enable) {
-      viewport_bottom += float(window_y_offset);
-    }
-    if (cvars::half_pixel_offset &&
-        !regs.Get<reg::PA_SU_VTX_CNTL>().pix_center) {
-      viewport_bottom += 0.5f;
-    }
-    // Then apply the floating-point viewport offset.
-    if (pa_cl_vte_cntl.vport_y_offset_ena) {
-      viewport_bottom += regs[XE_GPU_REG_PA_CL_VPORT_YOFFSET].f32;
-    }
-    viewport_bottom += pa_cl_vte_cntl.vport_y_scale_ena
-                           ? std::abs(regs[XE_GPU_REG_PA_CL_VPORT_YSCALE].f32)
-                           : 1.0f;
-    // Using floor, or, rather, truncation (because maxing with zero anyway)
-    // similar to how viewport scissoring behaves on real AMD, Intel and Nvidia
-    // GPUs on Direct3D 12, also like in draw_util::GetHostViewportInfo.
-    // max(0.0f, viewport_bottom) to drop NaN and < 0 - max picks the first
-    // argument in the !(a < b) case (always for NaN), min as float (height_used
-    // is well below 2^24) to safely drop very large values.
-    height_used =
-        uint32_t(std::min(float(height_used), std::max(0.0f, viewport_bottom)));
-  }
-  int32_t scissor_bottom =
-      int32_t(regs.Get<reg::PA_SC_WINDOW_SCISSOR_BR>().br_y);
-  if (!regs.Get<reg::PA_SC_WINDOW_SCISSOR_TL>().window_offset_disable) {
-    scissor_bottom += window_y_offset;
-  }
-  scissor_bottom =
-      std::min(scissor_bottom, regs.Get<reg::PA_SC_SCREEN_SCISSOR_BR>().br_y);
-  height_used =
-      std::min(height_used, uint32_t(std::max(scissor_bottom, int32_t(0))));
+  uint32_t height_used = std::min(
+      GetRenderTargetHeight(pitch_tiles_at_32bpp, msaa_samples),
+      draw_extent_estimator_.EstimateMaxY(
+          interlock_barrier_only
+              ? cvars::execute_unclipped_draw_vs_on_cpu_for_psi_render_backend
+              : true,
+          vertex_shader));
 
   // Sorted by EDRAM base and then by index in the pipeline - for simplicity,
   // treat render targets placed closer to the end of the EDRAM as truncating

@@ -21,9 +21,11 @@
 #include "third_party/fmt/include/fmt/format.h"
 #include "xenia/base/assert.h"
 #include "xenia/base/cvar.h"
+#include "xenia/gpu/draw_extent_estimator.h"
 #include "xenia/gpu/draw_util.h"
 #include "xenia/gpu/register_file.h"
 #include "xenia/gpu/registers.h"
+#include "xenia/gpu/shader.h"
 #include "xenia/gpu/xenos.h"
 
 DECLARE_bool(depth_transfer_not_equal_test);

@ -217,7 +219,8 @@ class RenderTargetCache {
|
|||
|
||||
virtual bool Update(bool is_rasterization_done,
|
||||
reg::RB_DEPTHCONTROL normalized_depth_control,
|
||||
uint32_t normalized_color_mask);
|
||||
uint32_t normalized_color_mask,
|
||||
const Shader& vertex_shader);
|
||||
|
||||
// Returns bits where 0 is whether a depth render target is currently bound on
|
||||
// the host and 1... are whether the same applies to color render targets, and
|
||||
|
@@ -228,8 +231,10 @@ class RenderTargetCache {
       uint32_t* depth_and_color_formats_out = nullptr) const;
 
  protected:
-  RenderTargetCache(const RegisterFile& register_file)
-      : register_file_(register_file) {}
+  RenderTargetCache(const RegisterFile& register_file, const Memory& memory,
+                    TraceWriter* trace_writer)
+      : register_file_(register_file),
+        draw_extent_estimator_(register_file, memory, trace_writer) {}
 
   const RegisterFile& register_file() const { return register_file_; }
 
@@ -606,6 +611,8 @@ class RenderTargetCache {
  private:
   const RegisterFile& register_file_;
 
+  DrawExtentEstimator draw_extent_estimator_;
+
   // For host render targets.
 
   struct OwnershipRange {

@@ -914,6 +914,12 @@ class Shader {
   // True if the current shader has any `kill` instructions.
   bool kills_pixels() const { return kills_pixels_; }
 
+  // True if the shader has any texture-related instructions (any fetch
+  // instructions other than vertex fetch) writing any non-constant components.
+  bool uses_texture_fetch_instruction_results() const {
+    return uses_texture_fetch_instruction_results_;
+  }
+
   // True if the shader overrides the pixel depth.
   bool writes_depth() const { return writes_depth_; }
 
@@ -1002,6 +1008,7 @@ class Shader {
   uint32_t register_static_address_bound_ = 0;
   bool uses_register_dynamic_addressing_ = false;
   bool kills_pixels_ = false;
+  bool uses_texture_fetch_instruction_results_ = false;
   bool writes_depth_ = false;
   uint32_t writes_color_targets_ = 0b0000;
 
(File diff suppressed because it is too large.)

@@ -0,0 +1,149 @@
/**
 ******************************************************************************
 * Xenia : Xbox 360 Emulator Research Project                                 *
 ******************************************************************************
 * Copyright 2022 Ben Vanik. All rights reserved.                             *
 * Released under the BSD license - see LICENSE in the root for more details. *
 ******************************************************************************
 */

#ifndef XENIA_GPU_SHADER_INTERPRETER_H_
#define XENIA_GPU_SHADER_INTERPRETER_H_

#include <algorithm>
#include <cstddef>
#include <cstdint>

#include "xenia/base/assert.h"
#include "xenia/gpu/register_file.h"
#include "xenia/gpu/shader.h"
#include "xenia/gpu/trace_writer.h"
#include "xenia/gpu/ucode.h"
#include "xenia/gpu/xenos.h"
#include "xenia/memory.h"

namespace xe {
namespace gpu {

class ShaderInterpreter {
 public:
  ShaderInterpreter(const RegisterFile& register_file, const Memory& memory)
      : register_file_(register_file), memory_(memory) {}

  class ExportSink {
   public:
    virtual ~ExportSink() = default;
    virtual void AllocExport(ucode::AllocType type, uint32_t size) {}
    virtual void Export(ucode::ExportRegister export_register,
                        const float* value, uint32_t value_mask) {}
  };

  void SetTraceWriter(TraceWriter* new_trace_writer) {
    trace_writer_ = new_trace_writer;
  }

  ExportSink* GetExportSink() const { return export_sink_; }
  void SetExportSink(ExportSink* new_export_sink) {
    export_sink_ = new_export_sink;
  }

  const float* temp_registers() const { return &temp_registers_[0][0]; }
  float* temp_registers() { return &temp_registers_[0][0]; }

  static bool CanInterpretShader(const Shader& shader) {
    assert_true(shader.is_ucode_analyzed());
    // Texture instructions are not very common in vertex shaders (and not used
    // in Direct3D 9's internal rectangles such as clears) and are extremely
    // complex, not implemented.
    if (shader.uses_texture_fetch_instruction_results()) {
      return false;
    }
    return true;
  }
  void SetShader(xenos::ShaderType shader_type, const uint32_t* ucode) {
    shader_type_ = shader_type;
    ucode_ = ucode;
  }
  void SetShader(const Shader& shader) {
    assert_true(CanInterpretShader(shader));
    SetShader(shader.type(), shader.ucode_dwords());
  }

  void Execute();

 private:
  struct State {
    ucode::VertexFetchInstruction vfetch_full_last;
    uint32_t vfetch_address_dwords;
    float previous_scalar;
    uint32_t call_stack_depth;
    uint32_t call_return_addresses[4];
    uint32_t loop_stack_depth;
    xenos::LoopConstant loop_constants[4];
    uint32_t loop_iterators[4];
    int32_t address_register;
    bool predicate;

    void Reset() { std::memset(this, 0, sizeof(*this)); }

    int32_t GetLoopAddress() const {
      assert_true(loop_stack_depth && loop_stack_depth < 4);
      if (!loop_stack_depth || loop_stack_depth >= 4) {
        return 0;
      }
      xenos::LoopConstant loop_constant = loop_constants[loop_stack_depth];
      // Clamp to the real range specified in the IPR2015-00325 sequencer
      // specification.
      // https://portal.unifiedpatents.com/ptab/case/IPR2015-00325
      return std::min(
          INT32_C(256),
          std::max(INT32_C(-256),
                   int32_t(int32_t(loop_iterators[loop_stack_depth]) *
                               loop_constant.step +
                           loop_constant.start)));
    }
  };

  static float FlushDenormal(float value) {
    uint32_t bits = *reinterpret_cast<const uint32_t*>(&value);
    bits &= (bits & UINT32_C(0x7F800000)) ? ~UINT32_C(0) : (UINT32_C(1) << 31);
    return *reinterpret_cast<const float*>(&bits);
  }

  const float* GetTempRegister(uint32_t address, bool is_relative) const {
    return temp_registers_[(
        int32_t(address) + (is_relative ? state_.GetLoopAddress() : 0) & 63)];
  }
  // For simplicity (due to writability), not bounds-checking.
  float* GetTempRegister(uint32_t address, bool is_relative) {
    return temp_registers_[(
        int32_t(address) + (is_relative ? state_.GetLoopAddress() : 0) & 63)];
  }
  const float* GetFloatConstant(uint32_t address, bool is_relative,
                                bool relative_address_is_a0) const;

  void ExecuteAluInstruction(ucode::AluInstruction instr);
  void StoreFetchResult(uint32_t dest, bool is_dest_relative, uint32_t swizzle,
                        const float* value);
  void ExecuteVertexFetchInstruction(ucode::VertexFetchInstruction instr);

  const RegisterFile& register_file_;
  const Memory& memory_;

  TraceWriter* trace_writer_ = nullptr;

  ExportSink* export_sink_ = nullptr;

  xenos::ShaderType shader_type_ = xenos::ShaderType::kVertex;
  const uint32_t* ucode_ = nullptr;

  // For both inputs and locals.
  float temp_registers_[64][4];

  State state_;
};

}  // namespace gpu
}  // namespace xe

#endif  // XENIA_GPU_SHADER_INTERPRETER_H_

@@ -334,6 +334,10 @@ void Shader::GatherTextureFetchInformation(const TextureFetchInstruction& op,
     GatherOperandInformation(binding.fetch_instr.operands[i]);
   }
 
+  if (binding.fetch_instr.result.GetUsedResultComponents()) {
+    uses_texture_fetch_instruction_results_ = true;
+  }
+
   switch (op.opcode()) {
     case FetchOpcode::kSetTextureLod:
     case FetchOpcode::kSetTextureGradientsHorz: