[GPU] Get unclipped draw height by running VS on the CPU
Parent: b2b1d7b518
Commit: 0fd578cafd
@@ -847,7 +847,8 @@ bool D3D12CommandProcessor::SetupContext() {
   // Initialize the render target cache before configuring binding - need to
   // know if using rasterizer-ordered views for the bindless root signature.
   render_target_cache_ = std::make_unique<D3D12RenderTargetCache>(
-      *register_file_, *this, trace_writer_, bindless_resources_used_);
+      *register_file_, *memory_, trace_writer_, *this,
+      bindless_resources_used_);
   if (!render_target_cache_->Initialize()) {
     XELOGE("Failed to initialize the render target cache");
     return false;
@@ -2147,7 +2148,7 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type,
           : 0;
   if (!render_target_cache_->Update(is_rasterization_done,
                                     normalized_depth_control,
-                                    normalized_color_mask)) {
+                                    normalized_color_mask, *vertex_shader)) {
     return false;
   }
@@ -1251,10 +1251,10 @@ void D3D12RenderTargetCache::BeginSubmission() {

 bool D3D12RenderTargetCache::Update(
     bool is_rasterization_done, reg::RB_DEPTHCONTROL normalized_depth_control,
-    uint32_t shader_writes_color_targets) {
+    uint32_t shader_writes_color_targets, const Shader& vertex_shader) {
   if (!RenderTargetCache::Update(is_rasterization_done,
                                  normalized_depth_control,
-                                 shader_writes_color_targets)) {
+                                 shader_writes_color_targets, vertex_shader)) {
     return false;
   }
   switch (GetPath()) {
@@ -43,10 +43,10 @@ class D3D12CommandProcessor;
 class D3D12RenderTargetCache final : public RenderTargetCache {
  public:
   D3D12RenderTargetCache(const RegisterFile& register_file,
+                         const Memory& memory, TraceWriter& trace_writer,
                          D3D12CommandProcessor& command_processor,
-                         TraceWriter& trace_writer,
                          bool bindless_resources_used)
-      : RenderTargetCache(register_file),
+      : RenderTargetCache(register_file, memory, &trace_writer),
         command_processor_(command_processor),
         trace_writer_(trace_writer),
         bindless_resources_used_(bindless_resources_used) {}
@@ -65,7 +65,8 @@ class D3D12RenderTargetCache final : public RenderTargetCache {

   bool Update(bool is_rasterization_done,
               reg::RB_DEPTHCONTROL normalized_depth_control,
-              uint32_t shader_writes_color_targets) override;
+              uint32_t shader_writes_color_targets,
+              const Shader& vertex_shader) override;

   void InvalidateCommandListRenderTargets() {
     are_current_command_list_render_targets_valid_ = false;
@@ -0,0 +1,350 @@
/**
 ******************************************************************************
 * Xenia : Xbox 360 Emulator Research Project                                 *
 ******************************************************************************
 * Copyright 2022 Ben Vanik. All rights reserved.                             *
 * Released under the BSD license - see LICENSE in the root for more details. *
 ******************************************************************************
 */

#include "xenia/gpu/draw_extent_estimator.h"

#include <algorithm>
#include <cfloat>
#include <cstdint>

#include "xenia/base/assert.h"
#include "xenia/base/cvar.h"
#include "xenia/base/profiling.h"
#include "xenia/gpu/registers.h"
#include "xenia/gpu/ucode.h"
#include "xenia/gpu/xenos.h"
#include "xenia/ui/graphics_util.h"

DEFINE_bool(
    execute_unclipped_draw_vs_on_cpu, true,
    "Execute the vertex shader for draws with clipping disabled, primarily "
    "screen-space draws (such as clears), on the CPU when possible to estimate "
    "the extent of the EDRAM involved in the draw.\n"
    "Enabling this may significantly improve GPU performance as otherwise up "
    "to the entire EDRAM may be considered used in draws without clipping, "
    "potentially resulting in spurious EDRAM range ownership transfer round "
    "trips between host render targets.\n"
    "Also, on hosts where certain render target formats have to be emulated in "
    "a lossy way (for instance, 16-bit fixed-point via 16-bit floating-point), "
    "this prevents corruption of other render targets located after the "
    "current ones in the EDRAM by lossy range ownership transfers done for "
    "those draws.",
    "GPU");
DEFINE_bool(
    execute_unclipped_draw_vs_on_cpu_with_scissor, false,
    "Don't restrict the usage of execute_unclipped_draw_vs_on_cpu to only "
    "non-scissored draws (with the right and the bottom sides of the scissor "
    "rectangle at 8192 or beyond) even though if the scissor rectangle is "
    "present, it's usually sufficient for estimating the height of the render "
    "target.\n"
    "Enabling this may cause excessive processing of vertices on the CPU, as "
    "some games draw rectangles (for their UI, for instance) without clipping, "
    "but with a proper scissor rectangle.",
    "GPU");

namespace xe {
namespace gpu {

void DrawExtentEstimator::PositionYExportSink::Export(
    ucode::ExportRegister export_register, const float* value,
    uint32_t value_mask) {
  if (export_register == ucode::ExportRegister::kVSPosition) {
    if (value_mask & 0b0010) {
      position_y_ = value[1];
    }
    if (value_mask & 0b1000) {
      position_w_ = value[3];
    }
  } else if (export_register ==
             ucode::ExportRegister::kVSPointSizeEdgeFlagKillVertex) {
    if (value_mask & 0b0001) {
      point_size_ = value[0];
    }
    if (value_mask & 0b0100) {
      vertex_kill_ = *reinterpret_cast<const uint32_t*>(&value[2]);
    }
  }
}

uint32_t DrawExtentEstimator::EstimateVertexMaxY(const Shader& vertex_shader) {
  SCOPE_profile_cpu_f("gpu");

  const RegisterFile& regs = register_file_;

  auto vgt_draw_initiator = regs.Get<reg::VGT_DRAW_INITIATOR>();
  if (!vgt_draw_initiator.num_indices) {
    return 0;
  }
  if (vgt_draw_initiator.source_select != xenos::SourceSelect::kDMA &&
      vgt_draw_initiator.source_select != xenos::SourceSelect::kAutoIndex) {
    // TODO(Triang3l): Support immediate indices.
    return xenos::kTexture2DCubeMaxWidthHeight;
  }

  // Not reproducing tessellation.
  if (xenos::IsMajorModeExplicit(vgt_draw_initiator.major_mode,
                                 vgt_draw_initiator.prim_type) &&
      regs.Get<reg::VGT_OUTPUT_PATH_CNTL>().path_select ==
          xenos::VGTOutputPath::kTessellationEnable) {
    return xenos::kTexture2DCubeMaxWidthHeight;
  }

  assert_true(vertex_shader.type() == xenos::ShaderType::kVertex);
  assert_true(vertex_shader.is_ucode_analyzed());
  if (!ShaderInterpreter::CanInterpretShader(vertex_shader)) {
    return xenos::kTexture2DCubeMaxWidthHeight;
  }

  auto vgt_dma_size = regs.Get<reg::VGT_DMA_SIZE>();
  union {
    const void* index_buffer;
    const uint16_t* index_buffer_16;
    const uint32_t* index_buffer_32;
  };
  xenos::Endian index_endian = vgt_dma_size.swap_mode;
  if (vgt_draw_initiator.source_select == xenos::SourceSelect::kDMA) {
    xenos::IndexFormat index_format = vgt_draw_initiator.index_size;
    uint32_t index_buffer_base = regs[XE_GPU_REG_VGT_DMA_BASE].u32;
    uint32_t index_buffer_read_count =
        std::min(vgt_draw_initiator.num_indices, vgt_dma_size.num_words);
    if (vgt_draw_initiator.index_size == xenos::IndexFormat::kInt16) {
      // Handle the index endianness the same way as the PrimitiveProcessor.
      if (index_endian == xenos::Endian::k8in32) {
        index_endian = xenos::Endian::k8in16;
      } else if (index_endian == xenos::Endian::k16in32) {
        index_endian = xenos::Endian::kNone;
      }
      index_buffer_base &= ~uint32_t(sizeof(uint16_t) - 1);
      if (trace_writer_) {
        trace_writer_->WriteMemoryRead(
            index_buffer_base, sizeof(uint16_t) * index_buffer_read_count);
      }
    } else {
      assert_true(vgt_draw_initiator.index_size == xenos::IndexFormat::kInt32);
      index_buffer_base &= ~uint32_t(sizeof(uint32_t) - 1);
      if (trace_writer_) {
        trace_writer_->WriteMemoryRead(
            index_buffer_base, sizeof(uint32_t) * index_buffer_read_count);
      }
    }
    index_buffer = memory_.TranslatePhysical(index_buffer_base);
  }
  auto pa_su_sc_mode_cntl = regs.Get<reg::PA_SU_SC_MODE_CNTL>();
  uint32_t reset_index =
      regs.Get<reg::VGT_MULTI_PRIM_IB_RESET_INDX>().reset_indx;
  uint32_t index_offset = regs.Get<reg::VGT_INDX_OFFSET>().indx_offset;
  uint32_t min_index = regs.Get<reg::VGT_MIN_VTX_INDX>().min_indx;
  uint32_t max_index = regs.Get<reg::VGT_MAX_VTX_INDX>().max_indx;

  auto pa_cl_vte_cntl = regs.Get<reg::PA_CL_VTE_CNTL>();
  float viewport_y_scale = pa_cl_vte_cntl.vport_y_scale_ena
                               ? regs[XE_GPU_REG_PA_CL_VPORT_YSCALE].f32
                               : 1.0f;
  float viewport_y_offset = pa_cl_vte_cntl.vport_y_offset_ena
                                ? regs[XE_GPU_REG_PA_CL_VPORT_YOFFSET].f32
                                : 0.0f;

  int32_t point_vertex_min_diameter_float = 0;
  int32_t point_vertex_max_diameter_float = 0;
  float point_constant_radius_y = 0.0f;
  if (vgt_draw_initiator.prim_type == xenos::PrimitiveType::kPointList) {
    auto pa_su_point_minmax = regs.Get<reg::PA_SU_POINT_MINMAX>();
    *reinterpret_cast<float*>(&point_vertex_min_diameter_float) =
        float(pa_su_point_minmax.min_size) * (2.0f / 16.0f);
    *reinterpret_cast<float*>(&point_vertex_max_diameter_float) =
        float(pa_su_point_minmax.max_size) * (2.0f / 16.0f);
    point_constant_radius_y =
        float(regs.Get<reg::PA_SU_POINT_SIZE>().height) * (1.0f / 16.0f);
  }

  float max_y = -FLT_MAX;

  shader_interpreter_.SetShader(vertex_shader);

  PositionYExportSink position_y_export_sink;
  shader_interpreter_.SetExportSink(&position_y_export_sink);
  for (uint32_t i = 0; i < vgt_draw_initiator.num_indices; ++i) {
    uint32_t vertex_index;
    if (vgt_draw_initiator.source_select == xenos::SourceSelect::kDMA) {
      if (i < vgt_dma_size.num_words) {
        if (vgt_draw_initiator.index_size == xenos::IndexFormat::kInt16) {
          vertex_index = index_buffer_16[i];
        } else {
          vertex_index = index_buffer_32[i];
        }
        // The Xenos only uses 24 bits of the index (reset_indx is 24-bit).
        vertex_index = xenos::GpuSwap(vertex_index, index_endian) & 0xFFFFFF;
      } else {
        vertex_index = 0;
      }
      if (pa_su_sc_mode_cntl.multi_prim_ib_ena && vertex_index == reset_index) {
        continue;
      }
    } else {
      assert_true(vgt_draw_initiator.source_select ==
                  xenos::SourceSelect::kAutoIndex);
      vertex_index = i;
    }
    vertex_index =
        std::min(max_index,
                 std::max(min_index, (vertex_index + index_offset) & 0xFFFFFF));

    position_y_export_sink.Reset();

    shader_interpreter_.temp_registers()[0] = float(vertex_index);
    shader_interpreter_.Execute();

    if (position_y_export_sink.vertex_kill().has_value() &&
        (position_y_export_sink.vertex_kill().value() & ~(UINT32_C(1) << 31))) {
      continue;
    }
    if (!position_y_export_sink.position_y().has_value()) {
      continue;
    }
    float vertex_y = position_y_export_sink.position_y().value();
    if (!pa_cl_vte_cntl.vtx_xy_fmt) {
      if (!position_y_export_sink.position_w().has_value()) {
        continue;
      }
      vertex_y /= position_y_export_sink.position_w().value();
    }

    vertex_y = vertex_y * viewport_y_scale + viewport_y_offset;

    if (vgt_draw_initiator.prim_type == xenos::PrimitiveType::kPointList) {
      float point_radius_y;
      if (position_y_export_sink.point_size().has_value()) {
        // Vertex-specified diameter. Clamped effectively as a signed integer in
        // the hardware, -NaN, -Infinity ... -0 to the minimum, +Infinity, +NaN
        // to the maximum.
        point_radius_y = position_y_export_sink.point_size().value();
        *reinterpret_cast<int32_t*>(&point_radius_y) = std::min(
            point_vertex_max_diameter_float,
            std::max(point_vertex_min_diameter_float,
                     *reinterpret_cast<const int32_t*>(&point_radius_y)));
        point_radius_y *= 0.5f;
      } else {
        // Constant radius.
        point_radius_y = point_constant_radius_y;
      }
      vertex_y += point_radius_y;
    }

    // std::max is `a < b ? b : a`, thus in case of NaN, the first argument is
    // always returned - max_y, which is initialized to a normalized value.
    max_y = std::max(max_y, vertex_y);
  }
  shader_interpreter_.SetExportSink(nullptr);

  int32_t max_y_24p8 = ui::FloatToD3D11Fixed16p8(max_y);
  // 16p8 range is -32768 to 32767+255/256, but it's stored as uint32_t here,
  // as 24p8, so overflowing up to -8388608 to 8388608+255/256 is safe. The
  // range of the window offset plus the half-pixel offset is -16384 to 16384.5,
  // so it's safe to add both - adding it will neither move the 16p8 clamping
  // bounds -32768 and 32767+255/256 into the 0...8192 screen space range, nor
  // cause 24p8 overflow.
  if (!regs.Get<reg::PA_SU_VTX_CNTL>().pix_center) {
    max_y_24p8 += 128;
  }
  if (pa_su_sc_mode_cntl.vtx_window_offset_enable) {
    max_y_24p8 += regs.Get<reg::PA_SC_WINDOW_OFFSET>().window_y_offset * 256;
  }
  // Top-left rule - .5 exclusive without MSAA, 1. exclusive with MSAA.
  auto rb_surface_info = regs.Get<reg::RB_SURFACE_INFO>();
  return (uint32_t(std::max(int32_t(0), max_y_24p8)) +
          ((rb_surface_info.msaa_samples == xenos::MsaaSamples::k1X) ? 127
                                                                     : 255)) >>
         8;
}

uint32_t DrawExtentEstimator::EstimateMaxY(bool try_to_estimate_vertex_max_y,
                                           const Shader& vertex_shader) {
  SCOPE_profile_cpu_f("gpu");

  const RegisterFile& regs = register_file_;

  auto pa_sc_window_offset = regs.Get<reg::PA_SC_WINDOW_OFFSET>();
  int32_t window_y_offset = pa_sc_window_offset.window_y_offset;

  // Scissor.
  auto pa_sc_window_scissor_br = regs.Get<reg::PA_SC_WINDOW_SCISSOR_BR>();
  int32_t scissor_bottom = int32_t(pa_sc_window_scissor_br.br_y);
  bool scissor_window_offset =
      !regs.Get<reg::PA_SC_WINDOW_SCISSOR_TL>().window_offset_disable;
  if (scissor_window_offset) {
    scissor_bottom += window_y_offset;
  }
  auto pa_sc_screen_scissor_br = regs.Get<reg::PA_SC_SCREEN_SCISSOR_BR>();
  scissor_bottom = std::min(scissor_bottom, pa_sc_screen_scissor_br.br_y);
  uint32_t max_y = uint32_t(std::max(scissor_bottom, int32_t(0)));

  if (regs.Get<reg::PA_CL_CLIP_CNTL>().clip_disable) {
    // Actual extent from the vertices.
    if (try_to_estimate_vertex_max_y &&
        cvars::execute_unclipped_draw_vs_on_cpu) {
      bool estimate_vertex_max_y;
      if (cvars::execute_unclipped_draw_vs_on_cpu_with_scissor) {
        estimate_vertex_max_y = true;
      } else {
        estimate_vertex_max_y = false;
        if (scissor_bottom >= xenos::kTexture2DCubeMaxWidthHeight) {
          // Handle just the usual special 8192x8192 case in Direct3D 9 - 8192
          // may be a normal render target height (80x8192 is well within the
          // EDRAM size, for instance), no need to process the vertices on the
          // CPU in this case.
          int32_t scissor_right = int32_t(pa_sc_window_scissor_br.br_x);
          if (scissor_window_offset) {
            scissor_right += pa_sc_window_offset.window_x_offset;
          }
          scissor_right = std::min(scissor_right, pa_sc_screen_scissor_br.br_x);
          if (scissor_right >= xenos::kTexture2DCubeMaxWidthHeight) {
            estimate_vertex_max_y = true;
          }
        }
      }
      if (estimate_vertex_max_y) {
        max_y = std::min(max_y, EstimateVertexMaxY(vertex_shader));
      }
    }
  } else {
    // Viewport. Though the Xenos itself doesn't have an implicit viewport
    // scissor (it's set by Direct3D 9 when a viewport is used), on hosts, it
    // usually exists and can't be disabled.
    auto pa_cl_vte_cntl = regs.Get<reg::PA_CL_VTE_CNTL>();
    float viewport_bottom = 0.0f;
    // First calculate all the integer.0 or integer.5 offsetting exactly at full
    // precision.
    if (regs.Get<reg::PA_SU_SC_MODE_CNTL>().vtx_window_offset_enable) {
      viewport_bottom += float(window_y_offset);
    }
    if (!regs.Get<reg::PA_SU_VTX_CNTL>().pix_center) {
      viewport_bottom += 0.5f;
    }
    // Then apply the floating-point viewport offset.
    if (pa_cl_vte_cntl.vport_y_offset_ena) {
      viewport_bottom += regs[XE_GPU_REG_PA_CL_VPORT_YOFFSET].f32;
    }
    viewport_bottom += pa_cl_vte_cntl.vport_y_scale_ena
                           ? std::abs(regs[XE_GPU_REG_PA_CL_VPORT_YSCALE].f32)
                           : 1.0f;
    // Using floor, or, rather, truncation (because maxing with zero anyway)
    // similar to how viewport scissoring behaves on real AMD, Intel and Nvidia
    // GPUs on Direct3D 12 (but not WARP), also like in
    // draw_util::GetHostViewportInfo.
    // max(0.0f, viewport_bottom) to drop NaN and < 0 - max picks the first
    // argument in the !(a < b) case (always for NaN), min as float (max_y is
    // well below 2^24) to safely drop very large values.
    max_y = uint32_t(std::min(float(max_y), std::max(0.0f, viewport_bottom)));
  }

  return max_y;
}

}  // namespace gpu
}  // namespace xe
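The closing fixed-point arithmetic of EstimateVertexMaxY is easier to follow in isolation. The sketch below walks one value through the same rounding; `to_16p8` is a simplified stand-in for `ui::FloatToD3D11Fixed16p8` (assumed here to scale by 256, round to nearest and clamp to the 16.8 range), so edge cases such as NaN handling may differ from the real helper.

```cpp
// Minimal standalone illustration of the 24.8 fixed-point rounding used above.
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

// Simplified stand-in for ui::FloatToD3D11Fixed16p8 (an assumption, not the
// real implementation): scale by 256, round to nearest, clamp to 16.8 range.
int32_t to_16p8(float value) {
  float scaled = std::floor(value * 256.0f + 0.5f);
  return int32_t(std::max(-8388608.0f, std::min(8388607.0f, scaled)));
}

int main() {
  // A vertex at y = 119.6 with the Direct3D 9 half-pixel center and no MSAA.
  int32_t y_24p8 = to_16p8(119.6f);  // 30618 = 119.6 * 256, rounded
  y_24p8 += 128;                     // + 0.5 pixel -> 120.1 in 24.8
  // Top-left rule: add 127/256 (no MSAA) or 255/256 (MSAA) before truncating.
  uint32_t rows = (uint32_t(std::max<int32_t>(0, y_24p8)) + 127) >> 8;
  std::printf("estimated rows used: %u\n", rows);  // 120
  return 0;
}
```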
@@ -0,0 +1,76 @@
/**
 ******************************************************************************
 * Xenia : Xbox 360 Emulator Research Project                                 *
 ******************************************************************************
 * Copyright 2022 Ben Vanik. All rights reserved.                             *
 * Released under the BSD license - see LICENSE in the root for more details. *
 ******************************************************************************
 */

#ifndef XENIA_GPU_DRAW_EXTENT_ESTIMATOR_H_
#define XENIA_GPU_DRAW_EXTENT_ESTIMATOR_H_

#include <cstdint>
#include <optional>

#include "xenia/gpu/register_file.h"
#include "xenia/gpu/shader.h"
#include "xenia/gpu/shader_interpreter.h"
#include "xenia/gpu/trace_writer.h"
#include "xenia/memory.h"

namespace xe {
namespace gpu {

class DrawExtentEstimator {
 public:
  DrawExtentEstimator(const RegisterFile& register_file, const Memory& memory,
                      TraceWriter* trace_writer)
      : register_file_(register_file),
        memory_(memory),
        trace_writer_(trace_writer),
        shader_interpreter_(register_file, memory) {
    shader_interpreter_.SetTraceWriter(trace_writer);
  }

  // The shader must have its ucode analyzed.
  uint32_t EstimateVertexMaxY(const Shader& vertex_shader);
  uint32_t EstimateMaxY(bool try_to_estimate_vertex_max_y,
                        const Shader& vertex_shader);

 private:
  class PositionYExportSink : public ShaderInterpreter::ExportSink {
   public:
    void Export(ucode::ExportRegister export_register, const float* value,
                uint32_t value_mask) override;

    void Reset() {
      position_y_.reset();
      position_w_.reset();
      point_size_.reset();
      vertex_kill_.reset();
    }

    const std::optional<float>& position_y() const { return position_y_; }
    const std::optional<float>& position_w() const { return position_w_; }
    const std::optional<float>& point_size() const { return point_size_; }
    const std::optional<uint32_t>& vertex_kill() const { return vertex_kill_; }

   private:
    std::optional<float> position_y_;
    std::optional<float> position_w_;
    std::optional<float> point_size_;
    std::optional<uint32_t> vertex_kill_;
  };

  const RegisterFile& register_file_;
  const Memory& memory_;
  TraceWriter* trace_writer_;

  ShaderInterpreter shader_interpreter_;
};

}  // namespace gpu
}  // namespace xe

#endif  // XENIA_GPU_DRAW_EXTENT_ESTIMATOR_H_
@@ -215,6 +215,31 @@ union alignas(uint32_t) SQ_INTERPOLATOR_CNTL {
 };
 static_assert_size(SQ_INTERPOLATOR_CNTL, sizeof(uint32_t));

+union alignas(uint32_t) SQ_VS_CONST {
+  uint32_t value;
+  struct {
+    uint32_t base : 9;  // +0
+    uint32_t : 3;       // +9
+    // Vec4 count minus one.
+    uint32_t size : 9;  // +12
+  };
+  static constexpr Register register_index = XE_GPU_REG_SQ_VS_CONST;
+};
+static_assert_size(SQ_VS_CONST, sizeof(uint32_t));
+
+// Same as SQ_VS_CONST.
+union alignas(uint32_t) SQ_PS_CONST {
+  uint32_t value;
+  struct {
+    uint32_t base : 9;  // +0
+    uint32_t : 3;       // +9
+    // Vec4 count minus one.
+    uint32_t size : 9;  // +12
+  };
+  static constexpr Register register_index = XE_GPU_REG_SQ_PS_CONST;
+};
+static_assert_size(SQ_PS_CONST, sizeof(uint32_t));
+
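As a quick sanity check of the new register layout, the snippet below packs and reads the same bit fields through a local copy of the union (standalone, outside the Xenia register plumbing). Only the bit positions and the "vec4 count minus one" note come from the hunk above; the example values are illustrative.

```cpp
// Standalone check of the SQ_VS_CONST bit layout (local copy of the union).
#include <cstdint>
#include <cstdio>

union SqVsConst {
  uint32_t value;
  struct {
    uint32_t base : 9;  // +0
    uint32_t : 3;       // +9, unused
    uint32_t size : 9;  // +12, vec4 count minus one
  };
};

int main() {
  SqVsConst reg{};
  reg.base = 8;
  reg.size = 255;  // 256 float4 constants
  std::printf("raw = 0x%08X, base = %u, float4 count = %u\n", reg.value,
              uint32_t(reg.base), reg.size + 1u);  // raw = 0x000FF008
  return 0;
}
```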
@@ -22,7 +22,6 @@
 #include "xenia/base/logging.h"
 #include "xenia/base/math.h"
 #include "xenia/gpu/draw_util.h"
-#include "xenia/gpu/gpu_flags.h"
 #include "xenia/gpu/register_file.h"
 #include "xenia/gpu/registers.h"
 #include "xenia/gpu/xenos.h"
@@ -143,6 +142,19 @@ DEFINE_bool(
     "-1...1, remap -32...32 to -1...1 to use the full possible range of "
     "values, at the expense of multiplicative blending correctness.",
     "GPU");
+// Enabled by default as the GPU is overall usually the bottleneck when the
+// pixel shader interlock render backend implementation is used, anything that
+// may improve GPU performance is favorable.
+DEFINE_bool(
+    execute_unclipped_draw_vs_on_cpu_for_psi_render_backend, true,
+    "If execute_unclipped_draw_vs_on_cpu is enabled, execute the vertex shader "
+    "for unclipped draws on the CPU even when using the pixel shader interlock "
+    "(rasterizer-ordered view) implementation of the render backend on the "
+    "host, for which no expensive copying between host render targets is "
+    "needed when the ownership of an EDRAM range is changed.\n"
+    "If this is enabled, excessive barriers may be eliminated when switching "
+    "between different render targets in separate EDRAM locations.",
+    "GPU");

 namespace xe {
 namespace gpu {
@@ -367,7 +379,8 @@ void RenderTargetCache::BeginFrame() { ResetAccumulatedRenderTargets(); }

 bool RenderTargetCache::Update(bool is_rasterization_done,
                                reg::RB_DEPTHCONTROL normalized_depth_control,
-                               uint32_t normalized_color_mask) {
+                               uint32_t normalized_color_mask,
+                               const Shader& vertex_shader) {
   const RegisterFile& regs = register_file();
   bool interlock_barrier_only = GetPath() == Path::kPixelShaderInterlock;

@@ -556,47 +569,13 @@ bool RenderTargetCache::Update(bool is_rasterization_done,

   // Estimate height used by render targets (for color for writes, for depth /
   // stencil for both reads and writes) from various sources.
-  uint32_t height_used =
-      GetRenderTargetHeight(pitch_tiles_at_32bpp, msaa_samples);
-  int32_t window_y_offset =
-      regs.Get<reg::PA_SC_WINDOW_OFFSET>().window_y_offset;
-  if (!regs.Get<reg::PA_CL_CLIP_CNTL>().clip_disable) {
-    auto pa_cl_vte_cntl = regs.Get<reg::PA_CL_VTE_CNTL>();
-    float viewport_bottom = 0.0f;
-    // First calculate all the integer.0 or integer.5 offsetting exactly at full
-    // precision.
-    if (regs.Get<reg::PA_SU_SC_MODE_CNTL>().vtx_window_offset_enable) {
-      viewport_bottom += float(window_y_offset);
-    }
-    if (cvars::half_pixel_offset &&
-        !regs.Get<reg::PA_SU_VTX_CNTL>().pix_center) {
-      viewport_bottom += 0.5f;
-    }
-    // Then apply the floating-point viewport offset.
-    if (pa_cl_vte_cntl.vport_y_offset_ena) {
-      viewport_bottom += regs[XE_GPU_REG_PA_CL_VPORT_YOFFSET].f32;
-    }
-    viewport_bottom += pa_cl_vte_cntl.vport_y_scale_ena
-                           ? std::abs(regs[XE_GPU_REG_PA_CL_VPORT_YSCALE].f32)
-                           : 1.0f;
-    // Using floor, or, rather, truncation (because maxing with zero anyway)
-    // similar to how viewport scissoring behaves on real AMD, Intel and Nvidia
-    // GPUs on Direct3D 12, also like in draw_util::GetHostViewportInfo.
-    // max(0.0f, viewport_bottom) to drop NaN and < 0 - max picks the first
-    // argument in the !(a < b) case (always for NaN), min as float (height_used
-    // is well below 2^24) to safely drop very large values.
-    height_used =
-        uint32_t(std::min(float(height_used), std::max(0.0f, viewport_bottom)));
-  }
-  int32_t scissor_bottom =
-      int32_t(regs.Get<reg::PA_SC_WINDOW_SCISSOR_BR>().br_y);
-  if (!regs.Get<reg::PA_SC_WINDOW_SCISSOR_TL>().window_offset_disable) {
-    scissor_bottom += window_y_offset;
-  }
-  scissor_bottom =
-      std::min(scissor_bottom, regs.Get<reg::PA_SC_SCREEN_SCISSOR_BR>().br_y);
-  height_used =
-      std::min(height_used, uint32_t(std::max(scissor_bottom, int32_t(0))));
+  uint32_t height_used = std::min(
+      GetRenderTargetHeight(pitch_tiles_at_32bpp, msaa_samples),
+      draw_extent_estimator_.EstimateMaxY(
+          interlock_barrier_only
+              ? cvars::execute_unclipped_draw_vs_on_cpu_for_psi_render_backend
+              : true,
+          vertex_shader));

   // Sorted by EDRAM base and then by index in the pipeline - for simplicity,
   // treat render targets placed closer to the end of the EDRAM as truncating
@@ -21,9 +21,11 @@
 #include "third_party/fmt/include/fmt/format.h"
 #include "xenia/base/assert.h"
 #include "xenia/base/cvar.h"
+#include "xenia/gpu/draw_extent_estimator.h"
 #include "xenia/gpu/draw_util.h"
 #include "xenia/gpu/register_file.h"
 #include "xenia/gpu/registers.h"
+#include "xenia/gpu/shader.h"
 #include "xenia/gpu/xenos.h"

 DECLARE_bool(depth_transfer_not_equal_test);
@@ -217,7 +219,8 @@ class RenderTargetCache {

   virtual bool Update(bool is_rasterization_done,
                       reg::RB_DEPTHCONTROL normalized_depth_control,
-                      uint32_t normalized_color_mask);
+                      uint32_t normalized_color_mask,
+                      const Shader& vertex_shader);

   // Returns bits where 0 is whether a depth render target is currently bound on
   // the host and 1... are whether the same applies to color render targets, and
@@ -228,8 +231,10 @@ class RenderTargetCache {
       uint32_t* depth_and_color_formats_out = nullptr) const;

  protected:
-  RenderTargetCache(const RegisterFile& register_file)
-      : register_file_(register_file) {}
+  RenderTargetCache(const RegisterFile& register_file, const Memory& memory,
+                    TraceWriter* trace_writer)
+      : register_file_(register_file),
+        draw_extent_estimator_(register_file, memory, trace_writer) {}

   const RegisterFile& register_file() const { return register_file_; }

@@ -606,6 +611,8 @@ class RenderTargetCache {
  private:
   const RegisterFile& register_file_;

+  DrawExtentEstimator draw_extent_estimator_;
+
   // For host render targets.

   struct OwnershipRange {
@@ -914,6 +914,12 @@ class Shader {
   // True if the current shader has any `kill` instructions.
   bool kills_pixels() const { return kills_pixels_; }

+  // True if the shader has any texture-related instructions (any fetch
+  // instructions other than vertex fetch) writing any non-constant components.
+  bool uses_texture_fetch_instruction_results() const {
+    return uses_texture_fetch_instruction_results_;
+  }
+
   // True if the shader overrides the pixel depth.
   bool writes_depth() const { return writes_depth_; }

@@ -1002,6 +1008,7 @@ class Shader {
   uint32_t register_static_address_bound_ = 0;
   bool uses_register_dynamic_addressing_ = false;
   bool kills_pixels_ = false;
+  bool uses_texture_fetch_instruction_results_ = false;
   bool writes_depth_ = false;
   uint32_t writes_color_targets_ = 0b0000;

File diff suppressed because it is too large
@@ -0,0 +1,149 @@
/**
 ******************************************************************************
 * Xenia : Xbox 360 Emulator Research Project                                 *
 ******************************************************************************
 * Copyright 2022 Ben Vanik. All rights reserved.                             *
 * Released under the BSD license - see LICENSE in the root for more details. *
 ******************************************************************************
 */

#ifndef XENIA_GPU_SHADER_INTERPRETER_H_
#define XENIA_GPU_SHADER_INTERPRETER_H_

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <cstring>

#include "xenia/base/assert.h"
#include "xenia/gpu/register_file.h"
#include "xenia/gpu/shader.h"
#include "xenia/gpu/trace_writer.h"
#include "xenia/gpu/ucode.h"
#include "xenia/gpu/xenos.h"
#include "xenia/memory.h"

namespace xe {
namespace gpu {

class ShaderInterpreter {
 public:
  ShaderInterpreter(const RegisterFile& register_file, const Memory& memory)
      : register_file_(register_file), memory_(memory) {}

  class ExportSink {
   public:
    virtual ~ExportSink() = default;
    virtual void AllocExport(ucode::AllocType type, uint32_t size) {}
    virtual void Export(ucode::ExportRegister export_register,
                        const float* value, uint32_t value_mask) {}
  };

  void SetTraceWriter(TraceWriter* new_trace_writer) {
    trace_writer_ = new_trace_writer;
  }

  ExportSink* GetExportSink() const { return export_sink_; }
  void SetExportSink(ExportSink* new_export_sink) {
    export_sink_ = new_export_sink;
  }

  const float* temp_registers() const { return &temp_registers_[0][0]; }
  float* temp_registers() { return &temp_registers_[0][0]; }

  static bool CanInterpretShader(const Shader& shader) {
    assert_true(shader.is_ucode_analyzed());
    // Texture instructions are not very common in vertex shaders (and not used
    // in Direct3D 9's internal rectangles such as clears) and are extremely
    // complex, not implemented.
    if (shader.uses_texture_fetch_instruction_results()) {
      return false;
    }
    return true;
  }
  void SetShader(xenos::ShaderType shader_type, const uint32_t* ucode) {
    shader_type_ = shader_type;
    ucode_ = ucode;
  }
  void SetShader(const Shader& shader) {
    assert_true(CanInterpretShader(shader));
    SetShader(shader.type(), shader.ucode_dwords());
  }

  void Execute();

 private:
  struct State {
    ucode::VertexFetchInstruction vfetch_full_last;
    uint32_t vfetch_address_dwords;
    float previous_scalar;
    uint32_t call_stack_depth;
    uint32_t call_return_addresses[4];
    uint32_t loop_stack_depth;
    xenos::LoopConstant loop_constants[4];
    uint32_t loop_iterators[4];
    int32_t address_register;
    bool predicate;

    void Reset() { std::memset(this, 0, sizeof(*this)); }

    int32_t GetLoopAddress() const {
      assert_true(loop_stack_depth && loop_stack_depth < 4);
      if (!loop_stack_depth || loop_stack_depth >= 4) {
        return 0;
      }
      xenos::LoopConstant loop_constant = loop_constants[loop_stack_depth];
      // Clamp to the real range specified in the IPR2015-00325 sequencer
      // specification.
      // https://portal.unifiedpatents.com/ptab/case/IPR2015-00325
      return std::min(
          INT32_C(256),
          std::max(INT32_C(-256),
                   int32_t(int32_t(loop_iterators[loop_stack_depth]) *
                               loop_constant.step +
                           loop_constant.start)));
    }
  };

  static float FlushDenormal(float value) {
    uint32_t bits = *reinterpret_cast<const uint32_t*>(&value);
    bits &= (bits & UINT32_C(0x7F800000)) ? ~UINT32_C(0) : (UINT32_C(1) << 31);
    return *reinterpret_cast<const float*>(&bits);
  }

  const float* GetTempRegister(uint32_t address, bool is_relative) const {
    return temp_registers_[(
        int32_t(address) + (is_relative ? state_.GetLoopAddress() : 0) & 63)];
  }
  // For simplicity (due to writability), not bounds-checking.
  float* GetTempRegister(uint32_t address, bool is_relative) {
    return temp_registers_[(
        int32_t(address) + (is_relative ? state_.GetLoopAddress() : 0) & 63)];
  }
  const float* GetFloatConstant(uint32_t address, bool is_relative,
                                bool relative_address_is_a0) const;

  void ExecuteAluInstruction(ucode::AluInstruction instr);
  void StoreFetchResult(uint32_t dest, bool is_dest_relative, uint32_t swizzle,
                        const float* value);
  void ExecuteVertexFetchInstruction(ucode::VertexFetchInstruction instr);

  const RegisterFile& register_file_;
  const Memory& memory_;

  TraceWriter* trace_writer_ = nullptr;

  ExportSink* export_sink_ = nullptr;

  xenos::ShaderType shader_type_ = xenos::ShaderType::kVertex;
  const uint32_t* ucode_ = nullptr;

  // For both inputs and locals.
  float temp_registers_[64][4];

  State state_;
};

}  // namespace gpu
}  // namespace xe

#endif  // XENIA_GPU_SHADER_INTERPRETER_H_
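One detail of the interpreter header that is easy to misread is the FlushDenormal bit trick: when the exponent field is zero (denormals and zeros), only the sign bit is kept, so the value collapses to signed zero, while every other value passes through unchanged. A standalone sketch of the same operation, using memcpy instead of the reinterpret_cast purely to keep the snippet portable:

```cpp
#include <cstdint>
#include <cstdio>
#include <cstring>

// Same operation as ShaderInterpreter::FlushDenormal above, with memcpy for
// the float <-> bits moves.
static float FlushDenormal(float value) {
  uint32_t bits;
  std::memcpy(&bits, &value, sizeof(bits));
  // Zero exponent field (denormal or zero): keep only the sign bit.
  bits &= (bits & UINT32_C(0x7F800000)) ? ~UINT32_C(0) : (UINT32_C(1) << 31);
  float result;
  std::memcpy(&result, &bits, sizeof(result));
  return result;
}

int main() {
  std::printf("%g -> %g\n", 1.5f, FlushDenormal(1.5f));        // unchanged
  std::printf("%g -> %g\n", -1e-42f, FlushDenormal(-1e-42f));  // -0 (denormal)
  std::printf("%g -> %g\n", 1e-37f, FlushDenormal(1e-37f));    // unchanged
  return 0;
}
```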
@@ -334,6 +334,10 @@ void Shader::GatherTextureFetchInformation(const TextureFetchInstruction& op,
     GatherOperandInformation(binding.fetch_instr.operands[i]);
   }

+  if (binding.fetch_instr.result.GetUsedResultComponents()) {
+    uses_texture_fetch_instruction_results_ = true;
+  }
+
   switch (op.opcode()) {
     case FetchOpcode::kSetTextureLod:
     case FetchOpcode::kSetTextureGradientsHorz: