faster/more compact MatchValueAndRef

Made commandprocessor GetcurrentRingReadcount inline, it was made noinline to match PGO decisions but i think PGO can make extra reg allocation decisions that make this inlining choice yield gains, whereas if we do it manually we lose a tiny bit of performance

Working on a more compact vectorized version of GetScissor to save icache on cmd processor thread
Add WriteRegisterForceinline, will probably end up amending to remove it

add ERMS path for vastcpy

Adding WriteRegistersFromMemCommonSense (name will be changed later), name is because i realized my approach with optimizing writeregisters has been backwards,  instead of handling all checks more quickly within the loop that writes the registers, we need a different loop for each range with unique handling. we also manage to hoist a lot of the logic out of the loops

Use 100ns delay for MaybeYield, noticed we often return almost immediately from the syscall so we end up wasting some cpu, instead we give up the cpu for the min waitable time (on my system, this is 0.5 ms)

Added a note about affinity mask/dynamic process affinity updates to threading_win

Add TextureFetchConstantsWritten
This commit is contained in:
chss95cs@gmail.com 2022-12-13 10:45:19 -08:00
parent 43d7fc5158
commit 82dcf3f951
9 changed files with 345 additions and 97 deletions

View File

@ -9,8 +9,8 @@
#include "xenia/base/memory.h" #include "xenia/base/memory.h"
#include "xenia/base/cvar.h" #include "xenia/base/cvar.h"
#include "xenia/base/platform.h"
#include "xenia/base/logging.h" #include "xenia/base/logging.h"
#include "xenia/base/platform.h"
#if XE_ARCH_ARM64 #if XE_ARCH_ARM64
#include <arm_neon.h> #include <arm_neon.h>
@ -173,7 +173,12 @@ static void vastcpy_impl_movdir64m(CacheLine* XE_RESTRICT physaddr,
_movdir64b(physaddr + i, rdmapping + i); _movdir64b(physaddr + i, rdmapping + i);
} }
} }
static void vastcpy_impl_repmovs(CacheLine* XE_RESTRICT physaddr,
CacheLine* XE_RESTRICT rdmapping,
uint32_t written_length) {
__movsq((unsigned long long*)physaddr, (unsigned long long*)rdmapping,
written_length / 8);
}
XE_COLD XE_COLD
static void first_vastcpy(CacheLine* XE_RESTRICT physaddr, static void first_vastcpy(CacheLine* XE_RESTRICT physaddr,
CacheLine* XE_RESTRICT rdmapping, CacheLine* XE_RESTRICT rdmapping,
@ -189,6 +194,9 @@ static void first_vastcpy(CacheLine* XE_RESTRICT physaddr,
if (amd64::GetFeatureFlags() & amd64::kX64EmitMovdir64M) { if (amd64::GetFeatureFlags() & amd64::kX64EmitMovdir64M) {
XELOGI("Selecting MOVDIR64M vastcpy."); XELOGI("Selecting MOVDIR64M vastcpy.");
dispatch_to_use = vastcpy_impl_movdir64m; dispatch_to_use = vastcpy_impl_movdir64m;
} else if (amd64::GetFeatureFlags() & amd64::kX64FastRepMovs) {
XELOGI("Selecting rep movs vastcpy.");
dispatch_to_use = vastcpy_impl_repmovs;
} else { } else {
XELOGI("Selecting generic AVX vastcpy."); XELOGI("Selecting generic AVX vastcpy.");
dispatch_to_use = vastcpy_impl_avx; dispatch_to_use = vastcpy_impl_avx;

View File

@ -60,6 +60,12 @@ namespace xe {
namespace threading { namespace threading {
void EnableAffinityConfiguration() { void EnableAffinityConfiguration() {
// chrispy: i don't think this is necessary,
// affinity always seems to be the system mask? research more
// also, maybe if ignore_thread_affinities is on we should use
// SetProcessAffinityUpdateMode to allow windows to dynamically update
// our process' affinity (by default windows cannot change the affinity itself
// at runtime, user code must do it)
HANDLE process_handle = GetCurrentProcess(); HANDLE process_handle = GetCurrentProcess();
DWORD_PTR process_affinity_mask; DWORD_PTR process_affinity_mask;
DWORD_PTR system_affinity_mask; DWORD_PTR system_affinity_mask;
@ -117,7 +123,7 @@ void set_name(const std::string_view name) {
// checked ntoskrnl, it does not modify delay, so we can place this as a // checked ntoskrnl, it does not modify delay, so we can place this as a
// constant and avoid creating a stack variable // constant and avoid creating a stack variable
static const LARGE_INTEGER sleepdelay0_for_maybeyield{{0LL}}; static const LARGE_INTEGER sleepdelay0_for_maybeyield{{~0u, -1}};
void MaybeYield() { void MaybeYield() {
#if 0 #if 0

View File

@ -1712,8 +1712,10 @@ void D3D12CommandProcessor::ShutdownContext() {
CommandProcessor::ShutdownContext(); CommandProcessor::ShutdownContext();
} }
// todo: bit-pack the bools and use bitarith to reduce branches
void D3D12CommandProcessor::WriteRegister(uint32_t index, uint32_t value) { XE_FORCEINLINE
void D3D12CommandProcessor::WriteRegisterForceinline(uint32_t index,
uint32_t value) {
__m128i to_rangecheck = _mm_set1_epi16(static_cast<short>(index)); __m128i to_rangecheck = _mm_set1_epi16(static_cast<short>(index));
__m128i lower_bounds = _mm_setr_epi16( __m128i lower_bounds = _mm_setr_epi16(
@ -1783,6 +1785,10 @@ void D3D12CommandProcessor::WriteRegister(uint32_t index, uint32_t value) {
return; return;
} }
} }
// todo: bit-pack the bools and use bitarith to reduce branches
void D3D12CommandProcessor::WriteRegister(uint32_t index, uint32_t value) {
WriteRegisterForceinline(index, value);
}
void D3D12CommandProcessor::WriteRegistersFromMem(uint32_t start_index, void D3D12CommandProcessor::WriteRegistersFromMem(uint32_t start_index,
uint32_t* base, uint32_t* base,
@ -1911,8 +1917,22 @@ void D3D12CommandProcessor::WriteRegisterRangeFromRing_WraparoundCase(
void D3D12CommandProcessor::WriteRegisterRangeFromRing(xe::RingBuffer* ring, void D3D12CommandProcessor::WriteRegisterRangeFromRing(xe::RingBuffer* ring,
uint32_t base, uint32_t base,
uint32_t num_registers) { uint32_t num_registers) {
WriteRegisterRangeFromRing_WithKnownBound<0, 0xFFFF>(ring, base, // WriteRegisterRangeFromRing_WithKnownBound<0, 0xFFFF>(ring, base,
num_registers); // num_registers);
RingBuffer::ReadRange range =
ring->BeginRead(num_registers * sizeof(uint32_t));
XE_LIKELY_IF(!range.second) {
WriteRegistersFromMemCommonSense(
base, reinterpret_cast<uint32_t*>(const_cast<uint8_t*>(range.first)),
num_registers);
ring->EndRead(range);
}
else {
return WriteRegisterRangeFromRing_WraparoundCase(ring, base, num_registers);
}
} }
template <uint32_t register_lower_bound, uint32_t register_upper_bound> template <uint32_t register_lower_bound, uint32_t register_upper_bound>
@ -1926,6 +1946,145 @@ constexpr bool bounds_may_have_bounds(uint32_t reg, uint32_t last_reg) {
bounds_may_have_reg<register_lower_bound, register_upper_bound>( bounds_may_have_reg<register_lower_bound, register_upper_bound>(
last_reg); last_reg);
} }
void D3D12CommandProcessor::WriteShaderConstantsFromMem(
uint32_t start_index, uint32_t* base, uint32_t num_registers) {
if (frame_open_) {
bool cbuffer_pixel_uptodate = cbuffer_binding_float_pixel_.up_to_date;
bool cbuffer_vertex_uptodate = cbuffer_binding_float_vertex_.up_to_date;
if (cbuffer_pixel_uptodate || cbuffer_vertex_uptodate) {
// super naive, could just do some bit magic and interval checking,
// but we just need this hoisted out of the copy so we do a bulk copy
// because its the actual load/swap/store we're getting murdered by
// this precheck followed by copy_and_swap_32_unaligned reduced the cpu
// usage from packettype0/writeregistersfrommem from 10-11% of cpu time
// spent on xenia to like 1%
// chrispy: todo, this can be reduced even further, should be split into
// two loops and should skip whole words, this could net us even bigger
// gains
uint32_t map_index = (start_index - XE_GPU_REG_SHADER_CONSTANT_000_X) / 4;
uint32_t end_map_index =
(start_index + num_registers - XE_GPU_REG_SHADER_CONSTANT_000_X) / 4;
if (!cbuffer_vertex_uptodate) {
if (256 >= end_map_index) {
goto skip_map_checks;
}
map_index = 256;
}
for (; map_index < end_map_index; ++map_index) {
uint32_t float_constant_index = map_index;
if (float_constant_index >= 256) {
float_constant_index -= 256;
if (current_float_constant_map_pixel_[float_constant_index >> 6] &
(1ull << (float_constant_index & 63))) {
cbuffer_pixel_uptodate = false;
if (!cbuffer_vertex_uptodate) {
break;
}
}
} else {
if (current_float_constant_map_vertex_[float_constant_index >> 6] &
(1ull << (float_constant_index & 63))) {
cbuffer_vertex_uptodate = false;
if (!cbuffer_pixel_uptodate) {
break;
} else {
map_index = 255; // skip to checking pixel
continue; // increment will put us at 256, then the check will
// happen
}
}
}
}
skip_map_checks:;
}
cbuffer_binding_float_pixel_.up_to_date = cbuffer_pixel_uptodate;
cbuffer_binding_float_vertex_.up_to_date = cbuffer_vertex_uptodate;
}
// maybe use non-temporal copy if possible...
copy_and_swap_32_unaligned(&register_file_->values[start_index], base,
num_registers);
}
void D3D12CommandProcessor::WriteBoolLoopFromMem(uint32_t start_index,
uint32_t* base,
uint32_t num_registers) {
cbuffer_binding_bool_loop_.up_to_date = false;
copy_and_swap_32_unaligned(&register_file_->values[start_index], base,
num_registers);
}
void D3D12CommandProcessor::WriteFetchFromMem(uint32_t start_index,
uint32_t* base,
uint32_t num_registers) {
cbuffer_binding_fetch_.up_to_date = false;
uint32_t first_fetch =
((start_index - XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0) / 6);
uint32_t last_fetch = // i think last_fetch should be inclusive if its modulo
// is nz...
(((start_index + num_registers) - XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0) /
6);
texture_cache_->TextureFetchConstantsWritten(first_fetch, last_fetch);
copy_and_swap_32_unaligned(&register_file_->values[start_index], base,
num_registers);
}
void D3D12CommandProcessor::WriteRegistersFromMemCommonSense(
uint32_t start_index, uint32_t* base, uint32_t num_registers) {
uint32_t end = start_index + num_registers;
uint32_t current_index = start_index;
auto get_end_before_qty = [&end, current_index](uint32_t regnum) {
return std::min<uint32_t>(regnum, end) - current_index;
};
#define DO_A_RANGE_CALLBACK(start_range, end_range, index, base, n) \
WriteRegisterRangeFromMem_WithKnownBound<(start_range), (end_range)>( \
index, base, n)
#define DO_A_RANGE(start_range, end_range, cb) \
if (current_index < (end_range)) { \
uint32_t ntowrite = get_end_before_qty(end_range); \
cb((start_range), (end_range), current_index, base, ntowrite); \
current_index += ntowrite; \
base += ntowrite; \
} \
if (current_index >= end) { \
return; \
}
if (start_index >= XE_GPU_REG_SHADER_CONSTANT_000_X) { // fairly common
goto shader_vars_start;
}
#define REGULAR_WRITE_CALLBACK(s, e, i, b, n) \
copy_and_swap_32_unaligned(&register_file_->values[i], b, n)
DO_A_RANGE(0, XE_GPU_REG_SCRATCH_REG0, REGULAR_WRITE_CALLBACK);
DO_A_RANGE(XE_GPU_REG_SCRATCH_REG0, XE_GPU_REG_DC_LUT_30_COLOR + 1,
DO_A_RANGE_CALLBACK);
DO_A_RANGE(XE_GPU_REG_DC_LUT_30_COLOR + 1, XE_GPU_REG_SHADER_CONSTANT_000_X,
REGULAR_WRITE_CALLBACK);
#define WRITE_SHADER_CONSTANTS_CALLBACK(start_range, end_range, index, base, \
n) \
WriteShaderConstantsFromMem(index, base, n)
shader_vars_start:
DO_A_RANGE(XE_GPU_REG_SHADER_CONSTANT_000_X,
XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0,
WRITE_SHADER_CONSTANTS_CALLBACK);
#define WRITE_FETCH_CONSTANTS_CALLBACK(str, er, ind, b, n) \
WriteFetchFromMem(ind, b, n)
DO_A_RANGE(XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0,
XE_GPU_REG_SHADER_CONSTANT_FETCH_31_5 + 1,
WRITE_FETCH_CONSTANTS_CALLBACK);
#define WRITE_BOOL_LOOP_CALLBACK(s, e, i, b, n) WriteBoolLoopFromMem(i, b, n)
DO_A_RANGE(XE_GPU_REG_SHADER_CONSTANT_BOOL_000_031,
XE_GPU_REG_SHADER_CONSTANT_LOOP_31 + 1, WRITE_BOOL_LOOP_CALLBACK);
DO_A_RANGE(XE_GPU_REG_SHADER_CONSTANT_LOOP_31 + 1, 65536,
REGULAR_WRITE_CALLBACK);
}
template <uint32_t register_lower_bound, uint32_t register_upper_bound> template <uint32_t register_lower_bound, uint32_t register_upper_bound>
XE_FORCEINLINE void XE_FORCEINLINE void
D3D12CommandProcessor::WriteRegisterRangeFromMem_WithKnownBound( D3D12CommandProcessor::WriteRegisterRangeFromMem_WithKnownBound(
@ -1935,12 +2094,21 @@ D3D12CommandProcessor::WriteRegisterRangeFromMem_WithKnownBound(
constexpr auto bounds_has_bounds = constexpr auto bounds_has_bounds =
bounds_may_have_bounds<register_lower_bound, register_upper_bound>; bounds_may_have_bounds<register_lower_bound, register_upper_bound>;
bool cbuffer_pixel_uptodate = cbuffer_binding_float_pixel_.up_to_date;
bool cbuffer_vertex_uptodate = cbuffer_binding_float_vertex_.up_to_date;
bool skip_uptodate_checks =
(!cbuffer_pixel_uptodate && !cbuffer_vertex_uptodate) || (!frame_open_);
for (uint32_t i = 0; i < num_registers; ++i) { for (uint32_t i = 0; i < num_registers; ++i) {
uint32_t data = xe::load_and_swap<uint32_t>(range + i); uint32_t data = xe::load_and_swap<uint32_t>(range + i);
uint32_t index = base + i;
{ uint32_t value = data;
uint32_t index = base + i; // cant if constexpr this one or we get unreferenced label errors, and if we
uint32_t value = data; // move the label into the else we get errors about a jump from one if
// constexpr into another
if (register_lower_bound == 0 && register_upper_bound == 0xFFFF) {
D3D12CommandProcessor::WriteRegisterForceinline(index, value);
} else {
XE_MSVC_ASSUME(index >= register_lower_bound && XE_MSVC_ASSUME(index >= register_lower_bound &&
index < register_upper_bound); index < register_upper_bound);
register_file_->values[index].u32 = value; register_file_->values[index].u32 = value;
@ -1968,27 +2136,13 @@ D3D12CommandProcessor::WriteRegisterRangeFromMem_WithKnownBound(
HandleSpecialRegisterWrite(index, value); HandleSpecialRegisterWrite(index, value);
goto write_done; goto write_done;
} }
XE_MSVC_ASSUME(index >= register_lower_bound &&
index < register_upper_bound);
if constexpr (bounds_has_bounds(XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0,
XE_GPU_REG_SHADER_CONSTANT_FETCH_31_5)) {
if (index >= XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0 &&
index <= XE_GPU_REG_SHADER_CONSTANT_FETCH_31_5) {
cbuffer_binding_fetch_.up_to_date = false;
// texture cache is never nullptr
texture_cache_->TextureFetchConstantWritten(
(index - XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0) / 6);
goto write_done;
}
}
XE_MSVC_ASSUME(index >= register_lower_bound && XE_MSVC_ASSUME(index >= register_lower_bound &&
index < register_upper_bound); index < register_upper_bound);
if constexpr (bounds_has_bounds(XE_GPU_REG_SHADER_CONSTANT_000_X, if constexpr (bounds_has_bounds(XE_GPU_REG_SHADER_CONSTANT_000_X,
XE_GPU_REG_SHADER_CONSTANT_511_W)) { XE_GPU_REG_SHADER_CONSTANT_511_W)) {
if (index >= XE_GPU_REG_SHADER_CONSTANT_000_X && if (index >= XE_GPU_REG_SHADER_CONSTANT_000_X &&
index <= XE_GPU_REG_SHADER_CONSTANT_511_W) { index <= XE_GPU_REG_SHADER_CONSTANT_511_W) {
if (frame_open_) { if (!skip_uptodate_checks) {
uint32_t float_constant_index = uint32_t float_constant_index =
(index - XE_GPU_REG_SHADER_CONSTANT_000_X) >> 2; (index - XE_GPU_REG_SHADER_CONSTANT_000_X) >> 2;
if (float_constant_index >= 256) { if (float_constant_index >= 256) {
@ -2002,12 +2156,30 @@ D3D12CommandProcessor::WriteRegisterRangeFromMem_WithKnownBound(
6] & 6] &
(1ull << (float_constant_index & 63))) { (1ull << (float_constant_index & 63))) {
cbuffer_binding_float_vertex_.up_to_date = false; cbuffer_binding_float_vertex_.up_to_date = false;
if (!cbuffer_pixel_uptodate) {
skip_uptodate_checks = true;
}
} }
} }
} }
goto write_done; goto write_done;
} }
} }
XE_MSVC_ASSUME(index >= register_lower_bound &&
index < register_upper_bound);
if constexpr (bounds_has_bounds(XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0,
XE_GPU_REG_SHADER_CONSTANT_FETCH_31_5)) {
if (index >= XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0 &&
index <= XE_GPU_REG_SHADER_CONSTANT_FETCH_31_5) {
cbuffer_binding_fetch_.up_to_date = false;
// texture cache is never nullptr
texture_cache_->TextureFetchConstantWritten(
(index - XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0) / 6);
goto write_done;
}
}
XE_MSVC_ASSUME(index >= register_lower_bound && XE_MSVC_ASSUME(index >= register_lower_bound &&
index < register_upper_bound); index < register_upper_bound);
if constexpr (bounds_has_bounds(XE_GPU_REG_SHADER_CONSTANT_BOOL_000_031, if constexpr (bounds_has_bounds(XE_GPU_REG_SHADER_CONSTANT_BOOL_000_031,

View File

@ -211,12 +211,21 @@ class D3D12CommandProcessor final : public CommandProcessor {
protected: protected:
bool SetupContext() override; bool SetupContext() override;
void ShutdownContext() override; void ShutdownContext() override;
XE_FORCEINLINE
void WriteRegisterForceinline(uint32_t index, uint32_t value);
void WriteRegister(uint32_t index, uint32_t value) override; void WriteRegister(uint32_t index, uint32_t value) override;
XE_FORCEINLINE XE_FORCEINLINE
virtual void WriteRegistersFromMem(uint32_t start_index, uint32_t* base, virtual void WriteRegistersFromMem(uint32_t start_index, uint32_t* base,
uint32_t num_registers) override; uint32_t num_registers) override;
//SHADER_CONSTANT_blah_XWYZ
void WriteShaderConstantsFromMem(uint32_t start_index, uint32_t* base,
uint32_t num_registers);
void WriteBoolLoopFromMem(uint32_t start_index, uint32_t* base,
uint32_t num_registers);
void WriteFetchFromMem(uint32_t start_index, uint32_t* base,
uint32_t num_registers);
void WriteRegistersFromMemCommonSense(uint32_t start_index, uint32_t* base,
uint32_t num_registers) ;
template <uint32_t register_lower_bound, uint32_t register_upper_bound> template <uint32_t register_lower_bound, uint32_t register_upper_bound>
XE_FORCEINLINE void WriteRegisterRangeFromMem_WithKnownBound( XE_FORCEINLINE void WriteRegisterRangeFromMem_WithKnownBound(
uint32_t start_index, uint32_t* base, uint32_t num_registers); uint32_t start_index, uint32_t* base, uint32_t num_registers);

View File

@ -551,30 +551,60 @@ void GetHostViewportInfo(GetViewportInfoArgs* XE_RESTRICT args,
viewport_info_out.ndc_offset[i] = ndc_offset[i]; viewport_info_out.ndc_offset[i] = ndc_offset[i];
} }
} }
void GetScissor(const RegisterFile& regs, Scissor& scissor_out, template <bool clamp_to_surface_pitch>
bool clamp_to_surface_pitch) { XE_NOINLINE static void GetScissorTmpl(const RegisterFile& XE_RESTRICT regs,
Scissor& XE_RESTRICT scissor_out) {
auto pa_sc_window_scissor_tl = regs.Get<reg::PA_SC_WINDOW_SCISSOR_TL>(); auto pa_sc_window_scissor_tl = regs.Get<reg::PA_SC_WINDOW_SCISSOR_TL>();
int32_t tl_x = int32_t(pa_sc_window_scissor_tl.tl_x);
int32_t tl_y = int32_t(pa_sc_window_scissor_tl.tl_y);
auto pa_sc_window_scissor_br = regs.Get<reg::PA_SC_WINDOW_SCISSOR_BR>(); auto pa_sc_window_scissor_br = regs.Get<reg::PA_SC_WINDOW_SCISSOR_BR>();
int32_t br_x = int32_t(pa_sc_window_scissor_br.br_x); auto pa_sc_window_offset = regs.Get<reg::PA_SC_WINDOW_OFFSET>();
int32_t br_y = int32_t(pa_sc_window_scissor_br.br_y); auto pa_sc_screen_scissor_tl = regs.Get<reg::PA_SC_SCREEN_SCISSOR_TL>();
if (!pa_sc_window_scissor_tl.window_offset_disable) { auto pa_sc_screen_scissor_br = regs.Get<reg::PA_SC_SCREEN_SCISSOR_BR>();
auto pa_sc_window_offset = regs.Get<reg::PA_SC_WINDOW_OFFSET>(); uint32_t surface_pitch = 0;
tl_x += pa_sc_window_offset.window_x_offset; if constexpr (clamp_to_surface_pitch) {
tl_y += pa_sc_window_offset.window_y_offset; surface_pitch = regs.Get<reg::RB_SURFACE_INFO>().surface_pitch;
br_x += pa_sc_window_offset.window_x_offset;
br_y += pa_sc_window_offset.window_y_offset;
} }
uint32_t pa_sc_window_scissor_tl_tl_x = pa_sc_window_scissor_tl.tl_x,
pa_sc_window_scissor_tl_tl_y = pa_sc_window_scissor_tl.tl_y,
pa_sc_window_scissor_br_br_x = pa_sc_window_scissor_br.br_x,
pa_sc_window_scissor_br_br_y = pa_sc_window_scissor_br.br_y,
pa_sc_window_offset_window_x_offset =
pa_sc_window_offset.window_x_offset,
pa_sc_window_offset_window_y_offset =
pa_sc_window_offset.window_y_offset,
pa_sc_screen_scissor_tl_tl_x = pa_sc_screen_scissor_tl.tl_x,
pa_sc_screen_scissor_tl_tl_y = pa_sc_screen_scissor_tl.tl_y,
pa_sc_screen_scissor_br_br_x = pa_sc_screen_scissor_br.br_x,
pa_sc_screen_scissor_br_br_y = pa_sc_screen_scissor_br.br_y;
int32_t tl_x = int32_t(pa_sc_window_scissor_tl_tl_x);
int32_t tl_y = int32_t(pa_sc_window_scissor_tl_tl_y);
int32_t br_x = int32_t(pa_sc_window_scissor_br_br_x);
int32_t br_y = int32_t(pa_sc_window_scissor_br_br_y);
// chrispy: put this here to make it clear that the shift by 31 is extracting
// this field
XE_MAYBE_UNUSED
uint32_t window_offset_disable_reference =
pa_sc_window_scissor_tl.window_offset_disable;
int32_t window_offset_disable_mask =
~(static_cast<int32_t>(pa_sc_window_scissor_tl.value) >> 31);
// if (!pa_sc_window_scissor_tl.window_offset_disable) {
tl_x += pa_sc_window_offset_window_x_offset & window_offset_disable_mask;
tl_y += pa_sc_window_offset_window_y_offset & window_offset_disable_mask;
br_x += pa_sc_window_offset_window_x_offset & window_offset_disable_mask;
br_y += pa_sc_window_offset_window_y_offset & window_offset_disable_mask;
//}
// Screen scissor is not used by Direct3D 9 (always 0, 0 to 8192, 8192), but // Screen scissor is not used by Direct3D 9 (always 0, 0 to 8192, 8192), but
// still handled here for completeness. // still handled here for completeness.
auto pa_sc_screen_scissor_tl = regs.Get<reg::PA_SC_SCREEN_SCISSOR_TL>();
tl_x = std::max(tl_x, int32_t(pa_sc_screen_scissor_tl.tl_x)); tl_x = std::max(tl_x, int32_t(pa_sc_screen_scissor_tl_tl_x));
tl_y = std::max(tl_y, int32_t(pa_sc_screen_scissor_tl.tl_y)); tl_y = std::max(tl_y, int32_t(pa_sc_screen_scissor_tl_tl_y));
auto pa_sc_screen_scissor_br = regs.Get<reg::PA_SC_SCREEN_SCISSOR_BR>();
br_x = std::min(br_x, int32_t(pa_sc_screen_scissor_br.br_x)); br_x = std::min(br_x, int32_t(pa_sc_screen_scissor_br_br_x));
br_y = std::min(br_y, int32_t(pa_sc_screen_scissor_br.br_y)); br_y = std::min(br_y, int32_t(pa_sc_screen_scissor_br_br_y));
if (clamp_to_surface_pitch) { if constexpr (clamp_to_surface_pitch) {
// Clamp the horizontal scissor to surface_pitch for safety, in case that's // Clamp the horizontal scissor to surface_pitch for safety, in case that's
// not done by the guest for some reason (it's not when doing draws without // not done by the guest for some reason (it's not when doing draws without
// clipping in Direct3D 9, for instance), to prevent overflow - this is // clipping in Direct3D 9, for instance), to prevent overflow - this is
@ -582,7 +612,7 @@ void GetScissor(const RegisterFile& regs, Scissor& scissor_out,
// rasterization without render target width at all (pixel shader // rasterization without render target width at all (pixel shader
// interlock-based custom RB implementations) and using conventional render // interlock-based custom RB implementations) and using conventional render
// targets, but padded to EDRAM tiles. // targets, but padded to EDRAM tiles.
uint32_t surface_pitch = regs.Get<reg::RB_SURFACE_INFO>().surface_pitch;
tl_x = std::min(tl_x, int32_t(surface_pitch)); tl_x = std::min(tl_x, int32_t(surface_pitch));
br_x = std::min(br_x, int32_t(surface_pitch)); br_x = std::min(br_x, int32_t(surface_pitch));
} }
@ -601,6 +631,15 @@ void GetScissor(const RegisterFile& regs, Scissor& scissor_out,
scissor_out.extent[1] = uint32_t(br_y - tl_y); scissor_out.extent[1] = uint32_t(br_y - tl_y);
} }
void GetScissor(const RegisterFile& XE_RESTRICT regs,
Scissor& XE_RESTRICT scissor_out, bool clamp_to_surface_pitch) {
if (clamp_to_surface_pitch) {
return GetScissorTmpl<true>(regs, scissor_out);
} else {
return GetScissorTmpl<false>(regs, scissor_out);
}
}
uint32_t GetNormalizedColorMask(const RegisterFile& regs, uint32_t GetNormalizedColorMask(const RegisterFile& regs,
uint32_t pixel_shader_writes_color_targets) { uint32_t pixel_shader_writes_color_targets) {
if (regs.Get<reg::RB_MODECONTROL>().edram_mode != if (regs.Get<reg::RB_MODECONTROL>().edram_mode !=
@ -863,7 +902,7 @@ bool GetResolveInfo(const RegisterFile& regs, const Memory& memory,
y1 = y0 + int32_t(xenos::kMaxResolveSize); y1 = y0 + int32_t(xenos::kMaxResolveSize);
} }
// fails in forza horizon 1 // fails in forza horizon 1
//x0 is 0, x1 is 0x100, y0 is 0x100, y1 is 0x100 // x0 is 0, x1 is 0x100, y0 is 0x100, y1 is 0x100
assert_true(x0 <= x1 && y0 <= y1); assert_true(x0 <= x1 && y0 <= y1);
if (x0 >= x1 || y0 >= y1) { if (x0 >= x1 || y0 >= y1) {
XELOGE("Resolve region is empty"); XELOGE("Resolve region is empty");
@ -1103,7 +1142,7 @@ bool GetResolveInfo(const RegisterFile& regs, const Memory& memory,
info_out.rb_depth_clear = regs[XE_GPU_REG_RB_DEPTH_CLEAR].u32; info_out.rb_depth_clear = regs[XE_GPU_REG_RB_DEPTH_CLEAR].u32;
info_out.rb_color_clear = regs[XE_GPU_REG_RB_COLOR_CLEAR].u32; info_out.rb_color_clear = regs[XE_GPU_REG_RB_COLOR_CLEAR].u32;
info_out.rb_color_clear_lo = regs[XE_GPU_REG_RB_COLOR_CLEAR_LO].u32; info_out.rb_color_clear_lo = regs[XE_GPU_REG_RB_COLOR_CLEAR_LO].u32;
#if 0 #if 0
XELOGD( XELOGD(
"Resolve: {},{} <= x,y < {},{}, {} -> {} at 0x{:08X} (potentially " "Resolve: {},{} <= x,y < {},{}, {} -> {} at 0x{:08X} (potentially "
"modified memory range 0x{:08X} to 0x{:08X})", "modified memory range 0x{:08X} to 0x{:08X})",
@ -1114,7 +1153,7 @@ bool GetResolveInfo(const RegisterFile& regs, const Memory& memory,
xenos::ColorRenderTargetFormat(color_edram_info.format)), xenos::ColorRenderTargetFormat(color_edram_info.format)),
FormatInfo::GetName(dest_format), rb_copy_dest_base, copy_dest_extent_start, FormatInfo::GetName(dest_format), rb_copy_dest_base, copy_dest_extent_start,
copy_dest_extent_end); copy_dest_extent_end);
#endif #endif
return true; return true;
} }
XE_MSVC_OPTIMIZE_REVERT() XE_MSVC_OPTIMIZE_REVERT()

View File

@ -433,13 +433,15 @@ struct GetViewportInfoArgs {
void GetHostViewportInfo(GetViewportInfoArgs* XE_RESTRICT args, void GetHostViewportInfo(GetViewportInfoArgs* XE_RESTRICT args,
ViewportInfo& viewport_info_out); ViewportInfo& viewport_info_out);
struct Scissor { struct alignas(16) Scissor {
// Offset from render target UV = 0 to +UV. // Offset from render target UV = 0 to +UV.
uint32_t offset[2]; uint32_t offset[2];
// Extent can be zero. // Extent can be zero.
uint32_t extent[2]; uint32_t extent[2];
}; };
void GetScissor(const RegisterFile& regs, Scissor& scissor_out,
void GetScissor(const RegisterFile& XE_RESTRICT regs,
Scissor& XE_RESTRICT scissor_out,
bool clamp_to_surface_pitch = true); bool clamp_to_surface_pitch = true);
// Returns the color component write mask for the draw command taking into // Returns the color component write mask for the draw command taking into

View File

@ -109,7 +109,7 @@ XE_NOINLINE
XE_COLD XE_COLD
bool HitUnimplementedOpcode(uint32_t opcode, uint32_t count) XE_RESTRICT; bool HitUnimplementedOpcode(uint32_t opcode, uint32_t count) XE_RESTRICT;
XE_NOINLINE XE_FORCEINLINE
XE_NOALIAS XE_NOALIAS
uint32_t GetCurrentRingReadCount(); uint32_t GetCurrentRingReadCount();

View File

@ -4,7 +4,6 @@ void COMMAND_PROCESSOR::ExecuteIndirectBuffer(uint32_t ptr,
uint32_t count) XE_RESTRICT { uint32_t count) XE_RESTRICT {
SCOPE_profile_cpu_f("gpu"); SCOPE_profile_cpu_f("gpu");
trace_writer_.WriteIndirectBufferStart(ptr, count * sizeof(uint32_t)); trace_writer_.WriteIndirectBufferStart(ptr, count * sizeof(uint32_t));
if (count != 0) { if (count != 0) {
RingBuffer old_reader = reader_; RingBuffer old_reader = reader_;
@ -32,10 +31,9 @@ void COMMAND_PROCESSOR::ExecuteIndirectBuffer(uint32_t ptr,
trace_writer_.WriteIndirectBufferEnd(); trace_writer_.WriteIndirectBufferEnd();
reader_ = old_reader; reader_ = old_reader;
} else { } else {
//rare, but i've seen it happen! (and then a division by 0 occurs) // rare, but i've seen it happen! (and then a division by 0 occurs)
return; return;
} }
} }
bool COMMAND_PROCESSOR::ExecutePacket() { bool COMMAND_PROCESSOR::ExecutePacket() {
@ -88,8 +86,9 @@ bool COMMAND_PROCESSOR::ExecutePacketType0_CountOverflow(uint32_t count) {
count * sizeof(uint32_t)); count * sizeof(uint32_t));
return false; return false;
} }
/* /*
Todo: optimize this function this one along with execute packet type III are the most frequently called functions for PM4 Todo: optimize this function this one along with execute packet type III are
the most frequently called functions for PM4
*/ */
XE_NOINLINE XE_NOINLINE
bool COMMAND_PROCESSOR::ExecutePacketType0(uint32_t packet) XE_RESTRICT { bool COMMAND_PROCESSOR::ExecutePacketType0(uint32_t packet) XE_RESTRICT {
@ -99,7 +98,6 @@ bool COMMAND_PROCESSOR::ExecutePacketType0(uint32_t packet) XE_RESTRICT {
uint32_t count = ((packet >> 16) & 0x3FFF) + 1; uint32_t count = ((packet >> 16) & 0x3FFF) + 1;
if (COMMAND_PROCESSOR::GetCurrentRingReadCount() >= if (COMMAND_PROCESSOR::GetCurrentRingReadCount() >=
count * sizeof(uint32_t)) { count * sizeof(uint32_t)) {
trace_writer_.WritePacketStart(uint32_t(reader_.read_ptr() - 4), 1 + count); trace_writer_.WritePacketStart(uint32_t(reader_.read_ptr() - 4), 1 + count);
@ -143,7 +141,7 @@ bool COMMAND_PROCESSOR::ExecutePacketType2(uint32_t packet) XE_RESTRICT {
trace_writer_.WritePacketEnd(); trace_writer_.WritePacketEnd();
return true; return true;
} }
XE_NOINLINE XE_FORCEINLINE
XE_NOALIAS XE_NOALIAS
uint32_t COMMAND_PROCESSOR::GetCurrentRingReadCount() { uint32_t COMMAND_PROCESSOR::GetCurrentRingReadCount() {
return reader_.read_count(); return reader_.read_count();
@ -446,41 +444,46 @@ bool COMMAND_PROCESSOR::ExecutePacketType3_INDIRECT_BUFFER(
return true; return true;
} }
XE_NOINLINE /*
chrispy: this is fine to inline, as a noinline function it compiled down
to 54 bytes
*/
static bool MatchValueAndRef(uint32_t value, uint32_t ref, uint32_t wait_info) { static bool MatchValueAndRef(uint32_t value, uint32_t ref, uint32_t wait_info) {
/* // smaller code is generated than the #else path, although whether it is faster
Todo: should subtract values from each other twice with the sides inverted and then create a mask from the sign bits // i do not know. i don't think games do an enormous number of cond_write
then use the wait_info value in order to select the bits that correctly implement the condition // though, so we have picked
If neither subtraction has the signbit set then that means the value is equal // the path with the smaller codegen.
*/ // we do technically have more instructions executed vs the switch case method,
bool matched = false; // but we have no mispredicts and most of our instructions are 0.25/0.3
switch (wait_info & 0x7) { // throughput
case 0x0: // Never. #if 1
matched = false; uint32_t value_minus_ref =
break; static_cast<uint32_t>(static_cast<int32_t>(value - ref) >> 31);
case 0x1: // Less than reference. uint32_t ref_minus_value =
matched = value < ref; static_cast<uint32_t>(static_cast<int32_t>(ref - value) >> 31);
break; uint32_t eqmask = ~(value_minus_ref | ref_minus_value);
case 0x2: // Less than or equal to reference. uint32_t nemask = (value_minus_ref | ref_minus_value);
matched = value <= ref;
break; uint32_t value_lt_mask = value_minus_ref;
case 0x3: // Equal to reference. uint32_t value_gt_mask = ref_minus_value;
matched = value == ref; uint32_t value_lte_mask = value_lt_mask | eqmask;
break; uint32_t value_gte_mask = value_gt_mask | eqmask;
case 0x4: // Not equal to reference.
matched = value != ref; uint32_t bits_for_selecting =
break; (value_lt_mask & (1 << 1)) | (value_lte_mask & (1 << 2)) |
case 0x5: // Greater than or equal to reference. (eqmask & (1 << 3)) | (nemask & (1 << 4)) | (value_gte_mask & (1 << 5)) |
matched = value >= ref; (value_gt_mask & (1 << 6)) | (1 << 7);
break;
case 0x6: // Greater than reference. return (bits_for_selecting >> (wait_info & 7)) & 1;
matched = value > ref;
break; #else
case 0x7: // Always
matched = true; return ((((value < ref) << 1) | ((value <= ref) << 2) |
break; ((value == ref) << 3) | ((value != ref) << 4) |
} ((value >= ref) << 5) | ((value > ref) << 6) | (1 << 7)) >>
return matched; (wait_info & 7)) &
1;
#endif
} }
XE_NOINLINE XE_NOINLINE
bool COMMAND_PROCESSOR::ExecutePacketType3_WAIT_REG_MEM( bool COMMAND_PROCESSOR::ExecutePacketType3_WAIT_REG_MEM(
@ -1128,7 +1131,7 @@ bool COMMAND_PROCESSOR::ExecutePacketType3_VIZ_QUERY(
} }
uint32_t COMMAND_PROCESSOR::ExecutePrimaryBuffer(uint32_t read_index, uint32_t COMMAND_PROCESSOR::ExecutePrimaryBuffer(uint32_t read_index,
uint32_t write_index) { uint32_t write_index) {
SCOPE_profile_cpu_f("gpu"); SCOPE_profile_cpu_f("gpu");
#if XE_ENABLE_TRACE_WRITER_INSTRUMENTATION == 1 #if XE_ENABLE_TRACE_WRITER_INSTRUMENTATION == 1
// If we have a pending trace stream open it now. That way we ensure we get // If we have a pending trace stream open it now. That way we ensure we get

View File

@ -104,6 +104,15 @@ class TextureCache {
void TextureFetchConstantWritten(uint32_t index) { void TextureFetchConstantWritten(uint32_t index) {
texture_bindings_in_sync_ &= ~(UINT32_C(1) << index); texture_bindings_in_sync_ &= ~(UINT32_C(1) << index);
} }
void TextureFetchConstantsWritten(uint32_t first_index, uint32_t last_index) {
// generate a mask of all bits from before the first index, and xor it with
// all bits before the last index this produces a mask covering only the
// bits between first and last
uint32_t res = ((1U << first_index) - 1) ^ ((1U << last_index) - 1);
// todo: check that this is right
texture_bindings_in_sync_ &= ~res;
}
virtual void RequestTextures(uint32_t used_texture_mask); virtual void RequestTextures(uint32_t used_texture_mask);