diff --git a/src/xenia/base/memory.cc b/src/xenia/base/memory.cc index b83e545d2..4fb537226 100644 --- a/src/xenia/base/memory.cc +++ b/src/xenia/base/memory.cc @@ -9,8 +9,8 @@ #include "xenia/base/memory.h" #include "xenia/base/cvar.h" -#include "xenia/base/platform.h" #include "xenia/base/logging.h" +#include "xenia/base/platform.h" #if XE_ARCH_ARM64 #include @@ -59,7 +59,7 @@ static void XeCopy16384StreamingAVX(CacheLine* XE_RESTRICT to, CacheLine* dest4 = to + (NUM_CACHELINES_IN_PAGE * 3); CacheLine* src4 = from + (NUM_CACHELINES_IN_PAGE * 3); - + for (uint32_t i = 0; i < num_lines_for_8k; ++i) { xe::swcache::CacheLine line0, line1, line2, line3; @@ -173,7 +173,12 @@ static void vastcpy_impl_movdir64m(CacheLine* XE_RESTRICT physaddr, _movdir64b(physaddr + i, rdmapping + i); } } - +static void vastcpy_impl_repmovs(CacheLine* XE_RESTRICT physaddr, + CacheLine* XE_RESTRICT rdmapping, + uint32_t written_length) { + __movsq((unsigned long long*)physaddr, (unsigned long long*)rdmapping, + written_length / 8); +} XE_COLD static void first_vastcpy(CacheLine* XE_RESTRICT physaddr, CacheLine* XE_RESTRICT rdmapping, @@ -189,6 +194,9 @@ static void first_vastcpy(CacheLine* XE_RESTRICT physaddr, if (amd64::GetFeatureFlags() & amd64::kX64EmitMovdir64M) { XELOGI("Selecting MOVDIR64M vastcpy."); dispatch_to_use = vastcpy_impl_movdir64m; + } else if (amd64::GetFeatureFlags() & amd64::kX64FastRepMovs) { + XELOGI("Selecting rep movs vastcpy."); + dispatch_to_use = vastcpy_impl_repmovs; } else { XELOGI("Selecting generic AVX vastcpy."); dispatch_to_use = vastcpy_impl_avx; @@ -301,15 +309,64 @@ void copy_and_swap_32_unaligned(void* dest_ptr, const void* src_ptr, size_t count) { auto dest = reinterpret_cast(dest_ptr); auto src = reinterpret_cast(src_ptr); - __m128i shufmask = - _mm_set_epi8(0x0C, 0x0D, 0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B, 0x04, 0x05, - 0x06, 0x07, 0x00, 0x01, 0x02, 0x03); - size_t i; - for (i = 0; i + 4 <= count; i += 4) { - __m128i input = _mm_loadu_si128(reinterpret_cast(&src[i])); - __m128i output = _mm_shuffle_epi8(input, shufmask); - _mm_storeu_si128(reinterpret_cast<__m128i*>(&dest[i]), output); + // chrispy: this optimization mightt backfire if our unaligned load spans two + // cachelines... 
which it probably will
+  if (amd64::GetFeatureFlags() & amd64::kX64EmitAVX2) {
+    __m256i shufmask = _mm256_set_epi8(
+        0x0C, 0x0D, 0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B, 0x04, 0x05, 0x06, 0x07,
+        0x00, 0x01, 0x02, 0x03, 0x0C, 0x0D, 0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B,
+        0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03);
+    // with vpshufb being a 0.5-throughput instruction, it makes the most sense
+    // to double up on our iters
+    for (i = 0; i + 16 <= count; i += 16) {
+      __m256i input1 =
+          _mm256_loadu_si256(reinterpret_cast<const __m256i*>(&src[i]));
+      __m256i input2 =
+          _mm256_loadu_si256(reinterpret_cast<const __m256i*>(&src[i + 8]));
+
+      __m256i output1 = _mm256_shuffle_epi8(input1, shufmask);
+      __m256i output2 = _mm256_shuffle_epi8(input2, shufmask);
+      // chrispy: todo, benchmark this w/ and w/out these prefetches here on
+      // multiple machines; finding a good distance for prefetchw in particular
+      // is probably important for when we're writing across 2 cachelines
+#if 0
+      if (i + 48 <= count) {
+        swcache::PrefetchNTA(&src[i + 32]);
+        if (amd64::GetFeatureFlags() & amd64::kX64EmitPrefetchW) {
+          swcache::PrefetchW(&dest[i + 32]);
+        }
+      }
+#endif
+      _mm256_storeu_si256(reinterpret_cast<__m256i*>(&dest[i]), output1);
+      _mm256_storeu_si256(reinterpret_cast<__m256i*>(&dest[i + 8]), output2);
+    }
+    if (i + 8 <= count) {
+      __m256i input =
+          _mm256_loadu_si256(reinterpret_cast<const __m256i*>(&src[i]));
+      __m256i output = _mm256_shuffle_epi8(input, shufmask);
+      _mm256_storeu_si256(reinterpret_cast<__m256i*>(&dest[i]), output);
+      i += 8;
+    }
+    if (i + 4 <= count) {
+      __m128i input =
+          _mm_loadu_si128(reinterpret_cast<const __m128i*>(&src[i]));
+      __m128i output =
+          _mm_shuffle_epi8(input, _mm256_castsi256_si128(shufmask));
+      _mm_storeu_si128(reinterpret_cast<__m128i*>(&dest[i]), output);
+      i += 4;
+    }
+  } else {
+    __m128i shufmask =
+        _mm_set_epi8(0x0C, 0x0D, 0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B, 0x04, 0x05,
+                     0x06, 0x07, 0x00, 0x01, 0x02, 0x03);
+
+    for (i = 0; i + 4 <= count; i += 4) {
+      __m128i input =
+          _mm_loadu_si128(reinterpret_cast<const __m128i*>(&src[i]));
+      __m128i output = _mm_shuffle_epi8(input, shufmask);
+      _mm_storeu_si128(reinterpret_cast<__m128i*>(&dest[i]), output);
+    }
   }
   XE_WORKAROUND_CONSTANT_RETURN_IF(count % 4 == 0);
   for (; i < count; ++i) {  // handle residual elements
diff --git a/src/xenia/base/threading_win.cc b/src/xenia/base/threading_win.cc
index a8aa7889c..88ecb145c 100644
--- a/src/xenia/base/threading_win.cc
+++ b/src/xenia/base/threading_win.cc
@@ -60,6 +60,12 @@ namespace xe {
 namespace threading {
 
 void EnableAffinityConfiguration() {
+  // chrispy: i don't think this is necessary,
+  // affinity always seems to be the system mask?
research more + // also, maybe if ignore_thread_affinities is on we should use + // SetProcessAffinityUpdateMode to allow windows to dynamically update + // our process' affinity (by default windows cannot change the affinity itself + // at runtime, user code must do it) HANDLE process_handle = GetCurrentProcess(); DWORD_PTR process_affinity_mask; DWORD_PTR system_affinity_mask; @@ -117,7 +123,7 @@ void set_name(const std::string_view name) { // checked ntoskrnl, it does not modify delay, so we can place this as a // constant and avoid creating a stack variable -static const LARGE_INTEGER sleepdelay0_for_maybeyield{{0LL}}; +static const LARGE_INTEGER sleepdelay0_for_maybeyield{{~0u, -1}}; void MaybeYield() { #if 0 diff --git a/src/xenia/gpu/d3d12/d3d12_command_processor.cc b/src/xenia/gpu/d3d12/d3d12_command_processor.cc index fc236aa6a..3d31e50de 100644 --- a/src/xenia/gpu/d3d12/d3d12_command_processor.cc +++ b/src/xenia/gpu/d3d12/d3d12_command_processor.cc @@ -1712,8 +1712,10 @@ void D3D12CommandProcessor::ShutdownContext() { CommandProcessor::ShutdownContext(); } -// todo: bit-pack the bools and use bitarith to reduce branches -void D3D12CommandProcessor::WriteRegister(uint32_t index, uint32_t value) { + +XE_FORCEINLINE +void D3D12CommandProcessor::WriteRegisterForceinline(uint32_t index, + uint32_t value) { __m128i to_rangecheck = _mm_set1_epi16(static_cast(index)); __m128i lower_bounds = _mm_setr_epi16( @@ -1783,14 +1785,16 @@ void D3D12CommandProcessor::WriteRegister(uint32_t index, uint32_t value) { return; } } +// todo: bit-pack the bools and use bitarith to reduce branches +void D3D12CommandProcessor::WriteRegister(uint32_t index, uint32_t value) { + WriteRegisterForceinline(index, value); +} void D3D12CommandProcessor::WriteRegistersFromMem(uint32_t start_index, uint32_t* base, uint32_t num_registers) { - for (uint32_t i = 0; i < num_registers; ++i) { - uint32_t data = xe::load_and_swap(base + i); - D3D12CommandProcessor::WriteRegister(start_index + i, data); - } + WriteRegisterRangeFromMem_WithKnownBound<0, 0xFFFF>(start_index, base, + num_registers); } void D3D12CommandProcessor::WriteALURangeFromRing(xe::RingBuffer* ring, @@ -1911,8 +1915,19 @@ void D3D12CommandProcessor::WriteRegisterRangeFromRing_WraparoundCase( void D3D12CommandProcessor::WriteRegisterRangeFromRing(xe::RingBuffer* ring, uint32_t base, uint32_t num_registers) { - WriteRegisterRangeFromRing_WithKnownBound<0, 0xFFFF>(ring, base, - num_registers); + RingBuffer::ReadRange range = + ring->BeginRead(num_registers * sizeof(uint32_t)); + + XE_LIKELY_IF(!range.second) { + WriteRegistersFromMem( + base, reinterpret_cast(const_cast(range.first)), + num_registers); + + ring->EndRead(range); + } + else { + return WriteRegisterRangeFromRing_WraparoundCase(ring, base, num_registers); + } } template @@ -1926,101 +1941,167 @@ constexpr bool bounds_may_have_bounds(uint32_t reg, uint32_t last_reg) { bounds_may_have_reg( last_reg); } -template -XE_FORCEINLINE void -D3D12CommandProcessor::WriteRegisterRangeFromMem_WithKnownBound( - uint32_t base, uint32_t* range, uint32_t num_registers) { - constexpr auto bounds_has_reg = - bounds_may_have_reg; - constexpr auto bounds_has_bounds = - bounds_may_have_bounds; +XE_FORCEINLINE +void D3D12CommandProcessor::WriteShaderConstantsFromMem( + uint32_t start_index, uint32_t* base, uint32_t num_registers) { + if (frame_open_) { + bool cbuffer_pixel_uptodate = cbuffer_binding_float_pixel_.up_to_date; + bool cbuffer_vertex_uptodate = cbuffer_binding_float_vertex_.up_to_date; + if 
(cbuffer_pixel_uptodate || cbuffer_vertex_uptodate) { + // super naive, could just do some bit magic and interval checking, + // but we just need this hoisted out of the copy so we do a bulk copy + // because its the actual load/swap/store we're getting murdered by + // this precheck followed by copy_and_swap_32_unaligned reduced the cpu + // usage from packettype0/writeregistersfrommem from 10-11% of cpu time + // spent on xenia to like 1% + // chrispy: todo, this can be reduced even further, should be split into + // two loops and should skip whole words, this could net us even bigger + // gains + uint32_t map_index = (start_index - XE_GPU_REG_SHADER_CONSTANT_000_X) / 4; + uint32_t end_map_index = + (start_index + num_registers - XE_GPU_REG_SHADER_CONSTANT_000_X) / 4; - for (uint32_t i = 0; i < num_registers; ++i) { - uint32_t data = xe::load_and_swap(range + i); - - { - uint32_t index = base + i; - uint32_t value = data; - XE_MSVC_ASSUME(index >= register_lower_bound && - index < register_upper_bound); - register_file_->values[index].u32 = value; - - unsigned expr = 0; - - if constexpr (bounds_has_bounds(XE_GPU_REG_SCRATCH_REG0, - XE_GPU_REG_SCRATCH_REG7)) { - expr |= (index - XE_GPU_REG_SCRATCH_REG0 < 8); - } - if constexpr (bounds_has_reg(XE_GPU_REG_COHER_STATUS_HOST)) { - expr |= (index == XE_GPU_REG_COHER_STATUS_HOST); - } - if constexpr (bounds_has_bounds(XE_GPU_REG_DC_LUT_RW_INDEX, - XE_GPU_REG_DC_LUT_30_COLOR)) { - expr |= ((index - XE_GPU_REG_DC_LUT_RW_INDEX) <= - (XE_GPU_REG_DC_LUT_30_COLOR - XE_GPU_REG_DC_LUT_RW_INDEX)); - } - // chrispy: reordered for msvc branch probability (assumes - // if is taken and else is not) - if (XE_LIKELY(expr == 0)) { - XE_MSVC_REORDER_BARRIER(); - - } else { - HandleSpecialRegisterWrite(index, value); - goto write_done; - } - XE_MSVC_ASSUME(index >= register_lower_bound && - index < register_upper_bound); - if constexpr (bounds_has_bounds(XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0, - XE_GPU_REG_SHADER_CONSTANT_FETCH_31_5)) { - if (index >= XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0 && - index <= XE_GPU_REG_SHADER_CONSTANT_FETCH_31_5) { - cbuffer_binding_fetch_.up_to_date = false; - // texture cache is never nullptr - texture_cache_->TextureFetchConstantWritten( - (index - XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0) / 6); - - goto write_done; - } - } - XE_MSVC_ASSUME(index >= register_lower_bound && - index < register_upper_bound); - if constexpr (bounds_has_bounds(XE_GPU_REG_SHADER_CONSTANT_000_X, - XE_GPU_REG_SHADER_CONSTANT_511_W)) { - if (index >= XE_GPU_REG_SHADER_CONSTANT_000_X && - index <= XE_GPU_REG_SHADER_CONSTANT_511_W) { - if (frame_open_) { - uint32_t float_constant_index = - (index - XE_GPU_REG_SHADER_CONSTANT_000_X) >> 2; - if (float_constant_index >= 256) { - float_constant_index -= 256; - if (current_float_constant_map_pixel_[float_constant_index >> 6] & - (1ull << (float_constant_index & 63))) { - cbuffer_binding_float_pixel_.up_to_date = false; - } - } else { - if (current_float_constant_map_vertex_[float_constant_index >> - 6] & - (1ull << (float_constant_index & 63))) { - cbuffer_binding_float_vertex_.up_to_date = false; - } - } + if (map_index < 256 && cbuffer_vertex_uptodate) { + for (; map_index < end_map_index; ++map_index) { + if (current_float_constant_map_vertex_[map_index >> 6] & + (1ull << map_index)) { + cbuffer_vertex_uptodate = false; + break; } - goto write_done; } } - XE_MSVC_ASSUME(index >= register_lower_bound && - index < register_upper_bound); - if constexpr (bounds_has_bounds(XE_GPU_REG_SHADER_CONSTANT_BOOL_000_031, - 
XE_GPU_REG_SHADER_CONSTANT_LOOP_31)) { - if (index >= XE_GPU_REG_SHADER_CONSTANT_BOOL_000_031 && - index <= XE_GPU_REG_SHADER_CONSTANT_LOOP_31) { - cbuffer_binding_bool_loop_.up_to_date = false; - goto write_done; + if (end_map_index > 256 && cbuffer_pixel_uptodate) { + for (; map_index < end_map_index; ++map_index) { + uint32_t float_constant_index = map_index; + float_constant_index -= 256; + if (current_float_constant_map_pixel_[float_constant_index >> 6] & + (1ull << float_constant_index)) { + cbuffer_pixel_uptodate = false; + break; + } } } } - write_done:; + cbuffer_binding_float_pixel_.up_to_date = cbuffer_pixel_uptodate; + cbuffer_binding_float_vertex_.up_to_date = cbuffer_vertex_uptodate; } + + // maybe use non-temporal copy if possible... + copy_and_swap_32_unaligned(®ister_file_->values[start_index], base, + num_registers); +} +XE_FORCEINLINE +void D3D12CommandProcessor::WriteBoolLoopFromMem(uint32_t start_index, + uint32_t* base, + uint32_t num_registers) { + cbuffer_binding_bool_loop_.up_to_date = false; + copy_and_swap_32_unaligned(®ister_file_->values[start_index], base, + num_registers); +} +XE_FORCEINLINE +void D3D12CommandProcessor::WriteFetchFromMem(uint32_t start_index, + uint32_t* base, + uint32_t num_registers) { + cbuffer_binding_fetch_.up_to_date = false; + + uint32_t first_fetch = + ((start_index - XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0) / 6); + uint32_t last_fetch = // i think last_fetch should be inclusive if its modulo + // is nz... + (((start_index + num_registers) - XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0) / + 6); + texture_cache_->TextureFetchConstantsWritten(first_fetch, last_fetch); + + copy_and_swap_32_unaligned(®ister_file_->values[start_index], base, + num_registers); +} + +void D3D12CommandProcessor::WritePossiblySpecialRegistersFromMem( + uint32_t start_index, uint32_t* base, uint32_t numregs) { + uint32_t end = numregs + start_index; + for (uint32_t index = start_index; index < end; ++index, ++base) { + uint32_t value = xe::load_and_swap(base); + + register_file_->values[index].u32 = value; + + unsigned expr = 0; + + expr |= (index - XE_GPU_REG_SCRATCH_REG0 < 8); + + expr |= (index == XE_GPU_REG_COHER_STATUS_HOST); + + expr |= ((index - XE_GPU_REG_DC_LUT_RW_INDEX) <= + (XE_GPU_REG_DC_LUT_30_COLOR - XE_GPU_REG_DC_LUT_RW_INDEX)); + + if (expr == 0) { + } else { + HandleSpecialRegisterWrite(index, value); + } + } +} +template +XE_FORCEINLINE void +D3D12CommandProcessor::WriteRegisterRangeFromMem_WithKnownBound( + uint32_t start_index, uint32_t* base, uint32_t num_registers) { + uint32_t end = start_index + num_registers; + + uint32_t current_index = start_index; + + auto get_end_before_qty = [&end, current_index](uint32_t regnum) { + return std::min(regnum, end) - current_index; + }; +#define REGULAR_WRITE_CALLBACK(s, e, i, b, n) \ + copy_and_swap_32_unaligned(®ister_file_->values[i], b, n) +#define WRITE_FETCH_CONSTANTS_CALLBACK(str, er, ind, b, n) \ + WriteFetchFromMem(ind, b, n) +#define SPECIAL_REG_RANGE_CALLBACK(str, edr, ind, bs, n) \ + WritePossiblySpecialRegistersFromMem(ind, bs, n) +#define WRITE_SHADER_CONSTANTS_CALLBACK(start_range, end_range, index, base, \ + n) \ + WriteShaderConstantsFromMem(index, base, n) +#define WRITE_BOOL_LOOP_CALLBACK(s, e, i, b, n) WriteBoolLoopFromMem(i, b, n) + +#define DO_A_RANGE(start_range, end_range, cb) \ + if constexpr (start_range >= register_lower_bound || \ + end_range > register_lower_bound) { \ + if (current_index < (end_range)) { \ + uint32_t ntowrite = get_end_before_qty(end_range); \ + cb((start_range), 
(end_range), current_index, base, ntowrite); \ + current_index += ntowrite; \ + base += ntowrite; \ + } \ + if (current_index >= end) { \ + return; \ + } \ + } + +#define REFRESH_MSVC_RANGE() \ + XE_MSVC_ASSUME(current_index >= register_lower_bound && \ + current_index < register_upper_bound) + + REFRESH_MSVC_RANGE(); + + DO_A_RANGE(0, XE_GPU_REG_SCRATCH_REG0, REGULAR_WRITE_CALLBACK); + REFRESH_MSVC_RANGE(); + DO_A_RANGE(XE_GPU_REG_SCRATCH_REG0, XE_GPU_REG_DC_LUT_30_COLOR + 1, + SPECIAL_REG_RANGE_CALLBACK); + REFRESH_MSVC_RANGE(); + DO_A_RANGE(XE_GPU_REG_DC_LUT_30_COLOR + 1, XE_GPU_REG_SHADER_CONSTANT_000_X, + REGULAR_WRITE_CALLBACK); + REFRESH_MSVC_RANGE(); + DO_A_RANGE(XE_GPU_REG_SHADER_CONSTANT_000_X, + XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0, + WRITE_SHADER_CONSTANTS_CALLBACK); + REFRESH_MSVC_RANGE(); + DO_A_RANGE(XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0, + XE_GPU_REG_SHADER_CONSTANT_FETCH_31_5 + 1, + WRITE_FETCH_CONSTANTS_CALLBACK); + REFRESH_MSVC_RANGE(); + DO_A_RANGE(XE_GPU_REG_SHADER_CONSTANT_BOOL_000_031, + XE_GPU_REG_SHADER_CONSTANT_LOOP_31 + 1, WRITE_BOOL_LOOP_CALLBACK); + REFRESH_MSVC_RANGE(); + DO_A_RANGE(XE_GPU_REG_SHADER_CONSTANT_LOOP_31 + 1, 65536, + REGULAR_WRITE_CALLBACK); } template XE_FORCEINLINE void @@ -2626,11 +2707,17 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type, // todo: use SIMD for getscissor + scaling here, should reduce code size more draw_util::Scissor scissor; draw_util::GetScissor(regs, scissor); +#if XE_ARCH_AMD64 == 1 + __m128i* scisp = (__m128i*)&scissor; + *scisp = _mm_mullo_epi32( + *scisp, _mm_setr_epi32(draw_resolution_scale_x, draw_resolution_scale_y, + draw_resolution_scale_x, draw_resolution_scale_y)); +#else scissor.offset[0] *= draw_resolution_scale_x; scissor.offset[1] *= draw_resolution_scale_y; scissor.extent[0] *= draw_resolution_scale_x; scissor.extent[1] *= draw_resolution_scale_y; - +#endif // Update viewport, scissor, blend factor and stencil reference. 
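Since the XE_ARCH_AMD64 path above reinterprets the whole scissor as a single __m128i, it leans on the struct being exactly four packed uint32_t values with 16-byte alignment (which the alignas(16) added to Scissor in draw_util.h provides). A pair of compile-time guards along these lines would document that assumption; this is only a sketch, not part of the patch:

  // Sketch: layout guards for the _mm_mullo_epi32 scissor-scaling path above.
  static_assert(sizeof(draw_util::Scissor) == 16,
                "Scissor must be four packed uint32s for the SIMD scale");
  static_assert(alignof(draw_util::Scissor) >= 16,
                "Scissor must be 16-byte aligned for aligned __m128i access");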
UpdateFixedFunctionState(viewport_info, scissor, primitive_polygonal, normalized_depth_control); diff --git a/src/xenia/gpu/d3d12/d3d12_command_processor.h b/src/xenia/gpu/d3d12/d3d12_command_processor.h index 75f23cf03..1fbfba23c 100644 --- a/src/xenia/gpu/d3d12/d3d12_command_processor.h +++ b/src/xenia/gpu/d3d12/d3d12_command_processor.h @@ -211,12 +211,25 @@ class D3D12CommandProcessor final : public CommandProcessor { protected: bool SetupContext() override; void ShutdownContext() override; - - void WriteRegister(uint32_t index, uint32_t value) override; XE_FORCEINLINE + void WriteRegisterForceinline(uint32_t index, uint32_t value); + void WriteRegister(uint32_t index, uint32_t value) override; + virtual void WriteRegistersFromMem(uint32_t start_index, uint32_t* base, uint32_t num_registers) override; + /*helper functions for WriteRegistersFromMem*/ + XE_FORCEINLINE + void WriteShaderConstantsFromMem(uint32_t start_index, uint32_t* base, + uint32_t num_registers); + XE_FORCEINLINE + void WriteBoolLoopFromMem(uint32_t start_index, uint32_t* base, + uint32_t num_registers); + XE_FORCEINLINE + void WriteFetchFromMem(uint32_t start_index, uint32_t* base, + uint32_t num_registers); + void WritePossiblySpecialRegistersFromMem(uint32_t start_index, uint32_t* base, + uint32_t num_registers); template XE_FORCEINLINE void WriteRegisterRangeFromMem_WithKnownBound( uint32_t start_index, uint32_t* base, uint32_t num_registers); diff --git a/src/xenia/gpu/draw_util.cc b/src/xenia/gpu/draw_util.cc index af977d4d5..e6461e8bd 100644 --- a/src/xenia/gpu/draw_util.cc +++ b/src/xenia/gpu/draw_util.cc @@ -551,30 +551,70 @@ void GetHostViewportInfo(GetViewportInfoArgs* XE_RESTRICT args, viewport_info_out.ndc_offset[i] = ndc_offset[i]; } } -void GetScissor(const RegisterFile& regs, Scissor& scissor_out, - bool clamp_to_surface_pitch) { +template +static inline +void GetScissorTmpl(const RegisterFile& XE_RESTRICT regs, + Scissor& XE_RESTRICT scissor_out) { +#if XE_ARCH_AMD64 == 1 auto pa_sc_window_scissor_tl = regs.Get(); - int32_t tl_x = int32_t(pa_sc_window_scissor_tl.tl_x); - int32_t tl_y = int32_t(pa_sc_window_scissor_tl.tl_y); auto pa_sc_window_scissor_br = regs.Get(); - int32_t br_x = int32_t(pa_sc_window_scissor_br.br_x); - int32_t br_y = int32_t(pa_sc_window_scissor_br.br_y); - if (!pa_sc_window_scissor_tl.window_offset_disable) { - auto pa_sc_window_offset = regs.Get(); - tl_x += pa_sc_window_offset.window_x_offset; - tl_y += pa_sc_window_offset.window_y_offset; - br_x += pa_sc_window_offset.window_x_offset; - br_y += pa_sc_window_offset.window_y_offset; + auto pa_sc_window_offset = regs.Get(); + auto pa_sc_screen_scissor_tl = regs.Get(); + auto pa_sc_screen_scissor_br = regs.Get(); + uint32_t surface_pitch = 0; + if constexpr (clamp_to_surface_pitch) { + surface_pitch = regs.Get().surface_pitch; } + uint32_t pa_sc_window_scissor_tl_tl_x = pa_sc_window_scissor_tl.tl_x, + pa_sc_window_scissor_tl_tl_y = pa_sc_window_scissor_tl.tl_y, + pa_sc_window_scissor_br_br_x = pa_sc_window_scissor_br.br_x, + pa_sc_window_scissor_br_br_y = pa_sc_window_scissor_br.br_y, + pa_sc_window_offset_window_x_offset = + pa_sc_window_offset.window_x_offset, + pa_sc_window_offset_window_y_offset = + pa_sc_window_offset.window_y_offset, + pa_sc_screen_scissor_tl_tl_x = pa_sc_screen_scissor_tl.tl_x, + pa_sc_screen_scissor_tl_tl_y = pa_sc_screen_scissor_tl.tl_y, + pa_sc_screen_scissor_br_br_x = pa_sc_screen_scissor_br.br_x, + pa_sc_screen_scissor_br_br_y = pa_sc_screen_scissor_br.br_y; + + int32_t tl_x = 
int32_t(pa_sc_window_scissor_tl_tl_x); + int32_t tl_y = int32_t(pa_sc_window_scissor_tl_tl_y); + + int32_t br_x = int32_t(pa_sc_window_scissor_br_br_x); + int32_t br_y = int32_t(pa_sc_window_scissor_br_br_y); + + __m128i tmp1 = _mm_setr_epi32(tl_x, tl_y, br_x, br_y); + __m128i pa_sc_scissor = _mm_setr_epi32( + pa_sc_screen_scissor_tl_tl_x, pa_sc_screen_scissor_tl_tl_y, + pa_sc_screen_scissor_br_br_x, pa_sc_screen_scissor_br_br_y); + __m128i xyoffsetadd = _mm_cvtsi64x_si128( + static_cast(pa_sc_window_offset_window_x_offset) | + (static_cast(pa_sc_window_offset_window_y_offset) + << 32)); + xyoffsetadd = _mm_unpacklo_epi64(xyoffsetadd, xyoffsetadd); + // chrispy: put this here to make it clear that the shift by 31 is extracting + // this field + XE_MAYBE_UNUSED + uint32_t window_offset_disable_reference = + pa_sc_window_scissor_tl.window_offset_disable; + + __m128i offset_disable_mask = _mm_set1_epi32(pa_sc_window_scissor_tl.value); + + __m128i addend = _mm_blendv_epi8(xyoffsetadd, _mm_setzero_si128(), + _mm_srai_epi32(offset_disable_mask, 31)); + + tmp1 = _mm_add_epi32(tmp1, addend); + + //} // Screen scissor is not used by Direct3D 9 (always 0, 0 to 8192, 8192), but // still handled here for completeness. - auto pa_sc_screen_scissor_tl = regs.Get(); - tl_x = std::max(tl_x, int32_t(pa_sc_screen_scissor_tl.tl_x)); - tl_y = std::max(tl_y, int32_t(pa_sc_screen_scissor_tl.tl_y)); - auto pa_sc_screen_scissor_br = regs.Get(); - br_x = std::min(br_x, int32_t(pa_sc_screen_scissor_br.br_x)); - br_y = std::min(br_y, int32_t(pa_sc_screen_scissor_br.br_y)); - if (clamp_to_surface_pitch) { + __m128i lomax = _mm_max_epi32(tmp1, pa_sc_scissor); + __m128i himin = _mm_min_epi32(tmp1, pa_sc_scissor); + + tmp1 = _mm_blend_epi16(lomax, himin, 0b11110000); + + if constexpr (clamp_to_surface_pitch) { // Clamp the horizontal scissor to surface_pitch for safety, in case that's // not done by the guest for some reason (it's not when doing draws without // clipping in Direct3D 9, for instance), to prevent overflow - this is @@ -582,7 +622,79 @@ void GetScissor(const RegisterFile& regs, Scissor& scissor_out, // rasterization without render target width at all (pixel shader // interlock-based custom RB implementations) and using conventional render // targets, but padded to EDRAM tiles. 
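A note on the two _mm_blend_epi16 immediates in this path: each immediate bit picks one 16-bit element from the second operand, so every 32-bit lane of the scissor vector (tl_x, tl_y, br_x, br_y) is covered by two adjacent bits. 0b11110000 above takes br_x/br_y from the min result while keeping tl_x/tl_y from the max result, and 0b00110011 below limits the surface_pitch clamp to the two horizontal lanes, matching the scalar path. An illustrative helper (not part of the change) makes the mapping checkable at compile time:

  // Sketch: expand a 4-lane (32-bit) selection mask into a _mm_blend_epi16
  // immediate, where each 32-bit lane needs two adjacent 16-bit select bits.
  constexpr int blend32(int lanes4) {
    int imm = 0;
    for (int lane = 0; lane < 4; ++lane) {
      if (lanes4 & (1 << lane)) {
        imm |= 0b11 << (lane * 2);
      }
    }
    return imm;
  }
  static_assert(blend32(0b1100) == 0b11110000, "take the br_x/br_y lanes");
  static_assert(blend32(0b0101) == 0b00110011, "take the tl_x/br_x lanes");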
- uint32_t surface_pitch = regs.Get().surface_pitch; + tmp1 = _mm_blend_epi16( + tmp1, _mm_min_epi32(tmp1, _mm_set1_epi32(surface_pitch)), + 0b00110011); + } + + tmp1 = _mm_max_epi32(tmp1, _mm_setzero_si128()); + + __m128i tl_in_high = _mm_unpacklo_epi64(tmp1, tmp1); + + __m128i final_br = _mm_max_epi32(tmp1, tl_in_high); + final_br = _mm_sub_epi32(final_br, tl_in_high); + __m128i scissor_res = _mm_blend_epi16(tmp1, final_br, 0b11110000); + _mm_storeu_si128((__m128i*)&scissor_out, scissor_res); +#else + auto pa_sc_window_scissor_tl = regs.Get(); + auto pa_sc_window_scissor_br = regs.Get(); + auto pa_sc_window_offset = regs.Get(); + auto pa_sc_screen_scissor_tl = regs.Get(); + auto pa_sc_screen_scissor_br = regs.Get(); + uint32_t surface_pitch = 0; + if constexpr (clamp_to_surface_pitch) { + surface_pitch = regs.Get().surface_pitch; + } + uint32_t pa_sc_window_scissor_tl_tl_x = pa_sc_window_scissor_tl.tl_x, + pa_sc_window_scissor_tl_tl_y = pa_sc_window_scissor_tl.tl_y, + pa_sc_window_scissor_br_br_x = pa_sc_window_scissor_br.br_x, + pa_sc_window_scissor_br_br_y = pa_sc_window_scissor_br.br_y, + pa_sc_window_offset_window_x_offset = + pa_sc_window_offset.window_x_offset, + pa_sc_window_offset_window_y_offset = + pa_sc_window_offset.window_y_offset, + pa_sc_screen_scissor_tl_tl_x = pa_sc_screen_scissor_tl.tl_x, + pa_sc_screen_scissor_tl_tl_y = pa_sc_screen_scissor_tl.tl_y, + pa_sc_screen_scissor_br_br_x = pa_sc_screen_scissor_br.br_x, + pa_sc_screen_scissor_br_br_y = pa_sc_screen_scissor_br.br_y; + + int32_t tl_x = int32_t(pa_sc_window_scissor_tl_tl_x); + int32_t tl_y = int32_t(pa_sc_window_scissor_tl_tl_y); + + int32_t br_x = int32_t(pa_sc_window_scissor_br_br_x); + int32_t br_y = int32_t(pa_sc_window_scissor_br_br_y); + + // chrispy: put this here to make it clear that the shift by 31 is extracting + // this field + XE_MAYBE_UNUSED + uint32_t window_offset_disable_reference = + pa_sc_window_scissor_tl.window_offset_disable; + int32_t window_offset_disable_mask = + ~(static_cast(pa_sc_window_scissor_tl.value) >> 31); + // if (!pa_sc_window_scissor_tl.window_offset_disable) { + + tl_x += pa_sc_window_offset_window_x_offset & window_offset_disable_mask; + tl_y += pa_sc_window_offset_window_y_offset & window_offset_disable_mask; + br_x += pa_sc_window_offset_window_x_offset & window_offset_disable_mask; + br_y += pa_sc_window_offset_window_y_offset & window_offset_disable_mask; + //} + // Screen scissor is not used by Direct3D 9 (always 0, 0 to 8192, 8192), but + // still handled here for completeness. + + tl_x = std::max(tl_x, int32_t(pa_sc_screen_scissor_tl_tl_x)); + tl_y = std::max(tl_y, int32_t(pa_sc_screen_scissor_tl_tl_y)); + + br_x = std::min(br_x, int32_t(pa_sc_screen_scissor_br_br_x)); + br_y = std::min(br_y, int32_t(pa_sc_screen_scissor_br_br_y)); + if constexpr (clamp_to_surface_pitch) { + // Clamp the horizontal scissor to surface_pitch for safety, in case that's + // not done by the guest for some reason (it's not when doing draws without + // clipping in Direct3D 9, for instance), to prevent overflow - this is + // important for host implementations, both based on target-indepedent + // rasterization without render target width at all (pixel shader + // interlock-based custom RB implementations) and using conventional render + // targets, but padded to EDRAM tiles. 
+ tl_x = std::min(tl_x, int32_t(surface_pitch)); br_x = std::min(br_x, int32_t(surface_pitch)); } @@ -599,6 +711,16 @@ void GetScissor(const RegisterFile& regs, Scissor& scissor_out, scissor_out.offset[1] = uint32_t(tl_y); scissor_out.extent[0] = uint32_t(br_x - tl_x); scissor_out.extent[1] = uint32_t(br_y - tl_y); +#endif +} + +void GetScissor(const RegisterFile& XE_RESTRICT regs, + Scissor& XE_RESTRICT scissor_out, bool clamp_to_surface_pitch) { + if (clamp_to_surface_pitch) { + return GetScissorTmpl(regs, scissor_out); + } else { + return GetScissorTmpl(regs, scissor_out); + } } uint32_t GetNormalizedColorMask(const RegisterFile& regs, @@ -863,7 +985,7 @@ bool GetResolveInfo(const RegisterFile& regs, const Memory& memory, y1 = y0 + int32_t(xenos::kMaxResolveSize); } // fails in forza horizon 1 - //x0 is 0, x1 is 0x100, y0 is 0x100, y1 is 0x100 + // x0 is 0, x1 is 0x100, y0 is 0x100, y1 is 0x100 assert_true(x0 <= x1 && y0 <= y1); if (x0 >= x1 || y0 >= y1) { XELOGE("Resolve region is empty"); @@ -1103,7 +1225,7 @@ bool GetResolveInfo(const RegisterFile& regs, const Memory& memory, info_out.rb_depth_clear = regs[XE_GPU_REG_RB_DEPTH_CLEAR].u32; info_out.rb_color_clear = regs[XE_GPU_REG_RB_COLOR_CLEAR].u32; info_out.rb_color_clear_lo = regs[XE_GPU_REG_RB_COLOR_CLEAR_LO].u32; - #if 0 +#if 0 XELOGD( "Resolve: {},{} <= x,y < {},{}, {} -> {} at 0x{:08X} (potentially " "modified memory range 0x{:08X} to 0x{:08X})", @@ -1114,7 +1236,7 @@ bool GetResolveInfo(const RegisterFile& regs, const Memory& memory, xenos::ColorRenderTargetFormat(color_edram_info.format)), FormatInfo::GetName(dest_format), rb_copy_dest_base, copy_dest_extent_start, copy_dest_extent_end); - #endif +#endif return true; } XE_MSVC_OPTIMIZE_REVERT() diff --git a/src/xenia/gpu/draw_util.h b/src/xenia/gpu/draw_util.h index ececbaac9..8196830b8 100644 --- a/src/xenia/gpu/draw_util.h +++ b/src/xenia/gpu/draw_util.h @@ -433,13 +433,15 @@ struct GetViewportInfoArgs { void GetHostViewportInfo(GetViewportInfoArgs* XE_RESTRICT args, ViewportInfo& viewport_info_out); -struct Scissor { +struct alignas(16) Scissor { // Offset from render target UV = 0 to +UV. uint32_t offset[2]; // Extent can be zero. 
uint32_t extent[2]; }; -void GetScissor(const RegisterFile& regs, Scissor& scissor_out, + +void GetScissor(const RegisterFile& XE_RESTRICT regs, + Scissor& XE_RESTRICT scissor_out, bool clamp_to_surface_pitch = true); // Returns the color component write mask for the draw command taking into diff --git a/src/xenia/gpu/pm4_command_processor_declare.h b/src/xenia/gpu/pm4_command_processor_declare.h index da0888f21..de6b66f6f 100644 --- a/src/xenia/gpu/pm4_command_processor_declare.h +++ b/src/xenia/gpu/pm4_command_processor_declare.h @@ -109,7 +109,7 @@ XE_NOINLINE XE_COLD bool HitUnimplementedOpcode(uint32_t opcode, uint32_t count) XE_RESTRICT; -XE_NOINLINE +XE_FORCEINLINE XE_NOALIAS uint32_t GetCurrentRingReadCount(); diff --git a/src/xenia/gpu/pm4_command_processor_implement.h b/src/xenia/gpu/pm4_command_processor_implement.h index 2fc63cef2..4f0aaa330 100644 --- a/src/xenia/gpu/pm4_command_processor_implement.h +++ b/src/xenia/gpu/pm4_command_processor_implement.h @@ -4,7 +4,6 @@ void COMMAND_PROCESSOR::ExecuteIndirectBuffer(uint32_t ptr, uint32_t count) XE_RESTRICT { SCOPE_profile_cpu_f("gpu"); - trace_writer_.WriteIndirectBufferStart(ptr, count * sizeof(uint32_t)); if (count != 0) { RingBuffer old_reader = reader_; @@ -32,10 +31,9 @@ void COMMAND_PROCESSOR::ExecuteIndirectBuffer(uint32_t ptr, trace_writer_.WriteIndirectBufferEnd(); reader_ = old_reader; } else { - //rare, but i've seen it happen! (and then a division by 0 occurs) + // rare, but i've seen it happen! (and then a division by 0 occurs) return; } - } bool COMMAND_PROCESSOR::ExecutePacket() { @@ -88,8 +86,9 @@ bool COMMAND_PROCESSOR::ExecutePacketType0_CountOverflow(uint32_t count) { count * sizeof(uint32_t)); return false; } - /* - Todo: optimize this function this one along with execute packet type III are the most frequently called functions for PM4 +/* + Todo: optimize this function this one along with execute packet type III are + the most frequently called functions for PM4 */ XE_NOINLINE bool COMMAND_PROCESSOR::ExecutePacketType0(uint32_t packet) XE_RESTRICT { @@ -99,7 +98,6 @@ bool COMMAND_PROCESSOR::ExecutePacketType0(uint32_t packet) XE_RESTRICT { uint32_t count = ((packet >> 16) & 0x3FFF) + 1; - if (COMMAND_PROCESSOR::GetCurrentRingReadCount() >= count * sizeof(uint32_t)) { trace_writer_.WritePacketStart(uint32_t(reader_.read_ptr() - 4), 1 + count); @@ -143,7 +141,7 @@ bool COMMAND_PROCESSOR::ExecutePacketType2(uint32_t packet) XE_RESTRICT { trace_writer_.WritePacketEnd(); return true; } -XE_NOINLINE +XE_FORCEINLINE XE_NOALIAS uint32_t COMMAND_PROCESSOR::GetCurrentRingReadCount() { return reader_.read_count(); @@ -446,41 +444,46 @@ bool COMMAND_PROCESSOR::ExecutePacketType3_INDIRECT_BUFFER( return true; } -XE_NOINLINE +/* + chrispy: this is fine to inline, as a noinline function it compiled down + to 54 bytes +*/ static bool MatchValueAndRef(uint32_t value, uint32_t ref, uint32_t wait_info) { - /* - Todo: should subtract values from each other twice with the sides inverted and then create a mask from the sign bits - then use the wait_info value in order to select the bits that correctly implement the condition - If neither subtraction has the signbit set then that means the value is equal - */ - bool matched = false; - switch (wait_info & 0x7) { - case 0x0: // Never. - matched = false; - break; - case 0x1: // Less than reference. - matched = value < ref; - break; - case 0x2: // Less than or equal to reference. - matched = value <= ref; - break; - case 0x3: // Equal to reference. 
- matched = value == ref; - break; - case 0x4: // Not equal to reference. - matched = value != ref; - break; - case 0x5: // Greater than or equal to reference. - matched = value >= ref; - break; - case 0x6: // Greater than reference. - matched = value > ref; - break; - case 0x7: // Always - matched = true; - break; - } - return matched; +// smaller code is generated than the #else path, although whether it is faster +// i do not know. i don't think games do an enormous number of cond_write +// though, so we have picked +// the path with the smaller codegen. +// we do technically have more instructions executed vs the switch case method, +// but we have no mispredicts and most of our instructions are 0.25/0.3 +// throughput +#if 1 + uint32_t value_minus_ref = + static_cast(static_cast(value - ref) >> 31); + uint32_t ref_minus_value = + static_cast(static_cast(ref - value) >> 31); + uint32_t eqmask = ~(value_minus_ref | ref_minus_value); + uint32_t nemask = (value_minus_ref | ref_minus_value); + + uint32_t value_lt_mask = value_minus_ref; + uint32_t value_gt_mask = ref_minus_value; + uint32_t value_lte_mask = value_lt_mask | eqmask; + uint32_t value_gte_mask = value_gt_mask | eqmask; + + uint32_t bits_for_selecting = + (value_lt_mask & (1 << 1)) | (value_lte_mask & (1 << 2)) | + (eqmask & (1 << 3)) | (nemask & (1 << 4)) | (value_gte_mask & (1 << 5)) | + (value_gt_mask & (1 << 6)) | (1 << 7); + + return (bits_for_selecting >> (wait_info & 7)) & 1; + +#else + + return ((((value < ref) << 1) | ((value <= ref) << 2) | + ((value == ref) << 3) | ((value != ref) << 4) | + ((value >= ref) << 5) | ((value > ref) << 6) | (1 << 7)) >> + (wait_info & 7)) & + 1; +#endif } XE_NOINLINE bool COMMAND_PROCESSOR::ExecutePacketType3_WAIT_REG_MEM( @@ -520,19 +523,17 @@ bool COMMAND_PROCESSOR::ExecutePacketType3_WAIT_REG_MEM( PrepareForWait(); if (!cvars::vsync) { // User wants it fast and dangerous. - xe::threading::MaybeYield(); + // do nothing } else { xe::threading::Sleep(std::chrono::milliseconds(wait / 0x100)); + ReturnFromWait(); } - // xe::threading::SyncMemory(); - ReturnFromWait(); if (!worker_running_) { // Short-circuited exit. return false; } } else { - xe::threading::MaybeYield(); } } } while (!matched); @@ -1128,7 +1129,7 @@ bool COMMAND_PROCESSOR::ExecutePacketType3_VIZ_QUERY( } uint32_t COMMAND_PROCESSOR::ExecutePrimaryBuffer(uint32_t read_index, - uint32_t write_index) { + uint32_t write_index) { SCOPE_profile_cpu_f("gpu"); #if XE_ENABLE_TRACE_WRITER_INSTRUMENTATION == 1 // If we have a pending trace stream open it now. That way we ensure we get diff --git a/src/xenia/gpu/texture_cache.h b/src/xenia/gpu/texture_cache.h index ff29fcb38..717273275 100644 --- a/src/xenia/gpu/texture_cache.h +++ b/src/xenia/gpu/texture_cache.h @@ -104,6 +104,15 @@ class TextureCache { void TextureFetchConstantWritten(uint32_t index) { texture_bindings_in_sync_ &= ~(UINT32_C(1) << index); } + void TextureFetchConstantsWritten(uint32_t first_index, uint32_t last_index) { + // generate a mask of all bits from before the first index, and xor it with + // all bits before the last index this produces a mask covering only the + // bits between first and last + uint32_t res = ((1U << first_index) - 1) ^ ((1U << (last_index + 1)) - 1); + // todo: check that this is right + + texture_bindings_in_sync_ &= ~res; + } virtual void RequestTextures(uint32_t used_texture_mask);
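On the "todo: check that this is right" in TextureFetchConstantsWritten: XORing the two prefix masks does yield exactly the bits first_index..last_index inclusive, as long as last_index stays below 31 (at last_index == 31 the 1U << (last_index + 1) shift count equals the type width and is undefined, so that bound may deserve special handling). A quick compile-time check, using a hypothetical FetchRangeMask helper that mirrors the expression:

  // Sketch: mask of fetch constants first..last inclusive, same expression as
  // TextureFetchConstantsWritten uses.
  constexpr uint32_t FetchRangeMask(uint32_t first_index, uint32_t last_index) {
    return ((1u << first_index) - 1) ^ ((1u << (last_index + 1)) - 1);
  }
  static_assert(FetchRangeMask(2, 4) == 0b11100u, "bits 2,3,4 only");
  static_assert(FetchRangeMask(7, 7) == (1u << 7), "a single constant");
  // note: FetchRangeMask(first, 31) would shift by 32 (undefined); clamping or
  // widening to uint64_t would cover writes that touch the last fetch constant.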