From f931c34ecb3406f68e3d7d10536bd61dc1d442a7 Mon Sep 17 00:00:00 2001 From: "chss95cs@gmail.com" Date: Wed, 14 Dec 2022 11:34:33 -0800 Subject: [PATCH] Cleaned up for commit, moved WriteRegistersFromMemCommonSense code into WriteRegistersFromMem optimized copy_and_swap_32_unaligned further --- src/xenia/base/memory.cc | 23 ++- .../gpu/d3d12/d3d12_command_processor.cc | 135 +++--------------- src/xenia/gpu/d3d12/d3d12_command_processor.h | 11 +- .../gpu/pm4_command_processor_implement.h | 1 - 4 files changed, 49 insertions(+), 121 deletions(-) diff --git a/src/xenia/base/memory.cc b/src/xenia/base/memory.cc index 41972aae9..4fb537226 100644 --- a/src/xenia/base/memory.cc +++ b/src/xenia/base/memory.cc @@ -327,15 +327,34 @@ void copy_and_swap_32_unaligned(void* dest_ptr, const void* src_ptr, __m256i output1 = _mm256_shuffle_epi8(input1, shufmask); __m256i output2 = _mm256_shuffle_epi8(input2, shufmask); - + //chrispy: todo, benchmark this w/ and w/out these prefetches here on multiple machines + //finding a good distance for prefetchw in particular is probably important + //for when we're writing across 2 cachelines + #if 0 + if (i + 48 <= count) { + swcache::PrefetchNTA(&src[i + 32]); + if (amd64::GetFeatureFlags() & amd64::kX64EmitPrefetchW) { + swcache::PrefetchW(&dest[i + 32]); + } + } + #endif _mm256_storeu_si256(reinterpret_cast<__m256i*>(&dest[i]), output1); _mm256_storeu_si256(reinterpret_cast<__m256i*>(&dest[i + 8]), output2); } - for (; i + 8 <= count; i += 8) { + if (i + 8 <= count) { __m256i input = _mm256_loadu_si256(reinterpret_cast(&src[i])); __m256i output = _mm256_shuffle_epi8(input, shufmask); _mm256_storeu_si256(reinterpret_cast<__m256i*>(&dest[i]), output); + i += 8; + } + if (i + 4 <= count) { + __m128i input = + _mm_loadu_si128(reinterpret_cast(&src[i])); + __m128i output = + _mm_shuffle_epi8(input, _mm256_castsi256_si128(shufmask)); + _mm_storeu_si128(reinterpret_cast<__m128i*>(&dest[i]), output); + i += 4; } } else { __m128i shufmask = diff --git a/src/xenia/gpu/d3d12/d3d12_command_processor.cc b/src/xenia/gpu/d3d12/d3d12_command_processor.cc index 5c7c6722f..3d31e50de 100644 --- a/src/xenia/gpu/d3d12/d3d12_command_processor.cc +++ b/src/xenia/gpu/d3d12/d3d12_command_processor.cc @@ -1793,10 +1793,8 @@ void D3D12CommandProcessor::WriteRegister(uint32_t index, uint32_t value) { void D3D12CommandProcessor::WriteRegistersFromMem(uint32_t start_index, uint32_t* base, uint32_t num_registers) { - for (uint32_t i = 0; i < num_registers; ++i) { - uint32_t data = xe::load_and_swap(base + i); - D3D12CommandProcessor::WriteRegister(start_index + i, data); - } + WriteRegisterRangeFromMem_WithKnownBound<0, 0xFFFF>(start_index, base, + num_registers); } void D3D12CommandProcessor::WriteALURangeFromRing(xe::RingBuffer* ring, @@ -1903,11 +1901,11 @@ void D3D12CommandProcessor::WriteRegisterRangeFromRing_WraparoundCase( uint32_t num_regs_firstrange = static_cast(range.first_length / sizeof(uint32_t)); - D3D12CommandProcessor::WriteRegistersFromMemCommonSense( + D3D12CommandProcessor::WriteRegistersFromMem( base, reinterpret_cast(const_cast(range.first)), num_regs_firstrange); - D3D12CommandProcessor::WriteRegistersFromMemCommonSense( + D3D12CommandProcessor::WriteRegistersFromMem( base + num_regs_firstrange, reinterpret_cast(const_cast(range.second)), num_registers - num_regs_firstrange); @@ -1917,14 +1915,11 @@ void D3D12CommandProcessor::WriteRegisterRangeFromRing_WraparoundCase( void D3D12CommandProcessor::WriteRegisterRangeFromRing(xe::RingBuffer* ring, uint32_t base, uint32_t num_registers) { - // WriteRegisterRangeFromRing_WithKnownBound<0, 0xFFFF>(ring, base, - // num_registers); - RingBuffer::ReadRange range = ring->BeginRead(num_registers * sizeof(uint32_t)); XE_LIKELY_IF(!range.second) { - WriteRegistersFromMemCommonSense( + WriteRegistersFromMem( base, reinterpret_cast(const_cast(range.first)), num_registers); @@ -1946,9 +1941,9 @@ constexpr bool bounds_may_have_bounds(uint32_t reg, uint32_t last_reg) { bounds_may_have_reg( last_reg); } +XE_FORCEINLINE void D3D12CommandProcessor::WriteShaderConstantsFromMem( uint32_t start_index, uint32_t* base, uint32_t num_registers) { -#if 1 if (frame_open_) { bool cbuffer_pixel_uptodate = cbuffer_binding_float_pixel_.up_to_date; bool cbuffer_vertex_uptodate = cbuffer_binding_float_vertex_.up_to_date; @@ -1986,57 +1981,16 @@ void D3D12CommandProcessor::WriteShaderConstantsFromMem( } } } - -#if 0 - if (!cbuffer_vertex_uptodate) { - if (256 >= end_map_index) { - goto skip_map_checks; - } - map_index = 256; - } - - for (; map_index < end_map_index; ++map_index) { - uint32_t float_constant_index = map_index; - if (float_constant_index >= 256) { - float_constant_index -= 256; - if (current_float_constant_map_pixel_[float_constant_index >> 6] & - (1ull << (float_constant_index & 63))) { - cbuffer_pixel_uptodate = false; - if (!cbuffer_vertex_uptodate) { - break; - } - } - } else { - if (current_float_constant_map_vertex_[float_constant_index >> 6] & - (1ull << (float_constant_index & 63))) { - cbuffer_vertex_uptodate = false; - if (!cbuffer_pixel_uptodate) { - break; - } else { - map_index = 255; // skip to checking pixel - continue; // increment will put us at 256, then the check will - // happen - } - } - } - } - skip_map_checks:; -#endif } cbuffer_binding_float_pixel_.up_to_date = cbuffer_pixel_uptodate; cbuffer_binding_float_vertex_.up_to_date = cbuffer_vertex_uptodate; } -#else - if (frame_open_) { - cbuffer_binding_float_pixel_.up_to_date = false; - cbuffer_binding_float_vertex_.up_to_date = false; - } -#endif + // maybe use non-temporal copy if possible... copy_and_swap_32_unaligned(®ister_file_->values[start_index], base, num_registers); } - +XE_FORCEINLINE void D3D12CommandProcessor::WriteBoolLoopFromMem(uint32_t start_index, uint32_t* base, uint32_t num_registers) { @@ -2044,6 +1998,7 @@ void D3D12CommandProcessor::WriteBoolLoopFromMem(uint32_t start_index, copy_and_swap_32_unaligned(®ister_file_->values[start_index], base, num_registers); } +XE_FORCEINLINE void D3D12CommandProcessor::WriteFetchFromMem(uint32_t start_index, uint32_t* base, uint32_t num_registers) { @@ -2061,56 +2016,6 @@ void D3D12CommandProcessor::WriteFetchFromMem(uint32_t start_index, num_registers); } -void D3D12CommandProcessor::WriteRegistersFromMemCommonSense( - uint32_t start_index, uint32_t* base, uint32_t num_registers) { - uint32_t end = start_index + num_registers; - - uint32_t current_index = start_index; - - auto get_end_before_qty = [&end, current_index](uint32_t regnum) { - return std::min(regnum, end) - current_index; - }; -#define REGULAR_WRITE_CALLBACK(s, e, i, b, n) \ - copy_and_swap_32_unaligned(®ister_file_->values[i], b, n) -#define WRITE_FETCH_CONSTANTS_CALLBACK(str, er, ind, b, n) \ - WriteFetchFromMem(ind, b, n) -#define SPECIAL_REG_RANGE_CALLBACK(str, edr, ind, bs, n) \ - WritePossiblySpecialRegistersFromMem(ind, bs, n) -#define WRITE_SHADER_CONSTANTS_CALLBACK(start_range, end_range, index, base, \ - n) \ - WriteShaderConstantsFromMem(index, base, n) -#define WRITE_BOOL_LOOP_CALLBACK(s, e, i, b, n) WriteBoolLoopFromMem(i, b, n) - -#define DO_A_RANGE(start_range, end_range, cb) \ - if (current_index < (end_range)) { \ - uint32_t ntowrite = get_end_before_qty(end_range); \ - cb((start_range), (end_range), current_index, base, ntowrite); \ - current_index += ntowrite; \ - base += ntowrite; \ - } \ - if (current_index >= end) { \ - return; \ - } - - DO_A_RANGE(0, XE_GPU_REG_SCRATCH_REG0, REGULAR_WRITE_CALLBACK); - - DO_A_RANGE(XE_GPU_REG_SCRATCH_REG0, XE_GPU_REG_DC_LUT_30_COLOR + 1, - SPECIAL_REG_RANGE_CALLBACK); - DO_A_RANGE(XE_GPU_REG_DC_LUT_30_COLOR + 1, XE_GPU_REG_SHADER_CONSTANT_000_X, - REGULAR_WRITE_CALLBACK); - - DO_A_RANGE(XE_GPU_REG_SHADER_CONSTANT_000_X, - XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0, - WRITE_SHADER_CONSTANTS_CALLBACK); - - DO_A_RANGE(XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0, - XE_GPU_REG_SHADER_CONSTANT_FETCH_31_5 + 1, - WRITE_FETCH_CONSTANTS_CALLBACK); - DO_A_RANGE(XE_GPU_REG_SHADER_CONSTANT_BOOL_000_031, - XE_GPU_REG_SHADER_CONSTANT_LOOP_31 + 1, WRITE_BOOL_LOOP_CALLBACK); - DO_A_RANGE(XE_GPU_REG_SHADER_CONSTANT_LOOP_31 + 1, 65536, - REGULAR_WRITE_CALLBACK); -} void D3D12CommandProcessor::WritePossiblySpecialRegistersFromMem( uint32_t start_index, uint32_t* base, uint32_t numregs) { uint32_t end = numregs + start_index; @@ -2156,15 +2061,18 @@ D3D12CommandProcessor::WriteRegisterRangeFromMem_WithKnownBound( WriteShaderConstantsFromMem(index, base, n) #define WRITE_BOOL_LOOP_CALLBACK(s, e, i, b, n) WriteBoolLoopFromMem(i, b, n) -#define DO_A_RANGE(start_range, end_range, cb) \ - if (current_index < (end_range)) { \ - uint32_t ntowrite = get_end_before_qty(end_range); \ - cb((start_range), (end_range), current_index, base, ntowrite); \ - current_index += ntowrite; \ - base += ntowrite; \ - } \ - if (current_index >= end) { \ - return; \ +#define DO_A_RANGE(start_range, end_range, cb) \ + if constexpr (start_range >= register_lower_bound || \ + end_range > register_lower_bound) { \ + if (current_index < (end_range)) { \ + uint32_t ntowrite = get_end_before_qty(end_range); \ + cb((start_range), (end_range), current_index, base, ntowrite); \ + current_index += ntowrite; \ + base += ntowrite; \ + } \ + if (current_index >= end) { \ + return; \ + } \ } #define REFRESH_MSVC_RANGE() \ @@ -2172,6 +2080,7 @@ D3D12CommandProcessor::WriteRegisterRangeFromMem_WithKnownBound( current_index < register_upper_bound) REFRESH_MSVC_RANGE(); + DO_A_RANGE(0, XE_GPU_REG_SCRATCH_REG0, REGULAR_WRITE_CALLBACK); REFRESH_MSVC_RANGE(); DO_A_RANGE(XE_GPU_REG_SCRATCH_REG0, XE_GPU_REG_DC_LUT_30_COLOR + 1, diff --git a/src/xenia/gpu/d3d12/d3d12_command_processor.h b/src/xenia/gpu/d3d12/d3d12_command_processor.h index 7bf11c9b7..1fbfba23c 100644 --- a/src/xenia/gpu/d3d12/d3d12_command_processor.h +++ b/src/xenia/gpu/d3d12/d3d12_command_processor.h @@ -214,18 +214,19 @@ class D3D12CommandProcessor final : public CommandProcessor { XE_FORCEINLINE void WriteRegisterForceinline(uint32_t index, uint32_t value); void WriteRegister(uint32_t index, uint32_t value) override; - XE_FORCEINLINE + virtual void WriteRegistersFromMem(uint32_t start_index, uint32_t* base, uint32_t num_registers) override; - //SHADER_CONSTANT_blah_XWYZ + /*helper functions for WriteRegistersFromMem*/ + XE_FORCEINLINE void WriteShaderConstantsFromMem(uint32_t start_index, uint32_t* base, - uint32_t num_registers); + uint32_t num_registers); + XE_FORCEINLINE void WriteBoolLoopFromMem(uint32_t start_index, uint32_t* base, uint32_t num_registers); + XE_FORCEINLINE void WriteFetchFromMem(uint32_t start_index, uint32_t* base, uint32_t num_registers); - void WriteRegistersFromMemCommonSense(uint32_t start_index, uint32_t* base, - uint32_t num_registers) ; void WritePossiblySpecialRegistersFromMem(uint32_t start_index, uint32_t* base, uint32_t num_registers); diff --git a/src/xenia/gpu/pm4_command_processor_implement.h b/src/xenia/gpu/pm4_command_processor_implement.h index 89dcdf7ca..4f0aaa330 100644 --- a/src/xenia/gpu/pm4_command_processor_implement.h +++ b/src/xenia/gpu/pm4_command_processor_implement.h @@ -534,7 +534,6 @@ bool COMMAND_PROCESSOR::ExecutePacketType3_WAIT_REG_MEM( return false; } } else { - //xe::threading::MaybeYield(); } } } while (!matched);