diff --git a/src/xenia/base/memory.cc b/src/xenia/base/memory.cc index cf6788f96..41972aae9 100644 --- a/src/xenia/base/memory.cc +++ b/src/xenia/base/memory.cc @@ -309,15 +309,45 @@ void copy_and_swap_32_unaligned(void* dest_ptr, const void* src_ptr, size_t count) { auto dest = reinterpret_cast(dest_ptr); auto src = reinterpret_cast(src_ptr); - __m128i shufmask = - _mm_set_epi8(0x0C, 0x0D, 0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B, 0x04, 0x05, - 0x06, 0x07, 0x00, 0x01, 0x02, 0x03); - size_t i; - for (i = 0; i + 4 <= count; i += 4) { - __m128i input = _mm_loadu_si128(reinterpret_cast(&src[i])); - __m128i output = _mm_shuffle_epi8(input, shufmask); - _mm_storeu_si128(reinterpret_cast<__m128i*>(&dest[i]), output); + // chrispy: this optimization mightt backfire if our unaligned load spans two + // cachelines... which it probably will + if (amd64::GetFeatureFlags() & amd64::kX64EmitAVX2) { + __m256i shufmask = _mm256_set_epi8( + 0x0C, 0x0D, 0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B, 0x04, 0x05, 0x06, 0x07, + 0x00, 0x01, 0x02, 0x03, 0x0C, 0x0D, 0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B, + 0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03); + // with vpshufb being a 0.5 through instruction, it makes the most sense to + // double up on our iters + for (i = 0; i + 16 <= count; i += 16) { + __m256i input1 = + _mm256_loadu_si256(reinterpret_cast(&src[i])); + __m256i input2 = + _mm256_loadu_si256(reinterpret_cast(&src[i + 8])); + + __m256i output1 = _mm256_shuffle_epi8(input1, shufmask); + __m256i output2 = _mm256_shuffle_epi8(input2, shufmask); + + _mm256_storeu_si256(reinterpret_cast<__m256i*>(&dest[i]), output1); + _mm256_storeu_si256(reinterpret_cast<__m256i*>(&dest[i + 8]), output2); + } + for (; i + 8 <= count; i += 8) { + __m256i input = + _mm256_loadu_si256(reinterpret_cast(&src[i])); + __m256i output = _mm256_shuffle_epi8(input, shufmask); + _mm256_storeu_si256(reinterpret_cast<__m256i*>(&dest[i]), output); + } + } else { + __m128i shufmask = + _mm_set_epi8(0x0C, 0x0D, 0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B, 0x04, 0x05, + 0x06, 0x07, 0x00, 0x01, 0x02, 0x03); + + for (i = 0; i + 4 <= count; i += 4) { + __m128i input = + _mm_loadu_si128(reinterpret_cast(&src[i])); + __m128i output = _mm_shuffle_epi8(input, shufmask); + _mm_storeu_si128(reinterpret_cast<__m128i*>(&dest[i]), output); + } } XE_WORKAROUND_CONSTANT_RETURN_IF(count % 4 == 0); for (; i < count; ++i) { // handle residual elements diff --git a/src/xenia/gpu/d3d12/d3d12_command_processor.cc b/src/xenia/gpu/d3d12/d3d12_command_processor.cc index 70b09501e..09315be09 100644 --- a/src/xenia/gpu/d3d12/d3d12_command_processor.cc +++ b/src/xenia/gpu/d3d12/d3d12_command_processor.cc @@ -1903,11 +1903,11 @@ void D3D12CommandProcessor::WriteRegisterRangeFromRing_WraparoundCase( uint32_t num_regs_firstrange = static_cast(range.first_length / sizeof(uint32_t)); - D3D12CommandProcessor::WriteRegistersFromMem( + D3D12CommandProcessor::WriteRegistersFromMemCommonSense( base, reinterpret_cast(const_cast(range.first)), num_regs_firstrange); - D3D12CommandProcessor::WriteRegistersFromMem( + D3D12CommandProcessor::WriteRegistersFromMemCommonSense( base + num_regs_firstrange, reinterpret_cast(const_cast(range.second)), num_registers - num_regs_firstrange); @@ -1948,6 +1948,7 @@ constexpr bool bounds_may_have_bounds(uint32_t reg, uint32_t last_reg) { } void D3D12CommandProcessor::WriteShaderConstantsFromMem( uint32_t start_index, uint32_t* base, uint32_t num_registers) { +#if 1 if (frame_open_) { bool cbuffer_pixel_uptodate = cbuffer_binding_float_pixel_.up_to_date; bool cbuffer_vertex_uptodate = cbuffer_binding_float_vertex_.up_to_date; @@ -1964,12 +1965,36 @@ void D3D12CommandProcessor::WriteShaderConstantsFromMem( uint32_t map_index = (start_index - XE_GPU_REG_SHADER_CONSTANT_000_X) / 4; uint32_t end_map_index = (start_index + num_registers - XE_GPU_REG_SHADER_CONSTANT_000_X) / 4; + + if (map_index < 256 && cbuffer_vertex_uptodate) { + for (; map_index < end_map_index; ++map_index) { + if (current_float_constant_map_vertex_[map_index >> 6] & + (1ull << map_index)) { + cbuffer_vertex_uptodate = false; + break; + } + } + } + if (end_map_index > 256 && cbuffer_pixel_uptodate) { + for (; map_index < end_map_index; ++map_index) { + uint32_t float_constant_index = map_index; + float_constant_index -= 256; + if (current_float_constant_map_pixel_[float_constant_index >> 6] & + (1ull << float_constant_index)) { + cbuffer_pixel_uptodate = false; + break; + } + } + } + +#if 0 if (!cbuffer_vertex_uptodate) { if (256 >= end_map_index) { goto skip_map_checks; } map_index = 256; } + for (; map_index < end_map_index; ++map_index) { uint32_t float_constant_index = map_index; if (float_constant_index >= 256) { @@ -1996,10 +2021,17 @@ void D3D12CommandProcessor::WriteShaderConstantsFromMem( } } skip_map_checks:; +#endif } cbuffer_binding_float_pixel_.up_to_date = cbuffer_pixel_uptodate; cbuffer_binding_float_vertex_.up_to_date = cbuffer_vertex_uptodate; } +#else + if (frame_open_) { + cbuffer_binding_float_pixel_.up_to_date = false; + cbuffer_binding_float_vertex_.up_to_date = false; + } +#endif // maybe use non-temporal copy if possible... copy_and_swap_32_unaligned(®ister_file_->values[start_index], base, num_registers); @@ -2038,10 +2070,16 @@ void D3D12CommandProcessor::WriteRegistersFromMemCommonSense( auto get_end_before_qty = [&end, current_index](uint32_t regnum) { return std::min(regnum, end) - current_index; }; - -#define DO_A_RANGE_CALLBACK(start_range, end_range, index, base, n) \ - WriteRegisterRangeFromMem_WithKnownBound<(start_range), (end_range)>( \ - index, base, n) +#define REGULAR_WRITE_CALLBACK(s, e, i, b, n) \ + copy_and_swap_32_unaligned(®ister_file_->values[i], b, n) +#define WRITE_FETCH_CONSTANTS_CALLBACK(str, er, ind, b, n) \ + WriteFetchFromMem(ind, b, n) +#define SPECIAL_REG_RANGE_CALLBACK(str, edr, ind, bs, n) \ + WritePossiblySpecialRegistersFromMem(ind, bs, n) +#define WRITE_SHADER_CONSTANTS_CALLBACK(start_range, end_range, index, base, \ + n) \ + WriteShaderConstantsFromMem(index, base, n) +#define WRITE_BOOL_LOOP_CALLBACK(s, e, i, b, n) WriteBoolLoopFromMem(i, b, n) #define DO_A_RANGE(start_range, end_range, cb) \ if (current_index < (end_range)) { \ @@ -2054,145 +2092,108 @@ void D3D12CommandProcessor::WriteRegistersFromMemCommonSense( return; \ } - if (start_index >= XE_GPU_REG_SHADER_CONSTANT_000_X) { // fairly common - goto shader_vars_start; - } - -#define REGULAR_WRITE_CALLBACK(s, e, i, b, n) \ - copy_and_swap_32_unaligned(®ister_file_->values[i], b, n) DO_A_RANGE(0, XE_GPU_REG_SCRATCH_REG0, REGULAR_WRITE_CALLBACK); + DO_A_RANGE(XE_GPU_REG_SCRATCH_REG0, XE_GPU_REG_DC_LUT_30_COLOR + 1, - DO_A_RANGE_CALLBACK); + SPECIAL_REG_RANGE_CALLBACK); DO_A_RANGE(XE_GPU_REG_DC_LUT_30_COLOR + 1, XE_GPU_REG_SHADER_CONSTANT_000_X, REGULAR_WRITE_CALLBACK); -#define WRITE_SHADER_CONSTANTS_CALLBACK(start_range, end_range, index, base, \ - n) \ - WriteShaderConstantsFromMem(index, base, n) -shader_vars_start: DO_A_RANGE(XE_GPU_REG_SHADER_CONSTANT_000_X, XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0, WRITE_SHADER_CONSTANTS_CALLBACK); -#define WRITE_FETCH_CONSTANTS_CALLBACK(str, er, ind, b, n) \ - WriteFetchFromMem(ind, b, n) DO_A_RANGE(XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0, XE_GPU_REG_SHADER_CONSTANT_FETCH_31_5 + 1, WRITE_FETCH_CONSTANTS_CALLBACK); -#define WRITE_BOOL_LOOP_CALLBACK(s, e, i, b, n) WriteBoolLoopFromMem(i, b, n) DO_A_RANGE(XE_GPU_REG_SHADER_CONSTANT_BOOL_000_031, XE_GPU_REG_SHADER_CONSTANT_LOOP_31 + 1, WRITE_BOOL_LOOP_CALLBACK); DO_A_RANGE(XE_GPU_REG_SHADER_CONSTANT_LOOP_31 + 1, 65536, REGULAR_WRITE_CALLBACK); } +void D3D12CommandProcessor::WritePossiblySpecialRegistersFromMem( + uint32_t start_index, uint32_t* base, uint32_t numregs) { + uint32_t end = numregs + start_index; + for (uint32_t index = start_index; index < end; ++index, ++base) { + uint32_t value = xe::load_and_swap(base); + + register_file_->values[index].u32 = value; + + unsigned expr = 0; + + expr |= (index - XE_GPU_REG_SCRATCH_REG0 < 8); + + expr |= (index == XE_GPU_REG_COHER_STATUS_HOST); + + expr |= ((index - XE_GPU_REG_DC_LUT_RW_INDEX) <= + (XE_GPU_REG_DC_LUT_30_COLOR - XE_GPU_REG_DC_LUT_RW_INDEX)); + + if (expr == 0) { + } else { + HandleSpecialRegisterWrite(index, value); + } + } +} template XE_FORCEINLINE void D3D12CommandProcessor::WriteRegisterRangeFromMem_WithKnownBound( - uint32_t base, uint32_t* range, uint32_t num_registers) { - constexpr auto bounds_has_reg = - bounds_may_have_reg; - constexpr auto bounds_has_bounds = - bounds_may_have_bounds; + uint32_t start_index, uint32_t* base, uint32_t num_registers) { + uint32_t end = start_index + num_registers; - bool cbuffer_pixel_uptodate = cbuffer_binding_float_pixel_.up_to_date; - bool cbuffer_vertex_uptodate = cbuffer_binding_float_vertex_.up_to_date; + uint32_t current_index = start_index; - bool skip_uptodate_checks = - (!cbuffer_pixel_uptodate && !cbuffer_vertex_uptodate) || (!frame_open_); - for (uint32_t i = 0; i < num_registers; ++i) { - uint32_t data = xe::load_and_swap(range + i); - uint32_t index = base + i; - uint32_t value = data; - // cant if constexpr this one or we get unreferenced label errors, and if we - // move the label into the else we get errors about a jump from one if - // constexpr into another - if (register_lower_bound == 0 && register_upper_bound == 0xFFFF) { - D3D12CommandProcessor::WriteRegisterForceinline(index, value); - } else { - XE_MSVC_ASSUME(index >= register_lower_bound && - index < register_upper_bound); - register_file_->values[index].u32 = value; + auto get_end_before_qty = [&end, current_index](uint32_t regnum) { + return std::min(regnum, end) - current_index; + }; +#define REGULAR_WRITE_CALLBACK(s, e, i, b, n) \ + copy_and_swap_32_unaligned(®ister_file_->values[i], b, n) +#define WRITE_FETCH_CONSTANTS_CALLBACK(str, er, ind, b, n) \ + WriteFetchFromMem(ind, b, n) +#define SPECIAL_REG_RANGE_CALLBACK(str, edr, ind, bs, n) \ + WritePossiblySpecialRegistersFromMem(ind, bs, n) +#define WRITE_SHADER_CONSTANTS_CALLBACK(start_range, end_range, index, base, \ + n) \ + WriteShaderConstantsFromMem(index, base, n) +#define WRITE_BOOL_LOOP_CALLBACK(s, e, i, b, n) WriteBoolLoopFromMem(i, b, n) - unsigned expr = 0; - - if constexpr (bounds_has_bounds(XE_GPU_REG_SCRATCH_REG0, - XE_GPU_REG_SCRATCH_REG7)) { - expr |= (index - XE_GPU_REG_SCRATCH_REG0 < 8); - } - if constexpr (bounds_has_reg(XE_GPU_REG_COHER_STATUS_HOST)) { - expr |= (index == XE_GPU_REG_COHER_STATUS_HOST); - } - if constexpr (bounds_has_bounds(XE_GPU_REG_DC_LUT_RW_INDEX, - XE_GPU_REG_DC_LUT_30_COLOR)) { - expr |= ((index - XE_GPU_REG_DC_LUT_RW_INDEX) <= - (XE_GPU_REG_DC_LUT_30_COLOR - XE_GPU_REG_DC_LUT_RW_INDEX)); - } - // chrispy: reordered for msvc branch probability (assumes - // if is taken and else is not) - if (XE_LIKELY(expr == 0)) { - XE_MSVC_REORDER_BARRIER(); - - } else { - HandleSpecialRegisterWrite(index, value); - goto write_done; - } - XE_MSVC_ASSUME(index >= register_lower_bound && - index < register_upper_bound); - if constexpr (bounds_has_bounds(XE_GPU_REG_SHADER_CONSTANT_000_X, - XE_GPU_REG_SHADER_CONSTANT_511_W)) { - if (index >= XE_GPU_REG_SHADER_CONSTANT_000_X && - index <= XE_GPU_REG_SHADER_CONSTANT_511_W) { - if (!skip_uptodate_checks) { - uint32_t float_constant_index = - (index - XE_GPU_REG_SHADER_CONSTANT_000_X) >> 2; - if (float_constant_index >= 256) { - float_constant_index -= 256; - if (current_float_constant_map_pixel_[float_constant_index >> 6] & - (1ull << (float_constant_index & 63))) { - cbuffer_binding_float_pixel_.up_to_date = false; - } - } else { - if (current_float_constant_map_vertex_[float_constant_index >> - 6] & - (1ull << (float_constant_index & 63))) { - cbuffer_binding_float_vertex_.up_to_date = false; - if (!cbuffer_pixel_uptodate) { - skip_uptodate_checks = true; - } - } - } - } - goto write_done; - } - } - XE_MSVC_ASSUME(index >= register_lower_bound && - index < register_upper_bound); - if constexpr (bounds_has_bounds(XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0, - XE_GPU_REG_SHADER_CONSTANT_FETCH_31_5)) { - if (index >= XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0 && - index <= XE_GPU_REG_SHADER_CONSTANT_FETCH_31_5) { - cbuffer_binding_fetch_.up_to_date = false; - // texture cache is never nullptr - texture_cache_->TextureFetchConstantWritten( - (index - XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0) / 6); - - goto write_done; - } - } - - XE_MSVC_ASSUME(index >= register_lower_bound && - index < register_upper_bound); - if constexpr (bounds_has_bounds(XE_GPU_REG_SHADER_CONSTANT_BOOL_000_031, - XE_GPU_REG_SHADER_CONSTANT_LOOP_31)) { - if (index >= XE_GPU_REG_SHADER_CONSTANT_BOOL_000_031 && - index <= XE_GPU_REG_SHADER_CONSTANT_LOOP_31) { - cbuffer_binding_bool_loop_.up_to_date = false; - goto write_done; - } - } - } - write_done:; +#define DO_A_RANGE(start_range, end_range, cb) \ + if (current_index < (end_range)) { \ + uint32_t ntowrite = get_end_before_qty(end_range); \ + cb((start_range), (end_range), current_index, base, ntowrite); \ + current_index += ntowrite; \ + base += ntowrite; \ + } \ + if (current_index >= end) { \ + return; \ } + +#define REFRESH_MSVC_RANGE() \ + XE_MSVC_ASSUME(current_index >= register_lower_bound && \ + current_index < register_upper_bound) + + REFRESH_MSVC_RANGE(); + DO_A_RANGE(0, XE_GPU_REG_SCRATCH_REG0, REGULAR_WRITE_CALLBACK); + REFRESH_MSVC_RANGE(); + DO_A_RANGE(XE_GPU_REG_SCRATCH_REG0, XE_GPU_REG_DC_LUT_30_COLOR + 1, + SPECIAL_REG_RANGE_CALLBACK); + REFRESH_MSVC_RANGE(); + DO_A_RANGE(XE_GPU_REG_DC_LUT_30_COLOR + 1, XE_GPU_REG_SHADER_CONSTANT_000_X, + REGULAR_WRITE_CALLBACK); + REFRESH_MSVC_RANGE(); + DO_A_RANGE(XE_GPU_REG_SHADER_CONSTANT_000_X, + XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0, + WRITE_SHADER_CONSTANTS_CALLBACK); + REFRESH_MSVC_RANGE(); + DO_A_RANGE(XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0, + XE_GPU_REG_SHADER_CONSTANT_FETCH_31_5 + 1, + WRITE_FETCH_CONSTANTS_CALLBACK); + REFRESH_MSVC_RANGE(); + DO_A_RANGE(XE_GPU_REG_SHADER_CONSTANT_BOOL_000_031, + XE_GPU_REG_SHADER_CONSTANT_LOOP_31 + 1, WRITE_BOOL_LOOP_CALLBACK); + REFRESH_MSVC_RANGE(); + DO_A_RANGE(XE_GPU_REG_SHADER_CONSTANT_LOOP_31 + 1, 65536, + REGULAR_WRITE_CALLBACK); + } template XE_FORCEINLINE void diff --git a/src/xenia/gpu/d3d12/d3d12_command_processor.h b/src/xenia/gpu/d3d12/d3d12_command_processor.h index 0fcd76593..7bf11c9b7 100644 --- a/src/xenia/gpu/d3d12/d3d12_command_processor.h +++ b/src/xenia/gpu/d3d12/d3d12_command_processor.h @@ -226,6 +226,9 @@ class D3D12CommandProcessor final : public CommandProcessor { uint32_t num_registers); void WriteRegistersFromMemCommonSense(uint32_t start_index, uint32_t* base, uint32_t num_registers) ; + + void WritePossiblySpecialRegistersFromMem(uint32_t start_index, uint32_t* base, + uint32_t num_registers); template XE_FORCEINLINE void WriteRegisterRangeFromMem_WithKnownBound( uint32_t start_index, uint32_t* base, uint32_t num_registers);