Cleaned up for commit, moved WriteRegistersFromMemCommonSense code into WriteRegistersFromMem

optimized copy_and_swap_32_unaligned further
This commit is contained in:
chss95cs@gmail.com 2022-12-14 11:34:33 -08:00
parent 754293ffc3
commit f931c34ecb
4 changed files with 49 additions and 121 deletions

View File

@ -327,15 +327,34 @@ void copy_and_swap_32_unaligned(void* dest_ptr, const void* src_ptr,
__m256i output1 = _mm256_shuffle_epi8(input1, shufmask); __m256i output1 = _mm256_shuffle_epi8(input1, shufmask);
__m256i output2 = _mm256_shuffle_epi8(input2, shufmask); __m256i output2 = _mm256_shuffle_epi8(input2, shufmask);
//chrispy: todo, benchmark this w/ and w/out these prefetches here on multiple machines
//finding a good distance for prefetchw in particular is probably important
//for when we're writing across 2 cachelines
#if 0
if (i + 48 <= count) {
swcache::PrefetchNTA(&src[i + 32]);
if (amd64::GetFeatureFlags() & amd64::kX64EmitPrefetchW) {
swcache::PrefetchW(&dest[i + 32]);
}
}
#endif
_mm256_storeu_si256(reinterpret_cast<__m256i*>(&dest[i]), output1); _mm256_storeu_si256(reinterpret_cast<__m256i*>(&dest[i]), output1);
_mm256_storeu_si256(reinterpret_cast<__m256i*>(&dest[i + 8]), output2); _mm256_storeu_si256(reinterpret_cast<__m256i*>(&dest[i + 8]), output2);
} }
for (; i + 8 <= count; i += 8) { if (i + 8 <= count) {
__m256i input = __m256i input =
_mm256_loadu_si256(reinterpret_cast<const __m256i*>(&src[i])); _mm256_loadu_si256(reinterpret_cast<const __m256i*>(&src[i]));
__m256i output = _mm256_shuffle_epi8(input, shufmask); __m256i output = _mm256_shuffle_epi8(input, shufmask);
_mm256_storeu_si256(reinterpret_cast<__m256i*>(&dest[i]), output); _mm256_storeu_si256(reinterpret_cast<__m256i*>(&dest[i]), output);
i += 8;
}
if (i + 4 <= count) {
__m128i input =
_mm_loadu_si128(reinterpret_cast<const __m128i*>(&src[i]));
__m128i output =
_mm_shuffle_epi8(input, _mm256_castsi256_si128(shufmask));
_mm_storeu_si128(reinterpret_cast<__m128i*>(&dest[i]), output);
i += 4;
} }
} else { } else {
__m128i shufmask = __m128i shufmask =

View File

@ -1793,10 +1793,8 @@ void D3D12CommandProcessor::WriteRegister(uint32_t index, uint32_t value) {
void D3D12CommandProcessor::WriteRegistersFromMem(uint32_t start_index, void D3D12CommandProcessor::WriteRegistersFromMem(uint32_t start_index,
uint32_t* base, uint32_t* base,
uint32_t num_registers) { uint32_t num_registers) {
for (uint32_t i = 0; i < num_registers; ++i) { WriteRegisterRangeFromMem_WithKnownBound<0, 0xFFFF>(start_index, base,
uint32_t data = xe::load_and_swap<uint32_t>(base + i); num_registers);
D3D12CommandProcessor::WriteRegister(start_index + i, data);
}
} }
void D3D12CommandProcessor::WriteALURangeFromRing(xe::RingBuffer* ring, void D3D12CommandProcessor::WriteALURangeFromRing(xe::RingBuffer* ring,
@ -1903,11 +1901,11 @@ void D3D12CommandProcessor::WriteRegisterRangeFromRing_WraparoundCase(
uint32_t num_regs_firstrange = uint32_t num_regs_firstrange =
static_cast<uint32_t>(range.first_length / sizeof(uint32_t)); static_cast<uint32_t>(range.first_length / sizeof(uint32_t));
D3D12CommandProcessor::WriteRegistersFromMemCommonSense( D3D12CommandProcessor::WriteRegistersFromMem(
base, reinterpret_cast<uint32_t*>(const_cast<uint8_t*>(range.first)), base, reinterpret_cast<uint32_t*>(const_cast<uint8_t*>(range.first)),
num_regs_firstrange); num_regs_firstrange);
D3D12CommandProcessor::WriteRegistersFromMemCommonSense( D3D12CommandProcessor::WriteRegistersFromMem(
base + num_regs_firstrange, base + num_regs_firstrange,
reinterpret_cast<uint32_t*>(const_cast<uint8_t*>(range.second)), reinterpret_cast<uint32_t*>(const_cast<uint8_t*>(range.second)),
num_registers - num_regs_firstrange); num_registers - num_regs_firstrange);
@ -1917,14 +1915,11 @@ void D3D12CommandProcessor::WriteRegisterRangeFromRing_WraparoundCase(
void D3D12CommandProcessor::WriteRegisterRangeFromRing(xe::RingBuffer* ring, void D3D12CommandProcessor::WriteRegisterRangeFromRing(xe::RingBuffer* ring,
uint32_t base, uint32_t base,
uint32_t num_registers) { uint32_t num_registers) {
// WriteRegisterRangeFromRing_WithKnownBound<0, 0xFFFF>(ring, base,
// num_registers);
RingBuffer::ReadRange range = RingBuffer::ReadRange range =
ring->BeginRead(num_registers * sizeof(uint32_t)); ring->BeginRead(num_registers * sizeof(uint32_t));
XE_LIKELY_IF(!range.second) { XE_LIKELY_IF(!range.second) {
WriteRegistersFromMemCommonSense( WriteRegistersFromMem(
base, reinterpret_cast<uint32_t*>(const_cast<uint8_t*>(range.first)), base, reinterpret_cast<uint32_t*>(const_cast<uint8_t*>(range.first)),
num_registers); num_registers);
@ -1946,9 +1941,9 @@ constexpr bool bounds_may_have_bounds(uint32_t reg, uint32_t last_reg) {
bounds_may_have_reg<register_lower_bound, register_upper_bound>( bounds_may_have_reg<register_lower_bound, register_upper_bound>(
last_reg); last_reg);
} }
XE_FORCEINLINE
void D3D12CommandProcessor::WriteShaderConstantsFromMem( void D3D12CommandProcessor::WriteShaderConstantsFromMem(
uint32_t start_index, uint32_t* base, uint32_t num_registers) { uint32_t start_index, uint32_t* base, uint32_t num_registers) {
#if 1
if (frame_open_) { if (frame_open_) {
bool cbuffer_pixel_uptodate = cbuffer_binding_float_pixel_.up_to_date; bool cbuffer_pixel_uptodate = cbuffer_binding_float_pixel_.up_to_date;
bool cbuffer_vertex_uptodate = cbuffer_binding_float_vertex_.up_to_date; bool cbuffer_vertex_uptodate = cbuffer_binding_float_vertex_.up_to_date;
@ -1986,57 +1981,16 @@ void D3D12CommandProcessor::WriteShaderConstantsFromMem(
} }
} }
} }
#if 0
if (!cbuffer_vertex_uptodate) {
if (256 >= end_map_index) {
goto skip_map_checks;
}
map_index = 256;
}
for (; map_index < end_map_index; ++map_index) {
uint32_t float_constant_index = map_index;
if (float_constant_index >= 256) {
float_constant_index -= 256;
if (current_float_constant_map_pixel_[float_constant_index >> 6] &
(1ull << (float_constant_index & 63))) {
cbuffer_pixel_uptodate = false;
if (!cbuffer_vertex_uptodate) {
break;
}
}
} else {
if (current_float_constant_map_vertex_[float_constant_index >> 6] &
(1ull << (float_constant_index & 63))) {
cbuffer_vertex_uptodate = false;
if (!cbuffer_pixel_uptodate) {
break;
} else {
map_index = 255; // skip to checking pixel
continue; // increment will put us at 256, then the check will
// happen
}
}
}
}
skip_map_checks:;
#endif
} }
cbuffer_binding_float_pixel_.up_to_date = cbuffer_pixel_uptodate; cbuffer_binding_float_pixel_.up_to_date = cbuffer_pixel_uptodate;
cbuffer_binding_float_vertex_.up_to_date = cbuffer_vertex_uptodate; cbuffer_binding_float_vertex_.up_to_date = cbuffer_vertex_uptodate;
} }
#else
if (frame_open_) {
cbuffer_binding_float_pixel_.up_to_date = false;
cbuffer_binding_float_vertex_.up_to_date = false;
}
#endif
// maybe use non-temporal copy if possible... // maybe use non-temporal copy if possible...
copy_and_swap_32_unaligned(&register_file_->values[start_index], base, copy_and_swap_32_unaligned(&register_file_->values[start_index], base,
num_registers); num_registers);
} }
XE_FORCEINLINE
void D3D12CommandProcessor::WriteBoolLoopFromMem(uint32_t start_index, void D3D12CommandProcessor::WriteBoolLoopFromMem(uint32_t start_index,
uint32_t* base, uint32_t* base,
uint32_t num_registers) { uint32_t num_registers) {
@ -2044,6 +1998,7 @@ void D3D12CommandProcessor::WriteBoolLoopFromMem(uint32_t start_index,
copy_and_swap_32_unaligned(&register_file_->values[start_index], base, copy_and_swap_32_unaligned(&register_file_->values[start_index], base,
num_registers); num_registers);
} }
XE_FORCEINLINE
void D3D12CommandProcessor::WriteFetchFromMem(uint32_t start_index, void D3D12CommandProcessor::WriteFetchFromMem(uint32_t start_index,
uint32_t* base, uint32_t* base,
uint32_t num_registers) { uint32_t num_registers) {
@ -2061,56 +2016,6 @@ void D3D12CommandProcessor::WriteFetchFromMem(uint32_t start_index,
num_registers); num_registers);
} }
void D3D12CommandProcessor::WriteRegistersFromMemCommonSense(
uint32_t start_index, uint32_t* base, uint32_t num_registers) {
uint32_t end = start_index + num_registers;
uint32_t current_index = start_index;
auto get_end_before_qty = [&end, current_index](uint32_t regnum) {
return std::min<uint32_t>(regnum, end) - current_index;
};
#define REGULAR_WRITE_CALLBACK(s, e, i, b, n) \
copy_and_swap_32_unaligned(&register_file_->values[i], b, n)
#define WRITE_FETCH_CONSTANTS_CALLBACK(str, er, ind, b, n) \
WriteFetchFromMem(ind, b, n)
#define SPECIAL_REG_RANGE_CALLBACK(str, edr, ind, bs, n) \
WritePossiblySpecialRegistersFromMem(ind, bs, n)
#define WRITE_SHADER_CONSTANTS_CALLBACK(start_range, end_range, index, base, \
n) \
WriteShaderConstantsFromMem(index, base, n)
#define WRITE_BOOL_LOOP_CALLBACK(s, e, i, b, n) WriteBoolLoopFromMem(i, b, n)
#define DO_A_RANGE(start_range, end_range, cb) \
if (current_index < (end_range)) { \
uint32_t ntowrite = get_end_before_qty(end_range); \
cb((start_range), (end_range), current_index, base, ntowrite); \
current_index += ntowrite; \
base += ntowrite; \
} \
if (current_index >= end) { \
return; \
}
DO_A_RANGE(0, XE_GPU_REG_SCRATCH_REG0, REGULAR_WRITE_CALLBACK);
DO_A_RANGE(XE_GPU_REG_SCRATCH_REG0, XE_GPU_REG_DC_LUT_30_COLOR + 1,
SPECIAL_REG_RANGE_CALLBACK);
DO_A_RANGE(XE_GPU_REG_DC_LUT_30_COLOR + 1, XE_GPU_REG_SHADER_CONSTANT_000_X,
REGULAR_WRITE_CALLBACK);
DO_A_RANGE(XE_GPU_REG_SHADER_CONSTANT_000_X,
XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0,
WRITE_SHADER_CONSTANTS_CALLBACK);
DO_A_RANGE(XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0,
XE_GPU_REG_SHADER_CONSTANT_FETCH_31_5 + 1,
WRITE_FETCH_CONSTANTS_CALLBACK);
DO_A_RANGE(XE_GPU_REG_SHADER_CONSTANT_BOOL_000_031,
XE_GPU_REG_SHADER_CONSTANT_LOOP_31 + 1, WRITE_BOOL_LOOP_CALLBACK);
DO_A_RANGE(XE_GPU_REG_SHADER_CONSTANT_LOOP_31 + 1, 65536,
REGULAR_WRITE_CALLBACK);
}
void D3D12CommandProcessor::WritePossiblySpecialRegistersFromMem( void D3D12CommandProcessor::WritePossiblySpecialRegistersFromMem(
uint32_t start_index, uint32_t* base, uint32_t numregs) { uint32_t start_index, uint32_t* base, uint32_t numregs) {
uint32_t end = numregs + start_index; uint32_t end = numregs + start_index;
@ -2156,15 +2061,18 @@ D3D12CommandProcessor::WriteRegisterRangeFromMem_WithKnownBound(
WriteShaderConstantsFromMem(index, base, n) WriteShaderConstantsFromMem(index, base, n)
#define WRITE_BOOL_LOOP_CALLBACK(s, e, i, b, n) WriteBoolLoopFromMem(i, b, n) #define WRITE_BOOL_LOOP_CALLBACK(s, e, i, b, n) WriteBoolLoopFromMem(i, b, n)
#define DO_A_RANGE(start_range, end_range, cb) \ #define DO_A_RANGE(start_range, end_range, cb) \
if (current_index < (end_range)) { \ if constexpr (start_range >= register_lower_bound || \
uint32_t ntowrite = get_end_before_qty(end_range); \ end_range > register_lower_bound) { \
cb((start_range), (end_range), current_index, base, ntowrite); \ if (current_index < (end_range)) { \
current_index += ntowrite; \ uint32_t ntowrite = get_end_before_qty(end_range); \
base += ntowrite; \ cb((start_range), (end_range), current_index, base, ntowrite); \
} \ current_index += ntowrite; \
if (current_index >= end) { \ base += ntowrite; \
return; \ } \
if (current_index >= end) { \
return; \
} \
} }
#define REFRESH_MSVC_RANGE() \ #define REFRESH_MSVC_RANGE() \
@ -2172,6 +2080,7 @@ D3D12CommandProcessor::WriteRegisterRangeFromMem_WithKnownBound(
current_index < register_upper_bound) current_index < register_upper_bound)
REFRESH_MSVC_RANGE(); REFRESH_MSVC_RANGE();
DO_A_RANGE(0, XE_GPU_REG_SCRATCH_REG0, REGULAR_WRITE_CALLBACK); DO_A_RANGE(0, XE_GPU_REG_SCRATCH_REG0, REGULAR_WRITE_CALLBACK);
REFRESH_MSVC_RANGE(); REFRESH_MSVC_RANGE();
DO_A_RANGE(XE_GPU_REG_SCRATCH_REG0, XE_GPU_REG_DC_LUT_30_COLOR + 1, DO_A_RANGE(XE_GPU_REG_SCRATCH_REG0, XE_GPU_REG_DC_LUT_30_COLOR + 1,

View File

@ -214,18 +214,19 @@ class D3D12CommandProcessor final : public CommandProcessor {
XE_FORCEINLINE XE_FORCEINLINE
void WriteRegisterForceinline(uint32_t index, uint32_t value); void WriteRegisterForceinline(uint32_t index, uint32_t value);
void WriteRegister(uint32_t index, uint32_t value) override; void WriteRegister(uint32_t index, uint32_t value) override;
XE_FORCEINLINE
virtual void WriteRegistersFromMem(uint32_t start_index, uint32_t* base, virtual void WriteRegistersFromMem(uint32_t start_index, uint32_t* base,
uint32_t num_registers) override; uint32_t num_registers) override;
//SHADER_CONSTANT_blah_XWYZ /*helper functions for WriteRegistersFromMem*/
XE_FORCEINLINE
void WriteShaderConstantsFromMem(uint32_t start_index, uint32_t* base, void WriteShaderConstantsFromMem(uint32_t start_index, uint32_t* base,
uint32_t num_registers); uint32_t num_registers);
XE_FORCEINLINE
void WriteBoolLoopFromMem(uint32_t start_index, uint32_t* base, void WriteBoolLoopFromMem(uint32_t start_index, uint32_t* base,
uint32_t num_registers); uint32_t num_registers);
XE_FORCEINLINE
void WriteFetchFromMem(uint32_t start_index, uint32_t* base, void WriteFetchFromMem(uint32_t start_index, uint32_t* base,
uint32_t num_registers); uint32_t num_registers);
void WriteRegistersFromMemCommonSense(uint32_t start_index, uint32_t* base,
uint32_t num_registers) ;
void WritePossiblySpecialRegistersFromMem(uint32_t start_index, uint32_t* base, void WritePossiblySpecialRegistersFromMem(uint32_t start_index, uint32_t* base,
uint32_t num_registers); uint32_t num_registers);

View File

@ -534,7 +534,6 @@ bool COMMAND_PROCESSOR::ExecutePacketType3_WAIT_REG_MEM(
return false; return false;
} }
} else { } else {
//xe::threading::MaybeYield();
} }
} }
} while (!matched); } while (!matched);