Cleaned up for commit, moved WriteRegistersFromMemCommonSense code into WriteRegistersFromMem
optimized copy_and_swap_32_unaligned further
This commit is contained in:
parent
754293ffc3
commit
f931c34ecb
|
@ -327,15 +327,34 @@ void copy_and_swap_32_unaligned(void* dest_ptr, const void* src_ptr,
|
|||
|
||||
__m256i output1 = _mm256_shuffle_epi8(input1, shufmask);
|
||||
__m256i output2 = _mm256_shuffle_epi8(input2, shufmask);
|
||||
|
||||
//chrispy: todo, benchmark this w/ and w/out these prefetches here on multiple machines
|
||||
//finding a good distance for prefetchw in particular is probably important
|
||||
//for when we're writing across 2 cachelines
|
||||
#if 0
|
||||
if (i + 48 <= count) {
|
||||
swcache::PrefetchNTA(&src[i + 32]);
|
||||
if (amd64::GetFeatureFlags() & amd64::kX64EmitPrefetchW) {
|
||||
swcache::PrefetchW(&dest[i + 32]);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
_mm256_storeu_si256(reinterpret_cast<__m256i*>(&dest[i]), output1);
|
||||
_mm256_storeu_si256(reinterpret_cast<__m256i*>(&dest[i + 8]), output2);
|
||||
}
|
||||
for (; i + 8 <= count; i += 8) {
|
||||
if (i + 8 <= count) {
|
||||
__m256i input =
|
||||
_mm256_loadu_si256(reinterpret_cast<const __m256i*>(&src[i]));
|
||||
__m256i output = _mm256_shuffle_epi8(input, shufmask);
|
||||
_mm256_storeu_si256(reinterpret_cast<__m256i*>(&dest[i]), output);
|
||||
i += 8;
|
||||
}
|
||||
if (i + 4 <= count) {
|
||||
__m128i input =
|
||||
_mm_loadu_si128(reinterpret_cast<const __m128i*>(&src[i]));
|
||||
__m128i output =
|
||||
_mm_shuffle_epi8(input, _mm256_castsi256_si128(shufmask));
|
||||
_mm_storeu_si128(reinterpret_cast<__m128i*>(&dest[i]), output);
|
||||
i += 4;
|
||||
}
|
||||
} else {
|
||||
__m128i shufmask =
|
||||
|
|
|
@ -1793,10 +1793,8 @@ void D3D12CommandProcessor::WriteRegister(uint32_t index, uint32_t value) {
|
|||
void D3D12CommandProcessor::WriteRegistersFromMem(uint32_t start_index,
|
||||
uint32_t* base,
|
||||
uint32_t num_registers) {
|
||||
for (uint32_t i = 0; i < num_registers; ++i) {
|
||||
uint32_t data = xe::load_and_swap<uint32_t>(base + i);
|
||||
D3D12CommandProcessor::WriteRegister(start_index + i, data);
|
||||
}
|
||||
WriteRegisterRangeFromMem_WithKnownBound<0, 0xFFFF>(start_index, base,
|
||||
num_registers);
|
||||
}
|
||||
|
||||
void D3D12CommandProcessor::WriteALURangeFromRing(xe::RingBuffer* ring,
|
||||
|
@ -1903,11 +1901,11 @@ void D3D12CommandProcessor::WriteRegisterRangeFromRing_WraparoundCase(
|
|||
uint32_t num_regs_firstrange =
|
||||
static_cast<uint32_t>(range.first_length / sizeof(uint32_t));
|
||||
|
||||
D3D12CommandProcessor::WriteRegistersFromMemCommonSense(
|
||||
D3D12CommandProcessor::WriteRegistersFromMem(
|
||||
base, reinterpret_cast<uint32_t*>(const_cast<uint8_t*>(range.first)),
|
||||
num_regs_firstrange);
|
||||
|
||||
D3D12CommandProcessor::WriteRegistersFromMemCommonSense(
|
||||
D3D12CommandProcessor::WriteRegistersFromMem(
|
||||
base + num_regs_firstrange,
|
||||
reinterpret_cast<uint32_t*>(const_cast<uint8_t*>(range.second)),
|
||||
num_registers - num_regs_firstrange);
|
||||
|
@ -1917,14 +1915,11 @@ void D3D12CommandProcessor::WriteRegisterRangeFromRing_WraparoundCase(
|
|||
void D3D12CommandProcessor::WriteRegisterRangeFromRing(xe::RingBuffer* ring,
|
||||
uint32_t base,
|
||||
uint32_t num_registers) {
|
||||
// WriteRegisterRangeFromRing_WithKnownBound<0, 0xFFFF>(ring, base,
|
||||
// num_registers);
|
||||
|
||||
RingBuffer::ReadRange range =
|
||||
ring->BeginRead(num_registers * sizeof(uint32_t));
|
||||
|
||||
XE_LIKELY_IF(!range.second) {
|
||||
WriteRegistersFromMemCommonSense(
|
||||
WriteRegistersFromMem(
|
||||
base, reinterpret_cast<uint32_t*>(const_cast<uint8_t*>(range.first)),
|
||||
num_registers);
|
||||
|
||||
|
@ -1946,9 +1941,9 @@ constexpr bool bounds_may_have_bounds(uint32_t reg, uint32_t last_reg) {
|
|||
bounds_may_have_reg<register_lower_bound, register_upper_bound>(
|
||||
last_reg);
|
||||
}
|
||||
XE_FORCEINLINE
|
||||
void D3D12CommandProcessor::WriteShaderConstantsFromMem(
|
||||
uint32_t start_index, uint32_t* base, uint32_t num_registers) {
|
||||
#if 1
|
||||
if (frame_open_) {
|
||||
bool cbuffer_pixel_uptodate = cbuffer_binding_float_pixel_.up_to_date;
|
||||
bool cbuffer_vertex_uptodate = cbuffer_binding_float_vertex_.up_to_date;
|
||||
|
@ -1986,57 +1981,16 @@ void D3D12CommandProcessor::WriteShaderConstantsFromMem(
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
#if 0
|
||||
if (!cbuffer_vertex_uptodate) {
|
||||
if (256 >= end_map_index) {
|
||||
goto skip_map_checks;
|
||||
}
|
||||
map_index = 256;
|
||||
}
|
||||
|
||||
for (; map_index < end_map_index; ++map_index) {
|
||||
uint32_t float_constant_index = map_index;
|
||||
if (float_constant_index >= 256) {
|
||||
float_constant_index -= 256;
|
||||
if (current_float_constant_map_pixel_[float_constant_index >> 6] &
|
||||
(1ull << (float_constant_index & 63))) {
|
||||
cbuffer_pixel_uptodate = false;
|
||||
if (!cbuffer_vertex_uptodate) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
if (current_float_constant_map_vertex_[float_constant_index >> 6] &
|
||||
(1ull << (float_constant_index & 63))) {
|
||||
cbuffer_vertex_uptodate = false;
|
||||
if (!cbuffer_pixel_uptodate) {
|
||||
break;
|
||||
} else {
|
||||
map_index = 255; // skip to checking pixel
|
||||
continue; // increment will put us at 256, then the check will
|
||||
// happen
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
skip_map_checks:;
|
||||
#endif
|
||||
}
|
||||
cbuffer_binding_float_pixel_.up_to_date = cbuffer_pixel_uptodate;
|
||||
cbuffer_binding_float_vertex_.up_to_date = cbuffer_vertex_uptodate;
|
||||
}
|
||||
#else
|
||||
if (frame_open_) {
|
||||
cbuffer_binding_float_pixel_.up_to_date = false;
|
||||
cbuffer_binding_float_vertex_.up_to_date = false;
|
||||
}
|
||||
#endif
|
||||
|
||||
// maybe use non-temporal copy if possible...
|
||||
copy_and_swap_32_unaligned(®ister_file_->values[start_index], base,
|
||||
num_registers);
|
||||
}
|
||||
|
||||
XE_FORCEINLINE
|
||||
void D3D12CommandProcessor::WriteBoolLoopFromMem(uint32_t start_index,
|
||||
uint32_t* base,
|
||||
uint32_t num_registers) {
|
||||
|
@ -2044,6 +1998,7 @@ void D3D12CommandProcessor::WriteBoolLoopFromMem(uint32_t start_index,
|
|||
copy_and_swap_32_unaligned(®ister_file_->values[start_index], base,
|
||||
num_registers);
|
||||
}
|
||||
XE_FORCEINLINE
|
||||
void D3D12CommandProcessor::WriteFetchFromMem(uint32_t start_index,
|
||||
uint32_t* base,
|
||||
uint32_t num_registers) {
|
||||
|
@ -2061,56 +2016,6 @@ void D3D12CommandProcessor::WriteFetchFromMem(uint32_t start_index,
|
|||
num_registers);
|
||||
}
|
||||
|
||||
void D3D12CommandProcessor::WriteRegistersFromMemCommonSense(
|
||||
uint32_t start_index, uint32_t* base, uint32_t num_registers) {
|
||||
uint32_t end = start_index + num_registers;
|
||||
|
||||
uint32_t current_index = start_index;
|
||||
|
||||
auto get_end_before_qty = [&end, current_index](uint32_t regnum) {
|
||||
return std::min<uint32_t>(regnum, end) - current_index;
|
||||
};
|
||||
#define REGULAR_WRITE_CALLBACK(s, e, i, b, n) \
|
||||
copy_and_swap_32_unaligned(®ister_file_->values[i], b, n)
|
||||
#define WRITE_FETCH_CONSTANTS_CALLBACK(str, er, ind, b, n) \
|
||||
WriteFetchFromMem(ind, b, n)
|
||||
#define SPECIAL_REG_RANGE_CALLBACK(str, edr, ind, bs, n) \
|
||||
WritePossiblySpecialRegistersFromMem(ind, bs, n)
|
||||
#define WRITE_SHADER_CONSTANTS_CALLBACK(start_range, end_range, index, base, \
|
||||
n) \
|
||||
WriteShaderConstantsFromMem(index, base, n)
|
||||
#define WRITE_BOOL_LOOP_CALLBACK(s, e, i, b, n) WriteBoolLoopFromMem(i, b, n)
|
||||
|
||||
#define DO_A_RANGE(start_range, end_range, cb) \
|
||||
if (current_index < (end_range)) { \
|
||||
uint32_t ntowrite = get_end_before_qty(end_range); \
|
||||
cb((start_range), (end_range), current_index, base, ntowrite); \
|
||||
current_index += ntowrite; \
|
||||
base += ntowrite; \
|
||||
} \
|
||||
if (current_index >= end) { \
|
||||
return; \
|
||||
}
|
||||
|
||||
DO_A_RANGE(0, XE_GPU_REG_SCRATCH_REG0, REGULAR_WRITE_CALLBACK);
|
||||
|
||||
DO_A_RANGE(XE_GPU_REG_SCRATCH_REG0, XE_GPU_REG_DC_LUT_30_COLOR + 1,
|
||||
SPECIAL_REG_RANGE_CALLBACK);
|
||||
DO_A_RANGE(XE_GPU_REG_DC_LUT_30_COLOR + 1, XE_GPU_REG_SHADER_CONSTANT_000_X,
|
||||
REGULAR_WRITE_CALLBACK);
|
||||
|
||||
DO_A_RANGE(XE_GPU_REG_SHADER_CONSTANT_000_X,
|
||||
XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0,
|
||||
WRITE_SHADER_CONSTANTS_CALLBACK);
|
||||
|
||||
DO_A_RANGE(XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0,
|
||||
XE_GPU_REG_SHADER_CONSTANT_FETCH_31_5 + 1,
|
||||
WRITE_FETCH_CONSTANTS_CALLBACK);
|
||||
DO_A_RANGE(XE_GPU_REG_SHADER_CONSTANT_BOOL_000_031,
|
||||
XE_GPU_REG_SHADER_CONSTANT_LOOP_31 + 1, WRITE_BOOL_LOOP_CALLBACK);
|
||||
DO_A_RANGE(XE_GPU_REG_SHADER_CONSTANT_LOOP_31 + 1, 65536,
|
||||
REGULAR_WRITE_CALLBACK);
|
||||
}
|
||||
void D3D12CommandProcessor::WritePossiblySpecialRegistersFromMem(
|
||||
uint32_t start_index, uint32_t* base, uint32_t numregs) {
|
||||
uint32_t end = numregs + start_index;
|
||||
|
@ -2156,15 +2061,18 @@ D3D12CommandProcessor::WriteRegisterRangeFromMem_WithKnownBound(
|
|||
WriteShaderConstantsFromMem(index, base, n)
|
||||
#define WRITE_BOOL_LOOP_CALLBACK(s, e, i, b, n) WriteBoolLoopFromMem(i, b, n)
|
||||
|
||||
#define DO_A_RANGE(start_range, end_range, cb) \
|
||||
if (current_index < (end_range)) { \
|
||||
uint32_t ntowrite = get_end_before_qty(end_range); \
|
||||
cb((start_range), (end_range), current_index, base, ntowrite); \
|
||||
current_index += ntowrite; \
|
||||
base += ntowrite; \
|
||||
} \
|
||||
if (current_index >= end) { \
|
||||
return; \
|
||||
#define DO_A_RANGE(start_range, end_range, cb) \
|
||||
if constexpr (start_range >= register_lower_bound || \
|
||||
end_range > register_lower_bound) { \
|
||||
if (current_index < (end_range)) { \
|
||||
uint32_t ntowrite = get_end_before_qty(end_range); \
|
||||
cb((start_range), (end_range), current_index, base, ntowrite); \
|
||||
current_index += ntowrite; \
|
||||
base += ntowrite; \
|
||||
} \
|
||||
if (current_index >= end) { \
|
||||
return; \
|
||||
} \
|
||||
}
|
||||
|
||||
#define REFRESH_MSVC_RANGE() \
|
||||
|
@ -2172,6 +2080,7 @@ D3D12CommandProcessor::WriteRegisterRangeFromMem_WithKnownBound(
|
|||
current_index < register_upper_bound)
|
||||
|
||||
REFRESH_MSVC_RANGE();
|
||||
|
||||
DO_A_RANGE(0, XE_GPU_REG_SCRATCH_REG0, REGULAR_WRITE_CALLBACK);
|
||||
REFRESH_MSVC_RANGE();
|
||||
DO_A_RANGE(XE_GPU_REG_SCRATCH_REG0, XE_GPU_REG_DC_LUT_30_COLOR + 1,
|
||||
|
|
|
@ -214,18 +214,19 @@ class D3D12CommandProcessor final : public CommandProcessor {
|
|||
XE_FORCEINLINE
|
||||
void WriteRegisterForceinline(uint32_t index, uint32_t value);
|
||||
void WriteRegister(uint32_t index, uint32_t value) override;
|
||||
XE_FORCEINLINE
|
||||
|
||||
virtual void WriteRegistersFromMem(uint32_t start_index, uint32_t* base,
|
||||
uint32_t num_registers) override;
|
||||
//SHADER_CONSTANT_blah_XWYZ
|
||||
/*helper functions for WriteRegistersFromMem*/
|
||||
XE_FORCEINLINE
|
||||
void WriteShaderConstantsFromMem(uint32_t start_index, uint32_t* base,
|
||||
uint32_t num_registers);
|
||||
uint32_t num_registers);
|
||||
XE_FORCEINLINE
|
||||
void WriteBoolLoopFromMem(uint32_t start_index, uint32_t* base,
|
||||
uint32_t num_registers);
|
||||
XE_FORCEINLINE
|
||||
void WriteFetchFromMem(uint32_t start_index, uint32_t* base,
|
||||
uint32_t num_registers);
|
||||
void WriteRegistersFromMemCommonSense(uint32_t start_index, uint32_t* base,
|
||||
uint32_t num_registers) ;
|
||||
|
||||
void WritePossiblySpecialRegistersFromMem(uint32_t start_index, uint32_t* base,
|
||||
uint32_t num_registers);
|
||||
|
|
|
@ -534,7 +534,6 @@ bool COMMAND_PROCESSOR::ExecutePacketType3_WAIT_REG_MEM(
|
|||
return false;
|
||||
}
|
||||
} else {
|
||||
//xe::threading::MaybeYield();
|
||||
}
|
||||
}
|
||||
} while (!matched);
|
||||
|
|
Loading…
Reference in New Issue