Cleaned up for commit, moved WriteRegistersFromMemCommonSense code into WriteRegistersFromMem
optimized copy_and_swap_32_unaligned further
This commit is contained in:
parent
754293ffc3
commit
f931c34ecb
|
@ -327,15 +327,34 @@ void copy_and_swap_32_unaligned(void* dest_ptr, const void* src_ptr,
|
||||||
|
|
||||||
__m256i output1 = _mm256_shuffle_epi8(input1, shufmask);
|
__m256i output1 = _mm256_shuffle_epi8(input1, shufmask);
|
||||||
__m256i output2 = _mm256_shuffle_epi8(input2, shufmask);
|
__m256i output2 = _mm256_shuffle_epi8(input2, shufmask);
|
||||||
|
//chrispy: todo, benchmark this w/ and w/out these prefetches here on multiple machines
|
||||||
|
//finding a good distance for prefetchw in particular is probably important
|
||||||
|
//for when we're writing across 2 cachelines
|
||||||
|
#if 0
|
||||||
|
if (i + 48 <= count) {
|
||||||
|
swcache::PrefetchNTA(&src[i + 32]);
|
||||||
|
if (amd64::GetFeatureFlags() & amd64::kX64EmitPrefetchW) {
|
||||||
|
swcache::PrefetchW(&dest[i + 32]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif
|
||||||
_mm256_storeu_si256(reinterpret_cast<__m256i*>(&dest[i]), output1);
|
_mm256_storeu_si256(reinterpret_cast<__m256i*>(&dest[i]), output1);
|
||||||
_mm256_storeu_si256(reinterpret_cast<__m256i*>(&dest[i + 8]), output2);
|
_mm256_storeu_si256(reinterpret_cast<__m256i*>(&dest[i + 8]), output2);
|
||||||
}
|
}
|
||||||
for (; i + 8 <= count; i += 8) {
|
if (i + 8 <= count) {
|
||||||
__m256i input =
|
__m256i input =
|
||||||
_mm256_loadu_si256(reinterpret_cast<const __m256i*>(&src[i]));
|
_mm256_loadu_si256(reinterpret_cast<const __m256i*>(&src[i]));
|
||||||
__m256i output = _mm256_shuffle_epi8(input, shufmask);
|
__m256i output = _mm256_shuffle_epi8(input, shufmask);
|
||||||
_mm256_storeu_si256(reinterpret_cast<__m256i*>(&dest[i]), output);
|
_mm256_storeu_si256(reinterpret_cast<__m256i*>(&dest[i]), output);
|
||||||
|
i += 8;
|
||||||
|
}
|
||||||
|
if (i + 4 <= count) {
|
||||||
|
__m128i input =
|
||||||
|
_mm_loadu_si128(reinterpret_cast<const __m128i*>(&src[i]));
|
||||||
|
__m128i output =
|
||||||
|
_mm_shuffle_epi8(input, _mm256_castsi256_si128(shufmask));
|
||||||
|
_mm_storeu_si128(reinterpret_cast<__m128i*>(&dest[i]), output);
|
||||||
|
i += 4;
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
__m128i shufmask =
|
__m128i shufmask =
|
||||||
|
|
|
@ -1793,10 +1793,8 @@ void D3D12CommandProcessor::WriteRegister(uint32_t index, uint32_t value) {
|
||||||
void D3D12CommandProcessor::WriteRegistersFromMem(uint32_t start_index,
|
void D3D12CommandProcessor::WriteRegistersFromMem(uint32_t start_index,
|
||||||
uint32_t* base,
|
uint32_t* base,
|
||||||
uint32_t num_registers) {
|
uint32_t num_registers) {
|
||||||
for (uint32_t i = 0; i < num_registers; ++i) {
|
WriteRegisterRangeFromMem_WithKnownBound<0, 0xFFFF>(start_index, base,
|
||||||
uint32_t data = xe::load_and_swap<uint32_t>(base + i);
|
num_registers);
|
||||||
D3D12CommandProcessor::WriteRegister(start_index + i, data);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void D3D12CommandProcessor::WriteALURangeFromRing(xe::RingBuffer* ring,
|
void D3D12CommandProcessor::WriteALURangeFromRing(xe::RingBuffer* ring,
|
||||||
|
@ -1903,11 +1901,11 @@ void D3D12CommandProcessor::WriteRegisterRangeFromRing_WraparoundCase(
|
||||||
uint32_t num_regs_firstrange =
|
uint32_t num_regs_firstrange =
|
||||||
static_cast<uint32_t>(range.first_length / sizeof(uint32_t));
|
static_cast<uint32_t>(range.first_length / sizeof(uint32_t));
|
||||||
|
|
||||||
D3D12CommandProcessor::WriteRegistersFromMemCommonSense(
|
D3D12CommandProcessor::WriteRegistersFromMem(
|
||||||
base, reinterpret_cast<uint32_t*>(const_cast<uint8_t*>(range.first)),
|
base, reinterpret_cast<uint32_t*>(const_cast<uint8_t*>(range.first)),
|
||||||
num_regs_firstrange);
|
num_regs_firstrange);
|
||||||
|
|
||||||
D3D12CommandProcessor::WriteRegistersFromMemCommonSense(
|
D3D12CommandProcessor::WriteRegistersFromMem(
|
||||||
base + num_regs_firstrange,
|
base + num_regs_firstrange,
|
||||||
reinterpret_cast<uint32_t*>(const_cast<uint8_t*>(range.second)),
|
reinterpret_cast<uint32_t*>(const_cast<uint8_t*>(range.second)),
|
||||||
num_registers - num_regs_firstrange);
|
num_registers - num_regs_firstrange);
|
||||||
|
@ -1917,14 +1915,11 @@ void D3D12CommandProcessor::WriteRegisterRangeFromRing_WraparoundCase(
|
||||||
void D3D12CommandProcessor::WriteRegisterRangeFromRing(xe::RingBuffer* ring,
|
void D3D12CommandProcessor::WriteRegisterRangeFromRing(xe::RingBuffer* ring,
|
||||||
uint32_t base,
|
uint32_t base,
|
||||||
uint32_t num_registers) {
|
uint32_t num_registers) {
|
||||||
// WriteRegisterRangeFromRing_WithKnownBound<0, 0xFFFF>(ring, base,
|
|
||||||
// num_registers);
|
|
||||||
|
|
||||||
RingBuffer::ReadRange range =
|
RingBuffer::ReadRange range =
|
||||||
ring->BeginRead(num_registers * sizeof(uint32_t));
|
ring->BeginRead(num_registers * sizeof(uint32_t));
|
||||||
|
|
||||||
XE_LIKELY_IF(!range.second) {
|
XE_LIKELY_IF(!range.second) {
|
||||||
WriteRegistersFromMemCommonSense(
|
WriteRegistersFromMem(
|
||||||
base, reinterpret_cast<uint32_t*>(const_cast<uint8_t*>(range.first)),
|
base, reinterpret_cast<uint32_t*>(const_cast<uint8_t*>(range.first)),
|
||||||
num_registers);
|
num_registers);
|
||||||
|
|
||||||
|
@ -1946,9 +1941,9 @@ constexpr bool bounds_may_have_bounds(uint32_t reg, uint32_t last_reg) {
|
||||||
bounds_may_have_reg<register_lower_bound, register_upper_bound>(
|
bounds_may_have_reg<register_lower_bound, register_upper_bound>(
|
||||||
last_reg);
|
last_reg);
|
||||||
}
|
}
|
||||||
|
XE_FORCEINLINE
|
||||||
void D3D12CommandProcessor::WriteShaderConstantsFromMem(
|
void D3D12CommandProcessor::WriteShaderConstantsFromMem(
|
||||||
uint32_t start_index, uint32_t* base, uint32_t num_registers) {
|
uint32_t start_index, uint32_t* base, uint32_t num_registers) {
|
||||||
#if 1
|
|
||||||
if (frame_open_) {
|
if (frame_open_) {
|
||||||
bool cbuffer_pixel_uptodate = cbuffer_binding_float_pixel_.up_to_date;
|
bool cbuffer_pixel_uptodate = cbuffer_binding_float_pixel_.up_to_date;
|
||||||
bool cbuffer_vertex_uptodate = cbuffer_binding_float_vertex_.up_to_date;
|
bool cbuffer_vertex_uptodate = cbuffer_binding_float_vertex_.up_to_date;
|
||||||
|
@ -1986,57 +1981,16 @@ void D3D12CommandProcessor::WriteShaderConstantsFromMem(
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#if 0
|
|
||||||
if (!cbuffer_vertex_uptodate) {
|
|
||||||
if (256 >= end_map_index) {
|
|
||||||
goto skip_map_checks;
|
|
||||||
}
|
|
||||||
map_index = 256;
|
|
||||||
}
|
|
||||||
|
|
||||||
for (; map_index < end_map_index; ++map_index) {
|
|
||||||
uint32_t float_constant_index = map_index;
|
|
||||||
if (float_constant_index >= 256) {
|
|
||||||
float_constant_index -= 256;
|
|
||||||
if (current_float_constant_map_pixel_[float_constant_index >> 6] &
|
|
||||||
(1ull << (float_constant_index & 63))) {
|
|
||||||
cbuffer_pixel_uptodate = false;
|
|
||||||
if (!cbuffer_vertex_uptodate) {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
if (current_float_constant_map_vertex_[float_constant_index >> 6] &
|
|
||||||
(1ull << (float_constant_index & 63))) {
|
|
||||||
cbuffer_vertex_uptodate = false;
|
|
||||||
if (!cbuffer_pixel_uptodate) {
|
|
||||||
break;
|
|
||||||
} else {
|
|
||||||
map_index = 255; // skip to checking pixel
|
|
||||||
continue; // increment will put us at 256, then the check will
|
|
||||||
// happen
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
skip_map_checks:;
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
cbuffer_binding_float_pixel_.up_to_date = cbuffer_pixel_uptodate;
|
cbuffer_binding_float_pixel_.up_to_date = cbuffer_pixel_uptodate;
|
||||||
cbuffer_binding_float_vertex_.up_to_date = cbuffer_vertex_uptodate;
|
cbuffer_binding_float_vertex_.up_to_date = cbuffer_vertex_uptodate;
|
||||||
}
|
}
|
||||||
#else
|
|
||||||
if (frame_open_) {
|
|
||||||
cbuffer_binding_float_pixel_.up_to_date = false;
|
|
||||||
cbuffer_binding_float_vertex_.up_to_date = false;
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
// maybe use non-temporal copy if possible...
|
// maybe use non-temporal copy if possible...
|
||||||
copy_and_swap_32_unaligned(®ister_file_->values[start_index], base,
|
copy_and_swap_32_unaligned(®ister_file_->values[start_index], base,
|
||||||
num_registers);
|
num_registers);
|
||||||
}
|
}
|
||||||
|
XE_FORCEINLINE
|
||||||
void D3D12CommandProcessor::WriteBoolLoopFromMem(uint32_t start_index,
|
void D3D12CommandProcessor::WriteBoolLoopFromMem(uint32_t start_index,
|
||||||
uint32_t* base,
|
uint32_t* base,
|
||||||
uint32_t num_registers) {
|
uint32_t num_registers) {
|
||||||
|
@ -2044,6 +1998,7 @@ void D3D12CommandProcessor::WriteBoolLoopFromMem(uint32_t start_index,
|
||||||
copy_and_swap_32_unaligned(®ister_file_->values[start_index], base,
|
copy_and_swap_32_unaligned(®ister_file_->values[start_index], base,
|
||||||
num_registers);
|
num_registers);
|
||||||
}
|
}
|
||||||
|
XE_FORCEINLINE
|
||||||
void D3D12CommandProcessor::WriteFetchFromMem(uint32_t start_index,
|
void D3D12CommandProcessor::WriteFetchFromMem(uint32_t start_index,
|
||||||
uint32_t* base,
|
uint32_t* base,
|
||||||
uint32_t num_registers) {
|
uint32_t num_registers) {
|
||||||
|
@ -2061,56 +2016,6 @@ void D3D12CommandProcessor::WriteFetchFromMem(uint32_t start_index,
|
||||||
num_registers);
|
num_registers);
|
||||||
}
|
}
|
||||||
|
|
||||||
void D3D12CommandProcessor::WriteRegistersFromMemCommonSense(
|
|
||||||
uint32_t start_index, uint32_t* base, uint32_t num_registers) {
|
|
||||||
uint32_t end = start_index + num_registers;
|
|
||||||
|
|
||||||
uint32_t current_index = start_index;
|
|
||||||
|
|
||||||
auto get_end_before_qty = [&end, current_index](uint32_t regnum) {
|
|
||||||
return std::min<uint32_t>(regnum, end) - current_index;
|
|
||||||
};
|
|
||||||
#define REGULAR_WRITE_CALLBACK(s, e, i, b, n) \
|
|
||||||
copy_and_swap_32_unaligned(®ister_file_->values[i], b, n)
|
|
||||||
#define WRITE_FETCH_CONSTANTS_CALLBACK(str, er, ind, b, n) \
|
|
||||||
WriteFetchFromMem(ind, b, n)
|
|
||||||
#define SPECIAL_REG_RANGE_CALLBACK(str, edr, ind, bs, n) \
|
|
||||||
WritePossiblySpecialRegistersFromMem(ind, bs, n)
|
|
||||||
#define WRITE_SHADER_CONSTANTS_CALLBACK(start_range, end_range, index, base, \
|
|
||||||
n) \
|
|
||||||
WriteShaderConstantsFromMem(index, base, n)
|
|
||||||
#define WRITE_BOOL_LOOP_CALLBACK(s, e, i, b, n) WriteBoolLoopFromMem(i, b, n)
|
|
||||||
|
|
||||||
#define DO_A_RANGE(start_range, end_range, cb) \
|
|
||||||
if (current_index < (end_range)) { \
|
|
||||||
uint32_t ntowrite = get_end_before_qty(end_range); \
|
|
||||||
cb((start_range), (end_range), current_index, base, ntowrite); \
|
|
||||||
current_index += ntowrite; \
|
|
||||||
base += ntowrite; \
|
|
||||||
} \
|
|
||||||
if (current_index >= end) { \
|
|
||||||
return; \
|
|
||||||
}
|
|
||||||
|
|
||||||
DO_A_RANGE(0, XE_GPU_REG_SCRATCH_REG0, REGULAR_WRITE_CALLBACK);
|
|
||||||
|
|
||||||
DO_A_RANGE(XE_GPU_REG_SCRATCH_REG0, XE_GPU_REG_DC_LUT_30_COLOR + 1,
|
|
||||||
SPECIAL_REG_RANGE_CALLBACK);
|
|
||||||
DO_A_RANGE(XE_GPU_REG_DC_LUT_30_COLOR + 1, XE_GPU_REG_SHADER_CONSTANT_000_X,
|
|
||||||
REGULAR_WRITE_CALLBACK);
|
|
||||||
|
|
||||||
DO_A_RANGE(XE_GPU_REG_SHADER_CONSTANT_000_X,
|
|
||||||
XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0,
|
|
||||||
WRITE_SHADER_CONSTANTS_CALLBACK);
|
|
||||||
|
|
||||||
DO_A_RANGE(XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0,
|
|
||||||
XE_GPU_REG_SHADER_CONSTANT_FETCH_31_5 + 1,
|
|
||||||
WRITE_FETCH_CONSTANTS_CALLBACK);
|
|
||||||
DO_A_RANGE(XE_GPU_REG_SHADER_CONSTANT_BOOL_000_031,
|
|
||||||
XE_GPU_REG_SHADER_CONSTANT_LOOP_31 + 1, WRITE_BOOL_LOOP_CALLBACK);
|
|
||||||
DO_A_RANGE(XE_GPU_REG_SHADER_CONSTANT_LOOP_31 + 1, 65536,
|
|
||||||
REGULAR_WRITE_CALLBACK);
|
|
||||||
}
|
|
||||||
void D3D12CommandProcessor::WritePossiblySpecialRegistersFromMem(
|
void D3D12CommandProcessor::WritePossiblySpecialRegistersFromMem(
|
||||||
uint32_t start_index, uint32_t* base, uint32_t numregs) {
|
uint32_t start_index, uint32_t* base, uint32_t numregs) {
|
||||||
uint32_t end = numregs + start_index;
|
uint32_t end = numregs + start_index;
|
||||||
|
@ -2156,15 +2061,18 @@ D3D12CommandProcessor::WriteRegisterRangeFromMem_WithKnownBound(
|
||||||
WriteShaderConstantsFromMem(index, base, n)
|
WriteShaderConstantsFromMem(index, base, n)
|
||||||
#define WRITE_BOOL_LOOP_CALLBACK(s, e, i, b, n) WriteBoolLoopFromMem(i, b, n)
|
#define WRITE_BOOL_LOOP_CALLBACK(s, e, i, b, n) WriteBoolLoopFromMem(i, b, n)
|
||||||
|
|
||||||
#define DO_A_RANGE(start_range, end_range, cb) \
|
#define DO_A_RANGE(start_range, end_range, cb) \
|
||||||
if (current_index < (end_range)) { \
|
if constexpr (start_range >= register_lower_bound || \
|
||||||
uint32_t ntowrite = get_end_before_qty(end_range); \
|
end_range > register_lower_bound) { \
|
||||||
cb((start_range), (end_range), current_index, base, ntowrite); \
|
if (current_index < (end_range)) { \
|
||||||
current_index += ntowrite; \
|
uint32_t ntowrite = get_end_before_qty(end_range); \
|
||||||
base += ntowrite; \
|
cb((start_range), (end_range), current_index, base, ntowrite); \
|
||||||
} \
|
current_index += ntowrite; \
|
||||||
if (current_index >= end) { \
|
base += ntowrite; \
|
||||||
return; \
|
} \
|
||||||
|
if (current_index >= end) { \
|
||||||
|
return; \
|
||||||
|
} \
|
||||||
}
|
}
|
||||||
|
|
||||||
#define REFRESH_MSVC_RANGE() \
|
#define REFRESH_MSVC_RANGE() \
|
||||||
|
@ -2172,6 +2080,7 @@ D3D12CommandProcessor::WriteRegisterRangeFromMem_WithKnownBound(
|
||||||
current_index < register_upper_bound)
|
current_index < register_upper_bound)
|
||||||
|
|
||||||
REFRESH_MSVC_RANGE();
|
REFRESH_MSVC_RANGE();
|
||||||
|
|
||||||
DO_A_RANGE(0, XE_GPU_REG_SCRATCH_REG0, REGULAR_WRITE_CALLBACK);
|
DO_A_RANGE(0, XE_GPU_REG_SCRATCH_REG0, REGULAR_WRITE_CALLBACK);
|
||||||
REFRESH_MSVC_RANGE();
|
REFRESH_MSVC_RANGE();
|
||||||
DO_A_RANGE(XE_GPU_REG_SCRATCH_REG0, XE_GPU_REG_DC_LUT_30_COLOR + 1,
|
DO_A_RANGE(XE_GPU_REG_SCRATCH_REG0, XE_GPU_REG_DC_LUT_30_COLOR + 1,
|
||||||
|
|
|
@ -214,18 +214,19 @@ class D3D12CommandProcessor final : public CommandProcessor {
|
||||||
XE_FORCEINLINE
|
XE_FORCEINLINE
|
||||||
void WriteRegisterForceinline(uint32_t index, uint32_t value);
|
void WriteRegisterForceinline(uint32_t index, uint32_t value);
|
||||||
void WriteRegister(uint32_t index, uint32_t value) override;
|
void WriteRegister(uint32_t index, uint32_t value) override;
|
||||||
XE_FORCEINLINE
|
|
||||||
virtual void WriteRegistersFromMem(uint32_t start_index, uint32_t* base,
|
virtual void WriteRegistersFromMem(uint32_t start_index, uint32_t* base,
|
||||||
uint32_t num_registers) override;
|
uint32_t num_registers) override;
|
||||||
//SHADER_CONSTANT_blah_XWYZ
|
/*helper functions for WriteRegistersFromMem*/
|
||||||
|
XE_FORCEINLINE
|
||||||
void WriteShaderConstantsFromMem(uint32_t start_index, uint32_t* base,
|
void WriteShaderConstantsFromMem(uint32_t start_index, uint32_t* base,
|
||||||
uint32_t num_registers);
|
uint32_t num_registers);
|
||||||
|
XE_FORCEINLINE
|
||||||
void WriteBoolLoopFromMem(uint32_t start_index, uint32_t* base,
|
void WriteBoolLoopFromMem(uint32_t start_index, uint32_t* base,
|
||||||
uint32_t num_registers);
|
uint32_t num_registers);
|
||||||
|
XE_FORCEINLINE
|
||||||
void WriteFetchFromMem(uint32_t start_index, uint32_t* base,
|
void WriteFetchFromMem(uint32_t start_index, uint32_t* base,
|
||||||
uint32_t num_registers);
|
uint32_t num_registers);
|
||||||
void WriteRegistersFromMemCommonSense(uint32_t start_index, uint32_t* base,
|
|
||||||
uint32_t num_registers) ;
|
|
||||||
|
|
||||||
void WritePossiblySpecialRegistersFromMem(uint32_t start_index, uint32_t* base,
|
void WritePossiblySpecialRegistersFromMem(uint32_t start_index, uint32_t* base,
|
||||||
uint32_t num_registers);
|
uint32_t num_registers);
|
||||||
|
|
|
@ -534,7 +534,6 @@ bool COMMAND_PROCESSOR::ExecutePacketType3_WAIT_REG_MEM(
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
//xe::threading::MaybeYield();
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} while (!matched);
|
} while (!matched);
|
||||||
|
|
Loading…
Reference in New Issue