Add AVX2 codepath for copy_and_swap_32_unaligned

Use the new write-range approach in WriteRegisterRangeFromMem_WithKnownBound
This commit is contained in:
chss95cs@gmail.com 2022-12-14 07:53:21 -08:00
parent 82dcf3f951
commit ab6d9dade0
3 changed files with 163 additions and 129 deletions

View File

@ -309,16 +309,46 @@ void copy_and_swap_32_unaligned(void* dest_ptr, const void* src_ptr,
size_t count) { size_t count) {
auto dest = reinterpret_cast<uint32_t*>(dest_ptr); auto dest = reinterpret_cast<uint32_t*>(dest_ptr);
auto src = reinterpret_cast<const uint32_t*>(src_ptr); auto src = reinterpret_cast<const uint32_t*>(src_ptr);
size_t i;
  // chrispy: this optimization might backfire if our unaligned load spans two
  // cachelines... which it probably will
if (amd64::GetFeatureFlags() & amd64::kX64EmitAVX2) {
__m256i shufmask = _mm256_set_epi8(
0x0C, 0x0D, 0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B, 0x04, 0x05, 0x06, 0x07,
0x00, 0x01, 0x02, 0x03, 0x0C, 0x0D, 0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B,
0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03);
    // with vpshufb being a 0.5 reciprocal-throughput instruction, it makes the
    // most sense to double up on our iters
for (i = 0; i + 16 <= count; i += 16) {
__m256i input1 =
_mm256_loadu_si256(reinterpret_cast<const __m256i*>(&src[i]));
__m256i input2 =
_mm256_loadu_si256(reinterpret_cast<const __m256i*>(&src[i + 8]));
__m256i output1 = _mm256_shuffle_epi8(input1, shufmask);
__m256i output2 = _mm256_shuffle_epi8(input2, shufmask);
_mm256_storeu_si256(reinterpret_cast<__m256i*>(&dest[i]), output1);
_mm256_storeu_si256(reinterpret_cast<__m256i*>(&dest[i + 8]), output2);
}
for (; i + 8 <= count; i += 8) {
__m256i input =
_mm256_loadu_si256(reinterpret_cast<const __m256i*>(&src[i]));
__m256i output = _mm256_shuffle_epi8(input, shufmask);
_mm256_storeu_si256(reinterpret_cast<__m256i*>(&dest[i]), output);
}
} else {
__m128i shufmask = __m128i shufmask =
_mm_set_epi8(0x0C, 0x0D, 0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B, 0x04, 0x05, _mm_set_epi8(0x0C, 0x0D, 0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B, 0x04, 0x05,
0x06, 0x07, 0x00, 0x01, 0x02, 0x03); 0x06, 0x07, 0x00, 0x01, 0x02, 0x03);
size_t i;
for (i = 0; i + 4 <= count; i += 4) { for (i = 0; i + 4 <= count; i += 4) {
__m128i input = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&src[i])); __m128i input =
_mm_loadu_si128(reinterpret_cast<const __m128i*>(&src[i]));
__m128i output = _mm_shuffle_epi8(input, shufmask); __m128i output = _mm_shuffle_epi8(input, shufmask);
_mm_storeu_si128(reinterpret_cast<__m128i*>(&dest[i]), output); _mm_storeu_si128(reinterpret_cast<__m128i*>(&dest[i]), output);
} }
}
XE_WORKAROUND_CONSTANT_RETURN_IF(count % 4 == 0); XE_WORKAROUND_CONSTANT_RETURN_IF(count % 4 == 0);
for (; i < count; ++i) { // handle residual elements for (; i < count; ++i) { // handle residual elements
dest[i] = byte_swap(src[i]); dest[i] = byte_swap(src[i]);

View File

@ -1903,11 +1903,11 @@ void D3D12CommandProcessor::WriteRegisterRangeFromRing_WraparoundCase(
uint32_t num_regs_firstrange = uint32_t num_regs_firstrange =
static_cast<uint32_t>(range.first_length / sizeof(uint32_t)); static_cast<uint32_t>(range.first_length / sizeof(uint32_t));
D3D12CommandProcessor::WriteRegistersFromMem( D3D12CommandProcessor::WriteRegistersFromMemCommonSense(
base, reinterpret_cast<uint32_t*>(const_cast<uint8_t*>(range.first)), base, reinterpret_cast<uint32_t*>(const_cast<uint8_t*>(range.first)),
num_regs_firstrange); num_regs_firstrange);
D3D12CommandProcessor::WriteRegistersFromMem( D3D12CommandProcessor::WriteRegistersFromMemCommonSense(
base + num_regs_firstrange, base + num_regs_firstrange,
reinterpret_cast<uint32_t*>(const_cast<uint8_t*>(range.second)), reinterpret_cast<uint32_t*>(const_cast<uint8_t*>(range.second)),
num_registers - num_regs_firstrange); num_registers - num_regs_firstrange);
@ -1948,6 +1948,7 @@ constexpr bool bounds_may_have_bounds(uint32_t reg, uint32_t last_reg) {
} }
void D3D12CommandProcessor::WriteShaderConstantsFromMem( void D3D12CommandProcessor::WriteShaderConstantsFromMem(
uint32_t start_index, uint32_t* base, uint32_t num_registers) { uint32_t start_index, uint32_t* base, uint32_t num_registers) {
#if 1
if (frame_open_) { if (frame_open_) {
bool cbuffer_pixel_uptodate = cbuffer_binding_float_pixel_.up_to_date; bool cbuffer_pixel_uptodate = cbuffer_binding_float_pixel_.up_to_date;
bool cbuffer_vertex_uptodate = cbuffer_binding_float_vertex_.up_to_date; bool cbuffer_vertex_uptodate = cbuffer_binding_float_vertex_.up_to_date;
@ -1964,12 +1965,36 @@ void D3D12CommandProcessor::WriteShaderConstantsFromMem(
uint32_t map_index = (start_index - XE_GPU_REG_SHADER_CONSTANT_000_X) / 4; uint32_t map_index = (start_index - XE_GPU_REG_SHADER_CONSTANT_000_X) / 4;
uint32_t end_map_index = uint32_t end_map_index =
(start_index + num_registers - XE_GPU_REG_SHADER_CONSTANT_000_X) / 4; (start_index + num_registers - XE_GPU_REG_SHADER_CONSTANT_000_X) / 4;
if (map_index < 256 && cbuffer_vertex_uptodate) {
for (; map_index < end_map_index; ++map_index) {
if (current_float_constant_map_vertex_[map_index >> 6] &
(1ull << map_index)) {
cbuffer_vertex_uptodate = false;
break;
}
}
}
if (end_map_index > 256 && cbuffer_pixel_uptodate) {
for (; map_index < end_map_index; ++map_index) {
uint32_t float_constant_index = map_index;
float_constant_index -= 256;
if (current_float_constant_map_pixel_[float_constant_index >> 6] &
(1ull << float_constant_index)) {
cbuffer_pixel_uptodate = false;
break;
}
}
}
#if 0
if (!cbuffer_vertex_uptodate) { if (!cbuffer_vertex_uptodate) {
if (256 >= end_map_index) { if (256 >= end_map_index) {
goto skip_map_checks; goto skip_map_checks;
} }
map_index = 256; map_index = 256;
} }
for (; map_index < end_map_index; ++map_index) { for (; map_index < end_map_index; ++map_index) {
uint32_t float_constant_index = map_index; uint32_t float_constant_index = map_index;
if (float_constant_index >= 256) { if (float_constant_index >= 256) {
@ -1996,10 +2021,17 @@ void D3D12CommandProcessor::WriteShaderConstantsFromMem(
} }
} }
skip_map_checks:; skip_map_checks:;
#endif
} }
cbuffer_binding_float_pixel_.up_to_date = cbuffer_pixel_uptodate; cbuffer_binding_float_pixel_.up_to_date = cbuffer_pixel_uptodate;
cbuffer_binding_float_vertex_.up_to_date = cbuffer_vertex_uptodate; cbuffer_binding_float_vertex_.up_to_date = cbuffer_vertex_uptodate;
} }
#else
if (frame_open_) {
cbuffer_binding_float_pixel_.up_to_date = false;
cbuffer_binding_float_vertex_.up_to_date = false;
}
#endif
// maybe use non-temporal copy if possible... // maybe use non-temporal copy if possible...
copy_and_swap_32_unaligned(&register_file_->values[start_index], base, copy_and_swap_32_unaligned(&register_file_->values[start_index], base,
num_registers); num_registers);
@ -2038,10 +2070,16 @@ void D3D12CommandProcessor::WriteRegistersFromMemCommonSense(
auto get_end_before_qty = [&end, current_index](uint32_t regnum) { auto get_end_before_qty = [&end, current_index](uint32_t regnum) {
return std::min<uint32_t>(regnum, end) - current_index; return std::min<uint32_t>(regnum, end) - current_index;
}; };
#define REGULAR_WRITE_CALLBACK(s, e, i, b, n) \
#define DO_A_RANGE_CALLBACK(start_range, end_range, index, base, n) \ copy_and_swap_32_unaligned(&register_file_->values[i], b, n)
WriteRegisterRangeFromMem_WithKnownBound<(start_range), (end_range)>( \ #define WRITE_FETCH_CONSTANTS_CALLBACK(str, er, ind, b, n) \
index, base, n) WriteFetchFromMem(ind, b, n)
#define SPECIAL_REG_RANGE_CALLBACK(str, edr, ind, bs, n) \
WritePossiblySpecialRegistersFromMem(ind, bs, n)
#define WRITE_SHADER_CONSTANTS_CALLBACK(start_range, end_range, index, base, \
n) \
WriteShaderConstantsFromMem(index, base, n)
#define WRITE_BOOL_LOOP_CALLBACK(s, e, i, b, n) WriteBoolLoopFromMem(i, b, n)
#define DO_A_RANGE(start_range, end_range, cb) \ #define DO_A_RANGE(start_range, end_range, cb) \
if (current_index < (end_range)) { \ if (current_index < (end_range)) { \
@ -2054,145 +2092,108 @@ void D3D12CommandProcessor::WriteRegistersFromMemCommonSense(
return; \ return; \
} }
if (start_index >= XE_GPU_REG_SHADER_CONSTANT_000_X) { // fairly common
goto shader_vars_start;
}
#define REGULAR_WRITE_CALLBACK(s, e, i, b, n) \
copy_and_swap_32_unaligned(&register_file_->values[i], b, n)
DO_A_RANGE(0, XE_GPU_REG_SCRATCH_REG0, REGULAR_WRITE_CALLBACK); DO_A_RANGE(0, XE_GPU_REG_SCRATCH_REG0, REGULAR_WRITE_CALLBACK);
DO_A_RANGE(XE_GPU_REG_SCRATCH_REG0, XE_GPU_REG_DC_LUT_30_COLOR + 1, DO_A_RANGE(XE_GPU_REG_SCRATCH_REG0, XE_GPU_REG_DC_LUT_30_COLOR + 1,
DO_A_RANGE_CALLBACK); SPECIAL_REG_RANGE_CALLBACK);
DO_A_RANGE(XE_GPU_REG_DC_LUT_30_COLOR + 1, XE_GPU_REG_SHADER_CONSTANT_000_X, DO_A_RANGE(XE_GPU_REG_DC_LUT_30_COLOR + 1, XE_GPU_REG_SHADER_CONSTANT_000_X,
REGULAR_WRITE_CALLBACK); REGULAR_WRITE_CALLBACK);
#define WRITE_SHADER_CONSTANTS_CALLBACK(start_range, end_range, index, base, \
n) \
WriteShaderConstantsFromMem(index, base, n)
shader_vars_start:
DO_A_RANGE(XE_GPU_REG_SHADER_CONSTANT_000_X, DO_A_RANGE(XE_GPU_REG_SHADER_CONSTANT_000_X,
XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0, XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0,
WRITE_SHADER_CONSTANTS_CALLBACK); WRITE_SHADER_CONSTANTS_CALLBACK);
#define WRITE_FETCH_CONSTANTS_CALLBACK(str, er, ind, b, n) \
WriteFetchFromMem(ind, b, n)
DO_A_RANGE(XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0, DO_A_RANGE(XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0,
XE_GPU_REG_SHADER_CONSTANT_FETCH_31_5 + 1, XE_GPU_REG_SHADER_CONSTANT_FETCH_31_5 + 1,
WRITE_FETCH_CONSTANTS_CALLBACK); WRITE_FETCH_CONSTANTS_CALLBACK);
#define WRITE_BOOL_LOOP_CALLBACK(s, e, i, b, n) WriteBoolLoopFromMem(i, b, n)
DO_A_RANGE(XE_GPU_REG_SHADER_CONSTANT_BOOL_000_031, DO_A_RANGE(XE_GPU_REG_SHADER_CONSTANT_BOOL_000_031,
XE_GPU_REG_SHADER_CONSTANT_LOOP_31 + 1, WRITE_BOOL_LOOP_CALLBACK); XE_GPU_REG_SHADER_CONSTANT_LOOP_31 + 1, WRITE_BOOL_LOOP_CALLBACK);
DO_A_RANGE(XE_GPU_REG_SHADER_CONSTANT_LOOP_31 + 1, 65536, DO_A_RANGE(XE_GPU_REG_SHADER_CONSTANT_LOOP_31 + 1, 65536,
REGULAR_WRITE_CALLBACK); REGULAR_WRITE_CALLBACK);
} }
template <uint32_t register_lower_bound, uint32_t register_upper_bound> void D3D12CommandProcessor::WritePossiblySpecialRegistersFromMem(
XE_FORCEINLINE void uint32_t start_index, uint32_t* base, uint32_t numregs) {
D3D12CommandProcessor::WriteRegisterRangeFromMem_WithKnownBound( uint32_t end = numregs + start_index;
uint32_t base, uint32_t* range, uint32_t num_registers) { for (uint32_t index = start_index; index < end; ++index, ++base) {
constexpr auto bounds_has_reg = uint32_t value = xe::load_and_swap<uint32_t>(base);
bounds_may_have_reg<register_lower_bound, register_upper_bound>;
constexpr auto bounds_has_bounds =
bounds_may_have_bounds<register_lower_bound, register_upper_bound>;
bool cbuffer_pixel_uptodate = cbuffer_binding_float_pixel_.up_to_date;
bool cbuffer_vertex_uptodate = cbuffer_binding_float_vertex_.up_to_date;
bool skip_uptodate_checks =
(!cbuffer_pixel_uptodate && !cbuffer_vertex_uptodate) || (!frame_open_);
for (uint32_t i = 0; i < num_registers; ++i) {
uint32_t data = xe::load_and_swap<uint32_t>(range + i);
uint32_t index = base + i;
uint32_t value = data;
// cant if constexpr this one or we get unreferenced label errors, and if we
// move the label into the else we get errors about a jump from one if
// constexpr into another
if (register_lower_bound == 0 && register_upper_bound == 0xFFFF) {
D3D12CommandProcessor::WriteRegisterForceinline(index, value);
} else {
XE_MSVC_ASSUME(index >= register_lower_bound &&
index < register_upper_bound);
register_file_->values[index].u32 = value; register_file_->values[index].u32 = value;
unsigned expr = 0; unsigned expr = 0;
if constexpr (bounds_has_bounds(XE_GPU_REG_SCRATCH_REG0,
XE_GPU_REG_SCRATCH_REG7)) {
expr |= (index - XE_GPU_REG_SCRATCH_REG0 < 8); expr |= (index - XE_GPU_REG_SCRATCH_REG0 < 8);
}
if constexpr (bounds_has_reg(XE_GPU_REG_COHER_STATUS_HOST)) {
expr |= (index == XE_GPU_REG_COHER_STATUS_HOST); expr |= (index == XE_GPU_REG_COHER_STATUS_HOST);
}
if constexpr (bounds_has_bounds(XE_GPU_REG_DC_LUT_RW_INDEX,
XE_GPU_REG_DC_LUT_30_COLOR)) {
expr |= ((index - XE_GPU_REG_DC_LUT_RW_INDEX) <= expr |= ((index - XE_GPU_REG_DC_LUT_RW_INDEX) <=
(XE_GPU_REG_DC_LUT_30_COLOR - XE_GPU_REG_DC_LUT_RW_INDEX)); (XE_GPU_REG_DC_LUT_30_COLOR - XE_GPU_REG_DC_LUT_RW_INDEX));
}
// chrispy: reordered for msvc branch probability (assumes
// if is taken and else is not)
if (XE_LIKELY(expr == 0)) {
XE_MSVC_REORDER_BARRIER();
if (expr == 0) {
} else { } else {
HandleSpecialRegisterWrite(index, value); HandleSpecialRegisterWrite(index, value);
goto write_done;
}
XE_MSVC_ASSUME(index >= register_lower_bound &&
index < register_upper_bound);
if constexpr (bounds_has_bounds(XE_GPU_REG_SHADER_CONSTANT_000_X,
XE_GPU_REG_SHADER_CONSTANT_511_W)) {
if (index >= XE_GPU_REG_SHADER_CONSTANT_000_X &&
index <= XE_GPU_REG_SHADER_CONSTANT_511_W) {
if (!skip_uptodate_checks) {
uint32_t float_constant_index =
(index - XE_GPU_REG_SHADER_CONSTANT_000_X) >> 2;
if (float_constant_index >= 256) {
float_constant_index -= 256;
if (current_float_constant_map_pixel_[float_constant_index >> 6] &
(1ull << (float_constant_index & 63))) {
cbuffer_binding_float_pixel_.up_to_date = false;
}
} else {
if (current_float_constant_map_vertex_[float_constant_index >>
6] &
(1ull << (float_constant_index & 63))) {
cbuffer_binding_float_vertex_.up_to_date = false;
if (!cbuffer_pixel_uptodate) {
skip_uptodate_checks = true;
} }
} }
} }
} template <uint32_t register_lower_bound, uint32_t register_upper_bound>
goto write_done; XE_FORCEINLINE void
} D3D12CommandProcessor::WriteRegisterRangeFromMem_WithKnownBound(
} uint32_t start_index, uint32_t* base, uint32_t num_registers) {
XE_MSVC_ASSUME(index >= register_lower_bound && uint32_t end = start_index + num_registers;
index < register_upper_bound);
if constexpr (bounds_has_bounds(XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0,
XE_GPU_REG_SHADER_CONSTANT_FETCH_31_5)) {
if (index >= XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0 &&
index <= XE_GPU_REG_SHADER_CONSTANT_FETCH_31_5) {
cbuffer_binding_fetch_.up_to_date = false;
// texture cache is never nullptr
texture_cache_->TextureFetchConstantWritten(
(index - XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0) / 6);
goto write_done; uint32_t current_index = start_index;
}
auto get_end_before_qty = [&end, current_index](uint32_t regnum) {
return std::min<uint32_t>(regnum, end) - current_index;
};
#define REGULAR_WRITE_CALLBACK(s, e, i, b, n) \
copy_and_swap_32_unaligned(&register_file_->values[i], b, n)
#define WRITE_FETCH_CONSTANTS_CALLBACK(str, er, ind, b, n) \
WriteFetchFromMem(ind, b, n)
#define SPECIAL_REG_RANGE_CALLBACK(str, edr, ind, bs, n) \
WritePossiblySpecialRegistersFromMem(ind, bs, n)
#define WRITE_SHADER_CONSTANTS_CALLBACK(start_range, end_range, index, base, \
n) \
WriteShaderConstantsFromMem(index, base, n)
#define WRITE_BOOL_LOOP_CALLBACK(s, e, i, b, n) WriteBoolLoopFromMem(i, b, n)
#define DO_A_RANGE(start_range, end_range, cb) \
if (current_index < (end_range)) { \
uint32_t ntowrite = get_end_before_qty(end_range); \
cb((start_range), (end_range), current_index, base, ntowrite); \
current_index += ntowrite; \
base += ntowrite; \
} \
if (current_index >= end) { \
return; \
} }
XE_MSVC_ASSUME(index >= register_lower_bound && #define REFRESH_MSVC_RANGE() \
index < register_upper_bound); XE_MSVC_ASSUME(current_index >= register_lower_bound && \
if constexpr (bounds_has_bounds(XE_GPU_REG_SHADER_CONSTANT_BOOL_000_031, current_index < register_upper_bound)
XE_GPU_REG_SHADER_CONSTANT_LOOP_31)) {
if (index >= XE_GPU_REG_SHADER_CONSTANT_BOOL_000_031 && REFRESH_MSVC_RANGE();
index <= XE_GPU_REG_SHADER_CONSTANT_LOOP_31) { DO_A_RANGE(0, XE_GPU_REG_SCRATCH_REG0, REGULAR_WRITE_CALLBACK);
cbuffer_binding_bool_loop_.up_to_date = false; REFRESH_MSVC_RANGE();
goto write_done; DO_A_RANGE(XE_GPU_REG_SCRATCH_REG0, XE_GPU_REG_DC_LUT_30_COLOR + 1,
} SPECIAL_REG_RANGE_CALLBACK);
} REFRESH_MSVC_RANGE();
} DO_A_RANGE(XE_GPU_REG_DC_LUT_30_COLOR + 1, XE_GPU_REG_SHADER_CONSTANT_000_X,
write_done:; REGULAR_WRITE_CALLBACK);
} REFRESH_MSVC_RANGE();
DO_A_RANGE(XE_GPU_REG_SHADER_CONSTANT_000_X,
XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0,
WRITE_SHADER_CONSTANTS_CALLBACK);
REFRESH_MSVC_RANGE();
DO_A_RANGE(XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0,
XE_GPU_REG_SHADER_CONSTANT_FETCH_31_5 + 1,
WRITE_FETCH_CONSTANTS_CALLBACK);
REFRESH_MSVC_RANGE();
DO_A_RANGE(XE_GPU_REG_SHADER_CONSTANT_BOOL_000_031,
XE_GPU_REG_SHADER_CONSTANT_LOOP_31 + 1, WRITE_BOOL_LOOP_CALLBACK);
REFRESH_MSVC_RANGE();
DO_A_RANGE(XE_GPU_REG_SHADER_CONSTANT_LOOP_31 + 1, 65536,
REGULAR_WRITE_CALLBACK);
} }
template <uint32_t register_lower_bound, uint32_t register_upper_bound> template <uint32_t register_lower_bound, uint32_t register_upper_bound>
XE_FORCEINLINE void XE_FORCEINLINE void

View File

@ -226,6 +226,9 @@ class D3D12CommandProcessor final : public CommandProcessor {
uint32_t num_registers); uint32_t num_registers);
void WriteRegistersFromMemCommonSense(uint32_t start_index, uint32_t* base, void WriteRegistersFromMemCommonSense(uint32_t start_index, uint32_t* base,
uint32_t num_registers) ; uint32_t num_registers) ;
void WritePossiblySpecialRegistersFromMem(uint32_t start_index, uint32_t* base,
uint32_t num_registers);
template <uint32_t register_lower_bound, uint32_t register_upper_bound> template <uint32_t register_lower_bound, uint32_t register_upper_bound>
XE_FORCEINLINE void WriteRegisterRangeFromMem_WithKnownBound( XE_FORCEINLINE void WriteRegisterRangeFromMem_WithKnownBound(
uint32_t start_index, uint32_t* base, uint32_t num_registers); uint32_t start_index, uint32_t* base, uint32_t num_registers);