add avx2 codepath for copy_and_swap_32_unaligned

use the new writerange approach in WriteRegisterRangeFromMem_WithKnownBound
chss95cs@gmail.com 2022-12-14 07:53:21 -08:00
parent 82dcf3f951
commit ab6d9dade0
3 changed files with 163 additions and 129 deletions

@@ -309,15 +309,45 @@ void copy_and_swap_32_unaligned(void* dest_ptr, const void* src_ptr,
size_t count) {
auto dest = reinterpret_cast<uint32_t*>(dest_ptr);
auto src = reinterpret_cast<const uint32_t*>(src_ptr);
size_t i;
// chrispy: this optimization might backfire if our unaligned load spans two
// cachelines... which it probably will
if (amd64::GetFeatureFlags() & amd64::kX64EmitAVX2) {
__m256i shufmask = _mm256_set_epi8(
0x0C, 0x0D, 0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B, 0x04, 0x05, 0x06, 0x07,
0x00, 0x01, 0x02, 0x03, 0x0C, 0x0D, 0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B,
0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03);
// with vpshufb being a 0.5 throughput instruction, it makes the most sense to
// double up on our iterations
for (i = 0; i + 16 <= count; i += 16) {
__m256i input1 =
_mm256_loadu_si256(reinterpret_cast<const __m256i*>(&src[i]));
__m256i input2 =
_mm256_loadu_si256(reinterpret_cast<const __m256i*>(&src[i + 8]));
__m256i output1 = _mm256_shuffle_epi8(input1, shufmask);
__m256i output2 = _mm256_shuffle_epi8(input2, shufmask);
_mm256_storeu_si256(reinterpret_cast<__m256i*>(&dest[i]), output1);
_mm256_storeu_si256(reinterpret_cast<__m256i*>(&dest[i + 8]), output2);
}
for (; i + 8 <= count; i += 8) {
__m256i input =
_mm256_loadu_si256(reinterpret_cast<const __m256i*>(&src[i]));
__m256i output = _mm256_shuffle_epi8(input, shufmask);
_mm256_storeu_si256(reinterpret_cast<__m256i*>(&dest[i]), output);
}
} else {
__m128i shufmask =
_mm_set_epi8(0x0C, 0x0D, 0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B, 0x04, 0x05,
0x06, 0x07, 0x00, 0x01, 0x02, 0x03);
for (i = 0; i + 4 <= count; i += 4) {
__m128i input =
_mm_loadu_si128(reinterpret_cast<const __m128i*>(&src[i]));
__m128i output = _mm_shuffle_epi8(input, shufmask);
_mm_storeu_si128(reinterpret_cast<__m128i*>(&dest[i]), output);
}
}
for (; i < count; ++i) {  // handle residual elements
dest[i] = xe::byte_swap(src[i]);
}
}
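Since the AVX2 path introduces three step sizes (16-wide, 8-wide, then scalar), counts that straddle each boundary are the interesting cases. A minimal sketch of a validation harness against a scalar reference (byte_swap_32 and the chosen counts are ours; copy_and_swap_32_unaligned is the function above):

#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <vector>

void copy_and_swap_32_unaligned(void* dest_ptr, const void* src_ptr,
                                size_t count);  // the function above

// Scalar reference: byte-swap each 32-bit element independently.
static uint32_t byte_swap_32(uint32_t v) {
  return (v >> 24) | ((v >> 8) & 0x0000FF00u) | ((v << 8) & 0x00FF0000u) |
         (v << 24);
}

int main() {
  // 27 = 16 (main AVX2 loop) + 8 (AVX2 tail loop) + 3 (scalar residual).
  for (size_t count : {3, 8, 12, 16, 27}) {
    std::vector<uint32_t> src(count), dst(count), ref(count);
    for (size_t i = 0; i < count; ++i) {
      src[i] = static_cast<uint32_t>(0x01020304u * (i + 1));
      ref[i] = byte_swap_32(src[i]);
    }
    copy_and_swap_32_unaligned(dst.data(), src.data(), count);
    assert(std::memcmp(dst.data(), ref.data(), count * sizeof(uint32_t)) == 0);
  }
  return 0;
}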

@@ -1903,11 +1903,11 @@ void D3D12CommandProcessor::WriteRegisterRangeFromRing_WraparoundCase(
uint32_t num_regs_firstrange =
static_cast<uint32_t>(range.first_length / sizeof(uint32_t));
D3D12CommandProcessor::WriteRegistersFromMemCommonSense(
base, reinterpret_cast<uint32_t*>(const_cast<uint8_t*>(range.first)),
num_regs_firstrange);
D3D12CommandProcessor::WriteRegistersFromMemCommonSense(
base + num_regs_firstrange,
reinterpret_cast<uint32_t*>(const_cast<uint8_t*>(range.second)),
num_registers - num_regs_firstrange);
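The wraparound case splits one logical register block at the ring boundary: range.first covers the bytes up to the end of the ring, range.second resumes from its start, and the second write lands at base + num_regs_firstrange. The split arithmetic in isolation (a sketch, names ours):

#include <cstdint>

// Shape of the wraparound split above (names ours): num_registers crossing
// the end of the ring become two spans, the second starting back at the
// ring's base.
struct WrapSpans {
  uint32_t first_count;   // registers before the wrap point
  uint32_t second_count;  // registers written from the ring's start
};

inline WrapSpans SplitWrapped(uint32_t first_length_bytes,
                              uint32_t num_registers) {
  uint32_t first_count =
      first_length_bytes / static_cast<uint32_t>(sizeof(uint32_t));
  return {first_count, num_registers - first_count};
}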
@@ -1948,6 +1948,7 @@ constexpr bool bounds_may_have_bounds(uint32_t reg, uint32_t last_reg) {
}
void D3D12CommandProcessor::WriteShaderConstantsFromMem(
uint32_t start_index, uint32_t* base, uint32_t num_registers) {
#if 1
if (frame_open_) {
bool cbuffer_pixel_uptodate = cbuffer_binding_float_pixel_.up_to_date;
bool cbuffer_vertex_uptodate = cbuffer_binding_float_vertex_.up_to_date;
@@ -1964,12 +1965,36 @@ void D3D12CommandProcessor::WriteShaderConstantsFromMem(
uint32_t map_index = (start_index - XE_GPU_REG_SHADER_CONSTANT_000_X) / 4;
uint32_t end_map_index =
(start_index + num_registers - XE_GPU_REG_SHADER_CONSTANT_000_X) / 4;
if (map_index < 256 && cbuffer_vertex_uptodate) {
uint32_t vertex_end_map_index = std::min(end_map_index, 256u);
for (; map_index < vertex_end_map_index; ++map_index) {
if (current_float_constant_map_vertex_[map_index >> 6] &
(1ull << (map_index & 63))) {
cbuffer_vertex_uptodate = false;
break;
}
}
}
if (end_map_index > 256 && cbuffer_pixel_uptodate) {
for (map_index = std::max(map_index, 256u); map_index < end_map_index;
++map_index) {
uint32_t float_constant_index = map_index - 256;
if (current_float_constant_map_pixel_[float_constant_index >> 6] &
(1ull << (float_constant_index & 63))) {
cbuffer_pixel_uptodate = false;
break;
}
}
}
#if 0
if (!cbuffer_vertex_uptodate) {
if (256 >= end_map_index) {
goto skip_map_checks;
}
map_index = 256;
}
for (; map_index < end_map_index; ++map_index) {
uint32_t float_constant_index = map_index;
if (float_constant_index >= 256) {
@@ -1996,10 +2021,17 @@ void D3D12CommandProcessor::WriteShaderConstantsFromMem(
}
}
skip_map_checks:;
#endif
}
cbuffer_binding_float_pixel_.up_to_date = cbuffer_pixel_uptodate;
cbuffer_binding_float_vertex_.up_to_date = cbuffer_vertex_uptodate;
}
#else
if (frame_open_) {
cbuffer_binding_float_pixel_.up_to_date = false;
cbuffer_binding_float_vertex_.up_to_date = false;
}
#endif
// maybe use non-temporal copy if possible...
copy_and_swap_32_unaligned(&register_file_->values[start_index], base,
num_registers);
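Both scans above are probes of a 512-entry dirty map stored as two arrays of four uint64_t words (vertex constants at map indices 0-255, pixel at 256-511): constant c is dirty when bit (c & 63) of word (c >> 6) is set. The access pattern in isolation (helper name is ours):

#include <cstdint>

// 256 float constants tracked as 4 x 64-bit dirty words.
inline bool IsFloatConstantDirty(const uint64_t map[4], uint32_t c) {
  return (map[c >> 6] & (1ull << (c & 63))) != 0;
}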
@@ -2038,10 +2070,16 @@ void D3D12CommandProcessor::WriteRegistersFromMemCommonSense(
auto get_end_before_qty = [&end, &current_index](uint32_t regnum) {
return std::min<uint32_t>(regnum, end) - current_index;
};
#define REGULAR_WRITE_CALLBACK(s, e, i, b, n) \
copy_and_swap_32_unaligned(&register_file_->values[i], b, n)
#define WRITE_FETCH_CONSTANTS_CALLBACK(str, er, ind, b, n) \
WriteFetchFromMem(ind, b, n)
#define SPECIAL_REG_RANGE_CALLBACK(str, edr, ind, bs, n) \
WritePossiblySpecialRegistersFromMem(ind, bs, n)
#define WRITE_SHADER_CONSTANTS_CALLBACK(start_range, end_range, index, base, \
n) \
WriteShaderConstantsFromMem(index, base, n)
#define WRITE_BOOL_LOOP_CALLBACK(s, e, i, b, n) WriteBoolLoopFromMem(i, b, n)
#define DO_A_RANGE(start_range, end_range, cb) \
if (current_index < (end_range)) { \
@@ -2054,145 +2092,108 @@ void D3D12CommandProcessor::WriteRegistersFromMemCommonSense(
return; \
}
if (start_index >= XE_GPU_REG_SHADER_CONSTANT_000_X) { // fairly common
goto shader_vars_start;
}
DO_A_RANGE(0, XE_GPU_REG_SCRATCH_REG0, REGULAR_WRITE_CALLBACK);
DO_A_RANGE(XE_GPU_REG_SCRATCH_REG0, XE_GPU_REG_DC_LUT_30_COLOR + 1,
SPECIAL_REG_RANGE_CALLBACK);
DO_A_RANGE(XE_GPU_REG_DC_LUT_30_COLOR + 1, XE_GPU_REG_SHADER_CONSTANT_000_X,
REGULAR_WRITE_CALLBACK);
shader_vars_start:
DO_A_RANGE(XE_GPU_REG_SHADER_CONSTANT_000_X,
XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0,
WRITE_SHADER_CONSTANTS_CALLBACK);
DO_A_RANGE(XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0,
XE_GPU_REG_SHADER_CONSTANT_FETCH_31_5 + 1,
WRITE_FETCH_CONSTANTS_CALLBACK);
DO_A_RANGE(XE_GPU_REG_SHADER_CONSTANT_BOOL_000_031,
XE_GPU_REG_SHADER_CONSTANT_LOOP_31 + 1, WRITE_BOOL_LOOP_CALLBACK);
DO_A_RANGE(XE_GPU_REG_SHADER_CONSTANT_LOOP_31 + 1, 65536,
REGULAR_WRITE_CALLBACK);
}
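Each DO_A_RANGE does two things: it clamps the pending span against [start_index, end) before handing it to its callback, then returns from the whole function once current_index reaches end. The clamp-and-advance half can be written macro-free as a sketch (names ours); the early return is what keeps the real thing a macro, since a helper function cannot return from its caller:

#include <algorithm>
#include <cstdint>

// Clamp-and-dispatch half of DO_A_RANGE: write the registers that fall in
// [current_index, min(range_end, end)) and advance the cursors.
template <typename Fn>
void DispatchRange(uint32_t range_end, uint32_t end, uint32_t& current_index,
                   uint32_t*& base, Fn&& write_fn) {
  if (current_index < range_end) {
    uint32_t ntowrite = std::min(range_end, end) - current_index;
    write_fn(current_index, base, ntowrite);
    current_index += ntowrite;
    base += ntowrite;
  }
}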
void D3D12CommandProcessor::WritePossiblySpecialRegistersFromMem(
uint32_t start_index, uint32_t* base, uint32_t numregs) {
uint32_t end = numregs + start_index;
for (uint32_t index = start_index; index < end; ++index, ++base) {
uint32_t value = xe::load_and_swap<uint32_t>(base);
register_file_->values[index].u32 = value;
unsigned expr = 0;
expr |= (index - XE_GPU_REG_SCRATCH_REG0 < 8);
expr |= (index == XE_GPU_REG_COHER_STATUS_HOST);
expr |= ((index - XE_GPU_REG_DC_LUT_RW_INDEX) <=
(XE_GPU_REG_DC_LUT_30_COLOR - XE_GPU_REG_DC_LUT_RW_INDEX));
// chrispy: reordered for msvc branch probability (assumes the empty if is
// taken and the special-register else is not)
if (expr == 0) {
} else {
HandleSpecialRegisterWrite(index, value);
}
}
}
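The expr tests use the standard unsigned-subtraction range check: index - XE_GPU_REG_SCRATCH_REG0 < 8 covers all eight scratch registers in one compare, because any index below the lower bound wraps around to a huge unsigned value. Spelled out (helper name is ours):

#include <cstdint>

// True iff lo <= x <= hi, in one subtract and one compare; x < lo wraps to a
// large unsigned value and fails the test.
constexpr bool InRangeInclusive(uint32_t x, uint32_t lo, uint32_t hi) {
  return (x - lo) <= (hi - lo);
}

static_assert(InRangeInclusive(5, 5, 12), "lower edge");
static_assert(InRangeInclusive(12, 5, 12), "upper edge");
static_assert(!InRangeInclusive(4, 5, 12), "below range wraps and fails");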
template <uint32_t register_lower_bound, uint32_t register_upper_bound>
XE_FORCEINLINE void
D3D12CommandProcessor::WriteRegisterRangeFromMem_WithKnownBound(
uint32_t start_index, uint32_t* base, uint32_t num_registers) {
uint32_t end = start_index + num_registers;
uint32_t current_index = start_index;
auto get_end_before_qty = [&end, &current_index](uint32_t regnum) {
return std::min<uint32_t>(regnum, end) - current_index;
};
#define REGULAR_WRITE_CALLBACK(s, e, i, b, n) \
copy_and_swap_32_unaligned(&register_file_->values[i], b, n)
#define WRITE_FETCH_CONSTANTS_CALLBACK(str, er, ind, b, n) \
WriteFetchFromMem(ind, b, n)
#define SPECIAL_REG_RANGE_CALLBACK(str, edr, ind, bs, n) \
WritePossiblySpecialRegistersFromMem(ind, bs, n)
#define WRITE_SHADER_CONSTANTS_CALLBACK(start_range, end_range, index, base, \
n) \
WriteShaderConstantsFromMem(index, base, n)
#define WRITE_BOOL_LOOP_CALLBACK(s, e, i, b, n) WriteBoolLoopFromMem(i, b, n)
#define DO_A_RANGE(start_range, end_range, cb) \
if (current_index < (end_range)) { \
uint32_t ntowrite = get_end_before_qty(end_range); \
cb((start_range), (end_range), current_index, base, ntowrite); \
current_index += ntowrite; \
base += ntowrite; \
} \
if (current_index >= end) { \
return; \
}
#define REFRESH_MSVC_RANGE() \
XE_MSVC_ASSUME(current_index >= register_lower_bound && \
current_index < register_upper_bound)
REFRESH_MSVC_RANGE();
DO_A_RANGE(0, XE_GPU_REG_SCRATCH_REG0, REGULAR_WRITE_CALLBACK);
REFRESH_MSVC_RANGE();
DO_A_RANGE(XE_GPU_REG_SCRATCH_REG0, XE_GPU_REG_DC_LUT_30_COLOR + 1,
SPECIAL_REG_RANGE_CALLBACK);
REFRESH_MSVC_RANGE();
DO_A_RANGE(XE_GPU_REG_DC_LUT_30_COLOR + 1, XE_GPU_REG_SHADER_CONSTANT_000_X,
REGULAR_WRITE_CALLBACK);
REFRESH_MSVC_RANGE();
DO_A_RANGE(XE_GPU_REG_SHADER_CONSTANT_000_X,
XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0,
WRITE_SHADER_CONSTANTS_CALLBACK);
REFRESH_MSVC_RANGE();
DO_A_RANGE(XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0,
XE_GPU_REG_SHADER_CONSTANT_FETCH_31_5 + 1,
WRITE_FETCH_CONSTANTS_CALLBACK);
REFRESH_MSVC_RANGE();
DO_A_RANGE(XE_GPU_REG_SHADER_CONSTANT_BOOL_000_031,
XE_GPU_REG_SHADER_CONSTANT_LOOP_31 + 1, WRITE_BOOL_LOOP_CALLBACK);
REFRESH_MSVC_RANGE();
DO_A_RANGE(XE_GPU_REG_SHADER_CONSTANT_LOOP_31 + 1, 65536,
REGULAR_WRITE_CALLBACK);
}
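Carrying the bounds as template parameters is what makes this dispatch profitable: every DO_A_RANGE whose register range cannot intersect [register_lower_bound, register_upper_bound) folds away at compile time, and REFRESH_MSVC_RANGE re-asserts the invariant for MSVC between calls. A reduced model of the effect (all names ours):

#include <cstdint>

inline void HandleLow(uint32_t) {}
inline void HandleHigh(uint32_t) {}

// With the window known at compile time, ranges that cannot intersect it
// drop out of the instantiation entirely.
template <uint32_t lo, uint32_t hi>
void WriteWithKnownBound(uint32_t index) {
  constexpr uint32_t kSplit = 0x2000;
  if constexpr (lo < kSplit) {
    if (index < kSplit) {
      HandleLow(index);
      return;
    }
  }
  if constexpr (hi > kSplit) {
    HandleHigh(index);
  }
}

// e.g. WriteWithKnownBound<0x4000, 0x8000>(i) compiles to just HandleHigh(i).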

@@ -226,6 +226,9 @@ class D3D12CommandProcessor final : public CommandProcessor {
uint32_t num_registers);
void WriteRegistersFromMemCommonSense(uint32_t start_index, uint32_t* base,
uint32_t num_registers);
void WritePossiblySpecialRegistersFromMem(uint32_t start_index, uint32_t* base,
uint32_t num_registers);
template <uint32_t register_lower_bound, uint32_t register_upper_bound>
XE_FORCEINLINE void WriteRegisterRangeFromMem_WithKnownBound(
uint32_t start_index, uint32_t* base, uint32_t num_registers);