add avx2 codepath for copy_and_swap_32_unaligned
use the new writerange approach in WriteRegisterRangeFromMem_WithKnownBound
This commit is contained in:
parent
82dcf3f951
commit
ab6d9dade0
|
@ -309,15 +309,45 @@ void copy_and_swap_32_unaligned(void* dest_ptr, const void* src_ptr,
|
|||
size_t count) {
|
||||
auto dest = reinterpret_cast<uint32_t*>(dest_ptr);
|
||||
auto src = reinterpret_cast<const uint32_t*>(src_ptr);
|
||||
__m128i shufmask =
|
||||
_mm_set_epi8(0x0C, 0x0D, 0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B, 0x04, 0x05,
|
||||
0x06, 0x07, 0x00, 0x01, 0x02, 0x03);
|
||||
|
||||
size_t i;
|
||||
for (i = 0; i + 4 <= count; i += 4) {
|
||||
__m128i input = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&src[i]));
|
||||
__m128i output = _mm_shuffle_epi8(input, shufmask);
|
||||
_mm_storeu_si128(reinterpret_cast<__m128i*>(&dest[i]), output);
|
||||
// chrispy: this optimization mightt backfire if our unaligned load spans two
|
||||
// cachelines... which it probably will
|
||||
if (amd64::GetFeatureFlags() & amd64::kX64EmitAVX2) {
|
||||
__m256i shufmask = _mm256_set_epi8(
|
||||
0x0C, 0x0D, 0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B, 0x04, 0x05, 0x06, 0x07,
|
||||
0x00, 0x01, 0x02, 0x03, 0x0C, 0x0D, 0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B,
|
||||
0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03);
|
||||
// with vpshufb being a 0.5 through instruction, it makes the most sense to
|
||||
// double up on our iters
|
||||
for (i = 0; i + 16 <= count; i += 16) {
|
||||
__m256i input1 =
|
||||
_mm256_loadu_si256(reinterpret_cast<const __m256i*>(&src[i]));
|
||||
__m256i input2 =
|
||||
_mm256_loadu_si256(reinterpret_cast<const __m256i*>(&src[i + 8]));
|
||||
|
||||
__m256i output1 = _mm256_shuffle_epi8(input1, shufmask);
|
||||
__m256i output2 = _mm256_shuffle_epi8(input2, shufmask);
|
||||
|
||||
_mm256_storeu_si256(reinterpret_cast<__m256i*>(&dest[i]), output1);
|
||||
_mm256_storeu_si256(reinterpret_cast<__m256i*>(&dest[i + 8]), output2);
|
||||
}
|
||||
for (; i + 8 <= count; i += 8) {
|
||||
__m256i input =
|
||||
_mm256_loadu_si256(reinterpret_cast<const __m256i*>(&src[i]));
|
||||
__m256i output = _mm256_shuffle_epi8(input, shufmask);
|
||||
_mm256_storeu_si256(reinterpret_cast<__m256i*>(&dest[i]), output);
|
||||
}
|
||||
} else {
|
||||
__m128i shufmask =
|
||||
_mm_set_epi8(0x0C, 0x0D, 0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B, 0x04, 0x05,
|
||||
0x06, 0x07, 0x00, 0x01, 0x02, 0x03);
|
||||
|
||||
for (i = 0; i + 4 <= count; i += 4) {
|
||||
__m128i input =
|
||||
_mm_loadu_si128(reinterpret_cast<const __m128i*>(&src[i]));
|
||||
__m128i output = _mm_shuffle_epi8(input, shufmask);
|
||||
_mm_storeu_si128(reinterpret_cast<__m128i*>(&dest[i]), output);
|
||||
}
|
||||
}
|
||||
XE_WORKAROUND_CONSTANT_RETURN_IF(count % 4 == 0);
|
||||
for (; i < count; ++i) { // handle residual elements
|
||||
|
|
|
@ -1903,11 +1903,11 @@ void D3D12CommandProcessor::WriteRegisterRangeFromRing_WraparoundCase(
|
|||
uint32_t num_regs_firstrange =
|
||||
static_cast<uint32_t>(range.first_length / sizeof(uint32_t));
|
||||
|
||||
D3D12CommandProcessor::WriteRegistersFromMem(
|
||||
D3D12CommandProcessor::WriteRegistersFromMemCommonSense(
|
||||
base, reinterpret_cast<uint32_t*>(const_cast<uint8_t*>(range.first)),
|
||||
num_regs_firstrange);
|
||||
|
||||
D3D12CommandProcessor::WriteRegistersFromMem(
|
||||
D3D12CommandProcessor::WriteRegistersFromMemCommonSense(
|
||||
base + num_regs_firstrange,
|
||||
reinterpret_cast<uint32_t*>(const_cast<uint8_t*>(range.second)),
|
||||
num_registers - num_regs_firstrange);
|
||||
|
@ -1948,6 +1948,7 @@ constexpr bool bounds_may_have_bounds(uint32_t reg, uint32_t last_reg) {
|
|||
}
|
||||
void D3D12CommandProcessor::WriteShaderConstantsFromMem(
|
||||
uint32_t start_index, uint32_t* base, uint32_t num_registers) {
|
||||
#if 1
|
||||
if (frame_open_) {
|
||||
bool cbuffer_pixel_uptodate = cbuffer_binding_float_pixel_.up_to_date;
|
||||
bool cbuffer_vertex_uptodate = cbuffer_binding_float_vertex_.up_to_date;
|
||||
|
@ -1964,12 +1965,36 @@ void D3D12CommandProcessor::WriteShaderConstantsFromMem(
|
|||
uint32_t map_index = (start_index - XE_GPU_REG_SHADER_CONSTANT_000_X) / 4;
|
||||
uint32_t end_map_index =
|
||||
(start_index + num_registers - XE_GPU_REG_SHADER_CONSTANT_000_X) / 4;
|
||||
|
||||
if (map_index < 256 && cbuffer_vertex_uptodate) {
|
||||
for (; map_index < end_map_index; ++map_index) {
|
||||
if (current_float_constant_map_vertex_[map_index >> 6] &
|
||||
(1ull << map_index)) {
|
||||
cbuffer_vertex_uptodate = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (end_map_index > 256 && cbuffer_pixel_uptodate) {
|
||||
for (; map_index < end_map_index; ++map_index) {
|
||||
uint32_t float_constant_index = map_index;
|
||||
float_constant_index -= 256;
|
||||
if (current_float_constant_map_pixel_[float_constant_index >> 6] &
|
||||
(1ull << float_constant_index)) {
|
||||
cbuffer_pixel_uptodate = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#if 0
|
||||
if (!cbuffer_vertex_uptodate) {
|
||||
if (256 >= end_map_index) {
|
||||
goto skip_map_checks;
|
||||
}
|
||||
map_index = 256;
|
||||
}
|
||||
|
||||
for (; map_index < end_map_index; ++map_index) {
|
||||
uint32_t float_constant_index = map_index;
|
||||
if (float_constant_index >= 256) {
|
||||
|
@ -1996,10 +2021,17 @@ void D3D12CommandProcessor::WriteShaderConstantsFromMem(
|
|||
}
|
||||
}
|
||||
skip_map_checks:;
|
||||
#endif
|
||||
}
|
||||
cbuffer_binding_float_pixel_.up_to_date = cbuffer_pixel_uptodate;
|
||||
cbuffer_binding_float_vertex_.up_to_date = cbuffer_vertex_uptodate;
|
||||
}
|
||||
#else
|
||||
if (frame_open_) {
|
||||
cbuffer_binding_float_pixel_.up_to_date = false;
|
||||
cbuffer_binding_float_vertex_.up_to_date = false;
|
||||
}
|
||||
#endif
|
||||
// maybe use non-temporal copy if possible...
|
||||
copy_and_swap_32_unaligned(®ister_file_->values[start_index], base,
|
||||
num_registers);
|
||||
|
@ -2038,10 +2070,16 @@ void D3D12CommandProcessor::WriteRegistersFromMemCommonSense(
|
|||
auto get_end_before_qty = [&end, current_index](uint32_t regnum) {
|
||||
return std::min<uint32_t>(regnum, end) - current_index;
|
||||
};
|
||||
|
||||
#define DO_A_RANGE_CALLBACK(start_range, end_range, index, base, n) \
|
||||
WriteRegisterRangeFromMem_WithKnownBound<(start_range), (end_range)>( \
|
||||
index, base, n)
|
||||
#define REGULAR_WRITE_CALLBACK(s, e, i, b, n) \
|
||||
copy_and_swap_32_unaligned(®ister_file_->values[i], b, n)
|
||||
#define WRITE_FETCH_CONSTANTS_CALLBACK(str, er, ind, b, n) \
|
||||
WriteFetchFromMem(ind, b, n)
|
||||
#define SPECIAL_REG_RANGE_CALLBACK(str, edr, ind, bs, n) \
|
||||
WritePossiblySpecialRegistersFromMem(ind, bs, n)
|
||||
#define WRITE_SHADER_CONSTANTS_CALLBACK(start_range, end_range, index, base, \
|
||||
n) \
|
||||
WriteShaderConstantsFromMem(index, base, n)
|
||||
#define WRITE_BOOL_LOOP_CALLBACK(s, e, i, b, n) WriteBoolLoopFromMem(i, b, n)
|
||||
|
||||
#define DO_A_RANGE(start_range, end_range, cb) \
|
||||
if (current_index < (end_range)) { \
|
||||
|
@ -2054,145 +2092,108 @@ void D3D12CommandProcessor::WriteRegistersFromMemCommonSense(
|
|||
return; \
|
||||
}
|
||||
|
||||
if (start_index >= XE_GPU_REG_SHADER_CONSTANT_000_X) { // fairly common
|
||||
goto shader_vars_start;
|
||||
}
|
||||
|
||||
#define REGULAR_WRITE_CALLBACK(s, e, i, b, n) \
|
||||
copy_and_swap_32_unaligned(®ister_file_->values[i], b, n)
|
||||
DO_A_RANGE(0, XE_GPU_REG_SCRATCH_REG0, REGULAR_WRITE_CALLBACK);
|
||||
|
||||
DO_A_RANGE(XE_GPU_REG_SCRATCH_REG0, XE_GPU_REG_DC_LUT_30_COLOR + 1,
|
||||
DO_A_RANGE_CALLBACK);
|
||||
SPECIAL_REG_RANGE_CALLBACK);
|
||||
DO_A_RANGE(XE_GPU_REG_DC_LUT_30_COLOR + 1, XE_GPU_REG_SHADER_CONSTANT_000_X,
|
||||
REGULAR_WRITE_CALLBACK);
|
||||
|
||||
#define WRITE_SHADER_CONSTANTS_CALLBACK(start_range, end_range, index, base, \
|
||||
n) \
|
||||
WriteShaderConstantsFromMem(index, base, n)
|
||||
shader_vars_start:
|
||||
DO_A_RANGE(XE_GPU_REG_SHADER_CONSTANT_000_X,
|
||||
XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0,
|
||||
WRITE_SHADER_CONSTANTS_CALLBACK);
|
||||
|
||||
#define WRITE_FETCH_CONSTANTS_CALLBACK(str, er, ind, b, n) \
|
||||
WriteFetchFromMem(ind, b, n)
|
||||
DO_A_RANGE(XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0,
|
||||
XE_GPU_REG_SHADER_CONSTANT_FETCH_31_5 + 1,
|
||||
WRITE_FETCH_CONSTANTS_CALLBACK);
|
||||
#define WRITE_BOOL_LOOP_CALLBACK(s, e, i, b, n) WriteBoolLoopFromMem(i, b, n)
|
||||
DO_A_RANGE(XE_GPU_REG_SHADER_CONSTANT_BOOL_000_031,
|
||||
XE_GPU_REG_SHADER_CONSTANT_LOOP_31 + 1, WRITE_BOOL_LOOP_CALLBACK);
|
||||
DO_A_RANGE(XE_GPU_REG_SHADER_CONSTANT_LOOP_31 + 1, 65536,
|
||||
REGULAR_WRITE_CALLBACK);
|
||||
}
|
||||
void D3D12CommandProcessor::WritePossiblySpecialRegistersFromMem(
|
||||
uint32_t start_index, uint32_t* base, uint32_t numregs) {
|
||||
uint32_t end = numregs + start_index;
|
||||
for (uint32_t index = start_index; index < end; ++index, ++base) {
|
||||
uint32_t value = xe::load_and_swap<uint32_t>(base);
|
||||
|
||||
register_file_->values[index].u32 = value;
|
||||
|
||||
unsigned expr = 0;
|
||||
|
||||
expr |= (index - XE_GPU_REG_SCRATCH_REG0 < 8);
|
||||
|
||||
expr |= (index == XE_GPU_REG_COHER_STATUS_HOST);
|
||||
|
||||
expr |= ((index - XE_GPU_REG_DC_LUT_RW_INDEX) <=
|
||||
(XE_GPU_REG_DC_LUT_30_COLOR - XE_GPU_REG_DC_LUT_RW_INDEX));
|
||||
|
||||
if (expr == 0) {
|
||||
} else {
|
||||
HandleSpecialRegisterWrite(index, value);
|
||||
}
|
||||
}
|
||||
}
|
||||
template <uint32_t register_lower_bound, uint32_t register_upper_bound>
|
||||
XE_FORCEINLINE void
|
||||
D3D12CommandProcessor::WriteRegisterRangeFromMem_WithKnownBound(
|
||||
uint32_t base, uint32_t* range, uint32_t num_registers) {
|
||||
constexpr auto bounds_has_reg =
|
||||
bounds_may_have_reg<register_lower_bound, register_upper_bound>;
|
||||
constexpr auto bounds_has_bounds =
|
||||
bounds_may_have_bounds<register_lower_bound, register_upper_bound>;
|
||||
uint32_t start_index, uint32_t* base, uint32_t num_registers) {
|
||||
uint32_t end = start_index + num_registers;
|
||||
|
||||
bool cbuffer_pixel_uptodate = cbuffer_binding_float_pixel_.up_to_date;
|
||||
bool cbuffer_vertex_uptodate = cbuffer_binding_float_vertex_.up_to_date;
|
||||
uint32_t current_index = start_index;
|
||||
|
||||
bool skip_uptodate_checks =
|
||||
(!cbuffer_pixel_uptodate && !cbuffer_vertex_uptodate) || (!frame_open_);
|
||||
for (uint32_t i = 0; i < num_registers; ++i) {
|
||||
uint32_t data = xe::load_and_swap<uint32_t>(range + i);
|
||||
uint32_t index = base + i;
|
||||
uint32_t value = data;
|
||||
// cant if constexpr this one or we get unreferenced label errors, and if we
|
||||
// move the label into the else we get errors about a jump from one if
|
||||
// constexpr into another
|
||||
if (register_lower_bound == 0 && register_upper_bound == 0xFFFF) {
|
||||
D3D12CommandProcessor::WriteRegisterForceinline(index, value);
|
||||
} else {
|
||||
XE_MSVC_ASSUME(index >= register_lower_bound &&
|
||||
index < register_upper_bound);
|
||||
register_file_->values[index].u32 = value;
|
||||
auto get_end_before_qty = [&end, current_index](uint32_t regnum) {
|
||||
return std::min<uint32_t>(regnum, end) - current_index;
|
||||
};
|
||||
#define REGULAR_WRITE_CALLBACK(s, e, i, b, n) \
|
||||
copy_and_swap_32_unaligned(®ister_file_->values[i], b, n)
|
||||
#define WRITE_FETCH_CONSTANTS_CALLBACK(str, er, ind, b, n) \
|
||||
WriteFetchFromMem(ind, b, n)
|
||||
#define SPECIAL_REG_RANGE_CALLBACK(str, edr, ind, bs, n) \
|
||||
WritePossiblySpecialRegistersFromMem(ind, bs, n)
|
||||
#define WRITE_SHADER_CONSTANTS_CALLBACK(start_range, end_range, index, base, \
|
||||
n) \
|
||||
WriteShaderConstantsFromMem(index, base, n)
|
||||
#define WRITE_BOOL_LOOP_CALLBACK(s, e, i, b, n) WriteBoolLoopFromMem(i, b, n)
|
||||
|
||||
unsigned expr = 0;
|
||||
|
||||
if constexpr (bounds_has_bounds(XE_GPU_REG_SCRATCH_REG0,
|
||||
XE_GPU_REG_SCRATCH_REG7)) {
|
||||
expr |= (index - XE_GPU_REG_SCRATCH_REG0 < 8);
|
||||
}
|
||||
if constexpr (bounds_has_reg(XE_GPU_REG_COHER_STATUS_HOST)) {
|
||||
expr |= (index == XE_GPU_REG_COHER_STATUS_HOST);
|
||||
}
|
||||
if constexpr (bounds_has_bounds(XE_GPU_REG_DC_LUT_RW_INDEX,
|
||||
XE_GPU_REG_DC_LUT_30_COLOR)) {
|
||||
expr |= ((index - XE_GPU_REG_DC_LUT_RW_INDEX) <=
|
||||
(XE_GPU_REG_DC_LUT_30_COLOR - XE_GPU_REG_DC_LUT_RW_INDEX));
|
||||
}
|
||||
// chrispy: reordered for msvc branch probability (assumes
|
||||
// if is taken and else is not)
|
||||
if (XE_LIKELY(expr == 0)) {
|
||||
XE_MSVC_REORDER_BARRIER();
|
||||
|
||||
} else {
|
||||
HandleSpecialRegisterWrite(index, value);
|
||||
goto write_done;
|
||||
}
|
||||
XE_MSVC_ASSUME(index >= register_lower_bound &&
|
||||
index < register_upper_bound);
|
||||
if constexpr (bounds_has_bounds(XE_GPU_REG_SHADER_CONSTANT_000_X,
|
||||
XE_GPU_REG_SHADER_CONSTANT_511_W)) {
|
||||
if (index >= XE_GPU_REG_SHADER_CONSTANT_000_X &&
|
||||
index <= XE_GPU_REG_SHADER_CONSTANT_511_W) {
|
||||
if (!skip_uptodate_checks) {
|
||||
uint32_t float_constant_index =
|
||||
(index - XE_GPU_REG_SHADER_CONSTANT_000_X) >> 2;
|
||||
if (float_constant_index >= 256) {
|
||||
float_constant_index -= 256;
|
||||
if (current_float_constant_map_pixel_[float_constant_index >> 6] &
|
||||
(1ull << (float_constant_index & 63))) {
|
||||
cbuffer_binding_float_pixel_.up_to_date = false;
|
||||
}
|
||||
} else {
|
||||
if (current_float_constant_map_vertex_[float_constant_index >>
|
||||
6] &
|
||||
(1ull << (float_constant_index & 63))) {
|
||||
cbuffer_binding_float_vertex_.up_to_date = false;
|
||||
if (!cbuffer_pixel_uptodate) {
|
||||
skip_uptodate_checks = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
goto write_done;
|
||||
}
|
||||
}
|
||||
XE_MSVC_ASSUME(index >= register_lower_bound &&
|
||||
index < register_upper_bound);
|
||||
if constexpr (bounds_has_bounds(XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0,
|
||||
XE_GPU_REG_SHADER_CONSTANT_FETCH_31_5)) {
|
||||
if (index >= XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0 &&
|
||||
index <= XE_GPU_REG_SHADER_CONSTANT_FETCH_31_5) {
|
||||
cbuffer_binding_fetch_.up_to_date = false;
|
||||
// texture cache is never nullptr
|
||||
texture_cache_->TextureFetchConstantWritten(
|
||||
(index - XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0) / 6);
|
||||
|
||||
goto write_done;
|
||||
}
|
||||
}
|
||||
|
||||
XE_MSVC_ASSUME(index >= register_lower_bound &&
|
||||
index < register_upper_bound);
|
||||
if constexpr (bounds_has_bounds(XE_GPU_REG_SHADER_CONSTANT_BOOL_000_031,
|
||||
XE_GPU_REG_SHADER_CONSTANT_LOOP_31)) {
|
||||
if (index >= XE_GPU_REG_SHADER_CONSTANT_BOOL_000_031 &&
|
||||
index <= XE_GPU_REG_SHADER_CONSTANT_LOOP_31) {
|
||||
cbuffer_binding_bool_loop_.up_to_date = false;
|
||||
goto write_done;
|
||||
}
|
||||
}
|
||||
}
|
||||
write_done:;
|
||||
#define DO_A_RANGE(start_range, end_range, cb) \
|
||||
if (current_index < (end_range)) { \
|
||||
uint32_t ntowrite = get_end_before_qty(end_range); \
|
||||
cb((start_range), (end_range), current_index, base, ntowrite); \
|
||||
current_index += ntowrite; \
|
||||
base += ntowrite; \
|
||||
} \
|
||||
if (current_index >= end) { \
|
||||
return; \
|
||||
}
|
||||
|
||||
#define REFRESH_MSVC_RANGE() \
|
||||
XE_MSVC_ASSUME(current_index >= register_lower_bound && \
|
||||
current_index < register_upper_bound)
|
||||
|
||||
REFRESH_MSVC_RANGE();
|
||||
DO_A_RANGE(0, XE_GPU_REG_SCRATCH_REG0, REGULAR_WRITE_CALLBACK);
|
||||
REFRESH_MSVC_RANGE();
|
||||
DO_A_RANGE(XE_GPU_REG_SCRATCH_REG0, XE_GPU_REG_DC_LUT_30_COLOR + 1,
|
||||
SPECIAL_REG_RANGE_CALLBACK);
|
||||
REFRESH_MSVC_RANGE();
|
||||
DO_A_RANGE(XE_GPU_REG_DC_LUT_30_COLOR + 1, XE_GPU_REG_SHADER_CONSTANT_000_X,
|
||||
REGULAR_WRITE_CALLBACK);
|
||||
REFRESH_MSVC_RANGE();
|
||||
DO_A_RANGE(XE_GPU_REG_SHADER_CONSTANT_000_X,
|
||||
XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0,
|
||||
WRITE_SHADER_CONSTANTS_CALLBACK);
|
||||
REFRESH_MSVC_RANGE();
|
||||
DO_A_RANGE(XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0,
|
||||
XE_GPU_REG_SHADER_CONSTANT_FETCH_31_5 + 1,
|
||||
WRITE_FETCH_CONSTANTS_CALLBACK);
|
||||
REFRESH_MSVC_RANGE();
|
||||
DO_A_RANGE(XE_GPU_REG_SHADER_CONSTANT_BOOL_000_031,
|
||||
XE_GPU_REG_SHADER_CONSTANT_LOOP_31 + 1, WRITE_BOOL_LOOP_CALLBACK);
|
||||
REFRESH_MSVC_RANGE();
|
||||
DO_A_RANGE(XE_GPU_REG_SHADER_CONSTANT_LOOP_31 + 1, 65536,
|
||||
REGULAR_WRITE_CALLBACK);
|
||||
|
||||
}
|
||||
template <uint32_t register_lower_bound, uint32_t register_upper_bound>
|
||||
XE_FORCEINLINE void
|
||||
|
|
|
@ -226,6 +226,9 @@ class D3D12CommandProcessor final : public CommandProcessor {
|
|||
uint32_t num_registers);
|
||||
void WriteRegistersFromMemCommonSense(uint32_t start_index, uint32_t* base,
|
||||
uint32_t num_registers) ;
|
||||
|
||||
void WritePossiblySpecialRegistersFromMem(uint32_t start_index, uint32_t* base,
|
||||
uint32_t num_registers);
|
||||
template <uint32_t register_lower_bound, uint32_t register_upper_bound>
|
||||
XE_FORCEINLINE void WriteRegisterRangeFromMem_WithKnownBound(
|
||||
uint32_t start_index, uint32_t* base, uint32_t num_registers);
|
||||
|
|
Loading…
Reference in New Issue