add avx2 codepath for copy_and_swap_32_unaligned
use the new writerange approach in WriteRegisterRangeFromMem_WithKnownBound
This commit is contained in:
parent
82dcf3f951
commit
ab6d9dade0
|
@ -309,16 +309,46 @@ void copy_and_swap_32_unaligned(void* dest_ptr, const void* src_ptr,
|
||||||
size_t count) {
|
size_t count) {
|
||||||
auto dest = reinterpret_cast<uint32_t*>(dest_ptr);
|
auto dest = reinterpret_cast<uint32_t*>(dest_ptr);
|
||||||
auto src = reinterpret_cast<const uint32_t*>(src_ptr);
|
auto src = reinterpret_cast<const uint32_t*>(src_ptr);
|
||||||
|
size_t i;
|
||||||
|
// chrispy: this optimization mightt backfire if our unaligned load spans two
|
||||||
|
// cachelines... which it probably will
|
||||||
|
if (amd64::GetFeatureFlags() & amd64::kX64EmitAVX2) {
|
||||||
|
__m256i shufmask = _mm256_set_epi8(
|
||||||
|
0x0C, 0x0D, 0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B, 0x04, 0x05, 0x06, 0x07,
|
||||||
|
0x00, 0x01, 0x02, 0x03, 0x0C, 0x0D, 0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B,
|
||||||
|
0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03);
|
||||||
|
// with vpshufb being a 0.5 through instruction, it makes the most sense to
|
||||||
|
// double up on our iters
|
||||||
|
for (i = 0; i + 16 <= count; i += 16) {
|
||||||
|
__m256i input1 =
|
||||||
|
_mm256_loadu_si256(reinterpret_cast<const __m256i*>(&src[i]));
|
||||||
|
__m256i input2 =
|
||||||
|
_mm256_loadu_si256(reinterpret_cast<const __m256i*>(&src[i + 8]));
|
||||||
|
|
||||||
|
__m256i output1 = _mm256_shuffle_epi8(input1, shufmask);
|
||||||
|
__m256i output2 = _mm256_shuffle_epi8(input2, shufmask);
|
||||||
|
|
||||||
|
_mm256_storeu_si256(reinterpret_cast<__m256i*>(&dest[i]), output1);
|
||||||
|
_mm256_storeu_si256(reinterpret_cast<__m256i*>(&dest[i + 8]), output2);
|
||||||
|
}
|
||||||
|
for (; i + 8 <= count; i += 8) {
|
||||||
|
__m256i input =
|
||||||
|
_mm256_loadu_si256(reinterpret_cast<const __m256i*>(&src[i]));
|
||||||
|
__m256i output = _mm256_shuffle_epi8(input, shufmask);
|
||||||
|
_mm256_storeu_si256(reinterpret_cast<__m256i*>(&dest[i]), output);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
__m128i shufmask =
|
__m128i shufmask =
|
||||||
_mm_set_epi8(0x0C, 0x0D, 0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B, 0x04, 0x05,
|
_mm_set_epi8(0x0C, 0x0D, 0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B, 0x04, 0x05,
|
||||||
0x06, 0x07, 0x00, 0x01, 0x02, 0x03);
|
0x06, 0x07, 0x00, 0x01, 0x02, 0x03);
|
||||||
|
|
||||||
size_t i;
|
|
||||||
for (i = 0; i + 4 <= count; i += 4) {
|
for (i = 0; i + 4 <= count; i += 4) {
|
||||||
__m128i input = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&src[i]));
|
__m128i input =
|
||||||
|
_mm_loadu_si128(reinterpret_cast<const __m128i*>(&src[i]));
|
||||||
__m128i output = _mm_shuffle_epi8(input, shufmask);
|
__m128i output = _mm_shuffle_epi8(input, shufmask);
|
||||||
_mm_storeu_si128(reinterpret_cast<__m128i*>(&dest[i]), output);
|
_mm_storeu_si128(reinterpret_cast<__m128i*>(&dest[i]), output);
|
||||||
}
|
}
|
||||||
|
}
|
||||||
XE_WORKAROUND_CONSTANT_RETURN_IF(count % 4 == 0);
|
XE_WORKAROUND_CONSTANT_RETURN_IF(count % 4 == 0);
|
||||||
for (; i < count; ++i) { // handle residual elements
|
for (; i < count; ++i) { // handle residual elements
|
||||||
dest[i] = byte_swap(src[i]);
|
dest[i] = byte_swap(src[i]);
|
||||||
|
|
|
@ -1903,11 +1903,11 @@ void D3D12CommandProcessor::WriteRegisterRangeFromRing_WraparoundCase(
|
||||||
uint32_t num_regs_firstrange =
|
uint32_t num_regs_firstrange =
|
||||||
static_cast<uint32_t>(range.first_length / sizeof(uint32_t));
|
static_cast<uint32_t>(range.first_length / sizeof(uint32_t));
|
||||||
|
|
||||||
D3D12CommandProcessor::WriteRegistersFromMem(
|
D3D12CommandProcessor::WriteRegistersFromMemCommonSense(
|
||||||
base, reinterpret_cast<uint32_t*>(const_cast<uint8_t*>(range.first)),
|
base, reinterpret_cast<uint32_t*>(const_cast<uint8_t*>(range.first)),
|
||||||
num_regs_firstrange);
|
num_regs_firstrange);
|
||||||
|
|
||||||
D3D12CommandProcessor::WriteRegistersFromMem(
|
D3D12CommandProcessor::WriteRegistersFromMemCommonSense(
|
||||||
base + num_regs_firstrange,
|
base + num_regs_firstrange,
|
||||||
reinterpret_cast<uint32_t*>(const_cast<uint8_t*>(range.second)),
|
reinterpret_cast<uint32_t*>(const_cast<uint8_t*>(range.second)),
|
||||||
num_registers - num_regs_firstrange);
|
num_registers - num_regs_firstrange);
|
||||||
|
@ -1948,6 +1948,7 @@ constexpr bool bounds_may_have_bounds(uint32_t reg, uint32_t last_reg) {
|
||||||
}
|
}
|
||||||
void D3D12CommandProcessor::WriteShaderConstantsFromMem(
|
void D3D12CommandProcessor::WriteShaderConstantsFromMem(
|
||||||
uint32_t start_index, uint32_t* base, uint32_t num_registers) {
|
uint32_t start_index, uint32_t* base, uint32_t num_registers) {
|
||||||
|
#if 1
|
||||||
if (frame_open_) {
|
if (frame_open_) {
|
||||||
bool cbuffer_pixel_uptodate = cbuffer_binding_float_pixel_.up_to_date;
|
bool cbuffer_pixel_uptodate = cbuffer_binding_float_pixel_.up_to_date;
|
||||||
bool cbuffer_vertex_uptodate = cbuffer_binding_float_vertex_.up_to_date;
|
bool cbuffer_vertex_uptodate = cbuffer_binding_float_vertex_.up_to_date;
|
||||||
|
@ -1964,12 +1965,36 @@ void D3D12CommandProcessor::WriteShaderConstantsFromMem(
|
||||||
uint32_t map_index = (start_index - XE_GPU_REG_SHADER_CONSTANT_000_X) / 4;
|
uint32_t map_index = (start_index - XE_GPU_REG_SHADER_CONSTANT_000_X) / 4;
|
||||||
uint32_t end_map_index =
|
uint32_t end_map_index =
|
||||||
(start_index + num_registers - XE_GPU_REG_SHADER_CONSTANT_000_X) / 4;
|
(start_index + num_registers - XE_GPU_REG_SHADER_CONSTANT_000_X) / 4;
|
||||||
|
|
||||||
|
if (map_index < 256 && cbuffer_vertex_uptodate) {
|
||||||
|
for (; map_index < end_map_index; ++map_index) {
|
||||||
|
if (current_float_constant_map_vertex_[map_index >> 6] &
|
||||||
|
(1ull << map_index)) {
|
||||||
|
cbuffer_vertex_uptodate = false;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (end_map_index > 256 && cbuffer_pixel_uptodate) {
|
||||||
|
for (; map_index < end_map_index; ++map_index) {
|
||||||
|
uint32_t float_constant_index = map_index;
|
||||||
|
float_constant_index -= 256;
|
||||||
|
if (current_float_constant_map_pixel_[float_constant_index >> 6] &
|
||||||
|
(1ull << float_constant_index)) {
|
||||||
|
cbuffer_pixel_uptodate = false;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#if 0
|
||||||
if (!cbuffer_vertex_uptodate) {
|
if (!cbuffer_vertex_uptodate) {
|
||||||
if (256 >= end_map_index) {
|
if (256 >= end_map_index) {
|
||||||
goto skip_map_checks;
|
goto skip_map_checks;
|
||||||
}
|
}
|
||||||
map_index = 256;
|
map_index = 256;
|
||||||
}
|
}
|
||||||
|
|
||||||
for (; map_index < end_map_index; ++map_index) {
|
for (; map_index < end_map_index; ++map_index) {
|
||||||
uint32_t float_constant_index = map_index;
|
uint32_t float_constant_index = map_index;
|
||||||
if (float_constant_index >= 256) {
|
if (float_constant_index >= 256) {
|
||||||
|
@ -1996,10 +2021,17 @@ void D3D12CommandProcessor::WriteShaderConstantsFromMem(
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
skip_map_checks:;
|
skip_map_checks:;
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
cbuffer_binding_float_pixel_.up_to_date = cbuffer_pixel_uptodate;
|
cbuffer_binding_float_pixel_.up_to_date = cbuffer_pixel_uptodate;
|
||||||
cbuffer_binding_float_vertex_.up_to_date = cbuffer_vertex_uptodate;
|
cbuffer_binding_float_vertex_.up_to_date = cbuffer_vertex_uptodate;
|
||||||
}
|
}
|
||||||
|
#else
|
||||||
|
if (frame_open_) {
|
||||||
|
cbuffer_binding_float_pixel_.up_to_date = false;
|
||||||
|
cbuffer_binding_float_vertex_.up_to_date = false;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
// maybe use non-temporal copy if possible...
|
// maybe use non-temporal copy if possible...
|
||||||
copy_and_swap_32_unaligned(®ister_file_->values[start_index], base,
|
copy_and_swap_32_unaligned(®ister_file_->values[start_index], base,
|
||||||
num_registers);
|
num_registers);
|
||||||
|
@ -2038,10 +2070,16 @@ void D3D12CommandProcessor::WriteRegistersFromMemCommonSense(
|
||||||
auto get_end_before_qty = [&end, current_index](uint32_t regnum) {
|
auto get_end_before_qty = [&end, current_index](uint32_t regnum) {
|
||||||
return std::min<uint32_t>(regnum, end) - current_index;
|
return std::min<uint32_t>(regnum, end) - current_index;
|
||||||
};
|
};
|
||||||
|
#define REGULAR_WRITE_CALLBACK(s, e, i, b, n) \
|
||||||
#define DO_A_RANGE_CALLBACK(start_range, end_range, index, base, n) \
|
copy_and_swap_32_unaligned(®ister_file_->values[i], b, n)
|
||||||
WriteRegisterRangeFromMem_WithKnownBound<(start_range), (end_range)>( \
|
#define WRITE_FETCH_CONSTANTS_CALLBACK(str, er, ind, b, n) \
|
||||||
index, base, n)
|
WriteFetchFromMem(ind, b, n)
|
||||||
|
#define SPECIAL_REG_RANGE_CALLBACK(str, edr, ind, bs, n) \
|
||||||
|
WritePossiblySpecialRegistersFromMem(ind, bs, n)
|
||||||
|
#define WRITE_SHADER_CONSTANTS_CALLBACK(start_range, end_range, index, base, \
|
||||||
|
n) \
|
||||||
|
WriteShaderConstantsFromMem(index, base, n)
|
||||||
|
#define WRITE_BOOL_LOOP_CALLBACK(s, e, i, b, n) WriteBoolLoopFromMem(i, b, n)
|
||||||
|
|
||||||
#define DO_A_RANGE(start_range, end_range, cb) \
|
#define DO_A_RANGE(start_range, end_range, cb) \
|
||||||
if (current_index < (end_range)) { \
|
if (current_index < (end_range)) { \
|
||||||
|
@ -2054,145 +2092,108 @@ void D3D12CommandProcessor::WriteRegistersFromMemCommonSense(
|
||||||
return; \
|
return; \
|
||||||
}
|
}
|
||||||
|
|
||||||
if (start_index >= XE_GPU_REG_SHADER_CONSTANT_000_X) { // fairly common
|
|
||||||
goto shader_vars_start;
|
|
||||||
}
|
|
||||||
|
|
||||||
#define REGULAR_WRITE_CALLBACK(s, e, i, b, n) \
|
|
||||||
copy_and_swap_32_unaligned(®ister_file_->values[i], b, n)
|
|
||||||
DO_A_RANGE(0, XE_GPU_REG_SCRATCH_REG0, REGULAR_WRITE_CALLBACK);
|
DO_A_RANGE(0, XE_GPU_REG_SCRATCH_REG0, REGULAR_WRITE_CALLBACK);
|
||||||
|
|
||||||
DO_A_RANGE(XE_GPU_REG_SCRATCH_REG0, XE_GPU_REG_DC_LUT_30_COLOR + 1,
|
DO_A_RANGE(XE_GPU_REG_SCRATCH_REG0, XE_GPU_REG_DC_LUT_30_COLOR + 1,
|
||||||
DO_A_RANGE_CALLBACK);
|
SPECIAL_REG_RANGE_CALLBACK);
|
||||||
DO_A_RANGE(XE_GPU_REG_DC_LUT_30_COLOR + 1, XE_GPU_REG_SHADER_CONSTANT_000_X,
|
DO_A_RANGE(XE_GPU_REG_DC_LUT_30_COLOR + 1, XE_GPU_REG_SHADER_CONSTANT_000_X,
|
||||||
REGULAR_WRITE_CALLBACK);
|
REGULAR_WRITE_CALLBACK);
|
||||||
|
|
||||||
#define WRITE_SHADER_CONSTANTS_CALLBACK(start_range, end_range, index, base, \
|
|
||||||
n) \
|
|
||||||
WriteShaderConstantsFromMem(index, base, n)
|
|
||||||
shader_vars_start:
|
|
||||||
DO_A_RANGE(XE_GPU_REG_SHADER_CONSTANT_000_X,
|
DO_A_RANGE(XE_GPU_REG_SHADER_CONSTANT_000_X,
|
||||||
XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0,
|
XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0,
|
||||||
WRITE_SHADER_CONSTANTS_CALLBACK);
|
WRITE_SHADER_CONSTANTS_CALLBACK);
|
||||||
|
|
||||||
#define WRITE_FETCH_CONSTANTS_CALLBACK(str, er, ind, b, n) \
|
|
||||||
WriteFetchFromMem(ind, b, n)
|
|
||||||
DO_A_RANGE(XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0,
|
DO_A_RANGE(XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0,
|
||||||
XE_GPU_REG_SHADER_CONSTANT_FETCH_31_5 + 1,
|
XE_GPU_REG_SHADER_CONSTANT_FETCH_31_5 + 1,
|
||||||
WRITE_FETCH_CONSTANTS_CALLBACK);
|
WRITE_FETCH_CONSTANTS_CALLBACK);
|
||||||
#define WRITE_BOOL_LOOP_CALLBACK(s, e, i, b, n) WriteBoolLoopFromMem(i, b, n)
|
|
||||||
DO_A_RANGE(XE_GPU_REG_SHADER_CONSTANT_BOOL_000_031,
|
DO_A_RANGE(XE_GPU_REG_SHADER_CONSTANT_BOOL_000_031,
|
||||||
XE_GPU_REG_SHADER_CONSTANT_LOOP_31 + 1, WRITE_BOOL_LOOP_CALLBACK);
|
XE_GPU_REG_SHADER_CONSTANT_LOOP_31 + 1, WRITE_BOOL_LOOP_CALLBACK);
|
||||||
DO_A_RANGE(XE_GPU_REG_SHADER_CONSTANT_LOOP_31 + 1, 65536,
|
DO_A_RANGE(XE_GPU_REG_SHADER_CONSTANT_LOOP_31 + 1, 65536,
|
||||||
REGULAR_WRITE_CALLBACK);
|
REGULAR_WRITE_CALLBACK);
|
||||||
}
|
}
|
||||||
template <uint32_t register_lower_bound, uint32_t register_upper_bound>
|
void D3D12CommandProcessor::WritePossiblySpecialRegistersFromMem(
|
||||||
XE_FORCEINLINE void
|
uint32_t start_index, uint32_t* base, uint32_t numregs) {
|
||||||
D3D12CommandProcessor::WriteRegisterRangeFromMem_WithKnownBound(
|
uint32_t end = numregs + start_index;
|
||||||
uint32_t base, uint32_t* range, uint32_t num_registers) {
|
for (uint32_t index = start_index; index < end; ++index, ++base) {
|
||||||
constexpr auto bounds_has_reg =
|
uint32_t value = xe::load_and_swap<uint32_t>(base);
|
||||||
bounds_may_have_reg<register_lower_bound, register_upper_bound>;
|
|
||||||
constexpr auto bounds_has_bounds =
|
|
||||||
bounds_may_have_bounds<register_lower_bound, register_upper_bound>;
|
|
||||||
|
|
||||||
bool cbuffer_pixel_uptodate = cbuffer_binding_float_pixel_.up_to_date;
|
|
||||||
bool cbuffer_vertex_uptodate = cbuffer_binding_float_vertex_.up_to_date;
|
|
||||||
|
|
||||||
bool skip_uptodate_checks =
|
|
||||||
(!cbuffer_pixel_uptodate && !cbuffer_vertex_uptodate) || (!frame_open_);
|
|
||||||
for (uint32_t i = 0; i < num_registers; ++i) {
|
|
||||||
uint32_t data = xe::load_and_swap<uint32_t>(range + i);
|
|
||||||
uint32_t index = base + i;
|
|
||||||
uint32_t value = data;
|
|
||||||
// cant if constexpr this one or we get unreferenced label errors, and if we
|
|
||||||
// move the label into the else we get errors about a jump from one if
|
|
||||||
// constexpr into another
|
|
||||||
if (register_lower_bound == 0 && register_upper_bound == 0xFFFF) {
|
|
||||||
D3D12CommandProcessor::WriteRegisterForceinline(index, value);
|
|
||||||
} else {
|
|
||||||
XE_MSVC_ASSUME(index >= register_lower_bound &&
|
|
||||||
index < register_upper_bound);
|
|
||||||
register_file_->values[index].u32 = value;
|
register_file_->values[index].u32 = value;
|
||||||
|
|
||||||
unsigned expr = 0;
|
unsigned expr = 0;
|
||||||
|
|
||||||
if constexpr (bounds_has_bounds(XE_GPU_REG_SCRATCH_REG0,
|
|
||||||
XE_GPU_REG_SCRATCH_REG7)) {
|
|
||||||
expr |= (index - XE_GPU_REG_SCRATCH_REG0 < 8);
|
expr |= (index - XE_GPU_REG_SCRATCH_REG0 < 8);
|
||||||
}
|
|
||||||
if constexpr (bounds_has_reg(XE_GPU_REG_COHER_STATUS_HOST)) {
|
|
||||||
expr |= (index == XE_GPU_REG_COHER_STATUS_HOST);
|
expr |= (index == XE_GPU_REG_COHER_STATUS_HOST);
|
||||||
}
|
|
||||||
if constexpr (bounds_has_bounds(XE_GPU_REG_DC_LUT_RW_INDEX,
|
|
||||||
XE_GPU_REG_DC_LUT_30_COLOR)) {
|
|
||||||
expr |= ((index - XE_GPU_REG_DC_LUT_RW_INDEX) <=
|
expr |= ((index - XE_GPU_REG_DC_LUT_RW_INDEX) <=
|
||||||
(XE_GPU_REG_DC_LUT_30_COLOR - XE_GPU_REG_DC_LUT_RW_INDEX));
|
(XE_GPU_REG_DC_LUT_30_COLOR - XE_GPU_REG_DC_LUT_RW_INDEX));
|
||||||
}
|
|
||||||
// chrispy: reordered for msvc branch probability (assumes
|
|
||||||
// if is taken and else is not)
|
|
||||||
if (XE_LIKELY(expr == 0)) {
|
|
||||||
XE_MSVC_REORDER_BARRIER();
|
|
||||||
|
|
||||||
|
if (expr == 0) {
|
||||||
} else {
|
} else {
|
||||||
HandleSpecialRegisterWrite(index, value);
|
HandleSpecialRegisterWrite(index, value);
|
||||||
goto write_done;
|
|
||||||
}
|
|
||||||
XE_MSVC_ASSUME(index >= register_lower_bound &&
|
|
||||||
index < register_upper_bound);
|
|
||||||
if constexpr (bounds_has_bounds(XE_GPU_REG_SHADER_CONSTANT_000_X,
|
|
||||||
XE_GPU_REG_SHADER_CONSTANT_511_W)) {
|
|
||||||
if (index >= XE_GPU_REG_SHADER_CONSTANT_000_X &&
|
|
||||||
index <= XE_GPU_REG_SHADER_CONSTANT_511_W) {
|
|
||||||
if (!skip_uptodate_checks) {
|
|
||||||
uint32_t float_constant_index =
|
|
||||||
(index - XE_GPU_REG_SHADER_CONSTANT_000_X) >> 2;
|
|
||||||
if (float_constant_index >= 256) {
|
|
||||||
float_constant_index -= 256;
|
|
||||||
if (current_float_constant_map_pixel_[float_constant_index >> 6] &
|
|
||||||
(1ull << (float_constant_index & 63))) {
|
|
||||||
cbuffer_binding_float_pixel_.up_to_date = false;
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
if (current_float_constant_map_vertex_[float_constant_index >>
|
|
||||||
6] &
|
|
||||||
(1ull << (float_constant_index & 63))) {
|
|
||||||
cbuffer_binding_float_vertex_.up_to_date = false;
|
|
||||||
if (!cbuffer_pixel_uptodate) {
|
|
||||||
skip_uptodate_checks = true;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
template <uint32_t register_lower_bound, uint32_t register_upper_bound>
|
||||||
goto write_done;
|
XE_FORCEINLINE void
|
||||||
}
|
D3D12CommandProcessor::WriteRegisterRangeFromMem_WithKnownBound(
|
||||||
}
|
uint32_t start_index, uint32_t* base, uint32_t num_registers) {
|
||||||
XE_MSVC_ASSUME(index >= register_lower_bound &&
|
uint32_t end = start_index + num_registers;
|
||||||
index < register_upper_bound);
|
|
||||||
if constexpr (bounds_has_bounds(XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0,
|
|
||||||
XE_GPU_REG_SHADER_CONSTANT_FETCH_31_5)) {
|
|
||||||
if (index >= XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0 &&
|
|
||||||
index <= XE_GPU_REG_SHADER_CONSTANT_FETCH_31_5) {
|
|
||||||
cbuffer_binding_fetch_.up_to_date = false;
|
|
||||||
// texture cache is never nullptr
|
|
||||||
texture_cache_->TextureFetchConstantWritten(
|
|
||||||
(index - XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0) / 6);
|
|
||||||
|
|
||||||
goto write_done;
|
uint32_t current_index = start_index;
|
||||||
}
|
|
||||||
|
auto get_end_before_qty = [&end, current_index](uint32_t regnum) {
|
||||||
|
return std::min<uint32_t>(regnum, end) - current_index;
|
||||||
|
};
|
||||||
|
#define REGULAR_WRITE_CALLBACK(s, e, i, b, n) \
|
||||||
|
copy_and_swap_32_unaligned(®ister_file_->values[i], b, n)
|
||||||
|
#define WRITE_FETCH_CONSTANTS_CALLBACK(str, er, ind, b, n) \
|
||||||
|
WriteFetchFromMem(ind, b, n)
|
||||||
|
#define SPECIAL_REG_RANGE_CALLBACK(str, edr, ind, bs, n) \
|
||||||
|
WritePossiblySpecialRegistersFromMem(ind, bs, n)
|
||||||
|
#define WRITE_SHADER_CONSTANTS_CALLBACK(start_range, end_range, index, base, \
|
||||||
|
n) \
|
||||||
|
WriteShaderConstantsFromMem(index, base, n)
|
||||||
|
#define WRITE_BOOL_LOOP_CALLBACK(s, e, i, b, n) WriteBoolLoopFromMem(i, b, n)
|
||||||
|
|
||||||
|
#define DO_A_RANGE(start_range, end_range, cb) \
|
||||||
|
if (current_index < (end_range)) { \
|
||||||
|
uint32_t ntowrite = get_end_before_qty(end_range); \
|
||||||
|
cb((start_range), (end_range), current_index, base, ntowrite); \
|
||||||
|
current_index += ntowrite; \
|
||||||
|
base += ntowrite; \
|
||||||
|
} \
|
||||||
|
if (current_index >= end) { \
|
||||||
|
return; \
|
||||||
}
|
}
|
||||||
|
|
||||||
XE_MSVC_ASSUME(index >= register_lower_bound &&
|
#define REFRESH_MSVC_RANGE() \
|
||||||
index < register_upper_bound);
|
XE_MSVC_ASSUME(current_index >= register_lower_bound && \
|
||||||
if constexpr (bounds_has_bounds(XE_GPU_REG_SHADER_CONSTANT_BOOL_000_031,
|
current_index < register_upper_bound)
|
||||||
XE_GPU_REG_SHADER_CONSTANT_LOOP_31)) {
|
|
||||||
if (index >= XE_GPU_REG_SHADER_CONSTANT_BOOL_000_031 &&
|
REFRESH_MSVC_RANGE();
|
||||||
index <= XE_GPU_REG_SHADER_CONSTANT_LOOP_31) {
|
DO_A_RANGE(0, XE_GPU_REG_SCRATCH_REG0, REGULAR_WRITE_CALLBACK);
|
||||||
cbuffer_binding_bool_loop_.up_to_date = false;
|
REFRESH_MSVC_RANGE();
|
||||||
goto write_done;
|
DO_A_RANGE(XE_GPU_REG_SCRATCH_REG0, XE_GPU_REG_DC_LUT_30_COLOR + 1,
|
||||||
}
|
SPECIAL_REG_RANGE_CALLBACK);
|
||||||
}
|
REFRESH_MSVC_RANGE();
|
||||||
}
|
DO_A_RANGE(XE_GPU_REG_DC_LUT_30_COLOR + 1, XE_GPU_REG_SHADER_CONSTANT_000_X,
|
||||||
write_done:;
|
REGULAR_WRITE_CALLBACK);
|
||||||
}
|
REFRESH_MSVC_RANGE();
|
||||||
|
DO_A_RANGE(XE_GPU_REG_SHADER_CONSTANT_000_X,
|
||||||
|
XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0,
|
||||||
|
WRITE_SHADER_CONSTANTS_CALLBACK);
|
||||||
|
REFRESH_MSVC_RANGE();
|
||||||
|
DO_A_RANGE(XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0,
|
||||||
|
XE_GPU_REG_SHADER_CONSTANT_FETCH_31_5 + 1,
|
||||||
|
WRITE_FETCH_CONSTANTS_CALLBACK);
|
||||||
|
REFRESH_MSVC_RANGE();
|
||||||
|
DO_A_RANGE(XE_GPU_REG_SHADER_CONSTANT_BOOL_000_031,
|
||||||
|
XE_GPU_REG_SHADER_CONSTANT_LOOP_31 + 1, WRITE_BOOL_LOOP_CALLBACK);
|
||||||
|
REFRESH_MSVC_RANGE();
|
||||||
|
DO_A_RANGE(XE_GPU_REG_SHADER_CONSTANT_LOOP_31 + 1, 65536,
|
||||||
|
REGULAR_WRITE_CALLBACK);
|
||||||
|
|
||||||
}
|
}
|
||||||
template <uint32_t register_lower_bound, uint32_t register_upper_bound>
|
template <uint32_t register_lower_bound, uint32_t register_upper_bound>
|
||||||
XE_FORCEINLINE void
|
XE_FORCEINLINE void
|
||||||
|
|
|
@ -226,6 +226,9 @@ class D3D12CommandProcessor final : public CommandProcessor {
|
||||||
uint32_t num_registers);
|
uint32_t num_registers);
|
||||||
void WriteRegistersFromMemCommonSense(uint32_t start_index, uint32_t* base,
|
void WriteRegistersFromMemCommonSense(uint32_t start_index, uint32_t* base,
|
||||||
uint32_t num_registers) ;
|
uint32_t num_registers) ;
|
||||||
|
|
||||||
|
void WritePossiblySpecialRegistersFromMem(uint32_t start_index, uint32_t* base,
|
||||||
|
uint32_t num_registers);
|
||||||
template <uint32_t register_lower_bound, uint32_t register_upper_bound>
|
template <uint32_t register_lower_bound, uint32_t register_upper_bound>
|
||||||
XE_FORCEINLINE void WriteRegisterRangeFromMem_WithKnownBound(
|
XE_FORCEINLINE void WriteRegisterRangeFromMem_WithKnownBound(
|
||||||
uint32_t start_index, uint32_t* base, uint32_t num_registers);
|
uint32_t start_index, uint32_t* base, uint32_t num_registers);
|
||||||
|
|
Loading…
Reference in New Issue