Merge pull request #103 from chrisps/command_processor_optimizations
Command processor optimizations
This commit is contained in: commit b0268ab876
@@ -9,8 +9,8 @@
#include "xenia/base/memory.h"
#include "xenia/base/cvar.h"
#include "xenia/base/platform.h"
#include "xenia/base/logging.h"
#include "xenia/base/platform.h"

#if XE_ARCH_ARM64
#include <arm_neon.h>
@@ -59,7 +59,7 @@ static void XeCopy16384StreamingAVX(CacheLine* XE_RESTRICT to,
  CacheLine* dest4 = to + (NUM_CACHELINES_IN_PAGE * 3);
  CacheLine* src4 = from + (NUM_CACHELINES_IN_PAGE * 3);

  for (uint32_t i = 0; i < num_lines_for_8k; ++i) {
    xe::swcache::CacheLine line0, line1, line2, line3;
@@ -173,7 +173,12 @@ static void vastcpy_impl_movdir64m(CacheLine* XE_RESTRICT physaddr,
    _movdir64b(physaddr + i, rdmapping + i);
  }
}

static void vastcpy_impl_repmovs(CacheLine* XE_RESTRICT physaddr,
                                 CacheLine* XE_RESTRICT rdmapping,
                                 uint32_t written_length) {
  __movsq((unsigned long long*)physaddr, (unsigned long long*)rdmapping,
          written_length / 8);
}
XE_COLD
static void first_vastcpy(CacheLine* XE_RESTRICT physaddr,
                          CacheLine* XE_RESTRICT rdmapping,
@@ -189,6 +194,9 @@ static void first_vastcpy(CacheLine* XE_RESTRICT physaddr,
  if (amd64::GetFeatureFlags() & amd64::kX64EmitMovdir64M) {
    XELOGI("Selecting MOVDIR64M vastcpy.");
    dispatch_to_use = vastcpy_impl_movdir64m;
  } else if (amd64::GetFeatureFlags() & amd64::kX64FastRepMovs) {
    XELOGI("Selecting rep movs vastcpy.");
    dispatch_to_use = vastcpy_impl_repmovs;
  } else {
    XELOGI("Selecting generic AVX vastcpy.");
    dispatch_to_use = vastcpy_impl_avx;
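The hunk above resolves the copy implementation once, on the first call, and stores the result in a function pointer so later calls skip the CPU feature checks entirely. A minimal standalone sketch of that dispatch-on-first-call shape follows; the feature queries, the copy bodies, and the wrapper name are placeholders, not code from this PR.

#include <cstdint>
#include <cstring>

using CopyFn = void (*)(uint8_t* dst, const uint8_t* src, uint32_t len);

// Stand-in implementations; the real ones use MOVDIR64B, rep movs, or AVX.
static void copy_generic(uint8_t* dst, const uint8_t* src, uint32_t len) {
  std::memcpy(dst, src, len);
}
static void copy_fast(uint8_t* dst, const uint8_t* src, uint32_t len) {
  std::memcpy(dst, src, len);
}

static void first_copy(uint8_t* dst, const uint8_t* src, uint32_t len);

// Every call goes through this pointer; it initially targets the resolver.
static CopyFn dispatch_to_use = first_copy;

static void first_copy(uint8_t* dst, const uint8_t* src, uint32_t len) {
  // Hypothetical feature probe; the real code reads amd64::GetFeatureFlags().
  bool has_fast_path = true;
  dispatch_to_use = has_fast_path ? copy_fast : copy_generic;
  // Forward the first call to the implementation that was just selected.
  dispatch_to_use(dst, src, len);
}

void copy_sketch(uint8_t* dst, const uint8_t* src, uint32_t len) {
  dispatch_to_use(dst, src, len);
}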
@@ -301,15 +309,64 @@ void copy_and_swap_32_unaligned(void* dest_ptr, const void* src_ptr,
                                size_t count) {
  auto dest = reinterpret_cast<uint32_t*>(dest_ptr);
  auto src = reinterpret_cast<const uint32_t*>(src_ptr);
  __m128i shufmask =
      _mm_set_epi8(0x0C, 0x0D, 0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B, 0x04, 0x05,
                   0x06, 0x07, 0x00, 0x01, 0x02, 0x03);

  size_t i;
  for (i = 0; i + 4 <= count; i += 4) {
    __m128i input = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&src[i]));
    __m128i output = _mm_shuffle_epi8(input, shufmask);
    _mm_storeu_si128(reinterpret_cast<__m128i*>(&dest[i]), output);
  // chrispy: this optimization might backfire if our unaligned load spans two
  // cachelines... which it probably will
  if (amd64::GetFeatureFlags() & amd64::kX64EmitAVX2) {
    __m256i shufmask = _mm256_set_epi8(
        0x0C, 0x0D, 0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B, 0x04, 0x05, 0x06, 0x07,
        0x00, 0x01, 0x02, 0x03, 0x0C, 0x0D, 0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B,
        0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03);
    // with vpshufb being a 0.5-throughput instruction, it makes the most sense
    // to double up on our iters
    for (i = 0; i + 16 <= count; i += 16) {
      __m256i input1 =
          _mm256_loadu_si256(reinterpret_cast<const __m256i*>(&src[i]));
      __m256i input2 =
          _mm256_loadu_si256(reinterpret_cast<const __m256i*>(&src[i + 8]));

      __m256i output1 = _mm256_shuffle_epi8(input1, shufmask);
      __m256i output2 = _mm256_shuffle_epi8(input2, shufmask);
      // chrispy: todo, benchmark this w/ and w/out these prefetches here on
      // multiple machines; finding a good distance for prefetchw in particular
      // is probably important for when we're writing across 2 cachelines
#if 0
      if (i + 48 <= count) {
        swcache::PrefetchNTA(&src[i + 32]);
        if (amd64::GetFeatureFlags() & amd64::kX64EmitPrefetchW) {
          swcache::PrefetchW(&dest[i + 32]);
        }
      }
#endif
      _mm256_storeu_si256(reinterpret_cast<__m256i*>(&dest[i]), output1);
      _mm256_storeu_si256(reinterpret_cast<__m256i*>(&dest[i + 8]), output2);
    }
    if (i + 8 <= count) {
      __m256i input =
          _mm256_loadu_si256(reinterpret_cast<const __m256i*>(&src[i]));
      __m256i output = _mm256_shuffle_epi8(input, shufmask);
      _mm256_storeu_si256(reinterpret_cast<__m256i*>(&dest[i]), output);
      i += 8;
    }
    if (i + 4 <= count) {
      __m128i input =
          _mm_loadu_si128(reinterpret_cast<const __m128i*>(&src[i]));
      __m128i output =
          _mm_shuffle_epi8(input, _mm256_castsi256_si128(shufmask));
      _mm_storeu_si128(reinterpret_cast<__m128i*>(&dest[i]), output);
      i += 4;
    }
  } else {
    __m128i shufmask =
        _mm_set_epi8(0x0C, 0x0D, 0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B, 0x04, 0x05,
                     0x06, 0x07, 0x00, 0x01, 0x02, 0x03);

    for (i = 0; i + 4 <= count; i += 4) {
      __m128i input =
          _mm_loadu_si128(reinterpret_cast<const __m128i*>(&src[i]));
      __m128i output = _mm_shuffle_epi8(input, shufmask);
      _mm_storeu_si128(reinterpret_cast<__m128i*>(&dest[i]), output);
    }
  }
  XE_WORKAROUND_CONSTANT_RETURN_IF(count % 4 == 0);
  for (; i < count; ++i) {  // handle residual elements
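The shuffle mask in this hunk reverses the four bytes inside each 32-bit lane, so one vpshufb performs four (SSE) or eight (AVX2) endian swaps per instruction. A portable scalar reference of the same transform, for clarity only (it is not part of the PR):

#include <cstddef>
#include <cstdint>
#include <cstring>

// Copy `count` 32-bit words, reversing the byte order of each word
// (big-endian guest data -> little-endian host). Each 16-byte pshufb above
// performs four of these swaps at once.
void copy_and_swap_32_scalar(void* dest_ptr, const void* src_ptr,
                             size_t count) {
  auto dest = reinterpret_cast<uint32_t*>(dest_ptr);
  auto src = reinterpret_cast<const uint32_t*>(src_ptr);
  for (size_t i = 0; i < count; ++i) {
    uint32_t v;
    std::memcpy(&v, &src[i], sizeof(v));  // unaligned-safe load
    v = (v >> 24) | ((v >> 8) & 0x0000FF00u) | ((v << 8) & 0x00FF0000u) |
        (v << 24);
    std::memcpy(&dest[i], &v, sizeof(v));
  }
}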
@@ -60,6 +60,12 @@ namespace xe {
namespace threading {

void EnableAffinityConfiguration() {
  // chrispy: i don't think this is necessary,
  // affinity always seems to be the system mask? research more
  // also, maybe if ignore_thread_affinities is on we should use
  // SetProcessAffinityUpdateMode to allow windows to dynamically update
  // our process' affinity (by default windows cannot change the affinity
  // itself at runtime, user code must do it)
  HANDLE process_handle = GetCurrentProcess();
  DWORD_PTR process_affinity_mask;
  DWORD_PTR system_affinity_mask;
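If the route suggested in the comment above were taken, the call would look roughly like the sketch below. This is only an illustration of the Win32 API the comment names, not code from the PR; the wrapper name and the flag parameter are invented for the example.

#include <windows.h>

// Sketch: let Windows manage the process affinity mask dynamically when the
// user has opted out of manual thread affinities.
void MaybeEnableDynamicAffinity(bool ignore_thread_affinities) {
  if (ignore_thread_affinities) {
    // PROCESS_AFFINITY_ENABLE_AUTO_UPDATE allows the OS to update the process
    // affinity itself (for example when processors come online).
    SetProcessAffinityUpdateMode(GetCurrentProcess(),
                                 PROCESS_AFFINITY_ENABLE_AUTO_UPDATE);
  }
}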
@@ -117,7 +123,7 @@ void set_name(const std::string_view name) {

// checked ntoskrnl, it does not modify delay, so we can place this as a
// constant and avoid creating a stack variable
static const LARGE_INTEGER sleepdelay0_for_maybeyield{{0LL}};
static const LARGE_INTEGER sleepdelay0_for_maybeyield{{~0u, -1}};

void MaybeYield() {
#if 0
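The change swaps the zero delay for {{~0u, -1}}, which fills LowPart/HighPart so that QuadPart becomes -1: the smallest negative (relative) interval in 100 ns units for the NT delay call. A quick standalone check of that packing, relying on the usual MSVC union aliasing and not tied to the Xenia code:

#include <windows.h>
#include <cassert>

int main() {
  // {{~0u, -1}} initializes the {LowPart, HighPart} struct member, which
  // aliases QuadPart == -1: a relative delay of one 100 ns tick.
  static const LARGE_INTEGER delay{{~0u, -1}};
  assert(delay.QuadPart == -1);
  return 0;
}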
@@ -1712,8 +1712,10 @@ void D3D12CommandProcessor::ShutdownContext() {

  CommandProcessor::ShutdownContext();
}
// todo: bit-pack the bools and use bitarith to reduce branches
void D3D12CommandProcessor::WriteRegister(uint32_t index, uint32_t value) {

XE_FORCEINLINE
void D3D12CommandProcessor::WriteRegisterForceinline(uint32_t index,
                                                     uint32_t value) {
  __m128i to_rangecheck = _mm_set1_epi16(static_cast<short>(index));

  __m128i lower_bounds = _mm_setr_epi16(
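WriteRegisterForceinline broadcasts the register index into all eight 16-bit lanes and compares it against packed lower and upper bounds, so several "is the index in [lo, hi)" tests collapse into one vector compare. A standalone sketch of that idea, with made-up bounds, assuming the indices fit in the signed 16-bit range:

#include <emmintrin.h>  // SSE2
#include <cstdint>

// Returns a byte mask with bits set for each range that contains `index`.
// Four example ranges are checked; unused lanes are padded with empty ranges.
uint32_t RangesContaining(uint16_t index) {
  __m128i x = _mm_set1_epi16(static_cast<short>(index));
  // Example ranges [lo, hi); the values are placeholders.
  __m128i lo = _mm_setr_epi16(0x2000, 0x4000, 0x5000, 0x5C00, 0, 0, 0, 0);
  __m128i hi = _mm_setr_epi16(0x2100, 0x4800, 0x5003, 0x5C40, 0, 0, 0, 0);
  // A lane is in range when !(lo > x) && (hi > x).
  __m128i in_range =
      _mm_andnot_si128(_mm_cmpgt_epi16(lo, x), _mm_cmpgt_epi16(hi, x));
  return static_cast<uint32_t>(_mm_movemask_epi8(in_range));
}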
@@ -1783,14 +1785,16 @@ void D3D12CommandProcessor::WriteRegister(uint32_t index, uint32_t value) {
    return;
  }
}
// todo: bit-pack the bools and use bitarith to reduce branches
void D3D12CommandProcessor::WriteRegister(uint32_t index, uint32_t value) {
  WriteRegisterForceinline(index, value);
}

void D3D12CommandProcessor::WriteRegistersFromMem(uint32_t start_index,
                                                  uint32_t* base,
                                                  uint32_t num_registers) {
  for (uint32_t i = 0; i < num_registers; ++i) {
    uint32_t data = xe::load_and_swap<uint32_t>(base + i);
    D3D12CommandProcessor::WriteRegister(start_index + i, data);
  }
  WriteRegisterRangeFromMem_WithKnownBound<0, 0xFFFF>(start_index, base,
                                                      num_registers);
}

void D3D12CommandProcessor::WriteALURangeFromRing(xe::RingBuffer* ring,
@@ -1911,8 +1915,19 @@ void D3D12CommandProcessor::WriteRegisterRangeFromRing_WraparoundCase(
void D3D12CommandProcessor::WriteRegisterRangeFromRing(xe::RingBuffer* ring,
                                                       uint32_t base,
                                                       uint32_t num_registers) {
  WriteRegisterRangeFromRing_WithKnownBound<0, 0xFFFF>(ring, base,
                                                       num_registers);
  RingBuffer::ReadRange range =
      ring->BeginRead(num_registers * sizeof(uint32_t));

  XE_LIKELY_IF(!range.second) {
    WriteRegistersFromMem(
        base, reinterpret_cast<uint32_t*>(const_cast<uint8_t*>(range.first)),
        num_registers);

    ring->EndRead(range);
  }
  else {
    return WriteRegisterRangeFromRing_WraparoundCase(ring, base, num_registers);
  }
}

template <uint32_t register_lower_bound, uint32_t register_upper_bound>
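The common case is that the requested span is contiguous in the ring, so the read returns a single pointer and the registers can be written straight from memory; only a read that wraps past the end of the buffer takes the cold out-of-line path. A generic sketch of that shape, with a simplified placeholder ReadRange rather than the Xenia type:

#include <cstddef>
#include <cstdint>

// Placeholder two-span read result: `second` is non-null only when the
// requested range wraps around the end of the ring buffer.
struct ReadRangeSketch {
  const uint8_t* first;
  size_t first_length;
  const uint8_t* second;
  size_t second_length;
};

template <typename Consume>
void ConsumeFromRing(const ReadRangeSketch& range, Consume&& consume) {
  if (!range.second) {
    // Hot path: one contiguous block, consume it directly.
    consume(range.first, range.first_length);
  } else {
    // Cold path: the range wraps, so consume the two pieces separately.
    consume(range.first, range.first_length);
    consume(range.second, range.second_length);
  }
}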
@@ -1926,101 +1941,167 @@ constexpr bool bounds_may_have_bounds(uint32_t reg, uint32_t last_reg) {
         bounds_may_have_reg<register_lower_bound, register_upper_bound>(
             last_reg);
}
template <uint32_t register_lower_bound, uint32_t register_upper_bound>
XE_FORCEINLINE void
D3D12CommandProcessor::WriteRegisterRangeFromMem_WithKnownBound(
    uint32_t base, uint32_t* range, uint32_t num_registers) {
  constexpr auto bounds_has_reg =
      bounds_may_have_reg<register_lower_bound, register_upper_bound>;
  constexpr auto bounds_has_bounds =
      bounds_may_have_bounds<register_lower_bound, register_upper_bound>;
XE_FORCEINLINE
void D3D12CommandProcessor::WriteShaderConstantsFromMem(
    uint32_t start_index, uint32_t* base, uint32_t num_registers) {
  if (frame_open_) {
    bool cbuffer_pixel_uptodate = cbuffer_binding_float_pixel_.up_to_date;
    bool cbuffer_vertex_uptodate = cbuffer_binding_float_vertex_.up_to_date;
    if (cbuffer_pixel_uptodate || cbuffer_vertex_uptodate) {
      // super naive, could just do some bit magic and interval checking,
      // but we just need this hoisted out of the copy so we do a bulk copy,
      // because it's the actual load/swap/store we're getting murdered by.
      // this precheck followed by copy_and_swap_32_unaligned reduced the cpu
      // usage from packettype0/writeregistersfrommem from 10-11% of cpu time
      // spent on xenia to like 1%
      // chrispy: todo, this can be reduced even further, should be split into
      // two loops and should skip whole words, this could net us even bigger
      // gains
      uint32_t map_index = (start_index - XE_GPU_REG_SHADER_CONSTANT_000_X) / 4;
      uint32_t end_map_index =
          (start_index + num_registers - XE_GPU_REG_SHADER_CONSTANT_000_X) / 4;

  for (uint32_t i = 0; i < num_registers; ++i) {
    uint32_t data = xe::load_and_swap<uint32_t>(range + i);

    {
      uint32_t index = base + i;
      uint32_t value = data;
      XE_MSVC_ASSUME(index >= register_lower_bound &&
                     index < register_upper_bound);
      register_file_->values[index].u32 = value;

      unsigned expr = 0;

      if constexpr (bounds_has_bounds(XE_GPU_REG_SCRATCH_REG0,
                                      XE_GPU_REG_SCRATCH_REG7)) {
        expr |= (index - XE_GPU_REG_SCRATCH_REG0 < 8);
      }
      if constexpr (bounds_has_reg(XE_GPU_REG_COHER_STATUS_HOST)) {
        expr |= (index == XE_GPU_REG_COHER_STATUS_HOST);
      }
      if constexpr (bounds_has_bounds(XE_GPU_REG_DC_LUT_RW_INDEX,
                                      XE_GPU_REG_DC_LUT_30_COLOR)) {
        expr |= ((index - XE_GPU_REG_DC_LUT_RW_INDEX) <=
                 (XE_GPU_REG_DC_LUT_30_COLOR - XE_GPU_REG_DC_LUT_RW_INDEX));
      }
      // chrispy: reordered for msvc branch probability (assumes
      // if is taken and else is not)
      if (XE_LIKELY(expr == 0)) {
        XE_MSVC_REORDER_BARRIER();

      } else {
        HandleSpecialRegisterWrite(index, value);
        goto write_done;
      }
      XE_MSVC_ASSUME(index >= register_lower_bound &&
                     index < register_upper_bound);
      if constexpr (bounds_has_bounds(XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0,
                                      XE_GPU_REG_SHADER_CONSTANT_FETCH_31_5)) {
        if (index >= XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0 &&
            index <= XE_GPU_REG_SHADER_CONSTANT_FETCH_31_5) {
          cbuffer_binding_fetch_.up_to_date = false;
          // texture cache is never nullptr
          texture_cache_->TextureFetchConstantWritten(
              (index - XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0) / 6);

          goto write_done;
        }
      }
      XE_MSVC_ASSUME(index >= register_lower_bound &&
                     index < register_upper_bound);
      if constexpr (bounds_has_bounds(XE_GPU_REG_SHADER_CONSTANT_000_X,
                                      XE_GPU_REG_SHADER_CONSTANT_511_W)) {
        if (index >= XE_GPU_REG_SHADER_CONSTANT_000_X &&
            index <= XE_GPU_REG_SHADER_CONSTANT_511_W) {
          if (frame_open_) {
            uint32_t float_constant_index =
                (index - XE_GPU_REG_SHADER_CONSTANT_000_X) >> 2;
            if (float_constant_index >= 256) {
              float_constant_index -= 256;
              if (current_float_constant_map_pixel_[float_constant_index >> 6] &
                  (1ull << (float_constant_index & 63))) {
                cbuffer_binding_float_pixel_.up_to_date = false;
              }
            } else {
              if (current_float_constant_map_vertex_[float_constant_index >>
                                                     6] &
                  (1ull << (float_constant_index & 63))) {
                cbuffer_binding_float_vertex_.up_to_date = false;
              }
            }
      if (map_index < 256 && cbuffer_vertex_uptodate) {
        for (; map_index < end_map_index; ++map_index) {
          if (current_float_constant_map_vertex_[map_index >> 6] &
              (1ull << map_index)) {
            cbuffer_vertex_uptodate = false;
            break;
          }
          goto write_done;
        }
      }
      XE_MSVC_ASSUME(index >= register_lower_bound &&
                     index < register_upper_bound);
      if constexpr (bounds_has_bounds(XE_GPU_REG_SHADER_CONSTANT_BOOL_000_031,
                                      XE_GPU_REG_SHADER_CONSTANT_LOOP_31)) {
        if (index >= XE_GPU_REG_SHADER_CONSTANT_BOOL_000_031 &&
            index <= XE_GPU_REG_SHADER_CONSTANT_LOOP_31) {
          cbuffer_binding_bool_loop_.up_to_date = false;
          goto write_done;
      if (end_map_index > 256 && cbuffer_pixel_uptodate) {
        for (; map_index < end_map_index; ++map_index) {
          uint32_t float_constant_index = map_index;
          float_constant_index -= 256;
          if (current_float_constant_map_pixel_[float_constant_index >> 6] &
              (1ull << float_constant_index)) {
            cbuffer_pixel_uptodate = false;
            break;
          }
        }
      }
    }
  write_done:;
    cbuffer_binding_float_pixel_.up_to_date = cbuffer_pixel_uptodate;
    cbuffer_binding_float_vertex_.up_to_date = cbuffer_vertex_uptodate;
  }

  // maybe use non-temporal copy if possible...
  copy_and_swap_32_unaligned(&register_file_->values[start_index], base,
                             num_registers);
}
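The precheck above walks the per-frame float-constant bitmaps (current_float_constant_map_vertex_/pixel_, four 64-bit words each) to see whether any constant in the written range is actually referenced before invalidating the constant buffers. A standalone sketch of that "any bit set in [first, last)" scan over a small bitmap; note the sketch masks the shift amount explicitly, whereas the loop in the diff shifts by the raw map index:

#include <cstdint>

// Return true if any bit in [first, last) is set in a 256-bit dirty map
// stored as four 64-bit words (the same layout as the constant maps above).
bool AnyConstantDirty(const uint64_t map[4], uint32_t first, uint32_t last) {
  for (uint32_t i = first; i < last; ++i) {
    if (map[i >> 6] & (1ull << (i & 63))) {
      return true;
    }
  }
  return false;
}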
XE_FORCEINLINE
void D3D12CommandProcessor::WriteBoolLoopFromMem(uint32_t start_index,
                                                 uint32_t* base,
                                                 uint32_t num_registers) {
  cbuffer_binding_bool_loop_.up_to_date = false;
  copy_and_swap_32_unaligned(&register_file_->values[start_index], base,
                             num_registers);
}
XE_FORCEINLINE
void D3D12CommandProcessor::WriteFetchFromMem(uint32_t start_index,
                                              uint32_t* base,
                                              uint32_t num_registers) {
  cbuffer_binding_fetch_.up_to_date = false;

  uint32_t first_fetch =
      ((start_index - XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0) / 6);
  uint32_t last_fetch =  // i think last_fetch should be inclusive if its modulo
                         // is nz...
      (((start_index + num_registers) - XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0) /
       6);
  texture_cache_->TextureFetchConstantsWritten(first_fetch, last_fetch);

  copy_and_swap_32_unaligned(&register_file_->values[start_index], base,
                             num_registers);
}

void D3D12CommandProcessor::WritePossiblySpecialRegistersFromMem(
    uint32_t start_index, uint32_t* base, uint32_t numregs) {
  uint32_t end = numregs + start_index;
  for (uint32_t index = start_index; index < end; ++index, ++base) {
    uint32_t value = xe::load_and_swap<uint32_t>(base);

    register_file_->values[index].u32 = value;

    unsigned expr = 0;

    expr |= (index - XE_GPU_REG_SCRATCH_REG0 < 8);

    expr |= (index == XE_GPU_REG_COHER_STATUS_HOST);

    expr |= ((index - XE_GPU_REG_DC_LUT_RW_INDEX) <=
             (XE_GPU_REG_DC_LUT_30_COLOR - XE_GPU_REG_DC_LUT_RW_INDEX));

    if (expr == 0) {
    } else {
      HandleSpecialRegisterWrite(index, value);
    }
  }
}
template <uint32_t register_lower_bound, uint32_t register_upper_bound>
XE_FORCEINLINE void
D3D12CommandProcessor::WriteRegisterRangeFromMem_WithKnownBound(
    uint32_t start_index, uint32_t* base, uint32_t num_registers) {
  uint32_t end = start_index + num_registers;

  uint32_t current_index = start_index;

  auto get_end_before_qty = [&end, current_index](uint32_t regnum) {
    return std::min<uint32_t>(regnum, end) - current_index;
  };
#define REGULAR_WRITE_CALLBACK(s, e, i, b, n) \
  copy_and_swap_32_unaligned(&register_file_->values[i], b, n)
#define WRITE_FETCH_CONSTANTS_CALLBACK(str, er, ind, b, n) \
  WriteFetchFromMem(ind, b, n)
#define SPECIAL_REG_RANGE_CALLBACK(str, edr, ind, bs, n) \
  WritePossiblySpecialRegistersFromMem(ind, bs, n)
#define WRITE_SHADER_CONSTANTS_CALLBACK(start_range, end_range, index, base, \
                                        n) \
  WriteShaderConstantsFromMem(index, base, n)
#define WRITE_BOOL_LOOP_CALLBACK(s, e, i, b, n) WriteBoolLoopFromMem(i, b, n)

#define DO_A_RANGE(start_range, end_range, cb) \
  if constexpr (start_range >= register_lower_bound || \
                end_range > register_lower_bound) { \
    if (current_index < (end_range)) { \
      uint32_t ntowrite = get_end_before_qty(end_range); \
      cb((start_range), (end_range), current_index, base, ntowrite); \
      current_index += ntowrite; \
      base += ntowrite; \
    } \
    if (current_index >= end) { \
      return; \
    } \
  }

#define REFRESH_MSVC_RANGE() \
  XE_MSVC_ASSUME(current_index >= register_lower_bound && \
                 current_index < register_upper_bound)

  REFRESH_MSVC_RANGE();

  DO_A_RANGE(0, XE_GPU_REG_SCRATCH_REG0, REGULAR_WRITE_CALLBACK);
  REFRESH_MSVC_RANGE();
  DO_A_RANGE(XE_GPU_REG_SCRATCH_REG0, XE_GPU_REG_DC_LUT_30_COLOR + 1,
             SPECIAL_REG_RANGE_CALLBACK);
  REFRESH_MSVC_RANGE();
  DO_A_RANGE(XE_GPU_REG_DC_LUT_30_COLOR + 1, XE_GPU_REG_SHADER_CONSTANT_000_X,
             REGULAR_WRITE_CALLBACK);
  REFRESH_MSVC_RANGE();
  DO_A_RANGE(XE_GPU_REG_SHADER_CONSTANT_000_X,
             XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0,
             WRITE_SHADER_CONSTANTS_CALLBACK);
  REFRESH_MSVC_RANGE();
  DO_A_RANGE(XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0,
             XE_GPU_REG_SHADER_CONSTANT_FETCH_31_5 + 1,
             WRITE_FETCH_CONSTANTS_CALLBACK);
  REFRESH_MSVC_RANGE();
  DO_A_RANGE(XE_GPU_REG_SHADER_CONSTANT_BOOL_000_031,
             XE_GPU_REG_SHADER_CONSTANT_LOOP_31 + 1, WRITE_BOOL_LOOP_CALLBACK);
  REFRESH_MSVC_RANGE();
  DO_A_RANGE(XE_GPU_REG_SHADER_CONSTANT_LOOP_31 + 1, 65536,
             REGULAR_WRITE_CALLBACK);
}
template <uint32_t register_lower_bound, uint32_t register_upper_bound>
XE_FORCEINLINE void
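WriteRegisterRangeFromMem_WithKnownBound walks the incoming register span once, slicing it at fixed register boundaries and handing each slice to the matching specialized writer; DO_A_RANGE also drops slices that the template bounds prove can never be reached. A simplified sketch of the same range-slicing idea without the macros; the table layout and handler signature here are invented for illustration:

#include <algorithm>
#include <cstddef>
#include <cstdint>

using RangeHandler = void (*)(uint32_t start_index, const uint32_t* values,
                              uint32_t count);

struct RangeEntry {
  uint32_t end;          // exclusive upper bound of this register range
  RangeHandler handler;  // writer specialized for the range
};

// Walk [start_index, start_index + count) once, dispatching each piece to the
// handler that owns it. `ranges` must be sorted by `end` and cover the space.
void WriteRegisterSpan(const RangeEntry* ranges, size_t num_ranges,
                       uint32_t start_index, const uint32_t* values,
                       uint32_t count) {
  uint32_t current = start_index;
  uint32_t end = start_index + count;
  for (size_t r = 0; r < num_ranges && current < end; ++r) {
    if (current < ranges[r].end) {
      uint32_t n = std::min(end, ranges[r].end) - current;
      ranges[r].handler(current, values, n);
      current += n;
      values += n;
    }
  }
}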
@@ -2626,11 +2707,17 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type,
  // todo: use SIMD for getscissor + scaling here, should reduce code size more
  draw_util::Scissor scissor;
  draw_util::GetScissor(regs, scissor);
#if XE_ARCH_AMD64 == 1
  __m128i* scisp = (__m128i*)&scissor;
  *scisp = _mm_mullo_epi32(
      *scisp, _mm_setr_epi32(draw_resolution_scale_x, draw_resolution_scale_y,
                             draw_resolution_scale_x, draw_resolution_scale_y));
#else
  scissor.offset[0] *= draw_resolution_scale_x;
  scissor.offset[1] *= draw_resolution_scale_y;
  scissor.extent[0] *= draw_resolution_scale_x;
  scissor.extent[1] *= draw_resolution_scale_y;

#endif
  // Update viewport, scissor, blend factor and stencil reference.
  UpdateFixedFunctionState(viewport_info, scissor, primitive_polygonal,
                           normalized_depth_control);
@@ -211,12 +211,25 @@ class D3D12CommandProcessor final : public CommandProcessor {
 protected:
  bool SetupContext() override;
  void ShutdownContext() override;

  void WriteRegister(uint32_t index, uint32_t value) override;
  XE_FORCEINLINE
  void WriteRegisterForceinline(uint32_t index, uint32_t value);
  void WriteRegister(uint32_t index, uint32_t value) override;

  virtual void WriteRegistersFromMem(uint32_t start_index, uint32_t* base,
                                     uint32_t num_registers) override;
  /* helper functions for WriteRegistersFromMem */
  XE_FORCEINLINE
  void WriteShaderConstantsFromMem(uint32_t start_index, uint32_t* base,
                                   uint32_t num_registers);
  XE_FORCEINLINE
  void WriteBoolLoopFromMem(uint32_t start_index, uint32_t* base,
                            uint32_t num_registers);
  XE_FORCEINLINE
  void WriteFetchFromMem(uint32_t start_index, uint32_t* base,
                         uint32_t num_registers);

  void WritePossiblySpecialRegistersFromMem(uint32_t start_index, uint32_t* base,
                                            uint32_t num_registers);
  template <uint32_t register_lower_bound, uint32_t register_upper_bound>
  XE_FORCEINLINE void WriteRegisterRangeFromMem_WithKnownBound(
      uint32_t start_index, uint32_t* base, uint32_t num_registers);
@@ -551,30 +551,70 @@ void GetHostViewportInfo(GetViewportInfoArgs* XE_RESTRICT args,
    viewport_info_out.ndc_offset[i] = ndc_offset[i];
  }
}
void GetScissor(const RegisterFile& regs, Scissor& scissor_out,
                bool clamp_to_surface_pitch) {
template <bool clamp_to_surface_pitch>
static inline
void GetScissorTmpl(const RegisterFile& XE_RESTRICT regs,
                    Scissor& XE_RESTRICT scissor_out) {
#if XE_ARCH_AMD64 == 1
  auto pa_sc_window_scissor_tl = regs.Get<reg::PA_SC_WINDOW_SCISSOR_TL>();
  int32_t tl_x = int32_t(pa_sc_window_scissor_tl.tl_x);
  int32_t tl_y = int32_t(pa_sc_window_scissor_tl.tl_y);
  auto pa_sc_window_scissor_br = regs.Get<reg::PA_SC_WINDOW_SCISSOR_BR>();
  int32_t br_x = int32_t(pa_sc_window_scissor_br.br_x);
  int32_t br_y = int32_t(pa_sc_window_scissor_br.br_y);
  if (!pa_sc_window_scissor_tl.window_offset_disable) {
    auto pa_sc_window_offset = regs.Get<reg::PA_SC_WINDOW_OFFSET>();
    tl_x += pa_sc_window_offset.window_x_offset;
    tl_y += pa_sc_window_offset.window_y_offset;
    br_x += pa_sc_window_offset.window_x_offset;
    br_y += pa_sc_window_offset.window_y_offset;
  auto pa_sc_window_offset = regs.Get<reg::PA_SC_WINDOW_OFFSET>();
  auto pa_sc_screen_scissor_tl = regs.Get<reg::PA_SC_SCREEN_SCISSOR_TL>();
  auto pa_sc_screen_scissor_br = regs.Get<reg::PA_SC_SCREEN_SCISSOR_BR>();
  uint32_t surface_pitch = 0;
  if constexpr (clamp_to_surface_pitch) {
    surface_pitch = regs.Get<reg::RB_SURFACE_INFO>().surface_pitch;
  }
  uint32_t pa_sc_window_scissor_tl_tl_x = pa_sc_window_scissor_tl.tl_x,
           pa_sc_window_scissor_tl_tl_y = pa_sc_window_scissor_tl.tl_y,
           pa_sc_window_scissor_br_br_x = pa_sc_window_scissor_br.br_x,
           pa_sc_window_scissor_br_br_y = pa_sc_window_scissor_br.br_y,
           pa_sc_window_offset_window_x_offset =
               pa_sc_window_offset.window_x_offset,
           pa_sc_window_offset_window_y_offset =
               pa_sc_window_offset.window_y_offset,
           pa_sc_screen_scissor_tl_tl_x = pa_sc_screen_scissor_tl.tl_x,
           pa_sc_screen_scissor_tl_tl_y = pa_sc_screen_scissor_tl.tl_y,
           pa_sc_screen_scissor_br_br_x = pa_sc_screen_scissor_br.br_x,
           pa_sc_screen_scissor_br_br_y = pa_sc_screen_scissor_br.br_y;

  int32_t tl_x = int32_t(pa_sc_window_scissor_tl_tl_x);
  int32_t tl_y = int32_t(pa_sc_window_scissor_tl_tl_y);

  int32_t br_x = int32_t(pa_sc_window_scissor_br_br_x);
  int32_t br_y = int32_t(pa_sc_window_scissor_br_br_y);

  __m128i tmp1 = _mm_setr_epi32(tl_x, tl_y, br_x, br_y);
  __m128i pa_sc_scissor = _mm_setr_epi32(
      pa_sc_screen_scissor_tl_tl_x, pa_sc_screen_scissor_tl_tl_y,
      pa_sc_screen_scissor_br_br_x, pa_sc_screen_scissor_br_br_y);
  __m128i xyoffsetadd = _mm_cvtsi64x_si128(
      static_cast<unsigned long long>(pa_sc_window_offset_window_x_offset) |
      (static_cast<unsigned long long>(pa_sc_window_offset_window_y_offset)
       << 32));
  xyoffsetadd = _mm_unpacklo_epi64(xyoffsetadd, xyoffsetadd);
  // chrispy: put this here to make it clear that the shift by 31 is extracting
  // this field
  XE_MAYBE_UNUSED
  uint32_t window_offset_disable_reference =
      pa_sc_window_scissor_tl.window_offset_disable;

  __m128i offset_disable_mask = _mm_set1_epi32(pa_sc_window_scissor_tl.value);

  __m128i addend = _mm_blendv_epi8(xyoffsetadd, _mm_setzero_si128(),
                                   _mm_srai_epi32(offset_disable_mask, 31));

  tmp1 = _mm_add_epi32(tmp1, addend);

  //}
  // Screen scissor is not used by Direct3D 9 (always 0, 0 to 8192, 8192), but
  // still handled here for completeness.
  auto pa_sc_screen_scissor_tl = regs.Get<reg::PA_SC_SCREEN_SCISSOR_TL>();
  tl_x = std::max(tl_x, int32_t(pa_sc_screen_scissor_tl.tl_x));
  tl_y = std::max(tl_y, int32_t(pa_sc_screen_scissor_tl.tl_y));
  auto pa_sc_screen_scissor_br = regs.Get<reg::PA_SC_SCREEN_SCISSOR_BR>();
  br_x = std::min(br_x, int32_t(pa_sc_screen_scissor_br.br_x));
  br_y = std::min(br_y, int32_t(pa_sc_screen_scissor_br.br_y));
  if (clamp_to_surface_pitch) {
  __m128i lomax = _mm_max_epi32(tmp1, pa_sc_scissor);
  __m128i himin = _mm_min_epi32(tmp1, pa_sc_scissor);

  tmp1 = _mm_blend_epi16(lomax, himin, 0b11110000);

  if constexpr (clamp_to_surface_pitch) {
    // Clamp the horizontal scissor to surface_pitch for safety, in case that's
    // not done by the guest for some reason (it's not when doing draws without
    // clipping in Direct3D 9, for instance), to prevent overflow - this is
@@ -582,7 +622,79 @@ void GetScissor(const RegisterFile& regs, Scissor& scissor_out,
    // rasterization without render target width at all (pixel shader
    // interlock-based custom RB implementations) and using conventional render
    // targets, but padded to EDRAM tiles.
    uint32_t surface_pitch = regs.Get<reg::RB_SURFACE_INFO>().surface_pitch;
    tmp1 = _mm_blend_epi16(
        tmp1, _mm_min_epi32(tmp1, _mm_set1_epi32(surface_pitch)),
        0b00110011);
  }

  tmp1 = _mm_max_epi32(tmp1, _mm_setzero_si128());

  __m128i tl_in_high = _mm_unpacklo_epi64(tmp1, tmp1);

  __m128i final_br = _mm_max_epi32(tmp1, tl_in_high);
  final_br = _mm_sub_epi32(final_br, tl_in_high);
  __m128i scissor_res = _mm_blend_epi16(tmp1, final_br, 0b11110000);
  _mm_storeu_si128((__m128i*)&scissor_out, scissor_res);
#else
  auto pa_sc_window_scissor_tl = regs.Get<reg::PA_SC_WINDOW_SCISSOR_TL>();
  auto pa_sc_window_scissor_br = regs.Get<reg::PA_SC_WINDOW_SCISSOR_BR>();
  auto pa_sc_window_offset = regs.Get<reg::PA_SC_WINDOW_OFFSET>();
  auto pa_sc_screen_scissor_tl = regs.Get<reg::PA_SC_SCREEN_SCISSOR_TL>();
  auto pa_sc_screen_scissor_br = regs.Get<reg::PA_SC_SCREEN_SCISSOR_BR>();
  uint32_t surface_pitch = 0;
  if constexpr (clamp_to_surface_pitch) {
    surface_pitch = regs.Get<reg::RB_SURFACE_INFO>().surface_pitch;
  }
  uint32_t pa_sc_window_scissor_tl_tl_x = pa_sc_window_scissor_tl.tl_x,
           pa_sc_window_scissor_tl_tl_y = pa_sc_window_scissor_tl.tl_y,
           pa_sc_window_scissor_br_br_x = pa_sc_window_scissor_br.br_x,
           pa_sc_window_scissor_br_br_y = pa_sc_window_scissor_br.br_y,
           pa_sc_window_offset_window_x_offset =
               pa_sc_window_offset.window_x_offset,
           pa_sc_window_offset_window_y_offset =
               pa_sc_window_offset.window_y_offset,
           pa_sc_screen_scissor_tl_tl_x = pa_sc_screen_scissor_tl.tl_x,
           pa_sc_screen_scissor_tl_tl_y = pa_sc_screen_scissor_tl.tl_y,
           pa_sc_screen_scissor_br_br_x = pa_sc_screen_scissor_br.br_x,
           pa_sc_screen_scissor_br_br_y = pa_sc_screen_scissor_br.br_y;

  int32_t tl_x = int32_t(pa_sc_window_scissor_tl_tl_x);
  int32_t tl_y = int32_t(pa_sc_window_scissor_tl_tl_y);

  int32_t br_x = int32_t(pa_sc_window_scissor_br_br_x);
  int32_t br_y = int32_t(pa_sc_window_scissor_br_br_y);

  // chrispy: put this here to make it clear that the shift by 31 is extracting
  // this field
  XE_MAYBE_UNUSED
  uint32_t window_offset_disable_reference =
      pa_sc_window_scissor_tl.window_offset_disable;
  int32_t window_offset_disable_mask =
      ~(static_cast<int32_t>(pa_sc_window_scissor_tl.value) >> 31);
  // if (!pa_sc_window_scissor_tl.window_offset_disable) {

  tl_x += pa_sc_window_offset_window_x_offset & window_offset_disable_mask;
  tl_y += pa_sc_window_offset_window_y_offset & window_offset_disable_mask;
  br_x += pa_sc_window_offset_window_x_offset & window_offset_disable_mask;
  br_y += pa_sc_window_offset_window_y_offset & window_offset_disable_mask;
  //}
  // Screen scissor is not used by Direct3D 9 (always 0, 0 to 8192, 8192), but
  // still handled here for completeness.

  tl_x = std::max(tl_x, int32_t(pa_sc_screen_scissor_tl_tl_x));
  tl_y = std::max(tl_y, int32_t(pa_sc_screen_scissor_tl_tl_y));

  br_x = std::min(br_x, int32_t(pa_sc_screen_scissor_br_br_x));
  br_y = std::min(br_y, int32_t(pa_sc_screen_scissor_br_br_y));
  if constexpr (clamp_to_surface_pitch) {
    // Clamp the horizontal scissor to surface_pitch for safety, in case that's
    // not done by the guest for some reason (it's not when doing draws without
    // clipping in Direct3D 9, for instance), to prevent overflow - this is
    // important for host implementations, both based on target-independent
    // rasterization without render target width at all (pixel shader
    // interlock-based custom RB implementations) and using conventional render
    // targets, but padded to EDRAM tiles.

    tl_x = std::min(tl_x, int32_t(surface_pitch));
    br_x = std::min(br_x, int32_t(surface_pitch));
  }
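Both the SIMD and scalar paths replace the old `if (!window_offset_disable)` branch with a mask built from the flag bit: window_offset_disable sits in bit 31 of PA_SC_WINDOW_SCISSOR_TL, so an arithmetic shift right by 31 broadcasts it across the word and the offset is ANDed with the inverted mask. A tiny standalone illustration of that conditional-add-by-mask idiom:

#include <cstdint>

// Adds `offset` to `value` only when the flag in bit 31 of `reg_value` is
// clear, without a branch. Mirrors the window_offset_disable handling above.
int32_t AddOffsetIfEnabled(int32_t value, int32_t offset, uint32_t reg_value) {
  // Arithmetic shift broadcasts bit 31: all ones when the flag is set.
  int32_t disable_mask = static_cast<int32_t>(reg_value) >> 31;
  // ~disable_mask is all ones when the flag is clear, zero when it is set.
  return value + (offset & ~disable_mask);
}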
@@ -599,6 +711,16 @@ void GetScissor(const RegisterFile& regs, Scissor& scissor_out,
  scissor_out.offset[1] = uint32_t(tl_y);
  scissor_out.extent[0] = uint32_t(br_x - tl_x);
  scissor_out.extent[1] = uint32_t(br_y - tl_y);
#endif
}

void GetScissor(const RegisterFile& XE_RESTRICT regs,
                Scissor& XE_RESTRICT scissor_out, bool clamp_to_surface_pitch) {
  if (clamp_to_surface_pitch) {
    return GetScissorTmpl<true>(regs, scissor_out);
  } else {
    return GetScissorTmpl<false>(regs, scissor_out);
  }
}

uint32_t GetNormalizedColorMask(const RegisterFile& regs,
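GetScissor now just forwards its runtime flag into a template parameter, so each instantiation of GetScissorTmpl folds its `if constexpr (clamp_to_surface_pitch)` blocks away at compile time instead of testing the flag on every call. The shape of that pattern, reduced to a toy example with invented names:

#include <cstdint>

template <bool kClamp>
static uint32_t ProcessTmpl(uint32_t value, uint32_t limit) {
  if constexpr (kClamp) {
    // Only compiled into the <true> instantiation.
    if (value > limit) value = limit;
  }
  return value * 2;
}

// The runtime flag picks one of the two pre-specialized bodies once.
uint32_t Process(uint32_t value, uint32_t limit, bool clamp) {
  return clamp ? ProcessTmpl<true>(value, limit)
               : ProcessTmpl<false>(value, limit);
}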
@@ -863,7 +985,7 @@ bool GetResolveInfo(const RegisterFile& regs, const Memory& memory,
    y1 = y0 + int32_t(xenos::kMaxResolveSize);
  }
  // fails in forza horizon 1
  //x0 is 0, x1 is 0x100, y0 is 0x100, y1 is 0x100
  // x0 is 0, x1 is 0x100, y0 is 0x100, y1 is 0x100
  assert_true(x0 <= x1 && y0 <= y1);
  if (x0 >= x1 || y0 >= y1) {
    XELOGE("Resolve region is empty");
@@ -1103,7 +1225,7 @@ bool GetResolveInfo(const RegisterFile& regs, const Memory& memory,
  info_out.rb_depth_clear = regs[XE_GPU_REG_RB_DEPTH_CLEAR].u32;
  info_out.rb_color_clear = regs[XE_GPU_REG_RB_COLOR_CLEAR].u32;
  info_out.rb_color_clear_lo = regs[XE_GPU_REG_RB_COLOR_CLEAR_LO].u32;
#if 0
#if 0
  XELOGD(
      "Resolve: {},{} <= x,y < {},{}, {} -> {} at 0x{:08X} (potentially "
      "modified memory range 0x{:08X} to 0x{:08X})",
@@ -1114,7 +1236,7 @@ bool GetResolveInfo(const RegisterFile& regs, const Memory& memory,
      xenos::ColorRenderTargetFormat(color_edram_info.format)),
      FormatInfo::GetName(dest_format), rb_copy_dest_base, copy_dest_extent_start,
      copy_dest_extent_end);
#endif
#endif
  return true;
}
XE_MSVC_OPTIMIZE_REVERT()
@@ -433,13 +433,15 @@ struct GetViewportInfoArgs {
void GetHostViewportInfo(GetViewportInfoArgs* XE_RESTRICT args,
                         ViewportInfo& viewport_info_out);

struct Scissor {
struct alignas(16) Scissor {
  // Offset from render target UV = 0 to +UV.
  uint32_t offset[2];
  // Extent can be zero.
  uint32_t extent[2];
};
void GetScissor(const RegisterFile& regs, Scissor& scissor_out,

void GetScissor(const RegisterFile& XE_RESTRICT regs,
                Scissor& XE_RESTRICT scissor_out,
                bool clamp_to_surface_pitch = true);

// Returns the color component write mask for the draw command taking into
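Scissor becomes 16-byte aligned so its four uint32 fields can be loaded, scaled, and stored as one __m128i in the IssueDraw hunk earlier. A minimal sketch of why the alignment matters for that whole-struct multiply; the helper name is invented and the scaling factors are placeholders:

#include <smmintrin.h>  // SSE4.1 for _mm_mullo_epi32
#include <cstdint>

struct alignas(16) ScissorSketch {
  uint32_t offset[2];
  uint32_t extent[2];
};

// Scale all four fields at once; alignas(16) is what makes the direct __m128i
// load/store through the struct pointer aligned and therefore safe.
inline void ScaleScissor(ScissorSketch& s, uint32_t sx, uint32_t sy) {
  __m128i* p = reinterpret_cast<__m128i*>(&s);
  *p = _mm_mullo_epi32(*p, _mm_setr_epi32(sx, sy, sx, sy));
}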
@@ -109,7 +109,7 @@ XE_NOINLINE
XE_COLD
bool HitUnimplementedOpcode(uint32_t opcode, uint32_t count) XE_RESTRICT;

XE_NOINLINE
XE_FORCEINLINE
XE_NOALIAS
uint32_t GetCurrentRingReadCount();
@@ -4,7 +4,6 @@ void COMMAND_PROCESSOR::ExecuteIndirectBuffer(uint32_t ptr,
                                              uint32_t count) XE_RESTRICT {
  SCOPE_profile_cpu_f("gpu");

  trace_writer_.WriteIndirectBufferStart(ptr, count * sizeof(uint32_t));
  if (count != 0) {
    RingBuffer old_reader = reader_;
@@ -32,10 +31,9 @@ void COMMAND_PROCESSOR::ExecuteIndirectBuffer(uint32_t ptr,
    trace_writer_.WriteIndirectBufferEnd();
    reader_ = old_reader;
  } else {
    //rare, but i've seen it happen! (and then a division by 0 occurs)
    // rare, but i've seen it happen! (and then a division by 0 occurs)
    return;
  }
}

bool COMMAND_PROCESSOR::ExecutePacket() {
@@ -88,8 +86,9 @@ bool COMMAND_PROCESSOR::ExecutePacketType0_CountOverflow(uint32_t count) {
                count * sizeof(uint32_t));
  return false;
}
/*
Todo: optimize this function this one along with execute packet type III are the most frequently called functions for PM4
/*
    Todo: optimize this function; this one, along with execute packet type III,
    is among the most frequently called functions for PM4
*/
XE_NOINLINE
bool COMMAND_PROCESSOR::ExecutePacketType0(uint32_t packet) XE_RESTRICT {
@@ -99,7 +98,6 @@ bool COMMAND_PROCESSOR::ExecutePacketType0(uint32_t packet) XE_RESTRICT {

  uint32_t count = ((packet >> 16) & 0x3FFF) + 1;

  if (COMMAND_PROCESSOR::GetCurrentRingReadCount() >=
      count * sizeof(uint32_t)) {
    trace_writer_.WritePacketStart(uint32_t(reader_.read_ptr() - 4), 1 + count);
@@ -143,7 +141,7 @@ bool COMMAND_PROCESSOR::ExecutePacketType2(uint32_t packet) XE_RESTRICT {
  trace_writer_.WritePacketEnd();
  return true;
}
XE_NOINLINE
XE_FORCEINLINE
XE_NOALIAS
uint32_t COMMAND_PROCESSOR::GetCurrentRingReadCount() {
  return reader_.read_count();
@@ -446,41 +444,46 @@ bool COMMAND_PROCESSOR::ExecutePacketType3_INDIRECT_BUFFER(
  return true;
}

XE_NOINLINE
/*
    chrispy: this is fine to inline; as a noinline function it compiled down
    to 54 bytes
*/
static bool MatchValueAndRef(uint32_t value, uint32_t ref, uint32_t wait_info) {
  /*
      Todo: should subtract the values from each other twice with the sides
      inverted, then create a mask from the sign bits and use the wait_info
      value to select the bits that correctly implement the condition.
      If neither subtraction has the sign bit set, the values are equal.
  */
  bool matched = false;
  switch (wait_info & 0x7) {
    case 0x0:  // Never.
      matched = false;
      break;
    case 0x1:  // Less than reference.
      matched = value < ref;
      break;
    case 0x2:  // Less than or equal to reference.
      matched = value <= ref;
      break;
    case 0x3:  // Equal to reference.
      matched = value == ref;
      break;
    case 0x4:  // Not equal to reference.
      matched = value != ref;
      break;
    case 0x5:  // Greater than or equal to reference.
      matched = value >= ref;
      break;
    case 0x6:  // Greater than reference.
      matched = value > ref;
      break;
    case 0x7:  // Always
      matched = true;
      break;
  }
  return matched;
  // smaller code is generated than the #else path, although whether it is
  // faster i do not know. i don't think games do an enormous number of
  // cond_write though, so we have picked the path with the smaller codegen.
  // we do technically have more instructions executed vs the switch case
  // method, but we have no mispredicts and most of our instructions are
  // 0.25/0.3 throughput
#if 1
  uint32_t value_minus_ref =
      static_cast<uint32_t>(static_cast<int32_t>(value - ref) >> 31);
  uint32_t ref_minus_value =
      static_cast<uint32_t>(static_cast<int32_t>(ref - value) >> 31);
  uint32_t eqmask = ~(value_minus_ref | ref_minus_value);
  uint32_t nemask = (value_minus_ref | ref_minus_value);

  uint32_t value_lt_mask = value_minus_ref;
  uint32_t value_gt_mask = ref_minus_value;
  uint32_t value_lte_mask = value_lt_mask | eqmask;
  uint32_t value_gte_mask = value_gt_mask | eqmask;

  uint32_t bits_for_selecting =
      (value_lt_mask & (1 << 1)) | (value_lte_mask & (1 << 2)) |
      (eqmask & (1 << 3)) | (nemask & (1 << 4)) | (value_gte_mask & (1 << 5)) |
      (value_gt_mask & (1 << 6)) | (1 << 7);

  return (bits_for_selecting >> (wait_info & 7)) & 1;

#else

  return ((((value < ref) << 1) | ((value <= ref) << 2) |
           ((value == ref) << 3) | ((value != ref) << 4) |
           ((value >= ref) << 5) | ((value > ref) << 6) | (1 << 7)) >>
          (wait_info & 7)) &
         1;
#endif
}
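The new MatchValueAndRef builds all the comparison outcomes up front, packs one bit per condition into a selector word, and indexes it with the 3-bit wait function; the switch version is kept only as the removed lines above, and a direct-compare variant of the same selection idea sits under #else. A small standalone harness that cross-checks the switch form against that packed-bits form over the full function space (names local to this sketch):

#include <cassert>
#include <cstdint>

static bool MatchSwitch(uint32_t value, uint32_t ref, uint32_t wait_info) {
  switch (wait_info & 0x7) {
    case 0x0: return false;          // Never.
    case 0x1: return value < ref;    // Less than.
    case 0x2: return value <= ref;   // Less than or equal.
    case 0x3: return value == ref;   // Equal.
    case 0x4: return value != ref;   // Not equal.
    case 0x5: return value >= ref;   // Greater than or equal.
    case 0x6: return value > ref;    // Greater than.
    default:  return true;           // Always.
  }
}

static bool MatchPackedBits(uint32_t value, uint32_t ref, uint32_t wait_info) {
  // One bit per condition, selected by the 3-bit wait function.
  return ((((value < ref) << 1) | ((value <= ref) << 2) |
           ((value == ref) << 3) | ((value != ref) << 4) |
           ((value >= ref) << 5) | ((value > ref) << 6) | (1 << 7)) >>
          (wait_info & 7)) &
         1;
}

int main() {
  const uint32_t samples[] = {0u, 1u, 2u, 0x7FFFFFFFu, 0x80000000u,
                              0xFFFFFFFFu};
  for (uint32_t v : samples)
    for (uint32_t r : samples)
      for (uint32_t f = 0; f < 8; ++f)
        assert(MatchSwitch(v, r, f) == MatchPackedBits(v, r, f));
  return 0;
}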
XE_NOINLINE
bool COMMAND_PROCESSOR::ExecutePacketType3_WAIT_REG_MEM(
@@ -520,19 +523,17 @@ bool COMMAND_PROCESSOR::ExecutePacketType3_WAIT_REG_MEM(
        PrepareForWait();
        if (!cvars::vsync) {
          // User wants it fast and dangerous.
          xe::threading::MaybeYield();
          // do nothing
        } else {
          xe::threading::Sleep(std::chrono::milliseconds(wait / 0x100));
          ReturnFromWait();
        }
        // xe::threading::SyncMemory();
        ReturnFromWait();

        if (!worker_running_) {
          // Short-circuited exit.
          return false;
        }
      } else {
        xe::threading::MaybeYield();
      }
    }
  } while (!matched);
|
@ -1128,7 +1129,7 @@ bool COMMAND_PROCESSOR::ExecutePacketType3_VIZ_QUERY(
|
|||
}
|
||||
|
||||
uint32_t COMMAND_PROCESSOR::ExecutePrimaryBuffer(uint32_t read_index,
|
||||
uint32_t write_index) {
|
||||
uint32_t write_index) {
|
||||
SCOPE_profile_cpu_f("gpu");
|
||||
#if XE_ENABLE_TRACE_WRITER_INSTRUMENTATION == 1
|
||||
// If we have a pending trace stream open it now. That way we ensure we get
|
||||
|
|
|
@@ -104,6 +104,15 @@ class TextureCache {
  void TextureFetchConstantWritten(uint32_t index) {
    texture_bindings_in_sync_ &= ~(UINT32_C(1) << index);
  }
  void TextureFetchConstantsWritten(uint32_t first_index, uint32_t last_index) {
    // generate a mask of all bits before the first index, and xor it with all
    // bits up to and including the last index; this produces a mask covering
    // only the bits between first and last
    uint32_t res = ((1U << first_index) - 1) ^ ((1U << (last_index + 1)) - 1);
    // todo: check that this is right

    texture_bindings_in_sync_ &= ~res;
  }
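The mask is intended to cover bits first_index..last_index inclusive. A quick standalone check of the XOR-of-prefixes expression against a naive loop; the sketch widens to 64 bits so that last_index == 31 does not shift a 32-bit value by 32, which the 32-bit form in the diff would do:

#include <cassert>
#include <cstdint>

// Reference: set bits [first, last] one at a time.
static uint32_t NaiveMask(uint32_t first, uint32_t last) {
  uint32_t m = 0;
  for (uint32_t i = first; i <= last; ++i) m |= 1u << i;
  return m;
}

// The XOR-of-prefixes form from the diff, widened to avoid the 1U << 32 case.
static uint32_t PrefixXorMask(uint32_t first, uint32_t last) {
  uint64_t before_first = (uint64_t(1) << first) - 1;
  uint64_t through_last = (uint64_t(1) << (last + 1)) - 1;
  return uint32_t(before_first ^ through_last);
}

int main() {
  for (uint32_t first = 0; first < 32; ++first)
    for (uint32_t last = first; last < 32; ++last)
      assert(PrefixXorMask(first, last) == NaiveMask(first, last));
  return 0;
}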

  virtual void RequestTextures(uint32_t used_texture_mask);