diff --git a/src/xenia/base/dma.cc b/src/xenia/base/dma.cc index 7d2d9d80c..ead0ac490 100644 --- a/src/xenia/base/dma.cc +++ b/src/xenia/base/dma.cc @@ -1,4 +1,6 @@ #include "dma.h" +#include "logging.h" +#include "xbyak/xbyak/xbyak_util.h" template static void xedmaloghelper(const char (&fmt)[N], Ts... args) { @@ -14,8 +16,8 @@ using xe::swcache::CacheLine; static constexpr unsigned NUM_CACHELINES_IN_PAGE = 4096 / sizeof(CacheLine); XE_FORCEINLINE -static void XeCopy16384Streaming(CacheLine* XE_RESTRICT to, - CacheLine* XE_RESTRICT from) { +static void XeCopy16384StreamingAVX(CacheLine* XE_RESTRICT to, + CacheLine* XE_RESTRICT from) { uint32_t num_lines_for_8k = 4096 / XE_HOST_CACHE_LINE_SIZE; CacheLine* dest1 = to; @@ -46,16 +48,58 @@ static void XeCopy16384Streaming(CacheLine* XE_RESTRICT to, } XE_MSVC_REORDER_BARRIER(); } +XE_FORCEINLINE +static void XeCopy16384Movdir64M(CacheLine* XE_RESTRICT to, + CacheLine* XE_RESTRICT from) { + uint32_t num_lines_for_8k = 4096 / XE_HOST_CACHE_LINE_SIZE; + + CacheLine* dest1 = to; + CacheLine* src1 = from; + + CacheLine* dest2 = to + NUM_CACHELINES_IN_PAGE; + CacheLine* src2 = from + NUM_CACHELINES_IN_PAGE; + + CacheLine* dest3 = to + (NUM_CACHELINES_IN_PAGE * 2); + CacheLine* src3 = from + (NUM_CACHELINES_IN_PAGE * 2); + + CacheLine* dest4 = to + (NUM_CACHELINES_IN_PAGE * 3); + CacheLine* src4 = from + (NUM_CACHELINES_IN_PAGE * 3); +#pragma loop(no_vector) + for (uint32_t i = 0; i < num_lines_for_8k; ++i) { +#if 0 + xe::swcache::CacheLine line0, line1, line2, line3; + + xe::swcache::ReadLine(&line0, src1 + i); + xe::swcache::ReadLine(&line1, src2 + i); + xe::swcache::ReadLine(&line2, src3 + i); + xe::swcache::ReadLine(&line3, src4 + i); + XE_MSVC_REORDER_BARRIER(); + xe::swcache::WriteLineNT(dest1 + i, &line0); + xe::swcache::WriteLineNT(dest2 + i, &line1); + + xe::swcache::WriteLineNT(dest3 + i, &line2); + xe::swcache::WriteLineNT(dest4 + i, &line3); +#else + _movdir64b(dest1 + i, src1 + i); + _movdir64b(dest2 + i, src2 + i); + _movdir64b(dest3 + i, src3 + i); + _movdir64b(dest4 + i, src4 + i); +#endif + } + XE_MSVC_REORDER_BARRIER(); +} namespace xe::dma { -XE_FORCEINLINE -static void vastcpy_impl(CacheLine* XE_RESTRICT physaddr, - CacheLine* XE_RESTRICT rdmapping, - uint32_t written_length) { +using VastCpyDispatch = void (*)(CacheLine* XE_RESTRICT physaddr, + CacheLine* XE_RESTRICT rdmapping, + uint32_t written_length); +static void vastcpy_impl_avx(CacheLine* XE_RESTRICT physaddr, + CacheLine* XE_RESTRICT rdmapping, + uint32_t written_length) { static constexpr unsigned NUM_LINES_FOR_16K = 16384 / XE_HOST_CACHE_LINE_SIZE; while (written_length >= 16384) { - XeCopy16384Streaming(physaddr, rdmapping); + XeCopy16384StreamingAVX(physaddr, rdmapping); physaddr += NUM_LINES_FOR_16K; rdmapping += NUM_LINES_FOR_16K; @@ -88,12 +132,85 @@ static void vastcpy_impl(CacheLine* XE_RESTRICT physaddr, xe::swcache::WriteLineNT(physaddr + i, &line0); } } +static void vastcpy_impl_movdir64m(CacheLine* XE_RESTRICT physaddr, + CacheLine* XE_RESTRICT rdmapping, + uint32_t written_length) { + static constexpr unsigned NUM_LINES_FOR_16K = 16384 / XE_HOST_CACHE_LINE_SIZE; + + while (written_length >= 16384) { + XeCopy16384Movdir64M(physaddr, rdmapping); + + physaddr += NUM_LINES_FOR_16K; + rdmapping += NUM_LINES_FOR_16K; + + written_length -= 16384; + } + + if (!written_length) { + return; + } + uint32_t num_written_lines = written_length / XE_HOST_CACHE_LINE_SIZE; + + uint32_t i = 0; + + for (; i + 1 < num_written_lines; i += 2) { + _movdir64b(physaddr + i, rdmapping 
+ i); + _movdir64b(physaddr + i + 1, rdmapping + i + 1); + } + + if (i < num_written_lines) { + _movdir64b(physaddr + i, rdmapping + i); + } +} + +static class DMAFeatures { + public: + uint32_t has_fast_rep_movsb : 1; + uint32_t has_movdir64b : 1; + + DMAFeatures() { + unsigned int data[4]; + memset(data, 0, sizeof(data)); + // intel extended features + Xbyak::util::Cpu::getCpuidEx(7, 0, data); + if (data[2] & (1 << 28)) { + has_movdir64b = 1; + } + if (data[1] & (1 << 9)) { + has_fast_rep_movsb = 1; + } + } +} dma_x86_features; +XE_COLD +static void first_vastcpy(CacheLine* XE_RESTRICT physaddr, + CacheLine* XE_RESTRICT rdmapping, + uint32_t written_length); + +static VastCpyDispatch vastcpy_dispatch = first_vastcpy; + +XE_COLD +static void first_vastcpy(CacheLine* XE_RESTRICT physaddr, + CacheLine* XE_RESTRICT rdmapping, + uint32_t written_length) { + VastCpyDispatch dispatch_to_use = nullptr; + if (dma_x86_features.has_movdir64b) { + XELOGI("Selecting MOVDIR64M vastcpy."); + dispatch_to_use = vastcpy_impl_movdir64m; + } else { + XELOGI("Selecting generic AVX vastcpy."); + dispatch_to_use = vastcpy_impl_avx; + } + + vastcpy_dispatch = + dispatch_to_use; // all future calls will go through our selected path + return vastcpy_dispatch(physaddr, rdmapping, written_length); +} XE_NOINLINE void vastcpy(uint8_t* XE_RESTRICT physaddr, uint8_t* XE_RESTRICT rdmapping, uint32_t written_length) { - return vastcpy_impl((CacheLine*)physaddr, (CacheLine*)rdmapping, - written_length); + return vastcpy_dispatch((CacheLine*)physaddr, (CacheLine*)rdmapping, + written_length); } #define XEDMA_NUM_WORKERS 4 diff --git a/src/xenia/base/logging.cc b/src/xenia/base/logging.cc index 700bb3c66..db6dfc5b7 100644 --- a/src/xenia/base/logging.cc +++ b/src/xenia/base/logging.cc @@ -466,8 +466,7 @@ void ShutdownLogging() { } bool logging::internal::ShouldLog(LogLevel log_level) { - return logger_ != nullptr && - static_cast(log_level) <= cvars::log_level; + return static_cast(log_level) <= cvars::log_level; } std::pair logging::internal::GetThreadBuffer() { @@ -476,7 +475,7 @@ std::pair logging::internal::GetThreadBuffer() { void logging::internal::AppendLogLine(LogLevel log_level, const char prefix_char, size_t written) { - if (!ShouldLog(log_level) || !written) { + if (!logger_ || !ShouldLog(log_level) || !written) { return; } logger_->AppendLine(xe::threading::current_thread_id(), prefix_char, diff --git a/src/xenia/base/memory.h b/src/xenia/base/memory.h index b924c267c..b5574b815 100644 --- a/src/xenia/base/memory.h +++ b/src/xenia/base/memory.h @@ -612,7 +612,7 @@ enum class PrefetchTag { Write, Nontemporal, Level3, Level2, Level1 }; template static void Prefetch(const void* addr) { - static_assert(false, "Unknown tag"); + xenia_assert(false && "Unknown tag"); } template <> diff --git a/src/xenia/base/platform.h b/src/xenia/base/platform.h index ebb555034..e0c42b42d 100644 --- a/src/xenia/base/platform.h +++ b/src/xenia/base/platform.h @@ -127,8 +127,22 @@ #define XE_FORCEINLINE inline #define XE_NOINLINE #define XE_COLD -#define XE_LIKELY(...) (!!(__VA_ARGS__)) -#define XE_UNLIKELY(...) (!!(__VA_ARGS__)) + +#define XE_LIKELY_IF(...) if (!!(__VA_ARGS__)) [[likely]] +#define XE_UNLIKELY_IF(...) if (!!(__VA_ARGS__)) [[unlikely]] +#endif + +#if XE_COMPILER_HAS_GNU_EXTENSIONS == 1 +#define XE_LIKELY_IF(...) if (XE_LIKELY(__VA_ARGS__)) +#define XE_UNLIKELY_IF(...) if (XE_UNLIKELY(__VA_ARGS__)) +#else +#if __cplusplus >= 202002 +#define XE_LIKELY_IF(...) if (!!(__VA_ARGS__)) [[likely]] +#define XE_UNLIKELY_IF(...) 
if (!!(__VA_ARGS__)) [[unlikely]] +#else +#define XE_LIKELY_IF(...) if (!!(__VA_ARGS__)) +#define XE_UNLIKELY_IF(...) if (!!(__VA_ARGS__)) +#endif #endif // only use __restrict if MSVC, for clang/gcc we can use -fstrict-aliasing which // acts as __restrict across the board todo: __restrict is part of the type diff --git a/src/xenia/base/ring_buffer.cc b/src/xenia/base/ring_buffer.cc index d7176c068..53cd4d703 100644 --- a/src/xenia/base/ring_buffer.cc +++ b/src/xenia/base/ring_buffer.cc @@ -45,15 +45,17 @@ void RingBuffer::AdvanceWrite(size_t _count) { RingBuffer::ReadRange RingBuffer::BeginRead(size_t _count) { ring_size_t count = std::min(static_cast(_count), capacity_); - if (!count) { - return {0}; + XE_LIKELY_IF(count) { + if (read_offset_ + count < capacity_) { + return {buffer_ + read_offset_, nullptr, count, 0}; + } else { + ring_size_t left_half = capacity_ - read_offset_; + ring_size_t right_half = count - left_half; + return {buffer_ + read_offset_, buffer_, left_half, right_half}; + } } - if (read_offset_ + count < capacity_) { - return {buffer_ + read_offset_, nullptr, count, 0}; - } else { - ring_size_t left_half = capacity_ - read_offset_; - ring_size_t right_half = count - left_half; - return {buffer_ + read_offset_, buffer_, left_half, right_half}; + else { + return {0}; } } diff --git a/src/xenia/cpu/backend/x64/x64_emitter.cc b/src/xenia/cpu/backend/x64/x64_emitter.cc index 17abb72e7..32fec4fe2 100644 --- a/src/xenia/cpu/backend/x64/x64_emitter.cc +++ b/src/xenia/cpu/backend/x64/x64_emitter.cc @@ -1068,7 +1068,15 @@ static const vec128_t xmm_consts[] = { /*XMMXOPWordShiftMask*/ vec128s(15), /*XMMXOPDwordShiftMask*/ - vec128i(31)}; + vec128i(31), + /*XMMLVLShuffle*/ + v128_setr_bytes(3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12), + /*XMMLVRCmp16*/ + vec128b(16), + /*XMMSTVLShuffle*/ + v128_setr_bytes(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15), + /* XMMSTVRSwapMask*/ + vec128b((uint8_t)0x83)}; void* X64Emitter::FindByteConstantOffset(unsigned bytevalue) { for (auto& vec : xmm_consts) { diff --git a/src/xenia/cpu/backend/x64/x64_emitter.h b/src/xenia/cpu/backend/x64/x64_emitter.h index b20978ea3..b31e7d4d3 100644 --- a/src/xenia/cpu/backend/x64/x64_emitter.h +++ b/src/xenia/cpu/backend/x64/x64_emitter.h @@ -168,6 +168,10 @@ enum XmmConst { XMMXOPByteShiftMask, XMMXOPWordShiftMask, XMMXOPDwordShiftMask, + XMMLVLShuffle, + XMMLVRCmp16, + XMMSTVLShuffle, + XMMSTVRSwapMask // swapwordmask with bit 7 set }; using amdfx::xopcompare_e; diff --git a/src/xenia/cpu/backend/x64/x64_seq_memory.cc b/src/xenia/cpu/backend/x64/x64_seq_memory.cc index cc6c1cf32..3a64acc18 100644 --- a/src/xenia/cpu/backend/x64/x64_seq_memory.cc +++ b/src/xenia/cpu/backend/x64/x64_seq_memory.cc @@ -16,6 +16,7 @@ #include "xenia/base/memory.h" #include "xenia/cpu/backend/x64/x64_backend.h" #include "xenia/cpu/backend/x64/x64_op.h" +#include "xenia/cpu/backend/x64/x64_stack_layout.h" #include "xenia/cpu/backend/x64/x64_tracers.h" #include "xenia/cpu/ppc/ppc_context.h" #include "xenia/cpu/processor.h" @@ -359,6 +360,451 @@ EMITTER_OPCODE_TABLE(OPCODE_ATOMIC_EXCHANGE, ATOMIC_EXCHANGE_I8, ATOMIC_EXCHANGE_I16, ATOMIC_EXCHANGE_I32, ATOMIC_EXCHANGE_I64); +static __m128i callnativesafe_lvl(void* ctx, void* addr) { + uintptr_t uaddr = reinterpret_cast(addr); + + uintptr_t bad_offs = uaddr & 0xf; + + uaddr &= ~0xfULL; + + __m128i tempload = _mm_loadu_si128((const __m128i*)uaddr); + + __m128i badhelper = + _mm_setr_epi8(3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12); + + __m128i tmpshuf = 
_mm_add_epi8(badhelper, _mm_set1_epi8((char)bad_offs)); + + tmpshuf = _mm_or_si128(tmpshuf, _mm_cmpgt_epi8(tmpshuf, _mm_set1_epi8(15))); + return _mm_shuffle_epi8(tempload, tmpshuf); +} + +struct LVL_V128 : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.mov(e.edx, 0xf); + + e.lea(e.rcx, e.ptr[ComputeMemoryAddress(e, i.src1)]); + e.mov(e.eax, 0xf); + + e.and_(e.eax, e.ecx); + e.or_(e.rcx, e.rdx); + e.vmovd(e.xmm0, e.eax); + + e.xor_(e.rcx, e.rdx); + e.vpxor(e.xmm1, e.xmm1); + e.vmovdqa(e.xmm3, e.ptr[e.rcx]); + e.vmovdqa(e.xmm2, e.GetXmmConstPtr(XMMLVLShuffle)); + e.vmovdqa(i.dest, e.GetXmmConstPtr(XMMPermuteControl15)); + e.vpshufb(e.xmm0, e.xmm0, e.xmm1); + + e.vpaddb(e.xmm2, e.xmm0); + + e.vpcmpgtb(e.xmm1, e.xmm2, i.dest); + e.vpor(e.xmm0, e.xmm1, e.xmm2); + e.vpshufb(i.dest, e.xmm3, e.xmm0); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_LVL, LVL_V128); + +static __m128i callnativesafe_lvr(void* ctx, void* addr) { + uintptr_t uaddr = reinterpret_cast(addr); + + uintptr_t bad_offs = uaddr & 0xf; + if (!bad_offs) { + return _mm_setzero_si128(); + } + uaddr &= ~0xfULL; + + __m128i tempload = _mm_loadu_si128((const __m128i*)uaddr); + + __m128i badhelper = + _mm_setr_epi8(3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12); + + __m128i tmpshuf = _mm_add_epi8(badhelper, _mm_set1_epi8((char)bad_offs)); + + tmpshuf = _mm_or_si128(tmpshuf, _mm_cmplt_epi8(tmpshuf, _mm_set1_epi8(16))); + return _mm_shuffle_epi8(tempload, tmpshuf); +} + +struct LVR_V128 : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + Xbyak::Label endpoint{}; + // todo: bailout instead? dont know how frequently the zero skip happens + e.vpxor(i.dest, i.dest); + e.mov(e.edx, 0xf); + + e.lea(e.rcx, e.ptr[ComputeMemoryAddress(e, i.src1)]); + e.mov(e.eax, 0xf); + + e.and_(e.eax, e.ecx); + e.jz(endpoint); + e.or_(e.rcx, e.rdx); + e.vmovd(e.xmm0, e.eax); + + e.xor_(e.rcx, e.rdx); + e.vpxor(e.xmm1, e.xmm1); + e.vmovdqa(e.xmm3, e.ptr[e.rcx]); + e.vmovdqa(e.xmm2, e.GetXmmConstPtr(XMMLVLShuffle)); + e.vmovdqa(i.dest, e.GetXmmConstPtr(XMMLVRCmp16)); + e.vpshufb(e.xmm0, e.xmm0, e.xmm1); + + e.vpaddb(e.xmm2, e.xmm0); + + e.vpcmpgtb(e.xmm1, i.dest, e.xmm2); + e.vpor(e.xmm0, e.xmm1, e.xmm2); + e.vpshufb(i.dest, e.xmm3, e.xmm0); + e.L(endpoint); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_LVR, LVR_V128); + +static __m128i PermuteV128Bytes(__m128i selector, __m128i src1, __m128i src2) { +#if 1 + __m128i selector2 = _mm_xor_si128(selector, _mm_set1_epi8(3)); + + __m128i src1_shuf = _mm_shuffle_epi8(src1, selector2); + __m128i src2_shuf = _mm_shuffle_epi8(src2, selector2); + + __m128i src2_selection = _mm_cmpgt_epi8(selector2, _mm_set1_epi8(15)); + + return _mm_blendv_epi8(src1_shuf, src2_shuf, src2_selection); + +#else + // not the issue + unsigned char tmpbuffer[32]; + + _mm_storeu_si128((__m128i*)tmpbuffer, src1); + _mm_storeu_si128((__m128i*)(&tmpbuffer[16]), src2); + + __m128i result; + + for (unsigned i = 0; i < 16; ++i) { + result.m128i_u8[i] = tmpbuffer[(selector.m128i_u8[i] ^ 3) & 0x1f]; + } + return result; + +#endif +} +static __m128i ByteSwap(__m128i input) { + return _mm_shuffle_epi8(input, _mm_setr_epi32(0x00010203u, 0x04050607u, + 0x08090A0Bu, 0x0C0D0E0Fu)); +} +static __m128i LVSR(char input) { + __m128i lvsr_table_base = ByteSwap(_mm_setr_epi8( + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31)); + + __m128i base_as_vec = _mm_loadu_si128((const __m128i*)&lvsr_table_base); + + __m128i shr_for_offset = _mm_sub_epi8(base_as_vec, _mm_set1_epi8(input)); + return shr_for_offset; +} + +/* 
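Reference: the old HIR lowering for stvlx (the same sequence removed from
InstrEmit_stvlx_ in ppc_emit_altivec.cc below), kept for comparison with the
hand-written STVL_V128 sequence that follows: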
+Value* eb = f.And(f.Truncate(ea, INT8_TYPE), f.LoadConstantInt8(0xF)); +// ea &= ~0xF +ea = f.And(ea, f.LoadConstantUint64(~0xFull)); +Value* shrs = f.LoadVectorShr(eb); +Value* zerovec = f.LoadZeroVec128(); + +// v = (old & ~mask) | ((new >> eb) & mask) +Value* new_value = f.Permute(shrs, zerovec, f.LoadVR(vd), INT8_TYPE); +Value* old_value = f.ByteSwap(f.Load(ea, VEC128_TYPE)); + +// mask = FFFF... >> eb +Value* mask = f.Permute(shrs, zerovec, f.Not(zerovec), INT8_TYPE); + +Value* v = f.Select(mask, old_value, new_value); +// ea &= ~0xF (handled above) +f.Store(ea, f.ByteSwap(v)); +*/ +#if 0 + +static void callnativesafe_stvl(void* ctx, void* addr, __m128i* value) { + uintptr_t uaddr = reinterpret_cast(addr); + + uintptr_t bad_offs = uaddr & 0xf; + + uaddr &= ~0xfULL; + + __m128i tempload = ByteSwap(_mm_loadu_si128((const __m128i*)uaddr)); + + __m128i our_value_to_store = _mm_loadu_si128(value); + + __m128i shr_for_offset = LVSR((char)bad_offs); + + __m128i permuted_us = + PermuteV128Bytes(shr_for_offset, _mm_setzero_si128(), our_value_to_store); + //__m128i mask = PermuteV128Bytes(shr_for_offset, _mm_setzero_si128(), + // _mm_set1_epi8((char)0xff)); + + __m128i mask = _mm_cmpgt_epi8(shr_for_offset, _mm_set1_epi8(15)); + __m128i blended_input_and_memory = + _mm_blendv_epi8(tempload, permuted_us, mask); + + __m128i swapped_final_result = ByteSwap(blended_input_and_memory); + + _mm_storeu_si128((__m128i*)uaddr, swapped_final_result); +} +#else +static void callnativesafe_stvl(void* ctx, void* addr, __m128i* value) { + uintptr_t uaddr = reinterpret_cast(addr); + + uintptr_t bad_offs = uaddr & 0xf; + + uaddr &= ~0xfULL; + + __m128i tempload = _mm_loadu_si128((const __m128i*)uaddr); + + __m128i our_value_to_store = _mm_loadu_si128(value); + + __m128i shr_for_offset; + { + __m128i lvsr_table_base = + _mm_sub_epi8(_mm_setr_epi8(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, + 27, 28, 29, 30, 31), + _mm_set1_epi8(16)); + shr_for_offset = + _mm_sub_epi8(lvsr_table_base, _mm_set1_epi8((char)bad_offs)); + } + __m128i permuted_us; + { + __m128i selector2 = _mm_xor_si128(shr_for_offset, _mm_set1_epi8(3)); + + __m128i src2_shuf = _mm_shuffle_epi8(our_value_to_store, selector2); + + permuted_us = src2_shuf; + } + + __m128i blended_input_and_memory = + _mm_blendv_epi8(permuted_us, tempload, shr_for_offset); + + __m128i swapped_final_result = blended_input_and_memory; + + _mm_storeu_si128((__m128i*)uaddr, swapped_final_result); +} +static void callnativesafe_stvl_experiment(void* addr, __m128i* value) { + uintptr_t uaddr = reinterpret_cast(addr); + + uintptr_t bad_offs = uaddr & 0xf; + + uaddr &= ~0xfULL; + + __m128i tempload = _mm_loadu_si128((const __m128i*)uaddr); + + __m128i our_value_to_store = _mm_loadu_si128(value); + + __m128i shr_for_offset; + { + __m128i lvsr_table_base = + _mm_sub_epi8(_mm_setr_epi8(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, + 27, 28, 29, 30, 31), + _mm_set1_epi8(16)); + + // lvsr_table_base = _mm_xor_si128(lvsr_table_base, _mm_set1_epi8(3)); + // lvsr_table_base = ByteSwap(lvsr_table_base); + shr_for_offset = + _mm_sub_epi8(lvsr_table_base, _mm_set1_epi8((char)bad_offs)); + } + __m128i permuted_us; + { + shr_for_offset = _mm_xor_si128(shr_for_offset, _mm_set1_epi8(3)); + + __m128i src2_shuf = _mm_shuffle_epi8(our_value_to_store, shr_for_offset); + + permuted_us = src2_shuf; + } + + __m128i blended_input_and_memory = + _mm_blendv_epi8(permuted_us, tempload, shr_for_offset); + + __m128i swapped_final_result = blended_input_and_memory; + + _mm_storeu_si128((__m128i*)uaddr, 
swapped_final_result); +} + +#endif +struct STVL_V128 : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { +#if 0 + e.lea(e.GetNativeParam(0), e.ptr[ComputeMemoryAddress(e, i.src1)]); + Xmm src2 = GetInputRegOrConstant(e, i.src2, e.xmm1); + + e.lea(e.GetNativeParam(1), e.StashXmm(0, src2)); + e.CallNativeSafe((void*)callnativesafe_stvl); + +#else + e.mov(e.ecx, 15); + e.mov(e.edx, e.ecx); + e.lea(e.rax, e.ptr[ComputeMemoryAddress(e, i.src1)]); + e.and_(e.ecx, e.eax); + e.vmovd(e.xmm0, e.ecx); + e.not_(e.rdx); + e.and_(e.rax, e.rdx); + e.vmovdqa(e.xmm1, e.GetXmmConstPtr(XMMSTVLShuffle)); + // e.vmovdqa(e.xmm2, e.GetXmmConstPtr(XMMSwapWordMask)); + if (e.IsFeatureEnabled(kX64EmitAVX2)) { + e.vpbroadcastb(e.xmm3, e.xmm0); + } else { + e.vpshufb(e.xmm3, e.xmm0, e.GetXmmConstPtr(XMMZero)); + } + e.vpsubb(e.xmm0, e.xmm1, e.xmm3); + e.vpxor(e.xmm1, e.xmm0, + e.GetXmmConstPtr(XMMSwapWordMask)); // xmm1 from now on will be our + // selector for blend/shuffle + // we can reuse xmm0, xmm2 and xmm3 now + // e.vmovdqa(e.xmm0, e.ptr[e.rax]); + + Xmm src2 = GetInputRegOrConstant(e, i.src2, e.xmm0); + + e.vpshufb(e.xmm2, src2, e.xmm1); + e.vpblendvb(e.xmm3, e.xmm2, e.ptr[e.rax], e.xmm1); + e.vmovdqa(e.ptr[e.rax], e.xmm3); + +#endif + } +}; +EMITTER_OPCODE_TABLE(OPCODE_STVL, STVL_V128); + +/* + Value* eb = f.And(f.Truncate(ea, INT8_TYPE), f.LoadConstantInt8(0xF)); + // Skip if %16=0 (no data to store). + auto skip_label = f.NewLabel(); + f.BranchFalse(eb, skip_label); + // ea &= ~0xF + // NOTE: need to recalculate ea and eb because after Branch we start a new + // block and we can't use their previous instantiation in the new block + ea = CalculateEA_0(f, ra, rb); + eb = f.And(f.Truncate(ea, INT8_TYPE), f.LoadConstantInt8(0xF)); + ea = f.And(ea, f.LoadConstantUint64(~0xFull)); + Value* shrs = f.LoadVectorShr(eb); + Value* zerovec = f.LoadZeroVec128(); + // v = (old & ~mask) | ((new << eb) & mask) + Value* new_value = f.Permute(shrs, f.LoadVR(vd), zerovec, INT8_TYPE); + Value* old_value = f.ByteSwap(f.Load(ea, VEC128_TYPE)); + // mask = ~FFFF... 
>> eb + Value* mask = f.Permute(shrs, f.Not(zerovec), zerovec, INT8_TYPE); + Value* v = f.Select(mask, old_value, new_value); + // ea &= ~0xF (handled above) + f.Store(ea, f.ByteSwap(v)); + f.MarkLabel(skip_label); +*/ +#if 0 +static void callnativesafe_stvr(void* ctx, void* addr, __m128i* value) { + uintptr_t uaddr = reinterpret_cast(addr); + + uintptr_t bad_offs = uaddr & 0xf; + if (!bad_offs) { + return; + } + uaddr &= ~0xfULL; + + __m128i tempload = ByteSwap(_mm_loadu_si128((const __m128i*)uaddr)); + + __m128i our_value_to_store = _mm_loadu_si128(value); + + __m128i shr_for_offset = LVSR((char)bad_offs); + + __m128i permuted_us = PermuteV128Bytes( + shr_for_offset, our_value_to_store, _mm_setzero_si128() ); + __m128i mask = PermuteV128Bytes( + shr_for_offset, _mm_set1_epi8((char)0xff) ,_mm_setzero_si128() + ); + + //__m128i mask = _mm_cmpgt_epi8(shr_for_offset, _mm_set1_epi8(15)); + __m128i blended_input_and_memory = + _mm_blendv_epi8(tempload, permuted_us, mask); + + __m128i swapped_final_result = ByteSwap(blended_input_and_memory); + + _mm_storeu_si128((__m128i*)uaddr, swapped_final_result); +} +#else +static void callnativesafe_stvr(void* ctx, void* addr, __m128i* value) { + uintptr_t uaddr = reinterpret_cast(addr); + + uintptr_t bad_offs = uaddr & 0xf; + + uaddr &= ~0xfULL; + if (!bad_offs) { + return; + } + __m128i tempload = _mm_loadu_si128((const __m128i*)uaddr); + + __m128i our_value_to_store = _mm_loadu_si128(value); + + __m128i shr_for_offset; + { + __m128i lvsr_table_base = + _mm_sub_epi8(_mm_setr_epi8(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, + 27, 28, 29, 30, 31), + _mm_set1_epi8(16)); + + // lvsr_table_base = _mm_xor_si128(lvsr_table_base, _mm_set1_epi8(3)); + // lvsr_table_base = ByteSwap(lvsr_table_base); + shr_for_offset = + _mm_sub_epi8(lvsr_table_base, _mm_set1_epi8((char)bad_offs)); + } + __m128i permuted_us; + { + shr_for_offset = _mm_xor_si128(shr_for_offset, _mm_set1_epi8((char)0x83)); + + __m128i src2_shuf = _mm_shuffle_epi8(our_value_to_store, shr_for_offset); + + permuted_us = src2_shuf; + } + + __m128i blended_input_and_memory = + _mm_blendv_epi8(permuted_us, tempload, shr_for_offset); + + __m128i swapped_final_result = blended_input_and_memory; + + _mm_storeu_si128((__m128i*)uaddr, swapped_final_result); +} +#endif +struct STVR_V128 : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { +#if 0 + e.lea(e.GetNativeParam(0), e.ptr[ComputeMemoryAddress(e, i.src1)]); + Xmm src2 = GetInputRegOrConstant(e, i.src2, e.xmm1); + + e.lea(e.GetNativeParam(1), e.StashXmm(0, src2)); + e.CallNativeSafe((void*)callnativesafe_stvr); + +#else + Xbyak::Label skipper{}; + e.mov(e.ecx, 15); + e.mov(e.edx, e.ecx); + e.lea(e.rax, e.ptr[ComputeMemoryAddress(e, i.src1)]); + e.and_(e.ecx, e.eax); + e.jz(skipper); + e.vmovd(e.xmm0, e.ecx); + e.not_(e.rdx); + e.and_(e.rax, e.rdx); + e.vmovdqa(e.xmm1, e.GetXmmConstPtr(XMMSTVLShuffle)); + // todo: maybe a table lookup might be a better idea for getting the + // shuffle/blend + // e.vmovdqa(e.xmm2, e.GetXmmConstPtr(XMMSTVRSwapMask)); + if (e.IsFeatureEnabled(kX64EmitAVX2)) { + e.vpbroadcastb(e.xmm3, e.xmm0); + } else { + e.vpshufb(e.xmm3, e.xmm0, e.GetXmmConstPtr(XMMZero)); + } + e.vpsubb(e.xmm0, e.xmm1, e.xmm3); + e.vpxor(e.xmm1, e.xmm0, + e.GetXmmConstPtr(XMMSTVRSwapMask)); // xmm1 from now on will be our + // selector for blend/shuffle + // we can reuse xmm0, xmm2 and xmm3 now + // e.vmovdqa(e.xmm0, e.ptr[e.rax]); + + Xmm src2 = GetInputRegOrConstant(e, i.src2, e.xmm0); + + e.vpshufb(e.xmm2, src2, e.xmm1); + 
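    // xmm1 doubles as the blend mask below: vpblendvb takes a byte from
    // memory at [rax] (the old contents of the aligned line) wherever the
    // selector byte's sign bit is set, and a byte from the shuffled source
    // register otherwise, merging the unaligned store into the surrounding
    // data.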
e.vpblendvb(e.xmm3, e.xmm2, e.ptr[e.rax], e.xmm1); + e.vmovdqa(e.ptr[e.rax], e.xmm3); + e.L(skipper); +#endif + } +}; +EMITTER_OPCODE_TABLE(OPCODE_STVR, STVR_V128); // ============================================================================ // OPCODE_ATOMIC_COMPARE_EXCHANGE // ============================================================================ diff --git a/src/xenia/cpu/hir/opcodes.h b/src/xenia/cpu/hir/opcodes.h index a59b1c72e..3254586e5 100644 --- a/src/xenia/cpu/hir/opcodes.h +++ b/src/xenia/cpu/hir/opcodes.h @@ -122,16 +122,16 @@ enum Opcode { OPCODE_NOP, OPCODE_SOURCE_OFFSET, OPCODE_DEBUG_BREAK, - OPCODE_DEBUG_BREAK_TRUE, + OPCODE_DEBUG_BREAK_TRUE, // remove, branch and break OPCODE_TRAP, - OPCODE_TRAP_TRUE, + OPCODE_TRAP_TRUE, // remove, branch and trap OPCODE_CALL, - OPCODE_CALL_TRUE, + OPCODE_CALL_TRUE, // remove, branch and call OPCODE_CALL_INDIRECT, - OPCODE_CALL_INDIRECT_TRUE, + OPCODE_CALL_INDIRECT_TRUE, // remove, branch and call OPCODE_CALL_EXTERN, OPCODE_RETURN, - OPCODE_RETURN_TRUE, + OPCODE_RETURN_TRUE, // remove, branch and return OPCODE_SET_RETURN_ADDRESS, OPCODE_BRANCH, OPCODE_BRANCH_TRUE, @@ -194,8 +194,8 @@ enum Opcode { // 0x4F7FFD00. OPCODE_VECTOR_CONVERT_I2F, OPCODE_VECTOR_CONVERT_F2I, - OPCODE_LOAD_VECTOR_SHL, - OPCODE_LOAD_VECTOR_SHR, + OPCODE_LOAD_VECTOR_SHL, // remove, use arithmetic instead + OPCODE_LOAD_VECTOR_SHR, // remove, use arithmetic instead OPCODE_LOAD_CLOCK, OPCODE_LOAD_LOCAL, OPCODE_STORE_LOCAL, @@ -204,8 +204,8 @@ enum Opcode { OPCODE_CONTEXT_BARRIER, OPCODE_LOAD_MMIO, OPCODE_STORE_MMIO, - OPCODE_LOAD_OFFSET, - OPCODE_STORE_OFFSET, + OPCODE_LOAD_OFFSET, // remove, use add instead? + OPCODE_STORE_OFFSET, // remove, use add instead? OPCODE_LOAD, OPCODE_STORE, // chrispy: todo: implement, our current codegen for the unaligned loads is @@ -222,7 +222,7 @@ enum Opcode { OPCODE_MIN, OPCODE_VECTOR_MIN, OPCODE_SELECT, - OPCODE_IS_NAN, + OPCODE_IS_NAN, // remove? 
compare_eq with self instead OPCODE_COMPARE_EQ, OPCODE_COMPARE_NE, OPCODE_COMPARE_SLT, @@ -233,14 +233,14 @@ enum Opcode { OPCODE_COMPARE_ULE, OPCODE_COMPARE_UGT, OPCODE_COMPARE_UGE, - OPCODE_DID_SATURATE, + OPCODE_DID_SATURATE, // remove, use different way of tracking saturation OPCODE_VECTOR_COMPARE_EQ, OPCODE_VECTOR_COMPARE_SGT, OPCODE_VECTOR_COMPARE_SGE, OPCODE_VECTOR_COMPARE_UGT, OPCODE_VECTOR_COMPARE_UGE, OPCODE_ADD, - OPCODE_ADD_CARRY, + OPCODE_ADD_CARRY, // remove, instead zero extend carry and add OPCODE_VECTOR_ADD, OPCODE_SUB, OPCODE_VECTOR_SUB, @@ -261,7 +261,7 @@ enum Opcode { OPCODE_DOT_PRODUCT_3, OPCODE_DOT_PRODUCT_4, OPCODE_AND, - OPCODE_AND_NOT, + OPCODE_AND_NOT, // remove, Not+And instead OPCODE_OR, OPCODE_XOR, OPCODE_NOT, @@ -271,8 +271,8 @@ enum Opcode { OPCODE_VECTOR_SHR, OPCODE_SHA, OPCODE_VECTOR_SHA, - OPCODE_ROTATE_LEFT, - OPCODE_VECTOR_ROTATE_LEFT, + OPCODE_ROTATE_LEFT, // remove, left/right shift combo instead + OPCODE_VECTOR_ROTATE_LEFT, // eliminate, replace with left/right shift combo OPCODE_VECTOR_AVERAGE, OPCODE_BYTE_SWAP, OPCODE_CNTLZ, @@ -281,7 +281,8 @@ enum Opcode { OPCODE_SPLAT, OPCODE_PERMUTE, OPCODE_SWIZZLE, - OPCODE_PACK, + OPCODE_PACK, // break up into smaller operations and add a float16 convert + // opcode OPCODE_UNPACK, OPCODE_ATOMIC_EXCHANGE, OPCODE_ATOMIC_COMPARE_EXCHANGE, diff --git a/src/xenia/cpu/ppc/ppc_emit_altivec.cc b/src/xenia/cpu/ppc/ppc_emit_altivec.cc index c95f068c0..9f2ede47f 100644 --- a/src/xenia/cpu/ppc/ppc_emit_altivec.cc +++ b/src/xenia/cpu/ppc/ppc_emit_altivec.cc @@ -208,6 +208,7 @@ int InstrEmit_stvxl128(PPCHIRBuilder& f, const InstrData& i) { int InstrEmit_lvlx_(PPCHIRBuilder& f, const InstrData& i, uint32_t vd, uint32_t ra, uint32_t rb) { Value* ea = CalculateEA_0(f, ra, rb); +#if 0 Value* eb = f.And(f.Truncate(ea, INT8_TYPE), f.LoadConstantInt8(0xF)); // ea &= ~0xF ea = f.And(ea, f.LoadConstantUint64(~0xFull)); @@ -216,6 +217,11 @@ int InstrEmit_lvlx_(PPCHIRBuilder& f, const InstrData& i, uint32_t vd, f.LoadZeroVec128(), INT8_TYPE); f.StoreVR(vd, v); return 0; +#else + Value* val = f.LoadVectorLeft(ea); + f.StoreVR(vd, val); + return 0; +#endif } int InstrEmit_lvlx(PPCHIRBuilder& f, const InstrData& i) { return InstrEmit_lvlx_(f, i, i.X.RT, i.X.RA, i.X.RB); @@ -237,6 +243,7 @@ int InstrEmit_lvrx_(PPCHIRBuilder& f, const InstrData& i, uint32_t vd, // buffer, which sometimes may be nothing and hang off the end of the valid // page area. We still need to zero the resulting register, though. Value* ea = CalculateEA_0(f, ra, rb); +#if 0 Value* eb = f.And(f.Truncate(ea, INT8_TYPE), f.LoadConstantInt8(0xF)); // Skip if %16=0 (just load zero). auto load_label = f.NewLabel(); @@ -257,6 +264,11 @@ int InstrEmit_lvrx_(PPCHIRBuilder& f, const InstrData& i, uint32_t vd, f.StoreVR(vd, v); f.MarkLabel(end_label); return 0; +#else + Value* val = f.LoadVectorRight(ea); + f.StoreVR(vd, val); + return 0; +#endif } int InstrEmit_lvrx(PPCHIRBuilder& f, const InstrData& i) { return InstrEmit_lvrx_(f, i, i.X.RT, i.X.RA, i.X.RB); @@ -275,7 +287,9 @@ int InstrEmit_stvlx_(PPCHIRBuilder& f, const InstrData& i, uint32_t vd, uint32_t ra, uint32_t rb) { // NOTE: if eb == 0 (so 16b aligned) this equals new_value // we could optimize this to prevent the other load/mask, in that case. 
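  // For reference, a minimal scalar model of the store-left semantics that
  // both the old HIR path and the new StoreVectorLeft opcode implement,
  // ignoring the byte-swapping the real code performs (`line` stands for
  // the aligned 16-byte block at ea & ~0xF):
  //
  //   void StvlModel(uint64_t ea, const uint8_t vr[16], uint8_t line[16]) {
  //     uint32_t eb = ea & 0xF;  // byte offset within the aligned block
  //     // the leftmost 16 - eb bytes of VR land at ea; bytes before ea
  //     // keep their old contents (eb == 0 stores the entire register,
  //     // per the note above)
  //     for (uint32_t i = eb; i < 16; ++i) {
  //       line[i] = vr[i - eb];
  //     }
  //   }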
+ Value* ea = CalculateEA_0(f, ra, rb); +#if 0 Value* eb = f.And(f.Truncate(ea, INT8_TYPE), f.LoadConstantInt8(0xF)); // ea &= ~0xF ea = f.And(ea, f.LoadConstantUint64(~0xFull)); @@ -283,6 +297,8 @@ int InstrEmit_stvlx_(PPCHIRBuilder& f, const InstrData& i, uint32_t vd, Value* zerovec = f.LoadZeroVec128(); // v = (old & ~mask) | ((new >> eb) & mask) + + Value* mask = f.Permute(shrs, zerovec, f.Not(zerovec), INT8_TYPE); Value* new_value = f.Permute(shrs, zerovec, f.LoadVR(vd), INT8_TYPE); Value* old_value = f.ByteSwap(f.Load(ea, VEC128_TYPE)); /* @@ -291,11 +307,16 @@ int InstrEmit_stvlx_(PPCHIRBuilder& f, const InstrData& i, uint32_t vd, here looks as if it might make more sense as a comparison ( */ // mask = FFFF... >> eb - Value* mask = f.Permute(shrs, zerovec, f.Not(zerovec), INT8_TYPE); + Value* v = f.Select(mask, old_value, new_value); // ea &= ~0xF (handled above) f.Store(ea, f.ByteSwap(v)); +#else + + Value* vdr = f.LoadVR(vd); + f.StoreVectorLeft(ea, vdr); +#endif return 0; } int InstrEmit_stvlx(PPCHIRBuilder& f, const InstrData& i) { @@ -318,6 +339,7 @@ int InstrEmit_stvrx_(PPCHIRBuilder& f, const InstrData& i, uint32_t vd, // buffer, which sometimes may be nothing and hang off the end of the valid // page area. Value* ea = CalculateEA_0(f, ra, rb); +#if 0 Value* eb = f.And(f.Truncate(ea, INT8_TYPE), f.LoadConstantInt8(0xF)); // Skip if %16=0 (no data to store). auto skip_label = f.NewLabel(); @@ -339,6 +361,10 @@ int InstrEmit_stvrx_(PPCHIRBuilder& f, const InstrData& i, uint32_t vd, // ea &= ~0xF (handled above) f.Store(ea, f.ByteSwap(v)); f.MarkLabel(skip_label); +#else + Value* vdr = f.LoadVR(vd); + f.StoreVectorRight(ea, vdr); +#endif return 0; } int InstrEmit_stvrx(PPCHIRBuilder& f, const InstrData& i) { diff --git a/src/xenia/gpu/command_processor.cc b/src/xenia/gpu/command_processor.cc index de5c6fb5f..109636bb1 100644 --- a/src/xenia/gpu/command_processor.cc +++ b/src/xenia/gpu/command_processor.cc @@ -29,20 +29,6 @@ #include "xenia/kernel/kernel_state.h" #include "xenia/kernel/user_module.h" -#if defined(NDEBUG) -static constexpr bool should_log_unknown_reg_writes() { return false; } - -#else - -DEFINE_bool(log_unknown_register_writes, false, - "Log writes to unknown registers from " - "CommandProcessor::WriteRegister. Has significant performance hit.", - "GPU"); -static bool should_log_unknown_reg_writes() { - return cvars::log_unknown_register_writes; -} -#endif - namespace xe { namespace gpu { @@ -475,44 +461,34 @@ void CommandProcessor::HandleSpecialRegisterWrite(uint32_t index, } } void CommandProcessor::WriteRegister(uint32_t index, uint32_t value) { - if (should_log_unknown_reg_writes()) { - // chrispy: rearrange check order, place set after checks - if (XE_UNLIKELY(!register_file_->IsValidRegister(index))) { - XELOGW("GPU: Write to unknown register ({:04X} = {:08X})", index, value); - check_reg_out_of_bounds: - if (XE_UNLIKELY(index >= RegisterFile::kRegisterCount)) { - XELOGW("CommandProcessor::WriteRegister index out of bounds: {}", - index); - return; - } + // chrispy: rearrange check order, place set after checks + + if (XE_LIKELY(index < RegisterFile::kRegisterCount)) { + register_file_->values[index].u32 = value; + + // quick pre-test + // todo: figure out just how unlikely this is. if very (it ought to be, + // theres a ton of registers other than these) make this predicate + // branchless and mark with unlikely, then make HandleSpecialRegisterWrite + // noinline yep, its very unlikely. 
these ORS here are meant to be bitwise + // ors, so that we do not do branching evaluation of the conditions (we will + // almost always take all of the branches) + + unsigned expr = (index - XE_GPU_REG_SCRATCH_REG0 < 8) | + (index == XE_GPU_REG_COHER_STATUS_HOST) | + ((index - XE_GPU_REG_DC_LUT_RW_INDEX) <= + (XE_GPU_REG_DC_LUT_30_COLOR - XE_GPU_REG_DC_LUT_RW_INDEX)); + // chrispy: reordered for msvc branch probability (assumes if is taken and + // else is not) + if (XE_LIKELY(expr == 0)) { + XE_MSVC_REORDER_BARRIER(); + + } else { + HandleSpecialRegisterWrite(index, value); } } else { - goto check_reg_out_of_bounds; - } - register_file_->values[index].u32 = value; - - // regs with extra logic on write: XE_GPU_REG_COHER_STATUS_HOST - // XE_GPU_REG_DC_LUT_RW_INDEX - // XE_GPU_REG_DC_LUT_SEQ_COLOR XE_GPU_REG_DC_LUT_PWL_DATA - // XE_GPU_REG_DC_LUT_30_COLOR - - // quick pre-test - // todo: figure out just how unlikely this is. if very (it ought to be, theres - // a ton of registers other than these) make this predicate branchless and - // mark with unlikely, then make HandleSpecialRegisterWrite noinline yep, its - // very unlikely. these ORS here are meant to be bitwise ors, so that we do - // not do branching evaluation of the conditions (we will almost always take - // all of the branches) - - unsigned expr = (index - XE_GPU_REG_SCRATCH_REG0 < 8) | - (index == XE_GPU_REG_COHER_STATUS_HOST) | - ((index - XE_GPU_REG_DC_LUT_RW_INDEX) <= - (XE_GPU_REG_DC_LUT_30_COLOR - XE_GPU_REG_DC_LUT_RW_INDEX)); - // chrispy: reordered for msvc branch probability (assumes if is taken and - // else is not) - if (XE_LIKELY(expr == 0)) { - } else { - HandleSpecialRegisterWrite(index, value); + XELOGW("CommandProcessor::WriteRegister index out of bounds: {}", index); + return; } } void CommandProcessor::WriteRegistersFromMem(uint32_t start_index, @@ -587,7 +563,7 @@ void CommandProcessor::ReturnFromWait() {} uint32_t CommandProcessor::ExecutePrimaryBuffer(uint32_t read_index, uint32_t write_index) { SCOPE_profile_cpu_f("gpu"); - +#if XE_ENABLE_TRACE_WRITER_INSTRUMENTATION == 1 // If we have a pending trace stream open it now. That way we ensure we get // all commands. if (!trace_writer_.is_open() && trace_state_ == TraceState::kStreaming) { @@ -599,7 +575,7 @@ uint32_t CommandProcessor::ExecutePrimaryBuffer(uint32_t read_index, trace_writer_.Open(path, title_id); InitializeTrace(); } - +#endif // Adjust pointer base. 
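  // The computed pointer is folded back into the same 512 MiB window as the
  // primary buffer base (0x1FFFFFFF masks a 29-bit offset), presumably so a
  // large read_index cannot carry the pointer out of the ring's region: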
uint32_t start_ptr = primary_buffer_ptr_ + read_index * sizeof(uint32_t); start_ptr = (primary_buffer_ptr_ & ~0x1FFFFFFF) | (start_ptr & 0x1FFFFFFF); @@ -676,22 +652,24 @@ bool CommandProcessor::ExecutePacket(RingBuffer* reader) { return true; } - if (XE_UNLIKELY(packet == 0xCDCDCDCD)) { + if (XE_LIKELY(packet != 0xCDCDCDCD)) { + actually_execute_packet: + switch (packet_type) { + case 0x00: + return ExecutePacketType0(reader, packet); + case 0x01: + return ExecutePacketType1(reader, packet); + case 0x02: + return ExecutePacketType2(reader, packet); + case 0x03: + return ExecutePacketType3(reader, packet); + default: + assert_unhandled_case(packet_type); + return false; + } + } else { XELOGW("GPU packet is CDCDCDCD - probably read uninitialized memory!"); - } - - switch (packet_type) { - case 0x00: - return ExecutePacketType0(reader, packet); - case 0x01: - return ExecutePacketType1(reader, packet); - case 0x02: - return ExecutePacketType2(reader, packet); - case 0x03: - return ExecutePacketType3(reader, packet); - default: - assert_unhandled_case(packet_type); - return false; + goto actually_execute_packet; } } @@ -712,10 +690,15 @@ bool CommandProcessor::ExecutePacketType0(RingBuffer* reader, uint32_t packet) { uint32_t base_index = (packet & 0x7FFF); uint32_t write_one_reg = (packet >> 15) & 0x1; - if (write_one_reg) { - WriteOneRegisterFromRing(reader, base_index, count); + + if (!write_one_reg) { + if (count == 1) { + WriteRegister(base_index, reader->ReadAndSwap()); + } else { + WriteRegisterRangeFromRing(reader, base_index, count); + } } else { - WriteRegisterRangeFromRing(reader, base_index, count); + WriteOneRegisterFromRing(reader, base_index, count); } trace_writer_.WritePacketEnd(); @@ -750,7 +733,7 @@ bool CommandProcessor::ExecutePacketType3(RingBuffer* reader, uint32_t packet) { uint32_t count = ((packet >> 16) & 0x3FFF) + 1; auto data_start_offset = reader->read_offset(); - if (reader->read_count() < count * sizeof(uint32_t)) { + XE_UNLIKELY_IF(reader->read_count() < count * sizeof(uint32_t)) { XELOGE( "ExecutePacketType3 overflow (read count {:08X}, packet count {:08X})", reader->read_count(), count * sizeof(uint32_t)); @@ -914,6 +897,8 @@ bool CommandProcessor::ExecutePacketType3(RingBuffer* reader, uint32_t packet) { } trace_writer_.WritePacketEnd(); +#if XE_ENABLE_TRACE_WRITER_INSTRUMENTATION == 1 + if (opcode == PM4_XE_SWAP) { // End the trace writer frame. 
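  // The swap handling that follows is now compiled out entirely unless
  // XE_ENABLE_TRACE_WRITER_INSTRUMENTATION is set, matching the gate added
  // around the trace-stream open in ExecutePrimaryBuffer above.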
if (trace_writer_.is_open()) { @@ -932,6 +917,7 @@ bool CommandProcessor::ExecutePacketType3(RingBuffer* reader, uint32_t packet) { InitializeTrace(); } } +#endif assert_true(reader->read_offset() == (data_start_offset + (count * sizeof(uint32_t))) % @@ -1512,9 +1498,13 @@ bool CommandProcessor::ExecutePacketType3_SET_CONSTANT(RingBuffer* reader, reader->AdvanceRead((count - 1) * sizeof(uint32_t)); return true; } + uint32_t countm1 = count - 1; - WriteRegisterRangeFromRing(reader, index, count - 1); - + if (countm1 != 1) { + WriteRegisterRangeFromRing(reader, index, countm1); + } else { + WriteRegister(index, reader->ReadAndSwap()); + } return true; } @@ -1523,9 +1513,13 @@ bool CommandProcessor::ExecutePacketType3_SET_CONSTANT2(RingBuffer* reader, uint32_t count) { uint32_t offset_type = reader->ReadAndSwap(); uint32_t index = offset_type & 0xFFFF; + uint32_t countm1 = count - 1; - WriteRegisterRangeFromRing(reader, index, count - 1); - + if (countm1 != 1) { + WriteRegisterRangeFromRing(reader, index, countm1); + } else { + WriteRegister(index, reader->ReadAndSwap()); + } return true; } @@ -1573,8 +1567,12 @@ bool CommandProcessor::ExecutePacketType3_SET_SHADER_CONSTANTS( RingBuffer* reader, uint32_t packet, uint32_t count) { uint32_t offset_type = reader->ReadAndSwap(); uint32_t index = offset_type & 0xFFFF; - - WriteRegisterRangeFromRing(reader, index, count - 1); + uint32_t countm1 = count - 1; + if (countm1 != 1) { + WriteRegisterRangeFromRing(reader, index, countm1); + } else { + WriteRegister(index, reader->ReadAndSwap()); + } return true; } diff --git a/src/xenia/gpu/d3d12/d3d12_command_processor.cc b/src/xenia/gpu/d3d12/d3d12_command_processor.cc index a670cc9d6..0b0d70b45 100644 --- a/src/xenia/gpu/d3d12/d3d12_command_processor.cc +++ b/src/xenia/gpu/d3d12/d3d12_command_processor.cc @@ -1678,11 +1678,79 @@ void D3D12CommandProcessor::ShutdownContext() { } // todo: bit-pack the bools and use bitarith to reduce branches void D3D12CommandProcessor::WriteRegister(uint32_t index, uint32_t value) { - CommandProcessor::WriteRegister(index, value); +#if XE_ARCH_AMD64 == 1 + // CommandProcessor::WriteRegister(index, value); - bool cbuf_binding_float_pixel_utd = cbuffer_binding_float_pixel_.up_to_date; - bool cbuf_binding_float_vertex_utd = cbuffer_binding_float_vertex_.up_to_date; - bool cbuf_binding_bool_loop_utd = cbuffer_binding_bool_loop_.up_to_date; + __m128i to_rangecheck = _mm_set1_epi16(static_cast(index)); + + __m128i lower_bounds = _mm_setr_epi16( + XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0 - 1, + XE_GPU_REG_SHADER_CONSTANT_000_X - 1, + XE_GPU_REG_SHADER_CONSTANT_BOOL_000_031 - 1, XE_GPU_REG_SCRATCH_REG0 - 1, + XE_GPU_REG_COHER_STATUS_HOST - 1, XE_GPU_REG_DC_LUT_RW_INDEX - 1, 0, 0); + __m128i upper_bounds = _mm_setr_epi16( + XE_GPU_REG_SHADER_CONSTANT_FETCH_31_5 + 1, + XE_GPU_REG_SHADER_CONSTANT_511_W + 1, + XE_GPU_REG_SHADER_CONSTANT_LOOP_31 + 1, XE_GPU_REG_SCRATCH_REG7 + 1, + XE_GPU_REG_COHER_STATUS_HOST + 1, XE_GPU_REG_DC_LUT_30_COLOR + 1, 0, 0); + + // quick pre-test + // todo: figure out just how unlikely this is. if very (it ought to be, + // theres a ton of registers other than these) make this predicate + // branchless and mark with unlikely, then make HandleSpecialRegisterWrite + // noinline yep, its very unlikely. 
these ORS here are meant to be bitwise + // ors, so that we do not do branching evaluation of the conditions (we will + // almost always take all of the branches) + /* unsigned expr = + (index - XE_GPU_REG_SCRATCH_REG0 < 8) | + (index == XE_GPU_REG_COHER_STATUS_HOST) | + ((index - XE_GPU_REG_DC_LUT_RW_INDEX) <= + (XE_GPU_REG_DC_LUT_30_COLOR - XE_GPU_REG_DC_LUT_RW_INDEX));*/ + __m128i is_above_lower = _mm_cmpgt_epi16(to_rangecheck, lower_bounds); + __m128i is_below_upper = _mm_cmplt_epi16(to_rangecheck, upper_bounds); + __m128i is_within_range = _mm_and_si128(is_above_lower, is_below_upper); + register_file_->values[index].u32 = value; + + uint32_t movmask = static_cast(_mm_movemask_epi8(is_within_range)); + + if (!movmask) { + return; + } else { + if (movmask & (1 << 3)) { + if (frame_open_) { + uint32_t float_constant_index = + (index - XE_GPU_REG_SHADER_CONSTANT_000_X) >> 2; + uint64_t float_constant_mask = 1ULL << float_constant_index; + + if (float_constant_index >= 256) { + float_constant_index = + static_cast(float_constant_index); + if (current_float_constant_map_pixel_[float_constant_index >> 6] & + float_constant_mask) { // take advantage of x86 + // modulus shift + cbuffer_binding_float_pixel_.up_to_date = false; + } + } else { + if (current_float_constant_map_vertex_[float_constant_index >> 6] & + float_constant_mask) { + cbuffer_binding_float_vertex_.up_to_date = false; + } + } + } + } else if (movmask & (1 << 5)) { + cbuffer_binding_bool_loop_.up_to_date = false; + } else if (movmask & (1 << 1)) { + cbuffer_binding_fetch_.up_to_date = false; + + texture_cache_->TextureFetchConstantWritten( + (index - XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0) / 6); + } else { + HandleSpecialRegisterWrite(index, value); + } + } +#else + + CommandProcessor::WriteRegister(index, value); if (index >= XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0 && index <= XE_GPU_REG_SHADER_CONSTANT_FETCH_31_5) { @@ -1693,16 +1761,8 @@ void D3D12CommandProcessor::WriteRegister(uint32_t index, uint32_t value) { (index - XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0) / 6); // } } else { - if (!(cbuf_binding_float_pixel_utd | cbuf_binding_float_vertex_utd | - cbuf_binding_bool_loop_utd)) { - return; - } - if (index >= XE_GPU_REG_SHADER_CONSTANT_000_X && index <= XE_GPU_REG_SHADER_CONSTANT_511_W) { - if (!(cbuf_binding_float_pixel_utd | cbuf_binding_float_vertex_utd)) { - return; - } if (frame_open_) { uint32_t float_constant_index = (index - XE_GPU_REG_SHADER_CONSTANT_000_X) >> 2; @@ -1724,6 +1784,7 @@ void D3D12CommandProcessor::WriteRegister(uint32_t index, uint32_t value) { cbuffer_binding_bool_loop_.up_to_date = false; } } +#endif } void D3D12CommandProcessor::WriteRegistersFromMem(uint32_t start_index, uint32_t* base, @@ -1733,9 +1794,14 @@ void D3D12CommandProcessor::WriteRegistersFromMem(uint32_t start_index, D3D12CommandProcessor::WriteRegister(start_index + i, data); } } -void D3D12CommandProcessor::WriteRegisterRangeFromRing(xe::RingBuffer* ring, - uint32_t base, - uint32_t num_registers) { +/* +wraparound rarely happens, so its best to hoist this out of +writeregisterrangefromring, and structure the two functions so that this can be +tail called +*/ +XE_NOINLINE +void D3D12CommandProcessor::WriteRegisterRangeFromRing_WraparoundCase( + xe::RingBuffer* ring, uint32_t base, uint32_t num_registers) { // we already brought it into L2 earlier RingBuffer::ReadRange range = ring->BeginPrefetchedRead(num_registers * @@ -1747,14 +1813,32 @@ void D3D12CommandProcessor::WriteRegisterRangeFromRing(xe::RingBuffer* ring, 
D3D12CommandProcessor::WriteRegistersFromMem( base, reinterpret_cast(const_cast(range.first)), num_regs_firstrange); - if (range.second) { - D3D12CommandProcessor::WriteRegistersFromMem( - base + num_regs_firstrange, - reinterpret_cast(const_cast(range.second)), - num_registers - num_regs_firstrange); - } + + D3D12CommandProcessor::WriteRegistersFromMem( + base + num_regs_firstrange, + reinterpret_cast(const_cast(range.second)), + num_registers - num_regs_firstrange); + ring->EndRead(range); } +void D3D12CommandProcessor::WriteRegisterRangeFromRing(xe::RingBuffer* ring, + uint32_t base, + uint32_t num_registers) { + RingBuffer::ReadRange range = + ring->BeginRead(num_registers * sizeof(uint32_t)); + + XE_LIKELY_IF(!range.second) { + uint32_t num_regs_firstrange = + static_cast(range.first_length / sizeof(uint32_t)); + + D3D12CommandProcessor::WriteRegistersFromMem( + base, reinterpret_cast(const_cast(range.first)), + num_regs_firstrange); + ring->EndRead(range); + } else { + return WriteRegisterRangeFromRing_WraparoundCase(ring, base, num_registers); + } +} void D3D12CommandProcessor::WriteOneRegisterFromRing(xe::RingBuffer* ring, uint32_t base, uint32_t num_times) { @@ -1768,7 +1852,7 @@ void D3D12CommandProcessor::WriteOneRegisterFromRing(xe::RingBuffer* ring, base, xe::load_and_swap(read.first + (sizeof(uint32_t) * i))); } - if (read.second) { + XE_UNLIKELY_IF (read.second) { uint32_t second_length = read.second_length / sizeof(uint32_t); for (uint32_t i = 0; i < second_length; ++i) { @@ -2791,9 +2875,8 @@ bool D3D12CommandProcessor::IssueCopy() { // chrispy: this memcpy needs to be optimized as much as possible auto physaddr = memory_->TranslatePhysical(written_address); - dma::vastcpy(physaddr, (uint8_t*)readback_mapping, - written_length); - // XEDmaCpy(physaddr, readback_mapping, written_length); + dma::vastcpy(physaddr, (uint8_t*)readback_mapping, written_length); + // XEDmaCpy(physaddr, readback_mapping, written_length); D3D12_RANGE readback_write_range = {}; readback_buffer->Unmap(0, &readback_write_range); } @@ -4606,12 +4689,12 @@ ID3D12Resource* D3D12CommandProcessor::RequestReadbackBuffer(uint32_t size) { if (size == 0) { return nullptr; } - #if 0 +#if 0 if (readback_available_) { GetDMAC()->WaitJobDone(readback_available_); readback_available_ = 0; } - #endif +#endif size = xe::align(size, kReadbackBufferSizeIncrement); if (size > readback_buffer_size_) { const ui::d3d12::D3D12Provider& provider = GetD3D12Provider(); diff --git a/src/xenia/gpu/d3d12/d3d12_command_processor.h b/src/xenia/gpu/d3d12/d3d12_command_processor.h index e64447a4e..998141f49 100644 --- a/src/xenia/gpu/d3d12/d3d12_command_processor.h +++ b/src/xenia/gpu/d3d12/d3d12_command_processor.h @@ -213,6 +213,11 @@ class D3D12CommandProcessor final : public CommandProcessor { XE_FORCEINLINE virtual void WriteRegisterRangeFromRing(xe::RingBuffer* ring, uint32_t base, uint32_t num_registers) override; + + XE_NOINLINE + void WriteRegisterRangeFromRing_WraparoundCase(xe::RingBuffer* ring, + uint32_t base, + uint32_t num_registers); XE_FORCEINLINE virtual void WriteOneRegisterFromRing(xe::RingBuffer* ring, uint32_t base, uint32_t num_times) override; @@ -614,7 +619,8 @@ class D3D12CommandProcessor final : public CommandProcessor { uint32_t current_graphics_root_up_to_date_; // System shader constants. - DxbcShaderTranslator::SystemConstants system_constants_; + alignas(XE_HOST_CACHE_LINE_SIZE) + DxbcShaderTranslator::SystemConstants system_constants_; // Float constant usage masks of the last draw call. 
// chrispy: make sure accesses to these cant cross cacheline boundaries diff --git a/src/xenia/gpu/d3d12/pipeline_cache.cc b/src/xenia/gpu/d3d12/pipeline_cache.cc index 91ac56a91..2176e777a 100644 --- a/src/xenia/gpu/d3d12/pipeline_cache.cc +++ b/src/xenia/gpu/d3d12/pipeline_cache.cc @@ -427,7 +427,9 @@ void PipelineCache::InitializeShaderStorage( ++shader_translation_threads_busy; break; } - shader_to_translate->AnalyzeUcode(ucode_disasm_buffer); + if (!shader_to_translate->is_ucode_analyzed()) { + shader_to_translate->AnalyzeUcode(ucode_disasm_buffer); + } // Translate each needed modification on this thread after performing // modification-independent analysis of the whole shader. uint64_t ucode_data_hash = shader_to_translate->ucode_data_hash(); @@ -980,7 +982,9 @@ bool PipelineCache::ConfigurePipeline( xenos::VertexShaderExportMode::kPosition2VectorsEdgeKill); assert_false(register_file_.Get().gen_index_vtx); if (!vertex_shader->is_translated()) { - vertex_shader->shader().AnalyzeUcode(ucode_disasm_buffer_); + if (!vertex_shader->shader().is_ucode_analyzed()) { + vertex_shader->shader().AnalyzeUcode(ucode_disasm_buffer_); + } if (!TranslateAnalyzedShader(*shader_translator_, *vertex_shader, dxbc_converter_, dxc_utils_, dxc_compiler_)) { XELOGE("Failed to translate the vertex shader!"); @@ -1004,7 +1008,9 @@ bool PipelineCache::ConfigurePipeline( } if (pixel_shader != nullptr) { if (!pixel_shader->is_translated()) { - pixel_shader->shader().AnalyzeUcode(ucode_disasm_buffer_); + if (!pixel_shader->shader().is_ucode_analyzed()) { + pixel_shader->shader().AnalyzeUcode(ucode_disasm_buffer_); + } if (!TranslateAnalyzedShader(*shader_translator_, *pixel_shader, dxbc_converter_, dxc_utils_, dxc_compiler_)) { diff --git a/src/xenia/gpu/d3d12/pipeline_cache.h b/src/xenia/gpu/d3d12/pipeline_cache.h index d1ede5ffa..30969e7c4 100644 --- a/src/xenia/gpu/d3d12/pipeline_cache.h +++ b/src/xenia/gpu/d3d12/pipeline_cache.h @@ -71,7 +71,9 @@ class PipelineCache { const uint32_t* host_address, uint32_t dword_count); // Analyze shader microcode on the translator thread. void AnalyzeShaderUcode(Shader& shader) { - shader.AnalyzeUcode(ucode_disasm_buffer_); + if (!shader.is_ucode_analyzed()) { + shader.AnalyzeUcode(ucode_disasm_buffer_); + } } // Retrieves the shader modification for the current state. The shader must diff --git a/src/xenia/gpu/draw_util.h b/src/xenia/gpu/draw_util.h index 2f27f6e01..420bafcf2 100644 --- a/src/xenia/gpu/draw_util.h +++ b/src/xenia/gpu/draw_util.h @@ -53,6 +53,26 @@ inline bool IsPrimitiveLine(const RegisterFile& regs) { regs.Get().prim_type); } +constexpr uint32_t EncodeIsPrimitivePolygonalTable() { + unsigned result = 0; +#define TRUEFOR(x) \ + result |= 1U << static_cast(xenos::PrimitiveType::x) + + TRUEFOR(kTriangleList); + TRUEFOR(kTriangleFan); + TRUEFOR(kTriangleStrip); + TRUEFOR(kTriangleWithWFlags); + TRUEFOR(kQuadList); + TRUEFOR(kQuadStrip); + TRUEFOR(kPolygon); +#undef TRUEFOR + // TODO(Triang3l): Investigate how kRectangleList should be treated - possibly + // actually drawn as two polygons on the console, however, the current + // geometry shader doesn't care about the winding order - allowing backface + // culling for rectangles currently breaks 4D53082D. + return result; +} + // Polygonal primitive types (not including points and lines) are rasterized as // triangles, have front and back faces, and also support face culling and fill // modes (polymode_front_ptype, polymode_back_ptype). 
Other primitive types are @@ -61,6 +81,7 @@ inline bool IsPrimitiveLine(const RegisterFile& regs) { // GL_FRONT_AND_BACK, points and lines are still drawn), and may in some cases // use the "para" registers instead of "front" or "back" (for "parallelogram" - // like poly_offset_para_enable). +XE_FORCEINLINE constexpr bool IsPrimitivePolygonal(bool vgt_output_path_is_tessellation_enable, xenos::PrimitiveType type) { if (vgt_output_path_is_tessellation_enable && @@ -71,26 +92,15 @@ constexpr bool IsPrimitivePolygonal(bool vgt_output_path_is_tessellation_enable, // enough. return true; } - switch (type) { - case xenos::PrimitiveType::kTriangleList: - case xenos::PrimitiveType::kTriangleFan: - case xenos::PrimitiveType::kTriangleStrip: - case xenos::PrimitiveType::kTriangleWithWFlags: - case xenos::PrimitiveType::kQuadList: - case xenos::PrimitiveType::kQuadStrip: - case xenos::PrimitiveType::kPolygon: - return true; - default: - break; - } - // TODO(Triang3l): Investigate how kRectangleList should be treated - possibly - // actually drawn as two polygons on the console, however, the current - // geometry shader doesn't care about the winding order - allowing backface - // culling for rectangles currently breaks 4D53082D. - return false; -} + // chrispy: expensive jumptable, use bit table instead -inline bool IsPrimitivePolygonal(const RegisterFile& regs) { + constexpr uint32_t primitive_polygonal_table = + EncodeIsPrimitivePolygonalTable(); + + return (primitive_polygonal_table & (1U << static_cast(type))) != 0; +} +XE_FORCEINLINE +bool IsPrimitivePolygonal(const RegisterFile& regs) { return IsPrimitivePolygonal( regs.Get().path_select == xenos::VGTOutputPath::kTessellationEnable, diff --git a/src/xenia/gpu/dxbc_shader_translator.cc b/src/xenia/gpu/dxbc_shader_translator.cc index daa8cf782..16870e807 100644 --- a/src/xenia/gpu/dxbc_shader_translator.cc +++ b/src/xenia/gpu/dxbc_shader_translator.cc @@ -94,7 +94,9 @@ std::vector DxbcShaderTranslator::CreateDepthOnlyPixelShader() { // TODO(Triang3l): Handle in a nicer way (is_depth_only_pixel_shader_ is a // leftover from when a Shader object wasn't used during translation). 
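  // Same guard pattern as in PipelineCache above: callers now test
  // is_ucode_analyzed() before AnalyzeUcode, presumably to skip repeating
  // the disassembly/analysis pass rather than relying on AnalyzeUcode to
  // early-out on its own.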
Shader shader(xenos::ShaderType::kPixel, 0, nullptr, 0); - shader.AnalyzeUcode(instruction_disassembly_buffer_); + if (!shader.is_ucode_analyzed()) { + shader.AnalyzeUcode(instruction_disassembly_buffer_); + } Shader::Translation& translation = *shader.GetOrCreateTranslation(0); TranslateAnalyzedShader(translation); is_depth_only_pixel_shader_ = false; diff --git a/src/xenia/gpu/graphics_system.cc b/src/xenia/gpu/graphics_system.cc index fdc9cb0cf..f378015a9 100644 --- a/src/xenia/gpu/graphics_system.cc +++ b/src/xenia/gpu/graphics_system.cc @@ -50,7 +50,11 @@ __declspec(dllexport) uint32_t AmdPowerXpressRequestHighPerformance = 1; } // extern "C" #endif // XE_PLATFORM_WIN32 -GraphicsSystem::GraphicsSystem() : vsync_worker_running_(false) {} +GraphicsSystem::GraphicsSystem() : vsync_worker_running_(false) { + register_file_ = reinterpret_cast(memory::AllocFixed( + nullptr, sizeof(RegisterFile), memory::AllocationType::kReserveCommit, + memory::PageAccess::kReadWrite)); +} GraphicsSystem::~GraphicsSystem() = default; @@ -198,13 +202,13 @@ uint32_t GraphicsSystem::ReadRegister(uint32_t addr) { // maximum [width(0x0FFF), height(0x0FFF)] return 0x050002D0; default: - if (!register_file_.IsValidRegister(r)) { + if (!register_file()->IsValidRegister(r)) { XELOGE("GPU: Read from unknown register ({:04X})", r); } } assert_true(r < RegisterFile::kRegisterCount); - return register_file_.values[r].u32; + return register_file()->values[r].u32; } void GraphicsSystem::WriteRegister(uint32_t addr, uint32_t value) { @@ -222,7 +226,7 @@ void GraphicsSystem::WriteRegister(uint32_t addr, uint32_t value) { } assert_true(r < RegisterFile::kRegisterCount); - register_file_.values[r].u32 = value; + this->register_file()->values[r].u32 = value; } void GraphicsSystem::InitializeRingBuffer(uint32_t ptr, uint32_t size_log2) { diff --git a/src/xenia/gpu/graphics_system.h b/src/xenia/gpu/graphics_system.h index 0434a5619..ef58d4569 100644 --- a/src/xenia/gpu/graphics_system.h +++ b/src/xenia/gpu/graphics_system.h @@ -58,7 +58,7 @@ class GraphicsSystem { // from a device loss. 
void OnHostGpuLossFromAnyThread(bool is_responsible); - RegisterFile* register_file() { return ®ister_file_; } + RegisterFile* register_file() { return register_file_; } CommandProcessor* command_processor() const { return command_processor_.get(); } @@ -112,7 +112,7 @@ class GraphicsSystem { std::atomic vsync_worker_running_; kernel::object_ref vsync_worker_thread_; - RegisterFile register_file_; + RegisterFile* register_file_; std::unique_ptr command_processor_; bool paused_ = false; diff --git a/src/xenia/gpu/texture_info.h b/src/xenia/gpu/texture_info.h index f5b8a3728..43124470d 100644 --- a/src/xenia/gpu/texture_info.h +++ b/src/xenia/gpu/texture_info.h @@ -10,17 +10,86 @@ #ifndef XENIA_GPU_TEXTURE_INFO_H_ #define XENIA_GPU_TEXTURE_INFO_H_ +#include #include #include - #include "xenia/base/assert.h" #include "xenia/gpu/xenos.h" namespace xe { namespace gpu { +#if XE_ARCH_AMD64 == 1 +struct GetBaseFormatHelper { + uint64_t indexer_; + // chrispy: todo, can encode deltas or a SHIFT+ADD for remapping the input + // format to the base, the shuffle lookup isnt great + std::array remap_; +}; +constexpr GetBaseFormatHelper PrecomputeGetBaseFormatTable() { +#define R(x, y) xenos::TextureFormat::x, xenos::TextureFormat::y + constexpr xenos::TextureFormat entries[] = { + R(k_16_EXPAND, k_16_FLOAT), + R(k_16_16_EXPAND, k_16_16_FLOAT), + R(k_16_16_16_16_EXPAND, k_16_16_16_16_FLOAT), + R(k_8_8_8_8_AS_16_16_16_16, k_8_8_8_8), + R(k_DXT1_AS_16_16_16_16, k_DXT1), + R(k_DXT2_3_AS_16_16_16_16, k_DXT2_3), + R(k_DXT4_5_AS_16_16_16_16, k_DXT4_5), + R(k_2_10_10_10_AS_16_16_16_16, k_2_10_10_10), + R(k_10_11_11_AS_16_16_16_16, k_10_11_11), + R(k_11_11_10_AS_16_16_16_16, k_11_11_10), + R(k_8_8_8_8_GAMMA_EDRAM, k_8_8_8_8)}; +#undef R + + uint64_t need_remap_table = 0ULL; + constexpr unsigned num_entries = sizeof(entries) / sizeof(entries[0]); + + for (unsigned i = 0; i < num_entries / 2; ++i) { + need_remap_table |= 1ULL << static_cast(entries[i * 2]); + } + std::array remap{0}; + + for (unsigned i = 0; i < num_entries / 2; ++i) { + remap[i] = static_cast(static_cast(entries[(i * 2) + 1])); + } + + return GetBaseFormatHelper{need_remap_table, remap}; +} +inline xenos::TextureFormat GetBaseFormat(xenos::TextureFormat texture_format) { + constexpr GetBaseFormatHelper helper = PrecomputeGetBaseFormatTable(); + + constexpr uint64_t indexer_table = helper.indexer_; + constexpr std::array table = helper.remap_; + + uint64_t format_mask = 1ULL << static_cast(texture_format); + if ((indexer_table & format_mask)) { + uint64_t trailing_mask = format_mask - 1ULL; + uint64_t trailing_bits = indexer_table & trailing_mask; + + uint32_t sparse_index = xe::bit_count(trailing_bits); + + __m128i index_in_low = + _mm_cvtsi32_si128(static_cast(sparse_index) | 0x80808000); + + __m128i new_format_low = _mm_shuffle_epi8( + _mm_setr_epi8(table[0], table[1], table[2], table[3], table[4], + table[5], table[6], table[7], table[8], table[9], + table[10], table[11], table[12], 0, 0, 0), + index_in_low); + + uint32_t prelaundered = + static_cast(_mm_cvtsi128_si32(new_format_low)); + return *reinterpret_cast(&prelaundered); + + } else { + return texture_format; + } +} +#else inline xenos::TextureFormat GetBaseFormat(xenos::TextureFormat texture_format) { // These formats are used for resampling textures / gamma control. 
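  // The AMD64 path above replaces this switch with a sparse lookup: a
  // 64-bit membership mask indexed by the format enum, then a popcount of
  // the mask bits below the format's bit to find its slot in the remap
  // table (the pshufb there is just a branchless fetch from that table).
  // A scalar equivalent, with hypothetical kNeedsRemapMask/kRemapTable
  // holding the same precomputed data:
  //
  //   xenos::TextureFormat GetBaseFormatScalar(xenos::TextureFormat fmt) {
  //     uint64_t bit = 1ULL << static_cast<uint32_t>(fmt);
  //     if (!(kNeedsRemapMask & bit)) {
  //       return fmt;  // common case: already a base format
  //     }
  //     uint32_t slot = xe::bit_count(kNeedsRemapMask & (bit - 1));
  //     return static_cast<xenos::TextureFormat>(kRemapTable[slot]);
  //   }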
+ // 11 entries switch (texture_format) { case xenos::TextureFormat::k_16_EXPAND: return xenos::TextureFormat::k_16_FLOAT; @@ -50,7 +119,7 @@ inline xenos::TextureFormat GetBaseFormat(xenos::TextureFormat texture_format) { return texture_format; } - +#endif inline size_t GetTexelSize(xenos::TextureFormat format) { switch (format) { case xenos::TextureFormat::k_1_5_5_5: