Huge set of performance improvements. Combined with an architecture-specific build and clang-cl, users have reported absurd gains over master for some games, in the range of 50%-90%

But for normal msvc builds i would put it at around 30-50%
Added per-xexmodule caching of information per instruction, can be used to remember what code needs compiling at start up
Record what guest addresses wrote mmio and backpropagate that to future runs, eliminating dependence on exception trapping. this makes many games like h3 actually tolerable to run under a debugger
fixed a number of errors where temporaries were being passed by reference/pointer
Can now be compiled with clang-cl 14.0.1, requires -Werror off though and some other solution/project changes.
Added macros wrapping compiler extensions like noinline, forceinline, __expect, and cold.
Removed the "global lock" in guest code completely. It does not properly emulate the behavior of mfmsrd/mtmsr and it seriously cripples amd cpus. Removing this yielded around a 3x speedup in Halo Reach for me.
Disabled the microprofiler for now. The microprofiler has a huge performance cost associated with it. Developers can re-enable it in the base/profiling header if they really need it
Disable the trace writer in release builds. despite just returning after checking if the file was open the trace functions were consuming about 0.60% cpu time total
Add IsValidReg, GetRegisterInfo is a huge (about 45k) branching function and using that to check if a register was valid consumed a significant chunk of time
Optimized RingBuffer::ReadAndSwap and RingBuffer::read_count. This gave us the largest overall boost in performance. The memcpies were unnecessary and one of them was always a no-op
Added simplification rules for multiplicative patterns like (x+x), (x<<1)+x
For the most frequently called win32 functions i added code to call their underlying NT implementations, which lets us skip a lot of MS code we don't care about or that isn't relevant to our use cases
^this can be toggled off in the platform_win header
handle indirect call true with constant function pointer, was occurring in h3
lookup host format swizzle in denser array
by default, don't check if a gpu register is unknown, instead just check if its out of range. controlled by a cvar
^looking up whether its known or not took approx 0.3% cpu time
Changed some things in /cpu to make the project UNITYBUILD friendly
The timer thread was spinning way too much and consuming a ton of cpu, changed it to use a blocking wait instead
tagged some conditions as XE_UNLIKELY/LIKELY based on profiler feedback (will only affect clang builds)
Shifted around some code in CommandProcessor::WriteRegister based on how frequently it was executed
added support for docdecaduple precision floating point so that we can represent our performance gains numerically
tons of other stuff im probably forgetting
This commit is contained in:
chss95cs@gmail.com 2022-08-13 12:59:00 -07:00
parent 2f59487bf3
commit cb85fe401c
49 changed files with 1462 additions and 483 deletions

View File

@ -46,7 +46,9 @@ static_assert((std::endian::native == std::endian::big) ||
namespace xe {
#if XE_COMPILER_MSVC
// chrispy: added workaround for clang, otherwise byteswap_ulong becomes calls
// to ucrtbase
#if XE_COMPILER_MSVC == 1 && !defined(__clang__)
#define XENIA_BASE_BYTE_SWAP_16 _byteswap_ushort
#define XENIA_BASE_BYTE_SWAP_32 _byteswap_ulong
#define XENIA_BASE_BYTE_SWAP_64 _byteswap_uint64

View File

@ -28,7 +28,8 @@ namespace xe {
class Win32MappedMemory : public MappedMemory {
public:
// CreateFile returns INVALID_HANDLE_VALUE in case of failure.
static constexpr HANDLE kFileHandleInvalid = INVALID_HANDLE_VALUE;
// chrispy: made inline const to get around clang error
static inline const HANDLE kFileHandleInvalid = INVALID_HANDLE_VALUE;
// CreateFileMapping returns nullptr in case of failure.
static constexpr HANDLE kMappingHandleInvalid = nullptr;

View File

@ -15,7 +15,15 @@
WINAPI_PARTITION_SYSTEM | WINAPI_PARTITION_GAMES)
#define XE_BASE_MEMORY_WIN_USE_DESKTOP_FUNCTIONS
#endif
/*
these two dont bypass much ms garbage compared to the threading ones,
but Protect is used by PhysicalHeap::EnableAccessCallbacks which eats a lot
of cpu time, so every bit counts
*/
XE_NTDLL_IMPORT(NtProtectVirtualMemory, cls_NtProtectVirtualMemory,
NtProtectVirtualMemoryPointer);
XE_NTDLL_IMPORT(NtQueryVirtualMemory, cls_NtQueryVirtualMemory,
NtQueryVirtualMemoryPointer);
namespace xe {
namespace memory {
@ -139,6 +147,18 @@ bool Protect(void* base_address, size_t length, PageAccess access,
*out_old_access = PageAccess::kNoAccess;
}
DWORD new_protect = ToWin32ProtectFlags(access);
#if XE_USE_NTDLL_FUNCTIONS == 1
DWORD old_protect = 0;
SIZE_T MemoryLength = length;
PVOID MemoryCache = base_address;
BOOL result = NtProtectVirtualMemoryPointer.invoke<NTSTATUS>(
(HANDLE)0xFFFFFFFFFFFFFFFFLL, &MemoryCache, &MemoryLength,
new_protect, &old_protect) >= 0;
#else
#ifdef XE_BASE_MEMORY_WIN_USE_DESKTOP_FUNCTIONS
DWORD old_protect = 0;
BOOL result = VirtualProtect(base_address, length, new_protect, &old_protect);
@ -146,6 +166,7 @@ bool Protect(void* base_address, size_t length, PageAccess access,
ULONG old_protect = 0;
BOOL result = VirtualProtectFromApp(base_address, length, ULONG(new_protect),
&old_protect);
#endif
#endif
if (!result) {
return false;
@ -161,8 +182,17 @@ bool QueryProtect(void* base_address, size_t& length, PageAccess& access_out) {
MEMORY_BASIC_INFORMATION info;
ZeroMemory(&info, sizeof(info));
#if XE_USE_NTDLL_FUNCTIONS == 1
ULONG_PTR ResultLength;
NTSTATUS query_result = NtQueryVirtualMemoryPointer.invoke<NTSTATUS>(
(HANDLE)0xFFFFFFFFFFFFFFFFLL, (PVOID)base_address,
0 /* MemoryBasicInformation*/, &info, length, &ResultLength);
SIZE_T result = query_result >= 0 ? ResultLength : 0;
#else
SIZE_T result = VirtualQuery(base_address, &info, length);
#endif
if (!result) {
return false;
}

View File

@ -10,10 +10,9 @@
#include "xenia/base/mutex.h"
namespace xe {
std::recursive_mutex& global_critical_region::mutex() {
// chrispy: moved this out of body of function to eliminate the initialization
// guards
static std::recursive_mutex global_mutex;
return global_mutex;
}
std::recursive_mutex& global_critical_region::mutex() { return global_mutex; }
} // namespace xe

View File

@ -41,19 +41,33 @@
#error Unsupported target OS.
#endif
#if defined(__clang__)
#if defined(__clang__) && !defined(_MSC_VER) // chrispy: support clang-cl
#define XE_COMPILER_CLANG 1
#define XE_COMPILER_HAS_CLANG_EXTENSIONS 1
#elif defined(__GNUC__)
#define XE_COMPILER_GNUC 1
#define XE_COMPILER_HAS_GNU_EXTENSIONS 1
#elif defined(_MSC_VER)
#define XE_COMPILER_MSVC 1
#define XE_COMPILER_HAS_MSVC_EXTENSIONS 1
#elif defined(__MINGW32)
#define XE_COMPILER_MINGW32 1
#define XE_COMPILER_HAS_GNU_EXTENSIONS 1
#elif defined(__INTEL_COMPILER)
#define XE_COMPILER_INTEL 1
#else
#define XE_COMPILER_UNKNOWN 1
#endif
// chrispy: had to place this here.
#if defined(__clang__) && defined(_MSC_VER)
#define XE_COMPILER_CLANG_CL 1
#define XE_COMPILER_HAS_CLANG_EXTENSIONS 1
#endif
// clang extensions == superset of gnu extensions
#if XE_COMPILER_HAS_CLANG_EXTENSIONS == 1
#define XE_COMPILER_HAS_GNU_EXTENSIONS 1
#endif
#if defined(_M_AMD64) || defined(__amd64__)
#define XE_ARCH_AMD64 1
@ -93,6 +107,29 @@
#define XEPACKEDSTRUCTANONYMOUS(value) _XEPACKEDSCOPE(struct value)
#define XEPACKEDUNION(name, value) _XEPACKEDSCOPE(union name value)
// Wrappers around compiler-specific function attributes:
//   XE_FORCEINLINE - request that the function always be inlined.
//   XE_NOINLINE    - request that the function never be inlined.
//   XE_COLD        - mark the function as rarely executed.
#if XE_COMPILER_HAS_MSVC_EXTENSIONS == 1
#define XE_FORCEINLINE __forceinline
#define XE_NOINLINE __declspec(noinline)
// can't properly emulate "cold" in msvc, but can still segregate the function
// into its own seg
#define XE_COLD __declspec(code_seg(".cold"))
#elif XE_COMPILER_HAS_GNU_EXTENSIONS == 1
#define XE_FORCEINLINE __attribute__((always_inline))
#define XE_NOINLINE __attribute__((noinline))
#define XE_COLD __attribute__((cold))
#else
#define XE_FORCEINLINE inline
#define XE_NOINLINE
#define XE_COLD
#endif

// Branch-prediction hints. Both macros evaluate to the truth value of their
// argument, so they can wrap any condition.
// These are selected separately from the attributes above: clang-cl sets the
// MSVC extension flag *and* the GNU/clang extension flags, so keying the
// hints on the MSVC branch would silently turn them into no-ops for clang-cl
// even though it supports __builtin_expect.
#if XE_COMPILER_HAS_GNU_EXTENSIONS == 1
#define XE_LIKELY(...) __builtin_expect(!!(__VA_ARGS__), true)
#define XE_UNLIKELY(...) __builtin_expect(!!(__VA_ARGS__), false)
#else
#define XE_LIKELY(...) (!!(__VA_ARGS__))
#define XE_UNLIKELY(...) (!!(__VA_ARGS__))
#endif
namespace xe {
#if XE_PLATFORM_WIN32

View File

@ -34,4 +34,31 @@
#undef DeleteFile
#undef GetFirstChild
#define XE_USE_NTDLL_FUNCTIONS 1
#if XE_USE_NTDLL_FUNCTIONS==1
/*
ntdll versions of functions often skip through a lot of extra garbage in KernelBase
*/
// Declares a file-local object `clsvar` of class `cls` whose constructor
// looks up the export `name` in ntdll.dll (GetModuleHandleA +
// GetProcAddress) and stores it in `fn` (nullptr if the lookup fails).
//   clsvar.invoke<TRet>(args...) casts `fn` to TRet(NTAPI*)(TArgs...), with
//   TArgs deduced from the arguments exactly as passed, and calls it.
//   invoke() does not check `fn` for null; use `operator bool` first if the
//   export may be missing.
// The cast target is a plain function-pointer type: NTSYSAPI (dllimport) is a
// storage-class attribute for declarations and has no meaning inside a cast.
#define XE_NTDLL_IMPORT(name, cls, clsvar)                            \
  static class cls {                                                  \
   public:                                                            \
    FARPROC fn;                                                       \
    cls() : fn(nullptr) {                                             \
      auto ntdll = GetModuleHandleA("ntdll.dll");                     \
      if (ntdll) {                                                    \
        fn = GetProcAddress(ntdll, #name);                            \
      }                                                               \
    }                                                                 \
    template <typename TRet = void, typename... TArgs>                \
    inline TRet invoke(TArgs... args) {                               \
      return reinterpret_cast<TRet(NTAPI*)(TArgs...)>(fn)(args...);   \
    }                                                                 \
    inline operator bool() const { return fn != nullptr; }            \
  } clsvar
#else
#define XE_NTDLL_IMPORT(name, cls, clsvar) static constexpr bool clsvar = false
#endif
#endif // XENIA_BASE_PLATFORM_WIN_H_

View File

@ -20,7 +20,7 @@
#include "xenia/ui/virtual_key.h"
#include "xenia/ui/window_listener.h"
#if XE_PLATFORM_WIN32
#if XE_PLATFORM_WIN32 && 0
#define XE_OPTION_PROFILING 1
#define XE_OPTION_PROFILING_UI 1
#else

View File

@ -19,7 +19,26 @@
#include "xenia/base/byte_order.h"
namespace xe {
/*
todo: this class is CRITICAL to the performance of the entire emulator.
Currently, about 0.74% of cpu time is still taken up by ReadAndSwap and
0.23% by read_count. I believe part of the issue is that smaller
ringbuffers kick off an automatic prefetcher stream that ends up reading
past the end of the ring, because it can only go in a straight line; it
then gets a cache miss when the reads eventually wrap around to the start
of the ring. It's really hard to tell what's going on there, honestly.
Maybe we can occasionally prefetch the first line of the ring to L1? For
the automatic prefetching i don't think there are any good options. I
don't know if we have any control over where these buffers will be (they
seem to be in guest memory :/), but if we did we could right-justify the
buffer so that the final byte of the ring ends at the end of a page, since
i think most automatic prefetchers cannot cross page boundaries. It does
feel like something isn't right here though.
todo: microoptimization, we can change our size members to be uint32 so
that the registers no longer need the rex prefix, shrinking the generated
code a bit.. like i said, every bit helps in this class
*/
class RingBuffer {
public:
RingBuffer(uint8_t* buffer, size_t capacity);
@ -32,6 +51,8 @@ class RingBuffer {
uintptr_t read_ptr() const { return uintptr_t(buffer_) + read_offset_; }
void set_read_offset(size_t offset) { read_offset_ = offset % capacity_; }
size_t read_count() const {
// chrispy: these branches are unpredictable
#if 0
if (read_offset_ == write_offset_) {
return 0;
} else if (read_offset_ < write_offset_) {
@ -39,6 +60,33 @@ class RingBuffer {
} else {
return (capacity_ - read_offset_) + write_offset_;
}
#else
size_t read_offs = read_offset_;
size_t write_offs = write_offset_;
size_t cap = capacity_;
size_t offset_delta = write_offs - read_offs;
size_t wrap_read_count = (cap - read_offs) + write_offs;
size_t comparison_value = read_offs <= write_offs;
#if 0
size_t selector =
static_cast<size_t>(-static_cast<ptrdiff_t>(comparison_value));
offset_delta &= selector;
wrap_read_count &= ~selector;
return offset_delta | wrap_read_count;
#else
if (XE_LIKELY(read_offs <= write_offs)) {
return offset_delta; // will be 0 if they are equal, semantically
// identical to old code (i checked the asm, msvc
// does not automatically do this)
} else {
return wrap_read_count;
}
#endif
#endif
}
size_t write_offset() const { return write_offset_; }
@ -113,6 +161,28 @@ class RingBuffer {
size_t write_offset_ = 0;
};
// Specialization for 32-bit reads: loads one uint32_t at the current read
// offset, advances the read offset by 4 (resetting it to 0 when it lands
// exactly on capacity_), and returns the loaded value byte-swapped.
// NOTE(review): the wrap test is an equality check, so this presumably relies
// on capacity_ and the read offset being multiples of 4 - confirm with callers.
template <>
inline uint32_t RingBuffer::ReadAndSwap<uint32_t>() {
  xenia_assert(this->capacity_ >= 4);

  const size_t current_offset = this->read_offset_;
  size_t advanced_offset = current_offset + 4;
#if 0
  // Branch-free variant, kept for reference:
  // unpredictable branch, use bit arith instead
  // todo: it would be faster to use lzcnt, but we need to figure out if all
  // machines we support support it
  size_t distance_to_end = advanced_offset - this->capacity_;
  advanced_offset &= -static_cast<ptrdiff_t>(!!distance_to_end);
#else
  if (XE_UNLIKELY(advanced_offset == this->capacity_)) {
    // Reached the end of the ring; the next read starts at the beginning.
    // todo: maybe prefetch next? or should that happen much earlier?
    advanced_offset = 0;
  }
#endif
  this->read_offset_ = advanced_offset;

  // The value is fetched from the offset we started at, not the advanced one.
  const unsigned int raw_value =
      *(uint32_t*)&this->buffer_[current_offset];
  return xe::byte_swap(raw_value);
}
} // namespace xe
#endif // XENIA_BASE_RING_BUFFER_H_

View File

@ -10,12 +10,12 @@
#include <algorithm>
#include <forward_list>
#include "third_party/disruptorplus/include/disruptorplus/blocking_wait_strategy.hpp"
#include "third_party/disruptorplus/include/disruptorplus/multi_threaded_claim_strategy.hpp"
#include "third_party/disruptorplus/include/disruptorplus/ring_buffer.hpp"
#include "third_party/disruptorplus/include/disruptorplus/sequence_barrier.hpp"
#include "third_party/disruptorplus/include/disruptorplus/spin_wait.hpp"
#include "third_party/disruptorplus/include/disruptorplus/spin_wait_strategy.hpp"
#include "xenia/base/assert.h"
#include "xenia/base/threading.h"
#include "xenia/base/threading_timer_queue.h"
@ -26,6 +26,12 @@ namespace xe {
namespace threading {
using WaitItem = TimerQueueWaitItem;
/*
chrispy: changed this from a spin-wait to a blocking wait; the spin was
monopolizing a ton of cpu time (2-4% of total cpu time, depending on the
game) on my 3990x. no complaints since that change
*/
using WaitStrat = dp::blocking_wait_strategy;
class TimerQueue {
public:
@ -147,9 +153,10 @@ class TimerQueue {
// This ring buffer will be used to introduce timers queued by the public API
static constexpr size_t kWaitCount = 512;
dp::ring_buffer<std::shared_ptr<WaitItem>> buffer_;
dp::spin_wait_strategy wait_strategy_;
dp::multi_threaded_claim_strategy<dp::spin_wait_strategy> claim_strategy_;
dp::sequence_barrier<dp::spin_wait_strategy> consumed_;
WaitStrat wait_strategy_;
dp::multi_threaded_claim_strategy<WaitStrat> claim_strategy_;
dp::sequence_barrier<WaitStrat> consumed_;
// This is a _sorted_ (ascending due_) list of active timers managed by a
// dedicated thread

View File

@ -7,19 +7,49 @@
******************************************************************************
*/
#include <winternl.h>
#include "xenia/base/assert.h"
#include "xenia/base/chrono_steady_cast.h"
#include "xenia/base/logging.h"
#include "xenia/base/platform_win.h"
#include "xenia/base/threading.h"
#include "xenia/base/threading_timer_queue.h"
#if defined(__clang__)
// chrispy: i do not understand why this is an error for clang here
// something about the quoted __FUNCTION__ freaks it out (clang 14.0.1)
#define LOG_LASTERROR() \
{ XELOGI("Win32 Error 0x{:08X} in " __FUNCTION__ "(...)", GetLastError()); }
do { \
XELOGI("Win32 Error 0x{:08X} in {} (...)", GetLastError(), __FUNCTION__); \
} while (false)
#else
#define LOG_LASTERROR() \
do { \
XELOGI("Win32 Error 0x{:08X} in " __FUNCTION__ "(...)", GetLastError()); \
} while (false)
#endif
typedef HANDLE (*SetThreadDescriptionFn)(HANDLE hThread,
PCWSTR lpThreadDescription);
// sys function for ntyieldexecution, by calling it we sidestep
// RtlGetCurrentUmsThread
XE_NTDLL_IMPORT(NtYieldExecution, cls_NtYieldExecution,
NtYieldExecutionPointer);
// sidestep the activation context/remapping special windows handles like stdout
XE_NTDLL_IMPORT(NtWaitForSingleObject, cls_NtWaitForSingleObject,
NtWaitForSingleObjectPointer);
XE_NTDLL_IMPORT(NtSetEvent, cls_NtSetEvent, NtSetEventPointer);
// difference between NtClearEvent and NtResetEvent is that NtResetEvent returns
// the events state prior to the call, but we dont need that. might need to
// check whether one or the other is faster in the kernel though yeah, just
// checked, the code in ntoskrnl is way simpler for clearevent than resetevent
XE_NTDLL_IMPORT(NtClearEvent, cls_NtClearEvent, NtClearEventPointer);
XE_NTDLL_IMPORT(NtPulseEvent, cls_NtPulseEvent, NtPulseEventPointer);
// heavily called, we dont skip much garbage by calling this, but every bit
// counts
XE_NTDLL_IMPORT(NtReleaseSemaphore, cls_NtReleaseSemaphore,
NtReleaseSemaphorePointer);
namespace xe {
namespace threading {
@ -80,7 +110,13 @@ void set_name(const std::string_view name) {
}
// Gives the scheduler a chance to run another thread, then issues a full
// memory barrier.
void MaybeYield() {
  // Compare the toggle's value rather than testing defined(): the macro is
  // always defined by the platform_win header, and when it is set to 0 the
  // XE_NTDLL_IMPORT objects are plain `constexpr bool`s with no invoke().
#if XE_USE_NTDLL_FUNCTIONS == 1
  NtYieldExecutionPointer.invoke();
#else
  SwitchToThread();
#endif
  // memorybarrier is really not necessary here...
  MemoryBarrier();
}
@ -134,8 +170,26 @@ class Win32Handle : public T {
WaitResult Wait(WaitHandle* wait_handle, bool is_alertable,
std::chrono::milliseconds timeout) {
HANDLE handle = wait_handle->native_handle();
DWORD result = WaitForSingleObjectEx(handle, DWORD(timeout.count()),
is_alertable ? TRUE : FALSE);
DWORD result;
DWORD timeout_dw = DWORD(timeout.count());
BOOL bAlertable = is_alertable ? TRUE : FALSE;
// todo: we might actually be able to use NtWaitForSingleObject even if its
// alertable, just need to study whether
// RtlDeactivateActivationContextUnsafeFast/RtlActivateActivationContext are
// actually needed for us
#if XE_USE_NTDLL_FUNCTIONS == 1
if (bAlertable) {
result = WaitForSingleObjectEx(handle, timeout_dw, bAlertable);
} else {
LARGE_INTEGER timeout_big;
timeout_big.QuadPart = -10000LL * static_cast<int64_t>(timeout_dw);
result = NtWaitForSingleObjectPointer.invoke<NTSTATUS>(
handle, bAlertable, timeout_dw == INFINITE ? nullptr : &timeout_big);
}
#else
result = WaitForSingleObjectEx(handle, timeout_dw, bAlertable);
#endif
switch (result) {
case WAIT_OBJECT_0:
return WaitResult::kSuccess;
@ -178,7 +232,9 @@ std::pair<WaitResult, size_t> WaitMultiple(WaitHandle* wait_handles[],
size_t wait_handle_count,
bool wait_all, bool is_alertable,
std::chrono::milliseconds timeout) {
std::vector<HANDLE> handles(wait_handle_count);
std::vector<HANDLE> handles(
wait_handle_count); // max handles is like 64, so it would make more
// sense to just do a fixed size array here
for (size_t i = 0; i < wait_handle_count; ++i) {
handles[i] = wait_handles[i]->native_handle();
}
@ -208,9 +264,16 @@ class Win32Event : public Win32Handle<Event> {
public:
explicit Win32Event(HANDLE handle) : Win32Handle(handle) {}
~Win32Event() override = default;
#if XE_USE_NTDLL_FUNCTIONS == 1
void Set() override { NtSetEventPointer.invoke(handle_, nullptr); }
void Reset() override { NtClearEventPointer.invoke(handle_); }
void Pulse() override { NtPulseEventPointer.invoke(handle_, nullptr); }
#else
void Set() override { SetEvent(handle_); }
void Reset() override { ResetEvent(handle_); }
void Pulse() override { PulseEvent(handle_); }
#endif
};
std::unique_ptr<Event> Event::CreateManualResetEvent(bool initial_state) {
@ -220,6 +283,7 @@ std::unique_ptr<Event> Event::CreateManualResetEvent(bool initial_state) {
return std::make_unique<Win32Event>(handle);
} else {
LOG_LASTERROR();
return nullptr;
}
}
@ -240,10 +304,15 @@ class Win32Semaphore : public Win32Handle<Semaphore> {
explicit Win32Semaphore(HANDLE handle) : Win32Handle(handle) {}
~Win32Semaphore() override = default;
bool Release(int release_count, int* out_previous_count) override {
#if XE_USE_NTDLL_FUNCTIONS == 1
return NtReleaseSemaphorePointer.invoke<NTSTATUS>(handle_, release_count,
out_previous_count) >= 0;
#else
return ReleaseSemaphore(handle_, release_count,
reinterpret_cast<LPLONG>(out_previous_count))
? true
: false;
#endif
}
};

View File

@ -82,8 +82,9 @@ std::string upper_ascii(const std::string_view view) {
template <bool LOWER>
inline size_t hash_fnv1a(const std::string_view view) {
const size_t offset_basis = 0xCBF29CE484222325ull;
// chrispy: constant capture errors on clang
auto work = [](size_t hash, uint8_t byte_of_data) {
const size_t prime = 0x00000100000001B3ull;
auto work = [&prime](size_t hash, uint8_t byte_of_data) {
hash ^= byte_of_data;
hash *= prime;
return hash;

View File

@ -25,7 +25,7 @@
#include "xenia/cpu/breakpoint.h"
#include "xenia/cpu/processor.h"
#include "xenia/cpu/stack_walker.h"
#include "xenia/cpu/xex_module.h"
DEFINE_int32(x64_extension_mask, -1,
"Allow the detection and utilization of specific instruction set "
"features.\n"
@ -45,6 +45,12 @@ DEFINE_int32(x64_extension_mask, -1,
" -1 = Detect and utilize all possible processor features\n",
"x64");
DEFINE_bool(record_mmio_access_exceptions, true,
"For guest addresses records whether we caught any mmio accesses "
"for them. This info can then be used on a subsequent run to "
"instruct the recompiler to emit checks",
"CPU");
namespace xe {
namespace cpu {
namespace backend {
@ -86,6 +92,11 @@ X64Backend::~X64Backend() {
ExceptionHandler::Uninstall(&ExceptionCallbackThunk, this);
}
// Callback trampoline: `context` is the X64Backend registered alongside this
// function, and the host address is handed straight to
// RecordMMIOExceptionForGuestInstruction on it.
static void ForwardMMIOAccessForRecording(void* context, void* hostaddr) {
  auto* backend = static_cast<X64Backend*>(context);
  backend->RecordMMIOExceptionForGuestInstruction(hostaddr);
}
bool X64Backend::Initialize(Processor* processor) {
if (!Backend::Initialize(processor)) {
return false;
@ -146,6 +157,8 @@ bool X64Backend::Initialize(Processor* processor) {
// Setup exception callback
ExceptionHandler::Install(&ExceptionCallbackThunk, this);
processor->memory()->SetMMIOExceptionRecordingCallback(
ForwardMMIOAccessForRecording, (void*)this);
return true;
}
@ -390,7 +403,28 @@ bool X64Backend::ExceptionCallbackThunk(Exception* ex, void* data) {
auto backend = reinterpret_cast<X64Backend*>(data);
return backend->ExceptionCallback(ex);
}
void X64Backend::RecordMMIOExceptionForGuestInstruction(void* host_address) {
uint64_t host_addr_u64 = (uint64_t)host_address;
auto fnfor = code_cache()->LookupFunction(host_addr_u64);
if (fnfor) {
uint32_t guestaddr = fnfor->MapMachineCodeToGuestAddress(host_addr_u64);
Module* guest_module = fnfor->module();
if (guest_module) {
XexModule* xex_guest_module = dynamic_cast<XexModule*>(guest_module);
if (xex_guest_module) {
cpu::InfoCacheFlags* icf =
xex_guest_module->GetInstructionAddressFlags(guestaddr);
if (icf) {
icf->accessed_mmio = true;
}
}
}
}
}
bool X64Backend::ExceptionCallback(Exception* ex) {
if (ex->code() != Exception::Code::kIllegalInstruction) {
// We only care about illegal instructions. Other things will be handled by
@ -399,6 +433,8 @@ bool X64Backend::ExceptionCallback(Exception* ex) {
return false;
}
// processor_->memory()->LookupVirtualMappedRange()
// Verify an expected illegal instruction.
auto instruction_bytes =
xe::load_and_swap<uint16_t>(reinterpret_cast<void*>(ex->pc()));

View File

@ -92,6 +92,8 @@ class X64Backend : public Backend {
}
virtual void SetGuestRoundingMode(void* ctx, unsigned int mode) override;
void RecordMMIOExceptionForGuestInstruction(void* host_address);
private:
static bool ExceptionCallbackThunk(Exception* ex, void* data);
bool ExceptionCallback(Exception* ex);

View File

@ -156,7 +156,7 @@ bool X64Emitter::Emit(GuestFunction* function, HIRBuilder* builder,
void** out_code_address, size_t* out_code_size,
std::vector<SourceMapEntry>* out_source_map) {
SCOPE_profile_cpu_f("cpu");
guest_module_ = dynamic_cast<XexModule*>(function->module());
// Reset.
debug_info_ = debug_info;
debug_info_flags_ = debug_info_flags;

View File

@ -18,8 +18,8 @@
#include "xenia/cpu/hir/hir_builder.h"
#include "xenia/cpu/hir/instr.h"
#include "xenia/cpu/hir/value.h"
#include "xenia/cpu/xex_module.h"
#include "xenia/memory.h"
// NOTE: must be included last as it expects windows.h to already be included.
#include "third_party/xbyak/xbyak/xbyak.h"
#include "third_party/xbyak/xbyak/xbyak_util.h"
@ -65,11 +65,7 @@ enum class SimdDomain : uint32_t {
// CONFLICTING means its used in multiple domains)
};
enum class MXCSRMode : uint32_t {
Unknown,
Fpu,
Vmx
};
enum class MXCSRMode : uint32_t { Unknown, Fpu, Vmx };
static SimdDomain PickDomain2(SimdDomain dom1, SimdDomain dom2) {
if (dom1 == dom2) {
@ -326,16 +322,21 @@ class X64Emitter : public Xbyak::CodeGenerator {
size_t stack_size() const { return stack_size_; }
SimdDomain DeduceSimdDomain(const hir::Value* for_value);
void ForgetMxcsrMode() {
mxcsr_mode_ = MXCSRMode::Unknown;
}
void ForgetMxcsrMode() { mxcsr_mode_ = MXCSRMode::Unknown; }
/*
returns true if had to load mxcsr. DOT_PRODUCT can use this to skip clearing the overflow flag, as it will never be set in the vmx fpscr
returns true if had to load mxcsr. DOT_PRODUCT can use this to skip
clearing the overflow flag, as it will never be set in the vmx fpscr
*/
bool ChangeMxcsrMode(MXCSRMode new_mode, bool already_set=false);//already_set means that the caller already did vldmxcsr, used for SET_ROUNDING_MODE
bool ChangeMxcsrMode(
MXCSRMode new_mode,
bool already_set = false); // already_set means that the caller already
// did vldmxcsr, used for SET_ROUNDING_MODE
void LoadFpuMxcsrDirect(); // unsafe, does not change mxcsr_mode_
void LoadVmxMxcsrDirect(); // unsafe, does not change mxcsr_mode_
XexModule* GuestModule() { return guest_module_; }
protected:
void* Emplace(const EmitFunctionInfo& func_info,
GuestFunction* function = nullptr);
@ -348,6 +349,7 @@ class X64Emitter : public Xbyak::CodeGenerator {
X64Backend* backend_ = nullptr;
X64CodeCache* code_cache_ = nullptr;
XbyakAllocator* allocator_ = nullptr;
XexModule* guest_module_ = nullptr;
Xbyak::util::Cpu cpu_;
uint32_t feature_flags_ = 0;

View File

@ -60,23 +60,46 @@ union InstrKey {
InstrKey() : value(0) { static_assert_size(*this, sizeof(value)); }
InstrKey(uint32_t v) : value(v) {}
// this used to take about 1% cpu while precompiling
// it kept reloading opcode, and also constantly repacking and unpacking the
// bitfields. instead, we pack the fields at the very end
InstrKey(const Instr* i) : value(0) {
opcode = i->opcode->num;
uint32_t sig = i->opcode->signature;
dest =
GET_OPCODE_SIG_TYPE_DEST(sig) ? OPCODE_SIG_TYPE_V + i->dest->type : 0;
src1 = GET_OPCODE_SIG_TYPE_SRC1(sig);
if (src1 == OPCODE_SIG_TYPE_V) {
src1 += i->src1.value->type;
const OpcodeInfo* info = i->GetOpcodeInfo();
uint32_t sig = info->signature;
OpcodeSignatureType dest_type, src1_type, src2_type, src3_type;
UnpackOpcodeSig(sig, dest_type, src1_type, src2_type, src3_type);
uint32_t out_desttype = (uint32_t)dest_type;
uint32_t out_src1type = (uint32_t)src1_type;
uint32_t out_src2type = (uint32_t)src2_type;
uint32_t out_src3type = (uint32_t)src3_type;
Value* destv = i->dest;
// pre-deref, even if not value
Value* src1v = i->src1.value;
Value* src2v = i->src2.value;
Value* src3v = i->src3.value;
if (out_src1type == OPCODE_SIG_TYPE_V) {
out_src1type += src1v->type;
}
src2 = GET_OPCODE_SIG_TYPE_SRC2(sig);
if (src2 == OPCODE_SIG_TYPE_V) {
src2 += i->src2.value->type;
if (out_src2type == OPCODE_SIG_TYPE_V) {
out_src2type += src2v->type;
}
src3 = GET_OPCODE_SIG_TYPE_SRC3(sig);
if (src3 == OPCODE_SIG_TYPE_V) {
src3 += i->src3.value->type;
if (out_src3type == OPCODE_SIG_TYPE_V) {
out_src3type += src3v->type;
}
opcode = info->num;
dest = out_desttype ? OPCODE_SIG_TYPE_V + destv->type : 0;
src1 = out_src1type;
src2 = out_src2type;
src3 = out_src3type;
}
template <Opcode OPCODE, KeyType DEST = KEY_TYPE_X, KeyType SRC1 = KEY_TYPE_X,

View File

@ -18,7 +18,7 @@
#include "xenia/cpu/backend/x64/x64_op.h"
#include "xenia/cpu/backend/x64/x64_tracers.h"
#include "xenia/cpu/ppc/ppc_context.h"
#include "xenia/cpu/processor.h"
DEFINE_bool(
elide_e0_check, false,
"Eliminate e0 check on some memory accesses, like to r13(tls) or r1(sp)",
@ -27,6 +27,10 @@ DEFINE_bool(enable_rmw_context_merging, false,
"Permit merging read-modify-write HIR instr sequences together "
"into x86 instructions that use a memory operand.",
"x64");
DEFINE_bool(emit_mmio_aware_stores_for_recorded_exception_addresses, true,
"Uses info gathered via record_mmio_access_exceptions to emit "
"special stores that are faster than trapping the exception",
"CPU");
namespace xe {
namespace cpu {
@ -965,6 +969,21 @@ struct STORE_MMIO_I32
}
};
EMITTER_OPCODE_TABLE(OPCODE_STORE_MMIO, STORE_MMIO_I32);
// according to triangle we dont support mmio reads atm so no point in
// implementing this for them
// Returns true when the guest instruction behind `i` has been recorded as
// having accessed MMIO, and the
// emit_mmio_aware_stores_for_recorded_exception_addresses cvar is enabled.
// Returns false when the instruction has no guest address, when the emitter
// has no guest module, or when the module has no flags for that address.
static bool IsPossibleMMIOInstruction(X64Emitter& e, const hir::Instr* i) {
  if (!cvars::emit_mmio_aware_stores_for_recorded_exception_addresses) {
    return false;
  }

  uint32_t guestaddr = i->GuestAddressFor();
  if (!guestaddr) {
    return false;
  }

  // The emitter obtains its guest module through a dynamic_cast to XexModule,
  // so it is null for functions that do not belong to a XexModule. Without
  // this check such a function would dereference a null pointer here.
  auto* guest_module = e.GuestModule();
  if (!guest_module) {
    return false;
  }

  auto flags = guest_module->GetInstructionAddressFlags(guestaddr);
  return flags && flags->accessed_mmio;
}
// ============================================================================
// OPCODE_LOAD_OFFSET
@ -1030,6 +1049,28 @@ struct LOAD_OFFSET_I64
EMITTER_OPCODE_TABLE(OPCODE_LOAD_OFFSET, LOAD_OFFSET_I8, LOAD_OFFSET_I16,
LOAD_OFFSET_I32, LOAD_OFFSET_I64);
template <typename T, bool swap>
static void MMIOAwareStore(void* _ctx, unsigned int guestaddr, T value) {
if (swap) {
value = xe::byte_swap(value);
}
if (guestaddr >= 0xE0000000) {
guestaddr += 0x1000;
}
auto ctx = reinterpret_cast<ppc::PPCContext*>(_ctx);
auto gaddr = ctx->processor->memory()->LookupVirtualMappedRange(guestaddr);
if (!gaddr) {
*reinterpret_cast<T*>(ctx->virtual_membase + guestaddr) = value;
} else {
value = xe::byte_swap(value); /*
was having issues, found by comparing the values used with exceptions
to these that we were reversed...
*/
gaddr->write(nullptr, gaddr->callback_context, guestaddr, value);
}
}
// ============================================================================
// OPCODE_STORE_OFFSET
// ============================================================================
@ -1038,6 +1079,7 @@ struct STORE_OFFSET_I8
I<OPCODE_STORE_OFFSET, VoidOp, I64Op, I64Op, I8Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
auto addr = ComputeMemoryAddressOffset(e, i.src1, i.src2);
if (i.src3.is_constant) {
e.mov(e.byte[addr], i.src3.constant());
} else {
@ -1076,6 +1118,30 @@ struct STORE_OFFSET_I32
: Sequence<STORE_OFFSET_I32,
I<OPCODE_STORE_OFFSET, VoidOp, I64Op, I64Op, I32Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
if (IsPossibleMMIOInstruction(e, i.instr)) {
void* addrptr = (void*)&MMIOAwareStore<uint32_t, false>;
if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) {
addrptr = (void*)&MMIOAwareStore<uint32_t, true>;
}
if (i.src1.is_constant) {
e.mov(e.GetNativeParam(0).cvt32(), i.src1.constant());
} else {
e.mov(e.GetNativeParam(0).cvt32(), i.src1.reg().cvt32());
}
if (i.src2.is_constant) {
e.add(e.GetNativeParam(0).cvt32(), (uint32_t)i.src2.constant());
} else {
e.add(e.GetNativeParam(0).cvt32(), i.src2);
}
if (i.src3.is_constant) {
e.mov(e.GetNativeParam(1).cvt32(), i.src3.constant());
} else {
e.mov(e.GetNativeParam(1).cvt32(), i.src3);
}
e.CallNativeSafe(addrptr);
} else {
auto addr = ComputeMemoryAddressOffset(e, i.src1, i.src2);
if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) {
assert_false(i.src3.is_constant);
@ -1096,6 +1162,7 @@ struct STORE_OFFSET_I32
}
}
}
}
};
struct STORE_OFFSET_I64
@ -1290,6 +1357,25 @@ struct STORE_I16 : Sequence<STORE_I16, I<OPCODE_STORE, VoidOp, I64Op, I16Op>> {
};
struct STORE_I32 : Sequence<STORE_I32, I<OPCODE_STORE, VoidOp, I64Op, I32Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
if (IsPossibleMMIOInstruction(e, i.instr)) {
void* addrptr = (void*)&MMIOAwareStore<uint32_t, false>;
if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) {
addrptr = (void*)&MMIOAwareStore<uint32_t, true>;
}
if (i.src1.is_constant) {
e.mov(e.GetNativeParam(0).cvt32(), (uint32_t)i.src1.constant());
} else {
e.mov(e.GetNativeParam(0).cvt32(), i.src1.reg().cvt32());
}
if (i.src2.is_constant) {
e.mov(e.GetNativeParam(1).cvt32(), i.src2.constant());
} else {
e.mov(e.GetNativeParam(1).cvt32(), i.src2);
}
e.CallNativeSafe(addrptr);
} else {
auto addr = ComputeMemoryAddress(e, i.src1);
if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) {
assert_false(i.src2.is_constant);
@ -1305,8 +1391,9 @@ struct STORE_I32 : Sequence<STORE_I32, I<OPCODE_STORE, VoidOp, I64Op, I32Op>> {
e.mov(e.dword[addr], i.src2);
}
}
}
if (IsTracingData()) {
addr = ComputeMemoryAddress(e, i.src1);
auto addr = ComputeMemoryAddress(e, i.src1);
e.mov(e.GetNativeParam(1).cvt32(), e.dword[addr]);
e.lea(e.GetNativeParam(0), e.ptr[addr]);
e.CallNative(reinterpret_cast<void*>(TraceMemoryStoreI32));

View File

@ -1683,6 +1683,9 @@ struct DIV_I16 : Sequence<DIV_I16, I<OPCODE_DIV, I16Op, I16Op, I16Op>> {
assert_impossible_sequence(DIV_I16);
}
};
/*
TODO: hoist the overflow/zero checks into HIR
*/
struct DIV_I32 : Sequence<DIV_I32, I<OPCODE_DIV, I32Op, I32Op, I32Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
Xbyak::Label skip;
@ -1766,6 +1769,9 @@ struct DIV_I32 : Sequence<DIV_I32, I<OPCODE_DIV, I32Op, I32Op, I32Op>> {
e.mov(i.dest, e.eax);
}
};
/*
TODO: hoist the overflow/zero checks into HIR
*/
struct DIV_I64 : Sequence<DIV_I64, I<OPCODE_DIV, I64Op, I64Op, I64Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) {
Xbyak::Label skip;
@ -1811,7 +1817,7 @@ struct DIV_I64 : Sequence<DIV_I64, I<OPCODE_DIV, I64Op, I64Op, I64Op>> {
} else {
// check for signed overflow
if (i.src1.is_constant) {
if (i.src1.constant() != (1 << 31)) {
if (i.src1.constant() != (1ll << 63)) {
// we're good, overflow is impossible
} else {
e.cmp(i.src2, -1); // otherwise, if src2 is -1 then we have

View File

@ -149,7 +149,20 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
i->Remove();
}
result = true;
} else if (i->src2.value->IsConstant()) { // chrispy: fix h3 bug from
// const indirect call true
auto function = processor_->LookupFunction(
uint32_t(i->src2.value->constant.i32));
if (!function) {
break;
}
// i->Replace(&OPCODE_CALL_TRUE_info, i->flags);
i->opcode = &OPCODE_CALL_TRUE_info;
i->set_src2(nullptr);
i->src2.symbol = function;
result = true;
}
break;
case OPCODE_BRANCH_TRUE:

View File

@ -796,10 +796,13 @@ bool SimplificationPass::CheckScalarConstCmp(hir::Instr* i,
if (var_definition) {
var_definition = var_definition->GetDestDefSkipAssigns();
if (var_definition != NULL)
{
if (!var_definition) {
return false;
}
def_opcode = var_definition->opcode->num;
}
if (!var_definition) {
return false;
}
// x == 0 -> !x
if (cmpop == OPCODE_COMPARE_EQ && constant_unpacked == 0) {
@ -1231,8 +1234,7 @@ Value* SimplificationPass::CheckValue(Value* value, bool& result) {
result = false;
return value;
}
bool SimplificationPass::SimplifyAddArith(hir::Instr* i,
bool SimplificationPass::SimplifyAddWithSHL(hir::Instr* i,
hir::HIRBuilder* builder) {
/*
example: (x <<1 ) + x == (x*3)
@ -1278,11 +1280,81 @@ bool SimplificationPass::SimplifyAddArith(hir::Instr* i,
return true;
}
// Rewrites an instruction whose two source operands are the same value into a
// left shift by the constant 1 (x + x == x << 1 when invoked on an ADD, which
// is how SimplifyAddArith uses it).
// Returns true when the instruction was rewritten, false when the operands
// differ and nothing was changed.
bool SimplificationPass::SimplifyAddToSelf(hir::Instr* i,
                                           hir::HIRBuilder* builder) {
  const bool operands_match = i->src1.value == i->src2.value;
  if (operands_match) {
    // Retarget the instruction in place: src1 stays as the shifted value and
    // src2 becomes the shift count.
    i->opcode = &OPCODE_SHL_info;
    i->set_src2(builder->LoadConstantUint8(1));
  }
  return operands_match;
}
// Tries each addition rewrite in turn and stops at the first one that applies:
// first the (x << n) + x pattern, then the x + x pattern.
// Returns true if any rule rewrote the instruction.
bool SimplificationPass::SimplifyAddArith(hir::Instr* i,
                                          hir::HIRBuilder* builder) {
  // Short-circuit evaluation keeps the original ordering: the second rule is
  // only attempted when the first one did not fire.
  return SimplifyAddWithSHL(i, builder) || SimplifyAddToSelf(i, builder);
}
// Placeholder for subtraction rewrite rules. No rule is implemented yet, so
// this never modifies the instruction and always returns false.
bool SimplificationPass::SimplifySubArith(hir::Instr* i,
hir::HIRBuilder* builder) {
/*
todo: handle expressions like (x*8) - (x*5) == (x*3)...if these can even
happen of course */
return false;
}
// Folds a constant left shift of an unsigned multiply-by-constant into a
// single multiply:  (x * c) << s   ==>   x * (c << s).
// Returns true when the instruction was rewritten; false when the pattern
// does not match (non-constant shift count, shifted value not defined by an
// unsigned MUL, or that MUL has no constant operand).
bool SimplificationPass::SimplifySHLArith(hir::Instr* i,
                                          hir::HIRBuilder* builder) {
  Value* shift_count = i->src2.value;
  Value* shift_input = i->src1.value;

  // Only a constant shift count can be folded into the multiplier.
  if (!shift_count->IsConstant()) {
    return false;
  }

  // The shifted value must be produced by an unsigned multiply.
  hir::Instr* producer = shift_input->GetDefSkipAssigns();
  const bool is_unsigned_mul = producer &&
                               producer->GetOpcodeNum() == OPCODE_MUL &&
                               producer->flags == ARITHMETIC_UNSIGNED;
  if (!is_unsigned_mul) {
    return false;
  }

  // Split the multiply operands into (constant, variable); a null constant
  // means there is nothing to fold.
  auto [const_factor, var_factor] = producer->BinaryValueArrangeAsConstAndVar();
  if (!const_factor) {
    return false;
  }

  // Build the pre-shifted multiplier before the instruction is replaced, while
  // the shift count operand is still attached to it.
  auto scaled_factor = builder->AllocValue(const_factor->type);
  scaled_factor->set_from(const_factor);
  scaled_factor->Shl(shift_count);

  i->Replace(&OPCODE_MUL_info, ARITHMETIC_UNSIGNED);
  i->set_src1(var_factor);
  i->set_src2(scaled_factor);
  return true;
}
bool SimplificationPass::SimplifyBasicArith(hir::Instr* i,
hir::HIRBuilder* builder) {
if (!i->dest) {
@ -1301,6 +1373,9 @@ bool SimplificationPass::SimplifyBasicArith(hir::Instr* i,
case OPCODE_SUB: {
return SimplifySubArith(i, builder);
}
case OPCODE_SHL: {
return SimplifySHLArith(i, builder);
}
}
return false;
}
@ -1317,6 +1392,97 @@ bool SimplificationPass::SimplifyBasicArith(hir::HIRBuilder* builder) {
}
return result;
}
/*
todo: add load-store simplification pass.
do things like load-store byteswap elimination. for instance,
if a value is loaded, ored with a constant mask, and then stored, we
simply have to byteswap the mask it will be ored with and then we can
eliminate the two byteswaps.
the same can be done for and, or, xor, andn with constant masks.
this can also be done for comparisons with 0 for equality and not equal.
another optimization: with ppc you cannot move a floating point register
directly to a gp one, a gp one directly to a floating point register, or a
vmx one to either, so guest code will store the result to the stack and then
load it to the register it needs. in HIR we can sidestep this: we will still
need to byteswap and store the result for correctness, but we can eliminate
the load and byteswap by grabbing the original value from the store.
skyth's sanic idb, 0x824D7724
lis r11,
lfs f0, flt_8200CBCC@l(r11)
fmuls f0, time, f0
fctidz f0, f0 # vcvttss2si
stfd f0, 0x190+var_138(r1)
lwz r30, 0x190+var_138+4(r1)
cmplwi cr6, r30, 0x63 # 'c'
ble cr6, counter_op
*/
/*
todo: simple loop unrolling
skyth sanic 0x831D9908
mr r30, r4
mr r29, r5
mr r11, r7
li r31, 0
loc_831D9928:
slwi r9, r11, 1
addi r10, r11, 1
addi r8, r1, 0xD0+var_80
clrlwi r11, r10, 16
cmplwi cr6, r11, 0x10
sthx r31, r9, r8
ble cr6, loc_831D9928
v5 = 1;
do
{
v6 = 2 * v5;
v5 = (unsigned __int16)(v5 + 1);
*(_WORD *)&v24[v6] = 0;
}
while ( v5 <= 0x10 );
v7 = 0;
do
{
v8 = __ROL4__(*(unsigned __int8 *)(v7 + a2), 1);
v7 = (unsigned __int16)(v7 + 1);
++*(_WORD *)&v24[v8];
}
while ( v7 < 8 );
v9 = 1;
v25[0] = 0;
do
{
v10 = 2 * v9;
v11 = 16 - v9;
v9 = (unsigned __int16)(v9 + 1);
v25[v10 / 2] = (*(_WORD *)&v24[v10] << v11) + *(_WORD
*)&v24[v10 + 48];
}
while ( v9 <= 0x10 );
skyth sanic:
sub_831BBAE0
sub_831A41A8
*/
} // namespace passes
} // namespace compiler
} // namespace cpu

View File

@ -36,9 +36,11 @@ class SimplificationPass : public ConditionalGroupSubpass {
// handles simple multiplication/addition rules
bool SimplifyBasicArith(hir::HIRBuilder* builder);
bool SimplifyBasicArith(hir::Instr* i, hir::HIRBuilder* builder);
bool SimplifyAddWithSHL(hir::Instr* i, hir::HIRBuilder* builder);
bool SimplifyAddToSelf(hir::Instr* i, hir::HIRBuilder* builder);
bool SimplifyAddArith(hir::Instr* i, hir::HIRBuilder* builder);
bool SimplifySubArith(hir::Instr* i, hir::HIRBuilder* builder);
bool SimplifySHLArith(hir::Instr* i, hir::HIRBuilder* builder);
// handle either or or xor with 0
bool CheckOrXorZero(hir::Instr* i);
bool CheckOr(hir::Instr* i, hir::HIRBuilder* builder);

View File

@ -200,6 +200,20 @@ const Instr* Instr::GetNonFakePrev() const {
}
return curr;
}
// Walks backwards through the preceding instructions and returns the offset
// stored in the nearest OPCODE_SOURCE_OFFSET instruction, truncated to 32
// bits. Returns 0 when no such instruction precedes this one.
uint32_t Instr::GuestAddressFor() const {
  for (Instr* cursor = prev; cursor != nullptr; cursor = cursor->prev) {
    if (cursor->GetOpcodeNum() == OPCODE_SOURCE_OFFSET) {
      return static_cast<uint32_t>(cursor->src1.offset);
    }
  }
  return 0;  // eek: no source-offset marker found before this instruction.
}
} // namespace hir
} // namespace cpu
} // namespace xe

View File

@ -169,6 +169,8 @@ if both are constant, return nullptr, nullptr
// gets previous instr, skipping instrs like COMMENT, OPCODE_CONTEXT_BARRIER,
// OPCODE_SOURCE_OFFSET
const hir::Instr* GetNonFakePrev() const;
uint32_t GuestAddressFor() const;
};
} // namespace hir

View File

@ -30,7 +30,8 @@ std::unique_ptr<MMIOHandler> MMIOHandler::Install(
HostToGuestVirtual host_to_guest_virtual,
const void* host_to_guest_virtual_context,
AccessViolationCallback access_violation_callback,
void* access_violation_callback_context) {
void* access_violation_callback_context,
MmioAccessRecordCallback record_mmio_callback, void* record_mmio_context) {
// There can be only one handler at a time.
assert_null(global_handler_);
if (global_handler_) {
@ -40,7 +41,8 @@ std::unique_ptr<MMIOHandler> MMIOHandler::Install(
auto handler = std::unique_ptr<MMIOHandler>(new MMIOHandler(
virtual_membase, physical_membase, membase_end, host_to_guest_virtual,
host_to_guest_virtual_context, access_violation_callback,
access_violation_callback_context));
access_violation_callback_context, record_mmio_callback,
record_mmio_context));
// Install the exception handler directed at the MMIOHandler.
ExceptionHandler::Install(ExceptionCallbackThunk, handler.get());
@ -54,14 +56,18 @@ MMIOHandler::MMIOHandler(uint8_t* virtual_membase, uint8_t* physical_membase,
HostToGuestVirtual host_to_guest_virtual,
const void* host_to_guest_virtual_context,
AccessViolationCallback access_violation_callback,
void* access_violation_callback_context)
void* access_violation_callback_context,
MmioAccessRecordCallback record_mmio_callback,
void* record_mmio_context)
: virtual_membase_(virtual_membase),
physical_membase_(physical_membase),
memory_end_(membase_end),
host_to_guest_virtual_(host_to_guest_virtual),
host_to_guest_virtual_context_(host_to_guest_virtual_context),
access_violation_callback_(access_violation_callback),
access_violation_callback_context_(access_violation_callback_context) {}
access_violation_callback_context_(access_violation_callback_context),
record_mmio_callback_(record_mmio_callback),
record_mmio_context_(record_mmio_context) {}
MMIOHandler::~MMIOHandler() {
ExceptionHandler::Uninstall(ExceptionCallbackThunk, this);
@ -412,6 +418,8 @@ bool MMIOHandler::ExceptionCallback(Exception* ex) {
// Quick kill anything outside our mapping.
return false;
}
uint64_t hostip = ex->pc();
void* fault_host_address = reinterpret_cast<void*>(ex->fault_address());
// Access violations are pretty rare, so we can do a linear search here.
@ -561,6 +569,13 @@ bool MMIOHandler::ExceptionCallback(Exception* ex) {
}
#endif // XE_ARCH_ARM64
if (record_mmio_callback_) {
// record that the guest address corresponding to the faulting instructions'
// host address reads/writes mmio. we can backpropagate this info on future
// compilations
record_mmio_callback_(record_mmio_context_, (void*)ex->pc());
}
// Advance RIP to the next instruction so that we resume properly.
ex->set_resume_pc(rip + decoded_load_store.length);

View File

@ -29,7 +29,8 @@ typedef uint32_t (*MMIOReadCallback)(void* ppc_context, void* callback_context,
uint32_t addr);
typedef void (*MMIOWriteCallback)(void* ppc_context, void* callback_context,
uint32_t addr, uint32_t value);
typedef void (*MmioAccessRecordCallback)(void* context,
void* host_insn_address);
struct MMIORange {
uint32_t address;
uint32_t mask;
@ -58,7 +59,8 @@ class MMIOHandler {
HostToGuestVirtual host_to_guest_virtual,
const void* host_to_guest_virtual_context,
AccessViolationCallback access_violation_callback,
void* access_violation_callback_context);
void* access_violation_callback_context,
MmioAccessRecordCallback record_mmio_callback, void* record_mmio_context);
static MMIOHandler* global_handler() { return global_handler_; }
bool RegisterRange(uint32_t virtual_address, uint32_t mask, uint32_t size,
@ -68,13 +70,20 @@ class MMIOHandler {
bool CheckLoad(uint32_t virtual_address, uint32_t* out_value);
bool CheckStore(uint32_t virtual_address, uint32_t value);
// Replaces the MMIO access recording callback and the opaque context pointer
// that is handed back to it as its first argument.
// The callback is invoked with that context and a host instruction address;
// passing a null callback stores null, which the handler treats as "do not
// record".
// NOTE(review): the context is stored before the callback and no
// synchronization is visible here - presumably callers set this before guest
// code can fault; confirm before reordering these stores.
void SetMMIOExceptionRecordingCallback(MmioAccessRecordCallback callback,
void* context) {
record_mmio_context_ = context;
record_mmio_callback_ = callback;
}
protected:
MMIOHandler(uint8_t* virtual_membase, uint8_t* physical_membase,
uint8_t* membase_end, HostToGuestVirtual host_to_guest_virtual,
const void* host_to_guest_virtual_context,
AccessViolationCallback access_violation_callback,
void* access_violation_callback_context);
void* access_violation_callback_context,
MmioAccessRecordCallback record_mmio_callback,
void* record_mmio_context);
static bool ExceptionCallbackThunk(Exception* ex, void* data);
bool ExceptionCallback(Exception* ex);
@ -90,7 +99,9 @@ class MMIOHandler {
AccessViolationCallback access_violation_callback_;
void* access_violation_callback_context_;
MmioAccessRecordCallback record_mmio_callback_;
void* record_mmio_context_;
static MMIOHandler* global_handler_;
xe::global_critical_region global_critical_region_;

View File

@ -1439,11 +1439,23 @@ int InstrEmit_vsel(PPCHIRBuilder& f, const InstrData& i) {
int InstrEmit_vsel128(PPCHIRBuilder& f, const InstrData& i) {
return InstrEmit_vsel_(f, VX128_VD128, VX128_VA128, VX128_VB128, VX128_VD128);
}
// chrispy: this is test code for checking whether a game takes advantage of the
// VSR/VSL undocumented/undefined variable shift behavior
//
// The body is compiled out (#if 0), so calling this currently emits nothing.
// When enabled it splats element 0 (as INT8) of v across a VEC128, XORs that
// with v, and emits a debug break if the XOR result tests true - i.e. if the
// elements of v are not all equal to element 0.
static void AssertShiftElementsOk(PPCHIRBuilder& f, Value* v) {
#if 0
Value* splatted = f.Splat(f.Extract(v, (uint8_t)0, INT8_TYPE), VEC128_TYPE);
Value* checkequal = f.Xor(splatted, v);
f.DebugBreakTrue(f.IsTrue(checkequal));
#endif
}
int InstrEmit_vsl(PPCHIRBuilder& f, const InstrData& i) {
Value* v = f.Shl(f.LoadVR(i.VX.VA),
f.And(f.Extract(f.LoadVR(i.VX.VB), 15, INT8_TYPE),
f.LoadConstantInt8(0b111)));
Value* va = f.LoadVR(i.VX.VA);
Value* vb = f.LoadVR(i.VX.VB);
AssertShiftElementsOk(f, vb);
Value* v =
f.Shl(va, f.And(f.Extract(vb, 15, INT8_TYPE), f.LoadConstantInt8(0b111)));
f.StoreVR(i.VX.VD, v);
return 0;
}
@ -1623,9 +1635,13 @@ int InstrEmit_vspltisw128(PPCHIRBuilder& f, const InstrData& i) {
}
int InstrEmit_vsr(PPCHIRBuilder& f, const InstrData& i) {
Value* v = f.Shr(f.LoadVR(i.VX.VA),
f.And(f.Extract(f.LoadVR(i.VX.VB), 15, INT8_TYPE),
f.LoadConstantInt8(0b111)));
Value* va = f.LoadVR(i.VX.VA);
Value* vb = f.LoadVR(i.VX.VB);
AssertShiftElementsOk(f, vb);
Value* v =
f.Shr(va, f.And(f.Extract(vb, 15, INT8_TYPE), f.LoadConstantInt8(0b111)));
f.StoreVR(i.VX.VD, v);
return 0;
}

View File

@ -769,8 +769,14 @@ int InstrEmit_mfmsr(PPCHIRBuilder& f, const InstrData& i) {
// bit 62 = RI; recoverable interrupt
// return 8000h if unlocked (interrupts enabled), else 0
f.MemoryBarrier();
if (cvars::disable_global_lock || true) {
f.StoreGPR(i.X.RT, f.LoadConstantUint64(0));
} else {
f.CallExtern(f.builtins()->check_global_lock);
f.StoreGPR(i.X.RT, f.LoadContext(offsetof(PPCContext, scratch), INT64_TYPE));
f.StoreGPR(i.X.RT,
f.LoadContext(offsetof(PPCContext, scratch), INT64_TYPE));
}
return 0;
}
@ -782,6 +788,7 @@ int InstrEmit_mtmsr(PPCHIRBuilder& f, const InstrData& i) {
f.StoreContext(
offsetof(PPCContext, scratch),
f.ZeroExtend(f.ZeroExtend(f.LoadGPR(i.X.RT), INT64_TYPE), INT64_TYPE));
#if 0
if (i.X.RT == 13) {
// iff storing from r13 we are taking a lock (disable interrupts).
if (!cvars::disable_global_lock) {
@ -793,6 +800,7 @@ int InstrEmit_mtmsr(PPCHIRBuilder& f, const InstrData& i) {
f.CallExtern(f.builtins()->leave_global_lock);
}
}
#endif
return 0;
} else {
// L = 0
@ -807,6 +815,7 @@ int InstrEmit_mtmsrd(PPCHIRBuilder& f, const InstrData& i) {
f.MemoryBarrier();
f.StoreContext(offsetof(PPCContext, scratch),
f.ZeroExtend(f.LoadGPR(i.X.RT), INT64_TYPE));
#if 0
if (i.X.RT == 13) {
// iff storing from r13 we are taking a lock (disable interrupts).
if (!cvars::disable_global_lock) {
@ -818,6 +827,7 @@ int InstrEmit_mtmsrd(PPCHIRBuilder& f, const InstrData& i) {
f.CallExtern(f.builtins()->leave_global_lock);
}
}
#endif
return 0;
} else {
// L = 0

View File

@ -5406,6 +5406,7 @@ PPCOpcodeDisasmInfo ppc_opcode_disasm_table[] = {
INSTRUCTION(0x6c000000, "xoris" , kD , kI, kGeneral, "XOR Immediate Shifted" , (PPCOpcodeField::kRS,PPCOpcodeField::kUIMM), (PPCOpcodeField::kRA), PrintDisasm_xoris),
INSTRUCTION(0x7c000278, "xorx" , kX , kI, kGeneral, "XOR" , (PPCOpcodeField::kRS,PPCOpcodeField::kRB), (PPCOpcodeField::kRA,PPCOpcodeField::kCRcond), PrintDisasm_xorx),
};
#undef INSTRUCTION
static_assert(sizeof(ppc_opcode_disasm_table) / sizeof(PPCOpcodeDisasmInfo) == static_cast<int>(PPCOpcode::kInvalid), "PPC table mismatch - rerun ppc-table-gen");
const PPCOpcodeDisasmInfo& GetOpcodeDisasmInfo(PPCOpcode opcode) {

View File

@ -470,6 +470,7 @@ PPCOpcodeInfo ppc_opcode_table[] = {
INSTRUCTION(0x6c000000, "xoris" , kD , kI, kGeneral),
INSTRUCTION(0x7c000278, "xorx" , kX , kI, kGeneral),
};
#undef INSTRUCTION
static_assert(sizeof(ppc_opcode_table) / sizeof(PPCOpcodeInfo) == static_cast<int>(PPCOpcode::kInvalid), "PPC table mismatch - rerun ppc-table-gen");
const PPCOpcodeInfo& GetOpcodeInfo(PPCOpcode opcode) {

View File

@ -257,11 +257,22 @@ Function* Processor::ResolveFunction(uint32_t address) {
// Grab symbol declaration.
auto function = LookupFunction(address);
if (!function) {
entry->status = Entry::STATUS_FAILED;
return nullptr;
}
auto module_for = function->module();
auto xexmod = dynamic_cast<XexModule*>(module_for);
if (xexmod) {
auto addr_flags = xexmod->GetInstructionAddressFlags(address);
if (addr_flags) {
addr_flags->was_resolved = 1;
}
}
if (!DemandFunction(function)) {
entry->status = Entry::STATUS_FAILED;
return nullptr;

View File

@ -14,13 +14,16 @@
#include "third_party/fmt/include/fmt/format.h"
#include "xenia/base/byte_order.h"
#include "xenia/base/cvar.h"
#include "xenia/base/logging.h"
#include "xenia/base/math.h"
#include "xenia/base/memory.h"
#include "xenia/cpu/cpu_flags.h"
#include "xenia/cpu/export_resolver.h"
#include "xenia/cpu/lzx.h"
#include "xenia/cpu/processor.h"
#include "xenia/emulator.h"
#include "xenia/kernel/kernel_state.h"
#include "xenia/kernel/xmodule.h"
@ -29,6 +32,14 @@
#include "third_party/crypto/rijndael-alg-fst.h"
#include "third_party/pe/pe_image.h"
DEFINE_bool(disable_instruction_infocache, false,
"Disables caching records of called instructions/mmio accesses.",
"CPU");
DEFINE_bool(disable_function_precompilation, true,
"Disables pre-compiling guest functions that we know we've called "
"on previous runs",
"CPU");
static const uint8_t xe_xex2_retail_key[16] = {
0x20, 0xB1, 0x85, 0xA5, 0x9D, 0x28, 0xFD, 0xC3,
0x40, 0x58, 0x3F, 0xBB, 0x08, 0x96, 0xBF, 0x91};
@ -977,6 +988,7 @@ bool XexModule::LoadContinue() {
// Scan and find the low/high addresses.
// All code sections are continuous, so this should be easy.
// could use a source for the above information
auto heap = memory()->LookupHeap(base_address_);
auto page_size = heap->page_size();
@ -1045,7 +1057,24 @@ bool XexModule::LoadContinue() {
library_offset += library->size;
}
}
sha1::SHA1 final_image_sha_;
final_image_sha_.reset();
unsigned high_code = this->high_address_ - this->low_address_;
final_image_sha_.processBytes(memory()->TranslateVirtual(this->low_address_),
high_code);
final_image_sha_.finalize(image_sha_bytes_);
char fmtbuf[16];
for (unsigned i = 0; i < 16; ++i) {
sprintf_s(fmtbuf, "%X", image_sha_bytes_[i]);
image_sha_str_ += &fmtbuf[0];
}
info_cache_.Init(this);
// Find __savegprlr_* and __restgprlr_* and the others.
// We can flag these for special handling (inlining/etc).
if (!FindSaveRest()) {
@ -1288,7 +1317,68 @@ std::unique_ptr<Function> XexModule::CreateFunction(uint32_t address) {
return std::unique_ptr<Function>(
processor_->backend()->CreateGuestFunction(this, address));
}
// Opens (creating it on first use) the memory-mapped per-instruction flag file
// for this module and, when the file already existed, asks the module to
// precompile the functions recorded in it.
// Does nothing when the disable_instruction_infocache cvar is set.
void XexInfoCache::Init(XexModule* xexmod) {
  if (cvars::disable_instruction_infocache) {
    return;
  }

  // Cache file location:
  //   <cache_root>/modules/<image sha string>/executable_addr_flags.bin
  auto emu = xexmod->kernel_state_->emulator();
  std::filesystem::path infocache_path = emu->cache_root();
  infocache_path.append(L"modules");
  infocache_path.append(xexmod->image_sha_str_);
  std::filesystem::create_directories(infocache_path);
  infocache_path.append("executable_addr_flags.bin");

  // Size of the code range in bytes, rounded up to the nearest multiple of 4.
  unsigned num_codebytes = xexmod->high_address_ - xexmod->low_address_;
  num_codebytes = (num_codebytes + 3) & ~3;

  // Remember whether the cache predates this run; a freshly created file has
  // nothing recorded in it yet.
  const bool did_exist = std::filesystem::exists(infocache_path);
  if (!did_exist) {
    xe::filesystem::CreateEmptyFile(infocache_path);
  }

  // todo: prepopulate with stuff from pdata, dll exports

  // Header followed by one InfoCacheFlags entry for each PPC instr-sized addr.
  const auto mapping_size =
      sizeof(InfoCacheFlagsHeader) +
      (sizeof(InfoCacheFlags) * (num_codebytes / 4));
  this->executable_addr_flags_ = xe::MappedMemory::Open(
      infocache_path, xe::MappedMemory::Mode::kReadWrite, 0, mapping_size);

  if (did_exist) {
    xexmod->PrecompileKnownFunctions();
  }
}
// Returns the cached flag entry for the instruction at guest_addr, or nullptr
// when the address lies outside this module's code range
// [low_address_, high_address_) or the info cache is unavailable.
InfoCacheFlags* XexModule::GetInstructionAddressFlags(uint32_t guest_addr) {
  // high_address_ is an exclusive end: the flag file holds entries only for
  // offsets below (high_address_ - low_address_). Accepting
  // guest_addr == high_address_ would index one entry past the mapping.
  if (guest_addr < low_address_ || guest_addr >= high_address_) {
    return nullptr;
  }
  // The cache is indexed by byte offset from the start of the code range.
  return info_cache_.LookupFlags(guest_addr - low_address_);
}
// Resolves every guest address that the info cache marks as previously
// resolved (was_resolved), so those functions are set up ahead of their first
// call. Does nothing when the disable_function_precompilation cvar is set or
// when the info cache could not be opened.
void XexModule::PrecompileKnownFunctions() {
  if (cvars::disable_function_precompilation) {
    return;
  }
  // One flag entry per 4-byte instruction slot in the code range.
  const uint32_t entry_count = (high_address_ - low_address_) / 4;
  // Entry 0 is the start of the flag array; null means no usable cache.
  auto flags = info_cache_.LookupFlags(0);
  if (!flags) {
    return;
  }
  for (uint32_t i = 0; i < entry_count; i++) {
    if (flags[i].was_resolved) {
      processor_->ResolveFunction(low_address_ + (i * 4));
    }
  }
}
bool XexModule::FindSaveRest() {
// Special stack save/restore functions.
// http://research.microsoft.com/en-us/um/redmond/projects/invisible/src/crt/md/ppc/xxx.s.htm

View File

@ -12,7 +12,7 @@
#include <string>
#include <vector>
#include "xenia/base/mapped_memory.h"
#include "xenia/cpu/module.h"
#include "xenia/kernel/util/xex2_info.h"
@ -30,6 +30,39 @@ constexpr fourcc_t kXEX2Signature = make_fourcc("XEX2");
constexpr fourcc_t kElfSignature = make_fourcc(0x7F, 'E', 'L', 'F');
class Runtime;
// Per-instruction-address record stored in the module info cache: 32 bits of
// bitfield flags for one 4-byte guest instruction slot.
struct InfoCacheFlags {
uint32_t was_resolved : 1; // has this address ever been called/requested
// via resolvefunction?
// NOTE(review): presumably set when the instruction at this address was seen
// accessing MMIO; the writer is not visible in this part of the change.
uint32_t accessed_mmio : 1;
// Unused bits, reserved for future flags.
uint32_t reserved : 30;
};
// Per-module cache of per-instruction flags, backed by a memory-mapped file:
// a fixed-size header followed by an array of InfoCacheFlags entries.
struct XexInfoCache {
  // Header at the start of the mapping; the flag entries begin immediately
  // after it.
  struct InfoCacheFlagsHeader {
    unsigned char reserved[256];  // put xenia version here

    // Returns the entry at array index `offset` (an entry index, not a byte
    // offset). No bounds check is performed.
    InfoCacheFlags* LookupFlags(unsigned offset) {
      InfoCacheFlags* first_entry =
          reinterpret_cast<InfoCacheFlags*>(this + 1);
      return first_entry + offset;
    }
  };

  /*
    for every 4-byte aligned address, records a 4 byte set of flags.
  */
  std::unique_ptr<MappedMemory> executable_addr_flags_;

  void Init(class XexModule*);

  // Returns the entry for a byte offset into the module's code range (the
  // offset is divided by 4 to get the entry index), or nullptr when the
  // mapping is missing or has no data.
  InfoCacheFlags* LookupFlags(unsigned offset) {
    const unsigned entry_index = offset / 4;
    if (!executable_addr_flags_) {
      return nullptr;
    }
    uint8_t* mapping_base = executable_addr_flags_->data();
    if (!mapping_base) {
      return nullptr;
    }
    auto* header = reinterpret_cast<InfoCacheFlagsHeader*>(mapping_base);
    return header->LookupFlags(entry_index);
  }
};
class XexModule : public xe::cpu::Module {
public:
@ -174,10 +207,14 @@ class XexModule : public xe::cpu::Module {
XEX_MODULE_PATCH_FULL));
}
InfoCacheFlags* GetInstructionAddressFlags(uint32_t guest_addr);
void PrecompileKnownFunctions();
protected:
std::unique_ptr<Function> CreateFunction(uint32_t address) override;
private:
friend struct XexInfoCache;
void ReadSecurityInfo();
int ReadImage(const void* xex_addr, size_t xex_length, bool use_dev_key);
@ -217,6 +254,10 @@ class XexModule : public xe::cpu::Module {
XexFormat xex_format_ = kFormatUnknown;
SecurityInfoContext security_info_ = {};
uint8_t image_sha_bytes_[16];
std::string image_sha_str_;
XexInfoCache info_cache_;
};
} // namespace cpu

View File

@ -16,6 +16,7 @@
#include "third_party/fmt/include/fmt/format.h"
#include "xenia/base/byte_stream.h"
#include "xenia/base/cvar.h"
#include "xenia/base/logging.h"
#include "xenia/base/math.h"
#include "xenia/base/profiling.h"
@ -28,6 +29,10 @@
#include "xenia/kernel/kernel_state.h"
#include "xenia/kernel/user_module.h"
DEFINE_bool(log_unknown_register_writes, false,
"Log writes to unknown registers from "
"CommandProcessor::WriteRegister. Has significant performance hit.",
"GPU");
namespace xe {
namespace gpu {
@ -329,19 +334,9 @@ void CommandProcessor::UpdateWritePointer(uint32_t value) {
write_ptr_index_ = value;
write_ptr_index_event_->Set();
}
void CommandProcessor::WriteRegister(uint32_t index, uint32_t value) {
void CommandProcessor::HandleSpecialRegisterWrite(uint32_t index,
uint32_t value) {
RegisterFile& regs = *register_file_;
if (index >= RegisterFile::kRegisterCount) {
XELOGW("CommandProcessor::WriteRegister index out of bounds: {}", index);
return;
}
regs.values[index].u32 = value;
if (!regs.GetRegisterInfo(index)) {
XELOGW("GPU: Write to unknown register ({:04X} = {:08X})", index, value);
}
// Scratch register writeback.
if (index >= XE_GPU_REG_SCRATCH_REG0 && index <= XE_GPU_REG_SCRATCH_REG7) {
uint32_t scratch_reg = index - XE_GPU_REG_SCRATCH_REG0;
@ -469,6 +464,43 @@ void CommandProcessor::WriteRegister(uint32_t index, uint32_t value) {
}
}
}
// Stores `value` into GPU register `index` and, for the few registers that
// have side effects on write, forwards to HandleSpecialRegisterWrite.
// Writes with an index at or beyond RegisterFile::kRegisterCount are logged
// and dropped. When the log_unknown_register_writes cvar is set, writes to
// registers that IsValidRegister rejects are logged as well.
void CommandProcessor::WriteRegister(uint32_t index, uint32_t value) {
  // Optional diagnostics only; kept behind the cvar because the check is paid
  // on every register write.
  if (XE_UNLIKELY(cvars::log_unknown_register_writes)) {
    if (XE_UNLIKELY(!register_file_->IsValidRegister(index))) {
      XELOGW("GPU: Write to unknown register ({:04X} = {:08X})", index, value);
    }
  }

  // Always bounds-check before the store, regardless of the logging cvar, so
  // the write below never depends on IsValidRegister for memory safety.
  if (XE_UNLIKELY(index >= RegisterFile::kRegisterCount)) {
    XELOGW("CommandProcessor::WriteRegister index out of bounds: {}", index);
    return;
  }

  register_file_->values[index].u32 = value;

  // regs with extra logic on write: the scratch registers,
  // XE_GPU_REG_COHER_STATUS_HOST, and the range XE_GPU_REG_DC_LUT_RW_INDEX ..
  // XE_GPU_REG_DC_LUT_30_COLOR (which includes XE_GPU_REG_DC_LUT_SEQ_COLOR and
  // XE_GPU_REG_DC_LUT_PWL_DATA).
  // Quick pre-test: hitting one of these is very unlikely, since there are a
  // ton of other registers. The ORs are intentionally bitwise so the three
  // conditions are evaluated without branching between them.
  // NOTE(review): the two range tests presumably rely on unsigned wraparound
  // of (index - base) to reject indices below the base - confirm the operand
  // types if the register constants change.
  if (XE_UNLIKELY(
          (index - XE_GPU_REG_SCRATCH_REG0 < 8) |
          (index == XE_GPU_REG_COHER_STATUS_HOST) |
          ((index - XE_GPU_REG_DC_LUT_RW_INDEX) <=
           (XE_GPU_REG_DC_LUT_30_COLOR - XE_GPU_REG_DC_LUT_RW_INDEX)))) {
    HandleSpecialRegisterWrite(index, value);
  }
}
void CommandProcessor::MakeCoherent() {
SCOPE_profile_cpu_f("gpu");

View File

@ -150,7 +150,9 @@ class CommandProcessor {
void WorkerThreadMain();
virtual bool SetupContext() = 0;
virtual void ShutdownContext() = 0;
// rarely needed, most register writes have no special logic here
XE_NOINLINE
void HandleSpecialRegisterWrite(uint32_t index, uint32_t value);
virtual void WriteRegister(uint32_t index, uint32_t value);
const reg::DC_LUT_30_COLOR* gamma_ramp_256_entry_table() const {

View File

@ -712,7 +712,7 @@ void D3D12CommandProcessor::SetViewport(const D3D12_VIEWPORT& viewport) {
ff_viewport_update_needed_ |= ff_viewport_.Height != viewport.Height;
ff_viewport_update_needed_ |= ff_viewport_.MinDepth != viewport.MinDepth;
ff_viewport_update_needed_ |= ff_viewport_.MaxDepth != viewport.MaxDepth;
if (ff_viewport_update_needed_) {
if (XE_UNLIKELY(ff_viewport_update_needed_)) {
ff_viewport_ = viewport;
deferred_command_list_.RSSetViewport(ff_viewport_);
ff_viewport_update_needed_ = false;

View File

@ -4799,18 +4799,16 @@ void D3D12RenderTargetCache::PerformTransfersAndResolveClears(
if (!current_transfers.empty()) {
are_current_command_list_render_targets_valid_ = false;
if (dest_rt_key.is_depth) {
command_list.D3DOMSetRenderTargets(
0, nullptr, FALSE, &dest_d3d12_rt.descriptor_draw().GetHandle());
auto handle = dest_d3d12_rt.descriptor_draw().GetHandle();
command_list.D3DOMSetRenderTargets(0, nullptr, FALSE, &handle);
if (!use_stencil_reference_output_) {
command_processor_.SetStencilReference(UINT8_MAX);
}
} else {
command_list.D3DOMSetRenderTargets(
1,
&(dest_d3d12_rt.descriptor_load_separate().IsValid()
auto handle = dest_d3d12_rt.descriptor_load_separate().IsValid()
? dest_d3d12_rt.descriptor_load_separate().GetHandle()
: dest_d3d12_rt.descriptor_draw().GetHandle()),
FALSE, nullptr);
: dest_d3d12_rt.descriptor_draw().GetHandle();
command_list.D3DOMSetRenderTargets(1, &handle, FALSE, nullptr);
}
uint32_t dest_pitch_tiles = dest_rt_key.GetPitchTiles();
@ -5425,12 +5423,12 @@ void D3D12RenderTargetCache::PerformTransfersAndResolveClears(
dest_d3d12_rt.SetResourceState(D3D12_RESOURCE_STATE_RENDER_TARGET),
D3D12_RESOURCE_STATE_RENDER_TARGET);
if (clear_via_drawing) {
command_list.D3DOMSetRenderTargets(
1,
&(dest_d3d12_rt.descriptor_load_separate().IsValid()
auto handle =
(dest_d3d12_rt.descriptor_load_separate().IsValid()
? dest_d3d12_rt.descriptor_load_separate().GetHandle()
: dest_d3d12_rt.descriptor_draw().GetHandle()),
FALSE, nullptr);
: dest_d3d12_rt.descriptor_draw().GetHandle());
command_list.D3DOMSetRenderTargets(1, &handle, FALSE, nullptr);
are_current_command_list_render_targets_valid_ = true;
D3D12_VIEWPORT clear_viewport;
clear_viewport.TopLeftX = float(clear_rect.left);

View File

@ -78,314 +78,24 @@ namespace shaders {
#include "xenia/gpu/shaders/bytecode/d3d12_5_1/texture_load_r5g6b5_b5g6r5_scaled_cs.h"
} // namespace shaders
const D3D12TextureCache::HostFormat D3D12TextureCache::host_formats_[64] = {
// k_1_REVERSE
{DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR},
// k_1
{DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR},
// k_8
{DXGI_FORMAT_R8_TYPELESS, DXGI_FORMAT_R8_UNORM, kLoadShaderIndex8bpb,
DXGI_FORMAT_R8_SNORM, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR},
// k_1_5_5_5
// Red and blue swapped in the load shader for simplicity.
{DXGI_FORMAT_B5G5R5A1_UNORM, DXGI_FORMAT_B5G5R5A1_UNORM,
kLoadShaderIndexR5G5B5A1ToB5G5R5A1, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA},
// k_5_6_5
// Red and blue swapped in the load shader for simplicity.
{DXGI_FORMAT_B5G6R5_UNORM, DXGI_FORMAT_B5G6R5_UNORM,
kLoadShaderIndexR5G6B5ToB5G6R5, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBB},
// k_6_5_5
// On the host, green bits in blue, blue bits in green.
{DXGI_FORMAT_B5G6R5_UNORM, DXGI_FORMAT_B5G6R5_UNORM,
kLoadShaderIndexR5G5B6ToB5G6R5WithRBGASwizzle, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, XE_GPU_MAKE_TEXTURE_SWIZZLE(R, B, G, G)},
// k_8_8_8_8
{DXGI_FORMAT_R8G8B8A8_TYPELESS, DXGI_FORMAT_R8G8B8A8_UNORM,
kLoadShaderIndex32bpb, DXGI_FORMAT_R8G8B8A8_SNORM, kLoadShaderIndexUnknown,
false, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA},
// k_2_10_10_10
{DXGI_FORMAT_R10G10B10A2_TYPELESS, DXGI_FORMAT_R10G10B10A2_UNORM,
kLoadShaderIndex32bpb, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA},
// k_8_A
{DXGI_FORMAT_R8_TYPELESS, DXGI_FORMAT_R8_UNORM, kLoadShaderIndex8bpb,
DXGI_FORMAT_R8_SNORM, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR},
// k_8_B
{DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR},
// k_8_8
{DXGI_FORMAT_R8G8_TYPELESS, DXGI_FORMAT_R8G8_UNORM, kLoadShaderIndex16bpb,
DXGI_FORMAT_R8G8_SNORM, kLoadShaderIndexUnknown, false,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
xenos::XE_GPU_TEXTURE_SWIZZLE_RGGG},
// k_Cr_Y1_Cb_Y0_REP
// Red and blue swapped in the load shader for simplicity.
// TODO(Triang3l): The DXGI_FORMAT_R8G8B8A8_U/SNORM conversion is usable for
// the signed version, separate unsigned and signed load shaders completely
// (as one doesn't need decompression for this format, while another does).
{DXGI_FORMAT_G8R8_G8B8_UNORM, DXGI_FORMAT_G8R8_G8B8_UNORM,
kLoadShaderIndexGBGR8ToGRGB8, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
true, DXGI_FORMAT_R8G8B8A8_UNORM, kLoadShaderIndexGBGR8ToRGB8,
xenos::XE_GPU_TEXTURE_SWIZZLE_RGBB},
// k_Y1_Cr_Y0_Cb_REP
// Red and blue swapped in the load shader for simplicity.
// TODO(Triang3l): The DXGI_FORMAT_R8G8B8A8_U/SNORM conversion is usable for
// the signed version, separate unsigned and signed load shaders completely
// (as one doesn't need decompression for this format, while another does).
{DXGI_FORMAT_R8G8_B8G8_UNORM, DXGI_FORMAT_R8G8_B8G8_UNORM,
kLoadShaderIndexBGRG8ToRGBG8, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
true, DXGI_FORMAT_R8G8B8A8_UNORM, kLoadShaderIndexBGRG8ToRGB8,
xenos::XE_GPU_TEXTURE_SWIZZLE_RGBB},
// k_16_16_EDRAM
// Not usable as a texture, also has -32...32 range.
{DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGGG},
// k_8_8_8_8_A
{DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA},
// k_4_4_4_4
// Red and blue swapped in the load shader for simplicity.
{DXGI_FORMAT_B4G4R4A4_UNORM, DXGI_FORMAT_B4G4R4A4_UNORM,
kLoadShaderIndexRGBA4ToBGRA4, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
false, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA},
// k_10_11_11
{DXGI_FORMAT_R16G16B16A16_TYPELESS, DXGI_FORMAT_R16G16B16A16_UNORM,
kLoadShaderIndexR11G11B10ToRGBA16, DXGI_FORMAT_R16G16B16A16_SNORM,
kLoadShaderIndexR11G11B10ToRGBA16SNorm, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBB},
// k_11_11_10
{DXGI_FORMAT_R16G16B16A16_TYPELESS, DXGI_FORMAT_R16G16B16A16_UNORM,
kLoadShaderIndexR10G11B11ToRGBA16, DXGI_FORMAT_R16G16B16A16_SNORM,
kLoadShaderIndexR10G11B11ToRGBA16SNorm, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBB},
// k_DXT1
{DXGI_FORMAT_BC1_UNORM, DXGI_FORMAT_BC1_UNORM, kLoadShaderIndex64bpb,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, true,
DXGI_FORMAT_R8G8B8A8_UNORM, kLoadShaderIndexDXT1ToRGBA8,
xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA},
// k_DXT2_3
{DXGI_FORMAT_BC2_UNORM, DXGI_FORMAT_BC2_UNORM, kLoadShaderIndex128bpb,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, true,
DXGI_FORMAT_R8G8B8A8_UNORM, kLoadShaderIndexDXT3ToRGBA8,
xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA},
// k_DXT4_5
{DXGI_FORMAT_BC3_UNORM, DXGI_FORMAT_BC3_UNORM, kLoadShaderIndex128bpb,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, true,
DXGI_FORMAT_R8G8B8A8_UNORM, kLoadShaderIndexDXT5ToRGBA8,
xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA},
// k_16_16_16_16_EDRAM
// Not usable as a texture, also has -32...32 range.
{DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA},
// R32_FLOAT for depth because shaders would require an additional SRV to
// sample stencil, which we don't provide.
// k_24_8
{DXGI_FORMAT_R32_FLOAT, DXGI_FORMAT_R32_FLOAT, kLoadShaderIndexDepthUnorm,
DXGI_FORMAT_R32_FLOAT, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR},
// k_24_8_FLOAT
{DXGI_FORMAT_R32_FLOAT, DXGI_FORMAT_R32_FLOAT, kLoadShaderIndexDepthFloat,
DXGI_FORMAT_R32_FLOAT, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR},
// k_16
{DXGI_FORMAT_R16_TYPELESS, DXGI_FORMAT_R16_UNORM, kLoadShaderIndex16bpb,
DXGI_FORMAT_R16_SNORM, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR},
// k_16_16
{DXGI_FORMAT_R16G16_TYPELESS, DXGI_FORMAT_R16G16_UNORM,
kLoadShaderIndex32bpb, DXGI_FORMAT_R16G16_SNORM, kLoadShaderIndexUnknown,
false, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
xenos::XE_GPU_TEXTURE_SWIZZLE_RGGG},
// k_16_16_16_16
{DXGI_FORMAT_R16G16B16A16_TYPELESS, DXGI_FORMAT_R16G16B16A16_UNORM,
kLoadShaderIndex64bpb, DXGI_FORMAT_R16G16B16A16_SNORM,
kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA},
// k_16_EXPAND
{DXGI_FORMAT_R16_FLOAT, DXGI_FORMAT_R16_FLOAT, kLoadShaderIndex16bpb,
DXGI_FORMAT_R16_FLOAT, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR},
// k_16_16_EXPAND
{DXGI_FORMAT_R16G16_FLOAT, DXGI_FORMAT_R16G16_FLOAT, kLoadShaderIndex32bpb,
DXGI_FORMAT_R16G16_FLOAT, kLoadShaderIndexUnknown, false,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
xenos::XE_GPU_TEXTURE_SWIZZLE_RGGG},
// k_16_16_16_16_EXPAND
{DXGI_FORMAT_R16G16B16A16_FLOAT, DXGI_FORMAT_R16G16B16A16_FLOAT,
kLoadShaderIndex64bpb, DXGI_FORMAT_R16G16B16A16_FLOAT,
kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA},
// k_16_FLOAT
{DXGI_FORMAT_R16_FLOAT, DXGI_FORMAT_R16_FLOAT, kLoadShaderIndex16bpb,
DXGI_FORMAT_R16_FLOAT, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR},
// k_16_16_FLOAT
{DXGI_FORMAT_R16G16_FLOAT, DXGI_FORMAT_R16G16_FLOAT, kLoadShaderIndex32bpb,
DXGI_FORMAT_R16G16_FLOAT, kLoadShaderIndexUnknown, false,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
xenos::XE_GPU_TEXTURE_SWIZZLE_RGGG},
// k_16_16_16_16_FLOAT
{DXGI_FORMAT_R16G16B16A16_FLOAT, DXGI_FORMAT_R16G16B16A16_FLOAT,
kLoadShaderIndex64bpb, DXGI_FORMAT_R16G16B16A16_FLOAT,
kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA},
// k_32
{DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR},
// k_32_32
{DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGGG},
// k_32_32_32_32
{DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA},
// k_32_FLOAT
{DXGI_FORMAT_R32_FLOAT, DXGI_FORMAT_R32_FLOAT, kLoadShaderIndex32bpb,
DXGI_FORMAT_R32_FLOAT, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR},
// k_32_32_FLOAT
{DXGI_FORMAT_R32G32_FLOAT, DXGI_FORMAT_R32G32_FLOAT, kLoadShaderIndex64bpb,
DXGI_FORMAT_R32G32_FLOAT, kLoadShaderIndexUnknown, false,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
xenos::XE_GPU_TEXTURE_SWIZZLE_RGGG},
// k_32_32_32_32_FLOAT
{DXGI_FORMAT_R32G32B32A32_FLOAT, DXGI_FORMAT_R32G32B32A32_FLOAT,
kLoadShaderIndex128bpb, DXGI_FORMAT_R32G32B32A32_FLOAT,
kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA},
// k_32_AS_8
{DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR},
// k_32_AS_8_8
{DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGGG},
// k_16_MPEG
{DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR},
// k_16_16_MPEG
{DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGGG},
// k_8_INTERLACED
{DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR},
// k_32_AS_8_INTERLACED
{DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR},
// k_32_AS_8_8_INTERLACED
{DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGGG},
// k_16_INTERLACED
{DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR},
// k_16_MPEG_INTERLACED
{DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR},
// k_16_16_MPEG_INTERLACED
{DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGGG},
// k_DXN
{DXGI_FORMAT_BC5_UNORM, DXGI_FORMAT_BC5_UNORM, kLoadShaderIndex128bpb,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, true, DXGI_FORMAT_R8G8_UNORM,
kLoadShaderIndexDXNToRG8, xenos::XE_GPU_TEXTURE_SWIZZLE_RGGG},
// k_8_8_8_8_AS_16_16_16_16
{DXGI_FORMAT_R8G8B8A8_TYPELESS, DXGI_FORMAT_R8G8B8A8_UNORM,
kLoadShaderIndex32bpb, DXGI_FORMAT_R8G8B8A8_SNORM, kLoadShaderIndexUnknown,
false, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA},
// k_DXT1_AS_16_16_16_16
{DXGI_FORMAT_BC1_UNORM, DXGI_FORMAT_BC1_UNORM, kLoadShaderIndex64bpb,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, true,
DXGI_FORMAT_R8G8B8A8_UNORM, kLoadShaderIndexDXT1ToRGBA8,
xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA},
// k_DXT2_3_AS_16_16_16_16
{DXGI_FORMAT_BC2_UNORM, DXGI_FORMAT_BC2_UNORM, kLoadShaderIndex128bpb,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, true,
DXGI_FORMAT_R8G8B8A8_UNORM, kLoadShaderIndexDXT3ToRGBA8,
xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA},
// k_DXT4_5_AS_16_16_16_16
{DXGI_FORMAT_BC3_UNORM, DXGI_FORMAT_BC3_UNORM, kLoadShaderIndex128bpb,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, true,
DXGI_FORMAT_R8G8B8A8_UNORM, kLoadShaderIndexDXT5ToRGBA8,
xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA},
// k_2_10_10_10_AS_16_16_16_16
{DXGI_FORMAT_R10G10B10A2_UNORM, DXGI_FORMAT_R10G10B10A2_UNORM,
kLoadShaderIndex32bpb, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA},
// k_10_11_11_AS_16_16_16_16
{DXGI_FORMAT_R16G16B16A16_TYPELESS, DXGI_FORMAT_R16G16B16A16_UNORM,
kLoadShaderIndexR11G11B10ToRGBA16, DXGI_FORMAT_R16G16B16A16_SNORM,
kLoadShaderIndexR11G11B10ToRGBA16SNorm, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBB},
// k_11_11_10_AS_16_16_16_16
{DXGI_FORMAT_R16G16B16A16_TYPELESS, DXGI_FORMAT_R16G16B16A16_UNORM,
kLoadShaderIndexR10G11B11ToRGBA16, DXGI_FORMAT_R16G16B16A16_SNORM,
kLoadShaderIndexR10G11B11ToRGBA16SNorm, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBB},
// k_32_32_32_FLOAT
{DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBB},
// k_DXT3A
// R8_UNORM has the same size as BC2, but doesn't have the 4x4 size
// alignment requirement.
{DXGI_FORMAT_R8_UNORM, DXGI_FORMAT_R8_UNORM, kLoadShaderIndexDXT3A,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR},
// k_DXT5A
{DXGI_FORMAT_BC4_UNORM, DXGI_FORMAT_BC4_UNORM, kLoadShaderIndex64bpb,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, true, DXGI_FORMAT_R8_UNORM,
kLoadShaderIndexDXT5AToR8, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR},
// k_CTX1
{DXGI_FORMAT_R8G8_UNORM, DXGI_FORMAT_R8G8_UNORM, kLoadShaderIndexCTX1,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGGG},
// k_DXT3A_AS_1_1_1_1
{DXGI_FORMAT_B4G4R4A4_UNORM, DXGI_FORMAT_B4G4R4A4_UNORM,
kLoadShaderIndexDXT3AAs1111ToBGRA4, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA},
// k_8_8_8_8_GAMMA_EDRAM
// Not usable as a texture.
{DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA},
// k_2_10_10_10_FLOAT_EDRAM
// Not usable as a texture.
{DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA},
};
/*
  chrispy: we're getting cache misses in GetHostFormatSwizzle, so the swizzle
  of every host format is mirrored into a denser array.
  TODO: not all 65536 possible swizzles are used, this could probably be one
  cache line.
*/
using SwizzleArray = std::array<unsigned short, 64>;

// Builds the dense table at compile time: entry N is the swizzle field of
// D3D12TextureCache::host_formats_[N], narrowed to 16 bits.
static constexpr SwizzleArray build_xenos_swizzle_for_format() {
  SwizzleArray dense_swizzles{};
  size_t format_index = 0;
  while (format_index != dense_swizzles.size()) {
    const auto& host_format = D3D12TextureCache::host_formats_[format_index];
    dense_swizzles[format_index] =
        static_cast<SwizzleArray::value_type>(host_format.swizzle);
    ++format_index;
  }
  return dense_swizzles;
}

// 64-byte-aligned dense copy of the per-format swizzles, indexed the same way
// as host_formats_.
alignas(64) constexpr SwizzleArray xenos_swizzle_for_format =
    build_xenos_swizzle_for_format();
D3D12TextureCache::D3D12TextureCache(const RegisterFile& register_file,
D3D12SharedMemory& shared_memory,
@ -1544,7 +1254,8 @@ bool D3D12TextureCache::IsScaledResolveSupportedForFormat(
}
// Returns the mapping of Xenos swizzle components to host (DXGI) format
// components for the guest format stored in the texture key.
uint32_t D3D12TextureCache::GetHostFormatSwizzle(TextureKey key) const {
  // Look the value up in the dense 16-bit table instead of the much larger
  // host_formats_ entries; it holds the same swizzle per format.
  // Previously: return host_formats_[uint32_t(key.format)].swizzle;
  return xenos_swizzle_for_format[uint32_t(key.format)];
}
uint32_t D3D12TextureCache::GetMaxHostTextureWidthHeight(

View File

@ -160,29 +160,6 @@ class D3D12TextureCache final : public TextureCache {
ID3D12Resource* RequestSwapTexture(
D3D12_SHADER_RESOURCE_VIEW_DESC& srv_desc_out,
xenos::TextureFormat& format_out);
protected:
bool IsSignedVersionSeparateForFormat(TextureKey key) const override;
bool IsScaledResolveSupportedForFormat(TextureKey key) const override;
uint32_t GetHostFormatSwizzle(TextureKey key) const override;
uint32_t GetMaxHostTextureWidthHeight(
xenos::DataDimension dimension) const override;
uint32_t GetMaxHostTextureDepthOrArraySize(
xenos::DataDimension dimension) const override;
std::unique_ptr<Texture> CreateTexture(TextureKey key) override;
// This binds pipelines, allocates descriptors, and copies!
bool LoadTextureDataFromResidentMemoryImpl(Texture& texture, bool load_base,
bool load_mips) override;
void UpdateTextureBindingsImpl(uint32_t fetch_constant_mask) override;
private:
static constexpr uint32_t kLoadGuestXThreadsPerGroupLog2 = 2;
static constexpr uint32_t kLoadGuestYBlocksPerGroupLog2 = 5;
struct HostFormat {
// Format info for the regular case.
// DXGI format (typeless when different signedness or number representation
@ -223,6 +200,352 @@ class D3D12TextureCache final : public TextureCache {
// Mapping of Xenos swizzle components to DXGI format components.
uint32_t swizzle;
};
static constexpr HostFormat host_formats_[64]{
// k_1_REVERSE
{DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR},
// k_1
{DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR},
// k_8
{DXGI_FORMAT_R8_TYPELESS, DXGI_FORMAT_R8_UNORM, kLoadShaderIndex8bpb,
DXGI_FORMAT_R8_SNORM, kLoadShaderIndexUnknown, false,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR},
// k_1_5_5_5
// Red and blue swapped in the load shader for simplicity.
{DXGI_FORMAT_B5G5R5A1_UNORM, DXGI_FORMAT_B5G5R5A1_UNORM,
kLoadShaderIndexR5G5B5A1ToB5G5R5A1, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA},
// k_5_6_5
// Red and blue swapped in the load shader for simplicity.
{DXGI_FORMAT_B5G6R5_UNORM, DXGI_FORMAT_B5G6R5_UNORM,
kLoadShaderIndexR5G6B5ToB5G6R5, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBB},
// k_6_5_5
// On the host, green bits in blue, blue bits in green.
{DXGI_FORMAT_B5G6R5_UNORM, DXGI_FORMAT_B5G6R5_UNORM,
kLoadShaderIndexR5G5B6ToB5G6R5WithRBGASwizzle, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, XE_GPU_MAKE_TEXTURE_SWIZZLE(R, B, G, G)},
// k_8_8_8_8
{DXGI_FORMAT_R8G8B8A8_TYPELESS, DXGI_FORMAT_R8G8B8A8_UNORM,
kLoadShaderIndex32bpb, DXGI_FORMAT_R8G8B8A8_SNORM,
kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA},
// k_2_10_10_10
{DXGI_FORMAT_R10G10B10A2_TYPELESS, DXGI_FORMAT_R10G10B10A2_UNORM,
kLoadShaderIndex32bpb, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
false, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA},
// k_8_A
{DXGI_FORMAT_R8_TYPELESS, DXGI_FORMAT_R8_UNORM, kLoadShaderIndex8bpb,
DXGI_FORMAT_R8_SNORM, kLoadShaderIndexUnknown, false,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR},
// k_8_B
{DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR},
// k_8_8
{DXGI_FORMAT_R8G8_TYPELESS, DXGI_FORMAT_R8G8_UNORM, kLoadShaderIndex16bpb,
DXGI_FORMAT_R8G8_SNORM, kLoadShaderIndexUnknown, false,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
xenos::XE_GPU_TEXTURE_SWIZZLE_RGGG},
// k_Cr_Y1_Cb_Y0_REP
// Red and blue swapped in the load shader for simplicity.
// TODO(Triang3l): The DXGI_FORMAT_R8G8B8A8_U/SNORM conversion is usable
// for the signed version, separate unsigned and signed load shaders
// completely (as one doesn't need decompression for this format, while
// another does).
{DXGI_FORMAT_G8R8_G8B8_UNORM, DXGI_FORMAT_G8R8_G8B8_UNORM,
kLoadShaderIndexGBGR8ToGRGB8, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, true, DXGI_FORMAT_R8G8B8A8_UNORM,
kLoadShaderIndexGBGR8ToRGB8, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBB},
// k_Y1_Cr_Y0_Cb_REP
// Red and blue swapped in the load shader for simplicity.
// TODO(Triang3l): The DXGI_FORMAT_R8G8B8A8_U/SNORM conversion is usable
// for the signed version, separate unsigned and signed load shaders
// completely (as one doesn't need decompression for this format, while
// another does).
{DXGI_FORMAT_R8G8_B8G8_UNORM, DXGI_FORMAT_R8G8_B8G8_UNORM,
kLoadShaderIndexBGRG8ToRGBG8, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, true, DXGI_FORMAT_R8G8B8A8_UNORM,
kLoadShaderIndexBGRG8ToRGB8, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBB},
// k_16_16_EDRAM
// Not usable as a texture, also has -32...32 range.
{DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGGG},
// k_8_8_8_8_A
{DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA},
// k_4_4_4_4
// Red and blue swapped in the load shader for simplicity.
{DXGI_FORMAT_B4G4R4A4_UNORM, DXGI_FORMAT_B4G4R4A4_UNORM,
kLoadShaderIndexRGBA4ToBGRA4, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA},
// k_10_11_11
{DXGI_FORMAT_R16G16B16A16_TYPELESS, DXGI_FORMAT_R16G16B16A16_UNORM,
kLoadShaderIndexR11G11B10ToRGBA16, DXGI_FORMAT_R16G16B16A16_SNORM,
kLoadShaderIndexR11G11B10ToRGBA16SNorm, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBB},
// k_11_11_10
{DXGI_FORMAT_R16G16B16A16_TYPELESS, DXGI_FORMAT_R16G16B16A16_UNORM,
kLoadShaderIndexR10G11B11ToRGBA16, DXGI_FORMAT_R16G16B16A16_SNORM,
kLoadShaderIndexR10G11B11ToRGBA16SNorm, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBB},
// k_DXT1
{DXGI_FORMAT_BC1_UNORM, DXGI_FORMAT_BC1_UNORM, kLoadShaderIndex64bpb,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, true,
DXGI_FORMAT_R8G8B8A8_UNORM, kLoadShaderIndexDXT1ToRGBA8,
xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA},
// k_DXT2_3
{DXGI_FORMAT_BC2_UNORM, DXGI_FORMAT_BC2_UNORM, kLoadShaderIndex128bpb,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, true,
DXGI_FORMAT_R8G8B8A8_UNORM, kLoadShaderIndexDXT3ToRGBA8,
xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA},
// k_DXT4_5
{DXGI_FORMAT_BC3_UNORM, DXGI_FORMAT_BC3_UNORM, kLoadShaderIndex128bpb,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, true,
DXGI_FORMAT_R8G8B8A8_UNORM, kLoadShaderIndexDXT5ToRGBA8,
xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA},
// k_16_16_16_16_EDRAM
// Not usable as a texture, also has -32...32 range.
{DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA},
// R32_FLOAT for depth because shaders would require an additional SRV to
// sample stencil, which we don't provide.
// k_24_8
{DXGI_FORMAT_R32_FLOAT, DXGI_FORMAT_R32_FLOAT, kLoadShaderIndexDepthUnorm,
DXGI_FORMAT_R32_FLOAT, kLoadShaderIndexUnknown, false,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR},
// k_24_8_FLOAT
{DXGI_FORMAT_R32_FLOAT, DXGI_FORMAT_R32_FLOAT, kLoadShaderIndexDepthFloat,
DXGI_FORMAT_R32_FLOAT, kLoadShaderIndexUnknown, false,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR},
// k_16
{DXGI_FORMAT_R16_TYPELESS, DXGI_FORMAT_R16_UNORM, kLoadShaderIndex16bpb,
DXGI_FORMAT_R16_SNORM, kLoadShaderIndexUnknown, false,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR},
// k_16_16
{DXGI_FORMAT_R16G16_TYPELESS, DXGI_FORMAT_R16G16_UNORM,
kLoadShaderIndex32bpb, DXGI_FORMAT_R16G16_SNORM, kLoadShaderIndexUnknown,
false, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
xenos::XE_GPU_TEXTURE_SWIZZLE_RGGG},
// k_16_16_16_16
{DXGI_FORMAT_R16G16B16A16_TYPELESS, DXGI_FORMAT_R16G16B16A16_UNORM,
kLoadShaderIndex64bpb, DXGI_FORMAT_R16G16B16A16_SNORM,
kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA},
// k_16_EXPAND
{DXGI_FORMAT_R16_FLOAT, DXGI_FORMAT_R16_FLOAT, kLoadShaderIndex16bpb,
DXGI_FORMAT_R16_FLOAT, kLoadShaderIndexUnknown, false,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR},
// k_16_16_EXPAND
{DXGI_FORMAT_R16G16_FLOAT, DXGI_FORMAT_R16G16_FLOAT,
kLoadShaderIndex32bpb, DXGI_FORMAT_R16G16_FLOAT, kLoadShaderIndexUnknown,
false, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
xenos::XE_GPU_TEXTURE_SWIZZLE_RGGG},
// k_16_16_16_16_EXPAND
{DXGI_FORMAT_R16G16B16A16_FLOAT, DXGI_FORMAT_R16G16B16A16_FLOAT,
kLoadShaderIndex64bpb, DXGI_FORMAT_R16G16B16A16_FLOAT,
kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA},
// k_16_FLOAT
{DXGI_FORMAT_R16_FLOAT, DXGI_FORMAT_R16_FLOAT, kLoadShaderIndex16bpb,
DXGI_FORMAT_R16_FLOAT, kLoadShaderIndexUnknown, false,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR},
// k_16_16_FLOAT
{DXGI_FORMAT_R16G16_FLOAT, DXGI_FORMAT_R16G16_FLOAT,
kLoadShaderIndex32bpb, DXGI_FORMAT_R16G16_FLOAT, kLoadShaderIndexUnknown,
false, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
xenos::XE_GPU_TEXTURE_SWIZZLE_RGGG},
// k_16_16_16_16_FLOAT
{DXGI_FORMAT_R16G16B16A16_FLOAT, DXGI_FORMAT_R16G16B16A16_FLOAT,
kLoadShaderIndex64bpb, DXGI_FORMAT_R16G16B16A16_FLOAT,
kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA},
// k_32
{DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR},
// k_32_32
{DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGGG},
// k_32_32_32_32
{DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA},
// k_32_FLOAT
{DXGI_FORMAT_R32_FLOAT, DXGI_FORMAT_R32_FLOAT, kLoadShaderIndex32bpb,
DXGI_FORMAT_R32_FLOAT, kLoadShaderIndexUnknown, false,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR},
// k_32_32_FLOAT
{DXGI_FORMAT_R32G32_FLOAT, DXGI_FORMAT_R32G32_FLOAT,
kLoadShaderIndex64bpb, DXGI_FORMAT_R32G32_FLOAT, kLoadShaderIndexUnknown,
false, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
xenos::XE_GPU_TEXTURE_SWIZZLE_RGGG},
// k_32_32_32_32_FLOAT
{DXGI_FORMAT_R32G32B32A32_FLOAT, DXGI_FORMAT_R32G32B32A32_FLOAT,
kLoadShaderIndex128bpb, DXGI_FORMAT_R32G32B32A32_FLOAT,
kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA},
// k_32_AS_8
{DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR},
// k_32_AS_8_8
{DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGGG},
// k_16_MPEG
{DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR},
// k_16_16_MPEG
{DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGGG},
// k_8_INTERLACED
{DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR},
// k_32_AS_8_INTERLACED
{DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR},
// k_32_AS_8_8_INTERLACED
{DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGGG},
// k_16_INTERLACED
{DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR},
// k_16_MPEG_INTERLACED
{DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR},
// k_16_16_MPEG_INTERLACED
{DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGGG},
// k_DXN
{DXGI_FORMAT_BC5_UNORM, DXGI_FORMAT_BC5_UNORM, kLoadShaderIndex128bpb,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, true,
DXGI_FORMAT_R8G8_UNORM, kLoadShaderIndexDXNToRG8,
xenos::XE_GPU_TEXTURE_SWIZZLE_RGGG},
// k_8_8_8_8_AS_16_16_16_16
{DXGI_FORMAT_R8G8B8A8_TYPELESS, DXGI_FORMAT_R8G8B8A8_UNORM,
kLoadShaderIndex32bpb, DXGI_FORMAT_R8G8B8A8_SNORM,
kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA},
// k_DXT1_AS_16_16_16_16
{DXGI_FORMAT_BC1_UNORM, DXGI_FORMAT_BC1_UNORM, kLoadShaderIndex64bpb,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, true,
DXGI_FORMAT_R8G8B8A8_UNORM, kLoadShaderIndexDXT1ToRGBA8,
xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA},
// k_DXT2_3_AS_16_16_16_16
{DXGI_FORMAT_BC2_UNORM, DXGI_FORMAT_BC2_UNORM, kLoadShaderIndex128bpb,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, true,
DXGI_FORMAT_R8G8B8A8_UNORM, kLoadShaderIndexDXT3ToRGBA8,
xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA},
// k_DXT4_5_AS_16_16_16_16
{DXGI_FORMAT_BC3_UNORM, DXGI_FORMAT_BC3_UNORM, kLoadShaderIndex128bpb,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, true,
DXGI_FORMAT_R8G8B8A8_UNORM, kLoadShaderIndexDXT5ToRGBA8,
xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA},
// k_2_10_10_10_AS_16_16_16_16
{DXGI_FORMAT_R10G10B10A2_UNORM, DXGI_FORMAT_R10G10B10A2_UNORM,
kLoadShaderIndex32bpb, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
false, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA},
// k_10_11_11_AS_16_16_16_16
{DXGI_FORMAT_R16G16B16A16_TYPELESS, DXGI_FORMAT_R16G16B16A16_UNORM,
kLoadShaderIndexR11G11B10ToRGBA16, DXGI_FORMAT_R16G16B16A16_SNORM,
kLoadShaderIndexR11G11B10ToRGBA16SNorm, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBB},
// k_11_11_10_AS_16_16_16_16
{DXGI_FORMAT_R16G16B16A16_TYPELESS, DXGI_FORMAT_R16G16B16A16_UNORM,
kLoadShaderIndexR10G11B11ToRGBA16, DXGI_FORMAT_R16G16B16A16_SNORM,
kLoadShaderIndexR10G11B11ToRGBA16SNorm, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBB},
// k_32_32_32_FLOAT
{DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBB},
// k_DXT3A
// R8_UNORM has the same size as BC2, but doesn't have the 4x4 size
// alignment requirement.
{DXGI_FORMAT_R8_UNORM, DXGI_FORMAT_R8_UNORM, kLoadShaderIndexDXT3A,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR},
// k_DXT5A
{DXGI_FORMAT_BC4_UNORM, DXGI_FORMAT_BC4_UNORM, kLoadShaderIndex64bpb,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, true, DXGI_FORMAT_R8_UNORM,
kLoadShaderIndexDXT5AToR8, xenos::XE_GPU_TEXTURE_SWIZZLE_RRRR},
// k_CTX1
{DXGI_FORMAT_R8G8_UNORM, DXGI_FORMAT_R8G8_UNORM, kLoadShaderIndexCTX1,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGGG},
// k_DXT3A_AS_1_1_1_1
{DXGI_FORMAT_B4G4R4A4_UNORM, DXGI_FORMAT_B4G4R4A4_UNORM,
kLoadShaderIndexDXT3AAs1111ToBGRA4, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA},
// k_8_8_8_8_GAMMA_EDRAM
// Not usable as a texture.
{DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA},
// k_2_10_10_10_FLOAT_EDRAM
// Not usable as a texture.
{DXGI_FORMAT_UNKNOWN, DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown,
DXGI_FORMAT_UNKNOWN, kLoadShaderIndexUnknown, false, DXGI_FORMAT_UNKNOWN,
kLoadShaderIndexUnknown, xenos::XE_GPU_TEXTURE_SWIZZLE_RGBA},
};
protected:
bool IsSignedVersionSeparateForFormat(TextureKey key) const override;
bool IsScaledResolveSupportedForFormat(TextureKey key) const override;
uint32_t GetHostFormatSwizzle(TextureKey key) const override;
uint32_t GetMaxHostTextureWidthHeight(
xenos::DataDimension dimension) const override;
uint32_t GetMaxHostTextureDepthOrArraySize(
xenos::DataDimension dimension) const override;
std::unique_ptr<Texture> CreateTexture(TextureKey key) override;
// This binds pipelines, allocates descriptors, and copies!
bool LoadTextureDataFromResidentMemoryImpl(Texture& texture, bool load_base,
bool load_mips) override;
void UpdateTextureBindingsImpl(uint32_t fetch_constant_mask) override;
private:
static constexpr uint32_t kLoadGuestXThreadsPerGroupLog2 = 2;
static constexpr uint32_t kLoadGuestYBlocksPerGroupLog2 = 5;
class D3D12Texture final : public Texture {
public:
@ -467,8 +790,6 @@ class D3D12TextureCache final : public TextureCache {
xenos::ClampMode NormalizeClampMode(xenos::ClampMode clamp_mode) const;
static const HostFormat host_formats_[64];
D3D12CommandProcessor& command_processor_;
bool bindless_resources_used_;

View File

@ -198,7 +198,7 @@ uint32_t GraphicsSystem::ReadRegister(uint32_t addr) {
// maximum [width(0x0FFF), height(0x0FFF)]
return 0x050002D0;
default:
if (!register_file_.GetRegisterInfo(r)) {
if (!register_file_.IsValidRegister(r)) {
XELOGE("GPU: Read from unknown register ({:04X})", r);
}
}

View File

@ -8,7 +8,7 @@
*/
#include "xenia/gpu/register_file.h"
#include <array>
#include <cstring>
#include "xenia/base/math.h"
@ -17,6 +17,52 @@ namespace xe {
namespace gpu {
// Constructs the register file with every byte of `values` cleared to zero.
RegisterFile::RegisterFile() {
  std::memset(values, 0, sizeof values);
}
// Expands register_table.inc with XE_GPU_REGISTER redefined so that each table
// entry updates a running maximum, and returns the largest register index
// found (0 if the table expands to nothing).
constexpr unsigned int GetHighestRegisterNumber() {
  uint32_t max_index = 0;
#define XE_GPU_REGISTER(index, type, name)          \
  if (static_cast<uint32_t>(index) > max_index) {   \
    max_index = static_cast<uint32_t>(index);       \
  }
#include "xenia/gpu/register_table.inc"
#undef XE_GPU_REGISTER
  return max_index;
}
// Expands register_table.inc with XE_GPU_REGISTER redefined so that each table
// entry updates a running minimum, and returns the smallest register index
// found (UINT_MAX if the table expands to nothing).
constexpr unsigned int GetLowestRegisterNumber() {
  uint32_t min_index = UINT_MAX;
#define XE_GPU_REGISTER(index, type, name)          \
  if (static_cast<uint32_t>(index) < min_index) {   \
    min_index = static_cast<uint32_t>(index);       \
  }
#include "xenia/gpu/register_table.inc"
#undef XE_GPU_REGISTER
  return min_index;
}
static constexpr uint32_t lowest_register = GetLowestRegisterNumber();
static constexpr uint32_t highest_register = GetHighestRegisterNumber();
static constexpr uint32_t total_num_registers =
highest_register - lowest_register;
static constexpr uint32_t num_required_words_for_registers =
((total_num_registers + 63) & ~63) / 64;
// can't use bitset, its not constexpr in c++ 17
using ValidRegisterBitset = std::array<
uint64_t,
num_required_words_for_registers>; // std::bitset<highest_register
// - lowest_register>;
// Builds, at compile time, a bitmap with one bit per register index relative
// to lowest_register. Each entry of register_table.inc sets the bit for its
// own index; all other bits stay zero.
static constexpr ValidRegisterBitset BuildValidRegisterBitset() {
  ValidRegisterBitset bitmap{};
#define XE_GPU_REGISTER(index, type, name)             \
  bitmap[((index) - lowest_register) >> 6] |=          \
      uint64_t{1} << (((index) - lowest_register) & 63);
#include "xenia/gpu/register_table.inc"
#undef XE_GPU_REGISTER
  return bitmap;
}
// The bitmap itself, computed once during constant evaluation.
static constexpr auto valid_register_bitset = BuildValidRegisterBitset();
const RegisterInfo* RegisterFile::GetRegisterInfo(uint32_t index) {
switch (index) {
@ -34,6 +80,18 @@ const RegisterInfo* RegisterFile::GetRegisterInfo(uint32_t index) {
return nullptr;
}
}
/*
  Returns true when `index` lies inside [lowest_register, highest_register]
  and its bit is set in valid_register_bitset; false otherwise.
  todo: this still uses a lot of cpu! our bitset is too large
*/
bool RegisterFile::IsValidRegister(uint32_t index) {
  // Reject anything outside the range covered by the bitset first.
  const bool below_table = index < lowest_register;
  const bool above_table = index > highest_register;
  if (XE_UNLIKELY(below_table) || XE_UNLIKELY(above_table)) {
    return false;
  }
  // Bit position relative to the first register, split into word and bit.
  const uint32_t bit_position = index - lowest_register;
  const uint64_t word = valid_register_bitset[bit_position >> 6];
  return ((word >> (bit_position & 63)) & 1u) != 0;
}
} // namespace gpu
} // namespace xe

View File

@ -32,7 +32,7 @@ class RegisterFile {
RegisterFile();
static const RegisterInfo* GetRegisterInfo(uint32_t index);
static bool IsValidRegister(uint32_t index);
static constexpr size_t kRegisterCount = 0x5003;
union RegisterValue {
uint32_t u32;

View File

@ -41,9 +41,6 @@
#include "xenia/ui/windowed_app_context.h"
#include "xenia/xbox.h"
DEFINE_string(target_trace_file, "", "Specifies the trace file to load.",
"GPU");
namespace xe {
namespace gpu {
@ -66,7 +63,7 @@ TraceViewer::TraceViewer(xe::ui::WindowedAppContext& app_context,
TraceViewer::~TraceViewer() = default;
bool TraceViewer::OnInitialize() {
std::string path = cvars::target_trace_file;
std::string path = cvars::target_trace_file.u8string();
// If no path passed, ask the user.
// On Android, however, there's no synchronous file picker, and the trace file

View File

@ -12,6 +12,7 @@
#include <string_view>
#include "xenia/base/cvar.h"
#include "xenia/emulator.h"
#include "xenia/gpu/shader.h"
#include "xenia/gpu/trace_player.h"
@ -24,7 +25,7 @@
#include "xenia/ui/window.h"
#include "xenia/ui/window_listener.h"
#include "xenia/ui/windowed_app.h"
DECLARE_path(target_trace_file);
namespace xe {
namespace gpu {

View File

@ -25,7 +25,7 @@
namespace xe {
namespace gpu {
#if XE_ENABLE_TRACE_WRITER_INSTRUMENTATION == 1
TraceWriter::TraceWriter(uint8_t* membase)
: membase_(membase), file_(nullptr) {}
@ -362,6 +362,6 @@ void TraceWriter::WriteGammaRamp(
fwrite(gamma_ramp_pwl_rgb, 1, kPWLUncompressedLength, file_);
}
}
#endif
} // namespace gpu
} // namespace xe

View File

@ -17,11 +17,22 @@
#include "xenia/gpu/registers.h"
#include "xenia/gpu/trace_protocol.h"
// Only enable the trace writer in debug builds. The measured cost of the trace
// function calls (even when they return immediately) is 0.40-0.60% of total
// CPU time. When inlined, they just bloat the caller and hurt its register
// allocation.
#ifdef NDEBUG
#define XE_ENABLE_TRACE_WRITER_INSTRUMENTATION 0
#else
#define XE_ENABLE_TRACE_WRITER_INSTRUMENTATION 1
#endif
namespace xe {
namespace gpu {
class TraceWriter {
public:
#if XE_ENABLE_TRACE_WRITER_INSTRUMENTATION == 1
explicit TraceWriter(uint8_t* membase);
~TraceWriter();
@ -61,6 +72,49 @@ class TraceWriter {
bool compress_output_ = true;
size_t compression_threshold_ = 1024; // Min. number of bytes to compress.
#else
// Stub interface used when XE_ENABLE_TRACE_WRITER_INSTRUMENTATION is not 1.
// Every member below has an empty body (or returns false), so calls made
// through this class do no work.
// NOTE: this could be annoying to maintain if new methods are added to the
// real implementation or its signatures change.
// Constructor: ignores `membase`.
constexpr explicit TraceWriter(uint8_t* membase) {}
// Always false in this configuration.
static constexpr bool is_open() { return false; }
// Always returns false; `path` and `title_id` are ignored.
static constexpr bool Open(const std::filesystem::path& path,
uint32_t title_id) {
return false;
}
// No-op.
static constexpr void Flush() {}
// No-op.
static constexpr void Close() {}
// The remaining Write* members are no-ops that ignore all of their arguments.
static constexpr void WritePrimaryBufferStart(uint32_t base_ptr,
uint32_t count) {}
static constexpr void WritePrimaryBufferEnd() {}
static constexpr void WriteIndirectBufferStart(uint32_t base_ptr,
uint32_t count) {}
static constexpr void WriteIndirectBufferEnd() {}
static constexpr void WritePacketStart(uint32_t base_ptr, uint32_t count) {}
static constexpr void WritePacketEnd() {}
static constexpr void WriteMemoryRead(uint32_t base_ptr, size_t length,
const void* host_ptr = nullptr) {}
static constexpr void WriteMemoryReadCached(uint32_t base_ptr,
size_t length) {}
static constexpr void WriteMemoryReadCachedNop(uint32_t base_ptr,
size_t length) {}
static constexpr void WriteMemoryWrite(uint32_t base_ptr, size_t length,
const void* host_ptr = nullptr) {}
static constexpr void WriteEdramSnapshot(const void* snapshot) {}
static constexpr void WriteEvent(EventCommand::Type event_type) {}
static constexpr void WriteRegisters(uint32_t first_register,
const uint32_t* register_values,
uint32_t register_count,
bool execute_callbacks_on_play) {}
static constexpr void WriteGammaRamp(
const reg::DC_LUT_30_COLOR* gamma_ramp_256_entry_table,
const reg::DC_LUT_PWL_DATA* gamma_ramp_pwl_rgb,
uint32_t gamma_ramp_rw_component) {}
#endif
};
} // namespace gpu

View File

@ -225,6 +225,7 @@ X_STATUS UserModule::LoadContinue() {
ldr_data->xex_header_base = guest_xex_header_;
ldr_data->full_image_size = security_header->image_size;
ldr_data->image_base = this->xex_module()->base_address();
ldr_data->entry_point = entry_point_;
OnLoad();

View File

@ -198,7 +198,8 @@ bool Memory::Initialize() {
// Add handlers for MMIO.
mmio_handler_ = cpu::MMIOHandler::Install(
virtual_membase_, physical_membase_, physical_membase_ + 0x1FFFFFFF,
HostToGuestVirtualThunk, this, AccessViolationCallbackThunk, this);
HostToGuestVirtualThunk, this, AccessViolationCallbackThunk, this,
nullptr, nullptr);
if (!mmio_handler_) {
XELOGE("Unable to install MMIO handlers");
assert_always();
@ -213,6 +214,11 @@ bool Memory::Initialize() {
return true;
}
// Forwards the MMIO access recording callback and its context pointer to the
// installed MMIO handler.
void Memory::SetMMIOExceptionRecordingCallback(
    cpu::MmioAccessRecordCallback callback, void* context) {
  auto& handler = *mmio_handler_;
  handler.SetMMIOExceptionRecordingCallback(callback, context);
}
static const struct {
uint64_t virtual_address_start;
uint64_t virtual_address_end;
@ -1530,7 +1536,8 @@ bool PhysicalHeap::AllocRange(uint32_t low_address, uint32_t high_address,
// Allocates from this heap on behalf of the system by forwarding every
// argument unchanged to Alloc() and returning its result. The allocated
// address is reported through `out_address`.
bool PhysicalHeap::AllocSystemHeap(uint32_t size, uint32_t alignment,
                                   uint32_t allocation_type, uint32_t protect,
                                   bool top_down, uint32_t* out_address) {
  // A single forwarding call; the duplicated, unreachable second return that
  // followed it has been removed.
  return Alloc(size, alignment, allocation_type, protect, top_down,
               out_address);
}
bool PhysicalHeap::Decommit(uint32_t address, uint32_t size) {

View File

@ -498,6 +498,9 @@ class Memory {
bool Save(ByteStream* stream);
bool Restore(ByteStream* stream);
void SetMMIOExceptionRecordingCallback(cpu::MmioAccessRecordCallback callback,
void* context);
private:
int MapViews(uint8_t* mapping_base);
void UnmapViews();

View File

@ -181,7 +181,6 @@ bool Win32Window::OpenImpl() {
SetWindowPlacement(hwnd_, &initial_dpi_placement);
}
}
// Disable rounded corners starting with Windows 11 (or silently receive and
// ignore E_INVALIDARG on Windows versions before 10.0.22000.0), primarily to
// preserve all pixels of the guest output.
@ -189,7 +188,6 @@ bool Win32Window::OpenImpl() {
DwmSetWindowAttribute(hwnd_, DWMWA_WINDOW_CORNER_PREFERENCE,
&window_corner_preference,
sizeof(window_corner_preference));
// Disable flicks.
ATOM atom = GlobalAddAtomW(L"MicrosoftTabletPenServiceProperty");
const DWORD_PTR dwHwndTabletProperty =
@ -1047,7 +1045,9 @@ LRESULT Win32Window::WndProc(HWND hWnd, UINT message, WPARAM wParam,
} break;
case WM_MOVE: {
OnMonitorUpdate(MonitorUpdateEvent(this, false));
// chrispy: fix clang use of temporary error
MonitorUpdateEvent update_event{this, false};
OnMonitorUpdate(update_event);
} break;
case WM_SIZE: {
@ -1084,7 +1084,9 @@ LRESULT Win32Window::WndProc(HWND hWnd, UINT message, WPARAM wParam,
} break;
case WM_DISPLAYCHANGE: {
OnMonitorUpdate(MonitorUpdateEvent(this, true));
// chrispy: fix clang use of temporary error
MonitorUpdateEvent update_event{this, true};
OnMonitorUpdate(update_event);
} break;
case WM_DPICHANGED: {