Merge pull request #74 from chrisps/canary_experimental
Misc optimizations
commit b4224ff3dc
@@ -379,6 +379,9 @@ std::vector<std::unique_ptr<hid::InputDriver>> EmulatorApp::CreateInputDrivers(
}

bool EmulatorApp::OnInitialize() {
#if XE_ARCH_AMD64 == 1
  amd64::InitFeatureFlags();
#endif
  Profiler::Initialize();
  Profiler::ThreadEnter("Main");
@@ -51,7 +51,7 @@ uint64_t last_guest_tick_count_ = 0;
uint64_t last_host_tick_count_ = Clock::QueryHostTickCount();

using tick_mutex_type = xe_unlikely_mutex;
using tick_mutex_type = std::mutex;

// Mutex to ensure last_host_tick_count_ and last_guest_tick_count_ are in sync
// std::mutex tick_mutex_;
@@ -1,7 +1,15 @@
#include "dma.h"
#include "logging.h"
#include "mutex.h"
#include "platform_win.h"
#include "xbyak/xbyak/xbyak_util.h"

XE_NTDLL_IMPORT(NtDelayExecution, cls_NtDelayExecution,
                NtDelayExecutionPointer);
XE_NTDLL_IMPORT(NtAlertThread, cls_NtAlertThread, NtAlertThreadPointer);
XE_NTDLL_IMPORT(NtAlertThreadByThreadId, cls_NtAlertThreadByThreadId,
                NtAlertThreadByThreadId);

template <size_t N, typename... Ts>
static void xedmaloghelper(const char (&fmt)[N], Ts... args) {
  char buffer[1024];
@@ -213,320 +221,140 @@ void vastcpy(uint8_t* XE_RESTRICT physaddr, uint8_t* XE_RESTRICT rdmapping,
          written_length);
}

#define XEDMA_NUM_WORKERS 4
class alignas(256) XeDMACGeneric : public XeDMAC {
#define MAX_INFLIGHT_DMAJOBS 65536
#define INFLICT_DMAJOB_MASK (MAX_INFLIGHT_DMAJOBS - 1)
class XeDMACGeneric : public XeDMAC {
  std::unique_ptr<xe::threading::Thread> thrd_;
  XeDMAJob* jobs_ring_;
  volatile std::atomic<uintptr_t> write_ptr_;

  struct alignas(XE_HOST_CACHE_LINE_SIZE) {
    std::atomic<uint64_t> free_job_slots_;
    std::atomic<uint64_t> jobs_submitted_;
    std::atomic<uint64_t> jobs_completed_;
    std::atomic<uint32_t> num_workers_awoken_;
    std::atomic<uint32_t> current_job_serial_;
  } dma_volatile_;

  alignas(XE_HOST_CACHE_LINE_SIZE) XeDMAJob jobs_[64];

  volatile uint32_t jobserials_[64];

  alignas(XE_HOST_CACHE_LINE_SIZE)
      std::unique_ptr<threading::Event> job_done_signals_[64];
  // really dont like using unique pointer for this...
  std::unique_ptr<threading::Event> job_submitted_signal_;
  std::unique_ptr<threading::Event> job_completed_signal_;

  std::unique_ptr<threading::Thread> scheduler_thread_;
  struct WorkSlice {
    uint8_t* destination;
    uint8_t* source;
    size_t numbytes;
    volatile std::atomic<uintptr_t> read_ptr_;
    xe_mutex push_into_ring_lock_;
  };
  std::unique_ptr<threading::Thread> workers_[XEDMA_NUM_WORKERS];
  std::unique_ptr<threading::Event> worker_has_work_;  //[XEDMA_NUM_WORKERS];
  std::unique_ptr<threading::Event> worker_has_finished_[XEDMA_NUM_WORKERS];

  threading::WaitHandle* worker_has_finished_nosafeptr_[XEDMA_NUM_WORKERS];
  WorkSlice worker_workslice_[XEDMA_NUM_WORKERS];

  // chrispy: this is bad
  static uint32_t find_free_hole_in_dword(uint64_t dw) {
    XEDMALOG("Finding free hole in 0x%llX", dw);

    for (uint32_t i = 0; i < 64; ++i) {
      if (dw & (1ULL << i)) {
        continue;
      }

      return i;
    }
    return ~0U;
  }

  uint32_t allocate_free_dma_slot() {
    XEDMALOG("Allocating free slot");
    uint32_t got_slot = 0;
    uint64_t slots;
    uint64_t allocated_slot;

    do {
      slots = dma_volatile_.free_job_slots_.load();

      got_slot = find_free_hole_in_dword(slots);
      if (!~got_slot) {
        XEDMALOG("Didn't get a slot!");
        return ~0U;
      }
      allocated_slot = slots | (1ULL << got_slot);

    } while (XE_UNLIKELY(!dma_volatile_.free_job_slots_.compare_exchange_strong(
        slots, allocated_slot)));
    XEDMALOG("Allocated slot %d", got_slot);
    return got_slot;
  }
  // chrispy: on x86 this can just be interlockedbittestandreset...
  void free_dma_slot(uint32_t slot) {
    XEDMALOG("Freeing slot %d", slot);
    uint64_t slots;

    uint64_t deallocated_slot;

    do {
      slots = dma_volatile_.free_job_slots_.load();

      deallocated_slot = slots & (~(1ULL << slot));

    } while (XE_UNLIKELY(!dma_volatile_.free_job_slots_.compare_exchange_strong(
        slots, deallocated_slot)));
  }
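  // [editor's sketch, not part of the diff] The comment above alludes to a
  // single-instruction alternative: on MSVC/x64, "lock btr" is exposed as the
  // _interlockedbittestandreset64 intrinsic, which would replace the whole
  // CAS retry loop. Assumes the atomic's storage may be addressed directly.
  void free_dma_slot_btr(uint32_t slot) {
    _interlockedbittestandreset64(
        reinterpret_cast<volatile __int64*>(&dma_volatile_.free_job_slots_),
        static_cast<__int64>(slot));
  }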
  void DoDMAJob(uint32_t idx) {
    XeDMAJob& job = jobs_[idx];
    if (job.precall) {
      job.precall(&job);
    }
    // memcpy(job.destination, job.source, job.size);

    size_t job_size = job.size;

    size_t job_num_lines = job_size / XE_HOST_CACHE_LINE_SIZE;

    size_t line_rounded = job_num_lines * XE_HOST_CACHE_LINE_SIZE;

    size_t rem = job_size - line_rounded;

    size_t num_per_worker = line_rounded / XEDMA_NUM_WORKERS;

    XEDMALOG(
        "Distributing %d bytes from %p to %p across %d workers, remainder is "
        "%d",
        line_rounded, job.source, job.destination, XEDMA_NUM_WORKERS, rem);
    if (num_per_worker < 2048) {
      XEDMALOG("not distributing across workers, num_per_worker < 8192");
      // not worth splitting up
      memcpy(job.destination, job.source, job.size);
      job.signal_on_done->Set();
    } else {
      for (uint32_t i = 0; i < XEDMA_NUM_WORKERS; ++i) {
        worker_workslice_[i].destination =
            (i * num_per_worker) + job.destination;
        worker_workslice_[i].source = (i * num_per_worker) + job.source;

        worker_workslice_[i].numbytes = num_per_worker;
      }
      if (rem) {
        __movsb(job.destination + line_rounded, job.source + line_rounded, rem);
      }
      // wake them up
      worker_has_work_->Set();
      XEDMALOG("Starting waitall for job");
      threading::WaitAll(worker_has_finished_nosafeptr_, XEDMA_NUM_WORKERS,
                         false);

      XEDMALOG("Waitall for job completed!");
      job.signal_on_done->Set();
    }
    if (job.postcall) {
      job.postcall(&job);
    }
    ++dma_volatile_.jobs_completed_;
  }

  void WorkerIter(uint32_t worker_index) {
    xenia_assert(worker_index < XEDMA_NUM_WORKERS);
    auto [dest, src, size] = worker_workslice_[worker_index];

    // if (++dma_volatile_.num_workers_awoken_ == XEDMA_NUM_WORKERS ) {
    worker_has_work_->Reset();
    //}
    xenia_assert(size < (1ULL << 32));
    // memcpy(dest, src, size);
    dma::vastcpy(dest, src, static_cast<uint32_t>(size));
  }
  XE_NOINLINE
  void WorkerMainLoop(uint32_t worker_index) {
    do {
      XEDMALOG("Worker iter for worker %d", worker_index);
      WorkerIter(worker_index);

      XEDMALOG("Worker %d is done\n", worker_index);
      threading::SignalAndWait(worker_has_finished_[worker_index].get(),
                               worker_has_work_.get(), false);
    } while (true);
  }
  void WorkerMain(uint32_t worker_index) {
    XEDMALOG("Entered worker main loop, index %d", worker_index);
    threading::Wait(worker_has_work_.get(), false);
    XEDMALOG("First wait for worker %d completed, first job ever",
             worker_index);
    WorkerMainLoop(worker_index);
  }

  static void WorkerMainForwarder(void* ptr) {
    // we aligned XeDma to 256 bytes and encode extra info in the low 8
    uintptr_t uptr = (uintptr_t)ptr;

    uint32_t worker_index = (uint8_t)uptr;

    uptr &= ~0xFFULL;

    char name_buffer[64];
    sprintf_s(name_buffer, "dma_worker_%d", worker_index);

    xe::threading::set_name(name_buffer);

    reinterpret_cast<XeDMACGeneric*>(uptr)->WorkerMain(worker_index);
  }

  void DMAMain() {
    XEDMALOG("DmaMain");
    do {
      threading::Wait(job_submitted_signal_.get(), false);

      auto slots = dma_volatile_.free_job_slots_.load();

      for (uint32_t i = 0; i < 64; ++i) {
        if (slots & (1ULL << i)) {
          XEDMALOG("Got new job at index %d in DMAMain", i);
          DoDMAJob(i);

          free_dma_slot(i);

          job_completed_signal_->Set();
          // break;
        }
      }

    } while (true);
  }

  static void DMAMainForwarder(void* ud) {
    xe::threading::set_name("dma_main");
    reinterpret_cast<XeDMACGeneric*>(ud)->DMAMain();
  }
  HANDLE gotjob_event;
  void WorkerWait();

 public:
  virtual DMACJobHandle PushDMAJob(XeDMAJob* job) override {
    XEDMALOG("New job, %p to %p with size %d", job->source, job->destination,
             job->size);
    uint32_t slot;
    do {
      slot = allocate_free_dma_slot();
      if (!~slot) {
        XEDMALOG(
            "Didn't get a free slot, waiting for a job to complete before "
            "resuming.");
        threading::Wait(job_completed_signal_.get(), false);
  virtual ~XeDMACGeneric() {}
  void WorkerThreadMain();
  XeDMACGeneric() {
    threading::Thread::CreationParameters crparams;
    crparams.create_suspended = true;
    crparams.initial_priority = threading::ThreadPriority::kNormal;
    crparams.stack_size = 65536;
    gotjob_event = CreateEventA(nullptr, false, false, nullptr);
    thrd_ = std::move(threading::Thread::Create(
        crparams, [this]() { this->WorkerThreadMain(); }));

      } else {
        break;
      }
    jobs_ring_ = (XeDMAJob*)_aligned_malloc(
        MAX_INFLIGHT_DMAJOBS * sizeof(XeDMAJob), XE_HOST_CACHE_LINE_SIZE);

    } while (true);
    jobs_[slot] = *job;
    write_ptr_ = 0;
    read_ptr_ = 0;

    jobs_[slot].signal_on_done = job_done_signals_[slot].get();
    jobs_[slot].signal_on_done->Reset();
    XEDMALOG("Setting job submit signal, pushed into slot %d", slot);

    uint32_t new_serial = dma_volatile_.current_job_serial_++;

    jobserials_[slot] = new_serial;

    ++dma_volatile_.jobs_submitted_;
    job_submitted_signal_->Set();
    return (static_cast<uint64_t>(new_serial) << 32) |
           static_cast<uint64_t>(slot);

    // return job_done_signals_[slot].get();
    thrd_->Resume();
  }

  bool AllJobsDone() {
    return dma_volatile_.jobs_completed_ == dma_volatile_.jobs_submitted_;
  virtual DMACJobHandle PushDMAJob(XeDMAJob* job) override {
    // std::unique_lock<xe_mutex> pushlock{push_into_ring_lock_};
    HANDLE dmacevent = CreateEventA(nullptr, true, false, nullptr);
    {
      job->dmac_specific_ = (uintptr_t)dmacevent;

      jobs_ring_[write_ptr_ % MAX_INFLIGHT_DMAJOBS] = *job;
      write_ptr_++;
      SetEvent(gotjob_event);
    }
    return (DMACJobHandle)dmacevent;
  }
  virtual void WaitJobDone(DMACJobHandle handle) override {
    uint32_t serial = static_cast<uint32_t>(handle >> 32);
    uint32_t jobid = static_cast<uint32_t>(handle);
    do {
      if (jobserials_[jobid] != serial) {
        return;  // done, our slot was reused
    while (WaitForSingleObject((HANDLE)handle, 2) == WAIT_TIMEOUT) {
      // NtAlertThreadByThreadId.invoke<void>(thrd_->system_id());
      // while (SignalObjectAndWait(gotjob_event, (HANDLE)handle, 2, false) ==
      // WAIT_TIMEOUT) {
      // ;
    }
    //}

      auto waitres = threading::Wait(job_done_signals_[jobid].get(), false,
                                     std::chrono::milliseconds{1});

      if (waitres == threading::WaitResult::kTimeout) {
        continue;
      } else {
        return;
      }
    } while (true);
    // SignalObjectAndWait(gotjob_event, (HANDLE)handle, INFINITE, false);
    CloseHandle((HANDLE)handle);
  }
  virtual void WaitForIdle() override {
    while (!AllJobsDone()) {
    while (write_ptr_ != read_ptr_) {
      threading::MaybeYield();
    }
    }
  XeDMACGeneric() {
    XEDMALOG("Constructing xedma at addr %p", this);
    dma_volatile_.free_job_slots_.store(0ULL);
    dma_volatile_.jobs_submitted_.store(0ULL);
    dma_volatile_.jobs_completed_.store(0ULL);
    dma_volatile_.current_job_serial_.store(
        1ULL);  // so that a jobhandle is never 0
    std::memset(jobs_, 0, sizeof(jobs_));
    job_submitted_signal_ = threading::Event::CreateAutoResetEvent(false);
    job_completed_signal_ = threading::Event::CreateAutoResetEvent(false);
    worker_has_work_ = threading::Event::CreateManualResetEvent(false);
    threading::Thread::CreationParameters worker_params{};
    worker_params.create_suspended = false;
    worker_params.initial_priority = threading::ThreadPriority::kBelowNormal;
    worker_params.stack_size = 65536;  // dont need much stack at all

    for (uint32_t i = 0; i < 64; ++i) {
      job_done_signals_[i] = threading::Event::CreateManualResetEvent(false);
    }
    for (uint32_t i = 0; i < XEDMA_NUM_WORKERS; ++i) {
      // worker_has_work_[i] = threading::Event::CreateAutoResetEvent(false);
      worker_has_finished_[i] = threading::Event::CreateAutoResetEvent(false);
      worker_has_finished_nosafeptr_[i] = worker_has_finished_[i].get();

      uintptr_t encoded = reinterpret_cast<uintptr_t>(this);
      xenia_assert(!(encoded & 0xFFULL));
      xenia_assert(i < 256);

      encoded |= i;

      workers_[i] = threading::Thread::Create(worker_params, [encoded]() {
        XeDMACGeneric::WorkerMainForwarder((void*)encoded);
      });
    }
    threading::Thread::CreationParameters scheduler_params{};
    scheduler_params.create_suspended = false;
    scheduler_params.initial_priority = threading::ThreadPriority::kBelowNormal;
    scheduler_params.stack_size = 65536;
    scheduler_thread_ = threading::Thread::Create(scheduler_params, [this]() {
      XeDMACGeneric::DMAMainForwarder((void*)this);
    });
  }
};
void XeDMACGeneric::WorkerWait() {
  constexpr unsigned NUM_PAUSE_SPINS = 2048;
  constexpr unsigned NUM_YIELD_SPINS = 8;
#if 0

  for (unsigned i = 0; i < NUM_PAUSE_SPINS; ++i) {
    if (write_ptr_ == read_ptr_) {
      _mm_pause();
    } else {
      break;
    }
  }
  for (unsigned i = 0; i < NUM_YIELD_SPINS; ++i) {
    if (write_ptr_ == read_ptr_) {
      threading::MaybeYield();
    } else {
      break;
    }
  }
  LARGE_INTEGER yield_execution_delay{};
  yield_execution_delay.QuadPart =
      -2000;  //-10000 == 1 ms, so -2000 means delay for 0.2 milliseconds
  while (write_ptr_ == read_ptr_) {
    NtDelayExecutionPointer.invoke<void>(0, &yield_execution_delay);
  }
#else
  do {
    if (WaitForSingleObjectEx(gotjob_event, 1, TRUE) == WAIT_OBJECT_0) {
      while (write_ptr_ == read_ptr_) {
        _mm_pause();
      }
    }

  } while (write_ptr_ == read_ptr_);
#endif
}
void XeDMACGeneric::WorkerThreadMain() {
  while (true) {
    this->WorkerWait();

    XeDMAJob current_job = jobs_ring_[read_ptr_ % MAX_INFLIGHT_DMAJOBS];
    swcache::ReadFence();

    if (current_job.precall) {
      current_job.precall(&current_job);
    }

    size_t num_lines = current_job.size / XE_HOST_CACHE_LINE_SIZE;
    size_t line_rounded = num_lines * XE_HOST_CACHE_LINE_SIZE;

    size_t line_rem = current_job.size - line_rounded;

    vastcpy(current_job.destination, current_job.source,
            static_cast<uint32_t>(line_rounded));

    if (line_rem) {
      __movsb(current_job.destination + line_rounded,
              current_job.source + line_rounded, line_rem);
    }

    if (current_job.postcall) {
      current_job.postcall(&current_job);
    }
    read_ptr_++;
    swcache::WriteFence();

    SetEvent((HANDLE)current_job.dmac_specific_);
  }
}

XeDMAC* CreateDMAC() { return new XeDMACGeneric(); }
}  // namespace xe::dma
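Worth noting about the hunk above: it interleaves two generations of PushDMAJob. The old slot-based version packs a monotonically increasing serial plus the slot index into the returned handle, which is how WaitJobDone detects that a slot was recycled; the new ring version hands back an event HANDLE directly. A hedged sketch of the old encoding (helper names are hypothetical, and it assumes DMACJobHandle is a 64-bit integer, as the casts suggest):

static DMACJobHandle MakeSlotHandle(uint32_t serial, uint32_t slot) {
  return (static_cast<uint64_t>(serial) << 32) | static_cast<uint64_t>(slot);
}
static uint32_t HandleSerial(DMACJobHandle h) {
  return static_cast<uint32_t>(h >> 32);
}
static uint32_t HandleSlot(DMACJobHandle h) {
  return static_cast<uint32_t>(h);  // low 32 bits; always < 64 here
}
// If jobserials_[HandleSlot(h)] != HandleSerial(h), the slot was reused,
// meaning the original job already completed.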
@@ -16,7 +16,8 @@ struct XeDMAJob;
using DmaPrecall = void (*)(XeDMAJob* job);
using DmaPostcall = void (*)(XeDMAJob* job);
struct XeDMAJob {
  threading::Event* signal_on_done;
  // threading::Event* signal_on_done;
  uintptr_t dmac_specific_;
  uint8_t* destination;
  uint8_t* source;
  size_t size;
@@ -472,7 +472,7 @@ bool logging::internal::ShouldLog(LogLevel log_level) {
std::pair<char*, size_t> logging::internal::GetThreadBuffer() {
  return {thread_log_buffer_, sizeof(thread_log_buffer_)};
}

XE_NOALIAS
void logging::internal::AppendLogLine(LogLevel log_level,
                                      const char prefix_char, size_t written) {
  if (!logger_ || !ShouldLog(log_level) || !written) {
@@ -74,11 +74,15 @@ namespace internal {

bool ShouldLog(LogLevel log_level);
std::pair<char*, size_t> GetThreadBuffer();

XE_NOALIAS
void AppendLogLine(LogLevel log_level, const char prefix_char, size_t written);

}  // namespace internal
// technically, noalias is incorrect here, these functions do in fact alias
// global memory, but msvc will not optimize the calls away, and the global
// memory modified by the calls is limited to internal logging variables, so
// it might as well be noalias
template <typename... Args>
XE_NOALIAS
XE_NOINLINE XE_COLD static void AppendLogLineFormat_Impl(LogLevel log_level,
                                                         const char prefix_char,
                                                         const char* format,
@@ -400,10 +400,91 @@ static float ArchReciprocal(float den) {
  return _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ss(den)));
}

#if 0
using ArchFloatMask = float;

XE_FORCEINLINE
static ArchFloatMask ArchCmpneqFloatMask(float x, float y) {
  return _mm_cvtss_f32(_mm_cmpneq_ss(_mm_set_ss(x), _mm_set_ss(y)));
}
XE_FORCEINLINE
static ArchFloatMask ArchORFloatMask(ArchFloatMask x, ArchFloatMask y) {
  return _mm_cvtss_f32(_mm_or_ps(_mm_set_ss(x), _mm_set_ss(y)));
}
XE_FORCEINLINE
static ArchFloatMask ArchXORFloatMask(ArchFloatMask x, ArchFloatMask y) {
  return _mm_cvtss_f32(_mm_xor_ps(_mm_set_ss(x), _mm_set_ss(y)));
}

XE_FORCEINLINE
static ArchFloatMask ArchANDFloatMask(ArchFloatMask x, ArchFloatMask y) {
  return _mm_cvtss_f32(_mm_and_ps(_mm_set_ss(x), _mm_set_ss(y)));
}

XE_FORCEINLINE
static uint32_t ArchFloatMaskSignbit(ArchFloatMask x) {
  return static_cast<uint32_t>(_mm_movemask_ps(_mm_set_ss(x)));
}

constexpr ArchFloatMask floatmask_zero = .0f;
#else
using ArchFloatMask = __m128;

XE_FORCEINLINE
static ArchFloatMask ArchCmpneqFloatMask(float x, float y) {
  return _mm_cmpneq_ss(_mm_set_ss(x), _mm_set_ss(y));
}
XE_FORCEINLINE
static ArchFloatMask ArchORFloatMask(ArchFloatMask x, ArchFloatMask y) {
  return _mm_or_ps(x, y);
}
XE_FORCEINLINE
static ArchFloatMask ArchXORFloatMask(ArchFloatMask x, ArchFloatMask y) {
  return _mm_xor_ps(x, y);
}

XE_FORCEINLINE
static ArchFloatMask ArchANDFloatMask(ArchFloatMask x, ArchFloatMask y) {
  return _mm_and_ps(x, y);
}

XE_FORCEINLINE
static uint32_t ArchFloatMaskSignbit(ArchFloatMask x) {
  return static_cast<uint32_t>(_mm_movemask_ps(x) & 1);
}

constexpr ArchFloatMask floatmask_zero{.0f};
#endif
#else
static float ArchMin(float x, float y) { return std::min<float>(x, y); }
static float ArchMax(float x, float y) { return std::max<float>(x, y); }
static float ArchReciprocal(float den) { return 1.0f / den; }
using ArchFloatMask = unsigned;

XE_FORCEINLINE
static ArchFloatMask ArchCmpneqFloatMask(float x, float y) {
  return static_cast<unsigned>(-static_cast<signed>(x != y));
}

XE_FORCEINLINE
static ArchFloatMask ArchORFloatMask(ArchFloatMask x, ArchFloatMask y) {
  return x | y;
}
XE_FORCEINLINE
static ArchFloatMask ArchXORFloatMask(ArchFloatMask x, ArchFloatMask y) {
  return x ^ y;
}

XE_FORCEINLINE
static ArchFloatMask ArchANDFloatMask(ArchFloatMask x, ArchFloatMask y) {
  return x & y;
}
constexpr ArchFloatMask floatmask_zero = 0;

XE_FORCEINLINE
static uint32_t ArchFloatMaskSignbit(ArchFloatMask x) { return x >> 31; }

#endif
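// [editor's sketch, not from the diff] How the mask helpers compose: test
// whether either of two float pairs differs, branching on the collected
// mask bit instead of materializing a bool per comparison. Works under
// both the __m128 and the scalar-unsigned definitions above.
static bool AnyPairDiffers(float a0, float b0, float a1, float b1) {
  ArchFloatMask mask =
      ArchORFloatMask(ArchCmpneqFloatMask(a0, b0), ArchCmpneqFloatMask(a1, b1));
  return ArchFloatMaskSignbit(mask) != 0;
}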
XE_FORCEINLINE
static float RefineReciprocal(float initial, float den) {
@@ -115,14 +115,17 @@
#define XE_COLD __declspec(code_seg(".cold"))
#define XE_LIKELY(...) (!!(__VA_ARGS__))
#define XE_UNLIKELY(...) (!!(__VA_ARGS__))

#define XE_MSVC_ASSUME(...) __assume(__VA_ARGS__)
#define XE_NOALIAS __declspec(noalias)
#elif XE_COMPILER_HAS_GNU_EXTENSIONS == 1
#define XE_FORCEINLINE __attribute__((always_inline))
#define XE_NOINLINE __attribute__((noinline))
#define XE_COLD __attribute__((cold))
#define XE_LIKELY(...) __builtin_expect(!!(__VA_ARGS__), true)
#define XE_UNLIKELY(...) __builtin_expect(!!(__VA_ARGS__), false)

#define XE_NOALIAS
// cant do unevaluated assume
#define XE_MSVC_ASSUME(...) static_cast<void>(0)
#else
#define XE_FORCEINLINE inline
#define XE_NOINLINE
@@ -130,6 +133,9 @@

#define XE_LIKELY_IF(...) if (!!(__VA_ARGS__)) [[likely]]
#define XE_UNLIKELY_IF(...) if (!!(__VA_ARGS__)) [[unlikely]]
#define XE_NOALIAS
#define XE_MSVC_ASSUME(...) static_cast<void>(0)

#endif

#if XE_COMPILER_HAS_GNU_EXTENSIONS == 1
@@ -174,5 +180,7 @@ const char kPathSeparator = '/';
const char kGuestPathSeparator = '\\';

}  // namespace xe

#if XE_ARCH_AMD64 == 1
#include "platform_amd64.h"
#endif
#endif  // XENIA_BASE_PLATFORM_H_
@@ -0,0 +1,115 @@
/**
 ******************************************************************************
 * Xenia : Xbox 360 Emulator Research Project                                 *
 ******************************************************************************
 * Copyright 2020 Ben Vanik. All rights reserved.                             *
 * Released under the BSD license - see LICENSE in the root for more details. *
 ******************************************************************************
 */

#include "xenia/base/cvar.h"
#include "xenia/base/platform.h"

#include "third_party/xbyak/xbyak/xbyak.h"
#include "third_party/xbyak/xbyak/xbyak_util.h"
DEFINE_int32(x64_extension_mask, -1,
             "Allow the detection and utilization of specific instruction set "
             "features.\n"
             " 0 = x86_64 + AVX1\n"
             " 1 = AVX2\n"
             " 2 = FMA\n"
             " 4 = LZCNT\n"
             " 8 = BMI1\n"
             " 16 = BMI2\n"
             " 32 = F16C\n"
             " 64 = Movbe\n"
             " 128 = GFNI\n"
             " 256 = AVX512F\n"
             " 512 = AVX512VL\n"
             " 1024 = AVX512BW\n"
             " 2048 = AVX512DQ\n"
             " -1 = Detect and utilize all possible processor features\n",
             "x64");
namespace xe {
namespace amd64 {
static uint32_t g_feature_flags = 0U;
static bool g_did_initialize_feature_flags = false;
uint32_t GetFeatureFlags() {
  xenia_assert(g_did_initialize_feature_flags);
  return g_feature_flags;
}
XE_COLD
XE_NOINLINE
void InitFeatureFlags() {
  uint32_t feature_flags_ = 0U;

  Xbyak::util::Cpu cpu_;
#define TEST_EMIT_FEATURE(emit, ext)                \
  if ((cvars::x64_extension_mask & emit) == emit) { \
    feature_flags_ |= (cpu_.has(ext) ? emit : 0);   \
  }

  TEST_EMIT_FEATURE(kX64EmitAVX2, Xbyak::util::Cpu::tAVX2);
  TEST_EMIT_FEATURE(kX64EmitFMA, Xbyak::util::Cpu::tFMA);
  TEST_EMIT_FEATURE(kX64EmitLZCNT, Xbyak::util::Cpu::tLZCNT);
  TEST_EMIT_FEATURE(kX64EmitBMI1, Xbyak::util::Cpu::tBMI1);
  TEST_EMIT_FEATURE(kX64EmitBMI2, Xbyak::util::Cpu::tBMI2);
  TEST_EMIT_FEATURE(kX64EmitMovbe, Xbyak::util::Cpu::tMOVBE);
  TEST_EMIT_FEATURE(kX64EmitGFNI, Xbyak::util::Cpu::tGFNI);
  TEST_EMIT_FEATURE(kX64EmitAVX512F, Xbyak::util::Cpu::tAVX512F);
  TEST_EMIT_FEATURE(kX64EmitAVX512VL, Xbyak::util::Cpu::tAVX512VL);
  TEST_EMIT_FEATURE(kX64EmitAVX512BW, Xbyak::util::Cpu::tAVX512BW);
  TEST_EMIT_FEATURE(kX64EmitAVX512DQ, Xbyak::util::Cpu::tAVX512DQ);
  TEST_EMIT_FEATURE(kX64EmitAVX512VBMI, Xbyak::util::Cpu::tAVX512VBMI);
  TEST_EMIT_FEATURE(kX64EmitPrefetchW, Xbyak::util::Cpu::tPREFETCHW);
#undef TEST_EMIT_FEATURE
  /*
    fix for xbyak bug/omission, amd cpus are never checked for lzcnt. fixed in
    latest version of xbyak
  */
  unsigned int data[4];
  Xbyak::util::Cpu::getCpuid(0x80000001, data);
  unsigned amd_flags = data[2];
  if (amd_flags & (1U << 5)) {
    if ((cvars::x64_extension_mask & kX64EmitLZCNT) == kX64EmitLZCNT) {
      feature_flags_ |= kX64EmitLZCNT;
    }
  }
  // todo: although not reported by cpuid, zen 1 and zen+ also have fma4
  if (amd_flags & (1U << 16)) {
    if ((cvars::x64_extension_mask & kX64EmitFMA4) == kX64EmitFMA4) {
      feature_flags_ |= kX64EmitFMA4;
    }
  }
  if (amd_flags & (1U << 21)) {
    if ((cvars::x64_extension_mask & kX64EmitTBM) == kX64EmitTBM) {
      feature_flags_ |= kX64EmitTBM;
    }
  }
  if (amd_flags & (1U << 11)) {
    if ((cvars::x64_extension_mask & kX64EmitXOP) == kX64EmitXOP) {
      feature_flags_ |= kX64EmitXOP;
    }
  }
  if (cpu_.has(Xbyak::util::Cpu::tAMD)) {
    bool is_zennish = cpu_.displayFamily >= 0x17;
    /*
      chrispy: according to agner's tables, all amd architectures that
      we support (ones with avx) have the same timings for
      jrcxz/loop/loope/loopne as for other jmps
    */
    feature_flags_ |= kX64FastJrcx;
    feature_flags_ |= kX64FastLoop;
    if (is_zennish) {
      // ik that i heard somewhere that this is the case for zen, but i need to
      // verify. cant find my original source for that.
      // todo: ask agner?
      feature_flags_ |= kX64FlagsIndependentVars;
    }
  }
  g_feature_flags = feature_flags_;
  g_did_initialize_feature_flags = true;
}
}  // namespace amd64
}  // namespace xe
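For context, this is how the rest of the commit consumes these flags: EmulatorApp::OnInitialize calls InitFeatureFlags once at startup, and X64Emitter now does feature_flags_ = amd64::GetFeatureFlags() instead of probing cpuid itself. A minimal usage sketch:

xe::amd64::InitFeatureFlags();  // once, early (see the app.cc hunk above)

uint32_t flags = xe::amd64::GetFeatureFlags();
if (flags & xe::amd64::kX64EmitAVX2) {
  // safe to emit AVX2 code paths
}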
@@ -0,0 +1,61 @@
/**
 ******************************************************************************
 * Xenia : Xbox 360 Emulator Research Project                                 *
 ******************************************************************************
 * Copyright 2019 Ben Vanik. All rights reserved.                             *
 * Released under the BSD license - see LICENSE in the root for more details. *
 ******************************************************************************
 */

#ifndef XENIA_BASE_PLATFORM_AMD64_H_
#define XENIA_BASE_PLATFORM_AMD64_H_
#include <cstdint>

namespace xe {
namespace amd64 {
enum X64FeatureFlags {
  kX64EmitAVX2 = 1 << 0,
  kX64EmitFMA = 1 << 1,
  kX64EmitLZCNT = 1 << 2,  // this is actually ABM and includes popcount
  kX64EmitBMI1 = 1 << 3,
  kX64EmitBMI2 = 1 << 4,
  kX64EmitPrefetchW = 1 << 5,
  kX64EmitMovbe = 1 << 6,
  kX64EmitGFNI = 1 << 7,

  kX64EmitAVX512F = 1 << 8,
  kX64EmitAVX512VL = 1 << 9,

  kX64EmitAVX512BW = 1 << 10,
  kX64EmitAVX512DQ = 1 << 11,

  kX64EmitAVX512Ortho = kX64EmitAVX512F | kX64EmitAVX512VL,
  kX64EmitAVX512Ortho64 = kX64EmitAVX512Ortho | kX64EmitAVX512DQ,
  kX64FastJrcx = 1 << 12,  // jrcxz is as fast as any other jump ( >= Zen1)
  kX64FastLoop =
      1 << 13,  // loop/loope/loopne is as fast as any other jump ( >= Zen2)
  kX64EmitAVX512VBMI = 1 << 14,
  kX64FlagsIndependentVars =
      1 << 15,  // if true, instructions that only modify some flags (like
                // inc/dec) do not introduce false dependencies on EFLAGS
                // because the individual flags are treated as different vars by
                // the processor. (this applies to zen)
  kX64EmitXOP = 1 << 16,   // chrispy: xop maps really well to many vmx
                           // instructions, and FX users need the boost
  kX64EmitFMA4 = 1 << 17,  // todo: also use on zen1?
  kX64EmitTBM = 1 << 18,
  // kX64XMMRegisterMergeOptimization = 1 << 19, //section 2.11.5, amd family
  // 17h/19h optimization manuals. allows us to save 1 byte on certain xmm
  // instructions by using the legacy sse version if we recently cleared the
  // high 128 bits of the
};

XE_NOALIAS
uint32_t GetFeatureFlags();
XE_COLD
void InitFeatureFlags();

}  // namespace amd64
}  // namespace xe

#endif  // XENIA_BASE_PLATFORM_AMD64_H_
@@ -0,0 +1,40 @@
#pragma once
namespace xe {
/*
  a very simple freelist, intended to be used with HIRFunction/Arena to
  eliminate our last-level cache miss problems with HIR simplifications
  not thread safe, doesnt need to be
*/
template <typename T>
struct SimpleFreelist {
  union Node {
    union Node* next_;
    T entry_;
  };
  Node* head_;

  static_assert(sizeof(T) >= sizeof(void*));
  SimpleFreelist() : head_(nullptr) {}
  T* NewEntry() {
    Node* result_node = head_;
    if (!result_node) {
      return nullptr;
    } else {
      head_ = result_node->next_;

      memset(result_node, 0, sizeof(T));
      return &result_node->entry_;
      // return new (&result_node->entry_) T(args...);
    }
  }

  void DeleteEntry(T* value) {
    memset(value, 0, sizeof(T));
    Node* node = reinterpret_cast<Node*>(value);
    node->next_ = head_;
    head_ = node;
  }
  void Reset() { head_ = nullptr; }
};
}  // namespace xe
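A minimal usage sketch (the payload type and the arena fallback are hypothetical; NewEntry returns nullptr until something has been recycled, since the freelist never allocates on its own):

struct ExampleNode {  // hypothetical payload; must be >= pointer-sized
  uint64_t a;
  uint64_t b;
};

xe::SimpleFreelist<ExampleNode> pool;

ExampleNode* node = pool.NewEntry();
if (!node) {
  node = AllocateFromArena();  // hypothetical arena fallback
}
// ... use node ...
pool.DeleteEntry(node);  // zeroes it and pushes it onto the list head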
@@ -50,6 +50,9 @@ XE_NTDLL_IMPORT(NtPulseEvent, cls_NtPulseEvent, NtPulseEventPointer);
// counts
XE_NTDLL_IMPORT(NtReleaseSemaphore, cls_NtReleaseSemaphore,
                NtReleaseSemaphorePointer);

XE_NTDLL_IMPORT(NtDelayExecution, cls_NtDelayExecution,
                NtDelayExecutionPointer);
namespace xe {
namespace threading {
@@ -109,13 +112,30 @@ void set_name(const std::string_view name) {
  set_name(GetCurrentThread(), name);
}

// checked ntoskrnl, it does not modify delay, so we can place this as a
// constant and avoid creating a stack variable
static const LARGE_INTEGER sleepdelay0_for_maybeyield{0LL};

void MaybeYield() {
#if 0
#if defined(XE_USE_NTDLL_FUNCTIONS)

  NtYieldExecutionPointer.invoke();
#else
  SwitchToThread();
#endif

#else
  // chrispy: SwitchToThread will only switch to a ready thread on the current
  // processor, so if one is not ready we end up spinning, constantly calling
  // switchtothread without doing any work, heating up the users cpu. sleep(0)
  // however will yield to threads on other processors and surrenders the
  // current timeslice
#if defined(XE_USE_NTDLL_FUNCTIONS)
  NtDelayExecutionPointer.invoke(0, &sleepdelay0_for_maybeyield);
#else
  ::Sleep(0);
#endif
#endif
  // memorybarrier is really not necessary here...
  MemoryBarrier();
}
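Both this function and XeDMACGeneric::WorkerWait hand NtDelayExecution a relative delay; the unit convention, per the "-10000 == 1 ms" comment in the dma hunk, is a negative count of 100 ns ticks. A hedged helper sketch (not part of the diff):

static LARGE_INTEGER MakeRelativeNtDelay(int64_t milliseconds) {
  LARGE_INTEGER delay;
  delay.QuadPart = -10000LL * milliseconds;  // -10000 == 1 ms
  return delay;
}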
@@ -26,24 +26,6 @@
#include "xenia/cpu/processor.h"
#include "xenia/cpu/stack_walker.h"
#include "xenia/cpu/xex_module.h"
DEFINE_int32(x64_extension_mask, -1,
             "Allow the detection and utilization of specific instruction set "
             "features.\n"
             " 0 = x86_64 + AVX1\n"
             " 1 = AVX2\n"
             " 2 = FMA\n"
             " 4 = LZCNT\n"
             " 8 = BMI1\n"
             " 16 = BMI2\n"
             " 32 = F16C\n"
             " 64 = Movbe\n"
             " 128 = GFNI\n"
             " 256 = AVX512F\n"
             " 512 = AVX512VL\n"
             " 1024 = AVX512BW\n"
             " 2048 = AVX512DQ\n"
             " -1 = Detect and utilize all possible processor features\n",
             "x64");

DEFINE_bool(record_mmio_access_exceptions, true,
            "For guest addresses records whether we caught any mmio accesses "
@@ -103,7 +103,9 @@ X64Emitter::X64Emitter(X64Backend* backend, XbyakAllocator* allocator)
                "FAQ for system requirements at https://xenia.jp");
    return;
  }

#if 1
  feature_flags_ = amd64::GetFeatureFlags();
#else
#define TEST_EMIT_FEATURE(emit, ext)                \
  if ((cvars::x64_extension_mask & emit) == emit) { \
    feature_flags_ |= (cpu_.has(ext) ? emit : 0);   \
@@ -168,6 +170,7 @@ X64Emitter::X64Emitter(X64Backend* backend, XbyakAllocator* allocator)
      feature_flags_ |= kX64FlagsIndependentVars;
    }
  }
#endif
  may_use_membase32_as_zero_reg_ =
      static_cast<uint32_t>(reinterpret_cast<uintptr_t>(
          processor()->memory()->virtual_membase())) == 0;
@@ -913,6 +916,8 @@ static inline vec128_t v128_setr_bytes(unsigned char v0, unsigned char v1,

static const vec128_t xmm_consts[] = {
    /* XMMZero */ vec128f(0.0f),
    /* XMMByteSwapMask */
    vec128i(0x00010203u, 0x04050607u, 0x08090A0Bu, 0x0C0D0E0Fu),
    /* XMMOne */ vec128f(1.0f),
    /* XMMOnePD */ vec128d(1.0),
    /* XMMNegativeOne */ vec128f(-1.0f, -1.0f, -1.0f, -1.0f),
@@ -937,8 +942,7 @@ static const vec128_t xmm_consts[] = {
    vec128i(0x7FFFFFFFu, 0x7FFFFFFFu, 0x7FFFFFFFu, 0x7FFFFFFFu),
    /* XMMAbsMaskPD */
    vec128i(0xFFFFFFFFu, 0x7FFFFFFFu, 0xFFFFFFFFu, 0x7FFFFFFFu),
    /* XMMByteSwapMask */
    vec128i(0x00010203u, 0x04050607u, 0x08090A0Bu, 0x0C0D0E0Fu),

    /* XMMByteOrderMask */
    vec128i(0x01000302u, 0x05040706u, 0x09080B0Au, 0x0D0C0F0Eu),
    /* XMMPermuteControl15 */ vec128b(15),
@@ -34,7 +34,7 @@ namespace xe {
namespace cpu {
namespace backend {
namespace x64 {

using namespace amd64;
class X64Backend;
class X64CodeCache;
@@ -81,6 +81,7 @@ static SimdDomain PickDomain2(SimdDomain dom1, SimdDomain dom2) {
}
enum XmmConst {
  XMMZero = 0,
  XMMByteSwapMask,
  XMMOne,
  XMMOnePD,
  XMMNegativeOne,
@@ -97,7 +98,7 @@ enum XmmConst {
  XMMSignMaskPD,
  XMMAbsMaskPS,
  XMMAbsMaskPD,
  XMMByteSwapMask,

  XMMByteOrderMask,
  XMMPermuteControl15,
  XMMPermuteByteMask,
@@ -189,42 +190,6 @@ class XbyakAllocator : public Xbyak::Allocator {
  virtual bool useProtect() const { return false; }
};

enum X64EmitterFeatureFlags {
  kX64EmitAVX2 = 1 << 0,
  kX64EmitFMA = 1 << 1,
  kX64EmitLZCNT = 1 << 2,  // this is actually ABM and includes popcount
  kX64EmitBMI1 = 1 << 3,
  kX64EmitBMI2 = 1 << 4,
  kX64EmitPrefetchW = 1 << 5,
  kX64EmitMovbe = 1 << 6,
  kX64EmitGFNI = 1 << 7,

  kX64EmitAVX512F = 1 << 8,
  kX64EmitAVX512VL = 1 << 9,

  kX64EmitAVX512BW = 1 << 10,
  kX64EmitAVX512DQ = 1 << 11,

  kX64EmitAVX512Ortho = kX64EmitAVX512F | kX64EmitAVX512VL,
  kX64EmitAVX512Ortho64 = kX64EmitAVX512Ortho | kX64EmitAVX512DQ,
  kX64FastJrcx = 1 << 12,  // jrcxz is as fast as any other jump ( >= Zen1)
  kX64FastLoop =
      1 << 13,  // loop/loope/loopne is as fast as any other jump ( >= Zen2)
  kX64EmitAVX512VBMI = 1 << 14,
  kX64FlagsIndependentVars =
      1 << 15,  // if true, instructions that only modify some flags (like
                // inc/dec) do not introduce false dependencies on EFLAGS
                // because the individual flags are treated as different vars by
                // the processor. (this applies to zen)
  kX64EmitXOP = 1 << 16,   // chrispy: xop maps really well to many vmx
                           // instructions, and FX users need the boost
  kX64EmitFMA4 = 1 << 17,  // todo: also use on zen1?
  kX64EmitTBM = 1 << 18,
  // kX64XMMRegisterMergeOptimization = 1 << 19, //section 2.11.5, amd family
  // 17h/19h optimization manuals. allows us to save 1 byte on certain xmm
  // instructions by using the legacy sse version if we recently cleared the
  // high 128 bits of the
};
class ResolvableGuestCall {
 public:
  bool is_jump_;
@@ -1354,15 +1354,17 @@ struct VECTOR_SHA_V128
  static void EmitInt8(X64Emitter& e, const EmitArgType& i) {
    // TODO(benvanik): native version (with shift magic).
    if (i.src2.is_constant) {
      if (e.IsFeatureEnabled(kX64EmitGFNI)) {
        const auto& shamt = i.src2.constant();
        bool all_same = true;
        for (size_t n = 0; n < 16 - n; ++n) {
          if (shamt.u8[n] != shamt.u8[n + 1]) {
            all_same = false;
            break;
          }
      const auto& shamt = i.src2.constant();
      bool all_same = true;
      for (size_t n = 0; n < 16 - n; ++n) {
        if (shamt.u8[n] != shamt.u8[n + 1]) {
          all_same = false;
          break;
        }
      }

      if (e.IsFeatureEnabled(kX64EmitGFNI)) {
        if (all_same) {
          // Every count is the same, so we can use gf2p8affineqb.
          const uint8_t shift_amount = shamt.u8[0] & 0b111;
@@ -1375,6 +1377,19 @@ struct VECTOR_SHA_V128
          return;
        }
      }
      else if (all_same) {
        Xmm to_be_shifted = GetInputRegOrConstant(e, i.src1, e.xmm1);

        e.vpmovsxbw(e.xmm0, to_be_shifted);  //_mm_srai_epi16 / psraw
        e.vpunpckhqdq(e.xmm2, to_be_shifted, to_be_shifted);
        e.vpmovsxbw(e.xmm1, e.xmm2);
        e.vpsraw(e.xmm0, shamt.u8[0]);
        e.vpsraw(e.xmm1, shamt.u8[0]);
        e.vpacksswb(i.dest, e.xmm0, e.xmm1);
        return;
      }
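      // [editor's sketch, not from the diff] The same trick in plain
      // intrinsics: x86 has no per-byte arithmetic shift, so sign-extend each
      // half to 16-bit lanes (pmovsxbw, SSE4.1), shift words, and re-pack with
      // signed saturation; the shifted values always fit back into int8, so
      // packsswb is lossless here.
      // static __m128i sra_epi8(__m128i v, int amount) {
      //   __m128i count = _mm_cvtsi32_si128(amount & 7);
      //   __m128i lo = _mm_cvtepi8_epi16(v);                        // low 8
      //   __m128i hi = _mm_cvtepi8_epi16(_mm_unpackhi_epi64(v, v)); // high 8
      //   lo = _mm_sra_epi16(lo, count);
      //   hi = _mm_sra_epi16(hi, count);
      //   return _mm_packs_epi16(lo, hi);
      // }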

      e.lea(e.GetNativeParam(1), e.StashConstantXmm(1, i.src2.constant()));
    } else {
      e.lea(e.GetNativeParam(1), e.StashXmm(1, i.src2));
@@ -3234,7 +3234,17 @@ struct SET_ROUNDING_MODE_I32
  }
};
EMITTER_OPCODE_TABLE(OPCODE_SET_ROUNDING_MODE, SET_ROUNDING_MODE_I32);

// ============================================================================
// OPCODE_DELAY_EXECUTION
// ============================================================================
struct DELAY_EXECUTION
    : Sequence<DELAY_EXECUTION, I<OPCODE_DELAY_EXECUTION, VoidOp>> {
  static void Emit(X64Emitter& e, const EmitArgType& i) {
    // todo: what if they dont have smt?
    e.pause();
  }
};
EMITTER_OPCODE_TABLE(OPCODE_DELAY_EXECUTION, DELAY_EXECUTION);
// Include anchors to other sequence sources so they get included in the build.
extern volatile int anchor_control;
static int anchor_control_dest = anchor_control;
@@ -98,7 +98,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
        if (i->src1.value->IsConstantTrue()) {
          i->Replace(&OPCODE_DEBUG_BREAK_info, i->flags);
        } else {
          i->Remove();
          i->UnlinkAndNOP();
        }
        result = true;
      }
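This and the hunks that follow are one mechanical change: every constant fold that used to call i->Remove() now calls i->UnlinkAndNOP(). A hedged distillation of the repeated shape (the helper is hypothetical; it assumes, per the rename, that UnlinkAndNOP detaches the instruction without destroying it):

template <typename FoldFn>
static bool FoldUnaryConstant(hir::Value* v, hir::Instr* i, FoldFn fold) {
  if (!i->src1.value->IsConstant()) {
    return false;
  }
  v->set_from(i->src1.value);
  fold(v);            // e.g. v->Neg(), v->Not(), v->ByteSwap(), ...
  i->UnlinkAndNOP();  // previously i->Remove()
  return true;
}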
@@ -109,7 +109,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
        if (i->src1.value->IsConstantTrue()) {
          i->Replace(&OPCODE_TRAP_info, i->flags);
        } else {
          i->Remove();
          i->UnlinkAndNOP();
        }
        result = true;
      }

@@ -122,7 +122,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
          i->Replace(&OPCODE_CALL_info, i->flags);
          i->src1.symbol = symbol;
        } else {
          i->Remove();
          i->UnlinkAndNOP();
        }
        result = true;
      }

@@ -146,7 +146,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
          i->Replace(&OPCODE_CALL_INDIRECT_info, i->flags);
          i->set_src1(value);
        } else {
          i->Remove();
          i->UnlinkAndNOP();
        }
        result = true;
      } else if (i->src2.value->IsConstant()) {  // chrispy: fix h3 bug from

@@ -172,7 +172,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
          i->Replace(&OPCODE_BRANCH_info, i->flags);
          i->src1.label = label;
        } else {
          i->Remove();
          i->UnlinkAndNOP();
        }
        result = true;
      }

@@ -184,7 +184,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
          i->Replace(&OPCODE_BRANCH_info, i->flags);
          i->src1.label = label;
        } else {
          i->Remove();
          i->UnlinkAndNOP();
        }
        result = true;
      }

@@ -195,7 +195,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
          TypeName target_type = v->type;
          v->set_from(i->src1.value);
          v->Cast(target_type);
          i->Remove();
          i->UnlinkAndNOP();
          result = true;
        }
        break;

@@ -204,7 +204,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
          TypeName target_type = v->type;
          v->set_from(i->src1.value);
          v->Convert(target_type, RoundMode(i->flags));
          i->Remove();
          i->UnlinkAndNOP();
          result = true;
        }
        break;

@@ -212,7 +212,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
        if (i->src1.value->IsConstant()) {
          v->set_from(i->src1.value);
          v->Round(RoundMode(i->flags));
          i->Remove();
          i->UnlinkAndNOP();
          result = true;
        }
        break;
@@ -221,7 +221,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
          TypeName target_type = v->type;
          v->set_from(i->src1.value);
          v->ZeroExtend(target_type);
          i->Remove();
          i->UnlinkAndNOP();
          result = true;
        }
        break;

@@ -230,7 +230,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
          TypeName target_type = v->type;
          v->set_from(i->src1.value);
          v->SignExtend(target_type);
          i->Remove();
          i->UnlinkAndNOP();
          result = true;
        }
        break;

@@ -239,7 +239,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
          TypeName target_type = v->type;
          v->set_from(i->src1.value);
          v->Truncate(target_type);
          i->Remove();
          i->UnlinkAndNOP();
          result = true;
        }
        break;

@@ -247,7 +247,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
        if (i->src1.value->IsConstant()) {
          if (!(i->src1.value->AsUint32() & 0xF)) {
            v->set_zero(VEC128_TYPE);
            i->Remove();
            i->UnlinkAndNOP();
            result = true;
            break;
          }

@@ -281,22 +281,22 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
            switch (v->type) {
              case INT8_TYPE:
                v->set_constant(xe::load<uint8_t>(host_addr));
                i->Remove();
                i->UnlinkAndNOP();
                result = true;
                break;
              case INT16_TYPE:
                v->set_constant(xe::load<uint16_t>(host_addr));
                i->Remove();
                i->UnlinkAndNOP();
                result = true;
                break;
              case INT32_TYPE:
                v->set_constant(xe::load<uint32_t>(host_addr));
                i->Remove();
                i->UnlinkAndNOP();
                result = true;
                break;
              case INT64_TYPE:
                v->set_constant(xe::load<uint64_t>(host_addr));
                i->Remove();
                i->UnlinkAndNOP();
                result = true;
                break;
              case VEC128_TYPE:

@@ -304,7 +304,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
                val.low = xe::load<uint64_t>(host_addr);
                val.high = xe::load<uint64_t>(host_addr + 8);
                v->set_constant(val);
                i->Remove();
                i->UnlinkAndNOP();
                result = true;
                break;
              default:
@@ -357,14 +357,14 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
              i->src3.value->IsConstant()) {
            v->set_from(i->src2.value);
            v->Select(i->src3.value, i->src1.value);
            i->Remove();
            i->UnlinkAndNOP();
            result = true;
          }
        } else {
          if (i->src2.value->IsConstant() && i->src3.value->IsConstant()) {
            v->set_from(i->src2.value);
            v->Select(i->src3.value, i->src1.value);
            i->Remove();
            i->UnlinkAndNOP();
            result = true;
          }
        }

@@ -381,7 +381,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
          } else {
            v->set_constant(uint8_t(0));
          }
          i->Remove();
          i->UnlinkAndNOP();
          result = true;
        }
        break;

@@ -391,7 +391,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
        if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
          bool value = i->src1.value->IsConstantEQ(i->src2.value);
          i->dest->set_constant(uint8_t(value));
          i->Remove();
          i->UnlinkAndNOP();
          result = true;
        }
        break;

@@ -399,7 +399,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
        if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
          bool value = i->src1.value->IsConstantNE(i->src2.value);
          i->dest->set_constant(uint8_t(value));
          i->Remove();
          i->UnlinkAndNOP();
          result = true;
        }
        break;

@@ -407,7 +407,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
        if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
          bool value = i->src1.value->IsConstantSLT(i->src2.value);
          i->dest->set_constant(uint8_t(value));
          i->Remove();
          i->UnlinkAndNOP();
          result = true;
        }
        break;

@@ -415,7 +415,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
        if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
          bool value = i->src1.value->IsConstantSLE(i->src2.value);
          i->dest->set_constant(uint8_t(value));
          i->Remove();
          i->UnlinkAndNOP();
          result = true;
        }
        break;

@@ -423,7 +423,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
        if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
          bool value = i->src1.value->IsConstantSGT(i->src2.value);
          i->dest->set_constant(uint8_t(value));
          i->Remove();
          i->UnlinkAndNOP();
          result = true;
        }
        break;

@@ -431,7 +431,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
        if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
          bool value = i->src1.value->IsConstantSGE(i->src2.value);
          i->dest->set_constant(uint8_t(value));
          i->Remove();
          i->UnlinkAndNOP();
          result = true;
        }
        break;

@@ -439,7 +439,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
        if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
          bool value = i->src1.value->IsConstantULT(i->src2.value);
          i->dest->set_constant(uint8_t(value));
          i->Remove();
          i->UnlinkAndNOP();
          result = true;
        }
        break;

@@ -447,7 +447,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
        if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
          bool value = i->src1.value->IsConstantULE(i->src2.value);
          i->dest->set_constant(uint8_t(value));
          i->Remove();
          i->UnlinkAndNOP();
          result = true;
        }
        break;

@@ -455,7 +455,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
        if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
          bool value = i->src1.value->IsConstantUGT(i->src2.value);
          i->dest->set_constant(uint8_t(value));
          i->Remove();
          i->UnlinkAndNOP();
          result = true;
        }
        break;

@@ -463,7 +463,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
        if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
          bool value = i->src1.value->IsConstantUGE(i->src2.value);
          i->dest->set_constant(uint8_t(value));
          i->Remove();
          i->UnlinkAndNOP();
          result = true;
        }
        break;
@@ -477,7 +477,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
            !should_skip_because_of_float) {
          v->set_from(i->src1.value);
          v->Add(i->src2.value);
          i->Remove();
          i->UnlinkAndNOP();
          result = true;
        }
        break;

@@ -489,7 +489,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
            TypeName target_type = v->type;
            v->set_from(ca);
            v->ZeroExtend(target_type);
            i->Remove();
            i->UnlinkAndNOP();
          } else {
            if (i->dest->type == ca->type) {
              i->Replace(&OPCODE_ASSIGN_info, 0);

@@ -507,7 +507,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
            !should_skip_because_of_float) {
          v->set_from(i->src1.value);
          v->Sub(i->src2.value);
          i->Remove();
          i->UnlinkAndNOP();
          result = true;
        }
        break;

@@ -516,7 +516,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
        if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
          v->set_from(i->src1.value);
          v->Mul(i->src2.value);
          i->Remove();
          i->UnlinkAndNOP();
          result = true;
        } else if (i->src1.value->IsConstant() ||
                   i->src2.value->IsConstant()) {

@@ -548,7 +548,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
        if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
          v->set_from(i->src1.value);
          v->MulHi(i->src2.value, (i->flags & ARITHMETIC_UNSIGNED) != 0);
          i->Remove();
          i->UnlinkAndNOP();
          result = true;
        }
        break;

@@ -557,13 +557,13 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
        if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
          v->set_from(i->src1.value);
          v->Div(i->src2.value, (i->flags & ARITHMETIC_UNSIGNED) != 0);
          i->Remove();
          i->UnlinkAndNOP();
          result = true;
        } else if (!i->src2.value->MaybeFloaty() &&
                   i->src2.value->IsConstantZero()) {
          // division by 0 == 0 every time,
          v->set_zero(i->src2.value->type);
          i->Remove();
          i->UnlinkAndNOP();
          result = true;
        } else if (i->src2.value->IsConstant()) {
          // Division by one = no-op.
@@ -592,7 +592,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
          }
          v->set_from(i->src1.value);
          v->Max(i->src2.value);
          i->Remove();
          i->UnlinkAndNOP();
          result = true;
        }
        break;

@@ -600,7 +600,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
        if (i->src1.value->IsConstant()) {
          v->set_from(i->src1.value);
          v->Neg();
          i->Remove();
          i->UnlinkAndNOP();
          result = true;
        }
        break;

@@ -608,7 +608,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
        if (i->src1.value->IsConstant()) {
          v->set_from(i->src1.value);
          v->Abs();
          i->Remove();
          i->UnlinkAndNOP();
          result = true;
        }
        break;

@@ -616,7 +616,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
        if (i->src1.value->IsConstant()) {
          v->set_from(i->src1.value);
          v->Sqrt();
          i->Remove();
          i->UnlinkAndNOP();
          result = true;
        }
        break;

@@ -624,7 +624,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
        if (i->src1.value->IsConstant()) {
          v->set_from(i->src1.value);
          v->RSqrt();
          i->Remove();
          i->UnlinkAndNOP();
          result = true;
        }
        break;

@@ -632,7 +632,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
        if (i->src1.value->IsConstant()) {
          v->set_from(i->src1.value);
          v->Recip();
          i->Remove();
          i->UnlinkAndNOP();
          result = true;
        }
        break;

@@ -640,7 +640,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
        if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
          v->set_from(i->src1.value);
          v->And(i->src2.value);
          i->Remove();
          i->UnlinkAndNOP();
          result = true;
        }
        break;

@@ -648,7 +648,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
        if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
          v->set_from(i->src1.value);
          v->AndNot(i->src2.value);
          i->Remove();
          i->UnlinkAndNOP();
          result = true;
        }
        break;

@@ -656,7 +656,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
        if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
          v->set_from(i->src1.value);
          v->Or(i->src2.value);
          i->Remove();
          i->UnlinkAndNOP();
          result = true;
        }
        break;

@@ -664,13 +664,13 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
        if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
          v->set_from(i->src1.value);
          v->Xor(i->src2.value);
          i->Remove();
          i->UnlinkAndNOP();
          result = true;
        } else if (!i->src1.value->IsConstant() &&
                   !i->src2.value->IsConstant() &&
                   i->src1.value == i->src2.value) {
          v->set_zero(v->type);
          i->Remove();
          i->UnlinkAndNOP();
          result = true;
        }
        break;
@@ -678,7 +678,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
        if (i->src1.value->IsConstant()) {
          v->set_from(i->src1.value);
          v->Not();
          i->Remove();
          i->UnlinkAndNOP();
          result = true;
        }
        break;

@@ -687,7 +687,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
        if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
          v->set_from(i->src1.value);
          v->Shl(i->src2.value);
          i->Remove();
          i->UnlinkAndNOP();
          result = true;
        } else if (i->src2.value->IsConstantZero()) {
          auto src1 = i->src1.value;

@@ -702,7 +702,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
        if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
          v->set_from(i->src1.value);
          v->Shr(i->src2.value);
          i->Remove();
          i->UnlinkAndNOP();
          result = true;
        } else if (i->src2.value->IsConstantZero()) {
          auto src1 = i->src1.value;

@@ -716,7 +716,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
        if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
          v->set_from(i->src1.value);
          v->Sha(i->src2.value);
          i->Remove();
          i->UnlinkAndNOP();
          result = true;
        }
        break;

@@ -724,7 +724,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
        if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
          v->set_from(i->src1.value);
          v->RotateLeft(i->src2.value);
          i->Remove();
          i->UnlinkAndNOP();
          result = true;
        }
        break;

@@ -732,7 +732,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
        if (i->src1.value->IsConstant()) {
          v->set_from(i->src1.value);
          v->ByteSwap();
          i->Remove();
          i->UnlinkAndNOP();
          result = true;
        }
        break;

@@ -740,7 +740,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
        if (i->src1.value->IsConstant()) {
          v->set_zero(v->type);
          v->CountLeadingZeros(i->src1.value);
          i->Remove();
          i->UnlinkAndNOP();
          result = true;
        }
        break;

@@ -751,7 +751,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
            (i->flags == INT8_TYPE || i->flags == INT16_TYPE)) {
          v->set_from(i->src1.value);
          v->Permute(i->src2.value, i->src3.value, (TypeName)i->flags);
          i->Remove();
          i->UnlinkAndNOP();
          result = true;
        }
@ -765,7 +765,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
|
|||
*/
|
||||
|
||||
v->set_zero(VEC128_TYPE);
|
||||
i->Remove();
|
||||
i->UnlinkAndNOP();
|
||||
result = true;
|
||||
}
|
||||
|
||||
|
@ -777,7 +777,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
|
|||
i->src3.value->IsConstant()) {
|
||||
v->set_from(i->src1.value);
|
||||
v->Insert(i->src2.value, i->src3.value, (TypeName)i->flags);
|
||||
i->Remove();
|
||||
i->UnlinkAndNOP();
|
||||
result = true;
|
||||
}
|
||||
break;
|
||||
|
@ -785,7 +785,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
|
|||
if (i->src1.value->IsConstant()) {
|
||||
v->set_from(i->src1.value);
|
||||
v->Swizzle((uint32_t)i->src2.offset, (TypeName)i->flags);
|
||||
i->Remove();
|
||||
i->UnlinkAndNOP();
|
||||
result = true;
|
||||
}
|
||||
break;
|
||||
|
@ -793,7 +793,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
|
|||
if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
|
||||
v->set_zero(v->type);
|
||||
v->Extract(i->src1.value, i->src2.value);
|
||||
i->Remove();
|
||||
i->UnlinkAndNOP();
|
||||
result = true;
|
||||
}
|
||||
break;
|
||||
|
@ -801,7 +801,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
|
|||
if (i->src1.value->IsConstant()) {
|
||||
v->set_zero(v->type);
|
||||
v->Splat(i->src1.value);
|
||||
i->Remove();
|
||||
i->UnlinkAndNOP();
|
||||
result = true;
|
||||
}
|
||||
break;
|
||||
|
@ -809,7 +809,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
|
|||
if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
|
||||
v->set_from(i->src1.value);
|
||||
v->VectorCompareEQ(i->src2.value, hir::TypeName(i->flags));
|
||||
i->Remove();
|
||||
i->UnlinkAndNOP();
|
||||
result = true;
|
||||
}
|
||||
break;
|
||||
|
@ -817,7 +817,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
|
|||
if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
|
||||
v->set_from(i->src1.value);
|
||||
v->VectorCompareSGT(i->src2.value, hir::TypeName(i->flags));
|
||||
i->Remove();
|
||||
i->UnlinkAndNOP();
|
||||
result = true;
|
||||
}
|
||||
break;
|
||||
|
@ -825,7 +825,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
|
|||
if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
|
||||
v->set_from(i->src1.value);
|
||||
v->VectorCompareSGE(i->src2.value, hir::TypeName(i->flags));
|
||||
i->Remove();
|
||||
i->UnlinkAndNOP();
|
||||
result = true;
|
||||
}
|
||||
break;
|
||||
|
@ -833,7 +833,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
|
|||
if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
|
||||
v->set_from(i->src1.value);
|
||||
v->VectorCompareUGT(i->src2.value, hir::TypeName(i->flags));
|
||||
i->Remove();
|
||||
i->UnlinkAndNOP();
|
||||
result = true;
|
||||
}
|
||||
break;
|
||||
|
@ -841,7 +841,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
|
|||
if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
|
||||
v->set_from(i->src1.value);
|
||||
v->VectorCompareUGE(i->src2.value, hir::TypeName(i->flags));
|
||||
i->Remove();
|
||||
i->UnlinkAndNOP();
|
||||
result = true;
|
||||
}
|
||||
break;
|
||||
|
@ -850,7 +850,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
|
|||
v->set_zero(VEC128_TYPE);
|
||||
v->VectorConvertF2I(i->src1.value,
|
||||
!!(i->flags & ARITHMETIC_UNSIGNED));
|
||||
i->Remove();
|
||||
i->UnlinkAndNOP();
|
||||
result = true;
|
||||
}
|
||||
break;
|
||||
|
@ -859,7 +859,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
|
|||
v->set_zero(VEC128_TYPE);
|
||||
v->VectorConvertI2F(i->src1.value,
|
||||
!!(i->flags & ARITHMETIC_UNSIGNED));
|
||||
i->Remove();
|
||||
i->UnlinkAndNOP();
|
||||
result = true;
|
||||
}
|
||||
break;
|
||||
|
@ -867,7 +867,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
|
|||
if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
|
||||
v->set_from(i->src1.value);
|
||||
v->VectorShl(i->src2.value, hir::TypeName(i->flags));
|
||||
i->Remove();
|
||||
i->UnlinkAndNOP();
|
||||
result = true;
|
||||
}
|
||||
break;
|
||||
|
@ -875,7 +875,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
|
|||
if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
|
||||
v->set_from(i->src1.value);
|
||||
v->VectorShr(i->src2.value, hir::TypeName(i->flags));
|
||||
i->Remove();
|
||||
i->UnlinkAndNOP();
|
||||
result = true;
|
||||
}
|
||||
break;
|
||||
|
@ -883,7 +883,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
|
|||
if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
|
||||
v->set_from(i->src1.value);
|
||||
v->VectorRol(i->src2.value, hir::TypeName(i->flags));
|
||||
i->Remove();
|
||||
i->UnlinkAndNOP();
|
||||
result = true;
|
||||
}
|
||||
break;
|
||||
|
@ -894,7 +894,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
|
|||
v->VectorAdd(i->src2.value, hir::TypeName(i->flags & 0xFF),
|
||||
!!(arith_flags & ARITHMETIC_UNSIGNED),
|
||||
!!(arith_flags & ARITHMETIC_SATURATE));
|
||||
i->Remove();
|
||||
i->UnlinkAndNOP();
|
||||
result = true;
|
||||
}
|
||||
break;
|
||||
|
@ -905,7 +905,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
|
|||
v->VectorSub(i->src2.value, hir::TypeName(i->flags & 0xFF),
|
||||
!!(arith_flags & ARITHMETIC_UNSIGNED),
|
||||
!!(arith_flags & ARITHMETIC_SATURATE));
|
||||
i->Remove();
|
||||
i->UnlinkAndNOP();
|
||||
result = true;
|
||||
}
|
||||
break;
|
||||
|
@ -917,7 +917,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
|
|||
v->VectorAverage(i->src2.value, hir::TypeName(i->flags & 0xFF),
|
||||
!!(arith_flags & ARITHMETIC_UNSIGNED),
|
||||
!!(arith_flags & ARITHMETIC_SATURATE));
|
||||
i->Remove();
|
||||
i->UnlinkAndNOP();
|
||||
result = true;
|
||||
}
|
||||
break;
|
||||
|
@ -926,7 +926,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
|
|||
if (i->src1.value->IsConstant()) {
|
||||
v->set_from(i->src1.value);
|
||||
v->DenormalFlush();
|
||||
i->Remove();
|
||||
i->UnlinkAndNOP();
|
||||
result = true;
|
||||
}
|
||||
break;
|
||||
|
|
|
@@ -146,7 +146,7 @@ void ContextPromotionPass::RemoveDeadStoresBlock(Block* block) {
         validity.set(static_cast<uint32_t>(offset));
       } else {
         // Already written to. Remove this store.
-        i->Remove();
+        i->UnlinkAndNOP();
       }
     }
     i = prev;
@@ -120,7 +120,8 @@ bool DeadCodeEliminationPass::Run(HIRBuilder* builder) {
       Instr* next = i->next;
       if (i->opcode == &OPCODE_NOP_info) {
        // Nop - remove!
-        i->Remove();
+        i->UnlinkAndNOP();
+        i->Deallocate();
       }
       i = next;
     }
@@ -148,7 +149,9 @@ bool DeadCodeEliminationPass::Run(HIRBuilder* builder) {
 
 void DeadCodeEliminationPass::MakeNopRecursive(Instr* i) {
   i->opcode = &hir::OPCODE_NOP_info;
-  i->dest->def = NULL;
+  if (i->dest) {
+    i->dest->def = NULL;
+  }
   i->dest = NULL;
 
 #define MAKE_NOP_SRC(n) \
@@ -163,7 +166,9 @@ void DeadCodeEliminationPass::MakeNopRecursive(Instr* i) {
     if (value->def && value->def != i) {              \
       MakeNopRecursive(value->def);                   \
     }                                                 \
+    HIRBuilder::GetCurrent()->DeallocateValue(value); \
   }                                                   \
+  HIRBuilder::GetCurrent()->DeallocateUse(use);       \
   }
   MAKE_NOP_SRC(1);
   MAKE_NOP_SRC(2);
@@ -189,7 +194,8 @@ void DeadCodeEliminationPass::ReplaceAssignment(Instr* i) {
     use = use->next;
   }
 
-  i->Remove();
+  i->UnlinkAndNOP();
+  i->Deallocate();
 }
 
 bool DeadCodeEliminationPass::CheckLocalUse(Instr* i) {
@@ -204,11 +210,11 @@ bool DeadCodeEliminationPass::CheckLocalUse(Instr* i) {
     }
 
     // Load/store are paired. They can both be removed.
-    use_instr->Remove();
+    use_instr->UnlinkAndNOP();
   }
 
-  i->Remove();
+  i->UnlinkAndNOP();
+  i->Deallocate();
   return false;
 }
 
@@ -61,7 +61,7 @@ bool FinalizationPass::Run(HIRBuilder* builder) {
       auto target = tail->src1.label;
       if (target->block == block->next) {
         // Jumping to subsequent block. Remove.
-        tail->Remove();
+        tail->UnlinkAndNOP();
       }
     }
 
@@ -46,15 +46,27 @@ namespace hir {
   (value->type) == FLOAT64_TYPE || (value->type) == VEC128_TYPE)
 #define ASSERT_TYPES_EQUAL(value1, value2) \
   assert_true((value1->type) == (value2->type))
 
+thread_local HIRBuilder* thrd_current_hirfunction = nullptr;
 HIRBuilder::HIRBuilder() {
   arena_ = new Arena();
   Reset();
 }
 
+HIRBuilder* HIRBuilder::GetCurrent() { return thrd_current_hirfunction; }
+
+void HIRBuilder::MakeCurrent() { thrd_current_hirfunction = this; }
+void HIRBuilder::RemoveCurrent() {
+  if (thrd_current_hirfunction == this) {
+    thrd_current_hirfunction = nullptr;
+  }
+}
+
 HIRBuilder::~HIRBuilder() {
   Reset();
   delete arena_;
+  if (thrd_current_hirfunction == this) {
+    thrd_current_hirfunction = nullptr;
+  }
 }
 
 void HIRBuilder::Reset() {
@@ -105,7 +117,37 @@ bool HIRBuilder::Finalize() {
   }
   return true;
 }
+Instr* HIRBuilder::AllocateInstruction() {
+  Instr* result = free_instrs_.NewEntry();
+  if (result) {
+    return result;
+  }
+  return arena()->Alloc<Instr>();
+}
+
+Value* HIRBuilder::AllocateValue() {
+  Value* result = free_values_.NewEntry();
+  if (result) {
+    return result;
+  }
+  return arena()->Alloc<Value>();
+}
+Value::Use* HIRBuilder::AllocateUse() {
+  Value::Use* result = free_uses_.NewEntry();
+  if (result) {
+    return result;
+  }
+  return arena()->Alloc<Value::Use>();
+}
+void HIRBuilder::DeallocateInstruction(Instr* instr) {
+  // free_instrs_.DeleteEntry(instr);
+}
+void HIRBuilder::DeallocateValue(Value* value) {
+  // free_values_.DeleteEntry(value);
+}
+void HIRBuilder::DeallocateUse(Value::Use* use) {
+  // free_uses_.DeleteEntry(use);
+}
 void HIRBuilder::DumpValue(StringBuffer* str, Value* value) {
   if (value->IsConstant()) {
     switch (value->type) {
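The freelists consulted above are not shown in this diff. Below is a minimal sketch of the shape SimpleFreelist<T> would need for these call sites (NewEntry/DeleteEntry/Reset) to work; this is an assumption inferred from usage, not the actual contents of xenia/base/simple_freelist.h.

// Assumed shape of SimpleFreelist<T>: an intrusive singly linked free list.
// NewEntry() recycles a node or returns nullptr so the caller can fall back
// to the bump arena; Reset() simply forgets the list (the arena owns memory).
template <typename T>
class SimpleFreelist {
  struct Node {
    Node* next;
  };
  Node* head_ = nullptr;

 public:
  T* NewEntry() {
    Node* node = head_;
    if (!node) {
      return nullptr;  // caller falls back to arena()->Alloc<T>()
    }
    head_ = node->next;
    return reinterpret_cast<T*>(node);  // storage reused; caller reinitializes
  }
  void DeleteEntry(T* value) {
    Node* node = reinterpret_cast<Node*>(value);
    node->next = head_;  // entries must be at least pointer-sized
    head_ = node;
  }
  void Reset() { head_ = nullptr; }  // arena reset reclaims the bytes
};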
@@ -545,12 +587,12 @@ void HIRBuilder::MergeAdjacentBlocks(Block* left, Block* right) {
     auto sig = left->instr_tail->opcode->signature;
     if (GET_OPCODE_SIG_TYPE_SRC1(sig) == OPCODE_SIG_TYPE_L) {
       if (left->instr_tail->src1.label->block == right) {
-        left->instr_tail->Remove();
+        left->instr_tail->UnlinkAndNOP();
       }
     }
     if (GET_OPCODE_SIG_TYPE_SRC2(sig) == OPCODE_SIG_TYPE_L) {
       if (left->instr_tail->src2.label->block == right) {
-        left->instr_tail->Remove();
+        left->instr_tail->UnlinkAndNOP();
       }
     }
   }
@@ -678,7 +720,7 @@ Instr* HIRBuilder::AppendInstr(const OpcodeInfo& opcode_info, uint16_t flags,
   }
   Block* block = current_block_;
 
-  Instr* instr = arena_->Alloc<Instr>();
+  Instr* instr = AllocateInstruction();
   instr->next = NULL;
   instr->prev = block->instr_tail;
   if (block->instr_tail) {
@@ -705,7 +747,7 @@ Instr* HIRBuilder::AppendInstr(const OpcodeInfo& opcode_info, uint16_t flags,
 }
 
 Value* HIRBuilder::AllocValue(TypeName type) {
-  Value* value = arena_->Alloc<Value>();
+  Value* value = AllocateValue();
   value->ordinal = next_value_ordinal_++;
   value->type = type;
   value->flags = 0;
@@ -719,7 +761,7 @@ Value* HIRBuilder::AllocValue(TypeName type) {
 }
 
 Value* HIRBuilder::CloneValue(Value* source) {
-  Value* value = arena_->Alloc<Value>();
+  Value* value = AllocateValue();
   value->ordinal = next_value_ordinal_++;
   value->type = source->type;
   value->flags = source->flags;
@@ -1295,6 +1337,9 @@ void HIRBuilder::CacheControl(Value* address, size_t cache_line_size,
 
 void HIRBuilder::MemoryBarrier() { AppendInstr(OPCODE_MEMORY_BARRIER_info, 0); }
 
+void HIRBuilder::DelayExecution() {
+  AppendInstr(OPCODE_DELAY_EXECUTION_info, 0);
+}
 void HIRBuilder::SetRoundingMode(Value* value) {
   ASSERT_INTEGER_TYPE(value);
   Instr* i = AppendInstr(OPCODE_SET_ROUNDING_MODE_info, 0);
@@ -15,6 +15,8 @@
 #include "third_party/fmt/include/fmt/format.h"
 #include "xenia/base/arena.h"
 #include "xenia/base/string_buffer.h"
+
+#include "xenia/base/simple_freelist.h"
 #include "xenia/cpu/hir/block.h"
 #include "xenia/cpu/hir/instr.h"
 #include "xenia/cpu/hir/label.h"
@@ -31,11 +33,20 @@ enum FunctionAttributes {
 };
 
 class HIRBuilder {
+  SimpleFreelist<Instr> free_instrs_;
+  SimpleFreelist<Value> free_values_;
+  SimpleFreelist<Value::Use> free_uses_;
+
  public:
   HIRBuilder();
   virtual ~HIRBuilder();
+  static HIRBuilder* GetCurrent();
+
+  void MakeCurrent();
+  void RemoveCurrent();
 
   virtual void Reset();
 
   virtual bool Finalize();
 
   void Dump(StringBuffer* str);
@@ -66,6 +77,18 @@ class HIRBuilder {
   void RemoveBlock(Block* block);
   void MergeAdjacentBlocks(Block* left, Block* right);
 
+  Instr* AllocateInstruction();
+
+  Value* AllocateValue();
+  Value::Use* AllocateUse();
+  void DeallocateInstruction(Instr* instr);
+  void DeallocateValue(Value* value);
+  void DeallocateUse(Value::Use* use);
+  void ResetPools() {
+    free_instrs_.Reset();
+    free_uses_.Reset();
+    free_values_.Reset();
+  }
   // static allocations:
   // Value* AllocStatic(size_t length);
 
@@ -176,7 +199,7 @@ class HIRBuilder {
   void CacheControl(Value* address, size_t cache_line_size,
                     CacheControlType type);
   void MemoryBarrier();
-
+  void DelayExecution();
   void SetRoundingMode(Value* value);
   Value* Max(Value* value1, Value* value2);
   Value* VectorMax(Value* value1, Value* value2, TypeName part_type,
@@ -10,7 +10,7 @@
 #include "xenia/cpu/hir/instr.h"
 
 #include "xenia/cpu/hir/block.h"
-
+#include "xenia/cpu/hir/hir_builder.h"
 namespace xe {
 namespace cpu {
 namespace hir {
@@ -62,21 +62,35 @@ void Instr::Replace(const OpcodeInfo* new_opcode, uint16_t new_flags) {
   if (src1_use) {
     src1.value->RemoveUse(src1_use);
     src1.value = NULL;
-    src1_use = NULL;
+    // src1_use = NULL;
   }
   if (src2_use) {
     src2.value->RemoveUse(src2_use);
     src2.value = NULL;
-    src2_use = NULL;
+    // src2_use = NULL;
   }
   if (src3_use) {
     src3.value->RemoveUse(src3_use);
     src3.value = NULL;
-    src3_use = NULL;
+    // src3_use = NULL;
   }
+
+  if (src1_use) {
+    HIRBuilder::GetCurrent()->DeallocateUse(src1_use);
+    src1_use = nullptr;
+  }
+  if (src2_use) {
+    HIRBuilder::GetCurrent()->DeallocateUse(src2_use);
+    src2_use = nullptr;
+  }
+
+  if (src3_use) {
+    HIRBuilder::GetCurrent()->DeallocateUse(src3_use);
+    src3_use = nullptr;
+  }
 }
 
-void Instr::Remove() {
+void Instr::UnlinkAndNOP() {
   // Remove all srcs/dest.
   Replace(&OPCODE_NOP_info, 0);
 
@@ -91,6 +105,10 @@ void Instr::UnlinkAndNOP() {
     block->instr_tail = prev;
   }
 }
+
+void Instr::Deallocate() {
+  HIRBuilder::GetCurrent()->DeallocateInstruction(this);
+}
 Instr* Instr::GetDestDefSkipAssigns() {
   Instr* current_def = this;
 
@@ -78,7 +78,12 @@ class Instr {
 
   void MoveBefore(Instr* other);
   void Replace(const OpcodeInfo* new_opcode, uint16_t new_flags);
-  void Remove();
+  void UnlinkAndNOP();
+  // chrispy: wanted to name this one Remove, but Remove itself was renamed to
+  // UnlinkAndNOP; if changes landed in master that still used Remove, they
+  // would silently pick up the wrong behavior here, and the cause would be
+  // difficult to track down.
+  void Deallocate();
   const OpcodeInfo* GetOpcodeInfo() const { return opcode; }
   // if opcode is null, we have bigger problems
   Opcode GetOpcodeNum() const { return GetOpcodeInfo()->num; }
@@ -292,7 +292,7 @@ enum Opcode {
   // as we already have OPCODE_ROUND. round double to float (
   // ppc "single" fpu instruction result rounding behavior )
   OPCODE_SET_NJM,
-
+  OPCODE_DELAY_EXECUTION,  // for db16cyc
   __OPCODE_MAX_VALUE,  // Keep at end.
 };
 
@@ -218,7 +218,7 @@ DEFINE_OPCODE(
     "context_barrier",
     OPCODE_SIG_X,
     0)
-
+DEFINE_OPCODE(OPCODE_DELAY_EXECUTION, "delay_execution", OPCODE_SIG_X, 0)
 DEFINE_OPCODE(
     OPCODE_LOAD_MMIO,
     "load_mmio",
@@ -16,13 +16,13 @@
 #include "xenia/base/assert.h"
 #include "xenia/base/byte_order.h"
 #include "xenia/base/math.h"
-
+#include "xenia/cpu/hir/hir_builder.h"
 namespace xe {
 namespace cpu {
 namespace hir {
 
 Value::Use* Value::AddUse(Arena* arena, Instr* instr) {
-  Use* use = arena->Alloc<Use>();
+  Use* use = HIRBuilder::GetCurrent()->AllocateUse();
   use->instr = instr;
   use->prev = NULL;
   use->next = use_head;
@@ -42,6 +42,8 @@ void Value::RemoveUse(Use* use) {
   if (use->next) {
     use->next->prev = use->prev;
   }
+
+  // HIRBuilder::GetCurrent()->DeallocateUse(use);
 }
 
 uint32_t Value::AsUint32() {
@@ -789,8 +789,15 @@ int InstrEmit_norx(PPCHIRBuilder& f, const InstrData& i) {
 int InstrEmit_orx(PPCHIRBuilder& f, const InstrData& i) {
   // RA <- (RS) | (RB)
   if (i.X.RT == i.X.RB && i.X.RT == i.X.RA && !i.X.Rc) {
-    // Sometimes used as no-op.
-    f.Nop();
+    // chrispy: this special version of orx is db16cyc and is heavily used in
+    // spinlocks. since we do not emit any code for this, we end up wasting a
+    // ton of power
+    if (i.code == 0x7FFFFB78) {
+      f.DelayExecution();
+    } else {
+      // Sometimes used as no-op.
+      f.Nop();
+    }
     return 0;
   }
   Value* ra;
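The magic number 0x7FFFFB78 checked above is the full instruction word for `or r31, r31, r31`, the idiom the 360 toolchain emits as the `db16cyc` power-saving delay hint inside spinlock bodies. A sketch of how that constant decomposes under the standard PowerPC X-form field layout (the helper name is illustrative, not from this commit):

#include <cstdint>

// Decompose a PPC "or" instruction word and test for the db16cyc idiom
// (or r31,r31,r31). Field layout follows the standard PowerPC X-form.
constexpr bool IsDb16cyc(uint32_t code) {
  uint32_t opcd = code >> 26;         // primary opcode, 31 for X-form ALU ops
  uint32_t rs = (code >> 21) & 31;    // source register
  uint32_t ra = (code >> 16) & 31;    // destination register (for "or")
  uint32_t rb = (code >> 11) & 31;    // second source register
  uint32_t xo = (code >> 1) & 0x3FF;  // extended opcode, 444 = "or"
  uint32_t rc = code & 1;             // record (condition register) bit
  return opcd == 31 && xo == 444 && rc == 0 && rs == 31 && ra == 31 &&
         rb == 31;
}
static_assert(IsDb16cyc(0x7FFFFB78), "db16cyc is or r31,r31,r31");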
@@ -117,6 +117,7 @@ bool PPCFrontend::DefineFunction(GuestFunction* function,
                                  uint32_t debug_info_flags) {
   auto translator = translator_pool_.Allocate(this);
   bool result = translator->Translate(function, debug_info_flags);
+  translator->Reset();
   translator_pool_.Release(translator);
   return result;
 }
@@ -96,10 +96,25 @@ PPCTranslator::PPCTranslator(PPCFrontend* frontend) : frontend_(frontend) {
 
 PPCTranslator::~PPCTranslator() = default;
 
+class HirBuilderScope {
+  PPCHIRBuilder* builder_;
+
+ public:
+  HirBuilderScope(PPCHIRBuilder* builder) : builder_(builder) {
+    builder_->MakeCurrent();
+  }
+
+  ~HirBuilderScope() {
+    if (builder_) {
+      builder_->RemoveCurrent();
+    }
+  }
+};
+
 bool PPCTranslator::Translate(GuestFunction* function,
                               uint32_t debug_info_flags) {
   SCOPE_profile_cpu_f("cpu");
-
+  HirBuilderScope hir_build_scope{builder_.get()};
   // Reset() all caching when we leave.
   xe::make_reset_scope(builder_);
   xe::make_reset_scope(compiler_);
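HirBuilderScope is a plain RAII guard over the thread-local current-builder pointer introduced in hir_builder.cc above: while it is alive, allocation hooks deep inside Instr and Value (Value::AddUse, Instr::Deallocate, the MAKE_NOP_SRC macro) reach the right pools through HIRBuilder::GetCurrent() without every call site having to thread a builder pointer through. A hedged usage sketch; note the design choice that the destructor clears the slot rather than restoring a previous value, so scopes for different builders must not nest on one thread:

// Usage sketch: `builder` becomes the thread's current HIRBuilder for the
// duration of a translation pass.
bool TranslateWithScope(PPCHIRBuilder* builder) {
  HirBuilderScope scope{builder};  // ctor -> MakeCurrent()
  // ... emit HIR; Value::AddUse etc. reach `builder` via GetCurrent() ...
  return true;
}  // ~HirBuilderScope() -> RemoveCurrent() clears (does not restore) the slot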
@@ -196,7 +211,7 @@ bool PPCTranslator::Translate(GuestFunction* function,
 
   return true;
 }
-
+void PPCTranslator::Reset() { builder_->ResetPools(); }
 void PPCTranslator::DumpSource(GuestFunction* function,
                                StringBuffer* string_buffer) {
   Memory* memory = frontend_->memory();
@@ -31,7 +31,7 @@ class PPCTranslator {
   ~PPCTranslator();
 
   bool Translate(GuestFunction* function, uint32_t debug_info_flags);
-
+  void Reset();
  private:
   void DumpSource(GuestFunction* function, StringBuffer* string_buffer);
 
File diff suppressed because it is too large
@@ -19,6 +19,7 @@
 #include <string>
 #include <vector>
 
+#include "xenia/base/dma.h"
 #include "xenia/base/ring_buffer.h"
 #include "xenia/base/threading.h"
 #include "xenia/gpu/register_file.h"
@@ -66,6 +67,11 @@ enum class GammaRampType {
 };
 
 class CommandProcessor {
+ protected:
+  RingBuffer
+      reader_;  // chrispy: instead of keeping the ring buffer on the stack,
+                // place it near the start of the class so it can be addressed
+                // via rel8; this also reduces the number of params to pass
  public:
   enum class SwapPostEffect {
     kNone,
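The rel8 remark is about x86-64 instruction encoding: a member within the first 127 bytes of the object can be addressed with a one-byte displacement off `this` (disp8), while later members cost a four-byte disp32 in every instruction that touches them. A toy illustration of the layout rule (sizes and names are made up):

// [this + disp] encodes with disp8 only for offsets -128..127, so members
// placed past ~128 bytes make every hot-path access one encoding larger.
// Hot members first => smaller, denser code.
struct LayoutSketch {
  char hot_state[64];    // offset 0..63: disp8 addressing everywhere
  char rarely_used[4096];  // pushes anything after it out of disp8 range
  int counter;             // disp32 at every access
};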
@@ -76,7 +82,7 @@ class CommandProcessor {
   CommandProcessor(GraphicsSystem* graphics_system,
                    kernel::KernelState* kernel_state);
   virtual ~CommandProcessor();
-
+  dma::XeDMAC* GetDMAC() const { return dmac_; }
   uint32_t counter() const { return counter_; }
   void increment_counter() { counter_++; }
 
@@ -101,7 +107,7 @@ class CommandProcessor {
   // screen right in the beginning of 4D530AA4 is not a resolved render target,
   // for instance).
   virtual void IssueSwap(uint32_t frontbuffer_ptr, uint32_t frontbuffer_width,
-                         uint32_t frontbuffer_height) = 0;
+                         uint32_t frontbuffer_height) {}
 
   // May be called not only from the command processor thread when the command
   // processor is paused, and the termination of this function may be explicitly
@@ -153,7 +159,7 @@ class CommandProcessor {
   // rarely needed, most register writes have no special logic here
   XE_NOINLINE
   void HandleSpecialRegisterWrite(uint32_t index, uint32_t value);
-  XE_FORCEINLINE
+
   virtual void WriteRegister(uint32_t index, uint32_t value);
 
   // mem has big-endian register values
|
|||
virtual void WriteRegisterRangeFromRing(xe::RingBuffer* ring, uint32_t base,
|
||||
uint32_t num_registers);
|
||||
|
||||
XE_FORCEINLINE
|
||||
XE_NOINLINE
|
||||
virtual void WriteOneRegisterFromRing(
|
||||
xe::RingBuffer* ring, uint32_t base,
|
||||
uint32_t base,
|
||||
uint32_t
|
||||
num_times); // repeatedly write a value to one register, presumably a
|
||||
// register with special handling for writes
|
||||
|
||||
XE_FORCEINLINE
|
||||
void WriteALURangeFromRing(xe::RingBuffer* ring, uint32_t base,
|
||||
uint32_t num_times);
|
||||
|
||||
XE_FORCEINLINE
|
||||
void WriteFetchRangeFromRing(xe::RingBuffer* ring, uint32_t base,
|
||||
uint32_t num_times);
|
||||
|
||||
XE_FORCEINLINE
|
||||
void WriteBoolRangeFromRing(xe::RingBuffer* ring, uint32_t base,
|
||||
uint32_t num_times);
|
||||
|
||||
XE_FORCEINLINE
|
||||
void WriteLoopRangeFromRing(xe::RingBuffer* ring, uint32_t base,
|
||||
uint32_t num_times);
|
||||
|
||||
XE_FORCEINLINE
|
||||
void WriteREGISTERSRangeFromRing(xe::RingBuffer* ring, uint32_t base,
|
||||
uint32_t num_times);
|
||||
|
||||
XE_FORCEINLINE
|
||||
void WriteALURangeFromMem(uint32_t start_index, uint32_t* base,
|
||||
uint32_t num_registers);
|
||||
|
||||
XE_FORCEINLINE
|
||||
void WriteFetchRangeFromMem(uint32_t start_index, uint32_t* base,
|
||||
uint32_t num_registers);
|
||||
|
||||
XE_FORCEINLINE
|
||||
void WriteBoolRangeFromMem(uint32_t start_index, uint32_t* base,
|
||||
uint32_t num_registers);
|
||||
|
||||
XE_FORCEINLINE
|
||||
void WriteLoopRangeFromMem(uint32_t start_index, uint32_t* base,
|
||||
uint32_t num_registers);
|
||||
|
||||
XE_FORCEINLINE
|
||||
void WriteREGISTERSRangeFromMem(uint32_t start_index, uint32_t* base,
|
||||
uint32_t num_registers);
|
||||
|
||||
const reg::DC_LUT_30_COLOR* gamma_ramp_256_entry_table() const {
|
||||
return gamma_ramp_256_entry_table_;
|
||||
}
|
||||
|
@@ -186,75 +233,22 @@ class CommandProcessor {
 
   uint32_t ExecutePrimaryBuffer(uint32_t start_index, uint32_t end_index);
   virtual void OnPrimaryBufferEnd() {}
-  void ExecuteIndirectBuffer(uint32_t ptr, uint32_t length);
-  bool ExecutePacket(RingBuffer* reader);
-  bool ExecutePacketType0(RingBuffer* reader, uint32_t packet);
-  bool ExecutePacketType1(RingBuffer* reader, uint32_t packet);
-  bool ExecutePacketType2(RingBuffer* reader, uint32_t packet);
-  bool ExecutePacketType3(RingBuffer* reader, uint32_t packet);
-  bool ExecutePacketType3_ME_INIT(RingBuffer* reader, uint32_t packet,
-                                  uint32_t count);
-  bool ExecutePacketType3_NOP(RingBuffer* reader, uint32_t packet,
-                              uint32_t count);
-  bool ExecutePacketType3_INTERRUPT(RingBuffer* reader, uint32_t packet,
-                                    uint32_t count);
-  bool ExecutePacketType3_XE_SWAP(RingBuffer* reader, uint32_t packet,
-                                  uint32_t count);
-  bool ExecutePacketType3_INDIRECT_BUFFER(RingBuffer* reader, uint32_t packet,
-                                          uint32_t count);
-  bool ExecutePacketType3_WAIT_REG_MEM(RingBuffer* reader, uint32_t packet,
-                                       uint32_t count);
-  bool ExecutePacketType3_REG_RMW(RingBuffer* reader, uint32_t packet,
-                                  uint32_t count);
-  bool ExecutePacketType3_REG_TO_MEM(RingBuffer* reader, uint32_t packet,
-                                     uint32_t count);
-  bool ExecutePacketType3_MEM_WRITE(RingBuffer* reader, uint32_t packet,
-                                    uint32_t count);
-  bool ExecutePacketType3_COND_WRITE(RingBuffer* reader, uint32_t packet,
-                                     uint32_t count);
-  bool ExecutePacketType3_EVENT_WRITE(RingBuffer* reader, uint32_t packet,
-                                      uint32_t count);
-  bool ExecutePacketType3_EVENT_WRITE_SHD(RingBuffer* reader, uint32_t packet,
-                                          uint32_t count);
-  bool ExecutePacketType3_EVENT_WRITE_EXT(RingBuffer* reader, uint32_t packet,
-                                          uint32_t count);
-  bool ExecutePacketType3_EVENT_WRITE_ZPD(RingBuffer* reader, uint32_t packet,
-                                          uint32_t count);
-  bool ExecutePacketType3Draw(RingBuffer* reader, uint32_t packet,
-                              const char* opcode_name,
-                              uint32_t viz_query_condition,
-                              uint32_t count_remaining);
-  bool ExecutePacketType3_DRAW_INDX(RingBuffer* reader, uint32_t packet,
-                                    uint32_t count);
-  bool ExecutePacketType3_DRAW_INDX_2(RingBuffer* reader, uint32_t packet,
-                                      uint32_t count);
-  bool ExecutePacketType3_SET_CONSTANT(RingBuffer* reader, uint32_t packet,
-                                       uint32_t count);
-  bool ExecutePacketType3_SET_CONSTANT2(RingBuffer* reader, uint32_t packet,
-                                        uint32_t count);
-  bool ExecutePacketType3_LOAD_ALU_CONSTANT(RingBuffer* reader, uint32_t packet,
-                                            uint32_t count);
-  bool ExecutePacketType3_SET_SHADER_CONSTANTS(RingBuffer* reader,
-                                               uint32_t packet, uint32_t count);
-  bool ExecutePacketType3_IM_LOAD(RingBuffer* reader, uint32_t packet,
-                                  uint32_t count);
-  bool ExecutePacketType3_IM_LOAD_IMMEDIATE(RingBuffer* reader,
-                                            uint32_t packet, uint32_t count);
-  bool ExecutePacketType3_INVALIDATE_STATE(RingBuffer* reader, uint32_t packet,
-                                           uint32_t count);
-  bool ExecutePacketType3_VIZ_QUERY(RingBuffer* reader, uint32_t packet,
-                                    uint32_t count);
+#include "pm4_command_processor_declare.h"
 
   virtual Shader* LoadShader(xenos::ShaderType shader_type,
                              uint32_t guest_address,
                              const uint32_t* host_address,
-                             uint32_t dword_count) = 0;
+                             uint32_t dword_count) {
+    return nullptr;
+  }
 
   virtual bool IssueDraw(xenos::PrimitiveType prim_type, uint32_t index_count,
                          IndexBufferInfo* index_buffer_info,
-                         bool major_mode_explicit) = 0;
-  virtual bool IssueCopy() = 0;
+                         bool major_mode_explicit) {
+    return false;
+  }
+  virtual bool IssueCopy() { return false; }
 
   // "Actual" is for the command processor thread, to be read by the
   // implementations.
@@ -267,7 +261,7 @@ class CommandProcessor {
   Memory* memory_ = nullptr;
   kernel::KernelState* kernel_state_ = nullptr;
   GraphicsSystem* graphics_system_ = nullptr;
-  RegisterFile* register_file_ = nullptr;
+  RegisterFile* XE_RESTRICT register_file_ = nullptr;
 
   TraceWriter trace_writer_;
   enum class TraceState {
@@ -316,6 +310,7 @@ class CommandProcessor {
   reg::DC_LUT_30_COLOR gamma_ramp_256_entry_table_[256] = {};
   reg::DC_LUT_PWL_DATA gamma_ramp_pwl_rgb_[128][3] = {};
   uint32_t gamma_ramp_rw_component_ = 0;
+  dma::XeDMAC* dmac_ = nullptr;
 };
 
 }  // namespace gpu
File diff suppressed because it is too large
@@ -1,4 +1,5 @@
 /**
+/**
 ******************************************************************************
 * Xenia : Xbox 360 Emulator Research Project *
@@ -35,6 +36,7 @@
 #include "xenia/gpu/registers.h"
 #include "xenia/gpu/xenos.h"
 #include "xenia/kernel/kernel_state.h"
+#include "xenia/kernel/user_module.h"
 #include "xenia/ui/d3d12/d3d12_descriptor_heap_pool.h"
 #include "xenia/ui/d3d12/d3d12_provider.h"
 #include "xenia/ui/d3d12/d3d12_upload_buffer_pool.h"
@@ -46,6 +48,7 @@ namespace d3d12 {
 
 class D3D12CommandProcessor final : public CommandProcessor {
  public:
+#include "../pm4_command_processor_declare.h"
   explicit D3D12CommandProcessor(D3D12GraphicsSystem* graphics_system,
                                  kernel::KernelState* kernel_state);
   ~D3D12CommandProcessor();
@@ -205,22 +208,70 @@ class D3D12CommandProcessor final : public CommandProcessor {
  protected:
   bool SetupContext() override;
   void ShutdownContext() override;
+
   XE_FORCEINLINE
   void WriteRegister(uint32_t index, uint32_t value) override;
   XE_FORCEINLINE
   virtual void WriteRegistersFromMem(uint32_t start_index, uint32_t* base,
                                      uint32_t num_registers) override;
+
+  template <uint32_t register_lower_bound, uint32_t register_upper_bound>
+  XE_FORCEINLINE void WriteRegisterRangeFromMem_WithKnownBound(
+      uint32_t start_index, uint32_t* base, uint32_t num_registers);
   XE_FORCEINLINE
   virtual void WriteRegisterRangeFromRing(xe::RingBuffer* ring, uint32_t base,
                                           uint32_t num_registers) override;
+  template <uint32_t register_lower_bound, uint32_t register_upper_bound>
+  XE_FORCEINLINE void WriteRegisterRangeFromRing_WithKnownBound(
+      xe::RingBuffer* ring, uint32_t base, uint32_t num_registers);
+
+  XE_NOINLINE
+  void WriteRegisterRangeFromRing_WraparoundCase(xe::RingBuffer* ring,
+                                                 uint32_t base,
+                                                 uint32_t num_registers);
-  XE_FORCEINLINE
-  virtual void WriteOneRegisterFromRing(xe::RingBuffer* ring, uint32_t base,
-                                        uint32_t num_registers);
+  XE_NOINLINE
+  virtual void WriteOneRegisterFromRing(uint32_t base,
+                                        uint32_t num_times) override;
+
+  XE_FORCEINLINE
+  void WriteALURangeFromRing(xe::RingBuffer* ring, uint32_t base,
+                             uint32_t num_times);
+
+  XE_FORCEINLINE
+  void WriteFetchRangeFromRing(xe::RingBuffer* ring, uint32_t base,
+                               uint32_t num_times);
+
+  XE_FORCEINLINE
+  void WriteBoolRangeFromRing(xe::RingBuffer* ring, uint32_t base,
+                              uint32_t num_times);
+
+  XE_FORCEINLINE
+  void WriteLoopRangeFromRing(xe::RingBuffer* ring, uint32_t base,
+                              uint32_t num_times);
+
+  XE_FORCEINLINE
+  void WriteREGISTERSRangeFromRing(xe::RingBuffer* ring, uint32_t base,
+                                   uint32_t num_times);
+
+  XE_FORCEINLINE
+  void WriteALURangeFromMem(uint32_t start_index, uint32_t* base,
+                            uint32_t num_registers);
+
+  XE_FORCEINLINE
+  void WriteFetchRangeFromMem(uint32_t start_index, uint32_t* base,
+                              uint32_t num_registers);
+
+  XE_FORCEINLINE
+  void WriteBoolRangeFromMem(uint32_t start_index, uint32_t* base,
+                             uint32_t num_registers);
+
+  XE_FORCEINLINE
+  void WriteLoopRangeFromMem(uint32_t start_index, uint32_t* base,
+                             uint32_t num_registers);
+
+  XE_FORCEINLINE
+  void WriteREGISTERSRangeFromMem(uint32_t start_index, uint32_t* base,
+                                  uint32_t num_registers);
 
   void OnGammaRamp256EntryTableValueWritten() override;
   void OnGammaRampPWLValueWritten() override;
@@ -367,6 +418,14 @@ class D3D12CommandProcessor final : public CommandProcessor {
                            const draw_util::Scissor& scissor,
                            bool primitive_polygonal,
                            reg::RB_DEPTHCONTROL normalized_depth_control);
+
+  template <bool primitive_polygonal, bool edram_rov_used>
+  XE_NOINLINE void UpdateSystemConstantValues_Impl(
+      bool shared_memory_is_uav, uint32_t line_loop_closing_index,
+      xenos::Endian index_endian, const draw_util::ViewportInfo& viewport_info,
+      uint32_t used_texture_mask, reg::RB_DEPTHCONTROL normalized_depth_control,
+      uint32_t normalized_color_mask);
+
   void UpdateSystemConstantValues(bool shared_memory_is_uav,
                                   bool primitive_polygonal,
                                   uint32_t line_loop_closing_index,
@@ -619,8 +678,8 @@ class D3D12CommandProcessor final : public CommandProcessor {
   uint32_t current_graphics_root_up_to_date_;
 
   // System shader constants.
-  alignas(XE_HOST_CACHE_LINE_SIZE)
-      DxbcShaderTranslator::SystemConstants system_constants_;
+  alignas(XE_HOST_CACHE_LINE_SIZE)
+      DxbcShaderTranslator::SystemConstants system_constants_;
 
   // Float constant usage masks of the last draw call.
   // chrispy: make sure accesses to these can't cross cacheline boundaries
@@ -0,0 +1,122 @@
+#pragma once
+// requires windows.h
+#include <stdint.h>
+
+namespace lightweight_nvapi {
+
+using nvstatus_t = int;
+
+using nvintfid_t = unsigned int;
+
+#ifndef LIGHTWEIGHT_NVAPI_EXCLUDE_D3D12
+constexpr nvintfid_t id_NvAPI_D3D12_QueryCpuVisibleVidmem = 0x26322BC3;
+
+using cb_NvAPI_D3D12_QueryCpuVisibleVidmem = nvstatus_t (*)(
+    ID3D12Device* pDevice, uint64_t* pTotalBytes, uint64_t* pFreeBytes);
+
+constexpr nvintfid_t id_NvAPI_D3D12_UseDriverHeapPriorities = 0xF0D978A8;
+using cb_NvAPI_D3D12_UseDriverHeapPriorities =
+    nvstatus_t (*)(ID3D12Device* pDevice);
+enum NV_D3D12_RESOURCE_FLAGS {
+  NV_D3D12_RESOURCE_FLAG_NONE = 0,
+  NV_D3D12_RESOURCE_FLAG_HTEX = 1,  //!< Create HTEX texture
+  NV_D3D12_RESOURCE_FLAG_CPUVISIBLE_VIDMEM =
+      2,  //!< Hint to create resource in cpuvisible vidmem
+};
+
+struct NV_RESOURCE_PARAMS {
+  uint32_t version;  //!< Version of structure. Must always be first member
+  NV_D3D12_RESOURCE_FLAGS
+      NVResourceFlags;  //!< Additional NV specific flags (set the
+                        //!< NV_D3D12_RESOURCE_FLAG_HTEX bit to create HTEX
+                        //!< texture)
+};
+
+using cb_NvAPI_D3D12_CreateCommittedResource = nvstatus_t (*)(
+    ID3D12Device* pDevice, const D3D12_HEAP_PROPERTIES* pHeapProperties,
+    D3D12_HEAP_FLAGS HeapFlags, const D3D12_RESOURCE_DESC* pDesc,
+    D3D12_RESOURCE_STATES InitialState,
+    const D3D12_CLEAR_VALUE* pOptimizedClearValue,
+    const NV_RESOURCE_PARAMS* pNVResourceParams, REFIID riid,
+    void** ppvResource, bool* pSupported);
+constexpr nvintfid_t id_NvAPI_D3D12_CreateCommittedResource = 0x27E98AEu;
+#endif
+class nvapi_state_t {
+  HMODULE nvapi64_;
+  void* (*queryinterface_)(unsigned int intfid);
+  bool available_;
+  bool init_ptrs();
+
+  bool call_init_interface();
+  void call_deinit_interface();
+
+ public:
+  nvapi_state_t() : nvapi64_(LoadLibraryA("nvapi64.dll")), available_(false) {
+    available_ = init_ptrs();
+  }
+  ~nvapi_state_t();
+  template <typename T>
+  T* query_interface(unsigned int intfid) {
+    if (queryinterface_ == nullptr) {
+      return nullptr;
+    }
+    return reinterpret_cast<T*>(queryinterface_(intfid));
+  }
+
+  bool is_available() const { return available_; }
+};
+inline bool nvapi_state_t::call_init_interface() {
+  int result = -1;
+  auto initInterfaceEx = query_interface<int(int)>(0xAD298D3F);
+  if (!initInterfaceEx) {
+    auto initInterface = query_interface<int()>(0x150E828u);
+    if (initInterface) {
+      result = initInterface();
+    }
+  } else {
+    result = initInterfaceEx(0);
+  }
+  return result == 0;
+}
+inline void nvapi_state_t::call_deinit_interface() {
+  auto deinitinterfaceex = query_interface<void(int)>(0xD7C61344);
+  if (deinitinterfaceex) {
+    deinitinterfaceex(1);  // or 0? not sure what the proper value is
+  } else {
+    auto deinitinterface = query_interface<void()>(0xD22BDD7E);
+    if (deinitinterface) {
+      deinitinterface();
+    }
+  }
+}
+inline bool nvapi_state_t::init_ptrs() {
+  if (!nvapi64_) return false;
+  queryinterface_ = reinterpret_cast<void* (*)(unsigned)>(
+      GetProcAddress(nvapi64_, "nvapi_QueryInterface"));
+
+  if (!queryinterface_) {
+    return false;
+  }
+  if (!call_init_interface()) {
+    return false;
+  }
+
+  return true;
+}
+inline nvapi_state_t::~nvapi_state_t() {
+  if (available_) {
+    call_deinit_interface();
+  }
+}
+inline void init_nvapi() {
+  // HMODULE moddy = LoadLibraryA("nvapi64.dll");
+  // FARPROC quif = GetProcAddress(moddy, "nvapi_QueryInterface");
+  nvapi_state_t nvapi{};
+
+  auto queryvisible = nvapi.query_interface<void>(0x26322BC3);
+  return;
+}
+
+}  // namespace lightweight_nvapi
@@ -108,12 +108,11 @@ bool D3D12PrimitiveProcessor::InitializeBuiltinIndexBuffer(
         size_bytes);
     return false;
   }
 
   Microsoft::WRL::ComPtr<ID3D12Resource> upload_resource;
-  if (FAILED(device->CreateCommittedResource(
-          &ui::d3d12::util::kHeapPropertiesUpload,
+  if (!provider.CreateUploadResource(
           provider.GetHeapFlagCreateNotZeroed(), &resource_desc,
-          D3D12_RESOURCE_STATE_GENERIC_READ, nullptr,
-          IID_PPV_ARGS(&upload_resource)))) {
+          D3D12_RESOURCE_STATE_GENERIC_READ, IID_PPV_ARGS(&upload_resource))) {
     XELOGE(
         "D3D12 primitive processor: Failed to create the built-in index "
         "buffer upload resource with {} bytes",
@@ -5492,11 +5492,19 @@ void D3D12RenderTargetCache::SetCommandListRenderTargets(
   }
 
   // Bind the render targets.
-  if (are_current_command_list_render_targets_valid_ &&
-      std::memcmp(current_command_list_render_targets_,
-                  depth_and_color_render_targets,
-                  sizeof(current_command_list_render_targets_))) {
-    are_current_command_list_render_targets_valid_ = false;
+  if (are_current_command_list_render_targets_valid_) {
+    // chrispy: the small memcmp doesn't get optimized by msvc
+    for (unsigned i = 0;
+         i < sizeof(current_command_list_render_targets_) /
+                 sizeof(current_command_list_render_targets_[0]);
+         ++i) {
+      if ((const void*)current_command_list_render_targets_[i] !=
+          (const void*)depth_and_color_render_targets[i]) {
+        are_current_command_list_render_targets_valid_ = false;
+        break;
+      }
+    }
   }
   uint32_t render_targets_are_srgb;
   if (gamma_render_target_as_srgb_) {
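A note on the rewrite above: for a tiny fixed-size array of pointers, MSVC tends to emit a real call to the memcmp runtime routine, whereas an explicit element loop compiles to a handful of compares and can stop at the first mismatch. The generic shape, with stand-in types (illustrative only, not the project's helper):

// Element-wise compare of two small pointer arrays; exits on first mismatch.
template <typename T, size_t N>
bool AnyElementDiffers(T* const (&a)[N], T* const (&b)[N]) {
  for (size_t i = 0; i < N; ++i) {
    if (a[i] != b[i]) {
      return true;
    }
  }
  return false;
}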
@@ -467,7 +467,7 @@ void D3D12TextureCache::EndFrame() {
       XELOGE("Unsupported texture formats used in the frame:");
       unsupported_header_written = true;
     }
-    XELOGE("* {}{}{}{}", FormatInfo::Get(xenos::TextureFormat(i))->name,
+    XELOGE("* {}{}{}{}", FormatInfo::GetName(xenos::TextureFormat(i)),
            unsupported_features & kUnsupportedResourceBit ? " resource" : "",
            unsupported_features & kUnsupportedUnormBit ? " unsigned" : "",
            unsupported_features & kUnsupportedSnormBit ? " signed" : "");
@@ -523,12 +523,16 @@ void D3D12TextureCache::RequestTextures(uint32_t used_texture_mask) {
     }
   }
 }
-
+// chrispy: optimize this further
 bool D3D12TextureCache::AreActiveTextureSRVKeysUpToDate(
     const TextureSRVKey* keys,
     const D3D12Shader::TextureBinding* host_shader_bindings,
     size_t host_shader_binding_count) const {
   for (size_t i = 0; i < host_shader_binding_count; ++i) {
+    if (i + 8 < host_shader_binding_count) {
+      PrefetchTextureBinding<swcache::PrefetchTag::Nontemporal>(
+          host_shader_bindings[i + 8].fetch_constant);
+    }
     const TextureSRVKey& key = keys[i];
     const TextureBinding* binding =
         GetValidTextureBinding(host_shader_bindings[i].fetch_constant);
@@ -538,8 +542,9 @@ bool D3D12TextureCache::AreActiveTextureSRVKeysUpToDate(
       }
       continue;
     }
-    if (key.key != binding->key || key.host_swizzle != binding->host_swizzle ||
-        key.swizzled_signs != binding->swizzled_signs) {
+    if ((key.key != binding->key) |
+        (key.host_swizzle != binding->host_swizzle) |
+        (key.swizzled_signs != binding->swizzled_signs)) {
       return false;
     }
   }
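The `|` above is deliberate: applied to bool-valued comparisons it computes all three tests and merges them as straight-line code, while `||` obliges the compiler to branch after each term to preserve short-circuit semantics. It is safe here because the comparisons have no side effects. In miniature:

// Same truth table, different codegen. The '||' version must not evaluate
// later terms when an earlier one is true, which typically costs a branch
// per term; the '|' version evaluates all three and ORs the results.
inline bool AnyDifferShortCircuit(int a, int b, int c, int d, int e, int f) {
  return (a != b) || (c != d) || (e != f);
}
inline bool AnyDifferBranchless(int a, int b, int c, int d, int e, int f) {
  return (a != b) | (c != d) | (e != f);
}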
@@ -666,8 +671,12 @@ uint32_t D3D12TextureCache::GetActiveTextureBindlessSRVIndex(
   }
   return descriptor_index;
 }
-
-D3D12TextureCache::SamplerParameters D3D12TextureCache::GetSamplerParameters(
+void D3D12TextureCache::PrefetchSamplerParameters(
+    const D3D12Shader::SamplerBinding& binding) const {
+  swcache::PrefetchL1(&register_file()[XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0 +
+                                       binding.fetch_constant * 6]);
+}
+D3D12TextureCache::SamplerParameters D3D12TextureCache::GetSamplerParameters(
     const D3D12Shader::SamplerBinding& binding) const {
   const auto& regs = register_file();
   const auto& fetch = regs.Get<xenos::xe_gpu_texture_fetch_t>(
@@ -694,7 +703,7 @@ D3D12TextureCache::SamplerParameters D3D12TextureCache::GetSamplerParameters(
                                        nullptr, nullptr, nullptr,
                                        &mip_min_level, nullptr);
   parameters.mip_min_level = mip_min_level;
-
+  // high cache miss count here, prefetch the fetch constant earlier
   // TODO(Triang3l): Disable filtering for texture formats not supporting it.
   xenos::AnisoFilter aniso_filter =
       binding.aniso_filter == xenos::AnisoFilter::kUseFetchConst
@@ -119,7 +119,8 @@ class D3D12TextureCache final : public TextureCache {
                                  D3D12_CPU_DESCRIPTOR_HANDLE handle);
   uint32_t GetActiveTextureBindlessSRVIndex(
       const D3D12Shader::TextureBinding& host_shader_binding);
-
+  void PrefetchSamplerParameters(
+      const D3D12Shader::SamplerBinding& binding) const;
   SamplerParameters GetSamplerParameters(
       const D3D12Shader::SamplerBinding& binding) const;
   void WriteSampler(SamplerParameters parameters,
@@ -712,7 +713,7 @@ class D3D12TextureCache final : public TextureCache {
   }
 
   LoadShaderIndex GetLoadShaderIndex(TextureKey key) const;
-
+  // chrispy: todo, can use simple branchless tests here
   static constexpr bool AreDimensionsCompatible(
       xenos::FetchOpDimension binding_dimension,
       xenos::DataDimension resource_dimension) {
@@ -1047,8 +1047,7 @@ bool PipelineCache::ConfigurePipeline(
   PipelineDescription& description = runtime_description.description;
 
   if (current_pipeline_ != nullptr &&
-      !std::memcmp(&current_pipeline_->description.description, &description,
-                   sizeof(description))) {
+      current_pipeline_->description.description == description) {
     *pipeline_handle_out = current_pipeline_;
     *root_signature_out = runtime_description.root_signature;
     return true;
@@ -1059,8 +1058,7 @@ bool PipelineCache::ConfigurePipeline(
   auto found_range = pipelines_.equal_range(hash);
   for (auto it = found_range.first; it != found_range.second; ++it) {
     Pipeline* found_pipeline = it->second;
-    if (!std::memcmp(&found_pipeline->description.description, &description,
-                     sizeof(description))) {
+    if (found_pipeline->description.description == description) {
       current_pipeline_ = found_pipeline;
       *pipeline_handle_out = found_pipeline;
       *root_signature_out = found_pipeline->description.root_signature;
@@ -226,6 +226,7 @@ class PipelineCache {
 
     PipelineRenderTarget render_targets[xenos::kMaxColorRenderTargets];
 
+    inline bool operator==(const PipelineDescription& other) const;
     static constexpr uint32_t kVersion = 0x20210425;
   });
 
@@ -424,7 +425,34 @@ class PipelineCache {
   size_t creation_threads_shutdown_from_ = SIZE_MAX;
   std::vector<std::unique_ptr<xe::threading::Thread>> creation_threads_;
 };
+inline bool PipelineCache::PipelineDescription::operator==(
+    const PipelineDescription& other) const {
+  constexpr size_t cmp_size = sizeof(PipelineDescription);
+#if XE_ARCH_AMD64 == 1
+  if constexpr (cmp_size == 64) {
+    if (vertex_shader_hash != other.vertex_shader_hash ||
+        vertex_shader_modification != other.vertex_shader_modification) {
+      return false;
+    }
+    const __m128i* thiz = (const __m128i*)this;
+    const __m128i* thoze = (const __m128i*)&other;
+    __m128i cmp32 =
+        _mm_cmpeq_epi8(_mm_loadu_si128(thiz + 1), _mm_loadu_si128(thoze + 1));
+
+    cmp32 = _mm_and_si128(cmp32, _mm_cmpeq_epi8(_mm_loadu_si128(thiz + 2),
+                                                _mm_loadu_si128(thoze + 2)));
+
+    cmp32 = _mm_and_si128(cmp32, _mm_cmpeq_epi8(_mm_loadu_si128(thiz + 3),
+                                                _mm_loadu_si128(thoze + 3)));
+
+    return _mm_movemask_epi8(cmp32) == 0xFFFF;
+  } else
+#endif
+  {
+    return !memcmp(this, &other, cmp_size);
+  }
+}
 }  // namespace d3d12
 }  // namespace gpu
 }  // namespace xe
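The operator== above splits the 64-byte description into a scalar early-out on the first 16 bytes (the vertex shader hash and modification, the fields most likely to differ) and a branchless SSE2 comparison of the remaining 48. The core 16-byte equality idiom it builds on, in isolation:

#include <emmintrin.h>  // SSE2

// Byte-wise lane compare yields 0xFF per equal byte; movemask packs the high
// bits, so 0xFFFF means all 16 bytes matched. Wider structs AND more lanes.
inline bool Equal16(const void* a, const void* b) {
  __m128i va = _mm_loadu_si128(reinterpret_cast<const __m128i*>(a));
  __m128i vb = _mm_loadu_si128(reinterpret_cast<const __m128i*>(b));
  return _mm_movemask_epi8(_mm_cmpeq_epi8(va, vb)) == 0xFFFF;
}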
@@ -320,22 +320,38 @@ uint32_t DrawExtentEstimator::EstimateMaxY(bool try_to_estimate_vertex_max_y,
   // scissor (it's set by Direct3D 9 when a viewport is used), on hosts, it
   // usually exists and can't be disabled.
   auto pa_cl_vte_cntl = regs.Get<reg::PA_CL_VTE_CNTL>();
+
   float viewport_bottom = 0.0f;
+  uint32_t enable_window_offset =
+      regs.Get<reg::PA_SU_SC_MODE_CNTL>().vtx_window_offset_enable;
+
+  bool not_pix_center = !regs.Get<reg::PA_SU_VTX_CNTL>().pix_center;
+
+  float window_y_offset_f = float(window_y_offset);
+
+  float yoffset = regs[XE_GPU_REG_PA_CL_VPORT_YOFFSET].f32;
+
   // First calculate all the integer.0 or integer.5 offsetting exactly at full
   // precision.
-  if (regs.Get<reg::PA_SU_SC_MODE_CNTL>().vtx_window_offset_enable) {
-    viewport_bottom += float(window_y_offset);
+  // chrispy: branch mispredicts here causing some pain according to vtune
+  float sm1 = .0f, sm2 = .0f, sm3 = .0f, sm4 = .0f;
+
+  if (enable_window_offset) {
+    sm1 = window_y_offset_f;
   }
-  if (!regs.Get<reg::PA_SU_VTX_CNTL>().pix_center) {
-    viewport_bottom += 0.5f;
+  if (not_pix_center) {
+    sm2 = 0.5f;
   }
   // Then apply the floating-point viewport offset.
   if (pa_cl_vte_cntl.vport_y_offset_ena) {
-    viewport_bottom += regs[XE_GPU_REG_PA_CL_VPORT_YOFFSET].f32;
+    sm3 = yoffset;
   }
-  viewport_bottom += pa_cl_vte_cntl.vport_y_scale_ena
-                         ? std::abs(regs[XE_GPU_REG_PA_CL_VPORT_YSCALE].f32)
-                         : 1.0f;
+  sm4 = pa_cl_vte_cntl.vport_y_scale_ena
+            ? std::abs(regs[XE_GPU_REG_PA_CL_VPORT_YSCALE].f32)
+            : 1.0f;
+
+  viewport_bottom = sm1 + sm2 + sm3 + sm4;
+
   // Using floor, or, rather, truncation (because maxing with zero anyway)
   // similar to how viewport scissoring behaves on real AMD, Intel and Nvidia
   // GPUs on Direct3D 12 (but not WARP), also like in
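The transformation above replaces conditionally-taken additions with an unconditional sum of terms, each forced to 0.0f when its condition is false; value-selecting ternaries like these typically lower to cmov/blend instructions rather than predicted branches. The pattern in miniature (illustrative helper, not from the diff):

// Branch-heavy form:          Branchless form used above:
//   if (c1) sum += v1;          float t1 = c1 ? v1 : 0.0f;  // cmov/blend
//   if (c2) sum += v2;          float t2 = c2 ? v2 : 0.0f;
//                               sum = t1 + t2;
inline float SumSelected(bool c1, float v1, bool c2, float v2) {
  float t1 = c1 ? v1 : 0.0f;
  float t2 = c2 ? v2 : 0.0f;
  return t1 + t2;  // one unconditional add chain, no mispredicts
}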
@@ -929,8 +929,8 @@ bool GetResolveInfo(const RegisterFile& regs, const Memory& memory,
       XELOGW(
           "Resolving to format {}, which is untested - treating like {}. "
           "Report the game to Xenia developers!",
-          FormatInfo::Get(dest_format)->name,
-          FormatInfo::Get(dest_closest_format)->name);
+          FormatInfo::GetName(dest_format),
+          FormatInfo::GetName(dest_closest_format));
     }
   }
 
@@ -1002,7 +1002,7 @@ bool GetResolveInfo(const RegisterFile& regs, const Memory& memory,
     }
   } else {
     XELOGE("Tried to resolve to format {}, which is not a ColorFormat",
-           dest_format_info.name);
+           FormatInfo::GetName(dest_format));
     copy_dest_extent_start = copy_dest_base_adjusted;
     copy_dest_extent_end = copy_dest_base_adjusted;
   }
@@ -1117,7 +1117,7 @@ bool GetResolveInfo(const RegisterFile& regs, const Memory& memory,
               xenos::DepthRenderTargetFormat(depth_edram_info.format))
         : xenos::GetColorRenderTargetFormatName(
               xenos::ColorRenderTargetFormat(color_edram_info.format)),
-      dest_format_info.name, rb_copy_dest_base, copy_dest_extent_start,
+      FormatInfo::GetName(dest_format), rb_copy_dest_base,
+      copy_dest_extent_start,
       copy_dest_extent_end);
 
   return true;
@@ -0,0 +1,106 @@
+
+void ExecuteIndirectBuffer(uint32_t ptr, uint32_t count) XE_RESTRICT;
+
+virtual bool ExecutePacket();
+XE_NOINLINE
+bool ExecutePacketType0(uint32_t packet) XE_RESTRICT;
+XE_NOINLINE
+bool ExecutePacketType1(uint32_t packet) XE_RESTRICT;
+
+bool ExecutePacketType2(uint32_t packet) XE_RESTRICT;
+XE_NOINLINE
+bool ExecutePacketType3(uint32_t packet) XE_RESTRICT;
+XE_NOINLINE
+bool ExecutePacketType3_ME_INIT(uint32_t packet, uint32_t count) XE_RESTRICT;
+bool ExecutePacketType3_NOP(uint32_t packet, uint32_t count) XE_RESTRICT;
+XE_NOINLINE
+bool ExecutePacketType3_INTERRUPT(uint32_t packet, uint32_t count) XE_RESTRICT;
+XE_NOINLINE
+bool ExecutePacketType3_XE_SWAP(uint32_t packet, uint32_t count) XE_RESTRICT;
+
+bool ExecutePacketType3_INDIRECT_BUFFER(uint32_t packet,
+                                        uint32_t count) XE_RESTRICT;
+XE_NOINLINE
+bool ExecutePacketType3_WAIT_REG_MEM(uint32_t packet,
+                                     uint32_t count) XE_RESTRICT;
+XE_NOINLINE
+bool ExecutePacketType3_REG_RMW(uint32_t packet, uint32_t count) XE_RESTRICT;
+
+bool ExecutePacketType3_REG_TO_MEM(uint32_t packet,
+                                   uint32_t count) XE_RESTRICT;
+XE_NOINLINE
+bool ExecutePacketType3_MEM_WRITE(uint32_t packet, uint32_t count) XE_RESTRICT;
+XE_NOINLINE
+bool ExecutePacketType3_COND_WRITE(uint32_t packet,
+                                   uint32_t count) XE_RESTRICT;
+
+bool ExecutePacketType3_EVENT_WRITE(uint32_t packet,
+                                    uint32_t count) XE_RESTRICT;
+XE_NOINLINE
+bool ExecutePacketType3_EVENT_WRITE_SHD(uint32_t packet,
+                                        uint32_t count) XE_RESTRICT;
+
+bool ExecutePacketType3_EVENT_WRITE_EXT(uint32_t packet,
+                                        uint32_t count) XE_RESTRICT;
+XE_NOINLINE
+bool ExecutePacketType3_EVENT_WRITE_ZPD(uint32_t packet,
+                                        uint32_t count) XE_RESTRICT;
+
+bool ExecutePacketType3Draw(uint32_t packet, const char* opcode_name,
+                            uint32_t viz_query_condition,
+                            uint32_t count_remaining) XE_RESTRICT;
+
+bool ExecutePacketType3_DRAW_INDX(uint32_t packet, uint32_t count) XE_RESTRICT;
+
+bool ExecutePacketType3_DRAW_INDX_2(uint32_t packet,
+                                    uint32_t count) XE_RESTRICT;
+XE_FORCEINLINE
+bool ExecutePacketType3_SET_CONSTANT(uint32_t packet,
+                                     uint32_t count) XE_RESTRICT;
+XE_NOINLINE
+bool ExecutePacketType3_SET_CONSTANT2(uint32_t packet,
+                                      uint32_t count) XE_RESTRICT;
+XE_FORCEINLINE
+bool ExecutePacketType3_LOAD_ALU_CONSTANT(uint32_t packet,
+                                          uint32_t count) XE_RESTRICT;
+
+bool ExecutePacketType3_SET_SHADER_CONSTANTS(uint32_t packet,
+                                             uint32_t count) XE_RESTRICT;
+
+bool ExecutePacketType3_IM_LOAD(uint32_t packet, uint32_t count) XE_RESTRICT;
+
+bool ExecutePacketType3_IM_LOAD_IMMEDIATE(uint32_t packet,
+                                          uint32_t count) XE_RESTRICT;
+
+bool ExecutePacketType3_INVALIDATE_STATE(uint32_t packet,
+                                         uint32_t count) XE_RESTRICT;
+
+bool ExecutePacketType3_VIZ_QUERY(uint32_t packet, uint32_t count) XE_RESTRICT;
+
+XE_FORCEINLINE
+void WriteEventInitiator(uint32_t value) XE_RESTRICT;
+
+XE_NOINLINE
+XE_COLD
+bool HitUnimplementedOpcode(uint32_t opcode, uint32_t count) XE_RESTRICT;
+
+XE_NOINLINE
+XE_NOALIAS
+uint32_t GetCurrentRingReadCount();
+
+XE_NOINLINE
+XE_COLD
+bool ExecutePacketType3_CountOverflow(uint32_t count);
File diff suppressed because it is too large
@@ -233,15 +233,27 @@ void SharedMemory::FireWatches(uint32_t page_first, uint32_t page_last,
   // Fire per-range watches.
   for (uint32_t i = bucket_first; i <= bucket_last; ++i) {
     WatchNode* node = watch_buckets_[i];
+    if (i + 1 <= bucket_last) {
+      WatchNode* nextnode = watch_buckets_[i + 1];
+      if (nextnode) {
+        swcache::PrefetchL1(nextnode->range);
+      }
+    }
     while (node != nullptr) {
       WatchRange* range = node->range;
       // Store the next node now since when the callback is triggered, the
       // links will be broken.
       node = node->bucket_node_next;
+      if (node) {
+        swcache::PrefetchL1(node);
+      }
       if (page_first <= range->page_last && page_last >= range->page_first) {
         range->callback(global_lock, range->callback_context,
                         range->callback_data, range->callback_argument,
                         invalidated_by_gpu);
+        if (node && node->range) {
+          swcache::PrefetchL1(node->range);
+        }
         UnlinkWatchRange(range);
       }
     }
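Each prefetch above hides the latency of the next pointer-chase step behind the current callback's work. A generic sketch of the pattern with compiler builtins (swcache::PrefetchL1 is xenia's wrapper; __builtin_prefetch is the GCC/Clang spelling and purely a hint, so the loop is correct with or without it):

// Prefetch the next node and its payload while processing the current one.
struct Node {
  Node* next;
  void* payload;
};
inline void VisitAll(Node* node, void (*visit)(void*)) {
  while (node) {
    Node* next = node->next;
    if (next) {
      __builtin_prefetch(next);           // the next link
      __builtin_prefetch(next->payload);  // the payload touched next iteration
    }
    visit(node->payload);
    node = next;
  }
}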
@@ -440,7 +440,7 @@ void TextureCache::TextureKey::LogAction(const char* action) const {
       "base at 0x{:08X} (pitch {}), mips at 0x{:08X}",
       action, tiled ? "tiled" : "linear", scaled_resolve ? "scaled " : "",
       GetWidth(), GetHeight(), GetDepthOrArraySize(), GetLogDimensionName(),
-      FormatInfo::Get(format)->name, mip_max_level + 1, packed_mips ? "" : "un",
+      FormatInfo::GetName(format), mip_max_level + 1, packed_mips ? "" : "un",
       mip_max_level != 0 ? "s" : "", base_page << 12, pitch << 5,
       mip_page << 12);
 }
@@ -453,7 +453,7 @@ void TextureCache::Texture::LogAction(const char* action) const {
       action, key_.tiled ? "tiled" : "linear",
       key_.scaled_resolve ? "scaled " : "", key_.GetWidth(), key_.GetHeight(),
       key_.GetDepthOrArraySize(), key_.GetLogDimensionName(),
-      FormatInfo::Get(key_.format)->name, key_.mip_max_level + 1,
+      FormatInfo::GetName(key_.format), key_.mip_max_level + 1,
       key_.packed_mips ? "" : "un", key_.mip_max_level != 0 ? "s" : "",
       key_.base_page << 12, key_.pitch << 5, GetGuestBaseSize(),
       key_.mip_page << 12, GetGuestMipsSize());
@@ -128,6 +128,14 @@ class TextureCache {
     return (binding->texture && binding->texture->IsResolved()) ||
            (binding->texture_signed && binding->texture_signed->IsResolved());
   }
+  template <swcache::PrefetchTag tag>
+  void PrefetchTextureBinding(uint32_t fetch_constant_index) const {
+    swcache::Prefetch<tag>(&texture_bindings_[fetch_constant_index]);
+    swcache::Prefetch<tag>(
+        &texture_bindings_[fetch_constant_index +
+                           1]);  // we may cross a cache line boundary :( size
+                                 // of the structure is 0x28
+  }

  protected:
  struct TextureKey {
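PrefetchTextureBinding is parameterized on the prefetch tag so the caller chooses which cache level to warm. A hedged usage sketch; the enumerator name below is an assumption, since the actual tag names live in swcache and are not shown in this diff:

// Warm the binding for fetch constant `i` before the draw-setup code reads it.
// swcache::PrefetchTag::Level1 is a guessed enumerator name.
texture_cache.PrefetchTextureBinding<swcache::PrefetchTag::Level1>(i);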
@@ -85,7 +85,7 @@ void TextureDump(const TextureInfo& src, void* buffer, size_t length) {
       assert_unhandled_case(src.format);
       std::memset(&dds_header.pixel_format, 0xCD,
                   sizeof(dds_header.pixel_format));
-      XELOGW("Skipping {} for texture dump.", src.format_info()->name);
+      XELOGW("Skipping {} for texture dump.", src.format_name());
       return;
     }
   }
@@ -96,7 +96,7 @@ void TextureDump(const TextureInfo& src, void* buffer, size_t length) {
   std::filesystem::path path = "texture_dumps";
   path /= fmt::format("{:05d}_{:08X}_{:08X}_{:08X}.dds", dump_counter++,
                       src.memory.base_address, src.memory.mip_address,
-                      src.format_info()->name);
+                      src.format_name());

   FILE* handle = filesystem::OpenFile(path, "wb");
   if (handle) {
@@ -159,151 +159,6 @@ void TextureInfo::GetMipSize(uint32_t mip, uint32_t* out_width,
   *out_height = std::max(height_pow2 >> mip, 1u);
 }

-uint32_t TextureInfo::GetMipLocation(uint32_t mip, uint32_t* offset_x,
-                                     uint32_t* offset_y, bool is_guest) const {
-  if (mip == 0) {
-    // Short-circuit. Mip 0 is always stored in base_address.
-    if (!has_packed_mips) {
-      *offset_x = 0;
-      *offset_y = 0;
-    } else {
-      GetPackedTileOffset(0, offset_x, offset_y);
-    }
-    return memory.base_address;
-  }
-
-  if (!memory.mip_address) {
-    // Short-circuit. There is no mip data.
-    *offset_x = 0;
-    *offset_y = 0;
-    return 0;
-  }
-
-  uint32_t address_base, address_offset;
-  address_base = memory.mip_address;
-  address_offset = 0;
-
-  auto bytes_per_block = format_info()->bytes_per_block();
-
-  if (!has_packed_mips) {
-    for (uint32_t i = 1; i < mip; i++) {
-      address_offset +=
-          GetMipExtent(i, is_guest).all_blocks() * bytes_per_block;
-    }
-    *offset_x = 0;
-    *offset_y = 0;
-    return address_base + address_offset;
-  }
-
-  uint32_t width_pow2 = xe::next_pow2(width + 1);
-  uint32_t height_pow2 = xe::next_pow2(height + 1);
-
-  // Walk forward to find the address of the mip.
-  uint32_t packed_mip_base = 1;
-  for (uint32_t i = packed_mip_base; i < mip; i++, packed_mip_base++) {
-    uint32_t mip_width = std::max(width_pow2 >> i, 1u);
-    uint32_t mip_height = std::max(height_pow2 >> i, 1u);
-    if (std::min(mip_width, mip_height) <= 16) {
-      // We've reached the point where the mips are packed into a single tile.
-      break;
-    }
-    address_offset += GetMipExtent(i, is_guest).all_blocks() * bytes_per_block;
-  }
-
-  // Now, check if the mip is packed at an offset.
-  GetPackedTileOffset(width_pow2 >> mip, height_pow2 >> mip, format_info(),
-                      mip - packed_mip_base, offset_x, offset_y);
-  return address_base + address_offset;
-}
-
-bool TextureInfo::GetPackedTileOffset(uint32_t width, uint32_t height,
-                                      const FormatInfo* format_info,
-                                      int packed_tile, uint32_t* offset_x,
-                                      uint32_t* offset_y) {
-  // Tile size is 32x32, and once textures go <=16 they are packed into a
-  // single tile together. The math here is insane. Most sourced
-  // from graph paper and looking at dds dumps.
-  //   0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
-  // 0 +.4x4.+ +.....8x8.....+ +............16x16............+
-  // 1 +.4x4.+ +.....8x8.....+ +............16x16............+
-  // 2 +.4x4.+ +.....8x8.....+ +............16x16............+
-  // 3 +.4x4.+ +.....8x8.....+ +............16x16............+
-  // 4 x       +.....8x8.....+ +............16x16............+
-  // 5         +.....8x8.....+ +............16x16............+
-  // 6         +.....8x8.....+ +............16x16............+
-  // 7         +.....8x8.....+ +............16x16............+
-  // 8 2x2                     +............16x16............+
-  // 9 2x2                     +............16x16............+
-  // 0                         +............16x16............+
-  // ...                       .....
-  // This only works for square textures, or textures that are some non-pot
-  // <= square. As soon as the aspect ratio goes weird, the textures start to
-  // stretch across tiles.
-  //
-  // The 2x2 and 1x1 squares are packed in their specific positions because
-  // each square is the size of at least one block (which is 4x4 pixels max)
-  //
-  // if (tile_aligned(w) > tile_aligned(h)) {
-  //   // wider than tall, so packed horizontally
-  // } else if (tile_aligned(w) < tile_aligned(h)) {
-  //   // taller than wide, so packed vertically
-  // } else {
-  //   square
-  // }
-  // It's important to use logical sizes here, as the input sizes will be
-  // for the entire packed tile set, not the actual texture.
-  // The minimum dimension is what matters most: if either width or height
-  // is <= 16 this mode kicks in.
-
-  uint32_t log2_width = xe::log2_ceil(width);
-  uint32_t log2_height = xe::log2_ceil(height);
-  if (std::min(log2_width, log2_height) > 4) {
-    // Too big, not packed.
-    *offset_x = 0;
-    *offset_y = 0;
-    return false;
-  }
-
-  // Find the block offset of the mip.
-  if (packed_tile < 3) {
-    if (log2_width > log2_height) {
-      // Wider than tall. Laid out vertically.
-      *offset_x = 0;
-      *offset_y = 16 >> packed_tile;
-    } else {
-      // Taller than wide. Laid out horizontally.
-      *offset_x = 16 >> packed_tile;
-      *offset_y = 0;
-    }
-  } else {
-    if (log2_width > log2_height) {
-      // Wider than tall. Laid out vertically.
-      *offset_x = 16 >> (packed_tile - 2);
-      *offset_y = 0;
-    } else {
-      // Taller than wide. Laid out horizontally.
-      *offset_x = 0;
-      *offset_y = 16 >> (packed_tile - 2);
-    }
-  }
-
-  *offset_x /= format_info->block_width;
-  *offset_y /= format_info->block_height;
-  return true;
-}
-
-bool TextureInfo::GetPackedTileOffset(int packed_tile, uint32_t* offset_x,
-                                      uint32_t* offset_y) const {
-  if (!has_packed_mips) {
-    *offset_x = 0;
-    *offset_y = 0;
-    return false;
-  }
-  return GetPackedTileOffset(xe::next_pow2(width + 1),
-                             xe::next_pow2(height + 1), format_info(),
-                             packed_tile, offset_x, offset_y);
-}
-
 uint64_t TextureInfo::hash() const {
   return XXH3_64bits(this, sizeof(TextureInfo));
 }
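A worked pass through the packed-tile math removed above, for a square power-of-two texture with 1x1 blocks: the 16x16 mip is packed_tile 0, so the `packed_tile < 3` branch gives offset_x = 16 >> 0 = 16; the 8x8 mip (tile 1) lands at x = 8 and the 4x4 mip (tile 2) at x = 4. The 2x2 mip is tile 3 and takes the other branch, giving offset_y = 16 >> (3 - 2) = 8, and the 1x1 mip (tile 4) sits at offset_y = 16 >> 2 = 4. Those positions match the column/row placements in the ASCII diagram; the final division by block_width/block_height converts the pixel offsets into block coordinates for compressed formats.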
@@ -181,7 +181,7 @@ inline xenos::TextureFormat DepthRenderTargetToTextureFormat(
   }
 }

-enum class FormatType {
+enum class FormatType : uint32_t {
   // Uncompressed, and is also a ColorFormat.
   kResolvable,
   // Uncompressed, but resolve or memory export cannot be done to the format.
@@ -190,12 +190,12 @@ enum class FormatType {
 };

 struct FormatInfo {
-  xenos::TextureFormat format;
-  const char* name;
-  FormatType type;
-  uint32_t block_width;
-  uint32_t block_height;
-  uint32_t bits_per_pixel;
+  const xenos::TextureFormat format;
+
+  const FormatType type;
+  const uint32_t block_width;
+  const uint32_t block_height;
+  const uint32_t bits_per_pixel;

   uint32_t bytes_per_block() const {
     return block_width * block_height * bits_per_pixel / 8;
@@ -203,6 +203,20 @@ struct FormatInfo {

   static const FormatInfo* Get(uint32_t gpu_format);

+  static const char* GetName(uint32_t gpu_format);
+  static const char* GetName(xenos::TextureFormat format) {
+    return GetName(static_cast<uint32_t>(format));
+  }
+
+  static unsigned char GetWidthShift(uint32_t gpu_format);
+  static unsigned char GetHeightShift(uint32_t gpu_format);
+
+  static unsigned char GetWidthShift(xenos::TextureFormat gpu_format) {
+    return GetWidthShift(static_cast<uint32_t>(gpu_format));
+  }
+  static unsigned char GetHeightShift(xenos::TextureFormat gpu_format) {
+    return GetHeightShift(static_cast<uint32_t>(gpu_format));
+  }
   static const FormatInfo* Get(xenos::TextureFormat format) {
     return Get(static_cast<uint32_t>(format));
   }
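The point of the static GetName/GetWidthShift/GetHeightShift overloads is that callers who only need one property no longer have to load the whole FormatInfo record. Sketch of the intended call-site change (the "before" form no longer compiles after this commit, since the name field was removed from the struct):

// Before: dereferences the FormatInfo entry just to read one field.
const char* n0 = FormatInfo::Get(format)->name;
// After: a direct index into a dedicated 64-entry name table.
const char* n1 = FormatInfo::GetName(format);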
@@ -259,7 +273,9 @@ struct TextureInfo {
   const FormatInfo* format_info() const {
     return FormatInfo::Get(static_cast<uint32_t>(format));
   }
+  const char* format_name() const {
+    return FormatInfo::GetName(static_cast<uint32_t>(format));
+  }
   bool is_compressed() const {
     return format_info()->type == FormatType::kCompressed;
   }
@@ -281,18 +297,6 @@ struct TextureInfo {

   void GetMipSize(uint32_t mip, uint32_t* width, uint32_t* height) const;

-  // Get the memory location of a mip. offset_x and offset_y are in blocks.
-  uint32_t GetMipLocation(uint32_t mip, uint32_t* offset_x, uint32_t* offset_y,
-                          bool is_guest) const;
-
-  static bool GetPackedTileOffset(uint32_t width, uint32_t height,
-                                  const FormatInfo* format_info,
-                                  int packed_tile, uint32_t* offset_x,
-                                  uint32_t* offset_y);
-
-  bool GetPackedTileOffset(int packed_tile, uint32_t* offset_x,
-                           uint32_t* offset_y) const;
-
   uint64_t hash() const;
   bool operator==(const TextureInfo& other) const {
     return std::memcmp(this, &other, sizeof(TextureInfo)) == 0;
@@ -17,77 +17,60 @@ namespace gpu {
 using namespace xe::gpu::xenos;

 #define FORMAT_INFO(texture_format, format, block_width, block_height, bits_per_pixel) \
-  {xenos::TextureFormat::texture_format, #texture_format, FormatType::format, block_width, block_height, bits_per_pixel}
+  {xenos::TextureFormat::texture_format, FormatType::format, block_width, block_height, bits_per_pixel}
 const FormatInfo* FormatInfo::Get(uint32_t gpu_format) {
   static const FormatInfo format_infos[64] = {
-      FORMAT_INFO(k_1_REVERSE, kUncompressed, 1, 1, 1),
-      FORMAT_INFO(k_1, kUncompressed, 1, 1, 1),
-      FORMAT_INFO(k_8, kResolvable, 1, 1, 8),
-      FORMAT_INFO(k_1_5_5_5, kResolvable, 1, 1, 16),
-      FORMAT_INFO(k_5_6_5, kResolvable, 1, 1, 16),
-      FORMAT_INFO(k_6_5_5, kResolvable, 1, 1, 16),
-      FORMAT_INFO(k_8_8_8_8, kResolvable, 1, 1, 32),
-      FORMAT_INFO(k_2_10_10_10, kResolvable, 1, 1, 32),
-      FORMAT_INFO(k_8_A, kResolvable, 1, 1, 8),
-      FORMAT_INFO(k_8_B, kResolvable, 1, 1, 8),
-      FORMAT_INFO(k_8_8, kResolvable, 1, 1, 16),
-      FORMAT_INFO(k_Cr_Y1_Cb_Y0_REP, kCompressed, 2, 1, 16),
-      FORMAT_INFO(k_Y1_Cr_Y0_Cb_REP, kCompressed, 2, 1, 16),
-      FORMAT_INFO(k_16_16_EDRAM, kUncompressed, 1, 1, 32),
-      FORMAT_INFO(k_8_8_8_8_A, kResolvable, 1, 1, 32),
-      FORMAT_INFO(k_4_4_4_4, kResolvable, 1, 1, 16),
-      FORMAT_INFO(k_10_11_11, kResolvable, 1, 1, 32),
-      FORMAT_INFO(k_11_11_10, kResolvable, 1, 1, 32),
-      FORMAT_INFO(k_DXT1, kCompressed, 4, 4, 4),
-      FORMAT_INFO(k_DXT2_3, kCompressed, 4, 4, 8),
-      FORMAT_INFO(k_DXT4_5, kCompressed, 4, 4, 8),
-      FORMAT_INFO(k_16_16_16_16_EDRAM, kUncompressed, 1, 1, 64),
-      FORMAT_INFO(k_24_8, kUncompressed, 1, 1, 32),
-      FORMAT_INFO(k_24_8_FLOAT, kUncompressed, 1, 1, 32),
-      FORMAT_INFO(k_16, kResolvable, 1, 1, 16),
-      FORMAT_INFO(k_16_16, kResolvable, 1, 1, 32),
-      FORMAT_INFO(k_16_16_16_16, kResolvable, 1, 1, 64),
-      FORMAT_INFO(k_16_EXPAND, kUncompressed, 1, 1, 16),
-      FORMAT_INFO(k_16_16_EXPAND, kUncompressed, 1, 1, 32),
-      FORMAT_INFO(k_16_16_16_16_EXPAND, kUncompressed, 1, 1, 64),
-      FORMAT_INFO(k_16_FLOAT, kResolvable, 1, 1, 16),
-      FORMAT_INFO(k_16_16_FLOAT, kResolvable, 1, 1, 32),
-      FORMAT_INFO(k_16_16_16_16_FLOAT, kResolvable, 1, 1, 64),
-      FORMAT_INFO(k_32, kUncompressed, 1, 1, 32),
-      FORMAT_INFO(k_32_32, kUncompressed, 1, 1, 64),
-      FORMAT_INFO(k_32_32_32_32, kUncompressed, 1, 1, 128),
-      FORMAT_INFO(k_32_FLOAT, kResolvable, 1, 1, 32),
-      FORMAT_INFO(k_32_32_FLOAT, kResolvable, 1, 1, 64),
-      FORMAT_INFO(k_32_32_32_32_FLOAT, kResolvable, 1, 1, 128),
-      FORMAT_INFO(k_32_AS_8, kCompressed, 4, 1, 8),
-      FORMAT_INFO(k_32_AS_8_8, kCompressed, 2, 1, 16),
-      FORMAT_INFO(k_16_MPEG, kUncompressed, 1, 1, 16),
-      FORMAT_INFO(k_16_16_MPEG, kUncompressed, 1, 1, 32),
-      FORMAT_INFO(k_8_INTERLACED, kUncompressed, 1, 1, 8),
-      FORMAT_INFO(k_32_AS_8_INTERLACED, kCompressed, 4, 1, 8),
-      FORMAT_INFO(k_32_AS_8_8_INTERLACED, kCompressed, 1, 1, 16),
-      FORMAT_INFO(k_16_INTERLACED, kUncompressed, 1, 1, 16),
-      FORMAT_INFO(k_16_MPEG_INTERLACED, kUncompressed, 1, 1, 16),
-      FORMAT_INFO(k_16_16_MPEG_INTERLACED, kUncompressed, 1, 1, 32),
-      FORMAT_INFO(k_DXN, kCompressed, 4, 4, 8),
-      FORMAT_INFO(k_8_8_8_8_AS_16_16_16_16, kResolvable, 1, 1, 32),
-      FORMAT_INFO(k_DXT1_AS_16_16_16_16, kCompressed, 4, 4, 4),
-      FORMAT_INFO(k_DXT2_3_AS_16_16_16_16, kCompressed, 4, 4, 8),
-      FORMAT_INFO(k_DXT4_5_AS_16_16_16_16, kCompressed, 4, 4, 8),
-      FORMAT_INFO(k_2_10_10_10_AS_16_16_16_16, kResolvable, 1, 1, 32),
-      FORMAT_INFO(k_10_11_11_AS_16_16_16_16, kResolvable, 1, 1, 32),
-      FORMAT_INFO(k_11_11_10_AS_16_16_16_16, kResolvable, 1, 1, 32),
-      FORMAT_INFO(k_32_32_32_FLOAT, kUncompressed, 1, 1, 96),
-      FORMAT_INFO(k_DXT3A, kCompressed, 4, 4, 4),
-      FORMAT_INFO(k_DXT5A, kCompressed, 4, 4, 4),
-      FORMAT_INFO(k_CTX1, kCompressed, 4, 4, 4),
-      FORMAT_INFO(k_DXT3A_AS_1_1_1_1, kCompressed, 4, 4, 4),
-      FORMAT_INFO(k_8_8_8_8_GAMMA_EDRAM, kUncompressed, 1, 1, 32),
-      FORMAT_INFO(k_2_10_10_10_FLOAT_EDRAM, kUncompressed, 1, 1, 32),
+#include "texture_info_formats.inl"
   };
   return &format_infos[gpu_format];
 }
 #undef FORMAT_INFO

+constexpr unsigned char GetShift(unsigned pow) {
+  unsigned char sh = 0;
+  while (!(pow & 1)) {
+    pow >>= 1;
+    sh++;
+  }
+  return sh;
+}
+/*
+  todo: GetWidthShift and GetHeightShift should not need a full 64-byte table
+  each. There are only 15 elements for GetWidthShift where the shift is not 0,
+  the max shift that will be returned is 2, and there are 64 elements total.
+  This means we can use a boolean table that also acts as a sparse indexer
+  (popcnt the preceding bits to get the index) and then shift and mask a
+  32-bit word to get the shift.
+*/
+unsigned char FormatInfo::GetWidthShift(uint32_t gpu_format) {
+#define FORMAT_INFO(texture_format, format, block_width, block_height, bits_per_pixel) \
+  GetShift(block_width)
+  alignas(XE_HOST_CACHE_LINE_SIZE) constexpr unsigned char wshift_table[64] = {
+#include "texture_info_formats.inl"
+  };
+#undef FORMAT_INFO
+  return wshift_table[gpu_format];
+}
+unsigned char FormatInfo::GetHeightShift(uint32_t gpu_format) {
+#define FORMAT_INFO(texture_format, format, block_width, block_height, bits_per_pixel) \
+  GetShift(block_height)
+  alignas(XE_HOST_CACHE_LINE_SIZE) constexpr unsigned char hshift_table[64] = {
+#include "texture_info_formats.inl"
+  };
+#undef FORMAT_INFO
+  return hshift_table[gpu_format];
+}
+#define FORMAT_INFO(texture_format, ...) #texture_format
+static constexpr const char* const format_name_table[64] = {
+#include "texture_info_formats.inl"
+};
+#undef FORMAT_INFO
+const char* FormatInfo::GetName(uint32_t gpu_format) {
+  return format_name_table[gpu_format];
+}
 }  // namespace gpu
 }  // namespace xe
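The todo above sketches replacing each 64-byte shift table with a 64-bit occupancy mask plus a small packed array: the popcount of the mask bits below the queried index gives the slot in the dense array. A minimal sketch of that idea; the mask and packed-shift values are placeholders that would really be generated from the same .inl data, and the names are hypothetical:

#include <cstdint>
#if defined(_MSC_VER)
#include <intrin.h>
#endif

// Hypothetical precomputed data: bit i of kWidthShiftMask is set iff format i
// has a nonzero width shift; kPackedWidthShifts lists those 15 nonzero shifts
// in ascending format order (each is at most 2, so 2 bits apiece would even
// fit in one 32-bit word).
static constexpr uint64_t kWidthShiftMask = 0ull;           // placeholder
static constexpr unsigned char kPackedWidthShifts[15] = {}; // placeholder

static inline unsigned Popcount64(uint64_t v) {
#if defined(_MSC_VER)
  return static_cast<unsigned>(__popcnt64(v));
#else
  return static_cast<unsigned>(__builtin_popcountll(v));
#endif
}

unsigned char GetWidthShiftSparse(uint32_t gpu_format) {
  uint64_t bit = 1ull << gpu_format;
  if (!(kWidthShiftMask & bit)) {
    return 0;  // common case: the shift is zero
  }
  // Rank of this format among the set bits below it = index into the
  // packed array.
  unsigned idx = Popcount64(kWidthShiftMask & (bit - 1));
  return kPackedWidthShifts[idx];
}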
@@ -0,0 +1,64 @@
FORMAT_INFO(k_1_REVERSE, kUncompressed, 1, 1, 1),
FORMAT_INFO(k_1, kUncompressed, 1, 1, 1),
FORMAT_INFO(k_8, kResolvable, 1, 1, 8),
FORMAT_INFO(k_1_5_5_5, kResolvable, 1, 1, 16),
FORMAT_INFO(k_5_6_5, kResolvable, 1, 1, 16),
FORMAT_INFO(k_6_5_5, kResolvable, 1, 1, 16),
FORMAT_INFO(k_8_8_8_8, kResolvable, 1, 1, 32),
FORMAT_INFO(k_2_10_10_10, kResolvable, 1, 1, 32),
FORMAT_INFO(k_8_A, kResolvable, 1, 1, 8),
FORMAT_INFO(k_8_B, kResolvable, 1, 1, 8),
FORMAT_INFO(k_8_8, kResolvable, 1, 1, 16),
FORMAT_INFO(k_Cr_Y1_Cb_Y0_REP, kCompressed, 2, 1, 16),
FORMAT_INFO(k_Y1_Cr_Y0_Cb_REP, kCompressed, 2, 1, 16),
FORMAT_INFO(k_16_16_EDRAM, kUncompressed, 1, 1, 32),
FORMAT_INFO(k_8_8_8_8_A, kResolvable, 1, 1, 32),
FORMAT_INFO(k_4_4_4_4, kResolvable, 1, 1, 16),
FORMAT_INFO(k_10_11_11, kResolvable, 1, 1, 32),
FORMAT_INFO(k_11_11_10, kResolvable, 1, 1, 32),
FORMAT_INFO(k_DXT1, kCompressed, 4, 4, 4),
FORMAT_INFO(k_DXT2_3, kCompressed, 4, 4, 8),
FORMAT_INFO(k_DXT4_5, kCompressed, 4, 4, 8),
FORMAT_INFO(k_16_16_16_16_EDRAM, kUncompressed, 1, 1, 64),
FORMAT_INFO(k_24_8, kUncompressed, 1, 1, 32),
FORMAT_INFO(k_24_8_FLOAT, kUncompressed, 1, 1, 32),
FORMAT_INFO(k_16, kResolvable, 1, 1, 16),
FORMAT_INFO(k_16_16, kResolvable, 1, 1, 32),
FORMAT_INFO(k_16_16_16_16, kResolvable, 1, 1, 64),
FORMAT_INFO(k_16_EXPAND, kUncompressed, 1, 1, 16),
FORMAT_INFO(k_16_16_EXPAND, kUncompressed, 1, 1, 32),
FORMAT_INFO(k_16_16_16_16_EXPAND, kUncompressed, 1, 1, 64),
FORMAT_INFO(k_16_FLOAT, kResolvable, 1, 1, 16),
FORMAT_INFO(k_16_16_FLOAT, kResolvable, 1, 1, 32),
FORMAT_INFO(k_16_16_16_16_FLOAT, kResolvable, 1, 1, 64),
FORMAT_INFO(k_32, kUncompressed, 1, 1, 32),
FORMAT_INFO(k_32_32, kUncompressed, 1, 1, 64),
FORMAT_INFO(k_32_32_32_32, kUncompressed, 1, 1, 128),
FORMAT_INFO(k_32_FLOAT, kResolvable, 1, 1, 32),
FORMAT_INFO(k_32_32_FLOAT, kResolvable, 1, 1, 64),
FORMAT_INFO(k_32_32_32_32_FLOAT, kResolvable, 1, 1, 128),
FORMAT_INFO(k_32_AS_8, kCompressed, 4, 1, 8),
FORMAT_INFO(k_32_AS_8_8, kCompressed, 2, 1, 16),
FORMAT_INFO(k_16_MPEG, kUncompressed, 1, 1, 16),
FORMAT_INFO(k_16_16_MPEG, kUncompressed, 1, 1, 32),
FORMAT_INFO(k_8_INTERLACED, kUncompressed, 1, 1, 8),
FORMAT_INFO(k_32_AS_8_INTERLACED, kCompressed, 4, 1, 8),
FORMAT_INFO(k_32_AS_8_8_INTERLACED, kCompressed, 1, 1, 16),
FORMAT_INFO(k_16_INTERLACED, kUncompressed, 1, 1, 16),
FORMAT_INFO(k_16_MPEG_INTERLACED, kUncompressed, 1, 1, 16),
FORMAT_INFO(k_16_16_MPEG_INTERLACED, kUncompressed, 1, 1, 32),
FORMAT_INFO(k_DXN, kCompressed, 4, 4, 8),
FORMAT_INFO(k_8_8_8_8_AS_16_16_16_16, kResolvable, 1, 1, 32),
FORMAT_INFO(k_DXT1_AS_16_16_16_16, kCompressed, 4, 4, 4),
FORMAT_INFO(k_DXT2_3_AS_16_16_16_16, kCompressed, 4, 4, 8),
FORMAT_INFO(k_DXT4_5_AS_16_16_16_16, kCompressed, 4, 4, 8),
FORMAT_INFO(k_2_10_10_10_AS_16_16_16_16, kResolvable, 1, 1, 32),
FORMAT_INFO(k_10_11_11_AS_16_16_16_16, kResolvable, 1, 1, 32),
FORMAT_INFO(k_11_11_10_AS_16_16_16_16, kResolvable, 1, 1, 32),
FORMAT_INFO(k_32_32_32_FLOAT, kUncompressed, 1, 1, 96),
FORMAT_INFO(k_DXT3A, kCompressed, 4, 4, 4),
FORMAT_INFO(k_DXT5A, kCompressed, 4, 4, 4),
FORMAT_INFO(k_CTX1, kCompressed, 4, 4, 4),
FORMAT_INFO(k_DXT3A_AS_1_1_1_1, kCompressed, 4, 4, 4),
FORMAT_INFO(k_8_8_8_8_GAMMA_EDRAM, kUncompressed, 1, 1, 32),
FORMAT_INFO(k_2_10_10_10_FLOAT_EDRAM, kUncompressed, 1, 1, 32),
@@ -199,9 +199,8 @@ bool GetPackedMipOffset(uint32_t width, uint32_t height, uint32_t depth,
     }
   }

-  const FormatInfo* format_info = FormatInfo::Get(format);
-  x_blocks /= format_info->block_width;
-  y_blocks /= format_info->block_height;
+  x_blocks >>= FormatInfo::GetWidthShift(format);
+  y_blocks >>= FormatInfo::GetHeightShift(format);
   return true;
 }
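Replacing the block-size divisions with shifts throughout this file is sound because block_width and block_height in the format table are always powers of two (1, 2, or 4). For a DXT format with block_width 4, GetShift(4) precomputes 2, so `x_blocks / 4` becomes the cheaper `x_blocks >> 2`; for the common 1x1-block formats the shift is simply 0.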
@@ -273,9 +272,10 @@ TextureGuestLayout GetGuestTextureLayout(
   }
   layout.mips_total_extent_bytes = 0;

-  const FormatInfo* format_info = FormatInfo::Get(format);
-  uint32_t bytes_per_block = format_info->bytes_per_block();
-
+  const FormatInfo* const format_info = FormatInfo::Get(format);
+  const uint32_t bytes_per_block = format_info->bytes_per_block();
+  const unsigned char block_width_sh = FormatInfo::GetWidthShift(format);
+  const unsigned char block_height_sh = FormatInfo::GetHeightShift(format);
   // The loop counter can mean two things depending on whether the packed mip
   // tail is stored as mip 0, because in this case, it would be ambiguous since
   // both the base and the mips would be on "level 0", but stored separately and
@@ -320,10 +320,13 @@ TextureGuestLayout GetGuestTextureLayout(
       z_slice_stride_texel_rows_unaligned =
           std::max(xe::next_pow2(height_texels) >> level, uint32_t(1));
     }
-    uint32_t row_pitch_blocks_tile_aligned = xe::align(
-        xe::align(row_pitch_texels_unaligned, format_info->block_width) /
-            format_info->block_width,
-        xenos::kTextureTileWidthHeight);
+    // maybe do 1 << block_width_sh instead of format_info->block_width, since
+    // we'll have cl loaded with the shift anyway
+    uint32_t row_pitch_blocks_tile_aligned =
+        xe::align(xe::align<uint32_t>(row_pitch_texels_unaligned,
+                                      format_info->block_width) >>
+                      block_width_sh,
+                  xenos::kTextureTileWidthHeight);
     level_layout.row_pitch_bytes =
         row_pitch_blocks_tile_aligned * bytes_per_block;
     // Assuming the provided pitch is already 256-byte-aligned for linear, but
@@ -335,10 +338,11 @@ TextureGuestLayout GetGuestTextureLayout(
     }
     level_layout.z_slice_stride_block_rows =
         dimension != xenos::DataDimension::k1D
-            ? xe::align(xe::align(z_slice_stride_texel_rows_unaligned,
-                                  format_info->block_height) /
-                            format_info->block_height,
-                        xenos::kTextureTileWidthHeight)
+            ? xe::align<uint32_t>(
+                  xe::align<uint32_t>(z_slice_stride_texel_rows_unaligned,
+                                      format_info->block_height) >>
+                      block_height_sh,
+                  xenos::kTextureTileWidthHeight)
             : 1;
     level_layout.array_slice_stride_bytes =
         level_layout.row_pitch_bytes * level_layout.z_slice_stride_block_rows;
@@ -358,13 +362,13 @@ TextureGuestLayout GetGuestTextureLayout(
     // the stride. For tiled textures, this is the dimensions aligned to 32x32x4
     // blocks (or x1 for the missing dimensions).
     uint32_t level_width_blocks =
-        xe::align(std::max(width_texels >> level, uint32_t(1)),
-                  format_info->block_width) /
-        format_info->block_width;
+        xe::align<uint32_t>(std::max(width_texels >> level, uint32_t(1)),
+                            format_info->block_width) >>
+        block_width_sh;
     uint32_t level_height_blocks =
-        xe::align(std::max(height_texels >> level, uint32_t(1)),
-                  format_info->block_height) /
-        format_info->block_height;
+        xe::align<uint32_t>(std::max(height_texels >> level, uint32_t(1)),
+                            format_info->block_height) >>
+        block_height_sh;
     uint32_t level_depth = std::max(depth >> level, uint32_t(1));
     if (is_tiled) {
       level_layout.x_extent_blocks =
@@ -415,20 +419,20 @@ TextureGuestLayout GetGuestTextureLayout(
         GetPackedMipOffset(width_texels, height_texels, depth, format,
                            packed_sublevel, packed_sublevel_x_blocks,
                            packed_sublevel_y_blocks, packed_sublevel_z);
-        level_layout.x_extent_blocks = std::max(
+        level_layout.x_extent_blocks = std::max<uint32_t>(
             level_layout.x_extent_blocks,
             packed_sublevel_x_blocks +
-                xe::align(
-                    std::max(width_texels >> packed_sublevel, uint32_t(1)),
-                    format_info->block_width) /
-                    format_info->block_width);
-        level_layout.y_extent_blocks = std::max(
+                (xe::align<uint32_t>(
+                     std::max(width_texels >> packed_sublevel, uint32_t(1)),
+                     format_info->block_width) >>
+                 block_width_sh));
+        level_layout.y_extent_blocks = std::max<uint32_t>(
             level_layout.y_extent_blocks,
             packed_sublevel_y_blocks +
-                xe::align(
-                    std::max(height_texels >> packed_sublevel, uint32_t(1)),
-                    format_info->block_height) /
-                    format_info->block_height);
+                (xe::align<uint32_t>(
+                     std::max(height_texels >> packed_sublevel, uint32_t(1)),
+                     format_info->block_height) >>
+                 block_height_sh));
         level_layout.z_extent =
             std::max(level_layout.z_extent,
                      packed_sublevel_z +
@@ -743,7 +743,7 @@ void TraceViewer::DrawTextureInfo(
   ImGui::NextColumn();
   ImGui::Text("Fetch Slot: %u", texture_binding.fetch_constant);
   ImGui::Text("Guest Address: %.8X", texture_info.memory.base_address);
-  ImGui::Text("Format: %s", texture_info.format_info()->name);
+  ImGui::Text("Format: %s", texture_info.format_name());
   switch (texture_info.dimension) {
     case xenos::DataDimension::k1D:
       ImGui::Text("1D: %dpx", texture_info.width + 1);
@@ -32,10 +32,11 @@
 #include "xenia/gpu/vulkan/vulkan_shader.h"
 #include "xenia/gpu/vulkan/vulkan_shared_memory.h"
 #include "xenia/gpu/xenos.h"
 #include "xenia/kernel/kernel_state.h"
 #include "xenia/kernel/user_module.h"
 #include "xenia/ui/vulkan/vulkan_presenter.h"
 #include "xenia/ui/vulkan/vulkan_provider.h"
 #include "xenia/ui/vulkan/vulkan_util.h"

 namespace xe {
 namespace gpu {
 namespace vulkan {
|
@ -4171,6 +4172,8 @@ uint32_t VulkanCommandProcessor::WriteTransientTextureBindings(
|
|||
return descriptor_set_write_count;
|
||||
}
|
||||
|
||||
#define COMMAND_PROCESSOR VulkanCommandProcessor
|
||||
#include "../pm4_command_processor_implement.h"
|
||||
} // namespace vulkan
|
||||
} // namespace gpu
|
||||
} // namespace xe
|
||||
|
|
|
@@ -53,6 +53,7 @@ class VulkanCommandProcessor final : public CommandProcessor {
     kStorageBufferCompute,
     kCount,
   };
+#include "../pm4_command_processor_declare.h"

   class ScratchBufferAcquisition {
    public:
@@ -2020,7 +2020,7 @@ bool VulkanTextureCache::Initialize() {
     // Log which formats are not supported or supported via fallbacks.
     const HostFormatPair& best_host_format = kBestHostFormats[i];
     const char* guest_format_name =
-        FormatInfo::Get(xenos::TextureFormat(i))->name;
-    if (best_host_format.format_unsigned.format != VK_FORMAT_UNDEFINED) {
+        FormatInfo::GetName(xenos::TextureFormat(i));
+    assert_not_null(guest_format_name);
+    if (host_format.format_unsigned.format != VK_FORMAT_UNDEFINED) {
@@ -1045,8 +1045,9 @@ inline uint16_t GpuSwap(uint16_t value, Endian endianness) {
       return value;
   }
 }

-inline uint32_t GpuSwap(uint32_t value, Endian endianness) {
+XE_NOINLINE
+XE_NOALIAS
+static uint32_t GpuSwap(uint32_t value, Endian endianness) {
   switch (endianness) {
     default:
     case Endian::kNone:
@@ -511,7 +511,8 @@ template <size_t I = 0, typename... Ps>
 StringBuffer* thread_local_string_buffer();

 template <typename Tuple>
-void PrintKernelCall(cpu::Export* export_entry, const Tuple& params) {
+XE_NOALIAS void PrintKernelCall(cpu::Export* export_entry,
+                                const Tuple& params) {
   auto& string_buffer = *thread_local_string_buffer();
   string_buffer.Reset();
   string_buffer.Append(export_entry->name);
@@ -526,58 +527,89 @@ void PrintKernelCall(cpu::Export* export_entry, const Tuple& params) {
                       string_buffer.to_string_view());
   }
 }
+/*
+  todo: need faster string formatting/concatenation (all arguments are
+  always turned into strings except if kHighFrequency)
+*/
 template <typename F, typename Tuple, std::size_t... I>
-auto KernelTrampoline(F&& f, Tuple&& t, std::index_sequence<I...>) {
+XE_FORCEINLINE static auto KernelTrampoline(F&& f, Tuple&& t,
+                                            std::index_sequence<I...>) {
   return std::forward<F>(f)(std::get<I>(std::forward<Tuple>(t))...);
 }

 template <KernelModuleId MODULE, uint16_t ORDINAL, typename R, typename... Ps>
-xe::cpu::Export* RegisterExport(R (*fn)(Ps&...), const char* name,
-                                xe::cpu::ExportTag::type tags) {
-  static_assert(
-      std::is_void<R>::value || std::is_base_of<shim::Result, R>::value,
-      "R must be void or derive from shim::Result");
-  static_assert((std::is_base_of_v<shim::Param, Ps> && ...),
-                "Ps must derive from shim::Param");
-  static const auto export_entry = new cpu::Export(
-      ORDINAL, xe::cpu::Export::Type::kFunction, name,
-      tags | xe::cpu::ExportTag::kImplemented | xe::cpu::ExportTag::kLog);
-  static R (*FN)(Ps & ...) = fn;
-  struct X {
-    static void Trampoline(PPCContext* ppc_context) {
-      ++export_entry->function_data.call_count;
-      Param::Init init = {
-          ppc_context,
-          0,
-      };
-      // Using braces initializer instead of make_tuple because braces
-      // enforce execution order across compilers.
-      // The make_tuple order is undefined per the C++ standard and
-      // cause inconsitencies between msvc and clang.
-      std::tuple<Ps...> params = {Ps(init)...};
-      if (export_entry->tags & xe::cpu::ExportTag::kLog &&
-          (!(export_entry->tags & xe::cpu::ExportTag::kHighFrequency) ||
-           cvars::log_high_frequency_kernel_calls)) {
-        PrintKernelCall(export_entry, params);
-      }
-      if constexpr (std::is_void<R>::value) {
-        KernelTrampoline(FN, std::forward<std::tuple<Ps...>>(params),
-                         std::make_index_sequence<sizeof...(Ps)>());
-      } else {
-        auto result =
-            KernelTrampoline(FN, std::forward<std::tuple<Ps...>>(params),
-                             std::make_index_sequence<sizeof...(Ps)>());
-        result.Store(ppc_context);
-        if (export_entry->tags &
-            (xe::cpu::ExportTag::kLog | xe::cpu::ExportTag::kLogResult)) {
-          // TODO(benvanik): log result.
-        }
-      }
-    }
-  };
-  export_entry->function_data.trampoline = &X::Trampoline;
-  return export_entry;
-}
+struct ExportRegistrerHelper {
+  template <R (*fn)(Ps&...), xe::cpu::ExportTag::type tags>
+  static xe::cpu::Export* RegisterExport(const char* name) {
+    static_assert(
+        std::is_void<R>::value || std::is_base_of<shim::Result, R>::value,
+        "R must be void or derive from shim::Result");
+    static_assert((std::is_base_of_v<shim::Param, Ps> && ...),
+                  "Ps must derive from shim::Param");
+    constexpr auto TAGS =
+        tags | xe::cpu::ExportTag::kImplemented | xe::cpu::ExportTag::kLog;
+
+    static const auto export_entry =
+        new cpu::Export(ORDINAL, xe::cpu::Export::Type::kFunction, name, TAGS);
+    struct X {
+      static void Trampoline(PPCContext* ppc_context) {
+        ++export_entry->function_data.call_count;
+        Param::Init init = {
+            ppc_context,
+            0,
+        };
+        // Using braces initializer instead of make_tuple because braces
+        // enforce execution order across compilers.
+        // The make_tuple order is undefined per the C++ standard and
+        // causes inconsistencies between msvc and clang.
+        std::tuple<Ps...> params = {Ps(init)...};
+        if (TAGS & xe::cpu::ExportTag::kLog &&
+            (!(TAGS & xe::cpu::ExportTag::kHighFrequency) ||
+             cvars::log_high_frequency_kernel_calls)) {
+          PrintKernelCall(export_entry, params);
+        }
+        if constexpr (std::is_void<R>::value) {
+          KernelTrampoline(fn, std::forward<std::tuple<Ps...>>(params),
+                           std::make_index_sequence<sizeof...(Ps)>());
+        } else {
+          auto result =
+              KernelTrampoline(fn, std::forward<std::tuple<Ps...>>(params),
+                               std::make_index_sequence<sizeof...(Ps)>());
+          result.Store(ppc_context);
+          if (TAGS &
+              (xe::cpu::ExportTag::kLog | xe::cpu::ExportTag::kLogResult)) {
+            // TODO(benvanik): log result.
+          }
+        }
+      }
+    };
+    struct Y {
+      static void Trampoline(PPCContext* ppc_context) {
+        Param::Init init = {
+            ppc_context,
+            0,
+        };
+        std::tuple<Ps...> params = {Ps(init)...};
+        if constexpr (std::is_void<R>::value) {
+          KernelTrampoline(fn, std::forward<std::tuple<Ps...>>(params),
+                           std::make_index_sequence<sizeof...(Ps)>());
+        } else {
+          auto result =
+              KernelTrampoline(fn, std::forward<std::tuple<Ps...>>(params),
+                               std::make_index_sequence<sizeof...(Ps)>());
+          result.Store(ppc_context);
+        }
+      }
+    };
+    export_entry->function_data.trampoline = &X::Trampoline;
+    return export_entry;
+  }
+};
+template <KernelModuleId MODULE, uint16_t ORDINAL, typename R, typename... Ps>
+auto GetRegister(R (*fngetter)(Ps&...)) {
+  return static_cast<ExportRegistrerHelper<MODULE, ORDINAL, R, Ps...>*>(
+      nullptr);
+}

 }  // namespace shim
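The payoff of hoisting `tags` from a runtime argument into a non-type template parameter is that TAGS becomes a constant expression inside the trampoline, so the `TAGS & kLog` and `TAGS & kHighFrequency` tests fold away at compile time instead of being reloaded from the Export object on every kernel call. A reduced, self-contained sketch of the same technique (names here are illustrative, not from the commit):

#include <cstdint>
#include <cstdio>

using tag_t = uint32_t;
constexpr tag_t kLogTag = 1u << 0;

// Runtime flag: the test must execute on every call.
void TrampolineRuntime(tag_t tags) {
  if (tags & kLogTag) {
    std::puts("logged");
  }
}

// Template parameter: `if constexpr` discards the branch (and the puts call)
// entirely when the tag is absent, so the instantiation contains no test.
template <tag_t Tags>
void TrampolineStatic() {
  if constexpr (Tags & kLogTag) {
    std::puts("logged");
  }
}
// usage: TrampolineStatic<kLogTag>();  TrampolineStatic<0>();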
@@ -585,13 +617,17 @@ xe::cpu::Export* RegisterExport(R (*fn)(Ps&...), const char* name,
 using xe::cpu::ExportTag;

 #define DECLARE_EXPORT(module_name, name, category, tags)                   \
+  using _register_##module_name##_##name =                                  \
+      std::remove_cv_t<std::remove_reference_t<                             \
+          decltype(*xe::kernel::shim::GetRegister<                          \
+                   xe::kernel::shim::KernelModuleId::module_name,           \
+                   ordinals::name>(&name##_entry))>>;                       \
   const auto EXPORT_##module_name##_##name = RegisterExport_##module_name(  \
-      xe::kernel::shim::RegisterExport<                                     \
-          xe::kernel::shim::KernelModuleId::module_name, ordinals::name>(   \
-          &name##_entry, #name,                                             \
-          tags | (static_cast<xe::cpu::ExportTag::type>(                    \
-                      xe::cpu::ExportCategory::category)                    \
-                  << xe::cpu::ExportTag::CategoryShift)));
+      _register_##module_name##_##name ::RegisterExport<                    \
+          &name##_entry, tags | (static_cast<xe::cpu::ExportTag::type>(     \
+                                     xe::cpu::ExportCategory::category)     \
+                                 << xe::cpu::ExportTag::CategoryShift)>(    \
+          #name));

 #define DECLARE_EMPTY_REGISTER_EXPORTS(module_name, group_name)             \
   void xe::kernel::module_name::Register##group_name##Exports(              \
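For a hypothetical export NtExample in module xboxkrnl, the rewritten macro first recovers the helper type via decltype over GetRegister, then instantiates RegisterExport with the function pointer and tags baked in as template arguments, leaving only the name string as a runtime value. A sketch of the expansion, with NtExample and its ordinal entirely hypothetical:

// DECLARE_EXPORT(xboxkrnl, NtExample, Memory, kImplemented) expands roughly to:
using _register_xboxkrnl_NtExample = std::remove_cv_t<std::remove_reference_t<
    decltype(*xe::kernel::shim::GetRegister<
             xe::kernel::shim::KernelModuleId::xboxkrnl,
             ordinals::NtExample>(&NtExample_entry))>>;
const auto EXPORT_xboxkrnl_NtExample = RegisterExport_xboxkrnl(
    _register_xboxkrnl_NtExample::RegisterExport<
        &NtExample_entry,
        kImplemented | (static_cast<xe::cpu::ExportTag::type>(
                            xe::cpu::ExportCategory::Memory)
                        << xe::cpu::ExportTag::CategoryShift)>("NtExample"));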
@@ -316,8 +316,46 @@ void Memory::Reset() {
   heaps_.v90000000.Reset();
   heaps_.physical.Reset();
 }
+
+XE_NOALIAS
 const BaseHeap* Memory::LookupHeap(uint32_t address) const {
+#if 1
+#define HEAP_INDEX(name) \
+  offsetof(Memory, heaps_.name) - offsetof(Memory, heaps_)
+
+  const char* heap_select = (const char*)&this->heaps_;
+
+  unsigned selected_heap_offset = 0;
+  unsigned high_nibble = address >> 28;
+
+  if (high_nibble < 0x4) {
+    selected_heap_offset = HEAP_INDEX(v00000000);
+  } else if (address < 0x7F000000) {
+    selected_heap_offset = HEAP_INDEX(v40000000);
+  } else if (high_nibble < 0x8) {
+    heap_select = nullptr;
+    // return nullptr;
+  } else if (high_nibble < 0x9) {
+    selected_heap_offset = HEAP_INDEX(v80000000);
+    // return &heaps_.v80000000;
+  } else if (high_nibble < 0xA) {
+    // return &heaps_.v90000000;
+    selected_heap_offset = HEAP_INDEX(v90000000);
+  } else if (high_nibble < 0xC) {
+    // return &heaps_.vA0000000;
+    selected_heap_offset = HEAP_INDEX(vA0000000);
+  } else if (high_nibble < 0xE) {
+    // return &heaps_.vC0000000;
+    selected_heap_offset = HEAP_INDEX(vC0000000);
+  } else if (address < 0xFFD00000) {
+    // return &heaps_.vE0000000;
+    selected_heap_offset = HEAP_INDEX(vE0000000);
+  } else {
+    // return nullptr;
+    heap_select = nullptr;
+  }
+  return reinterpret_cast<const BaseHeap*>(selected_heap_offset + heap_select);
+
+#else
   if (address < 0x40000000) {
     return &heaps_.v00000000;
   } else if (address < 0x7F000000) {

@@ -337,6 +375,7 @@ const BaseHeap* Memory::LookupHeap(uint32_t address) const {
   } else {
     return nullptr;
   }
+#endif
 }

 BaseHeap* Memory::LookupHeapByType(bool physical, uint32_t page_size) {
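The rewritten LookupHeap replaces eight early returns with a single computed `base + offset`, a shape compilers can lower to conditional moves rather than a chain of branches; the nullptr cases work because selected_heap_offset stays 0 whenever heap_select is nulled, and adding 0 to a null pointer is well-defined. The same shape in miniature, with an invented three-member aggregate:

#include <cstddef>

// Branch-light member selection: compute a byte offset into the aggregate,
// then add it to a base pointer that may have been nulled.
struct Heaps {
  int a, b, c;
};
const int* Select(const Heaps* heaps, unsigned key) {
  const char* base = reinterpret_cast<const char*>(heaps);
  size_t off = 0;
  if (key < 4) {
    off = offsetof(Heaps, a);
  } else if (key < 8) {
    off = offsetof(Heaps, b);
  } else if (key < 12) {
    off = offsetof(Heaps, c);
  } else {
    base = nullptr;  // off stays 0, so base + off is still null
  }
  return reinterpret_cast<const int*>(base + off);
}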
@@ -465,8 +504,8 @@ cpu::MMIORange* Memory::LookupVirtualMappedRange(uint32_t virtual_address) {
 }

 bool Memory::AccessViolationCallback(
-    global_unique_lock_type global_lock_locked_once,
-    void* host_address, bool is_write) {
+    global_unique_lock_type global_lock_locked_once, void* host_address,
+    bool is_write) {
   // Access via physical_membase_ is special, when need to bypass everything
   // (for instance, for a data provider to actually write the data) so only
   // triggering callbacks on virtual memory regions.
@@ -493,16 +532,15 @@ bool Memory::AccessViolationCallback(
 }

 bool Memory::AccessViolationCallbackThunk(
-    global_unique_lock_type global_lock_locked_once,
-    void* context, void* host_address, bool is_write) {
+    global_unique_lock_type global_lock_locked_once, void* context,
+    void* host_address, bool is_write) {
   return reinterpret_cast<Memory*>(context)->AccessViolationCallback(
       std::move(global_lock_locked_once), host_address, is_write);
 }

 bool Memory::TriggerPhysicalMemoryCallbacks(
-    global_unique_lock_type global_lock_locked_once,
-    uint32_t virtual_address, uint32_t length, bool is_write,
-    bool unwatch_exact_range, bool unprotect) {
+    global_unique_lock_type global_lock_locked_once, uint32_t virtual_address,
+    uint32_t length, bool is_write, bool unwatch_exact_range, bool unprotect) {
   BaseHeap* heap = LookupHeap(virtual_address);
   if (heap->heap_type() == HeapType::kGuestPhysical) {
     auto physical_heap = static_cast<PhysicalHeap*>(heap);
@@ -1711,9 +1749,8 @@ void PhysicalHeap::EnableAccessCallbacks(uint32_t physical_address,
 }

 bool PhysicalHeap::TriggerCallbacks(
-    global_unique_lock_type global_lock_locked_once,
-    uint32_t virtual_address, uint32_t length, bool is_write,
-    bool unwatch_exact_range, bool unprotect) {
+    global_unique_lock_type global_lock_locked_once, uint32_t virtual_address,
+    uint32_t length, bool is_write, bool unwatch_exact_range, bool unprotect) {
   // TODO(Triang3l): Support read watches.
   assert_true(is_write);
   if (!is_write) {
@@ -473,8 +473,9 @@ class Memory {
   void SystemHeapFree(uint32_t address);

   // Gets the heap for the address space containing the given address.
+  XE_NOALIAS
   const BaseHeap* LookupHeap(uint32_t address) const;

+  XE_NOALIAS
   inline BaseHeap* LookupHeap(uint32_t address) {
     return const_cast<BaseHeap*>(
         const_cast<const Memory*>(this)->LookupHeap(address));
@@ -17,7 +17,7 @@
 #include "xenia/base/math.h"
 #include "xenia/ui/d3d12/d3d12_immediate_drawer.h"
 #include "xenia/ui/d3d12/d3d12_presenter.h"

 #include "xenia/ui/d3d12/d3d12_util.h"
 DEFINE_bool(d3d12_debug, false, "Enable Direct3D 12 and DXGI debug layer.",
             "D3D12");
 DEFINE_bool(d3d12_break_on_error, false,
@@ -35,6 +35,8 @@ DEFINE_int32(
     "system responsibility)",
     "D3D12");

+DEFINE_bool(d3d12_nvapi_use_driver_heap_priorities, false, "nvidia stuff",
+            "D3D12");
 namespace xe {
 namespace ui {
 namespace d3d12 {
@@ -61,6 +63,7 @@ std::unique_ptr<D3D12Provider> D3D12Provider::Create() {
         "supported GPUs.");
     return nullptr;
   }
+
   return provider;
 }
@@ -476,10 +479,69 @@ bool D3D12Provider::Initialize() {
   // Get the graphics analysis interface, will silently fail if PIX is not
   // attached.
   pfn_dxgi_get_debug_interface1_(0, IID_PPV_ARGS(&graphics_analysis_));
+  if (GetAdapterVendorID() == ui::GraphicsProvider::GpuVendorID::kNvidia) {
+    nvapi_ = new lightweight_nvapi::nvapi_state_t();
+    if (!nvapi_->is_available()) {
+      delete nvapi_;
+      nvapi_ = nullptr;
+    } else {
+      using namespace lightweight_nvapi;
+
+      nvapi_createcommittedresource_ =
+          (cb_NvAPI_D3D12_CreateCommittedResource)nvapi_->query_interface<void>(
+              id_NvAPI_D3D12_CreateCommittedResource);
+      nvapi_querycpuvisiblevidmem_ =
+          (cb_NvAPI_D3D12_QueryCpuVisibleVidmem)nvapi_->query_interface<void>(
+              id_NvAPI_D3D12_QueryCpuVisibleVidmem);
+      nvapi_usedriverheappriorities_ =
+          (cb_NvAPI_D3D12_UseDriverHeapPriorities)nvapi_->query_interface<void>(
+              id_NvAPI_D3D12_UseDriverHeapPriorities);
+
+      if (nvapi_usedriverheappriorities_) {
+        if (cvars::d3d12_nvapi_use_driver_heap_priorities) {
+          if (nvapi_usedriverheappriorities_(device_) != 0) {
+            XELOGI("Failed to enable driver heap priorities");
+          }
+        }
+      }
+    }
+  }
   return true;
 }
+uint32_t D3D12Provider::CreateUploadResource(
+    D3D12_HEAP_FLAGS HeapFlags, _In_ const D3D12_RESOURCE_DESC* pDesc,
+    D3D12_RESOURCE_STATES InitialResourceState, REFIID riidResource,
+    void** ppvResource, bool try_create_cpuvisible,
+    const D3D12_CLEAR_VALUE* pOptimizedClearValue) const {
+  auto device = GetDevice();
+
+  if (try_create_cpuvisible && nvapi_createcommittedresource_) {
+    lightweight_nvapi::NV_RESOURCE_PARAMS nvrp;
+    nvrp.NVResourceFlags =
+        lightweight_nvapi::NV_D3D12_RESOURCE_FLAG_CPUVISIBLE_VIDMEM;
+    nvrp.version = 0;  // nothing checks the version
+
+    if (nvapi_createcommittedresource_(
+            device, &ui::d3d12::util::kHeapPropertiesUpload, HeapFlags, pDesc,
+            InitialResourceState, pOptimizedClearValue, &nvrp, riidResource,
+            ppvResource, nullptr) != 0) {
+      XELOGI(
+          "Failed to create CPUVISIBLE_VIDMEM upload resource, will just do "
+          "normal CreateCommittedResource");
+    } else {
+      return UPLOAD_RESULT_CREATE_CPUVISIBLE;
+    }
+  }
+  if (FAILED(device->CreateCommittedResource(
+          &ui::d3d12::util::kHeapPropertiesUpload, HeapFlags, pDesc,
+          InitialResourceState, pOptimizedClearValue, riidResource,
+          ppvResource))) {
+    XELOGE("Failed to create the gamma ramp upload buffer");
+    return UPLOAD_RESULT_CREATE_FAILED;
+  }
+
+  return UPLOAD_RESULT_CREATE_SUCCESS;
+}
 std::unique_ptr<Presenter> D3D12Provider::CreatePresenter(
     Presenter::HostGpuLossCallback host_gpu_loss_callback) {
   return D3D12Presenter::Create(host_gpu_loss_callback, *this);
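Call sites can treat CreateUploadResource's return value as a tri-state: 0 means failure, 1 a plain upload-heap resource, 2 a CPU-visible VRAM resource via NvAPI. A hedged usage sketch; the surrounding variables are assumed, not from the commit:

// Hypothetical call site: opt in to CPU-visible VRAM and remember which kind
// of memory was actually obtained.
Microsoft::WRL::ComPtr<ID3D12Resource> buffer;
uint32_t result = provider.CreateUploadResource(
    D3D12_HEAP_FLAG_CREATE_NOT_ZEROED, &buffer_desc,
    D3D12_RESOURCE_STATE_GENERIC_READ, IID_PPV_ARGS(&buffer),
    /*try_create_cpuvisible=*/true);
if (result == UPLOAD_RESULT_CREATE_FAILED) {
  return false;
}
bool in_vidmem = (result == UPLOAD_RESULT_CREATE_CPUVISIBLE);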
@@ -14,13 +14,21 @@

 #include "xenia/ui/d3d12/d3d12_api.h"
 #include "xenia/ui/graphics_provider.h"
+
+// chrispy: this is here to prevent clang format from moving d3d12_nvapi above
+// the headers it depends on
+#define HEADERFENCE
+#undef HEADERFENCE
+#include "xenia/gpu/d3d12/d3d12_nvapi.hpp"
 #define XE_UI_D3D12_FINE_GRAINED_DRAW_SCOPES 1

 namespace xe {
 namespace ui {
 namespace d3d12 {

+enum {
+  UPLOAD_RESULT_CREATE_FAILED = 0,
+  UPLOAD_RESULT_CREATE_SUCCESS = 1,
+  UPLOAD_RESULT_CREATE_CPUVISIBLE = 2
+};
 class D3D12Provider : public GraphicsProvider {
  public:
   ~D3D12Provider();
@@ -34,6 +42,11 @@ class D3D12Provider : public GraphicsProvider {
       Presenter::FatalErrorHostGpuLossCallback) override;

   std::unique_ptr<ImmediateDrawer> CreateImmediateDrawer() override;
+  uint32_t CreateUploadResource(
+      D3D12_HEAP_FLAGS HeapFlags, _In_ const D3D12_RESOURCE_DESC* pDesc,
+      D3D12_RESOURCE_STATES InitialResourceState, REFIID riidResource,
+      void** ppvResource, bool try_create_cpuvisible = false,
+      const D3D12_CLEAR_VALUE* pOptimizedClearValue = nullptr) const;

   IDXGIFactory2* GetDXGIFactory() const { return dxgi_factory_; }
   // nullptr if PIX not attached.
@@ -193,6 +206,14 @@ class D3D12Provider : public GraphicsProvider {
   bool ps_specified_stencil_reference_supported_;
   bool rasterizer_ordered_views_supported_;
   bool unaligned_block_textures_supported_;
+
+  lightweight_nvapi::nvapi_state_t* nvapi_;
+  lightweight_nvapi::cb_NvAPI_D3D12_CreateCommittedResource
+      nvapi_createcommittedresource_ = nullptr;
+  lightweight_nvapi::cb_NvAPI_D3D12_UseDriverHeapPriorities
+      nvapi_usedriverheappriorities_ = nullptr;
+  lightweight_nvapi::cb_NvAPI_D3D12_QueryCpuVisibleVidmem
+      nvapi_querycpuvisiblevidmem_ = nullptr;
 };

 }  // namespace d3d12
|
@ -81,10 +81,10 @@ D3D12UploadBufferPool::CreatePageImplementation() {
|
|||
util::FillBufferResourceDesc(buffer_desc, page_size_,
|
||||
D3D12_RESOURCE_FLAG_NONE);
|
||||
Microsoft::WRL::ComPtr<ID3D12Resource> buffer;
|
||||
if (FAILED(provider_.GetDevice()->CreateCommittedResource(
|
||||
&util::kHeapPropertiesUpload, provider_.GetHeapFlagCreateNotZeroed(),
|
||||
&buffer_desc, D3D12_RESOURCE_STATE_GENERIC_READ, nullptr,
|
||||
IID_PPV_ARGS(&buffer)))) {
|
||||
|
||||
if (!provider_.CreateUploadResource(
|
||||
provider_.GetHeapFlagCreateNotZeroed(), &buffer_desc,
|
||||
D3D12_RESOURCE_STATE_GENERIC_READ, IID_PPV_ARGS(&buffer))) {
|
||||
XELOGE("Failed to create a D3D upload buffer with {} bytes", page_size_);
|
||||
return nullptr;
|
||||
}
|
||||
|
|