Merge pull request #74 from chrisps/canary_experimental
Misc optimizations
This commit is contained in:
commit
b4224ff3dc
|
@ -379,6 +379,9 @@ std::vector<std::unique_ptr<hid::InputDriver>> EmulatorApp::CreateInputDrivers(
|
||||||
}
|
}
|
||||||
|
|
||||||
bool EmulatorApp::OnInitialize() {
|
bool EmulatorApp::OnInitialize() {
|
||||||
|
#if XE_ARCH_AMD64 == 1
|
||||||
|
amd64::InitFeatureFlags();
|
||||||
|
#endif
|
||||||
Profiler::Initialize();
|
Profiler::Initialize();
|
||||||
Profiler::ThreadEnter("Main");
|
Profiler::ThreadEnter("Main");
|
||||||
|
|
||||||
|
|
|
@ -51,7 +51,7 @@ uint64_t last_guest_tick_count_ = 0;
|
||||||
uint64_t last_host_tick_count_ = Clock::QueryHostTickCount();
|
uint64_t last_host_tick_count_ = Clock::QueryHostTickCount();
|
||||||
|
|
||||||
|
|
||||||
using tick_mutex_type = xe_unlikely_mutex;
|
using tick_mutex_type = std::mutex;
|
||||||
|
|
||||||
// Mutex to ensure last_host_tick_count_ and last_guest_tick_count_ are in sync
|
// Mutex to ensure last_host_tick_count_ and last_guest_tick_count_ are in sync
|
||||||
// std::mutex tick_mutex_;
|
// std::mutex tick_mutex_;
|
||||||
|
|
|
@ -1,7 +1,15 @@
|
||||||
#include "dma.h"
|
#include "dma.h"
|
||||||
#include "logging.h"
|
#include "logging.h"
|
||||||
|
#include "mutex.h"
|
||||||
|
#include "platform_win.h"
|
||||||
#include "xbyak/xbyak/xbyak_util.h"
|
#include "xbyak/xbyak/xbyak_util.h"
|
||||||
|
|
||||||
|
XE_NTDLL_IMPORT(NtDelayExecution, cls_NtDelayExecution,
|
||||||
|
NtDelayExecutionPointer);
|
||||||
|
XE_NTDLL_IMPORT(NtAlertThread, cls_NtAlertThread, NtAlertThreadPointer);
|
||||||
|
XE_NTDLL_IMPORT(NtAlertThreadByThreadId, cls_NtAlertThreadByThreadId,
|
||||||
|
NtAlertThreadByThreadId);
|
||||||
|
|
||||||
template <size_t N, typename... Ts>
|
template <size_t N, typename... Ts>
|
||||||
static void xedmaloghelper(const char (&fmt)[N], Ts... args) {
|
static void xedmaloghelper(const char (&fmt)[N], Ts... args) {
|
||||||
char buffer[1024];
|
char buffer[1024];
|
||||||
|
@ -213,320 +221,140 @@ void vastcpy(uint8_t* XE_RESTRICT physaddr, uint8_t* XE_RESTRICT rdmapping,
|
||||||
written_length);
|
written_length);
|
||||||
}
|
}
|
||||||
|
|
||||||
#define XEDMA_NUM_WORKERS 4
|
#define MAX_INFLIGHT_DMAJOBS 65536
|
||||||
class alignas(256) XeDMACGeneric : public XeDMAC {
|
#define INFLICT_DMAJOB_MASK (MAX_INFLIGHT_DMAJOBS - 1)
|
||||||
|
class XeDMACGeneric : public XeDMAC {
|
||||||
|
std::unique_ptr<xe::threading::Thread> thrd_;
|
||||||
|
XeDMAJob* jobs_ring_;
|
||||||
|
volatile std::atomic<uintptr_t> write_ptr_;
|
||||||
|
|
||||||
struct alignas(XE_HOST_CACHE_LINE_SIZE) {
|
struct alignas(XE_HOST_CACHE_LINE_SIZE) {
|
||||||
std::atomic<uint64_t> free_job_slots_;
|
volatile std::atomic<uintptr_t> read_ptr_;
|
||||||
std::atomic<uint64_t> jobs_submitted_;
|
xe_mutex push_into_ring_lock_;
|
||||||
std::atomic<uint64_t> jobs_completed_;
|
|
||||||
std::atomic<uint32_t> num_workers_awoken_;
|
|
||||||
std::atomic<uint32_t> current_job_serial_;
|
|
||||||
|
|
||||||
} dma_volatile_;
|
|
||||||
|
|
||||||
alignas(XE_HOST_CACHE_LINE_SIZE) XeDMAJob jobs_[64];
|
|
||||||
|
|
||||||
volatile uint32_t jobserials_[64];
|
|
||||||
|
|
||||||
alignas(XE_HOST_CACHE_LINE_SIZE)
|
|
||||||
std::unique_ptr<threading::Event> job_done_signals_[64];
|
|
||||||
// really dont like using unique pointer for this...
|
|
||||||
std::unique_ptr<threading::Event> job_submitted_signal_;
|
|
||||||
std::unique_ptr<threading::Event> job_completed_signal_;
|
|
||||||
|
|
||||||
std::unique_ptr<threading::Thread> scheduler_thread_;
|
|
||||||
struct WorkSlice {
|
|
||||||
uint8_t* destination;
|
|
||||||
uint8_t* source;
|
|
||||||
size_t numbytes;
|
|
||||||
};
|
};
|
||||||
std::unique_ptr<threading::Thread> workers_[XEDMA_NUM_WORKERS];
|
HANDLE gotjob_event;
|
||||||
std::unique_ptr<threading::Event> worker_has_work_; //[XEDMA_NUM_WORKERS];
|
void WorkerWait();
|
||||||
std::unique_ptr<threading::Event> worker_has_finished_[XEDMA_NUM_WORKERS];
|
|
||||||
|
|
||||||
threading::WaitHandle* worker_has_finished_nosafeptr_[XEDMA_NUM_WORKERS];
|
|
||||||
WorkSlice worker_workslice_[XEDMA_NUM_WORKERS];
|
|
||||||
|
|
||||||
// chrispy: this is bad
|
|
||||||
static uint32_t find_free_hole_in_dword(uint64_t dw) {
|
|
||||||
XEDMALOG("Finding free hole in 0x%llX", dw);
|
|
||||||
|
|
||||||
for (uint32_t i = 0; i < 64; ++i) {
|
|
||||||
if (dw & (1ULL << i)) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
return i;
|
|
||||||
}
|
|
||||||
return ~0U;
|
|
||||||
}
|
|
||||||
|
|
||||||
uint32_t allocate_free_dma_slot() {
|
|
||||||
XEDMALOG("Allocating free slot");
|
|
||||||
uint32_t got_slot = 0;
|
|
||||||
uint64_t slots;
|
|
||||||
uint64_t allocated_slot;
|
|
||||||
|
|
||||||
do {
|
|
||||||
slots = dma_volatile_.free_job_slots_.load();
|
|
||||||
|
|
||||||
got_slot = find_free_hole_in_dword(slots);
|
|
||||||
if (!~got_slot) {
|
|
||||||
XEDMALOG("Didn't get a slot!");
|
|
||||||
return ~0U;
|
|
||||||
}
|
|
||||||
allocated_slot = slots | (1ULL << got_slot);
|
|
||||||
|
|
||||||
} while (XE_UNLIKELY(!dma_volatile_.free_job_slots_.compare_exchange_strong(
|
|
||||||
slots, allocated_slot)));
|
|
||||||
XEDMALOG("Allocated slot %d", got_slot);
|
|
||||||
return got_slot;
|
|
||||||
}
|
|
||||||
// chrispy: on x86 this can just be interlockedbittestandreset...
|
|
||||||
void free_dma_slot(uint32_t slot) {
|
|
||||||
XEDMALOG("Freeing slot %d", slot);
|
|
||||||
uint64_t slots;
|
|
||||||
|
|
||||||
uint64_t deallocated_slot;
|
|
||||||
|
|
||||||
do {
|
|
||||||
slots = dma_volatile_.free_job_slots_.load();
|
|
||||||
|
|
||||||
deallocated_slot = slots & (~(1ULL << slot));
|
|
||||||
|
|
||||||
} while (XE_UNLIKELY(!dma_volatile_.free_job_slots_.compare_exchange_strong(
|
|
||||||
slots, deallocated_slot)));
|
|
||||||
}
|
|
||||||
|
|
||||||
void DoDMAJob(uint32_t idx) {
|
|
||||||
XeDMAJob& job = jobs_[idx];
|
|
||||||
if (job.precall) {
|
|
||||||
job.precall(&job);
|
|
||||||
}
|
|
||||||
// memcpy(job.destination, job.source, job.size);
|
|
||||||
|
|
||||||
size_t job_size = job.size;
|
|
||||||
|
|
||||||
size_t job_num_lines = job_size / XE_HOST_CACHE_LINE_SIZE;
|
|
||||||
|
|
||||||
size_t line_rounded = job_num_lines * XE_HOST_CACHE_LINE_SIZE;
|
|
||||||
|
|
||||||
size_t rem = job_size - line_rounded;
|
|
||||||
|
|
||||||
size_t num_per_worker = line_rounded / XEDMA_NUM_WORKERS;
|
|
||||||
|
|
||||||
XEDMALOG(
|
|
||||||
"Distributing %d bytes from %p to %p across %d workers, remainder is "
|
|
||||||
"%d",
|
|
||||||
line_rounded, job.source, job.destination, XEDMA_NUM_WORKERS, rem);
|
|
||||||
if (num_per_worker < 2048) {
|
|
||||||
XEDMALOG("not distributing across workers, num_per_worker < 8192");
|
|
||||||
// not worth splitting up
|
|
||||||
memcpy(job.destination, job.source, job.size);
|
|
||||||
job.signal_on_done->Set();
|
|
||||||
} else {
|
|
||||||
for (uint32_t i = 0; i < XEDMA_NUM_WORKERS; ++i) {
|
|
||||||
worker_workslice_[i].destination =
|
|
||||||
(i * num_per_worker) + job.destination;
|
|
||||||
worker_workslice_[i].source = (i * num_per_worker) + job.source;
|
|
||||||
|
|
||||||
worker_workslice_[i].numbytes = num_per_worker;
|
|
||||||
}
|
|
||||||
if (rem) {
|
|
||||||
__movsb(job.destination + line_rounded, job.source + line_rounded, rem);
|
|
||||||
}
|
|
||||||
// wake them up
|
|
||||||
worker_has_work_->Set();
|
|
||||||
XEDMALOG("Starting waitall for job");
|
|
||||||
threading::WaitAll(worker_has_finished_nosafeptr_, XEDMA_NUM_WORKERS,
|
|
||||||
false);
|
|
||||||
|
|
||||||
XEDMALOG("Waitall for job completed!");
|
|
||||||
job.signal_on_done->Set();
|
|
||||||
}
|
|
||||||
if (job.postcall) {
|
|
||||||
job.postcall(&job);
|
|
||||||
}
|
|
||||||
++dma_volatile_.jobs_completed_;
|
|
||||||
}
|
|
||||||
|
|
||||||
void WorkerIter(uint32_t worker_index) {
|
|
||||||
xenia_assert(worker_index < XEDMA_NUM_WORKERS);
|
|
||||||
auto [dest, src, size] = worker_workslice_[worker_index];
|
|
||||||
|
|
||||||
// if (++dma_volatile_.num_workers_awoken_ == XEDMA_NUM_WORKERS ) {
|
|
||||||
worker_has_work_->Reset();
|
|
||||||
//}
|
|
||||||
xenia_assert(size < (1ULL << 32));
|
|
||||||
// memcpy(dest, src, size);
|
|
||||||
dma::vastcpy(dest, src, static_cast<uint32_t>(size));
|
|
||||||
}
|
|
||||||
XE_NOINLINE
|
|
||||||
void WorkerMainLoop(uint32_t worker_index) {
|
|
||||||
do {
|
|
||||||
XEDMALOG("Worker iter for worker %d", worker_index);
|
|
||||||
WorkerIter(worker_index);
|
|
||||||
|
|
||||||
XEDMALOG("Worker %d is done\n", worker_index);
|
|
||||||
threading::SignalAndWait(worker_has_finished_[worker_index].get(),
|
|
||||||
worker_has_work_.get(), false);
|
|
||||||
} while (true);
|
|
||||||
}
|
|
||||||
void WorkerMain(uint32_t worker_index) {
|
|
||||||
XEDMALOG("Entered worker main loop, index %d", worker_index);
|
|
||||||
threading::Wait(worker_has_work_.get(), false);
|
|
||||||
XEDMALOG("First wait for worker %d completed, first job ever",
|
|
||||||
worker_index);
|
|
||||||
WorkerMainLoop(worker_index);
|
|
||||||
}
|
|
||||||
|
|
||||||
static void WorkerMainForwarder(void* ptr) {
|
|
||||||
// we aligned XeDma to 256 bytes and encode extra info in the low 8
|
|
||||||
uintptr_t uptr = (uintptr_t)ptr;
|
|
||||||
|
|
||||||
uint32_t worker_index = (uint8_t)uptr;
|
|
||||||
|
|
||||||
uptr &= ~0xFFULL;
|
|
||||||
|
|
||||||
char name_buffer[64];
|
|
||||||
sprintf_s(name_buffer, "dma_worker_%d", worker_index);
|
|
||||||
|
|
||||||
xe::threading::set_name(name_buffer);
|
|
||||||
|
|
||||||
reinterpret_cast<XeDMACGeneric*>(uptr)->WorkerMain(worker_index);
|
|
||||||
}
|
|
||||||
|
|
||||||
void DMAMain() {
|
|
||||||
XEDMALOG("DmaMain");
|
|
||||||
do {
|
|
||||||
threading::Wait(job_submitted_signal_.get(), false);
|
|
||||||
|
|
||||||
auto slots = dma_volatile_.free_job_slots_.load();
|
|
||||||
|
|
||||||
for (uint32_t i = 0; i < 64; ++i) {
|
|
||||||
if (slots & (1ULL << i)) {
|
|
||||||
XEDMALOG("Got new job at index %d in DMAMain", i);
|
|
||||||
DoDMAJob(i);
|
|
||||||
|
|
||||||
free_dma_slot(i);
|
|
||||||
|
|
||||||
job_completed_signal_->Set();
|
|
||||||
// break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
} while (true);
|
|
||||||
}
|
|
||||||
|
|
||||||
static void DMAMainForwarder(void* ud) {
|
|
||||||
xe::threading::set_name("dma_main");
|
|
||||||
reinterpret_cast<XeDMACGeneric*>(ud)->DMAMain();
|
|
||||||
}
|
|
||||||
|
|
||||||
public:
|
public:
|
||||||
|
virtual ~XeDMACGeneric() {}
|
||||||
|
void WorkerThreadMain();
|
||||||
|
XeDMACGeneric() {
|
||||||
|
threading::Thread::CreationParameters crparams;
|
||||||
|
crparams.create_suspended = true;
|
||||||
|
crparams.initial_priority = threading::ThreadPriority::kNormal;
|
||||||
|
crparams.stack_size = 65536;
|
||||||
|
gotjob_event = CreateEventA(nullptr, false, false, nullptr);
|
||||||
|
thrd_ = std::move(threading::Thread::Create(
|
||||||
|
crparams, [this]() { this->WorkerThreadMain(); }));
|
||||||
|
|
||||||
|
jobs_ring_ = (XeDMAJob*)_aligned_malloc(
|
||||||
|
MAX_INFLIGHT_DMAJOBS * sizeof(XeDMAJob), XE_HOST_CACHE_LINE_SIZE);
|
||||||
|
|
||||||
|
write_ptr_ = 0;
|
||||||
|
read_ptr_ = 0;
|
||||||
|
|
||||||
|
thrd_->Resume();
|
||||||
|
}
|
||||||
|
|
||||||
virtual DMACJobHandle PushDMAJob(XeDMAJob* job) override {
|
virtual DMACJobHandle PushDMAJob(XeDMAJob* job) override {
|
||||||
XEDMALOG("New job, %p to %p with size %d", job->source, job->destination,
|
// std::unique_lock<xe_mutex> pushlock{push_into_ring_lock_};
|
||||||
job->size);
|
HANDLE dmacevent = CreateEventA(nullptr, true, false, nullptr);
|
||||||
uint32_t slot;
|
{
|
||||||
do {
|
job->dmac_specific_ = (uintptr_t)dmacevent;
|
||||||
slot = allocate_free_dma_slot();
|
|
||||||
if (!~slot) {
|
|
||||||
XEDMALOG(
|
|
||||||
"Didn't get a free slot, waiting for a job to complete before "
|
|
||||||
"resuming.");
|
|
||||||
threading::Wait(job_completed_signal_.get(), false);
|
|
||||||
|
|
||||||
} else {
|
jobs_ring_[write_ptr_ % MAX_INFLIGHT_DMAJOBS] = *job;
|
||||||
break;
|
write_ptr_++;
|
||||||
|
SetEvent(gotjob_event);
|
||||||
}
|
}
|
||||||
|
return (DMACJobHandle)dmacevent;
|
||||||
} while (true);
|
|
||||||
jobs_[slot] = *job;
|
|
||||||
|
|
||||||
jobs_[slot].signal_on_done = job_done_signals_[slot].get();
|
|
||||||
jobs_[slot].signal_on_done->Reset();
|
|
||||||
XEDMALOG("Setting job submit signal, pushed into slot %d", slot);
|
|
||||||
|
|
||||||
uint32_t new_serial = dma_volatile_.current_job_serial_++;
|
|
||||||
|
|
||||||
jobserials_[slot] = new_serial;
|
|
||||||
|
|
||||||
++dma_volatile_.jobs_submitted_;
|
|
||||||
job_submitted_signal_->Set();
|
|
||||||
return (static_cast<uint64_t>(new_serial) << 32) |
|
|
||||||
static_cast<uint64_t>(slot);
|
|
||||||
|
|
||||||
// return job_done_signals_[slot].get();
|
|
||||||
}
|
|
||||||
|
|
||||||
bool AllJobsDone() {
|
|
||||||
return dma_volatile_.jobs_completed_ == dma_volatile_.jobs_submitted_;
|
|
||||||
}
|
}
|
||||||
virtual void WaitJobDone(DMACJobHandle handle) override {
|
virtual void WaitJobDone(DMACJobHandle handle) override {
|
||||||
uint32_t serial = static_cast<uint32_t>(handle >> 32);
|
while (WaitForSingleObject((HANDLE)handle, 2) == WAIT_TIMEOUT) {
|
||||||
uint32_t jobid = static_cast<uint32_t>(handle);
|
// NtAlertThreadByThreadId.invoke<void>(thrd_->system_id());
|
||||||
do {
|
// while (SignalObjectAndWait(gotjob_event, (HANDLE)handle, 2, false) ==
|
||||||
if (jobserials_[jobid] != serial) {
|
// WAIT_TIMEOUT) {
|
||||||
return; // done, our slot was reused
|
// ;
|
||||||
}
|
}
|
||||||
|
//}
|
||||||
|
|
||||||
auto waitres = threading::Wait(job_done_signals_[jobid].get(), false,
|
// SignalObjectAndWait(gotjob_event, (HANDLE)handle, INFINITE, false);
|
||||||
std::chrono::milliseconds{1});
|
CloseHandle((HANDLE)handle);
|
||||||
|
|
||||||
if (waitres == threading::WaitResult::kTimeout) {
|
|
||||||
continue;
|
|
||||||
} else {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
} while (true);
|
|
||||||
}
|
}
|
||||||
virtual void WaitForIdle() override {
|
virtual void WaitForIdle() override {
|
||||||
while (!AllJobsDone()) {
|
while (write_ptr_ != read_ptr_) {
|
||||||
threading::MaybeYield();
|
threading::MaybeYield();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
XeDMACGeneric() {
|
|
||||||
XEDMALOG("Constructing xedma at addr %p", this);
|
|
||||||
dma_volatile_.free_job_slots_.store(0ULL);
|
|
||||||
dma_volatile_.jobs_submitted_.store(0ULL);
|
|
||||||
dma_volatile_.jobs_completed_.store(0ULL);
|
|
||||||
dma_volatile_.current_job_serial_.store(
|
|
||||||
1ULL); // so that a jobhandle is never 0
|
|
||||||
std::memset(jobs_, 0, sizeof(jobs_));
|
|
||||||
job_submitted_signal_ = threading::Event::CreateAutoResetEvent(false);
|
|
||||||
job_completed_signal_ = threading::Event::CreateAutoResetEvent(false);
|
|
||||||
worker_has_work_ = threading::Event::CreateManualResetEvent(false);
|
|
||||||
threading::Thread::CreationParameters worker_params{};
|
|
||||||
worker_params.create_suspended = false;
|
|
||||||
worker_params.initial_priority = threading::ThreadPriority::kBelowNormal;
|
|
||||||
worker_params.stack_size = 65536; // dont need much stack at all
|
|
||||||
|
|
||||||
for (uint32_t i = 0; i < 64; ++i) {
|
|
||||||
job_done_signals_[i] = threading::Event::CreateManualResetEvent(false);
|
|
||||||
}
|
|
||||||
for (uint32_t i = 0; i < XEDMA_NUM_WORKERS; ++i) {
|
|
||||||
// worker_has_work_[i] = threading::Event::CreateAutoResetEvent(false);
|
|
||||||
worker_has_finished_[i] = threading::Event::CreateAutoResetEvent(false);
|
|
||||||
worker_has_finished_nosafeptr_[i] = worker_has_finished_[i].get();
|
|
||||||
|
|
||||||
uintptr_t encoded = reinterpret_cast<uintptr_t>(this);
|
|
||||||
xenia_assert(!(encoded & 0xFFULL));
|
|
||||||
xenia_assert(i < 256);
|
|
||||||
|
|
||||||
encoded |= i;
|
|
||||||
|
|
||||||
workers_[i] = threading::Thread::Create(worker_params, [encoded]() {
|
|
||||||
XeDMACGeneric::WorkerMainForwarder((void*)encoded);
|
|
||||||
});
|
|
||||||
}
|
|
||||||
threading::Thread::CreationParameters scheduler_params{};
|
|
||||||
scheduler_params.create_suspended = false;
|
|
||||||
scheduler_params.initial_priority = threading::ThreadPriority::kBelowNormal;
|
|
||||||
scheduler_params.stack_size = 65536;
|
|
||||||
scheduler_thread_ = threading::Thread::Create(scheduler_params, [this]() {
|
|
||||||
XeDMACGeneric::DMAMainForwarder((void*)this);
|
|
||||||
});
|
|
||||||
}
|
|
||||||
};
|
};
|
||||||
|
void XeDMACGeneric::WorkerWait() {
|
||||||
|
constexpr unsigned NUM_PAUSE_SPINS = 2048;
|
||||||
|
constexpr unsigned NUM_YIELD_SPINS = 8;
|
||||||
|
#if 0
|
||||||
|
|
||||||
|
for (unsigned i = 0; i < NUM_PAUSE_SPINS; ++i) {
|
||||||
|
if (write_ptr_ == read_ptr_) {
|
||||||
|
_mm_pause();
|
||||||
|
} else {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for (unsigned i = 0; i < NUM_YIELD_SPINS; ++i) {
|
||||||
|
if (write_ptr_ == read_ptr_) {
|
||||||
|
threading::MaybeYield();
|
||||||
|
} else {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
LARGE_INTEGER yield_execution_delay{};
|
||||||
|
yield_execution_delay.QuadPart =
|
||||||
|
-2000; //-10000 == 1 ms, so -2000 means delay for 0.2 milliseconds
|
||||||
|
while (write_ptr_ == read_ptr_) {
|
||||||
|
NtDelayExecutionPointer.invoke<void>(0, &yield_execution_delay);
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
do {
|
||||||
|
if (WaitForSingleObjectEx(gotjob_event, 1, TRUE) == WAIT_OBJECT_0) {
|
||||||
|
while (write_ptr_ == read_ptr_) {
|
||||||
|
_mm_pause();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
} while (write_ptr_ == read_ptr_);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
void XeDMACGeneric::WorkerThreadMain() {
|
||||||
|
while (true) {
|
||||||
|
this->WorkerWait();
|
||||||
|
|
||||||
|
XeDMAJob current_job = jobs_ring_[read_ptr_ % MAX_INFLIGHT_DMAJOBS];
|
||||||
|
swcache::ReadFence();
|
||||||
|
|
||||||
|
if (current_job.precall) {
|
||||||
|
current_job.precall(¤t_job);
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t num_lines = current_job.size / XE_HOST_CACHE_LINE_SIZE;
|
||||||
|
size_t line_rounded = num_lines * XE_HOST_CACHE_LINE_SIZE;
|
||||||
|
|
||||||
|
size_t line_rem = current_job.size - line_rounded;
|
||||||
|
|
||||||
|
vastcpy(current_job.destination, current_job.source,
|
||||||
|
static_cast<uint32_t>(line_rounded));
|
||||||
|
|
||||||
|
if (line_rem) {
|
||||||
|
__movsb(current_job.destination + line_rounded,
|
||||||
|
current_job.source + line_rounded, line_rem);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (current_job.postcall) {
|
||||||
|
current_job.postcall(¤t_job);
|
||||||
|
}
|
||||||
|
read_ptr_++;
|
||||||
|
swcache::WriteFence();
|
||||||
|
|
||||||
|
SetEvent((HANDLE)current_job.dmac_specific_);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
XeDMAC* CreateDMAC() { return new XeDMACGeneric(); }
|
XeDMAC* CreateDMAC() { return new XeDMACGeneric(); }
|
||||||
} // namespace xe::dma
|
} // namespace xe::dma
|
||||||
|
|
|
@ -16,7 +16,8 @@ struct XeDMAJob;
|
||||||
using DmaPrecall = void (*)(XeDMAJob* job);
|
using DmaPrecall = void (*)(XeDMAJob* job);
|
||||||
using DmaPostcall = void (*)(XeDMAJob* job);
|
using DmaPostcall = void (*)(XeDMAJob* job);
|
||||||
struct XeDMAJob {
|
struct XeDMAJob {
|
||||||
threading::Event* signal_on_done;
|
//threading::Event* signal_on_done;
|
||||||
|
uintptr_t dmac_specific_;
|
||||||
uint8_t* destination;
|
uint8_t* destination;
|
||||||
uint8_t* source;
|
uint8_t* source;
|
||||||
size_t size;
|
size_t size;
|
||||||
|
|
|
@ -472,7 +472,7 @@ bool logging::internal::ShouldLog(LogLevel log_level) {
|
||||||
std::pair<char*, size_t> logging::internal::GetThreadBuffer() {
|
std::pair<char*, size_t> logging::internal::GetThreadBuffer() {
|
||||||
return {thread_log_buffer_, sizeof(thread_log_buffer_)};
|
return {thread_log_buffer_, sizeof(thread_log_buffer_)};
|
||||||
}
|
}
|
||||||
|
XE_NOALIAS
|
||||||
void logging::internal::AppendLogLine(LogLevel log_level,
|
void logging::internal::AppendLogLine(LogLevel log_level,
|
||||||
const char prefix_char, size_t written) {
|
const char prefix_char, size_t written) {
|
||||||
if (!logger_ || !ShouldLog(log_level) || !written) {
|
if (!logger_ || !ShouldLog(log_level) || !written) {
|
||||||
|
|
|
@ -74,11 +74,15 @@ namespace internal {
|
||||||
|
|
||||||
bool ShouldLog(LogLevel log_level);
|
bool ShouldLog(LogLevel log_level);
|
||||||
std::pair<char*, size_t> GetThreadBuffer();
|
std::pair<char*, size_t> GetThreadBuffer();
|
||||||
|
XE_NOALIAS
|
||||||
void AppendLogLine(LogLevel log_level, const char prefix_char, size_t written);
|
void AppendLogLine(LogLevel log_level, const char prefix_char, size_t written);
|
||||||
|
|
||||||
} // namespace internal
|
} // namespace internal
|
||||||
|
// Technically, noalias is incorrect here: these functions do in fact alias global memory,
|
||||||
|
// but MSVC will not optimize the calls away, and the global memory modified by the calls is limited to internal logging variables,
|
||||||
|
// so they might as well be marked noalias.
|
||||||
template <typename... Args>
|
template <typename... Args>
|
||||||
|
XE_NOALIAS
|
||||||
XE_NOINLINE XE_COLD static void AppendLogLineFormat_Impl(LogLevel log_level,
|
XE_NOINLINE XE_COLD static void AppendLogLineFormat_Impl(LogLevel log_level,
|
||||||
const char prefix_char,
|
const char prefix_char,
|
||||||
const char* format,
|
const char* format,
|
||||||
|
|
|
@ -400,10 +400,91 @@ static float ArchReciprocal(float den) {
|
||||||
return _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ss(den)));
|
return _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ss(den)));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#if 0
|
||||||
|
using ArchFloatMask = float;
|
||||||
|
|
||||||
|
XE_FORCEINLINE
|
||||||
|
static ArchFloatMask ArchCmpneqFloatMask(float x, float y) {
|
||||||
|
return _mm_cvtss_f32(_mm_cmpneq_ss(_mm_set_ss(x), _mm_set_ss(y)));
|
||||||
|
}
|
||||||
|
XE_FORCEINLINE
|
||||||
|
static ArchFloatMask ArchORFloatMask(ArchFloatMask x, ArchFloatMask y) {
|
||||||
|
return _mm_cvtss_f32(_mm_or_ps(_mm_set_ss(x), _mm_set_ss(y)));
|
||||||
|
}
|
||||||
|
XE_FORCEINLINE
|
||||||
|
static ArchFloatMask ArchXORFloatMask(ArchFloatMask x, ArchFloatMask y) {
|
||||||
|
return _mm_cvtss_f32(_mm_xor_ps(_mm_set_ss(x), _mm_set_ss(y)));
|
||||||
|
}
|
||||||
|
|
||||||
|
XE_FORCEINLINE
|
||||||
|
static ArchFloatMask ArchANDFloatMask(ArchFloatMask x, ArchFloatMask y) {
|
||||||
|
return _mm_cvtss_f32(_mm_and_ps(_mm_set_ss(x), _mm_set_ss(y)));
|
||||||
|
}
|
||||||
|
|
||||||
|
XE_FORCEINLINE
|
||||||
|
static uint32_t ArchFloatMaskSignbit(ArchFloatMask x) {
|
||||||
|
return static_cast<uint32_t>(_mm_movemask_ps(_mm_set_ss(x)));
|
||||||
|
}
|
||||||
|
|
||||||
|
constexpr ArchFloatMask floatmask_zero = .0f;
|
||||||
|
#else
|
||||||
|
using ArchFloatMask = __m128;
|
||||||
|
|
||||||
|
XE_FORCEINLINE
|
||||||
|
static ArchFloatMask ArchCmpneqFloatMask(float x, float y) {
|
||||||
|
return _mm_cmpneq_ss(_mm_set_ss(x), _mm_set_ss(y));
|
||||||
|
}
|
||||||
|
XE_FORCEINLINE
|
||||||
|
static ArchFloatMask ArchORFloatMask(ArchFloatMask x, ArchFloatMask y) {
|
||||||
|
return _mm_or_ps(x, y);
|
||||||
|
}
|
||||||
|
XE_FORCEINLINE
|
||||||
|
static ArchFloatMask ArchXORFloatMask(ArchFloatMask x, ArchFloatMask y) {
|
||||||
|
return _mm_xor_ps(x, y);
|
||||||
|
}
|
||||||
|
|
||||||
|
XE_FORCEINLINE
|
||||||
|
static ArchFloatMask ArchANDFloatMask(ArchFloatMask x, ArchFloatMask y) {
|
||||||
|
return _mm_and_ps(x, y);
|
||||||
|
}
|
||||||
|
|
||||||
|
XE_FORCEINLINE
|
||||||
|
static uint32_t ArchFloatMaskSignbit(ArchFloatMask x) {
|
||||||
|
return static_cast<uint32_t>(_mm_movemask_ps(x) &1);
|
||||||
|
}
|
||||||
|
|
||||||
|
constexpr ArchFloatMask floatmask_zero{.0f};
|
||||||
|
#endif
|
||||||
#else
|
#else
|
||||||
static float ArchMin(float x, float y) { return std::min<float>(x, y); }
|
static float ArchMin(float x, float y) { return std::min<float>(x, y); }
|
||||||
static float ArchMax(float x, float y) { return std::max<float>(x, y); }
|
static float ArchMax(float x, float y) { return std::max<float>(x, y); }
|
||||||
static float ArchReciprocal(float den) { return 1.0f / den; }
|
static float ArchReciprocal(float den) { return 1.0f / den; }
|
||||||
|
using ArchFloatMask = unsigned;
|
||||||
|
|
||||||
|
XE_FORCEINLINE
|
||||||
|
static ArchFloatMask ArchCmpneqFloatMask(float x, float y) {
|
||||||
|
return static_cast<unsigned>(-static_cast<signed>(x != y));
|
||||||
|
}
|
||||||
|
|
||||||
|
XE_FORCEINLINE
|
||||||
|
static ArchFloatMask ArchORFloatMask(ArchFloatMask x, ArchFloatMask y) {
|
||||||
|
return x | y;
|
||||||
|
}
|
||||||
|
XE_FORCEINLINE
|
||||||
|
static ArchFloatMask ArchXORFloatMask(ArchFloatMask x, ArchFloatMask y) {
|
||||||
|
return x ^ y;
|
||||||
|
}
|
||||||
|
|
||||||
|
XE_FORCEINLINE
|
||||||
|
static ArchFloatMask ArchANDFloatMask(ArchFloatMask x, ArchFloatMask y) {
|
||||||
|
return x & y;
|
||||||
|
}
|
||||||
|
constexpr ArchFloatMask floatmask_zero = 0;
|
||||||
|
|
||||||
|
|
||||||
|
XE_FORCEINLINE
|
||||||
|
static uint32_t ArchFloatMaskSignbit(ArchFloatMask x) { return x >> 31; }
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
XE_FORCEINLINE
|
XE_FORCEINLINE
|
||||||
static float RefineReciprocal(float initial, float den) {
|
static float RefineReciprocal(float initial, float den) {
|
||||||
|
|
|
@ -115,14 +115,17 @@
|
||||||
#define XE_COLD __declspec(code_seg(".cold"))
|
#define XE_COLD __declspec(code_seg(".cold"))
|
||||||
#define XE_LIKELY(...) (!!(__VA_ARGS__))
|
#define XE_LIKELY(...) (!!(__VA_ARGS__))
|
||||||
#define XE_UNLIKELY(...) (!!(__VA_ARGS__))
|
#define XE_UNLIKELY(...) (!!(__VA_ARGS__))
|
||||||
|
#define XE_MSVC_ASSUME(...) __assume(__VA_ARGS__)
|
||||||
|
#define XE_NOALIAS __declspec(noalias)
|
||||||
#elif XE_COMPILER_HAS_GNU_EXTENSIONS == 1
|
#elif XE_COMPILER_HAS_GNU_EXTENSIONS == 1
|
||||||
#define XE_FORCEINLINE __attribute__((always_inline))
|
#define XE_FORCEINLINE __attribute__((always_inline))
|
||||||
#define XE_NOINLINE __attribute__((noinline))
|
#define XE_NOINLINE __attribute__((noinline))
|
||||||
#define XE_COLD __attribute__((cold))
|
#define XE_COLD __attribute__((cold))
|
||||||
#define XE_LIKELY(...) __builtin_expect(!!(__VA_ARGS__), true)
|
#define XE_LIKELY(...) __builtin_expect(!!(__VA_ARGS__), true)
|
||||||
#define XE_UNLIKELY(...) __builtin_expect(!!(__VA_ARGS__), false)
|
#define XE_UNLIKELY(...) __builtin_expect(!!(__VA_ARGS__), false)
|
||||||
|
#define XE_NOALIAS
|
||||||
|
// GNU-style compilers can't do an unevaluated assume, so this expands to a no-op.
|
||||||
|
#define XE_MSVC_ASSUME(...) static_cast<void>(0)
|
||||||
#else
|
#else
|
||||||
#define XE_FORCEINLINE inline
|
#define XE_FORCEINLINE inline
|
||||||
#define XE_NOINLINE
|
#define XE_NOINLINE
|
||||||
|
@ -130,6 +133,9 @@
|
||||||
|
|
||||||
#define XE_LIKELY_IF(...) if (!!(__VA_ARGS__)) [[likely]]
|
#define XE_LIKELY_IF(...) if (!!(__VA_ARGS__)) [[likely]]
|
||||||
#define XE_UNLIKELY_IF(...) if (!!(__VA_ARGS__)) [[unlikely]]
|
#define XE_UNLIKELY_IF(...) if (!!(__VA_ARGS__)) [[unlikely]]
|
||||||
|
#define XE_NOALIAS
|
||||||
|
#define XE_MSVC_ASSUME(...) static_cast<void>(0)
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if XE_COMPILER_HAS_GNU_EXTENSIONS == 1
|
#if XE_COMPILER_HAS_GNU_EXTENSIONS == 1
|
||||||
|
@ -174,5 +180,7 @@ const char kPathSeparator = '/';
|
||||||
const char kGuestPathSeparator = '\\';
|
const char kGuestPathSeparator = '\\';
|
||||||
|
|
||||||
} // namespace xe
|
} // namespace xe
|
||||||
|
#if XE_ARCH_AMD64==1
|
||||||
|
#include "platform_amd64.h"
|
||||||
|
#endif
|
||||||
#endif // XENIA_BASE_PLATFORM_H_
|
#endif // XENIA_BASE_PLATFORM_H_
|
||||||
|
|
|
@ -0,0 +1,115 @@
|
||||||
|
/**
|
||||||
|
******************************************************************************
|
||||||
|
* Xenia : Xbox 360 Emulator Research Project *
|
||||||
|
******************************************************************************
|
||||||
|
* Copyright 2020 Ben Vanik. All rights reserved. *
|
||||||
|
* Released under the BSD license - see LICENSE in the root for more details. *
|
||||||
|
******************************************************************************
|
||||||
|
*/
|
||||||
|
|
||||||
|
|
||||||
|
#include "xenia/base/cvar.h"
|
||||||
|
#include "xenia/base/platform.h"
|
||||||
|
|
||||||
|
#include "third_party/xbyak/xbyak/xbyak.h"
|
||||||
|
#include "third_party/xbyak/xbyak/xbyak_util.h"
|
||||||
|
// Bit values listed in the help text must match the X64FeatureFlags enum.
// Bit 32 (1 << 5) is PrefetchW there, so the help text names it accordingly.
DEFINE_int32(x64_extension_mask, -1,
             "Allow the detection and utilization of specific instruction set "
             "features.\n"
             "    0 = x86_64 + AVX1\n"
             "    1 = AVX2\n"
             "    2 = FMA\n"
             "    4 = LZCNT\n"
             "    8 = BMI1\n"
             "   16 = BMI2\n"
             "   32 = PrefetchW\n"
             "   64 = Movbe\n"
             "  128 = GFNI\n"
             "  256 = AVX512F\n"
             "  512 = AVX512VL\n"
             " 1024 = AVX512BW\n"
             " 2048 = AVX512DQ\n"
             "   -1 = Detect and utilize all possible processor features\n",
             "x64");
|
||||||
|
namespace xe {
|
||||||
|
namespace amd64 {
|
||||||
|
static uint32_t g_feature_flags = 0U;
|
||||||
|
static bool g_did_initialize_feature_flags = false;
|
||||||
|
uint32_t GetFeatureFlags() {
|
||||||
|
xenia_assert(g_did_initialize_feature_flags);
|
||||||
|
return g_feature_flags;
|
||||||
|
}
|
||||||
|
// Probes the host CPU and stores the resulting X64FeatureFlags bit set in
// g_feature_flags, then marks it initialized. A feature is only reported when
// the x64_extension_mask cvar has its bit set and the CPU reports support.
XE_COLD
XE_NOINLINE
void InitFeatureFlags() {
  uint32_t detected = 0U;

  Xbyak::util::Cpu cpu;

  // True when the user-supplied extension mask permits `flag`.
  const auto allowed_by_mask = [](auto flag) {
    return (cvars::x64_extension_mask & flag) == flag;
  };

#define TEST_EMIT_FEATURE(emit, ext)       \
  if (allowed_by_mask(emit)) {             \
    detected |= (cpu.has(ext) ? emit : 0); \
  }

  TEST_EMIT_FEATURE(kX64EmitAVX2, Xbyak::util::Cpu::tAVX2);
  TEST_EMIT_FEATURE(kX64EmitFMA, Xbyak::util::Cpu::tFMA);
  TEST_EMIT_FEATURE(kX64EmitLZCNT, Xbyak::util::Cpu::tLZCNT);
  TEST_EMIT_FEATURE(kX64EmitBMI1, Xbyak::util::Cpu::tBMI1);
  TEST_EMIT_FEATURE(kX64EmitBMI2, Xbyak::util::Cpu::tBMI2);
  TEST_EMIT_FEATURE(kX64EmitMovbe, Xbyak::util::Cpu::tMOVBE);
  TEST_EMIT_FEATURE(kX64EmitGFNI, Xbyak::util::Cpu::tGFNI);
  TEST_EMIT_FEATURE(kX64EmitAVX512F, Xbyak::util::Cpu::tAVX512F);
  TEST_EMIT_FEATURE(kX64EmitAVX512VL, Xbyak::util::Cpu::tAVX512VL);
  TEST_EMIT_FEATURE(kX64EmitAVX512BW, Xbyak::util::Cpu::tAVX512BW);
  TEST_EMIT_FEATURE(kX64EmitAVX512DQ, Xbyak::util::Cpu::tAVX512DQ);
  TEST_EMIT_FEATURE(kX64EmitAVX512VBMI, Xbyak::util::Cpu::tAVX512VBMI);
  TEST_EMIT_FEATURE(kX64EmitPrefetchW, Xbyak::util::Cpu::tPREFETCHW);
#undef TEST_EMIT_FEATURE

  /*
    fix for xbyak bug/omission, amd cpus are never checked for lzcnt. fixed in
    latest version of xbyak
  */
  unsigned int regs[4];
  Xbyak::util::Cpu::getCpuid(0x80000001, regs);
  // Third register returned for CPUID leaf 0x80000001; the bits tested below
  // are taken from it.
  const unsigned ext_ecx = regs[2];

  if ((ext_ecx & (1U << 5)) && allowed_by_mask(kX64EmitLZCNT)) {
    detected |= kX64EmitLZCNT;
  }
  // todo: although not reported by cpuid, zen 1 and zen+ also have fma4
  if ((ext_ecx & (1U << 16)) && allowed_by_mask(kX64EmitFMA4)) {
    detected |= kX64EmitFMA4;
  }
  if ((ext_ecx & (1U << 21)) && allowed_by_mask(kX64EmitTBM)) {
    detected |= kX64EmitTBM;
  }
  if ((ext_ecx & (1U << 11)) && allowed_by_mask(kX64EmitXOP)) {
    detected |= kX64EmitXOP;
  }

  if (cpu.has(Xbyak::util::Cpu::tAMD)) {
    const bool zen_or_newer = cpu.displayFamily >= 0x17;
    /*
      chrispy: according to agner's tables, all amd architectures that
      we support (ones with avx) have the same timings for
      jrcxz/loop/loope/loopne as for other jmps
    */
    detected |= kX64FastJrcx;
    detected |= kX64FastLoop;
    if (zen_or_newer) {
      // ik that i heard somewhere that this is the case for zen, but i need to
      // verify. cant find my original source for that.
      // todo: ask agner?
      detected |= kX64FlagsIndependentVars;
    }
  }

  // Publish the result; GetFeatureFlags() asserts on the second variable.
  g_feature_flags = detected;
  g_did_initialize_feature_flags = true;
}
|
||||||
|
} // namespace amd64
|
||||||
|
} // namespace xe
|
|
@ -0,0 +1,61 @@
|
||||||
|
/**
|
||||||
|
******************************************************************************
|
||||||
|
* Xenia : Xbox 360 Emulator Research Project *
|
||||||
|
******************************************************************************
|
||||||
|
* Copyright 2019 Ben Vanik. All rights reserved. *
|
||||||
|
* Released under the BSD license - see LICENSE in the root for more details. *
|
||||||
|
******************************************************************************
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef XENIA_BASE_PLATFORM_AMD64_H_
|
||||||
|
#define XENIA_BASE_PLATFORM_AMD64_H_
|
||||||
|
#include <cstdint>
|
||||||
|
|
||||||
|
namespace xe {
|
||||||
|
namespace amd64 {
|
||||||
|
// Bit flags describing host CPU instruction-set extensions and a few
// microarchitectural properties. Flags are OR'd together into one 32-bit
// mask (see GetFeatureFlags()). The kX64Emit* flags gate emission of the
// corresponding instructions; the remaining flags are performance hints.
enum X64FeatureFlags {
  kX64EmitAVX2 = 1 << 0,
  kX64EmitFMA = 1 << 1,
  kX64EmitLZCNT = 1 << 2,  // this is actually ABM and includes popcount
  kX64EmitBMI1 = 1 << 3,
  kX64EmitBMI2 = 1 << 4,
  kX64EmitPrefetchW = 1 << 5,
  kX64EmitMovbe = 1 << 6,
  kX64EmitGFNI = 1 << 7,

  kX64EmitAVX512F = 1 << 8,
  kX64EmitAVX512VL = 1 << 9,

  kX64EmitAVX512BW = 1 << 10,
  kX64EmitAVX512DQ = 1 << 11,

  // Convenience combinations of the AVX-512 flags above. Test these with
  // (flags & mask) == mask, since more than one bit is set.
  kX64EmitAVX512Ortho = kX64EmitAVX512F | kX64EmitAVX512VL,
  kX64EmitAVX512Ortho64 = kX64EmitAVX512Ortho | kX64EmitAVX512DQ,
  kX64FastJrcx = 1 << 12,  // jrcxz is as fast as any other jump ( >= Zen1)
  kX64FastLoop =
      1 << 13,  // loop/loope/loopne is as fast as any other jump ( >= Zen2)
  kX64EmitAVX512VBMI = 1 << 14,
  kX64FlagsIndependentVars =
      1 << 15,  // if true, instructions that only modify some flags (like
                // inc/dec) do not introduce false dependencies on EFLAGS
                // because the individual flags are treated as different vars
                // by the processor. (this applies to zen)
  kX64EmitXOP = 1 << 16,   // chrispy: xop maps really well to many vmx
                           // instructions, and FX users need the boost
  kX64EmitFMA4 = 1 << 17,  // todo: also use on zen1?
  kX64EmitTBM = 1 << 18,
  // kX64XMMRegisterMergeOptimization = 1 << 19,  // section 2.11.5, amd family
  // 17h/19h optimization manuals. allows us to save 1 byte on certain xmm
  // instructions by using the legacy sse version if we recently cleared the
  // high 128 bits of the register.
  // NOTE(review): the original comment was cut off after "of the";
  // presumably it referred to the corresponding ymm register - confirm.
};
|
||||||
|
|
||||||
|
XE_NOALIAS
|
||||||
|
uint32_t GetFeatureFlags();
|
||||||
|
XE_COLD
|
||||||
|
void InitFeatureFlags();
|
||||||
|
|
||||||
|
}
|
||||||
|
} // namespace xe
|
||||||
|
|
||||||
|
#endif // XENIA_BASE_PLATFORM_AMD64_H_
|
|
@ -0,0 +1,40 @@
|
||||||
|
#pragma once

#include <cstring>
|
||||||
|
namespace xe {
|
||||||
|
/*
  A very simple freelist, intended to be used with HIRFunction/Arena to
  eliminate our last-level cache miss problems with HIR simplifications.
  Not thread safe; it doesn't need to be.
*/
|
||||||
|
// Intrusive singly linked LIFO list of recycled T slots.
//
// The list never allocates or frees memory: DeleteEntry() threads a slot that
// the caller already owns onto the list, and NewEntry() hands the most
// recently deleted slot back, zero-filled. The link pointer is stored inside
// the slot itself, so a slot's previous contents are destroyed once it is
// deleted.
//
// Requirements on T, assumed rather than fully checked:
//   - sizeof(T) >= sizeof(void*), so the link fits (checked below).
//   - T tolerates being overwritten with zero bytes and used without a
//     constructor call; no constructor or destructor is ever run here.
//   - T's storage is suitably aligned for a pointer, because the slot is
//     accessed through a Node*.
template <typename T>
struct SimpleFreelist {
  // A slot is either a link in the freelist or a live entry, never both.
  union Node {
    Node* next_;
    T entry_;
  };
  // Most recently deleted slot, or nullptr when the list is empty.
  Node* head_;

  static_assert(sizeof(T) >= sizeof(void*),
                "T must be at least pointer-sized to hold the freelist link");
  SimpleFreelist() : head_(nullptr) {}

  // Pops the most recently deleted slot and returns it zero-filled.
  // Returns nullptr when the list is empty; the caller then has to obtain
  // storage from somewhere else.
  T* NewEntry() {
    Node* const result_node = head_;
    if (!result_node) {
      return nullptr;
    }
    head_ = result_node->next_;

    // Wipe the link (and everything else) so the caller starts from an
    // all-zero T. Placement-new construction was considered originally but
    // is deliberately not done.
    std::memset(result_node, 0, sizeof(T));
    return &result_node->entry_;
  }

  // Pushes `value` onto the list for later reuse. The slot is zeroed and its
  // leading bytes then hold the link, so `value` must not be read as a T
  // again until NewEntry() returns it.
  void DeleteEntry(T* value) {
    std::memset(value, 0, sizeof(T));
    Node* const node = reinterpret_cast<Node*>(value);
    node->next_ = head_;
    head_ = node;
  }

  // Forgets every recycled slot without touching the slots themselves.
  void Reset() { head_ = nullptr; }
};
|
||||||
|
} // namespace xe
|
|
@ -50,6 +50,9 @@ XE_NTDLL_IMPORT(NtPulseEvent, cls_NtPulseEvent, NtPulseEventPointer);
|
||||||
// counts
|
// counts
|
||||||
XE_NTDLL_IMPORT(NtReleaseSemaphore, cls_NtReleaseSemaphore,
|
XE_NTDLL_IMPORT(NtReleaseSemaphore, cls_NtReleaseSemaphore,
|
||||||
NtReleaseSemaphorePointer);
|
NtReleaseSemaphorePointer);
|
||||||
|
|
||||||
|
XE_NTDLL_IMPORT(NtDelayExecution, cls_NtDelayExecution,
|
||||||
|
NtDelayExecutionPointer);
|
||||||
namespace xe {
|
namespace xe {
|
||||||
namespace threading {
|
namespace threading {
|
||||||
|
|
||||||
|
@ -109,13 +112,30 @@ void set_name(const std::string_view name) {
|
||||||
set_name(GetCurrentThread(), name);
|
set_name(GetCurrentThread(), name);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// checked ntoskrnl, it does not modify delay, so we can place this as a
|
||||||
|
// constant and avoid creating a stack variable
|
||||||
|
static const LARGE_INTEGER sleepdelay0_for_maybeyield{0LL};
|
||||||
|
|
||||||
void MaybeYield() {
|
void MaybeYield() {
|
||||||
|
#if 0
|
||||||
#if defined(XE_USE_NTDLL_FUNCTIONS)
|
#if defined(XE_USE_NTDLL_FUNCTIONS)
|
||||||
|
|
||||||
NtYieldExecutionPointer.invoke();
|
NtYieldExecutionPointer.invoke();
|
||||||
#else
|
#else
|
||||||
SwitchToThread();
|
SwitchToThread();
|
||||||
#endif
|
#endif
|
||||||
|
#else
|
||||||
|
// chrispy: SwitchToThread will only switch to a ready thread on the current
|
||||||
|
// processor, so if one is not ready we end up spinning, constantly calling
|
||||||
|
// switchtothread without doing any work, heating up the users cpu sleep(0)
|
||||||
|
// however will yield to threads on other processors and surrenders the
|
||||||
|
// current timeslice
|
||||||
|
#if defined(XE_USE_NTDLL_FUNCTIONS)
|
||||||
|
NtDelayExecutionPointer.invoke(0, &sleepdelay0_for_maybeyield);
|
||||||
|
#else
|
||||||
|
::Sleep(0);
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
// memorybarrier is really not necessary here...
|
// memorybarrier is really not necessary here...
|
||||||
MemoryBarrier();
|
MemoryBarrier();
|
||||||
}
|
}
|
||||||
|
|
|
@ -26,24 +26,6 @@
|
||||||
#include "xenia/cpu/processor.h"
|
#include "xenia/cpu/processor.h"
|
||||||
#include "xenia/cpu/stack_walker.h"
|
#include "xenia/cpu/stack_walker.h"
|
||||||
#include "xenia/cpu/xex_module.h"
|
#include "xenia/cpu/xex_module.h"
|
||||||
DEFINE_int32(x64_extension_mask, -1,
|
|
||||||
"Allow the detection and utilization of specific instruction set "
|
|
||||||
"features.\n"
|
|
||||||
" 0 = x86_64 + AVX1\n"
|
|
||||||
" 1 = AVX2\n"
|
|
||||||
" 2 = FMA\n"
|
|
||||||
" 4 = LZCNT\n"
|
|
||||||
" 8 = BMI1\n"
|
|
||||||
" 16 = BMI2\n"
|
|
||||||
" 32 = F16C\n"
|
|
||||||
" 64 = Movbe\n"
|
|
||||||
" 128 = GFNI\n"
|
|
||||||
" 256 = AVX512F\n"
|
|
||||||
" 512 = AVX512VL\n"
|
|
||||||
" 1024 = AVX512BW\n"
|
|
||||||
" 2048 = AVX512DQ\n"
|
|
||||||
" -1 = Detect and utilize all possible processor features\n",
|
|
||||||
"x64");
|
|
||||||
|
|
||||||
DEFINE_bool(record_mmio_access_exceptions, true,
|
DEFINE_bool(record_mmio_access_exceptions, true,
|
||||||
"For guest addresses records whether we caught any mmio accesses "
|
"For guest addresses records whether we caught any mmio accesses "
|
||||||
|
|
|
@ -103,7 +103,9 @@ X64Emitter::X64Emitter(X64Backend* backend, XbyakAllocator* allocator)
|
||||||
"FAQ for system requirements at https://xenia.jp");
|
"FAQ for system requirements at https://xenia.jp");
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
#if 1
|
||||||
|
feature_flags_ = amd64::GetFeatureFlags();
|
||||||
|
#else
|
||||||
#define TEST_EMIT_FEATURE(emit, ext) \
|
#define TEST_EMIT_FEATURE(emit, ext) \
|
||||||
if ((cvars::x64_extension_mask & emit) == emit) { \
|
if ((cvars::x64_extension_mask & emit) == emit) { \
|
||||||
feature_flags_ |= (cpu_.has(ext) ? emit : 0); \
|
feature_flags_ |= (cpu_.has(ext) ? emit : 0); \
|
||||||
|
@ -168,6 +170,7 @@ X64Emitter::X64Emitter(X64Backend* backend, XbyakAllocator* allocator)
|
||||||
feature_flags_ |= kX64FlagsIndependentVars;
|
feature_flags_ |= kX64FlagsIndependentVars;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
may_use_membase32_as_zero_reg_ =
|
may_use_membase32_as_zero_reg_ =
|
||||||
static_cast<uint32_t>(reinterpret_cast<uintptr_t>(
|
static_cast<uint32_t>(reinterpret_cast<uintptr_t>(
|
||||||
processor()->memory()->virtual_membase())) == 0;
|
processor()->memory()->virtual_membase())) == 0;
|
||||||
|
@ -913,6 +916,8 @@ static inline vec128_t v128_setr_bytes(unsigned char v0, unsigned char v1,
|
||||||
|
|
||||||
static const vec128_t xmm_consts[] = {
|
static const vec128_t xmm_consts[] = {
|
||||||
/* XMMZero */ vec128f(0.0f),
|
/* XMMZero */ vec128f(0.0f),
|
||||||
|
/* XMMByteSwapMask */
|
||||||
|
vec128i(0x00010203u, 0x04050607u, 0x08090A0Bu, 0x0C0D0E0Fu),
|
||||||
/* XMMOne */ vec128f(1.0f),
|
/* XMMOne */ vec128f(1.0f),
|
||||||
/* XMMOnePD */ vec128d(1.0),
|
/* XMMOnePD */ vec128d(1.0),
|
||||||
/* XMMNegativeOne */ vec128f(-1.0f, -1.0f, -1.0f, -1.0f),
|
/* XMMNegativeOne */ vec128f(-1.0f, -1.0f, -1.0f, -1.0f),
|
||||||
|
@ -937,8 +942,7 @@ static const vec128_t xmm_consts[] = {
|
||||||
vec128i(0x7FFFFFFFu, 0x7FFFFFFFu, 0x7FFFFFFFu, 0x7FFFFFFFu),
|
vec128i(0x7FFFFFFFu, 0x7FFFFFFFu, 0x7FFFFFFFu, 0x7FFFFFFFu),
|
||||||
/* XMMAbsMaskPD */
|
/* XMMAbsMaskPD */
|
||||||
vec128i(0xFFFFFFFFu, 0x7FFFFFFFu, 0xFFFFFFFFu, 0x7FFFFFFFu),
|
vec128i(0xFFFFFFFFu, 0x7FFFFFFFu, 0xFFFFFFFFu, 0x7FFFFFFFu),
|
||||||
/* XMMByteSwapMask */
|
|
||||||
vec128i(0x00010203u, 0x04050607u, 0x08090A0Bu, 0x0C0D0E0Fu),
|
|
||||||
/* XMMByteOrderMask */
|
/* XMMByteOrderMask */
|
||||||
vec128i(0x01000302u, 0x05040706u, 0x09080B0Au, 0x0D0C0F0Eu),
|
vec128i(0x01000302u, 0x05040706u, 0x09080B0Au, 0x0D0C0F0Eu),
|
||||||
/* XMMPermuteControl15 */ vec128b(15),
|
/* XMMPermuteControl15 */ vec128b(15),
|
||||||
|
|
|
@ -34,7 +34,7 @@ namespace xe {
|
||||||
namespace cpu {
|
namespace cpu {
|
||||||
namespace backend {
|
namespace backend {
|
||||||
namespace x64 {
|
namespace x64 {
|
||||||
|
using namespace amd64;
|
||||||
class X64Backend;
|
class X64Backend;
|
||||||
class X64CodeCache;
|
class X64CodeCache;
|
||||||
|
|
||||||
|
@ -81,6 +81,7 @@ static SimdDomain PickDomain2(SimdDomain dom1, SimdDomain dom2) {
|
||||||
}
|
}
|
||||||
enum XmmConst {
|
enum XmmConst {
|
||||||
XMMZero = 0,
|
XMMZero = 0,
|
||||||
|
XMMByteSwapMask,
|
||||||
XMMOne,
|
XMMOne,
|
||||||
XMMOnePD,
|
XMMOnePD,
|
||||||
XMMNegativeOne,
|
XMMNegativeOne,
|
||||||
|
@ -97,7 +98,7 @@ enum XmmConst {
|
||||||
XMMSignMaskPD,
|
XMMSignMaskPD,
|
||||||
XMMAbsMaskPS,
|
XMMAbsMaskPS,
|
||||||
XMMAbsMaskPD,
|
XMMAbsMaskPD,
|
||||||
XMMByteSwapMask,
|
|
||||||
XMMByteOrderMask,
|
XMMByteOrderMask,
|
||||||
XMMPermuteControl15,
|
XMMPermuteControl15,
|
||||||
XMMPermuteByteMask,
|
XMMPermuteByteMask,
|
||||||
|
@ -189,42 +190,6 @@ class XbyakAllocator : public Xbyak::Allocator {
|
||||||
virtual bool useProtect() const { return false; }
|
virtual bool useProtect() const { return false; }
|
||||||
};
|
};
|
||||||
|
|
||||||
enum X64EmitterFeatureFlags {
|
|
||||||
kX64EmitAVX2 = 1 << 0,
|
|
||||||
kX64EmitFMA = 1 << 1,
|
|
||||||
kX64EmitLZCNT = 1 << 2, // this is actually ABM and includes popcount
|
|
||||||
kX64EmitBMI1 = 1 << 3,
|
|
||||||
kX64EmitBMI2 = 1 << 4,
|
|
||||||
kX64EmitPrefetchW = 1 << 5,
|
|
||||||
kX64EmitMovbe = 1 << 6,
|
|
||||||
kX64EmitGFNI = 1 << 7,
|
|
||||||
|
|
||||||
kX64EmitAVX512F = 1 << 8,
|
|
||||||
kX64EmitAVX512VL = 1 << 9,
|
|
||||||
|
|
||||||
kX64EmitAVX512BW = 1 << 10,
|
|
||||||
kX64EmitAVX512DQ = 1 << 11,
|
|
||||||
|
|
||||||
kX64EmitAVX512Ortho = kX64EmitAVX512F | kX64EmitAVX512VL,
|
|
||||||
kX64EmitAVX512Ortho64 = kX64EmitAVX512Ortho | kX64EmitAVX512DQ,
|
|
||||||
kX64FastJrcx = 1 << 12, // jrcxz is as fast as any other jump ( >= Zen1)
|
|
||||||
kX64FastLoop =
|
|
||||||
1 << 13, // loop/loope/loopne is as fast as any other jump ( >= Zen2)
|
|
||||||
kX64EmitAVX512VBMI = 1 << 14,
|
|
||||||
kX64FlagsIndependentVars =
|
|
||||||
1 << 15, // if true, instructions that only modify some flags (like
|
|
||||||
// inc/dec) do not introduce false dependencies on EFLAGS
|
|
||||||
// because the individual flags are treated as different vars by
|
|
||||||
// the processor. (this applies to zen)
|
|
||||||
kX64EmitXOP = 1 << 16, // chrispy: xop maps really well to many vmx
|
|
||||||
// instructions, and FX users need the boost
|
|
||||||
kX64EmitFMA4 = 1 << 17, // todo: also use on zen1?
|
|
||||||
kX64EmitTBM = 1 << 18,
|
|
||||||
// kX64XMMRegisterMergeOptimization = 1 << 19, //section 2.11.5, amd family
|
|
||||||
// 17h/19h optimization manuals. allows us to save 1 byte on certain xmm
|
|
||||||
// instructions by using the legacy sse version if we recently cleared the
|
|
||||||
// high 128 bits of the
|
|
||||||
};
|
|
||||||
class ResolvableGuestCall {
|
class ResolvableGuestCall {
|
||||||
public:
|
public:
|
||||||
bool is_jump_;
|
bool is_jump_;
|
||||||
|
|
|
@ -1354,7 +1354,6 @@ struct VECTOR_SHA_V128
|
||||||
static void EmitInt8(X64Emitter& e, const EmitArgType& i) {
|
static void EmitInt8(X64Emitter& e, const EmitArgType& i) {
|
||||||
// TODO(benvanik): native version (with shift magic).
|
// TODO(benvanik): native version (with shift magic).
|
||||||
if (i.src2.is_constant) {
|
if (i.src2.is_constant) {
|
||||||
if (e.IsFeatureEnabled(kX64EmitGFNI)) {
|
|
||||||
const auto& shamt = i.src2.constant();
|
const auto& shamt = i.src2.constant();
|
||||||
bool all_same = true;
|
bool all_same = true;
|
||||||
for (size_t n = 0; n < 16 - n; ++n) {
|
for (size_t n = 0; n < 16 - n; ++n) {
|
||||||
|
@ -1363,6 +1362,9 @@ struct VECTOR_SHA_V128
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
if (e.IsFeatureEnabled(kX64EmitGFNI)) {
|
||||||
if (all_same) {
|
if (all_same) {
|
||||||
// Every count is the same, so we can use gf2p8affineqb.
|
// Every count is the same, so we can use gf2p8affineqb.
|
||||||
const uint8_t shift_amount = shamt.u8[0] & 0b111;
|
const uint8_t shift_amount = shamt.u8[0] & 0b111;
|
||||||
|
@ -1375,6 +1377,19 @@ struct VECTOR_SHA_V128
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
else if (all_same) {
|
||||||
|
Xmm to_be_shifted = GetInputRegOrConstant(e, i.src1, e.xmm1);
|
||||||
|
|
||||||
|
e.vpmovsxbw(e.xmm0, to_be_shifted); //_mm_srai_epi16 / psraw
|
||||||
|
e.vpunpckhqdq(e.xmm2, to_be_shifted, to_be_shifted);
|
||||||
|
e.vpmovsxbw(e.xmm1, e.xmm2);
|
||||||
|
e.vpsraw(e.xmm0, shamt.u8[0]);
|
||||||
|
e.vpsraw(e.xmm1, shamt.u8[0]);
|
||||||
|
e.vpacksswb(i.dest, e.xmm0, e.xmm1);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
e.lea(e.GetNativeParam(1), e.StashConstantXmm(1, i.src2.constant()));
|
e.lea(e.GetNativeParam(1), e.StashConstantXmm(1, i.src2.constant()));
|
||||||
} else {
|
} else {
|
||||||
e.lea(e.GetNativeParam(1), e.StashXmm(1, i.src2));
|
e.lea(e.GetNativeParam(1), e.StashXmm(1, i.src2));
|
||||||
|
|
|
@ -3234,7 +3234,17 @@ struct SET_ROUNDING_MODE_I32
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
EMITTER_OPCODE_TABLE(OPCODE_SET_ROUNDING_MODE, SET_ROUNDING_MODE_I32);
|
EMITTER_OPCODE_TABLE(OPCODE_SET_ROUNDING_MODE, SET_ROUNDING_MODE_I32);
|
||||||
|
// ============================================================================
|
||||||
|
// OPCODE_DELAY_EXECUTION
|
||||||
|
// ============================================================================
|
||||||
|
struct DELAY_EXECUTION
|
||||||
|
: Sequence<DELAY_EXECUTION, I<OPCODE_DELAY_EXECUTION, VoidOp>> {
|
||||||
|
static void Emit(X64Emitter& e, const EmitArgType& i) {
|
||||||
|
// todo: what if they dont have smt?
|
||||||
|
e.pause();
|
||||||
|
}
|
||||||
|
};
|
||||||
|
EMITTER_OPCODE_TABLE(OPCODE_DELAY_EXECUTION, DELAY_EXECUTION);
|
||||||
// Include anchors to other sequence sources so they get included in the build.
|
// Include anchors to other sequence sources so they get included in the build.
|
||||||
extern volatile int anchor_control;
|
extern volatile int anchor_control;
|
||||||
static int anchor_control_dest = anchor_control;
|
static int anchor_control_dest = anchor_control;
|
||||||
|
|
|
@ -98,7 +98,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
|
||||||
if (i->src1.value->IsConstantTrue()) {
|
if (i->src1.value->IsConstantTrue()) {
|
||||||
i->Replace(&OPCODE_DEBUG_BREAK_info, i->flags);
|
i->Replace(&OPCODE_DEBUG_BREAK_info, i->flags);
|
||||||
} else {
|
} else {
|
||||||
i->Remove();
|
i->UnlinkAndNOP();
|
||||||
}
|
}
|
||||||
result = true;
|
result = true;
|
||||||
}
|
}
|
||||||
|
@ -109,7 +109,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
|
||||||
if (i->src1.value->IsConstantTrue()) {
|
if (i->src1.value->IsConstantTrue()) {
|
||||||
i->Replace(&OPCODE_TRAP_info, i->flags);
|
i->Replace(&OPCODE_TRAP_info, i->flags);
|
||||||
} else {
|
} else {
|
||||||
i->Remove();
|
i->UnlinkAndNOP();
|
||||||
}
|
}
|
||||||
result = true;
|
result = true;
|
||||||
}
|
}
|
||||||
|
@ -122,7 +122,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
|
||||||
i->Replace(&OPCODE_CALL_info, i->flags);
|
i->Replace(&OPCODE_CALL_info, i->flags);
|
||||||
i->src1.symbol = symbol;
|
i->src1.symbol = symbol;
|
||||||
} else {
|
} else {
|
||||||
i->Remove();
|
i->UnlinkAndNOP();
|
||||||
}
|
}
|
||||||
result = true;
|
result = true;
|
||||||
}
|
}
|
||||||
|
@ -146,7 +146,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
|
||||||
i->Replace(&OPCODE_CALL_INDIRECT_info, i->flags);
|
i->Replace(&OPCODE_CALL_INDIRECT_info, i->flags);
|
||||||
i->set_src1(value);
|
i->set_src1(value);
|
||||||
} else {
|
} else {
|
||||||
i->Remove();
|
i->UnlinkAndNOP();
|
||||||
}
|
}
|
||||||
result = true;
|
result = true;
|
||||||
} else if (i->src2.value->IsConstant()) { // chrispy: fix h3 bug from
|
} else if (i->src2.value->IsConstant()) { // chrispy: fix h3 bug from
|
||||||
|
@ -172,7 +172,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
|
||||||
i->Replace(&OPCODE_BRANCH_info, i->flags);
|
i->Replace(&OPCODE_BRANCH_info, i->flags);
|
||||||
i->src1.label = label;
|
i->src1.label = label;
|
||||||
} else {
|
} else {
|
||||||
i->Remove();
|
i->UnlinkAndNOP();
|
||||||
}
|
}
|
||||||
result = true;
|
result = true;
|
||||||
}
|
}
|
||||||
|
@ -184,7 +184,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
|
||||||
i->Replace(&OPCODE_BRANCH_info, i->flags);
|
i->Replace(&OPCODE_BRANCH_info, i->flags);
|
||||||
i->src1.label = label;
|
i->src1.label = label;
|
||||||
} else {
|
} else {
|
||||||
i->Remove();
|
i->UnlinkAndNOP();
|
||||||
}
|
}
|
||||||
result = true;
|
result = true;
|
||||||
}
|
}
|
||||||
|
@ -195,7 +195,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
|
||||||
TypeName target_type = v->type;
|
TypeName target_type = v->type;
|
||||||
v->set_from(i->src1.value);
|
v->set_from(i->src1.value);
|
||||||
v->Cast(target_type);
|
v->Cast(target_type);
|
||||||
i->Remove();
|
i->UnlinkAndNOP();
|
||||||
result = true;
|
result = true;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
@ -204,7 +204,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
|
||||||
TypeName target_type = v->type;
|
TypeName target_type = v->type;
|
||||||
v->set_from(i->src1.value);
|
v->set_from(i->src1.value);
|
||||||
v->Convert(target_type, RoundMode(i->flags));
|
v->Convert(target_type, RoundMode(i->flags));
|
||||||
i->Remove();
|
i->UnlinkAndNOP();
|
||||||
result = true;
|
result = true;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
@ -212,7 +212,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
|
||||||
if (i->src1.value->IsConstant()) {
|
if (i->src1.value->IsConstant()) {
|
||||||
v->set_from(i->src1.value);
|
v->set_from(i->src1.value);
|
||||||
v->Round(RoundMode(i->flags));
|
v->Round(RoundMode(i->flags));
|
||||||
i->Remove();
|
i->UnlinkAndNOP();
|
||||||
result = true;
|
result = true;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
@ -221,7 +221,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
|
||||||
TypeName target_type = v->type;
|
TypeName target_type = v->type;
|
||||||
v->set_from(i->src1.value);
|
v->set_from(i->src1.value);
|
||||||
v->ZeroExtend(target_type);
|
v->ZeroExtend(target_type);
|
||||||
i->Remove();
|
i->UnlinkAndNOP();
|
||||||
result = true;
|
result = true;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
@ -230,7 +230,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
|
||||||
TypeName target_type = v->type;
|
TypeName target_type = v->type;
|
||||||
v->set_from(i->src1.value);
|
v->set_from(i->src1.value);
|
||||||
v->SignExtend(target_type);
|
v->SignExtend(target_type);
|
||||||
i->Remove();
|
i->UnlinkAndNOP();
|
||||||
result = true;
|
result = true;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
@ -239,7 +239,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
|
||||||
TypeName target_type = v->type;
|
TypeName target_type = v->type;
|
||||||
v->set_from(i->src1.value);
|
v->set_from(i->src1.value);
|
||||||
v->Truncate(target_type);
|
v->Truncate(target_type);
|
||||||
i->Remove();
|
i->UnlinkAndNOP();
|
||||||
result = true;
|
result = true;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
@ -247,7 +247,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
|
||||||
if (i->src1.value->IsConstant()) {
|
if (i->src1.value->IsConstant()) {
|
||||||
if (!(i->src1.value->AsUint32() & 0xF)) {
|
if (!(i->src1.value->AsUint32() & 0xF)) {
|
||||||
v->set_zero(VEC128_TYPE);
|
v->set_zero(VEC128_TYPE);
|
||||||
i->Remove();
|
i->UnlinkAndNOP();
|
||||||
result = true;
|
result = true;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
@ -281,22 +281,22 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
|
||||||
switch (v->type) {
|
switch (v->type) {
|
||||||
case INT8_TYPE:
|
case INT8_TYPE:
|
||||||
v->set_constant(xe::load<uint8_t>(host_addr));
|
v->set_constant(xe::load<uint8_t>(host_addr));
|
||||||
i->Remove();
|
i->UnlinkAndNOP();
|
||||||
result = true;
|
result = true;
|
||||||
break;
|
break;
|
||||||
case INT16_TYPE:
|
case INT16_TYPE:
|
||||||
v->set_constant(xe::load<uint16_t>(host_addr));
|
v->set_constant(xe::load<uint16_t>(host_addr));
|
||||||
i->Remove();
|
i->UnlinkAndNOP();
|
||||||
result = true;
|
result = true;
|
||||||
break;
|
break;
|
||||||
case INT32_TYPE:
|
case INT32_TYPE:
|
||||||
v->set_constant(xe::load<uint32_t>(host_addr));
|
v->set_constant(xe::load<uint32_t>(host_addr));
|
||||||
i->Remove();
|
i->UnlinkAndNOP();
|
||||||
result = true;
|
result = true;
|
||||||
break;
|
break;
|
||||||
case INT64_TYPE:
|
case INT64_TYPE:
|
||||||
v->set_constant(xe::load<uint64_t>(host_addr));
|
v->set_constant(xe::load<uint64_t>(host_addr));
|
||||||
i->Remove();
|
i->UnlinkAndNOP();
|
||||||
result = true;
|
result = true;
|
||||||
break;
|
break;
|
||||||
case VEC128_TYPE:
|
case VEC128_TYPE:
|
||||||
|
@ -304,7 +304,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
|
||||||
val.low = xe::load<uint64_t>(host_addr);
|
val.low = xe::load<uint64_t>(host_addr);
|
||||||
val.high = xe::load<uint64_t>(host_addr + 8);
|
val.high = xe::load<uint64_t>(host_addr + 8);
|
||||||
v->set_constant(val);
|
v->set_constant(val);
|
||||||
i->Remove();
|
i->UnlinkAndNOP();
|
||||||
result = true;
|
result = true;
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
|
@ -357,14 +357,14 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
|
||||||
i->src3.value->IsConstant()) {
|
i->src3.value->IsConstant()) {
|
||||||
v->set_from(i->src2.value);
|
v->set_from(i->src2.value);
|
||||||
v->Select(i->src3.value, i->src1.value);
|
v->Select(i->src3.value, i->src1.value);
|
||||||
i->Remove();
|
i->UnlinkAndNOP();
|
||||||
result = true;
|
result = true;
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
if (i->src2.value->IsConstant() && i->src3.value->IsConstant()) {
|
if (i->src2.value->IsConstant() && i->src3.value->IsConstant()) {
|
||||||
v->set_from(i->src2.value);
|
v->set_from(i->src2.value);
|
||||||
v->Select(i->src3.value, i->src1.value);
|
v->Select(i->src3.value, i->src1.value);
|
||||||
i->Remove();
|
i->UnlinkAndNOP();
|
||||||
result = true;
|
result = true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -381,7 +381,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
|
||||||
} else {
|
} else {
|
||||||
v->set_constant(uint8_t(0));
|
v->set_constant(uint8_t(0));
|
||||||
}
|
}
|
||||||
i->Remove();
|
i->UnlinkAndNOP();
|
||||||
result = true;
|
result = true;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
@ -391,7 +391,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
|
||||||
if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
|
if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
|
||||||
bool value = i->src1.value->IsConstantEQ(i->src2.value);
|
bool value = i->src1.value->IsConstantEQ(i->src2.value);
|
||||||
i->dest->set_constant(uint8_t(value));
|
i->dest->set_constant(uint8_t(value));
|
||||||
i->Remove();
|
i->UnlinkAndNOP();
|
||||||
result = true;
|
result = true;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
@ -399,7 +399,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
|
||||||
if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
|
if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
|
||||||
bool value = i->src1.value->IsConstantNE(i->src2.value);
|
bool value = i->src1.value->IsConstantNE(i->src2.value);
|
||||||
i->dest->set_constant(uint8_t(value));
|
i->dest->set_constant(uint8_t(value));
|
||||||
i->Remove();
|
i->UnlinkAndNOP();
|
||||||
result = true;
|
result = true;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
@ -407,7 +407,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
|
||||||
if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
|
if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
|
||||||
bool value = i->src1.value->IsConstantSLT(i->src2.value);
|
bool value = i->src1.value->IsConstantSLT(i->src2.value);
|
||||||
i->dest->set_constant(uint8_t(value));
|
i->dest->set_constant(uint8_t(value));
|
||||||
i->Remove();
|
i->UnlinkAndNOP();
|
||||||
result = true;
|
result = true;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
@ -415,7 +415,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
|
||||||
if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
|
if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
|
||||||
bool value = i->src1.value->IsConstantSLE(i->src2.value);
|
bool value = i->src1.value->IsConstantSLE(i->src2.value);
|
||||||
i->dest->set_constant(uint8_t(value));
|
i->dest->set_constant(uint8_t(value));
|
||||||
i->Remove();
|
i->UnlinkAndNOP();
|
||||||
result = true;
|
result = true;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
@ -423,7 +423,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
|
||||||
if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
|
if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
|
||||||
bool value = i->src1.value->IsConstantSGT(i->src2.value);
|
bool value = i->src1.value->IsConstantSGT(i->src2.value);
|
||||||
i->dest->set_constant(uint8_t(value));
|
i->dest->set_constant(uint8_t(value));
|
||||||
i->Remove();
|
i->UnlinkAndNOP();
|
||||||
result = true;
|
result = true;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
@ -431,7 +431,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
|
||||||
if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
|
if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
|
||||||
bool value = i->src1.value->IsConstantSGE(i->src2.value);
|
bool value = i->src1.value->IsConstantSGE(i->src2.value);
|
||||||
i->dest->set_constant(uint8_t(value));
|
i->dest->set_constant(uint8_t(value));
|
||||||
i->Remove();
|
i->UnlinkAndNOP();
|
||||||
result = true;
|
result = true;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
@ -439,7 +439,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
|
||||||
if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
|
if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
|
||||||
bool value = i->src1.value->IsConstantULT(i->src2.value);
|
bool value = i->src1.value->IsConstantULT(i->src2.value);
|
||||||
i->dest->set_constant(uint8_t(value));
|
i->dest->set_constant(uint8_t(value));
|
||||||
i->Remove();
|
i->UnlinkAndNOP();
|
||||||
result = true;
|
result = true;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
@ -447,7 +447,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
|
||||||
if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
|
if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
|
||||||
bool value = i->src1.value->IsConstantULE(i->src2.value);
|
bool value = i->src1.value->IsConstantULE(i->src2.value);
|
||||||
i->dest->set_constant(uint8_t(value));
|
i->dest->set_constant(uint8_t(value));
|
||||||
i->Remove();
|
i->UnlinkAndNOP();
|
||||||
result = true;
|
result = true;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
@ -455,7 +455,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
|
||||||
if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
|
if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
|
||||||
bool value = i->src1.value->IsConstantUGT(i->src2.value);
|
bool value = i->src1.value->IsConstantUGT(i->src2.value);
|
||||||
i->dest->set_constant(uint8_t(value));
|
i->dest->set_constant(uint8_t(value));
|
||||||
i->Remove();
|
i->UnlinkAndNOP();
|
||||||
result = true;
|
result = true;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
@ -463,7 +463,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
|
||||||
if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
|
if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
|
||||||
bool value = i->src1.value->IsConstantUGE(i->src2.value);
|
bool value = i->src1.value->IsConstantUGE(i->src2.value);
|
||||||
i->dest->set_constant(uint8_t(value));
|
i->dest->set_constant(uint8_t(value));
|
||||||
i->Remove();
|
i->UnlinkAndNOP();
|
||||||
result = true;
|
result = true;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
@ -477,7 +477,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
|
||||||
!should_skip_because_of_float) {
|
!should_skip_because_of_float) {
|
||||||
v->set_from(i->src1.value);
|
v->set_from(i->src1.value);
|
||||||
v->Add(i->src2.value);
|
v->Add(i->src2.value);
|
||||||
i->Remove();
|
i->UnlinkAndNOP();
|
||||||
result = true;
|
result = true;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
@ -489,7 +489,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
|
||||||
TypeName target_type = v->type;
|
TypeName target_type = v->type;
|
||||||
v->set_from(ca);
|
v->set_from(ca);
|
||||||
v->ZeroExtend(target_type);
|
v->ZeroExtend(target_type);
|
||||||
i->Remove();
|
i->UnlinkAndNOP();
|
||||||
} else {
|
} else {
|
||||||
if (i->dest->type == ca->type) {
|
if (i->dest->type == ca->type) {
|
||||||
i->Replace(&OPCODE_ASSIGN_info, 0);
|
i->Replace(&OPCODE_ASSIGN_info, 0);
|
||||||
|
@ -507,7 +507,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
|
||||||
!should_skip_because_of_float) {
|
!should_skip_because_of_float) {
|
||||||
v->set_from(i->src1.value);
|
v->set_from(i->src1.value);
|
||||||
v->Sub(i->src2.value);
|
v->Sub(i->src2.value);
|
||||||
i->Remove();
|
i->UnlinkAndNOP();
|
||||||
result = true;
|
result = true;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
@ -516,7 +516,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
|
||||||
if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
|
if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
|
||||||
v->set_from(i->src1.value);
|
v->set_from(i->src1.value);
|
||||||
v->Mul(i->src2.value);
|
v->Mul(i->src2.value);
|
||||||
i->Remove();
|
i->UnlinkAndNOP();
|
||||||
result = true;
|
result = true;
|
||||||
} else if (i->src1.value->IsConstant() ||
|
} else if (i->src1.value->IsConstant() ||
|
||||||
i->src2.value->IsConstant()) {
|
i->src2.value->IsConstant()) {
|
||||||
|
@ -548,7 +548,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
|
||||||
if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
|
if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
|
||||||
v->set_from(i->src1.value);
|
v->set_from(i->src1.value);
|
||||||
v->MulHi(i->src2.value, (i->flags & ARITHMETIC_UNSIGNED) != 0);
|
v->MulHi(i->src2.value, (i->flags & ARITHMETIC_UNSIGNED) != 0);
|
||||||
i->Remove();
|
i->UnlinkAndNOP();
|
||||||
result = true;
|
result = true;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
@ -557,13 +557,13 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
|
||||||
if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
|
if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
|
||||||
v->set_from(i->src1.value);
|
v->set_from(i->src1.value);
|
||||||
v->Div(i->src2.value, (i->flags & ARITHMETIC_UNSIGNED) != 0);
|
v->Div(i->src2.value, (i->flags & ARITHMETIC_UNSIGNED) != 0);
|
||||||
i->Remove();
|
i->UnlinkAndNOP();
|
||||||
result = true;
|
result = true;
|
||||||
} else if (!i->src2.value->MaybeFloaty() &&
|
} else if (!i->src2.value->MaybeFloaty() &&
|
||||||
i->src2.value->IsConstantZero()) {
|
i->src2.value->IsConstantZero()) {
|
||||||
// division by 0 == 0 every time,
|
// division by 0 == 0 every time,
|
||||||
v->set_zero(i->src2.value->type);
|
v->set_zero(i->src2.value->type);
|
||||||
i->Remove();
|
i->UnlinkAndNOP();
|
||||||
result = true;
|
result = true;
|
||||||
} else if (i->src2.value->IsConstant()) {
|
} else if (i->src2.value->IsConstant()) {
|
||||||
// Division by one = no-op.
|
// Division by one = no-op.
|
||||||
|
@ -592,7 +592,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
|
||||||
}
|
}
|
||||||
v->set_from(i->src1.value);
|
v->set_from(i->src1.value);
|
||||||
v->Max(i->src2.value);
|
v->Max(i->src2.value);
|
||||||
i->Remove();
|
i->UnlinkAndNOP();
|
||||||
result = true;
|
result = true;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
@ -600,7 +600,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
|
||||||
if (i->src1.value->IsConstant()) {
|
if (i->src1.value->IsConstant()) {
|
||||||
v->set_from(i->src1.value);
|
v->set_from(i->src1.value);
|
||||||
v->Neg();
|
v->Neg();
|
||||||
i->Remove();
|
i->UnlinkAndNOP();
|
||||||
result = true;
|
result = true;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
@ -608,7 +608,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
|
||||||
if (i->src1.value->IsConstant()) {
|
if (i->src1.value->IsConstant()) {
|
||||||
v->set_from(i->src1.value);
|
v->set_from(i->src1.value);
|
||||||
v->Abs();
|
v->Abs();
|
||||||
i->Remove();
|
i->UnlinkAndNOP();
|
||||||
result = true;
|
result = true;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
@ -616,7 +616,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
|
||||||
if (i->src1.value->IsConstant()) {
|
if (i->src1.value->IsConstant()) {
|
||||||
v->set_from(i->src1.value);
|
v->set_from(i->src1.value);
|
||||||
v->Sqrt();
|
v->Sqrt();
|
||||||
i->Remove();
|
i->UnlinkAndNOP();
|
||||||
result = true;
|
result = true;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
@ -624,7 +624,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
|
||||||
if (i->src1.value->IsConstant()) {
|
if (i->src1.value->IsConstant()) {
|
||||||
v->set_from(i->src1.value);
|
v->set_from(i->src1.value);
|
||||||
v->RSqrt();
|
v->RSqrt();
|
||||||
i->Remove();
|
i->UnlinkAndNOP();
|
||||||
result = true;
|
result = true;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
@ -632,7 +632,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
|
||||||
if (i->src1.value->IsConstant()) {
|
if (i->src1.value->IsConstant()) {
|
||||||
v->set_from(i->src1.value);
|
v->set_from(i->src1.value);
|
||||||
v->Recip();
|
v->Recip();
|
||||||
i->Remove();
|
i->UnlinkAndNOP();
|
||||||
result = true;
|
result = true;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
@ -640,7 +640,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
|
||||||
if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
|
if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
|
||||||
v->set_from(i->src1.value);
|
v->set_from(i->src1.value);
|
||||||
v->And(i->src2.value);
|
v->And(i->src2.value);
|
||||||
i->Remove();
|
i->UnlinkAndNOP();
|
||||||
result = true;
|
result = true;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
@ -648,7 +648,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
|
||||||
if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
|
if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
|
||||||
v->set_from(i->src1.value);
|
v->set_from(i->src1.value);
|
||||||
v->AndNot(i->src2.value);
|
v->AndNot(i->src2.value);
|
||||||
i->Remove();
|
i->UnlinkAndNOP();
|
||||||
result = true;
|
result = true;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
@ -656,7 +656,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
|
||||||
if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
|
if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
|
||||||
v->set_from(i->src1.value);
|
v->set_from(i->src1.value);
|
||||||
v->Or(i->src2.value);
|
v->Or(i->src2.value);
|
||||||
i->Remove();
|
i->UnlinkAndNOP();
|
||||||
result = true;
|
result = true;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
@ -664,13 +664,13 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
|
||||||
if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
|
if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
|
||||||
v->set_from(i->src1.value);
|
v->set_from(i->src1.value);
|
||||||
v->Xor(i->src2.value);
|
v->Xor(i->src2.value);
|
||||||
i->Remove();
|
i->UnlinkAndNOP();
|
||||||
result = true;
|
result = true;
|
||||||
} else if (!i->src1.value->IsConstant() &&
|
} else if (!i->src1.value->IsConstant() &&
|
||||||
!i->src2.value->IsConstant() &&
|
!i->src2.value->IsConstant() &&
|
||||||
i->src1.value == i->src2.value) {
|
i->src1.value == i->src2.value) {
|
||||||
v->set_zero(v->type);
|
v->set_zero(v->type);
|
||||||
i->Remove();
|
i->UnlinkAndNOP();
|
||||||
result = true;
|
result = true;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
@ -678,7 +678,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
|
||||||
if (i->src1.value->IsConstant()) {
|
if (i->src1.value->IsConstant()) {
|
||||||
v->set_from(i->src1.value);
|
v->set_from(i->src1.value);
|
||||||
v->Not();
|
v->Not();
|
||||||
i->Remove();
|
i->UnlinkAndNOP();
|
||||||
result = true;
|
result = true;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
@ -687,7 +687,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
|
||||||
if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
|
if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
|
||||||
v->set_from(i->src1.value);
|
v->set_from(i->src1.value);
|
||||||
v->Shl(i->src2.value);
|
v->Shl(i->src2.value);
|
||||||
i->Remove();
|
i->UnlinkAndNOP();
|
||||||
result = true;
|
result = true;
|
||||||
} else if (i->src2.value->IsConstantZero()) {
|
} else if (i->src2.value->IsConstantZero()) {
|
||||||
auto src1 = i->src1.value;
|
auto src1 = i->src1.value;
|
||||||
|
@ -702,7 +702,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
|
||||||
if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
|
if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
|
||||||
v->set_from(i->src1.value);
|
v->set_from(i->src1.value);
|
||||||
v->Shr(i->src2.value);
|
v->Shr(i->src2.value);
|
||||||
i->Remove();
|
i->UnlinkAndNOP();
|
||||||
result = true;
|
result = true;
|
||||||
} else if (i->src2.value->IsConstantZero()) {
|
} else if (i->src2.value->IsConstantZero()) {
|
||||||
auto src1 = i->src1.value;
|
auto src1 = i->src1.value;
|
||||||
|
@ -716,7 +716,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
|
||||||
if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
|
if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
|
||||||
v->set_from(i->src1.value);
|
v->set_from(i->src1.value);
|
||||||
v->Sha(i->src2.value);
|
v->Sha(i->src2.value);
|
||||||
i->Remove();
|
i->UnlinkAndNOP();
|
||||||
result = true;
|
result = true;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
@ -724,7 +724,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
|
||||||
if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
|
if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
|
||||||
v->set_from(i->src1.value);
|
v->set_from(i->src1.value);
|
||||||
v->RotateLeft(i->src2.value);
|
v->RotateLeft(i->src2.value);
|
||||||
i->Remove();
|
i->UnlinkAndNOP();
|
||||||
result = true;
|
result = true;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
@ -732,7 +732,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
|
||||||
if (i->src1.value->IsConstant()) {
|
if (i->src1.value->IsConstant()) {
|
||||||
v->set_from(i->src1.value);
|
v->set_from(i->src1.value);
|
||||||
v->ByteSwap();
|
v->ByteSwap();
|
||||||
i->Remove();
|
i->UnlinkAndNOP();
|
||||||
result = true;
|
result = true;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
@ -740,7 +740,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
|
||||||
if (i->src1.value->IsConstant()) {
|
if (i->src1.value->IsConstant()) {
|
||||||
v->set_zero(v->type);
|
v->set_zero(v->type);
|
||||||
v->CountLeadingZeros(i->src1.value);
|
v->CountLeadingZeros(i->src1.value);
|
||||||
i->Remove();
|
i->UnlinkAndNOP();
|
||||||
result = true;
|
result = true;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
@ -751,7 +751,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
|
||||||
(i->flags == INT8_TYPE || i->flags == INT16_TYPE)) {
|
(i->flags == INT8_TYPE || i->flags == INT16_TYPE)) {
|
||||||
v->set_from(i->src1.value);
|
v->set_from(i->src1.value);
|
||||||
v->Permute(i->src2.value, i->src3.value, (TypeName)i->flags);
|
v->Permute(i->src2.value, i->src3.value, (TypeName)i->flags);
|
||||||
i->Remove();
|
i->UnlinkAndNOP();
|
||||||
result = true;
|
result = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -765,7 +765,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
|
||||||
*/
|
*/
|
||||||
|
|
||||||
v->set_zero(VEC128_TYPE);
|
v->set_zero(VEC128_TYPE);
|
||||||
i->Remove();
|
i->UnlinkAndNOP();
|
||||||
result = true;
|
result = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -777,7 +777,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
|
||||||
i->src3.value->IsConstant()) {
|
i->src3.value->IsConstant()) {
|
||||||
v->set_from(i->src1.value);
|
v->set_from(i->src1.value);
|
||||||
v->Insert(i->src2.value, i->src3.value, (TypeName)i->flags);
|
v->Insert(i->src2.value, i->src3.value, (TypeName)i->flags);
|
||||||
i->Remove();
|
i->UnlinkAndNOP();
|
||||||
result = true;
|
result = true;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
@ -785,7 +785,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
|
||||||
if (i->src1.value->IsConstant()) {
|
if (i->src1.value->IsConstant()) {
|
||||||
v->set_from(i->src1.value);
|
v->set_from(i->src1.value);
|
||||||
v->Swizzle((uint32_t)i->src2.offset, (TypeName)i->flags);
|
v->Swizzle((uint32_t)i->src2.offset, (TypeName)i->flags);
|
||||||
i->Remove();
|
i->UnlinkAndNOP();
|
||||||
result = true;
|
result = true;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
@ -793,7 +793,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
|
||||||
if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
|
if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
|
||||||
v->set_zero(v->type);
|
v->set_zero(v->type);
|
||||||
v->Extract(i->src1.value, i->src2.value);
|
v->Extract(i->src1.value, i->src2.value);
|
||||||
i->Remove();
|
i->UnlinkAndNOP();
|
||||||
result = true;
|
result = true;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
@ -801,7 +801,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
|
||||||
if (i->src1.value->IsConstant()) {
|
if (i->src1.value->IsConstant()) {
|
||||||
v->set_zero(v->type);
|
v->set_zero(v->type);
|
||||||
v->Splat(i->src1.value);
|
v->Splat(i->src1.value);
|
||||||
i->Remove();
|
i->UnlinkAndNOP();
|
||||||
result = true;
|
result = true;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
@ -809,7 +809,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
|
||||||
if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
|
if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
|
||||||
v->set_from(i->src1.value);
|
v->set_from(i->src1.value);
|
||||||
v->VectorCompareEQ(i->src2.value, hir::TypeName(i->flags));
|
v->VectorCompareEQ(i->src2.value, hir::TypeName(i->flags));
|
||||||
i->Remove();
|
i->UnlinkAndNOP();
|
||||||
result = true;
|
result = true;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
@ -817,7 +817,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
|
||||||
if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
|
if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
|
||||||
v->set_from(i->src1.value);
|
v->set_from(i->src1.value);
|
||||||
v->VectorCompareSGT(i->src2.value, hir::TypeName(i->flags));
|
v->VectorCompareSGT(i->src2.value, hir::TypeName(i->flags));
|
||||||
i->Remove();
|
i->UnlinkAndNOP();
|
||||||
result = true;
|
result = true;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
@ -825,7 +825,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
|
||||||
if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
|
if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
|
||||||
v->set_from(i->src1.value);
|
v->set_from(i->src1.value);
|
||||||
v->VectorCompareSGE(i->src2.value, hir::TypeName(i->flags));
|
v->VectorCompareSGE(i->src2.value, hir::TypeName(i->flags));
|
||||||
i->Remove();
|
i->UnlinkAndNOP();
|
||||||
result = true;
|
result = true;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
@ -833,7 +833,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
|
||||||
if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
|
if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
|
||||||
v->set_from(i->src1.value);
|
v->set_from(i->src1.value);
|
||||||
v->VectorCompareUGT(i->src2.value, hir::TypeName(i->flags));
|
v->VectorCompareUGT(i->src2.value, hir::TypeName(i->flags));
|
||||||
i->Remove();
|
i->UnlinkAndNOP();
|
||||||
result = true;
|
result = true;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
@ -841,7 +841,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
|
||||||
if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
|
if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
|
||||||
v->set_from(i->src1.value);
|
v->set_from(i->src1.value);
|
||||||
v->VectorCompareUGE(i->src2.value, hir::TypeName(i->flags));
|
v->VectorCompareUGE(i->src2.value, hir::TypeName(i->flags));
|
||||||
i->Remove();
|
i->UnlinkAndNOP();
|
||||||
result = true;
|
result = true;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
@ -850,7 +850,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
|
||||||
v->set_zero(VEC128_TYPE);
|
v->set_zero(VEC128_TYPE);
|
||||||
v->VectorConvertF2I(i->src1.value,
|
v->VectorConvertF2I(i->src1.value,
|
||||||
!!(i->flags & ARITHMETIC_UNSIGNED));
|
!!(i->flags & ARITHMETIC_UNSIGNED));
|
||||||
i->Remove();
|
i->UnlinkAndNOP();
|
||||||
result = true;
|
result = true;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
@ -859,7 +859,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
|
||||||
v->set_zero(VEC128_TYPE);
|
v->set_zero(VEC128_TYPE);
|
||||||
v->VectorConvertI2F(i->src1.value,
|
v->VectorConvertI2F(i->src1.value,
|
||||||
!!(i->flags & ARITHMETIC_UNSIGNED));
|
!!(i->flags & ARITHMETIC_UNSIGNED));
|
||||||
i->Remove();
|
i->UnlinkAndNOP();
|
||||||
result = true;
|
result = true;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
@ -867,7 +867,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
|
||||||
if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
|
if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
|
||||||
v->set_from(i->src1.value);
|
v->set_from(i->src1.value);
|
||||||
v->VectorShl(i->src2.value, hir::TypeName(i->flags));
|
v->VectorShl(i->src2.value, hir::TypeName(i->flags));
|
||||||
i->Remove();
|
i->UnlinkAndNOP();
|
||||||
result = true;
|
result = true;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
@ -875,7 +875,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
|
||||||
if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
|
if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
|
||||||
v->set_from(i->src1.value);
|
v->set_from(i->src1.value);
|
||||||
v->VectorShr(i->src2.value, hir::TypeName(i->flags));
|
v->VectorShr(i->src2.value, hir::TypeName(i->flags));
|
||||||
i->Remove();
|
i->UnlinkAndNOP();
|
||||||
result = true;
|
result = true;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
@ -883,7 +883,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
|
||||||
if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
|
if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
|
||||||
v->set_from(i->src1.value);
|
v->set_from(i->src1.value);
|
||||||
v->VectorRol(i->src2.value, hir::TypeName(i->flags));
|
v->VectorRol(i->src2.value, hir::TypeName(i->flags));
|
||||||
i->Remove();
|
i->UnlinkAndNOP();
|
||||||
result = true;
|
result = true;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
@ -894,7 +894,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
|
||||||
v->VectorAdd(i->src2.value, hir::TypeName(i->flags & 0xFF),
|
v->VectorAdd(i->src2.value, hir::TypeName(i->flags & 0xFF),
|
||||||
!!(arith_flags & ARITHMETIC_UNSIGNED),
|
!!(arith_flags & ARITHMETIC_UNSIGNED),
|
||||||
!!(arith_flags & ARITHMETIC_SATURATE));
|
!!(arith_flags & ARITHMETIC_SATURATE));
|
||||||
i->Remove();
|
i->UnlinkAndNOP();
|
||||||
result = true;
|
result = true;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
@ -905,7 +905,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
|
||||||
v->VectorSub(i->src2.value, hir::TypeName(i->flags & 0xFF),
|
v->VectorSub(i->src2.value, hir::TypeName(i->flags & 0xFF),
|
||||||
!!(arith_flags & ARITHMETIC_UNSIGNED),
|
!!(arith_flags & ARITHMETIC_UNSIGNED),
|
||||||
!!(arith_flags & ARITHMETIC_SATURATE));
|
!!(arith_flags & ARITHMETIC_SATURATE));
|
||||||
i->Remove();
|
i->UnlinkAndNOP();
|
||||||
result = true;
|
result = true;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
@ -917,7 +917,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
|
||||||
v->VectorAverage(i->src2.value, hir::TypeName(i->flags & 0xFF),
|
v->VectorAverage(i->src2.value, hir::TypeName(i->flags & 0xFF),
|
||||||
!!(arith_flags & ARITHMETIC_UNSIGNED),
|
!!(arith_flags & ARITHMETIC_UNSIGNED),
|
||||||
!!(arith_flags & ARITHMETIC_SATURATE));
|
!!(arith_flags & ARITHMETIC_SATURATE));
|
||||||
i->Remove();
|
i->UnlinkAndNOP();
|
||||||
result = true;
|
result = true;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
@ -926,7 +926,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
|
||||||
if (i->src1.value->IsConstant()) {
|
if (i->src1.value->IsConstant()) {
|
||||||
v->set_from(i->src1.value);
|
v->set_from(i->src1.value);
|
||||||
v->DenormalFlush();
|
v->DenormalFlush();
|
||||||
i->Remove();
|
i->UnlinkAndNOP();
|
||||||
result = true;
|
result = true;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
|
|
@ -146,7 +146,7 @@ void ContextPromotionPass::RemoveDeadStoresBlock(Block* block) {
|
||||||
validity.set(static_cast<uint32_t>(offset));
|
validity.set(static_cast<uint32_t>(offset));
|
||||||
} else {
|
} else {
|
||||||
// Already written to. Remove this store.
|
// Already written to. Remove this store.
|
||||||
i->Remove();
|
i->UnlinkAndNOP();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
i = prev;
|
i = prev;
|
||||||
|
|
|
@ -120,7 +120,8 @@ bool DeadCodeEliminationPass::Run(HIRBuilder* builder) {
|
||||||
Instr* next = i->next;
|
Instr* next = i->next;
|
||||||
if (i->opcode == &OPCODE_NOP_info) {
|
if (i->opcode == &OPCODE_NOP_info) {
|
||||||
// Nop - remove!
|
// Nop - remove!
|
||||||
i->Remove();
|
i->UnlinkAndNOP();
|
||||||
|
i->Deallocate();
|
||||||
}
|
}
|
||||||
i = next;
|
i = next;
|
||||||
}
|
}
|
||||||
|
@ -148,7 +149,9 @@ bool DeadCodeEliminationPass::Run(HIRBuilder* builder) {
|
||||||
|
|
||||||
void DeadCodeEliminationPass::MakeNopRecursive(Instr* i) {
|
void DeadCodeEliminationPass::MakeNopRecursive(Instr* i) {
|
||||||
i->opcode = &hir::OPCODE_NOP_info;
|
i->opcode = &hir::OPCODE_NOP_info;
|
||||||
|
if (i->dest) {
|
||||||
i->dest->def = NULL;
|
i->dest->def = NULL;
|
||||||
|
}
|
||||||
i->dest = NULL;
|
i->dest = NULL;
|
||||||
|
|
||||||
#define MAKE_NOP_SRC(n) \
|
#define MAKE_NOP_SRC(n) \
|
||||||
|
@ -163,7 +166,9 @@ void DeadCodeEliminationPass::MakeNopRecursive(Instr* i) {
|
||||||
if (value->def && value->def != i) { \
|
if (value->def && value->def != i) { \
|
||||||
MakeNopRecursive(value->def); \
|
MakeNopRecursive(value->def); \
|
||||||
} \
|
} \
|
||||||
|
HIRBuilder::GetCurrent()->DeallocateValue(value); \
|
||||||
} \
|
} \
|
||||||
|
HIRBuilder::GetCurrent()->DeallocateUse(use); \
|
||||||
}
|
}
|
||||||
MAKE_NOP_SRC(1);
|
MAKE_NOP_SRC(1);
|
||||||
MAKE_NOP_SRC(2);
|
MAKE_NOP_SRC(2);
|
||||||
|
@ -189,7 +194,8 @@ void DeadCodeEliminationPass::ReplaceAssignment(Instr* i) {
|
||||||
use = use->next;
|
use = use->next;
|
||||||
}
|
}
|
||||||
|
|
||||||
i->Remove();
|
i->UnlinkAndNOP();
|
||||||
|
i->Deallocate();
|
||||||
}
|
}
|
||||||
|
|
||||||
bool DeadCodeEliminationPass::CheckLocalUse(Instr* i) {
|
bool DeadCodeEliminationPass::CheckLocalUse(Instr* i) {
|
||||||
|
@ -204,11 +210,11 @@ bool DeadCodeEliminationPass::CheckLocalUse(Instr* i) {
|
||||||
}
|
}
|
||||||
|
|
||||||
// Load/store are paired. They can both be removed.
|
// Load/store are paired. They can both be removed.
|
||||||
use_instr->Remove();
|
use_instr->UnlinkAndNOP();
|
||||||
}
|
}
|
||||||
|
|
||||||
i->Remove();
|
i->UnlinkAndNOP();
|
||||||
|
i->Deallocate();
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -61,7 +61,7 @@ bool FinalizationPass::Run(HIRBuilder* builder) {
|
||||||
auto target = tail->src1.label;
|
auto target = tail->src1.label;
|
||||||
if (target->block == block->next) {
|
if (target->block == block->next) {
|
||||||
// Jumping to subsequent block. Remove.
|
// Jumping to subsequent block. Remove.
|
||||||
tail->Remove();
|
tail->UnlinkAndNOP();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -46,15 +46,27 @@ namespace hir {
|
||||||
(value->type) == FLOAT64_TYPE || (value->type) == VEC128_TYPE)
|
(value->type) == FLOAT64_TYPE || (value->type) == VEC128_TYPE)
|
||||||
#define ASSERT_TYPES_EQUAL(value1, value2) \
|
#define ASSERT_TYPES_EQUAL(value1, value2) \
|
||||||
assert_true((value1->type) == (value2->type))
|
assert_true((value1->type) == (value2->type))
|
||||||
|
thread_local HIRBuilder* thrd_current_hirfunction = nullptr;
|
||||||
HIRBuilder::HIRBuilder() {
|
HIRBuilder::HIRBuilder() {
|
||||||
arena_ = new Arena();
|
arena_ = new Arena();
|
||||||
Reset();
|
Reset();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
HIRBuilder* HIRBuilder::GetCurrent() { return thrd_current_hirfunction; }
|
||||||
|
|
||||||
|
void HIRBuilder::MakeCurrent() { thrd_current_hirfunction = this; }
|
||||||
|
void HIRBuilder::RemoveCurrent() {
|
||||||
|
if (thrd_current_hirfunction == this) {
|
||||||
|
thrd_current_hirfunction = nullptr;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
HIRBuilder::~HIRBuilder() {
|
HIRBuilder::~HIRBuilder() {
|
||||||
Reset();
|
Reset();
|
||||||
delete arena_;
|
delete arena_;
|
||||||
|
if (thrd_current_hirfunction == this) {
|
||||||
|
thrd_current_hirfunction = nullptr;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void HIRBuilder::Reset() {
|
void HIRBuilder::Reset() {
|
||||||
|
@ -105,7 +117,37 @@ bool HIRBuilder::Finalize() {
|
||||||
}
|
}
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
Instr* HIRBuilder::AllocateInstruction() {
|
||||||
|
Instr* result = free_instrs_.NewEntry();
|
||||||
|
if (result) {
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
return arena()->Alloc<Instr>();
|
||||||
|
}
|
||||||
|
|
||||||
|
Value* HIRBuilder::AllocateValue() {
|
||||||
|
Value* result = free_values_.NewEntry();
|
||||||
|
if (result) {
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
return arena()->Alloc<Value>();
|
||||||
|
}
|
||||||
|
Value::Use* HIRBuilder::AllocateUse() {
|
||||||
|
Value::Use* result = free_uses_.NewEntry();
|
||||||
|
if (result) {
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
return arena()->Alloc<Value::Use>();
|
||||||
|
}
|
||||||
|
void HIRBuilder::DeallocateInstruction(Instr* instr) {
|
||||||
|
// free_instrs_.DeleteEntry(instr);
|
||||||
|
}
|
||||||
|
void HIRBuilder::DeallocateValue(Value* value) {
|
||||||
|
// free_values_.DeleteEntry(value);
|
||||||
|
}
|
||||||
|
void HIRBuilder::DeallocateUse(Value::Use* use) {
|
||||||
|
// free_uses_.DeleteEntry(use);
|
||||||
|
}
|
||||||
void HIRBuilder::DumpValue(StringBuffer* str, Value* value) {
|
void HIRBuilder::DumpValue(StringBuffer* str, Value* value) {
|
||||||
if (value->IsConstant()) {
|
if (value->IsConstant()) {
|
||||||
switch (value->type) {
|
switch (value->type) {
|
||||||
|
@ -545,12 +587,12 @@ void HIRBuilder::MergeAdjacentBlocks(Block* left, Block* right) {
|
||||||
auto sig = left->instr_tail->opcode->signature;
|
auto sig = left->instr_tail->opcode->signature;
|
||||||
if (GET_OPCODE_SIG_TYPE_SRC1(sig) == OPCODE_SIG_TYPE_L) {
|
if (GET_OPCODE_SIG_TYPE_SRC1(sig) == OPCODE_SIG_TYPE_L) {
|
||||||
if (left->instr_tail->src1.label->block == right) {
|
if (left->instr_tail->src1.label->block == right) {
|
||||||
left->instr_tail->Remove();
|
left->instr_tail->UnlinkAndNOP();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (GET_OPCODE_SIG_TYPE_SRC2(sig) == OPCODE_SIG_TYPE_L) {
|
if (GET_OPCODE_SIG_TYPE_SRC2(sig) == OPCODE_SIG_TYPE_L) {
|
||||||
if (left->instr_tail->src2.label->block == right) {
|
if (left->instr_tail->src2.label->block == right) {
|
||||||
left->instr_tail->Remove();
|
left->instr_tail->UnlinkAndNOP();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -678,7 +720,7 @@ Instr* HIRBuilder::AppendInstr(const OpcodeInfo& opcode_info, uint16_t flags,
|
||||||
}
|
}
|
||||||
Block* block = current_block_;
|
Block* block = current_block_;
|
||||||
|
|
||||||
Instr* instr = arena_->Alloc<Instr>();
|
Instr* instr = AllocateInstruction();
|
||||||
instr->next = NULL;
|
instr->next = NULL;
|
||||||
instr->prev = block->instr_tail;
|
instr->prev = block->instr_tail;
|
||||||
if (block->instr_tail) {
|
if (block->instr_tail) {
|
||||||
|
@ -705,7 +747,7 @@ Instr* HIRBuilder::AppendInstr(const OpcodeInfo& opcode_info, uint16_t flags,
|
||||||
}
|
}
|
||||||
|
|
||||||
Value* HIRBuilder::AllocValue(TypeName type) {
|
Value* HIRBuilder::AllocValue(TypeName type) {
|
||||||
Value* value = arena_->Alloc<Value>();
|
Value* value = AllocateValue();
|
||||||
value->ordinal = next_value_ordinal_++;
|
value->ordinal = next_value_ordinal_++;
|
||||||
value->type = type;
|
value->type = type;
|
||||||
value->flags = 0;
|
value->flags = 0;
|
||||||
|
@ -719,7 +761,7 @@ Value* HIRBuilder::AllocValue(TypeName type) {
|
||||||
}
|
}
|
||||||
|
|
||||||
Value* HIRBuilder::CloneValue(Value* source) {
|
Value* HIRBuilder::CloneValue(Value* source) {
|
||||||
Value* value = arena_->Alloc<Value>();
|
Value* value = AllocateValue();
|
||||||
value->ordinal = next_value_ordinal_++;
|
value->ordinal = next_value_ordinal_++;
|
||||||
value->type = source->type;
|
value->type = source->type;
|
||||||
value->flags = source->flags;
|
value->flags = source->flags;
|
||||||
|
@ -1295,6 +1337,9 @@ void HIRBuilder::CacheControl(Value* address, size_t cache_line_size,
|
||||||
|
|
||||||
void HIRBuilder::MemoryBarrier() { AppendInstr(OPCODE_MEMORY_BARRIER_info, 0); }
|
void HIRBuilder::MemoryBarrier() { AppendInstr(OPCODE_MEMORY_BARRIER_info, 0); }
|
||||||
|
|
||||||
|
void HIRBuilder::DelayExecution() {
|
||||||
|
AppendInstr(OPCODE_DELAY_EXECUTION_info, 0);
|
||||||
|
}
|
||||||
void HIRBuilder::SetRoundingMode(Value* value) {
|
void HIRBuilder::SetRoundingMode(Value* value) {
|
||||||
ASSERT_INTEGER_TYPE(value);
|
ASSERT_INTEGER_TYPE(value);
|
||||||
Instr* i = AppendInstr(OPCODE_SET_ROUNDING_MODE_info, 0);
|
Instr* i = AppendInstr(OPCODE_SET_ROUNDING_MODE_info, 0);
|
||||||
|
|
|
@ -15,6 +15,8 @@
|
||||||
#include "third_party/fmt/include/fmt/format.h"
|
#include "third_party/fmt/include/fmt/format.h"
|
||||||
#include "xenia/base/arena.h"
|
#include "xenia/base/arena.h"
|
||||||
#include "xenia/base/string_buffer.h"
|
#include "xenia/base/string_buffer.h"
|
||||||
|
|
||||||
|
#include "xenia/base/simple_freelist.h"
|
||||||
#include "xenia/cpu/hir/block.h"
|
#include "xenia/cpu/hir/block.h"
|
||||||
#include "xenia/cpu/hir/instr.h"
|
#include "xenia/cpu/hir/instr.h"
|
||||||
#include "xenia/cpu/hir/label.h"
|
#include "xenia/cpu/hir/label.h"
|
||||||
|
@ -31,11 +33,20 @@ enum FunctionAttributes {
|
||||||
};
|
};
|
||||||
|
|
||||||
class HIRBuilder {
|
class HIRBuilder {
|
||||||
|
SimpleFreelist<Instr> free_instrs_;
|
||||||
|
SimpleFreelist<Value> free_values_;
|
||||||
|
SimpleFreelist<Value::Use> free_uses_;
|
||||||
|
|
||||||
public:
|
public:
|
||||||
HIRBuilder();
|
HIRBuilder();
|
||||||
virtual ~HIRBuilder();
|
virtual ~HIRBuilder();
|
||||||
|
static HIRBuilder* GetCurrent();
|
||||||
|
|
||||||
|
void MakeCurrent();
|
||||||
|
void RemoveCurrent();
|
||||||
|
|
||||||
virtual void Reset();
|
virtual void Reset();
|
||||||
|
|
||||||
virtual bool Finalize();
|
virtual bool Finalize();
|
||||||
|
|
||||||
void Dump(StringBuffer* str);
|
void Dump(StringBuffer* str);
|
||||||
|
@ -66,6 +77,18 @@ class HIRBuilder {
|
||||||
void RemoveBlock(Block* block);
|
void RemoveBlock(Block* block);
|
||||||
void MergeAdjacentBlocks(Block* left, Block* right);
|
void MergeAdjacentBlocks(Block* left, Block* right);
|
||||||
|
|
||||||
|
Instr* AllocateInstruction();
|
||||||
|
|
||||||
|
Value* AllocateValue();
|
||||||
|
Value::Use* AllocateUse();
|
||||||
|
void DeallocateInstruction(Instr* instr);
|
||||||
|
void DeallocateValue(Value* value);
|
||||||
|
void DeallocateUse(Value::Use* use);
|
||||||
|
// Calls Reset() on each of the three freelists owned by this builder
// (free_instrs_, free_uses_, free_values_).
void ResetPools() {
|
||||||
|
free_instrs_.Reset();
|
||||||
|
free_uses_.Reset();
|
||||||
|
free_values_.Reset();
|
||||||
|
}
|
||||||
// static allocations:
|
// static allocations:
|
||||||
// Value* AllocStatic(size_t length);
|
// Value* AllocStatic(size_t length);
|
||||||
|
|
||||||
|
@ -176,7 +199,7 @@ class HIRBuilder {
|
||||||
void CacheControl(Value* address, size_t cache_line_size,
|
void CacheControl(Value* address, size_t cache_line_size,
|
||||||
CacheControlType type);
|
CacheControlType type);
|
||||||
void MemoryBarrier();
|
void MemoryBarrier();
|
||||||
|
void DelayExecution();
|
||||||
void SetRoundingMode(Value* value);
|
void SetRoundingMode(Value* value);
|
||||||
Value* Max(Value* value1, Value* value2);
|
Value* Max(Value* value1, Value* value2);
|
||||||
Value* VectorMax(Value* value1, Value* value2, TypeName part_type,
|
Value* VectorMax(Value* value1, Value* value2, TypeName part_type,
|
||||||
|
|
|
@ -10,7 +10,7 @@
|
||||||
#include "xenia/cpu/hir/instr.h"
|
#include "xenia/cpu/hir/instr.h"
|
||||||
|
|
||||||
#include "xenia/cpu/hir/block.h"
|
#include "xenia/cpu/hir/block.h"
|
||||||
|
#include "xenia/cpu/hir/hir_builder.h"
|
||||||
namespace xe {
|
namespace xe {
|
||||||
namespace cpu {
|
namespace cpu {
|
||||||
namespace hir {
|
namespace hir {
|
||||||
|
@ -62,21 +62,35 @@ void Instr::Replace(const OpcodeInfo* new_opcode, uint16_t new_flags) {
|
||||||
if (src1_use) {
|
if (src1_use) {
|
||||||
src1.value->RemoveUse(src1_use);
|
src1.value->RemoveUse(src1_use);
|
||||||
src1.value = NULL;
|
src1.value = NULL;
|
||||||
src1_use = NULL;
|
// src1_use = NULL;
|
||||||
}
|
}
|
||||||
if (src2_use) {
|
if (src2_use) {
|
||||||
src2.value->RemoveUse(src2_use);
|
src2.value->RemoveUse(src2_use);
|
||||||
src2.value = NULL;
|
src2.value = NULL;
|
||||||
src2_use = NULL;
|
// src2_use = NULL;
|
||||||
}
|
}
|
||||||
if (src3_use) {
|
if (src3_use) {
|
||||||
src3.value->RemoveUse(src3_use);
|
src3.value->RemoveUse(src3_use);
|
||||||
src3.value = NULL;
|
src3.value = NULL;
|
||||||
src3_use = NULL;
|
// src3_use = NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (src1_use) {
|
||||||
|
HIRBuilder::GetCurrent()->DeallocateUse(src1_use);
|
||||||
|
src1_use = nullptr;
|
||||||
|
}
|
||||||
|
if (src2_use) {
|
||||||
|
HIRBuilder::GetCurrent()->DeallocateUse(src2_use);
|
||||||
|
src2_use = nullptr;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (src3_use) {
|
||||||
|
HIRBuilder::GetCurrent()->DeallocateUse(src3_use);
|
||||||
|
src3_use = nullptr;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void Instr::Remove() {
|
void Instr::UnlinkAndNOP() {
|
||||||
// Remove all srcs/dest.
|
// Remove all srcs/dest.
|
||||||
Replace(&OPCODE_NOP_info, 0);
|
Replace(&OPCODE_NOP_info, 0);
|
||||||
|
|
||||||
|
@ -91,6 +105,10 @@ void Instr::Remove() {
|
||||||
block->instr_tail = prev;
|
block->instr_tail = prev;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Passes this instruction to DeallocateInstruction() on the builder returned
// by HIRBuilder::GetCurrent().
// NOTE(review): GetCurrent() is dereferenced without a null check; presumably
// a builder must have been made current (MakeCurrent) first - confirm.
void Instr::Deallocate() {
|
||||||
|
HIRBuilder::GetCurrent()->DeallocateInstruction(this);
|
||||||
|
}
|
||||||
Instr* Instr::GetDestDefSkipAssigns() {
|
Instr* Instr::GetDestDefSkipAssigns() {
|
||||||
Instr* current_def = this;
|
Instr* current_def = this;
|
||||||
|
|
||||||
|
|
|
@ -78,7 +78,12 @@ class Instr {
|
||||||
|
|
||||||
void MoveBefore(Instr* other);
|
void MoveBefore(Instr* other);
|
||||||
void Replace(const OpcodeInfo* new_opcode, uint16_t new_flags);
|
void Replace(const OpcodeInfo* new_opcode, uint16_t new_flags);
|
||||||
void Remove();
|
void UnlinkAndNOP();
|
||||||
|
// chrispy: I wanted to name this one Remove, but the old Remove has been renamed to UnlinkAndNOP,
|
||||||
|
// so if changes from master that we wanted to port over still used Remove, we would get a lot of issues whose cause would
|
||||||
|
// be difficult to track down.
|
||||||
|
//^todo: rework this comment, im frazzled
|
||||||
|
void Deallocate();
|
||||||
const OpcodeInfo* GetOpcodeInfo() const { return opcode; }
|
const OpcodeInfo* GetOpcodeInfo() const { return opcode; }
|
||||||
// if opcode is null, we have bigger problems
|
// if opcode is null, we have bigger problems
|
||||||
Opcode GetOpcodeNum() const { return GetOpcodeInfo()->num; }
|
Opcode GetOpcodeNum() const { return GetOpcodeInfo()->num; }
|
||||||
|
|
|
@ -292,7 +292,7 @@ enum Opcode {
|
||||||
// as we already have OPCODE_ROUND. round double to float (
|
// as we already have OPCODE_ROUND. round double to float (
|
||||||
// ppc "single" fpu instruction result rounding behavior )
|
// ppc "single" fpu instruction result rounding behavior )
|
||||||
OPCODE_SET_NJM,
|
OPCODE_SET_NJM,
|
||||||
|
OPCODE_DELAY_EXECUTION, //for db16cyc
|
||||||
__OPCODE_MAX_VALUE, // Keep at end.
|
__OPCODE_MAX_VALUE, // Keep at end.
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
|
@ -218,7 +218,7 @@ DEFINE_OPCODE(
|
||||||
"context_barrier",
|
"context_barrier",
|
||||||
OPCODE_SIG_X,
|
OPCODE_SIG_X,
|
||||||
0)
|
0)
|
||||||
|
DEFINE_OPCODE(OPCODE_DELAY_EXECUTION, "delay_execution", OPCODE_SIG_X, 0)
|
||||||
DEFINE_OPCODE(
|
DEFINE_OPCODE(
|
||||||
OPCODE_LOAD_MMIO,
|
OPCODE_LOAD_MMIO,
|
||||||
"load_mmio",
|
"load_mmio",
|
||||||
|
|
|
@ -16,13 +16,13 @@
|
||||||
#include "xenia/base/assert.h"
|
#include "xenia/base/assert.h"
|
||||||
#include "xenia/base/byte_order.h"
|
#include "xenia/base/byte_order.h"
|
||||||
#include "xenia/base/math.h"
|
#include "xenia/base/math.h"
|
||||||
|
#include "xenia/cpu/hir/hir_builder.h"
|
||||||
namespace xe {
|
namespace xe {
|
||||||
namespace cpu {
|
namespace cpu {
|
||||||
namespace hir {
|
namespace hir {
|
||||||
|
|
||||||
Value::Use* Value::AddUse(Arena* arena, Instr* instr) {
|
Value::Use* Value::AddUse(Arena* arena, Instr* instr) {
|
||||||
Use* use = arena->Alloc<Use>();
|
Use* use = HIRBuilder::GetCurrent()->AllocateUse();
|
||||||
use->instr = instr;
|
use->instr = instr;
|
||||||
use->prev = NULL;
|
use->prev = NULL;
|
||||||
use->next = use_head;
|
use->next = use_head;
|
||||||
|
@ -42,6 +42,8 @@ void Value::RemoveUse(Use* use) {
|
||||||
if (use->next) {
|
if (use->next) {
|
||||||
use->next->prev = use->prev;
|
use->next->prev = use->prev;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
//HIRBuilder::GetCurrent()->DeallocateUse(use);
|
||||||
}
|
}
|
||||||
|
|
||||||
uint32_t Value::AsUint32() {
|
uint32_t Value::AsUint32() {
|
||||||
|
|
|
@ -789,8 +789,15 @@ int InstrEmit_norx(PPCHIRBuilder& f, const InstrData& i) {
|
||||||
int InstrEmit_orx(PPCHIRBuilder& f, const InstrData& i) {
|
int InstrEmit_orx(PPCHIRBuilder& f, const InstrData& i) {
|
||||||
// RA <- (RS) | (RB)
|
// RA <- (RS) | (RB)
|
||||||
if (i.X.RT == i.X.RB && i.X.RT == i.X.RA && !i.X.Rc) {
|
if (i.X.RT == i.X.RB && i.X.RT == i.X.RA && !i.X.Rc) {
|
||||||
|
// chrispy: this special encoding of orx is db16cyc, which is heavily used in
|
||||||
|
// spinlocks. Emitting no code for it (a plain nop) wastes a ton of power, so
|
||||||
|
// emit a delay instead.
|
||||||
|
if (i.code == 0x7FFFFB78) {
|
||||||
|
f.DelayExecution();
|
||||||
|
} else {
|
||||||
// Sometimes used as no-op.
|
// Sometimes used as no-op.
|
||||||
f.Nop();
|
f.Nop();
|
||||||
|
}
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
Value* ra;
|
Value* ra;
|
||||||
|
|
|
@ -117,6 +117,7 @@ bool PPCFrontend::DefineFunction(GuestFunction* function,
|
||||||
uint32_t debug_info_flags) {
|
uint32_t debug_info_flags) {
|
||||||
auto translator = translator_pool_.Allocate(this);
|
auto translator = translator_pool_.Allocate(this);
|
||||||
bool result = translator->Translate(function, debug_info_flags);
|
bool result = translator->Translate(function, debug_info_flags);
|
||||||
|
translator->Reset();
|
||||||
translator_pool_.Release(translator);
|
translator_pool_.Release(translator);
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
|
@ -96,10 +96,25 @@ PPCTranslator::PPCTranslator(PPCFrontend* frontend) : frontend_(frontend) {
|
||||||
|
|
||||||
PPCTranslator::~PPCTranslator() = default;
|
PPCTranslator::~PPCTranslator() = default;
|
||||||
|
|
||||||
|
// Scope guard around a PPCHIRBuilder: the constructor calls MakeCurrent() on
// the builder and the destructor calls RemoveCurrent() on it.
class HirBuilderScope {
|
||||||
|
// Builder passed to the constructor; not owned by this object.
PPCHIRBuilder* builder_;
|
||||||
|
|
||||||
|
public:
|
||||||
|
// NOTE(review): builder is dereferenced unconditionally here, while the
|
||||||
|
// destructor null-checks it - confirm callers never pass nullptr.
HirBuilderScope(PPCHIRBuilder* builder) : builder_(builder) {
|
||||||
|
builder_->MakeCurrent();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Calls RemoveCurrent() on the builder when it is non-null.
~HirBuilderScope() {
|
||||||
|
if (builder_) {
|
||||||
|
builder_->RemoveCurrent();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
bool PPCTranslator::Translate(GuestFunction* function,
|
bool PPCTranslator::Translate(GuestFunction* function,
|
||||||
uint32_t debug_info_flags) {
|
uint32_t debug_info_flags) {
|
||||||
SCOPE_profile_cpu_f("cpu");
|
SCOPE_profile_cpu_f("cpu");
|
||||||
|
HirBuilderScope hir_build_scope{builder_.get()};
|
||||||
// Reset() all caching when we leave.
|
// Reset() all caching when we leave.
|
||||||
xe::make_reset_scope(builder_);
|
xe::make_reset_scope(builder_);
|
||||||
xe::make_reset_scope(compiler_);
|
xe::make_reset_scope(compiler_);
|
||||||
|
@ -196,7 +211,7 @@ bool PPCTranslator::Translate(GuestFunction* function,
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
// Forwards to builder_->ResetPools().
void PPCTranslator::Reset() { builder_->ResetPools(); }
|
||||||
void PPCTranslator::DumpSource(GuestFunction* function,
|
void PPCTranslator::DumpSource(GuestFunction* function,
|
||||||
StringBuffer* string_buffer) {
|
StringBuffer* string_buffer) {
|
||||||
Memory* memory = frontend_->memory();
|
Memory* memory = frontend_->memory();
|
||||||
|
|
|
@ -31,7 +31,7 @@ class PPCTranslator {
|
||||||
~PPCTranslator();
|
~PPCTranslator();
|
||||||
|
|
||||||
bool Translate(GuestFunction* function, uint32_t debug_info_flags);
|
bool Translate(GuestFunction* function, uint32_t debug_info_flags);
|
||||||
|
void Reset();
|
||||||
private:
|
private:
|
||||||
void DumpSource(GuestFunction* function, StringBuffer* string_buffer);
|
void DumpSource(GuestFunction* function, StringBuffer* string_buffer);
|
||||||
|
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -19,6 +19,7 @@
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
|
#include "xenia/base/dma.h"
|
||||||
#include "xenia/base/ring_buffer.h"
|
#include "xenia/base/ring_buffer.h"
|
||||||
#include "xenia/base/threading.h"
|
#include "xenia/base/threading.h"
|
||||||
#include "xenia/gpu/register_file.h"
|
#include "xenia/gpu/register_file.h"
|
||||||
|
@ -66,6 +67,11 @@ enum class GammaRampType {
|
||||||
};
|
};
|
||||||
|
|
||||||
class CommandProcessor {
|
class CommandProcessor {
|
||||||
|
protected:
|
||||||
|
RingBuffer
|
||||||
|
reader_; // chrispy: instead of having ringbuffer on stack, have it near
|
||||||
|
// the start of the class so we can access it via a disp8 offset. This
|
||||||
|
// also reduces the number of params we need to pass
|
||||||
public:
|
public:
|
||||||
enum class SwapPostEffect {
|
enum class SwapPostEffect {
|
||||||
kNone,
|
kNone,
|
||||||
|
@ -76,7 +82,7 @@ class CommandProcessor {
|
||||||
CommandProcessor(GraphicsSystem* graphics_system,
|
CommandProcessor(GraphicsSystem* graphics_system,
|
||||||
kernel::KernelState* kernel_state);
|
kernel::KernelState* kernel_state);
|
||||||
virtual ~CommandProcessor();
|
virtual ~CommandProcessor();
|
||||||
|
dma::XeDMAC* GetDMAC() const { return dmac_; }
|
||||||
uint32_t counter() const { return counter_; }
|
uint32_t counter() const { return counter_; }
|
||||||
void increment_counter() { counter_++; }
|
void increment_counter() { counter_++; }
|
||||||
|
|
||||||
|
@ -101,7 +107,7 @@ class CommandProcessor {
|
||||||
// screen right in the beginning of 4D530AA4 is not a resolved render target,
|
// screen right in the beginning of 4D530AA4 is not a resolved render target,
|
||||||
// for instance).
|
// for instance).
|
||||||
virtual void IssueSwap(uint32_t frontbuffer_ptr, uint32_t frontbuffer_width,
|
virtual void IssueSwap(uint32_t frontbuffer_ptr, uint32_t frontbuffer_width,
|
||||||
uint32_t frontbuffer_height) = 0;
|
uint32_t frontbuffer_height) {}
|
||||||
|
|
||||||
// May be called not only from the command processor thread when the command
|
// May be called not only from the command processor thread when the command
|
||||||
// processor is paused, and the termination of this function may be explicitly
|
// processor is paused, and the termination of this function may be explicitly
|
||||||
|
@ -153,7 +159,7 @@ class CommandProcessor {
|
||||||
// rarely needed, most register writes have no special logic here
|
// rarely needed, most register writes have no special logic here
|
||||||
XE_NOINLINE
|
XE_NOINLINE
|
||||||
void HandleSpecialRegisterWrite(uint32_t index, uint32_t value);
|
void HandleSpecialRegisterWrite(uint32_t index, uint32_t value);
|
||||||
XE_FORCEINLINE
|
|
||||||
virtual void WriteRegister(uint32_t index, uint32_t value);
|
virtual void WriteRegister(uint32_t index, uint32_t value);
|
||||||
|
|
||||||
// mem has big-endian register values
|
// mem has big-endian register values
|
||||||
|
@ -165,12 +171,53 @@ class CommandProcessor {
|
||||||
virtual void WriteRegisterRangeFromRing(xe::RingBuffer* ring, uint32_t base,
|
virtual void WriteRegisterRangeFromRing(xe::RingBuffer* ring, uint32_t base,
|
||||||
uint32_t num_registers);
|
uint32_t num_registers);
|
||||||
|
|
||||||
XE_FORCEINLINE
|
XE_NOINLINE
|
||||||
virtual void WriteOneRegisterFromRing(
|
virtual void WriteOneRegisterFromRing(
|
||||||
xe::RingBuffer* ring, uint32_t base,
|
uint32_t base,
|
||||||
uint32_t
|
uint32_t
|
||||||
num_times); // repeatedly write a value to one register, presumably a
|
num_times); // repeatedly write a value to one register, presumably a
|
||||||
// register with special handling for writes
|
// register with special handling for writes
|
||||||
|
|
||||||
|
XE_FORCEINLINE
|
||||||
|
void WriteALURangeFromRing(xe::RingBuffer* ring, uint32_t base,
|
||||||
|
uint32_t num_times);
|
||||||
|
|
||||||
|
XE_FORCEINLINE
|
||||||
|
void WriteFetchRangeFromRing(xe::RingBuffer* ring, uint32_t base,
|
||||||
|
uint32_t num_times);
|
||||||
|
|
||||||
|
XE_FORCEINLINE
|
||||||
|
void WriteBoolRangeFromRing(xe::RingBuffer* ring, uint32_t base,
|
||||||
|
uint32_t num_times);
|
||||||
|
|
||||||
|
XE_FORCEINLINE
|
||||||
|
void WriteLoopRangeFromRing(xe::RingBuffer* ring, uint32_t base,
|
||||||
|
uint32_t num_times);
|
||||||
|
|
||||||
|
XE_FORCEINLINE
|
||||||
|
void WriteREGISTERSRangeFromRing(xe::RingBuffer* ring, uint32_t base,
|
||||||
|
uint32_t num_times);
|
||||||
|
|
||||||
|
XE_FORCEINLINE
|
||||||
|
void WriteALURangeFromMem(uint32_t start_index, uint32_t* base,
|
||||||
|
uint32_t num_registers);
|
||||||
|
|
||||||
|
XE_FORCEINLINE
|
||||||
|
void WriteFetchRangeFromMem(uint32_t start_index, uint32_t* base,
|
||||||
|
uint32_t num_registers);
|
||||||
|
|
||||||
|
XE_FORCEINLINE
|
||||||
|
void WriteBoolRangeFromMem(uint32_t start_index, uint32_t* base,
|
||||||
|
uint32_t num_registers);
|
||||||
|
|
||||||
|
XE_FORCEINLINE
|
||||||
|
void WriteLoopRangeFromMem(uint32_t start_index, uint32_t* base,
|
||||||
|
uint32_t num_registers);
|
||||||
|
|
||||||
|
XE_FORCEINLINE
|
||||||
|
void WriteREGISTERSRangeFromMem(uint32_t start_index, uint32_t* base,
|
||||||
|
uint32_t num_registers);
|
||||||
|
|
||||||
const reg::DC_LUT_30_COLOR* gamma_ramp_256_entry_table() const {
|
const reg::DC_LUT_30_COLOR* gamma_ramp_256_entry_table() const {
|
||||||
return gamma_ramp_256_entry_table_;
|
return gamma_ramp_256_entry_table_;
|
||||||
}
|
}
|
||||||
|
@ -186,75 +233,22 @@ class CommandProcessor {
|
||||||
|
|
||||||
uint32_t ExecutePrimaryBuffer(uint32_t start_index, uint32_t end_index);
|
uint32_t ExecutePrimaryBuffer(uint32_t start_index, uint32_t end_index);
|
||||||
virtual void OnPrimaryBufferEnd() {}
|
virtual void OnPrimaryBufferEnd() {}
|
||||||
void ExecuteIndirectBuffer(uint32_t ptr, uint32_t length);
|
|
||||||
bool ExecutePacket(RingBuffer* reader);
|
|
||||||
bool ExecutePacketType0(RingBuffer* reader, uint32_t packet);
|
|
||||||
bool ExecutePacketType1(RingBuffer* reader, uint32_t packet);
|
|
||||||
bool ExecutePacketType2(RingBuffer* reader, uint32_t packet);
|
|
||||||
bool ExecutePacketType3(RingBuffer* reader, uint32_t packet);
|
|
||||||
bool ExecutePacketType3_ME_INIT(RingBuffer* reader, uint32_t packet,
|
|
||||||
uint32_t count);
|
|
||||||
bool ExecutePacketType3_NOP(RingBuffer* reader, uint32_t packet,
|
|
||||||
uint32_t count);
|
|
||||||
bool ExecutePacketType3_INTERRUPT(RingBuffer* reader, uint32_t packet,
|
|
||||||
uint32_t count);
|
|
||||||
bool ExecutePacketType3_XE_SWAP(RingBuffer* reader, uint32_t packet,
|
|
||||||
uint32_t count);
|
|
||||||
bool ExecutePacketType3_INDIRECT_BUFFER(RingBuffer* reader, uint32_t packet,
|
|
||||||
uint32_t count);
|
|
||||||
bool ExecutePacketType3_WAIT_REG_MEM(RingBuffer* reader, uint32_t packet,
|
|
||||||
uint32_t count);
|
|
||||||
bool ExecutePacketType3_REG_RMW(RingBuffer* reader, uint32_t packet,
|
|
||||||
uint32_t count);
|
|
||||||
bool ExecutePacketType3_REG_TO_MEM(RingBuffer* reader, uint32_t packet,
|
|
||||||
uint32_t count);
|
|
||||||
bool ExecutePacketType3_MEM_WRITE(RingBuffer* reader, uint32_t packet,
|
|
||||||
uint32_t count);
|
|
||||||
bool ExecutePacketType3_COND_WRITE(RingBuffer* reader, uint32_t packet,
|
|
||||||
uint32_t count);
|
|
||||||
bool ExecutePacketType3_EVENT_WRITE(RingBuffer* reader, uint32_t packet,
|
|
||||||
uint32_t count);
|
|
||||||
bool ExecutePacketType3_EVENT_WRITE_SHD(RingBuffer* reader, uint32_t packet,
|
|
||||||
uint32_t count);
|
|
||||||
bool ExecutePacketType3_EVENT_WRITE_EXT(RingBuffer* reader, uint32_t packet,
|
|
||||||
uint32_t count);
|
|
||||||
bool ExecutePacketType3_EVENT_WRITE_ZPD(RingBuffer* reader, uint32_t packet,
|
|
||||||
uint32_t count);
|
|
||||||
bool ExecutePacketType3Draw(RingBuffer* reader, uint32_t packet,
|
|
||||||
const char* opcode_name,
|
|
||||||
uint32_t viz_query_condition,
|
|
||||||
uint32_t count_remaining);
|
|
||||||
bool ExecutePacketType3_DRAW_INDX(RingBuffer* reader, uint32_t packet,
|
|
||||||
uint32_t count);
|
|
||||||
bool ExecutePacketType3_DRAW_INDX_2(RingBuffer* reader, uint32_t packet,
|
|
||||||
uint32_t count);
|
|
||||||
bool ExecutePacketType3_SET_CONSTANT(RingBuffer* reader, uint32_t packet,
|
|
||||||
uint32_t count);
|
|
||||||
bool ExecutePacketType3_SET_CONSTANT2(RingBuffer* reader, uint32_t packet,
|
|
||||||
uint32_t count);
|
|
||||||
bool ExecutePacketType3_LOAD_ALU_CONSTANT(RingBuffer* reader, uint32_t packet,
|
|
||||||
uint32_t count);
|
|
||||||
bool ExecutePacketType3_SET_SHADER_CONSTANTS(RingBuffer* reader,
|
|
||||||
uint32_t packet, uint32_t count);
|
|
||||||
bool ExecutePacketType3_IM_LOAD(RingBuffer* reader, uint32_t packet,
|
|
||||||
uint32_t count);
|
|
||||||
bool ExecutePacketType3_IM_LOAD_IMMEDIATE(RingBuffer* reader,
|
|
||||||
|
|
||||||
uint32_t packet, uint32_t count);
|
#include "pm4_command_processor_declare.h"
|
||||||
bool ExecutePacketType3_INVALIDATE_STATE(RingBuffer* reader, uint32_t packet,
|
|
||||||
uint32_t count);
|
|
||||||
bool ExecutePacketType3_VIZ_QUERY(RingBuffer* reader, uint32_t packet,
|
|
||||||
uint32_t count);
|
|
||||||
|
|
||||||
virtual Shader* LoadShader(xenos::ShaderType shader_type,
|
virtual Shader* LoadShader(xenos::ShaderType shader_type,
|
||||||
uint32_t guest_address,
|
uint32_t guest_address,
|
||||||
const uint32_t* host_address,
|
const uint32_t* host_address,
|
||||||
uint32_t dword_count) = 0;
|
uint32_t dword_count) {
|
||||||
|
return nullptr;
|
||||||
|
}
|
||||||
|
|
||||||
virtual bool IssueDraw(xenos::PrimitiveType prim_type, uint32_t index_count,
|
virtual bool IssueDraw(xenos::PrimitiveType prim_type, uint32_t index_count,
|
||||||
IndexBufferInfo* index_buffer_info,
|
IndexBufferInfo* index_buffer_info,
|
||||||
bool major_mode_explicit) = 0;
|
bool major_mode_explicit) {
|
||||||
virtual bool IssueCopy() = 0;
|
return false;
|
||||||
|
}
|
||||||
|
virtual bool IssueCopy() { return false; }
|
||||||
|
|
||||||
// "Actual" is for the command processor thread, to be read by the
|
// "Actual" is for the command processor thread, to be read by the
|
||||||
// implementations.
|
// implementations.
|
||||||
|
@ -267,7 +261,7 @@ class CommandProcessor {
|
||||||
Memory* memory_ = nullptr;
|
Memory* memory_ = nullptr;
|
||||||
kernel::KernelState* kernel_state_ = nullptr;
|
kernel::KernelState* kernel_state_ = nullptr;
|
||||||
GraphicsSystem* graphics_system_ = nullptr;
|
GraphicsSystem* graphics_system_ = nullptr;
|
||||||
RegisterFile* register_file_ = nullptr;
|
RegisterFile* XE_RESTRICT register_file_ = nullptr;
|
||||||
|
|
||||||
TraceWriter trace_writer_;
|
TraceWriter trace_writer_;
|
||||||
enum class TraceState {
|
enum class TraceState {
|
||||||
|
@ -316,6 +310,7 @@ class CommandProcessor {
|
||||||
reg::DC_LUT_30_COLOR gamma_ramp_256_entry_table_[256] = {};
|
reg::DC_LUT_30_COLOR gamma_ramp_256_entry_table_[256] = {};
|
||||||
reg::DC_LUT_PWL_DATA gamma_ramp_pwl_rgb_[128][3] = {};
|
reg::DC_LUT_PWL_DATA gamma_ramp_pwl_rgb_[128][3] = {};
|
||||||
uint32_t gamma_ramp_rw_component_ = 0;
|
uint32_t gamma_ramp_rw_component_ = 0;
|
||||||
|
dma::XeDMAC* dmac_ = nullptr;
|
||||||
};
|
};
|
||||||
|
|
||||||
} // namespace gpu
|
} // namespace gpu
|
||||||
|
|
|
@ -705,13 +705,34 @@ void D3D12CommandProcessor::SetExternalGraphicsRootSignature(
|
||||||
}
|
}
|
||||||
|
|
||||||
void D3D12CommandProcessor::SetViewport(const D3D12_VIEWPORT& viewport) {
|
void D3D12CommandProcessor::SetViewport(const D3D12_VIEWPORT& viewport) {
|
||||||
|
#if XE_ARCH_AMD64 == 1
|
||||||
|
__m128 zero_register = _mm_setzero_ps();
|
||||||
|
__m128 ff_viewport_low4 = _mm_loadu_ps(&ff_viewport_.TopLeftX);
|
||||||
|
__m128 ff_viewport_high2 =
|
||||||
|
_mm_loadl_pi(zero_register, (const __m64*)&ff_viewport_.MinDepth);
|
||||||
|
|
||||||
|
__m128 viewport_low4 = _mm_loadu_ps(&viewport.TopLeftX);
|
||||||
|
__m128 viewport_high2 =
|
||||||
|
_mm_loadl_pi(zero_register, (const __m64*)&viewport.MinDepth);
|
||||||
|
|
||||||
|
__m128 first_four_cmp = _mm_cmpeq_ps(ff_viewport_low4, viewport_low4);
|
||||||
|
__m128 last_two_cmp = _mm_cmpeq_ps(ff_viewport_high2, viewport_high2);
|
||||||
|
|
||||||
|
__m128 combined_condition = _mm_and_ps(first_four_cmp, last_two_cmp);
|
||||||
|
|
||||||
|
int movmask = _mm_movemask_ps(combined_condition);
|
||||||
|
|
||||||
|
XE_UNLIKELY_IF(ff_viewport_update_needed_ || movmask != 0b1111)
|
||||||
|
#else
|
||||||
ff_viewport_update_needed_ |= ff_viewport_.TopLeftX != viewport.TopLeftX;
|
ff_viewport_update_needed_ |= ff_viewport_.TopLeftX != viewport.TopLeftX;
|
||||||
ff_viewport_update_needed_ |= ff_viewport_.TopLeftY != viewport.TopLeftY;
|
ff_viewport_update_needed_ |= ff_viewport_.TopLeftY != viewport.TopLeftY;
|
||||||
ff_viewport_update_needed_ |= ff_viewport_.Width != viewport.Width;
|
ff_viewport_update_needed_ |= ff_viewport_.Width != viewport.Width;
|
||||||
ff_viewport_update_needed_ |= ff_viewport_.Height != viewport.Height;
|
ff_viewport_update_needed_ |= ff_viewport_.Height != viewport.Height;
|
||||||
ff_viewport_update_needed_ |= ff_viewport_.MinDepth != viewport.MinDepth;
|
ff_viewport_update_needed_ |= ff_viewport_.MinDepth != viewport.MinDepth;
|
||||||
ff_viewport_update_needed_ |= ff_viewport_.MaxDepth != viewport.MaxDepth;
|
ff_viewport_update_needed_ |= ff_viewport_.MaxDepth != viewport.MaxDepth;
|
||||||
if (XE_UNLIKELY(ff_viewport_update_needed_)) {
|
if (XE_UNLIKELY(ff_viewport_update_needed_))
|
||||||
|
#endif
|
||||||
|
{
|
||||||
ff_viewport_ = viewport;
|
ff_viewport_ = viewport;
|
||||||
deferred_command_list_.RSSetViewport(ff_viewport_);
|
deferred_command_list_.RSSetViewport(ff_viewport_);
|
||||||
ff_viewport_update_needed_ = false;
|
ff_viewport_update_needed_ = false;
|
||||||
|
@ -719,11 +740,23 @@ void D3D12CommandProcessor::SetViewport(const D3D12_VIEWPORT& viewport) {
|
||||||
}
|
}
|
||||||
|
|
||||||
void D3D12CommandProcessor::SetScissorRect(const D3D12_RECT& scissor_rect) {
|
void D3D12CommandProcessor::SetScissorRect(const D3D12_RECT& scissor_rect) {
|
||||||
|
#if XE_ARCH_AMD64 == 1
|
||||||
|
// VTune suggested vectorizing this and SetViewport because of their high
|
||||||
|
// retiring figure.
|
||||||
|
__m128i scissor_m128 = _mm_loadu_si128((const __m128i*)&scissor_rect);
|
||||||
|
__m128i ff_scissor_m128 = _mm_loadu_si128((const __m128i*)&ff_scissor_);
|
||||||
|
__m128i comparison_result = _mm_cmpeq_epi32(scissor_m128, ff_scissor_m128);
|
||||||
|
if (ff_scissor_update_needed_ ||
|
||||||
|
_mm_movemask_epi8(comparison_result) != 0xFFFF)
|
||||||
|
#else
|
||||||
ff_scissor_update_needed_ |= ff_scissor_.left != scissor_rect.left;
|
ff_scissor_update_needed_ |= ff_scissor_.left != scissor_rect.left;
|
||||||
ff_scissor_update_needed_ |= ff_scissor_.top != scissor_rect.top;
|
ff_scissor_update_needed_ |= ff_scissor_.top != scissor_rect.top;
|
||||||
ff_scissor_update_needed_ |= ff_scissor_.right != scissor_rect.right;
|
ff_scissor_update_needed_ |= ff_scissor_.right != scissor_rect.right;
|
||||||
ff_scissor_update_needed_ |= ff_scissor_.bottom != scissor_rect.bottom;
|
ff_scissor_update_needed_ |= ff_scissor_.bottom != scissor_rect.bottom;
|
||||||
if (ff_scissor_update_needed_) {
|
|
||||||
|
if (ff_scissor_update_needed_)
|
||||||
|
#endif
|
||||||
|
{
|
||||||
ff_scissor_ = scissor_rect;
|
ff_scissor_ = scissor_rect;
|
||||||
deferred_command_list_.RSSetScissorRect(ff_scissor_);
|
deferred_command_list_.RSSetScissorRect(ff_scissor_);
|
||||||
ff_scissor_update_needed_ = false;
|
ff_scissor_update_needed_ = false;
|
||||||
|
@ -1186,13 +1219,15 @@ bool D3D12CommandProcessor::SetupContext() {
|
||||||
}
|
}
|
||||||
// The upload buffer is frame-buffered.
|
// The upload buffer is frame-buffered.
|
||||||
gamma_ramp_buffer_desc.Width *= kQueueFrames;
|
gamma_ramp_buffer_desc.Width *= kQueueFrames;
|
||||||
if (FAILED(device->CreateCommittedResource(
|
|
||||||
&ui::d3d12::util::kHeapPropertiesUpload, heap_flag_create_not_zeroed,
|
if (!GetD3D12Provider().CreateUploadResource(
|
||||||
&gamma_ramp_buffer_desc, D3D12_RESOURCE_STATE_GENERIC_READ, nullptr,
|
heap_flag_create_not_zeroed, &gamma_ramp_buffer_desc,
|
||||||
IID_PPV_ARGS(&gamma_ramp_upload_buffer_)))) {
|
D3D12_RESOURCE_STATE_GENERIC_READ,
|
||||||
|
IID_PPV_ARGS(&gamma_ramp_upload_buffer_))) {
|
||||||
XELOGE("Failed to create the gamma ramp upload buffer");
|
XELOGE("Failed to create the gamma ramp upload buffer");
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (FAILED(gamma_ramp_upload_buffer_->Map(
|
if (FAILED(gamma_ramp_upload_buffer_->Map(
|
||||||
0, nullptr,
|
0, nullptr,
|
||||||
reinterpret_cast<void**>(&gamma_ramp_upload_buffer_mapping_)))) {
|
reinterpret_cast<void**>(&gamma_ramp_upload_buffer_mapping_)))) {
|
||||||
|
@ -1678,9 +1713,6 @@ void D3D12CommandProcessor::ShutdownContext() {
|
||||||
}
|
}
|
||||||
// todo: bit-pack the bools and use bitarith to reduce branches
|
// todo: bit-pack the bools and use bitarith to reduce branches
|
||||||
void D3D12CommandProcessor::WriteRegister(uint32_t index, uint32_t value) {
|
void D3D12CommandProcessor::WriteRegister(uint32_t index, uint32_t value) {
|
||||||
#if XE_ARCH_AMD64 == 1
|
|
||||||
// CommandProcessor::WriteRegister(index, value);
|
|
||||||
|
|
||||||
__m128i to_rangecheck = _mm_set1_epi16(static_cast<short>(index));
|
__m128i to_rangecheck = _mm_set1_epi16(static_cast<short>(index));
|
||||||
|
|
||||||
__m128i lower_bounds = _mm_setr_epi16(
|
__m128i lower_bounds = _mm_setr_epi16(
|
||||||
|
@ -1713,9 +1745,7 @@ void D3D12CommandProcessor::WriteRegister(uint32_t index, uint32_t value) {
|
||||||
|
|
||||||
uint32_t movmask = static_cast<uint32_t>(_mm_movemask_epi8(is_within_range));
|
uint32_t movmask = static_cast<uint32_t>(_mm_movemask_epi8(is_within_range));
|
||||||
|
|
||||||
if (!movmask) {
|
if (movmask) {
|
||||||
return;
|
|
||||||
} else {
|
|
||||||
if (movmask & (1 << 3)) {
|
if (movmask & (1 << 3)) {
|
||||||
if (frame_open_) {
|
if (frame_open_) {
|
||||||
uint32_t float_constant_index =
|
uint32_t float_constant_index =
|
||||||
|
@ -1747,45 +1777,12 @@ void D3D12CommandProcessor::WriteRegister(uint32_t index, uint32_t value) {
|
||||||
} else {
|
} else {
|
||||||
HandleSpecialRegisterWrite(index, value);
|
HandleSpecialRegisterWrite(index, value);
|
||||||
}
|
}
|
||||||
}
|
|
||||||
#else
|
|
||||||
|
|
||||||
CommandProcessor::WriteRegister(index, value);
|
|
||||||
|
|
||||||
if (index >= XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0 &&
|
|
||||||
index <= XE_GPU_REG_SHADER_CONSTANT_FETCH_31_5) {
|
|
||||||
cbuffer_binding_fetch_.up_to_date = false;
|
|
||||||
// texture cache is never nullptr
|
|
||||||
// if (texture_cache_ != nullptr) {
|
|
||||||
texture_cache_->TextureFetchConstantWritten(
|
|
||||||
(index - XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0) / 6);
|
|
||||||
// }
|
|
||||||
} else {
|
} else {
|
||||||
if (index >= XE_GPU_REG_SHADER_CONSTANT_000_X &&
|
_ReadWriteBarrier();
|
||||||
index <= XE_GPU_REG_SHADER_CONSTANT_511_W) {
|
return;
|
||||||
if (frame_open_) {
|
|
||||||
uint32_t float_constant_index =
|
|
||||||
(index - XE_GPU_REG_SHADER_CONSTANT_000_X) >> 2;
|
|
||||||
if (float_constant_index >= 256) {
|
|
||||||
float_constant_index -= 256;
|
|
||||||
if (current_float_constant_map_pixel_[float_constant_index >> 6] &
|
|
||||||
(1ull << (float_constant_index & 63))) {
|
|
||||||
cbuffer_binding_float_pixel_.up_to_date = false;
|
|
||||||
}
|
}
|
||||||
} else {
|
|
||||||
if (current_float_constant_map_vertex_[float_constant_index >> 6] &
|
|
||||||
(1ull << (float_constant_index & 63))) {
|
|
||||||
cbuffer_binding_float_vertex_.up_to_date = false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} else if (index >= XE_GPU_REG_SHADER_CONSTANT_BOOL_000_031 &&
|
|
||||||
index <= XE_GPU_REG_SHADER_CONSTANT_LOOP_31) {
|
|
||||||
cbuffer_binding_bool_loop_.up_to_date = false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void D3D12CommandProcessor::WriteRegistersFromMem(uint32_t start_index,
|
void D3D12CommandProcessor::WriteRegistersFromMem(uint32_t start_index,
|
||||||
uint32_t* base,
|
uint32_t* base,
|
||||||
uint32_t num_registers) {
|
uint32_t num_registers) {
|
||||||
|
@ -1794,6 +1791,95 @@ void D3D12CommandProcessor::WriteRegistersFromMem(uint32_t start_index,
|
||||||
D3D12CommandProcessor::WriteRegister(start_index + i, data);
|
D3D12CommandProcessor::WriteRegister(start_index + i, data);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Forwards to WriteRegisterRangeFromRing_WithKnownBound with the template
// arguments XE_GPU_REG_SHADER_CONSTANT_000_X and
// XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0, passing `ring`, `base` offset by
// XE_GPU_REG_SHADER_CONSTANT_000_X, and `num_times`.
// NOTE(review): the template arguments presumably bound the register indices
// this range can touch - confirm against the helper's definition.
void D3D12CommandProcessor::WriteALURangeFromRing(xe::RingBuffer* ring,
|
||||||
|
uint32_t base,
|
||||||
|
uint32_t num_times) {
|
||||||
|
WriteRegisterRangeFromRing_WithKnownBound<
|
||||||
|
XE_GPU_REG_SHADER_CONSTANT_000_X, XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0>(
|
||||||
|
ring, base + XE_GPU_REG_SHADER_CONSTANT_000_X, num_times);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Forwards to WriteRegisterRangeFromRing_WithKnownBound<0x4800, 0x5002>,
// passing `ring`, `base` offset by 0x4800, and `num_times`.
// NOTE(review): 0x4800 is presumably the first fetch-constant register index -
// confirm against the register table.
void D3D12CommandProcessor::WriteFetchRangeFromRing(xe::RingBuffer* ring,
|
||||||
|
uint32_t base,
|
||||||
|
uint32_t num_times) {
|
||||||
|
WriteRegisterRangeFromRing_WithKnownBound<0x4800, 0x5002>(ring, base + 0x4800,
|
||||||
|
num_times);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Forwards to WriteRegisterRangeFromRing_WithKnownBound<0x4900, 0x5002>,
// passing `ring`, `base` offset by 0x4900, and `num_times`. The commented-out
// call below is the generic WriteRegisterRangeFromRing form of the same write.
XE_FORCEINLINE
|
||||||
|
void D3D12CommandProcessor::WriteBoolRangeFromRing(xe::RingBuffer* ring,
|
||||||
|
uint32_t base,
|
||||||
|
uint32_t num_times) {
|
||||||
|
// D3D12CommandProcessor::WriteRegisterRangeFromRing(ring, base + 0x4900,
|
||||||
|
// num_times);
|
||||||
|
|
||||||
|
WriteRegisterRangeFromRing_WithKnownBound<0x4900, 0x5002>(ring, base + 0x4900,
|
||||||
|
num_times);
|
||||||
|
}
|
||||||
|
|
||||||
|
XE_FORCEINLINE
|
||||||
|
void D3D12CommandProcessor::WriteLoopRangeFromRing(xe::RingBuffer* ring,
|
||||||
|
uint32_t base,
|
||||||
|
uint32_t num_times) {
|
||||||
|
// D3D12CommandProcessor::WriteRegisterRangeFromRing(ring, base + 0x4908,
|
||||||
|
// num_times);
|
||||||
|
|
||||||
|
WriteRegisterRangeFromRing_WithKnownBound<0x4908, 0x5002>(ring, base + 0x4908,
|
||||||
|
num_times);
|
||||||
|
}
|
||||||
|
|
||||||
|
XE_FORCEINLINE
|
||||||
|
void D3D12CommandProcessor::WriteREGISTERSRangeFromRing(xe::RingBuffer* ring,
|
||||||
|
uint32_t base,
|
||||||
|
uint32_t num_times) {
|
||||||
|
// D3D12CommandProcessor::WriteRegisterRangeFromRing(ring, base + 0x2000,
|
||||||
|
// num_times);
|
||||||
|
|
||||||
|
WriteRegisterRangeFromRing_WithKnownBound<0x2000, 0x2000 + 0x800>(
|
||||||
|
ring, base + 0x2000, num_times);
|
||||||
|
}
|
||||||
|
|
||||||
|
XE_FORCEINLINE
|
||||||
|
void D3D12CommandProcessor::WriteALURangeFromMem(uint32_t start_index,
|
||||||
|
uint32_t* base,
|
||||||
|
uint32_t num_registers) {
|
||||||
|
WriteRegisterRangeFromMem_WithKnownBound<
|
||||||
|
XE_GPU_REG_SHADER_CONSTANT_000_X, XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0>(
|
||||||
|
start_index + 0x4000, base, num_registers);
|
||||||
|
}
|
||||||
|
|
||||||
|
XE_FORCEINLINE
|
||||||
|
void D3D12CommandProcessor::WriteFetchRangeFromMem(uint32_t start_index,
|
||||||
|
uint32_t* base,
|
||||||
|
uint32_t num_registers) {
|
||||||
|
WriteRegisterRangeFromMem_WithKnownBound<0x4800, 0x5002>(start_index + 0x4800,
|
||||||
|
base, num_registers);
|
||||||
|
}
|
||||||
|
|
||||||
|
XE_FORCEINLINE
|
||||||
|
void D3D12CommandProcessor::WriteBoolRangeFromMem(uint32_t start_index,
|
||||||
|
uint32_t* base,
|
||||||
|
uint32_t num_registers) {
|
||||||
|
WriteRegisterRangeFromMem_WithKnownBound<0x4900, 0x5002>(start_index + 0x4900,
|
||||||
|
base, num_registers);
|
||||||
|
}
|
||||||
|
|
||||||
|
XE_FORCEINLINE
|
||||||
|
void D3D12CommandProcessor::WriteLoopRangeFromMem(uint32_t start_index,
|
||||||
|
uint32_t* base,
|
||||||
|
uint32_t num_registers) {
|
||||||
|
WriteRegisterRangeFromMem_WithKnownBound<0x4908, 0x5002>(start_index + 0x4908,
|
||||||
|
base, num_registers);
|
||||||
|
}
|
||||||
|
|
||||||
|
XE_FORCEINLINE
|
||||||
|
void D3D12CommandProcessor::WriteREGISTERSRangeFromMem(uint32_t start_index,
|
||||||
|
uint32_t* base,
|
||||||
|
uint32_t num_registers) {
|
||||||
|
WriteRegisterRangeFromMem_WithKnownBound<0x2000, 0x2000 + 0x800>(
|
||||||
|
start_index + 0x2000, base, num_registers);
|
||||||
|
}
|
||||||
/*
|
/*
|
||||||
wraparound rarely happens, so its best to hoist this out of
|
wraparound rarely happens, so its best to hoist this out of
|
||||||
writeregisterrangefromring, and structure the two functions so that this can be
|
writeregisterrangefromring, and structure the two functions so that this can be
|
||||||
|
@ -1835,14 +1921,147 @@ void D3D12CommandProcessor::WriteRegisterRangeFromRing(xe::RingBuffer* ring,
|
||||||
base, reinterpret_cast<uint32_t*>(const_cast<uint8_t*>(range.first)),
|
base, reinterpret_cast<uint32_t*>(const_cast<uint8_t*>(range.first)),
|
||||||
num_regs_firstrange);
|
num_regs_firstrange);
|
||||||
ring->EndRead(range);
|
ring->EndRead(range);
|
||||||
} else {
|
}
|
||||||
|
else {
|
||||||
return WriteRegisterRangeFromRing_WraparoundCase(ring, base, num_registers);
|
return WriteRegisterRangeFromRing_WraparoundCase(ring, base, num_registers);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
void D3D12CommandProcessor::WriteOneRegisterFromRing(xe::RingBuffer* ring,
|
|
||||||
uint32_t base,
|
template <uint32_t register_lower_bound, uint32_t register_upper_bound>
|
||||||
|
constexpr bool bounds_may_have_reg(uint32_t reg) {
|
||||||
|
return reg >= register_lower_bound && reg < register_upper_bound;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <uint32_t register_lower_bound, uint32_t register_upper_bound>
|
||||||
|
constexpr bool bounds_may_have_bounds(uint32_t reg, uint32_t last_reg) {
|
||||||
|
return bounds_may_have_reg<register_lower_bound, register_upper_bound>(reg) ||
|
||||||
|
bounds_may_have_reg<register_lower_bound, register_upper_bound>(
|
||||||
|
last_reg);
|
||||||
|
}
|
||||||
|
template <uint32_t register_lower_bound, uint32_t register_upper_bound>
|
||||||
|
XE_FORCEINLINE void
|
||||||
|
D3D12CommandProcessor::WriteRegisterRangeFromMem_WithKnownBound(
|
||||||
|
uint32_t base, uint32_t* range, uint32_t num_registers) {
|
||||||
|
constexpr auto bounds_has_reg =
|
||||||
|
bounds_may_have_reg<register_lower_bound, register_upper_bound>;
|
||||||
|
constexpr auto bounds_has_bounds =
|
||||||
|
bounds_may_have_bounds<register_lower_bound, register_upper_bound>;
|
||||||
|
|
||||||
|
for (uint32_t i = 0; i < num_registers; ++i) {
|
||||||
|
uint32_t data = xe::load_and_swap<uint32_t>(range + i);
|
||||||
|
|
||||||
|
{
|
||||||
|
uint32_t index = base + i;
|
||||||
|
uint32_t value = data;
|
||||||
|
XE_MSVC_ASSUME(index >= register_lower_bound &&
|
||||||
|
index < register_upper_bound);
|
||||||
|
register_file_->values[index].u32 = value;
|
||||||
|
|
||||||
|
unsigned expr = 0;
|
||||||
|
|
||||||
|
if constexpr (bounds_has_bounds(XE_GPU_REG_SCRATCH_REG0,
|
||||||
|
XE_GPU_REG_SCRATCH_REG7)) {
|
||||||
|
expr |= (index - XE_GPU_REG_SCRATCH_REG0 < 8);
|
||||||
|
}
|
||||||
|
if constexpr (bounds_has_reg(XE_GPU_REG_COHER_STATUS_HOST)) {
|
||||||
|
expr |= (index == XE_GPU_REG_COHER_STATUS_HOST);
|
||||||
|
}
|
||||||
|
if constexpr (bounds_has_bounds(XE_GPU_REG_DC_LUT_RW_INDEX,
|
||||||
|
XE_GPU_REG_DC_LUT_30_COLOR)) {
|
||||||
|
expr |= ((index - XE_GPU_REG_DC_LUT_RW_INDEX) <=
|
||||||
|
(XE_GPU_REG_DC_LUT_30_COLOR - XE_GPU_REG_DC_LUT_RW_INDEX));
|
||||||
|
}
|
||||||
|
// chrispy: reordered for msvc branch probability (assumes
|
||||||
|
// if is taken and else is not)
|
||||||
|
if (XE_LIKELY(expr == 0)) {
|
||||||
|
XE_MSVC_REORDER_BARRIER();
|
||||||
|
|
||||||
|
} else {
|
||||||
|
HandleSpecialRegisterWrite(index, value);
|
||||||
|
goto write_done;
|
||||||
|
}
|
||||||
|
XE_MSVC_ASSUME(index >= register_lower_bound &&
|
||||||
|
index < register_upper_bound);
|
||||||
|
if constexpr (bounds_has_bounds(XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0,
|
||||||
|
XE_GPU_REG_SHADER_CONSTANT_FETCH_31_5)) {
|
||||||
|
if (index >= XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0 &&
|
||||||
|
index <= XE_GPU_REG_SHADER_CONSTANT_FETCH_31_5) {
|
||||||
|
cbuffer_binding_fetch_.up_to_date = false;
|
||||||
|
// texture cache is never nullptr
|
||||||
|
texture_cache_->TextureFetchConstantWritten(
|
||||||
|
(index - XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0) / 6);
|
||||||
|
|
||||||
|
goto write_done;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
XE_MSVC_ASSUME(index >= register_lower_bound &&
|
||||||
|
index < register_upper_bound);
|
||||||
|
if constexpr (bounds_has_bounds(XE_GPU_REG_SHADER_CONSTANT_000_X,
|
||||||
|
XE_GPU_REG_SHADER_CONSTANT_511_W)) {
|
||||||
|
if (index >= XE_GPU_REG_SHADER_CONSTANT_000_X &&
|
||||||
|
index <= XE_GPU_REG_SHADER_CONSTANT_511_W) {
|
||||||
|
if (frame_open_) {
|
||||||
|
uint32_t float_constant_index =
|
||||||
|
(index - XE_GPU_REG_SHADER_CONSTANT_000_X) >> 2;
|
||||||
|
if (float_constant_index >= 256) {
|
||||||
|
float_constant_index -= 256;
|
||||||
|
if (current_float_constant_map_pixel_[float_constant_index >> 6] &
|
||||||
|
(1ull << (float_constant_index & 63))) {
|
||||||
|
cbuffer_binding_float_pixel_.up_to_date = false;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
if (current_float_constant_map_vertex_[float_constant_index >>
|
||||||
|
6] &
|
||||||
|
(1ull << (float_constant_index & 63))) {
|
||||||
|
cbuffer_binding_float_vertex_.up_to_date = false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
goto write_done;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
XE_MSVC_ASSUME(index >= register_lower_bound &&
|
||||||
|
index < register_upper_bound);
|
||||||
|
if constexpr (bounds_has_bounds(XE_GPU_REG_SHADER_CONSTANT_BOOL_000_031,
|
||||||
|
XE_GPU_REG_SHADER_CONSTANT_LOOP_31)) {
|
||||||
|
if (index >= XE_GPU_REG_SHADER_CONSTANT_BOOL_000_031 &&
|
||||||
|
index <= XE_GPU_REG_SHADER_CONSTANT_LOOP_31) {
|
||||||
|
cbuffer_binding_bool_loop_.up_to_date = false;
|
||||||
|
goto write_done;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
write_done:;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
template <uint32_t register_lower_bound, uint32_t register_upper_bound>
|
||||||
|
XE_FORCEINLINE void
|
||||||
|
D3D12CommandProcessor::WriteRegisterRangeFromRing_WithKnownBound(
|
||||||
|
xe::RingBuffer* ring, uint32_t base, uint32_t num_registers) {
|
||||||
|
RingBuffer::ReadRange range =
|
||||||
|
ring->BeginRead(num_registers * sizeof(uint32_t));
|
||||||
|
|
||||||
|
constexpr auto bounds_has_reg =
|
||||||
|
bounds_may_have_reg<register_lower_bound, register_upper_bound>;
|
||||||
|
constexpr auto bounds_has_bounds =
|
||||||
|
bounds_may_have_bounds<register_lower_bound, register_upper_bound>;
|
||||||
|
|
||||||
|
XE_LIKELY_IF(!range.second) {
|
||||||
|
WriteRegisterRangeFromMem_WithKnownBound<register_lower_bound,
|
||||||
|
register_upper_bound>(
|
||||||
|
base, reinterpret_cast<uint32_t*>(const_cast<uint8_t*>(range.first)),
|
||||||
|
num_registers);
|
||||||
|
|
||||||
|
ring->EndRead(range);
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
return WriteRegisterRangeFromRing_WraparoundCase(ring, base, num_registers);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
XE_NOINLINE
|
||||||
|
void D3D12CommandProcessor::WriteOneRegisterFromRing(uint32_t base,
|
||||||
uint32_t num_times) {
|
uint32_t num_times) {
|
||||||
auto read = ring->BeginPrefetchedRead<swcache::PrefetchTag::Level1>(
|
auto read = reader_.BeginPrefetchedRead<swcache::PrefetchTag::Level1>(
|
||||||
num_times * sizeof(uint32_t));
|
num_times * sizeof(uint32_t));
|
||||||
|
|
||||||
uint32_t first_length = read.first_length / sizeof(uint32_t);
|
uint32_t first_length = read.first_length / sizeof(uint32_t);
|
||||||
|
@ -1852,7 +2071,7 @@ void D3D12CommandProcessor::WriteOneRegisterFromRing(xe::RingBuffer* ring,
|
||||||
base, xe::load_and_swap<uint32_t>(read.first + (sizeof(uint32_t) * i)));
|
base, xe::load_and_swap<uint32_t>(read.first + (sizeof(uint32_t) * i)));
|
||||||
}
|
}
|
||||||
|
|
||||||
XE_UNLIKELY_IF (read.second) {
|
XE_UNLIKELY_IF(read.second) {
|
||||||
uint32_t second_length = read.second_length / sizeof(uint32_t);
|
uint32_t second_length = read.second_length / sizeof(uint32_t);
|
||||||
|
|
||||||
for (uint32_t i = 0; i < second_length; ++i) {
|
for (uint32_t i = 0; i < second_length; ++i) {
|
||||||
|
@ -1861,7 +2080,7 @@ void D3D12CommandProcessor::WriteOneRegisterFromRing(xe::RingBuffer* ring,
|
||||||
xe::load_and_swap<uint32_t>(read.second + (sizeof(uint32_t) * i)));
|
xe::load_and_swap<uint32_t>(read.second + (sizeof(uint32_t) * i)));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
ring->EndRead(read);
|
reader_.EndRead(read);
|
||||||
}
|
}
|
||||||
void D3D12CommandProcessor::OnGammaRamp256EntryTableValueWritten() {
|
void D3D12CommandProcessor::OnGammaRamp256EntryTableValueWritten() {
|
||||||
gamma_ramp_256_entry_table_up_to_date_ = false;
|
gamma_ramp_256_entry_table_up_to_date_ = false;
|
||||||
|
@ -2510,9 +2729,8 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type,
|
||||||
GetSupportedMemExportFormatSize(memexport_stream.format);
|
GetSupportedMemExportFormatSize(memexport_stream.format);
|
||||||
if (memexport_format_size == 0) {
|
if (memexport_format_size == 0) {
|
||||||
XELOGE("Unsupported memexport format {}",
|
XELOGE("Unsupported memexport format {}",
|
||||||
FormatInfo::Get(
|
FormatInfo::GetName(
|
||||||
xenos::TextureFormat(uint32_t(memexport_stream.format)))
|
xenos::TextureFormat(uint32_t(memexport_stream.format))));
|
||||||
->name);
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
uint32_t memexport_size_dwords =
|
uint32_t memexport_size_dwords =
|
||||||
|
@ -2551,9 +2769,8 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type,
|
||||||
GetSupportedMemExportFormatSize(memexport_stream.format);
|
GetSupportedMemExportFormatSize(memexport_stream.format);
|
||||||
if (memexport_format_size == 0) {
|
if (memexport_format_size == 0) {
|
||||||
XELOGE("Unsupported memexport format {}",
|
XELOGE("Unsupported memexport format {}",
|
||||||
FormatInfo::Get(
|
FormatInfo::GetName(
|
||||||
xenos::TextureFormat(uint32_t(memexport_stream.format)))
|
xenos::TextureFormat(uint32_t(memexport_stream.format))));
|
||||||
->name);
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
uint32_t memexport_size_dwords =
|
uint32_t memexport_size_dwords =
|
||||||
|
@ -3353,17 +3570,12 @@ void D3D12CommandProcessor::UpdateFixedFunctionState(
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
template <bool primitive_polygonal, bool edram_rov_used>
|
||||||
void D3D12CommandProcessor::UpdateSystemConstantValues(
|
XE_NOINLINE void D3D12CommandProcessor::UpdateSystemConstantValues_Impl(
|
||||||
bool shared_memory_is_uav, bool primitive_polygonal,
|
bool shared_memory_is_uav, uint32_t line_loop_closing_index,
|
||||||
uint32_t line_loop_closing_index, xenos::Endian index_endian,
|
xenos::Endian index_endian, const draw_util::ViewportInfo& viewport_info,
|
||||||
const draw_util::ViewportInfo& viewport_info, uint32_t used_texture_mask,
|
uint32_t used_texture_mask, reg::RB_DEPTHCONTROL normalized_depth_control,
|
||||||
reg::RB_DEPTHCONTROL normalized_depth_control,
|
|
||||||
uint32_t normalized_color_mask) {
|
uint32_t normalized_color_mask) {
|
||||||
#if XE_UI_D3D12_FINE_GRAINED_DRAW_SCOPES
|
|
||||||
SCOPE_profile_cpu_f("gpu");
|
|
||||||
#endif // XE_UI_D3D12_FINE_GRAINED_DRAW_SCOPES
|
|
||||||
|
|
||||||
const RegisterFile& regs = *register_file_;
|
const RegisterFile& regs = *register_file_;
|
||||||
auto pa_cl_clip_cntl = regs.Get<reg::PA_CL_CLIP_CNTL>();
|
auto pa_cl_clip_cntl = regs.Get<reg::PA_CL_CLIP_CNTL>();
|
||||||
auto pa_cl_vte_cntl = regs.Get<reg::PA_CL_VTE_CNTL>();
|
auto pa_cl_vte_cntl = regs.Get<reg::PA_CL_VTE_CNTL>();
|
||||||
|
@ -3382,8 +3594,6 @@ void D3D12CommandProcessor::UpdateSystemConstantValues(
|
||||||
uint32_t vgt_max_vtx_indx = regs.Get<reg::VGT_MAX_VTX_INDX>().max_indx;
|
uint32_t vgt_max_vtx_indx = regs.Get<reg::VGT_MAX_VTX_INDX>().max_indx;
|
||||||
uint32_t vgt_min_vtx_indx = regs.Get<reg::VGT_MIN_VTX_INDX>().min_indx;
|
uint32_t vgt_min_vtx_indx = regs.Get<reg::VGT_MIN_VTX_INDX>().min_indx;
|
||||||
|
|
||||||
bool edram_rov_used = render_target_cache_->GetPath() ==
|
|
||||||
RenderTargetCache::Path::kPixelShaderInterlock;
|
|
||||||
uint32_t draw_resolution_scale_x = texture_cache_->draw_resolution_scale_x();
|
uint32_t draw_resolution_scale_x = texture_cache_->draw_resolution_scale_x();
|
||||||
uint32_t draw_resolution_scale_y = texture_cache_->draw_resolution_scale_y();
|
uint32_t draw_resolution_scale_y = texture_cache_->draw_resolution_scale_y();
|
||||||
|
|
||||||
|
@ -3426,7 +3636,21 @@ void D3D12CommandProcessor::UpdateSystemConstantValues(
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
bool dirty = false;
|
uint32_t dirty = 0u;
|
||||||
|
ArchFloatMask dirty_float_mask = floatmask_zero;
|
||||||
|
|
||||||
|
auto update_dirty_floatmask = [&dirty_float_mask](float x, float y) {
|
||||||
|
dirty_float_mask =
|
||||||
|
ArchORFloatMask(dirty_float_mask, ArchCmpneqFloatMask(x, y));
|
||||||
|
};
|
||||||
|
/*
|
||||||
|
chrispy: instead of (cmp x, y; setnz lobyte; or mask, lobyte;
|
||||||
|
we can do (xor z, x, y; or mask, z)
|
||||||
|
this ought to have much better throughput on all processors
|
||||||
|
*/
|
||||||
|
auto update_dirty_uint32_cmp = [&dirty](uint32_t x, uint32_t y) {
|
||||||
|
dirty |= (x ^ y);
|
||||||
|
};
|
||||||
|
|
||||||
// Flags.
|
// Flags.
|
||||||
uint32_t flags = 0;
|
uint32_t flags = 0;
|
||||||
|
@ -3454,7 +3678,7 @@ void D3D12CommandProcessor::UpdateSystemConstantValues(
|
||||||
flags |= DxbcShaderTranslator::kSysFlag_WNotReciprocal;
|
flags |= DxbcShaderTranslator::kSysFlag_WNotReciprocal;
|
||||||
}
|
}
|
||||||
// Whether the primitive is polygonal and SV_IsFrontFace matters.
|
// Whether the primitive is polygonal and SV_IsFrontFace matters.
|
||||||
if (primitive_polygonal) {
|
if constexpr (primitive_polygonal) {
|
||||||
flags |= DxbcShaderTranslator::kSysFlag_PrimitivePolygonal;
|
flags |= DxbcShaderTranslator::kSysFlag_PrimitivePolygonal;
|
||||||
}
|
}
|
||||||
// Primitive type.
|
// Primitive type.
|
||||||
|
@ -3480,7 +3704,8 @@ void D3D12CommandProcessor::UpdateSystemConstantValues(
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (edram_rov_used && depth_stencil_enabled) {
|
if constexpr (edram_rov_used) {
|
||||||
|
if (depth_stencil_enabled) {
|
||||||
flags |= DxbcShaderTranslator::kSysFlag_ROVDepthStencil;
|
flags |= DxbcShaderTranslator::kSysFlag_ROVDepthStencil;
|
||||||
if (normalized_depth_control.z_enable) {
|
if (normalized_depth_control.z_enable) {
|
||||||
flags |= uint32_t(normalized_depth_control.zfunc)
|
flags |= uint32_t(normalized_depth_control.zfunc)
|
||||||
|
@ -3504,7 +3729,8 @@ void D3D12CommandProcessor::UpdateSystemConstantValues(
|
||||||
flags |= DxbcShaderTranslator::kSysFlag_ROVDepthStencilEarlyWrite;
|
flags |= DxbcShaderTranslator::kSysFlag_ROVDepthStencilEarlyWrite;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
dirty |= system_constants_.flags != flags;
|
}
|
||||||
|
update_dirty_uint32_cmp(system_constants_.flags, flags);
|
||||||
system_constants_.flags = flags;
|
system_constants_.flags = flags;
|
||||||
|
|
||||||
// Tessellation factor range, plus 1.0 according to the images in
|
// Tessellation factor range, plus 1.0 according to the images in
|
||||||
|
@ -3513,29 +3739,39 @@ void D3D12CommandProcessor::UpdateSystemConstantValues(
|
||||||
regs[XE_GPU_REG_VGT_HOS_MIN_TESS_LEVEL].f32 + 1.0f;
|
regs[XE_GPU_REG_VGT_HOS_MIN_TESS_LEVEL].f32 + 1.0f;
|
||||||
float tessellation_factor_max =
|
float tessellation_factor_max =
|
||||||
regs[XE_GPU_REG_VGT_HOS_MAX_TESS_LEVEL].f32 + 1.0f;
|
regs[XE_GPU_REG_VGT_HOS_MAX_TESS_LEVEL].f32 + 1.0f;
|
||||||
dirty |= system_constants_.tessellation_factor_range_min !=
|
|
||||||
tessellation_factor_min;
|
update_dirty_floatmask(system_constants_.tessellation_factor_range_min,
|
||||||
|
tessellation_factor_min);
|
||||||
|
|
||||||
system_constants_.tessellation_factor_range_min = tessellation_factor_min;
|
system_constants_.tessellation_factor_range_min = tessellation_factor_min;
|
||||||
dirty |= system_constants_.tessellation_factor_range_max !=
|
update_dirty_floatmask(system_constants_.tessellation_factor_range_max,
|
||||||
tessellation_factor_max;
|
tessellation_factor_max);
|
||||||
system_constants_.tessellation_factor_range_max = tessellation_factor_max;
|
system_constants_.tessellation_factor_range_max = tessellation_factor_max;
|
||||||
|
|
||||||
// Line loop closing index (or 0 when drawing other primitives or using an
|
// Line loop closing index (or 0 when drawing other primitives or using an
|
||||||
// index buffer).
|
// index buffer).
|
||||||
dirty |= system_constants_.line_loop_closing_index != line_loop_closing_index;
|
|
||||||
|
update_dirty_uint32_cmp(system_constants_.line_loop_closing_index,
|
||||||
|
line_loop_closing_index);
|
||||||
system_constants_.line_loop_closing_index = line_loop_closing_index;
|
system_constants_.line_loop_closing_index = line_loop_closing_index;
|
||||||
|
|
||||||
// Index or tessellation edge factor buffer endianness.
|
// Index or tessellation edge factor buffer endianness.
|
||||||
dirty |= system_constants_.vertex_index_endian != index_endian;
|
update_dirty_uint32_cmp(
|
||||||
|
static_cast<uint32_t>(system_constants_.vertex_index_endian),
|
||||||
|
static_cast<uint32_t>(index_endian));
|
||||||
system_constants_.vertex_index_endian = index_endian;
|
system_constants_.vertex_index_endian = index_endian;
|
||||||
|
|
||||||
// Vertex index offset.
|
// Vertex index offset.
|
||||||
dirty |= system_constants_.vertex_index_offset != vgt_indx_offset;
|
|
||||||
|
update_dirty_uint32_cmp(system_constants_.vertex_index_offset,
|
||||||
|
vgt_indx_offset);
|
||||||
system_constants_.vertex_index_offset = vgt_indx_offset;
|
system_constants_.vertex_index_offset = vgt_indx_offset;
|
||||||
|
|
||||||
// Vertex index range.
|
// Vertex index range.
|
||||||
dirty |= system_constants_.vertex_index_min != vgt_min_vtx_indx;
|
|
||||||
dirty |= system_constants_.vertex_index_max != vgt_max_vtx_indx;
|
update_dirty_uint32_cmp(system_constants_.vertex_index_min, vgt_min_vtx_indx);
|
||||||
|
update_dirty_uint32_cmp(system_constants_.vertex_index_max, vgt_max_vtx_indx);
|
||||||
|
|
||||||
system_constants_.vertex_index_min = vgt_min_vtx_indx;
|
system_constants_.vertex_index_min = vgt_min_vtx_indx;
|
||||||
system_constants_.vertex_index_max = vgt_max_vtx_indx;
|
system_constants_.vertex_index_max = vgt_max_vtx_indx;
|
||||||
|
|
||||||
|
@ -3563,8 +3799,12 @@ void D3D12CommandProcessor::UpdateSystemConstantValues(
|
||||||
|
|
||||||
// Conversion to Direct3D 12 normalized device coordinates.
|
// Conversion to Direct3D 12 normalized device coordinates.
|
||||||
for (uint32_t i = 0; i < 3; ++i) {
|
for (uint32_t i = 0; i < 3; ++i) {
|
||||||
dirty |= system_constants_.ndc_scale[i] != viewport_info.ndc_scale[i];
|
update_dirty_floatmask(system_constants_.ndc_scale[i],
|
||||||
dirty |= system_constants_.ndc_offset[i] != viewport_info.ndc_offset[i];
|
viewport_info.ndc_scale[i]);
|
||||||
|
|
||||||
|
update_dirty_floatmask(system_constants_.ndc_offset[i],
|
||||||
|
viewport_info.ndc_offset[i]);
|
||||||
|
|
||||||
system_constants_.ndc_scale[i] = viewport_info.ndc_scale[i];
|
system_constants_.ndc_scale[i] = viewport_info.ndc_scale[i];
|
||||||
system_constants_.ndc_offset[i] = viewport_info.ndc_offset[i];
|
system_constants_.ndc_offset[i] = viewport_info.ndc_offset[i];
|
||||||
}
|
}
|
||||||
|
@ -3581,14 +3821,18 @@ void D3D12CommandProcessor::UpdateSystemConstantValues(
|
||||||
float(pa_su_point_size.width) * (2.0f / 16.0f);
|
float(pa_su_point_size.width) * (2.0f / 16.0f);
|
||||||
float point_constant_diameter_y =
|
float point_constant_diameter_y =
|
||||||
float(pa_su_point_size.height) * (2.0f / 16.0f);
|
float(pa_su_point_size.height) * (2.0f / 16.0f);
|
||||||
dirty |= system_constants_.point_vertex_diameter_min !=
|
|
||||||
point_vertex_diameter_min;
|
update_dirty_floatmask(system_constants_.point_vertex_diameter_min,
|
||||||
dirty |= system_constants_.point_vertex_diameter_max !=
|
point_vertex_diameter_min);
|
||||||
point_vertex_diameter_max;
|
|
||||||
dirty |= system_constants_.point_constant_diameter[0] !=
|
update_dirty_floatmask(system_constants_.point_vertex_diameter_max,
|
||||||
point_constant_diameter_x;
|
point_vertex_diameter_max);
|
||||||
dirty |= system_constants_.point_constant_diameter[1] !=
|
|
||||||
point_constant_diameter_y;
|
update_dirty_floatmask(system_constants_.point_constant_diameter[0],
|
||||||
|
point_constant_diameter_x);
|
||||||
|
update_dirty_floatmask(system_constants_.point_constant_diameter[1],
|
||||||
|
point_constant_diameter_y);
|
||||||
|
|
||||||
system_constants_.point_vertex_diameter_min = point_vertex_diameter_min;
|
system_constants_.point_vertex_diameter_min = point_vertex_diameter_min;
|
||||||
system_constants_.point_vertex_diameter_max = point_vertex_diameter_max;
|
system_constants_.point_vertex_diameter_max = point_vertex_diameter_max;
|
||||||
system_constants_.point_constant_diameter[0] = point_constant_diameter_x;
|
system_constants_.point_constant_diameter[0] = point_constant_diameter_x;
|
||||||
|
@ -3602,10 +3846,15 @@ void D3D12CommandProcessor::UpdateSystemConstantValues(
|
||||||
float point_screen_diameter_to_ndc_radius_y =
|
float point_screen_diameter_to_ndc_radius_y =
|
||||||
(/* 0.5f * 2.0f * */ float(draw_resolution_scale_y)) /
|
(/* 0.5f * 2.0f * */ float(draw_resolution_scale_y)) /
|
||||||
std::max(viewport_info.xy_extent[1], uint32_t(1));
|
std::max(viewport_info.xy_extent[1], uint32_t(1));
|
||||||
dirty |= system_constants_.point_screen_diameter_to_ndc_radius[0] !=
|
|
||||||
point_screen_diameter_to_ndc_radius_x;
|
update_dirty_floatmask(
|
||||||
dirty |= system_constants_.point_screen_diameter_to_ndc_radius[1] !=
|
system_constants_.point_screen_diameter_to_ndc_radius[0],
|
||||||
point_screen_diameter_to_ndc_radius_y;
|
point_screen_diameter_to_ndc_radius_x);
|
||||||
|
|
||||||
|
update_dirty_floatmask(
|
||||||
|
system_constants_.point_screen_diameter_to_ndc_radius[1],
|
||||||
|
point_screen_diameter_to_ndc_radius_y);
|
||||||
|
|
||||||
system_constants_.point_screen_diameter_to_ndc_radius[0] =
|
system_constants_.point_screen_diameter_to_ndc_radius[0] =
|
||||||
point_screen_diameter_to_ndc_radius_x;
|
point_screen_diameter_to_ndc_radius_x;
|
||||||
system_constants_.point_screen_diameter_to_ndc_radius[1] =
|
system_constants_.point_screen_diameter_to_ndc_radius[1] =
|
||||||
|
@ -3628,14 +3877,20 @@ void D3D12CommandProcessor::UpdateSystemConstantValues(
|
||||||
uint32_t texture_signs_shifted = uint32_t(texture_signs)
|
uint32_t texture_signs_shifted = uint32_t(texture_signs)
|
||||||
<< texture_signs_shift;
|
<< texture_signs_shift;
|
||||||
uint32_t texture_signs_mask = uint32_t(0b11111111) << texture_signs_shift;
|
uint32_t texture_signs_mask = uint32_t(0b11111111) << texture_signs_shift;
|
||||||
dirty |= (texture_signs_uint & texture_signs_mask) != texture_signs_shifted;
|
|
||||||
|
update_dirty_uint32_cmp((texture_signs_uint & texture_signs_mask),
|
||||||
|
texture_signs_shifted);
|
||||||
|
|
||||||
texture_signs_uint =
|
texture_signs_uint =
|
||||||
(texture_signs_uint & ~texture_signs_mask) | texture_signs_shifted;
|
(texture_signs_uint & ~texture_signs_mask) | texture_signs_shifted;
|
||||||
|
// cache misses here, we're accessing the texture bindings out of order
|
||||||
textures_resolved |=
|
textures_resolved |=
|
||||||
uint32_t(texture_cache_->IsActiveTextureResolved(texture_index))
|
uint32_t(texture_cache_->IsActiveTextureResolved(texture_index))
|
||||||
<< texture_index;
|
<< texture_index;
|
||||||
}
|
}
|
||||||
dirty |= system_constants_.textures_resolved != textures_resolved;
|
|
||||||
|
update_dirty_uint32_cmp(system_constants_.textures_resolved,
|
||||||
|
textures_resolved);
|
||||||
system_constants_.textures_resolved = textures_resolved;
|
system_constants_.textures_resolved = textures_resolved;
|
||||||
|
|
||||||
// Log2 of sample count, for alpha to mask and with ROV, for EDRAM address
|
// Log2 of sample count, for alpha to mask and with ROV, for EDRAM address
|
||||||
|
@ -3644,18 +3899,22 @@ void D3D12CommandProcessor::UpdateSystemConstantValues(
|
||||||
rb_surface_info.msaa_samples >= xenos::MsaaSamples::k4X ? 1 : 0;
|
rb_surface_info.msaa_samples >= xenos::MsaaSamples::k4X ? 1 : 0;
|
||||||
uint32_t sample_count_log2_y =
|
uint32_t sample_count_log2_y =
|
||||||
rb_surface_info.msaa_samples >= xenos::MsaaSamples::k2X ? 1 : 0;
|
rb_surface_info.msaa_samples >= xenos::MsaaSamples::k2X ? 1 : 0;
|
||||||
dirty |= system_constants_.sample_count_log2[0] != sample_count_log2_x;
|
|
||||||
dirty |= system_constants_.sample_count_log2[1] != sample_count_log2_y;
|
update_dirty_uint32_cmp(system_constants_.sample_count_log2[0],
|
||||||
|
sample_count_log2_x);
|
||||||
|
update_dirty_uint32_cmp(system_constants_.sample_count_log2[1],
|
||||||
|
sample_count_log2_y);
|
||||||
system_constants_.sample_count_log2[0] = sample_count_log2_x;
|
system_constants_.sample_count_log2[0] = sample_count_log2_x;
|
||||||
system_constants_.sample_count_log2[1] = sample_count_log2_y;
|
system_constants_.sample_count_log2[1] = sample_count_log2_y;
|
||||||
|
|
||||||
// Alpha test and alpha to coverage.
|
// Alpha test and alpha to coverage.
|
||||||
dirty |= system_constants_.alpha_test_reference != rb_alpha_ref;
|
update_dirty_floatmask(system_constants_.alpha_test_reference, rb_alpha_ref);
|
||||||
system_constants_.alpha_test_reference = rb_alpha_ref;
|
system_constants_.alpha_test_reference = rb_alpha_ref;
|
||||||
uint32_t alpha_to_mask = rb_colorcontrol.alpha_to_mask_enable
|
uint32_t alpha_to_mask = rb_colorcontrol.alpha_to_mask_enable
|
||||||
? (rb_colorcontrol.value >> 24) | (1 << 8)
|
? (rb_colorcontrol.value >> 24) | (1 << 8)
|
||||||
: 0;
|
: 0;
|
||||||
dirty |= system_constants_.alpha_to_mask != alpha_to_mask;
|
|
||||||
|
update_dirty_uint32_cmp(system_constants_.alpha_to_mask, alpha_to_mask);
|
||||||
system_constants_.alpha_to_mask = alpha_to_mask;
|
system_constants_.alpha_to_mask = alpha_to_mask;
|
||||||
|
|
||||||
uint32_t edram_tile_dwords_scaled =
|
uint32_t edram_tile_dwords_scaled =
|
||||||
|
@ -3663,19 +3922,23 @@ void D3D12CommandProcessor::UpdateSystemConstantValues(
|
||||||
(draw_resolution_scale_x * draw_resolution_scale_y);
|
(draw_resolution_scale_x * draw_resolution_scale_y);
|
||||||
|
|
||||||
// EDRAM pitch for ROV writing.
|
// EDRAM pitch for ROV writing.
|
||||||
if (edram_rov_used) {
|
if constexpr (edram_rov_used) {
|
||||||
// Align, then multiply by 32bpp tile size in dwords.
|
// Align, then multiply by 32bpp tile size in dwords.
|
||||||
uint32_t edram_32bpp_tile_pitch_dwords_scaled =
|
uint32_t edram_32bpp_tile_pitch_dwords_scaled =
|
||||||
((rb_surface_info.surface_pitch *
|
((rb_surface_info.surface_pitch *
|
||||||
(rb_surface_info.msaa_samples >= xenos::MsaaSamples::k4X ? 2 : 1)) +
|
(rb_surface_info.msaa_samples >= xenos::MsaaSamples::k4X ? 2 : 1)) +
|
||||||
(xenos::kEdramTileWidthSamples - 1)) /
|
(xenos::kEdramTileWidthSamples - 1)) /
|
||||||
xenos::kEdramTileWidthSamples * edram_tile_dwords_scaled;
|
xenos::kEdramTileWidthSamples * edram_tile_dwords_scaled;
|
||||||
dirty |= system_constants_.edram_32bpp_tile_pitch_dwords_scaled !=
|
update_dirty_uint32_cmp(
|
||||||
edram_32bpp_tile_pitch_dwords_scaled;
|
system_constants_.edram_32bpp_tile_pitch_dwords_scaled,
|
||||||
|
edram_32bpp_tile_pitch_dwords_scaled);
|
||||||
system_constants_.edram_32bpp_tile_pitch_dwords_scaled =
|
system_constants_.edram_32bpp_tile_pitch_dwords_scaled =
|
||||||
edram_32bpp_tile_pitch_dwords_scaled;
|
edram_32bpp_tile_pitch_dwords_scaled;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#if XE_ARCH_AMD64 == 1
|
||||||
|
__m128i rt_clamp_dirty = _mm_set1_epi8((char)0xff);
|
||||||
|
#endif
|
||||||
// Color exponent bias and ROV render target writing.
|
// Color exponent bias and ROV render target writing.
|
||||||
for (uint32_t i = 0; i < 4; ++i) {
|
for (uint32_t i = 0; i < 4; ++i) {
|
||||||
reg::RB_COLOR_INFO color_info = color_infos[i];
|
reg::RB_COLOR_INFO color_info = color_infos[i];
|
||||||
|
@ -3695,47 +3958,80 @@ void D3D12CommandProcessor::UpdateSystemConstantValues(
|
||||||
float color_exp_bias_scale;
|
float color_exp_bias_scale;
|
||||||
*reinterpret_cast<int32_t*>(&color_exp_bias_scale) =
|
*reinterpret_cast<int32_t*>(&color_exp_bias_scale) =
|
||||||
0x3F800000 + (color_exp_bias << 23);
|
0x3F800000 + (color_exp_bias << 23);
|
||||||
dirty |= system_constants_.color_exp_bias[i] != color_exp_bias_scale;
|
|
||||||
|
update_dirty_floatmask(system_constants_.color_exp_bias[i],
|
||||||
|
color_exp_bias_scale);
|
||||||
|
|
||||||
system_constants_.color_exp_bias[i] = color_exp_bias_scale;
|
system_constants_.color_exp_bias[i] = color_exp_bias_scale;
|
||||||
if (edram_rov_used) {
|
if constexpr (edram_rov_used) {
|
||||||
dirty |=
|
update_dirty_uint32_cmp(system_constants_.edram_rt_keep_mask[i][0],
|
||||||
system_constants_.edram_rt_keep_mask[i][0] != rt_keep_masks[i][0];
|
rt_keep_masks[i][0]);
|
||||||
|
|
||||||
system_constants_.edram_rt_keep_mask[i][0] = rt_keep_masks[i][0];
|
system_constants_.edram_rt_keep_mask[i][0] = rt_keep_masks[i][0];
|
||||||
dirty |=
|
|
||||||
system_constants_.edram_rt_keep_mask[i][1] != rt_keep_masks[i][1];
|
update_dirty_uint32_cmp(system_constants_.edram_rt_keep_mask[i][1],
|
||||||
|
rt_keep_masks[i][1]);
|
||||||
|
|
||||||
system_constants_.edram_rt_keep_mask[i][1] = rt_keep_masks[i][1];
|
system_constants_.edram_rt_keep_mask[i][1] = rt_keep_masks[i][1];
|
||||||
if (rt_keep_masks[i][0] != UINT32_MAX ||
|
if (rt_keep_masks[i][0] != UINT32_MAX ||
|
||||||
rt_keep_masks[i][1] != UINT32_MAX) {
|
rt_keep_masks[i][1] != UINT32_MAX) {
|
||||||
uint32_t rt_base_dwords_scaled =
|
uint32_t rt_base_dwords_scaled =
|
||||||
color_info.color_base * edram_tile_dwords_scaled;
|
color_info.color_base * edram_tile_dwords_scaled;
|
||||||
dirty |= system_constants_.edram_rt_base_dwords_scaled[i] !=
|
update_dirty_uint32_cmp(
|
||||||
rt_base_dwords_scaled;
|
system_constants_.edram_rt_base_dwords_scaled[i],
|
||||||
|
rt_base_dwords_scaled);
|
||||||
system_constants_.edram_rt_base_dwords_scaled[i] =
|
system_constants_.edram_rt_base_dwords_scaled[i] =
|
||||||
rt_base_dwords_scaled;
|
rt_base_dwords_scaled;
|
||||||
uint32_t format_flags = DxbcShaderTranslator::ROV_AddColorFormatFlags(
|
uint32_t format_flags = DxbcShaderTranslator::ROV_AddColorFormatFlags(
|
||||||
color_info.color_format);
|
color_info.color_format);
|
||||||
dirty |= system_constants_.edram_rt_format_flags[i] != format_flags;
|
update_dirty_uint32_cmp(system_constants_.edram_rt_format_flags[i],
|
||||||
|
format_flags);
|
||||||
|
|
||||||
system_constants_.edram_rt_format_flags[i] = format_flags;
|
system_constants_.edram_rt_format_flags[i] = format_flags;
|
||||||
// Can't do float comparisons here because NaNs would result in always
|
// Can't do float comparisons here because NaNs would result in always
|
||||||
// setting the dirty flag.
|
// setting the dirty flag.
|
||||||
|
|
||||||
|
#if XE_ARCH_AMD64 == 1
|
||||||
|
|
||||||
|
__m128i edram_rt_clamp_loaded = _mm_loadu_si128(
|
||||||
|
(const __m128i*)&system_constants_.edram_rt_clamp[i]);
|
||||||
|
__m128i rt_clamp_loaded = _mm_loadu_si128((const __m128i*)&rt_clamp[i]);
|
||||||
|
|
||||||
|
rt_clamp_dirty = _mm_and_si128(
|
||||||
|
rt_clamp_dirty,
|
||||||
|
_mm_cmpeq_epi8(edram_rt_clamp_loaded, rt_clamp_loaded));
|
||||||
|
_mm_storeu_si128((__m128i*)&system_constants_.edram_rt_clamp[i],
|
||||||
|
rt_clamp_loaded);
|
||||||
|
#else
|
||||||
dirty |= std::memcmp(system_constants_.edram_rt_clamp[i], rt_clamp[i],
|
dirty |= std::memcmp(system_constants_.edram_rt_clamp[i], rt_clamp[i],
|
||||||
4 * sizeof(float)) != 0;
|
4 * sizeof(float)) != 0;
|
||||||
std::memcpy(system_constants_.edram_rt_clamp[i], rt_clamp[i],
|
std::memcpy(system_constants_.edram_rt_clamp[i], rt_clamp[i],
|
||||||
4 * sizeof(float));
|
4 * sizeof(float));
|
||||||
|
|
||||||
|
#endif
|
||||||
uint32_t blend_factors_ops =
|
uint32_t blend_factors_ops =
|
||||||
regs[reg::RB_BLENDCONTROL::rt_register_indices[i]].u32 & 0x1FFF1FFF;
|
regs[reg::RB_BLENDCONTROL::rt_register_indices[i]].u32 & 0x1FFF1FFF;
|
||||||
dirty |= system_constants_.edram_rt_blend_factors_ops[i] !=
|
|
||||||
blend_factors_ops;
|
update_dirty_uint32_cmp(system_constants_.edram_rt_blend_factors_ops[i],
|
||||||
|
blend_factors_ops);
|
||||||
|
|
||||||
system_constants_.edram_rt_blend_factors_ops[i] = blend_factors_ops;
|
system_constants_.edram_rt_blend_factors_ops[i] = blend_factors_ops;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
#if XE_ARCH_AMD64 == 1
|
||||||
|
if constexpr (edram_rov_used) {
|
||||||
|
update_dirty_uint32_cmp(
|
||||||
|
static_cast<uint32_t>(_mm_movemask_epi8(rt_clamp_dirty)), 0xFFFFU);
|
||||||
|
}
|
||||||
|
|
||||||
if (edram_rov_used) {
|
#endif
|
||||||
|
if constexpr (edram_rov_used) {
|
||||||
uint32_t depth_base_dwords_scaled =
|
uint32_t depth_base_dwords_scaled =
|
||||||
rb_depth_info.depth_base * edram_tile_dwords_scaled;
|
rb_depth_info.depth_base * edram_tile_dwords_scaled;
|
||||||
dirty |= system_constants_.edram_depth_base_dwords_scaled !=
|
update_dirty_uint32_cmp(system_constants_.edram_depth_base_dwords_scaled,
|
||||||
depth_base_dwords_scaled;
|
depth_base_dwords_scaled);
|
||||||
|
|
||||||
system_constants_.edram_depth_base_dwords_scaled = depth_base_dwords_scaled;
|
system_constants_.edram_depth_base_dwords_scaled = depth_base_dwords_scaled;
|
||||||
|
|
||||||
// For non-polygons, front polygon offset is used, and it's enabled if
|
// For non-polygons, front polygon offset is used, and it's enabled if
|
||||||
|
@ -3775,55 +4071,59 @@ void D3D12CommandProcessor::UpdateSystemConstantValues(
|
||||||
std::max(draw_resolution_scale_x, draw_resolution_scale_y);
|
std::max(draw_resolution_scale_x, draw_resolution_scale_y);
|
||||||
poly_offset_front_scale *= poly_offset_scale_factor;
|
poly_offset_front_scale *= poly_offset_scale_factor;
|
||||||
poly_offset_back_scale *= poly_offset_scale_factor;
|
poly_offset_back_scale *= poly_offset_scale_factor;
|
||||||
dirty |= system_constants_.edram_poly_offset_front_scale !=
|
update_dirty_floatmask(system_constants_.edram_poly_offset_front_scale,
|
||||||
poly_offset_front_scale;
|
poly_offset_front_scale);
|
||||||
|
|
||||||
system_constants_.edram_poly_offset_front_scale = poly_offset_front_scale;
|
system_constants_.edram_poly_offset_front_scale = poly_offset_front_scale;
|
||||||
dirty |= system_constants_.edram_poly_offset_front_offset !=
|
|
||||||
poly_offset_front_offset;
|
update_dirty_floatmask(system_constants_.edram_poly_offset_front_offset,
|
||||||
|
poly_offset_front_offset);
|
||||||
|
|
||||||
system_constants_.edram_poly_offset_front_offset = poly_offset_front_offset;
|
system_constants_.edram_poly_offset_front_offset = poly_offset_front_offset;
|
||||||
dirty |= system_constants_.edram_poly_offset_back_scale !=
|
update_dirty_floatmask(system_constants_.edram_poly_offset_back_scale,
|
||||||
poly_offset_back_scale;
|
poly_offset_back_scale);
|
||||||
system_constants_.edram_poly_offset_back_scale = poly_offset_back_scale;
|
system_constants_.edram_poly_offset_back_scale = poly_offset_back_scale;
|
||||||
dirty |= system_constants_.edram_poly_offset_back_offset !=
|
update_dirty_floatmask(system_constants_.edram_poly_offset_back_offset,
|
||||||
poly_offset_back_offset;
|
poly_offset_back_offset);
|
||||||
system_constants_.edram_poly_offset_back_offset = poly_offset_back_offset;
|
system_constants_.edram_poly_offset_back_offset = poly_offset_back_offset;
|
||||||
|
|
||||||
if (depth_stencil_enabled && normalized_depth_control.stencil_enable) {
|
if (depth_stencil_enabled && normalized_depth_control.stencil_enable) {
|
||||||
dirty |= system_constants_.edram_stencil_front_reference !=
|
update_dirty_uint32_cmp(system_constants_.edram_stencil_front_reference,
|
||||||
rb_stencilrefmask.stencilref;
|
rb_stencilrefmask.stencilref);
|
||||||
|
|
||||||
system_constants_.edram_stencil_front_reference =
|
system_constants_.edram_stencil_front_reference =
|
||||||
rb_stencilrefmask.stencilref;
|
rb_stencilrefmask.stencilref;
|
||||||
dirty |= system_constants_.edram_stencil_front_read_mask !=
|
update_dirty_uint32_cmp(system_constants_.edram_stencil_front_read_mask,
|
||||||
rb_stencilrefmask.stencilmask;
|
rb_stencilrefmask.stencilmask);
|
||||||
system_constants_.edram_stencil_front_read_mask =
|
system_constants_.edram_stencil_front_read_mask =
|
||||||
rb_stencilrefmask.stencilmask;
|
rb_stencilrefmask.stencilmask;
|
||||||
dirty |= system_constants_.edram_stencil_front_write_mask !=
|
update_dirty_uint32_cmp(system_constants_.edram_stencil_front_write_mask,
|
||||||
rb_stencilrefmask.stencilwritemask;
|
rb_stencilrefmask.stencilwritemask);
|
||||||
system_constants_.edram_stencil_front_write_mask =
|
system_constants_.edram_stencil_front_write_mask =
|
||||||
rb_stencilrefmask.stencilwritemask;
|
rb_stencilrefmask.stencilwritemask;
|
||||||
uint32_t stencil_func_ops =
|
uint32_t stencil_func_ops =
|
||||||
(normalized_depth_control.value >> 8) & ((1 << 12) - 1);
|
(normalized_depth_control.value >> 8) & ((1 << 12) - 1);
|
||||||
dirty |=
|
update_dirty_uint32_cmp(system_constants_.edram_stencil_front_func_ops,
|
||||||
system_constants_.edram_stencil_front_func_ops != stencil_func_ops;
|
stencil_func_ops);
|
||||||
system_constants_.edram_stencil_front_func_ops = stencil_func_ops;
|
system_constants_.edram_stencil_front_func_ops = stencil_func_ops;
|
||||||
|
|
||||||
if (primitive_polygonal && normalized_depth_control.backface_enable) {
|
if (primitive_polygonal && normalized_depth_control.backface_enable) {
|
||||||
dirty |= system_constants_.edram_stencil_back_reference !=
|
update_dirty_uint32_cmp(system_constants_.edram_stencil_back_reference,
|
||||||
rb_stencilrefmask_bf.stencilref;
|
rb_stencilrefmask_bf.stencilref);
|
||||||
system_constants_.edram_stencil_back_reference =
|
system_constants_.edram_stencil_back_reference =
|
||||||
rb_stencilrefmask_bf.stencilref;
|
rb_stencilrefmask_bf.stencilref;
|
||||||
dirty |= system_constants_.edram_stencil_back_read_mask !=
|
update_dirty_uint32_cmp(system_constants_.edram_stencil_back_read_mask,
|
||||||
rb_stencilrefmask_bf.stencilmask;
|
rb_stencilrefmask_bf.stencilmask);
|
||||||
system_constants_.edram_stencil_back_read_mask =
|
system_constants_.edram_stencil_back_read_mask =
|
||||||
rb_stencilrefmask_bf.stencilmask;
|
rb_stencilrefmask_bf.stencilmask;
|
||||||
dirty |= system_constants_.edram_stencil_back_write_mask !=
|
update_dirty_uint32_cmp(system_constants_.edram_stencil_back_write_mask,
|
||||||
rb_stencilrefmask_bf.stencilwritemask;
|
rb_stencilrefmask_bf.stencilwritemask);
|
||||||
system_constants_.edram_stencil_back_write_mask =
|
system_constants_.edram_stencil_back_write_mask =
|
||||||
rb_stencilrefmask_bf.stencilwritemask;
|
rb_stencilrefmask_bf.stencilwritemask;
|
||||||
uint32_t stencil_func_ops_bf =
|
uint32_t stencil_func_ops_bf =
|
||||||
(normalized_depth_control.value >> 20) & ((1 << 12) - 1);
|
(normalized_depth_control.value >> 20) & ((1 << 12) - 1);
|
||||||
dirty |= system_constants_.edram_stencil_back_func_ops !=
|
update_dirty_uint32_cmp(system_constants_.edram_stencil_back_func_ops,
|
||||||
stencil_func_ops_bf;
|
stencil_func_ops_bf);
|
||||||
system_constants_.edram_stencil_back_func_ops = stencil_func_ops_bf;
|
system_constants_.edram_stencil_back_func_ops = stencil_func_ops_bf;
|
||||||
} else {
|
} else {
|
||||||
dirty |= std::memcmp(system_constants_.edram_stencil_back,
|
dirty |= std::memcmp(system_constants_.edram_stencil_back,
|
||||||
|
@ -3834,28 +4134,69 @@ void D3D12CommandProcessor::UpdateSystemConstantValues(
|
||||||
4 * sizeof(uint32_t));
|
4 * sizeof(uint32_t));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
update_dirty_floatmask(system_constants_.edram_blend_constant[0],
|
||||||
|
regs[XE_GPU_REG_RB_BLEND_RED].f32);
|
||||||
|
|
||||||
dirty |= system_constants_.edram_blend_constant[0] !=
|
|
||||||
regs[XE_GPU_REG_RB_BLEND_RED].f32;
|
|
||||||
system_constants_.edram_blend_constant[0] =
|
system_constants_.edram_blend_constant[0] =
|
||||||
regs[XE_GPU_REG_RB_BLEND_RED].f32;
|
regs[XE_GPU_REG_RB_BLEND_RED].f32;
|
||||||
dirty |= system_constants_.edram_blend_constant[1] !=
|
|
||||||
regs[XE_GPU_REG_RB_BLEND_GREEN].f32;
|
update_dirty_floatmask(system_constants_.edram_blend_constant[1],
|
||||||
|
regs[XE_GPU_REG_RB_BLEND_GREEN].f32);
|
||||||
|
|
||||||
system_constants_.edram_blend_constant[1] =
|
system_constants_.edram_blend_constant[1] =
|
||||||
regs[XE_GPU_REG_RB_BLEND_GREEN].f32;
|
regs[XE_GPU_REG_RB_BLEND_GREEN].f32;
|
||||||
dirty |= system_constants_.edram_blend_constant[2] !=
|
update_dirty_floatmask(system_constants_.edram_blend_constant[2],
|
||||||
regs[XE_GPU_REG_RB_BLEND_BLUE].f32;
|
regs[XE_GPU_REG_RB_BLEND_BLUE].f32);
|
||||||
|
|
||||||
system_constants_.edram_blend_constant[2] =
|
system_constants_.edram_blend_constant[2] =
|
||||||
regs[XE_GPU_REG_RB_BLEND_BLUE].f32;
|
regs[XE_GPU_REG_RB_BLEND_BLUE].f32;
|
||||||
dirty |= system_constants_.edram_blend_constant[3] !=
|
update_dirty_floatmask(system_constants_.edram_blend_constant[3],
|
||||||
regs[XE_GPU_REG_RB_BLEND_ALPHA].f32;
|
regs[XE_GPU_REG_RB_BLEND_ALPHA].f32);
|
||||||
|
|
||||||
system_constants_.edram_blend_constant[3] =
|
system_constants_.edram_blend_constant[3] =
|
||||||
regs[XE_GPU_REG_RB_BLEND_ALPHA].f32;
|
regs[XE_GPU_REG_RB_BLEND_ALPHA].f32;
|
||||||
}
|
}
|
||||||
|
dirty |= ArchFloatMaskSignbit(dirty_float_mask);
|
||||||
|
|
||||||
cbuffer_binding_system_.up_to_date &= !dirty;
|
cbuffer_binding_system_.up_to_date &= !dirty;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void D3D12CommandProcessor::UpdateSystemConstantValues(
|
||||||
|
bool shared_memory_is_uav, bool primitive_polygonal,
|
||||||
|
uint32_t line_loop_closing_index, xenos::Endian index_endian,
|
||||||
|
const draw_util::ViewportInfo& viewport_info, uint32_t used_texture_mask,
|
||||||
|
reg::RB_DEPTHCONTROL normalized_depth_control,
|
||||||
|
uint32_t normalized_color_mask) {
|
||||||
|
bool edram_rov_used = render_target_cache_->GetPath() ==
|
||||||
|
RenderTargetCache::Path::kPixelShaderInterlock;
|
||||||
|
|
||||||
|
if (!edram_rov_used) {
|
||||||
|
if (primitive_polygonal) {
|
||||||
|
UpdateSystemConstantValues_Impl<true, false>(
|
||||||
|
shared_memory_is_uav, line_loop_closing_index, index_endian,
|
||||||
|
viewport_info, used_texture_mask, normalized_depth_control,
|
||||||
|
normalized_color_mask);
|
||||||
|
} else {
|
||||||
|
UpdateSystemConstantValues_Impl<false, false>(
|
||||||
|
shared_memory_is_uav, line_loop_closing_index, index_endian,
|
||||||
|
viewport_info, used_texture_mask, normalized_depth_control,
|
||||||
|
normalized_color_mask);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
if (primitive_polygonal) {
|
||||||
|
UpdateSystemConstantValues_Impl<true, true>(
|
||||||
|
shared_memory_is_uav, line_loop_closing_index, index_endian,
|
||||||
|
viewport_info, used_texture_mask, normalized_depth_control,
|
||||||
|
normalized_color_mask);
|
||||||
|
} else {
|
||||||
|
UpdateSystemConstantValues_Impl<false, true>(
|
||||||
|
shared_memory_is_uav, line_loop_closing_index, index_endian,
|
||||||
|
viewport_info, used_texture_mask, normalized_depth_control,
|
||||||
|
normalized_color_mask);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
bool D3D12CommandProcessor::UpdateBindings(
|
bool D3D12CommandProcessor::UpdateBindings(
|
||||||
const D3D12Shader* vertex_shader, const D3D12Shader* pixel_shader,
|
const D3D12Shader* vertex_shader, const D3D12Shader* pixel_shader,
|
||||||
ID3D12RootSignature* root_signature) {
|
ID3D12RootSignature* root_signature) {
|
||||||
|
@ -4081,6 +4422,9 @@ bool D3D12CommandProcessor::UpdateBindings(
|
||||||
current_samplers_vertex_.resize(
|
current_samplers_vertex_.resize(
|
||||||
std::max(current_samplers_vertex_.size(), sampler_count_vertex));
|
std::max(current_samplers_vertex_.size(), sampler_count_vertex));
|
||||||
for (size_t i = 0; i < sampler_count_vertex; ++i) {
|
for (size_t i = 0; i < sampler_count_vertex; ++i) {
|
||||||
|
if (i + 2 < sampler_count_vertex) {
|
||||||
|
texture_cache_->PrefetchSamplerParameters(samplers_vertex[i + 2]);
|
||||||
|
}
|
||||||
D3D12TextureCache::SamplerParameters parameters =
|
D3D12TextureCache::SamplerParameters parameters =
|
||||||
texture_cache_->GetSamplerParameters(samplers_vertex[i]);
|
texture_cache_->GetSamplerParameters(samplers_vertex[i]);
|
||||||
if (current_samplers_vertex_[i] != parameters) {
|
if (current_samplers_vertex_[i] != parameters) {
|
||||||
|
@ -4112,9 +4456,15 @@ bool D3D12CommandProcessor::UpdateBindings(
|
||||||
}
|
}
|
||||||
current_samplers_pixel_.resize(std::max(current_samplers_pixel_.size(),
|
current_samplers_pixel_.resize(std::max(current_samplers_pixel_.size(),
|
||||||
size_t(sampler_count_pixel)));
|
size_t(sampler_count_pixel)));
|
||||||
|
const auto samplers_pixel_derefed = samplers_pixel->data();
|
||||||
|
|
||||||
for (uint32_t i = 0; i < sampler_count_pixel; ++i) {
|
for (uint32_t i = 0; i < sampler_count_pixel; ++i) {
|
||||||
|
if (i + 2 < sampler_count_pixel) {
|
||||||
|
texture_cache_->PrefetchSamplerParameters(
|
||||||
|
samplers_pixel_derefed[i + 2]);
|
||||||
|
}
|
||||||
D3D12TextureCache::SamplerParameters parameters =
|
D3D12TextureCache::SamplerParameters parameters =
|
||||||
texture_cache_->GetSamplerParameters((*samplers_pixel)[i]);
|
texture_cache_->GetSamplerParameters(samplers_pixel_derefed[i]);
|
||||||
if (current_samplers_pixel_[i] != parameters) {
|
if (current_samplers_pixel_[i] != parameters) {
|
||||||
current_samplers_pixel_[i] = parameters;
|
current_samplers_pixel_[i] = parameters;
|
||||||
cbuffer_binding_descriptor_indices_pixel_.up_to_date = false;
|
cbuffer_binding_descriptor_indices_pixel_.up_to_date = false;
|
||||||
|
@ -4293,6 +4643,10 @@ bool D3D12CommandProcessor::UpdateBindings(
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
for (size_t i = 0; i < texture_count_vertex; ++i) {
|
for (size_t i = 0; i < texture_count_vertex; ++i) {
|
||||||
|
if (i + 8 < texture_count_vertex) {
|
||||||
|
texture_cache_->PrefetchTextureBinding<swcache::PrefetchTag::Level2>(
|
||||||
|
textures_vertex[i + 8].fetch_constant);
|
||||||
|
}
|
||||||
const D3D12Shader::TextureBinding& texture = textures_vertex[i];
|
const D3D12Shader::TextureBinding& texture = textures_vertex[i];
|
||||||
descriptor_indices[texture.bindless_descriptor_index] =
|
descriptor_indices[texture.bindless_descriptor_index] =
|
||||||
texture_cache_->GetActiveTextureBindlessSRVIndex(texture) -
|
texture_cache_->GetActiveTextureBindlessSRVIndex(texture) -
|
||||||
|
@ -4740,6 +5094,9 @@ void D3D12CommandProcessor::WriteGammaRampSRV(
|
||||||
device->CreateShaderResourceView(gamma_ramp_buffer_.Get(), &desc, handle);
|
device->CreateShaderResourceView(gamma_ramp_buffer_.Get(), &desc, handle);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#define COMMAND_PROCESSOR D3D12CommandProcessor
|
||||||
|
|
||||||
|
#include "../pm4_command_processor_implement.h"
|
||||||
} // namespace d3d12
|
} // namespace d3d12
|
||||||
} // namespace gpu
|
} // namespace gpu
|
||||||
} // namespace xe
|
} // namespace xe
|
||||||
|
|
|
@ -1,4 +1,5 @@
|
||||||
/**
|
/**
|
||||||
|
/**
|
||||||
/**
|
/**
|
||||||
******************************************************************************
|
******************************************************************************
|
||||||
* Xenia : Xbox 360 Emulator Research Project *
|
* Xenia : Xbox 360 Emulator Research Project *
|
||||||
|
@ -35,6 +36,7 @@
|
||||||
#include "xenia/gpu/registers.h"
|
#include "xenia/gpu/registers.h"
|
||||||
#include "xenia/gpu/xenos.h"
|
#include "xenia/gpu/xenos.h"
|
||||||
#include "xenia/kernel/kernel_state.h"
|
#include "xenia/kernel/kernel_state.h"
|
||||||
|
#include "xenia/kernel/user_module.h"
|
||||||
#include "xenia/ui/d3d12/d3d12_descriptor_heap_pool.h"
|
#include "xenia/ui/d3d12/d3d12_descriptor_heap_pool.h"
|
||||||
#include "xenia/ui/d3d12/d3d12_provider.h"
|
#include "xenia/ui/d3d12/d3d12_provider.h"
|
||||||
#include "xenia/ui/d3d12/d3d12_upload_buffer_pool.h"
|
#include "xenia/ui/d3d12/d3d12_upload_buffer_pool.h"
|
||||||
|
@ -46,6 +48,7 @@ namespace d3d12 {
|
||||||
|
|
||||||
class D3D12CommandProcessor final : public CommandProcessor {
|
class D3D12CommandProcessor final : public CommandProcessor {
|
||||||
public:
|
public:
|
||||||
|
#include "../pm4_command_processor_declare.h"
|
||||||
explicit D3D12CommandProcessor(D3D12GraphicsSystem* graphics_system,
|
explicit D3D12CommandProcessor(D3D12GraphicsSystem* graphics_system,
|
||||||
kernel::KernelState* kernel_state);
|
kernel::KernelState* kernel_state);
|
||||||
~D3D12CommandProcessor();
|
~D3D12CommandProcessor();
|
||||||
|
@ -205,22 +208,70 @@ class D3D12CommandProcessor final : public CommandProcessor {
|
||||||
protected:
|
protected:
|
||||||
bool SetupContext() override;
|
bool SetupContext() override;
|
||||||
void ShutdownContext() override;
|
void ShutdownContext() override;
|
||||||
XE_FORCEINLINE
|
|
||||||
void WriteRegister(uint32_t index, uint32_t value) override;
|
void WriteRegister(uint32_t index, uint32_t value) override;
|
||||||
XE_FORCEINLINE
|
XE_FORCEINLINE
|
||||||
virtual void WriteRegistersFromMem(uint32_t start_index, uint32_t* base,
|
virtual void WriteRegistersFromMem(uint32_t start_index, uint32_t* base,
|
||||||
uint32_t num_registers) override;
|
uint32_t num_registers) override;
|
||||||
|
|
||||||
|
template <uint32_t register_lower_bound, uint32_t register_upper_bound>
|
||||||
|
XE_FORCEINLINE void WriteRegisterRangeFromMem_WithKnownBound(
|
||||||
|
uint32_t start_index, uint32_t* base, uint32_t num_registers);
|
||||||
XE_FORCEINLINE
|
XE_FORCEINLINE
|
||||||
virtual void WriteRegisterRangeFromRing(xe::RingBuffer* ring, uint32_t base,
|
virtual void WriteRegisterRangeFromRing(xe::RingBuffer* ring, uint32_t base,
|
||||||
uint32_t num_registers) override;
|
uint32_t num_registers) override;
|
||||||
|
template <uint32_t register_lower_bound, uint32_t register_upper_bound>
|
||||||
|
XE_FORCEINLINE void WriteRegisterRangeFromRing_WithKnownBound(
|
||||||
|
xe::RingBuffer* ring, uint32_t base, uint32_t num_registers);
|
||||||
|
|
||||||
XE_NOINLINE
|
XE_NOINLINE
|
||||||
void WriteRegisterRangeFromRing_WraparoundCase(xe::RingBuffer* ring,
|
void WriteRegisterRangeFromRing_WraparoundCase(xe::RingBuffer* ring,
|
||||||
uint32_t base,
|
uint32_t base,
|
||||||
uint32_t num_registers);
|
uint32_t num_registers);
|
||||||
XE_FORCEINLINE
|
XE_NOINLINE
|
||||||
virtual void WriteOneRegisterFromRing(xe::RingBuffer* ring, uint32_t base,
|
virtual void WriteOneRegisterFromRing(uint32_t base,
|
||||||
uint32_t num_times) override;
|
uint32_t num_times) override;
|
||||||
|
|
||||||
|
XE_FORCEINLINE
|
||||||
|
void WriteALURangeFromRing(xe::RingBuffer* ring, uint32_t base,
|
||||||
|
uint32_t num_times);
|
||||||
|
|
||||||
|
XE_FORCEINLINE
|
||||||
|
void WriteFetchRangeFromRing(xe::RingBuffer* ring, uint32_t base,
|
||||||
|
uint32_t num_times);
|
||||||
|
|
||||||
|
XE_FORCEINLINE
|
||||||
|
void WriteBoolRangeFromRing(xe::RingBuffer* ring, uint32_t base,
|
||||||
|
uint32_t num_times);
|
||||||
|
|
||||||
|
XE_FORCEINLINE
|
||||||
|
void WriteLoopRangeFromRing(xe::RingBuffer* ring, uint32_t base,
|
||||||
|
uint32_t num_times);
|
||||||
|
|
||||||
|
XE_FORCEINLINE
|
||||||
|
void WriteREGISTERSRangeFromRing(xe::RingBuffer* ring, uint32_t base,
|
||||||
|
uint32_t num_times);
|
||||||
|
|
||||||
|
XE_FORCEINLINE
|
||||||
|
void WriteALURangeFromMem(uint32_t start_index, uint32_t* base,
|
||||||
|
uint32_t num_registers);
|
||||||
|
|
||||||
|
XE_FORCEINLINE
|
||||||
|
void WriteFetchRangeFromMem(uint32_t start_index, uint32_t* base,
|
||||||
|
uint32_t num_registers);
|
||||||
|
|
||||||
|
XE_FORCEINLINE
|
||||||
|
void WriteBoolRangeFromMem(uint32_t start_index, uint32_t* base,
|
||||||
|
uint32_t num_registers);
|
||||||
|
|
||||||
|
XE_FORCEINLINE
|
||||||
|
void WriteLoopRangeFromMem(uint32_t start_index, uint32_t* base,
|
||||||
|
uint32_t num_registers);
|
||||||
|
|
||||||
|
XE_FORCEINLINE
|
||||||
|
void WriteREGISTERSRangeFromMem(uint32_t start_index, uint32_t* base,
|
||||||
|
uint32_t num_registers);
|
||||||
|
|
||||||
void OnGammaRamp256EntryTableValueWritten() override;
|
void OnGammaRamp256EntryTableValueWritten() override;
|
||||||
void OnGammaRampPWLValueWritten() override;
|
void OnGammaRampPWLValueWritten() override;
|
||||||
|
|
||||||
|
@ -367,6 +418,14 @@ class D3D12CommandProcessor final : public CommandProcessor {
|
||||||
const draw_util::Scissor& scissor,
|
const draw_util::Scissor& scissor,
|
||||||
bool primitive_polygonal,
|
bool primitive_polygonal,
|
||||||
reg::RB_DEPTHCONTROL normalized_depth_control);
|
reg::RB_DEPTHCONTROL normalized_depth_control);
|
||||||
|
|
||||||
|
template <bool primitive_polygonal, bool edram_rov_used>
|
||||||
|
XE_NOINLINE void UpdateSystemConstantValues_Impl(
|
||||||
|
bool shared_memory_is_uav, uint32_t line_loop_closing_index,
|
||||||
|
xenos::Endian index_endian, const draw_util::ViewportInfo& viewport_info,
|
||||||
|
uint32_t used_texture_mask, reg::RB_DEPTHCONTROL normalized_depth_control,
|
||||||
|
uint32_t normalized_color_mask);
|
||||||
|
|
||||||
void UpdateSystemConstantValues(bool shared_memory_is_uav,
|
void UpdateSystemConstantValues(bool shared_memory_is_uav,
|
||||||
bool primitive_polygonal,
|
bool primitive_polygonal,
|
||||||
uint32_t line_loop_closing_index,
|
uint32_t line_loop_closing_index,
|
||||||
|
|
|
@ -0,0 +1,122 @@
|
||||||
|
#pragma once
|
||||||
|
// requires windows.h
|
||||||
|
#include <stdint.h>
|
||||||
|
|
||||||
|
namespace lightweight_nvapi {
|
||||||
|
|
||||||
|
using nvstatus_t = int;
|
||||||
|
|
||||||
|
using nvintfid_t = unsigned int;
|
||||||
|
|
||||||
|
#ifndef LIGHTWEIGHT_NVAPI_EXCLUDE_D3D12
|
||||||
|
constexpr nvintfid_t id_NvAPI_D3D12_QueryCpuVisibleVidmem = 0x26322BC3;
|
||||||
|
|
||||||
|
using cb_NvAPI_D3D12_QueryCpuVisibleVidmem = nvstatus_t (*)(
|
||||||
|
ID3D12Device* pDevice, uint64_t* pTotalBytes, uint64_t* pFreeBytes);
|
||||||
|
|
||||||
|
constexpr nvintfid_t id_NvAPI_D3D12_UseDriverHeapPriorities = 0xF0D978A8;
|
||||||
|
using cb_NvAPI_D3D12_UseDriverHeapPriorities =
|
||||||
|
nvstatus_t (*)(ID3D12Device* pDevice);
|
||||||
|
//! NVIDIA-specific resource creation flags, stored in
//! NV_RESOURCE_PARAMS::NVResourceFlags. Values are individual bits.
enum NV_D3D12_RESOURCE_FLAGS {
  //! No NVIDIA-specific behavior requested.
  NV_D3D12_RESOURCE_FLAG_NONE = 0,
  //! Create HTEX texture.
  NV_D3D12_RESOURCE_FLAG_HTEX = 1 << 0,
  //! Hint to create resource in cpuvisible vidmem.
  NV_D3D12_RESOURCE_FLAG_CPUVISIBLE_VIDMEM = 1 << 1,
};
|
||||||
|
|
||||||
|
struct NV_RESOURCE_PARAMS {
|
||||||
|
uint32_t version; //!< Version of structure. Must always be first member
|
||||||
|
NV_D3D12_RESOURCE_FLAGS
|
||||||
|
NVResourceFlags; //!< Additional NV specific flags (set the
|
||||||
|
//!< NV_D3D12_RESOURCE_FLAG_HTEX bit to create HTEX
|
||||||
|
//!< texture)
|
||||||
|
};
|
||||||
|
|
||||||
|
using cb_NvAPI_D3D12_CreateCommittedResource = nvstatus_t (*)(
|
||||||
|
ID3D12Device* pDevice, const D3D12_HEAP_PROPERTIES* pHeapProperties,
|
||||||
|
D3D12_HEAP_FLAGS HeapFlags, const D3D12_RESOURCE_DESC* pDesc,
|
||||||
|
D3D12_RESOURCE_STATES InitialState,
|
||||||
|
const D3D12_CLEAR_VALUE* pOptimizedClearValue,
|
||||||
|
const NV_RESOURCE_PARAMS* pNVResourceParams, REFIID riid,
|
||||||
|
void** ppvResource, bool* pSupported);
|
||||||
|
constexpr nvintfid_t id_NvAPI_D3D12_CreateCommittedResource = 0x27E98AEu;
|
||||||
|
#endif
|
||||||
|
class nvapi_state_t {
|
||||||
|
HMODULE nvapi64_;
|
||||||
|
void* (*queryinterface_)(unsigned int intfid);
|
||||||
|
bool available_;
|
||||||
|
bool init_ptrs();
|
||||||
|
|
||||||
|
bool call_init_interface();
|
||||||
|
void call_deinit_interface();
|
||||||
|
|
||||||
|
public:
|
||||||
|
nvapi_state_t() : nvapi64_(LoadLibraryA("nvapi64.dll")), available_(false) {
|
||||||
|
available_ = init_ptrs();
|
||||||
|
}
|
||||||
|
~nvapi_state_t();
|
||||||
|
template <typename T>
|
||||||
|
T* query_interface(unsigned int intfid) {
|
||||||
|
if (queryinterface_ == nullptr) {
|
||||||
|
return nullptr;
|
||||||
|
}
|
||||||
|
return reinterpret_cast<T*>(queryinterface_(intfid));
|
||||||
|
}
|
||||||
|
|
||||||
|
bool is_available() const { return available_; }
|
||||||
|
};
|
||||||
|
inline bool nvapi_state_t::call_init_interface() {
|
||||||
|
int result = -1;
|
||||||
|
auto initInterfaceEx = query_interface<int(int)>(0xAD298D3F);
|
||||||
|
if (!initInterfaceEx) {
|
||||||
|
auto initInterface = query_interface<int()>(0x150E828u);
|
||||||
|
if (initInterface) {
|
||||||
|
result = initInterface();
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
result = initInterfaceEx(0);
|
||||||
|
}
|
||||||
|
return result == 0;
|
||||||
|
}
|
||||||
|
inline void nvapi_state_t::call_deinit_interface() {
|
||||||
|
auto deinitinterfaceex = query_interface<void(int)>(0xD7C61344);
|
||||||
|
if (deinitinterfaceex) {
|
||||||
|
deinitinterfaceex(1); // or 0? im not sure what the proper value is
|
||||||
|
} else {
|
||||||
|
auto deinitinterface = query_interface<void()>(0xD22BDD7E);
|
||||||
|
if (deinitinterface) {
|
||||||
|
deinitinterface();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
inline bool nvapi_state_t::init_ptrs() {
|
||||||
|
if (!nvapi64_) return false;
|
||||||
|
queryinterface_ = reinterpret_cast<void* (*)(unsigned)>(
|
||||||
|
GetProcAddress(nvapi64_, "nvapi_QueryInterface"));
|
||||||
|
|
||||||
|
if (!queryinterface_) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
if (!call_init_interface()) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
// Deinitializes NVAPI if (and only if) initialization succeeded.
// The module handle obtained in the constructor is not released here.
inline nvapi_state_t::~nvapi_state_t() {
  if (!available_) {
    return;
  }
  call_deinit_interface();
}
|
||||||
|
inline void init_nvapi() {
|
||||||
|
/// HMODULE moddy = LoadLibraryA("nvapi64.dll");
|
||||||
|
|
||||||
|
// FARPROC quif = GetProcAddress(moddy, "nvapi_QueryInterface");
|
||||||
|
|
||||||
|
nvapi_state_t nvapi{};
|
||||||
|
|
||||||
|
auto queryvisible = nvapi.query_interface<void>(0x26322BC3);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace lightweight_nvapi
|
|
@ -108,12 +108,11 @@ bool D3D12PrimitiveProcessor::InitializeBuiltinIndexBuffer(
|
||||||
size_bytes);
|
size_bytes);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
Microsoft::WRL::ComPtr<ID3D12Resource> upload_resource;
|
Microsoft::WRL::ComPtr<ID3D12Resource> upload_resource;
|
||||||
if (FAILED(device->CreateCommittedResource(
|
if (!provider.CreateUploadResource(
|
||||||
&ui::d3d12::util::kHeapPropertiesUpload,
|
|
||||||
provider.GetHeapFlagCreateNotZeroed(), &resource_desc,
|
provider.GetHeapFlagCreateNotZeroed(), &resource_desc,
|
||||||
D3D12_RESOURCE_STATE_GENERIC_READ, nullptr,
|
D3D12_RESOURCE_STATE_GENERIC_READ, IID_PPV_ARGS(&upload_resource))) {
|
||||||
IID_PPV_ARGS(&upload_resource)))) {
|
|
||||||
XELOGE(
|
XELOGE(
|
||||||
"D3D12 primitive processor: Failed to create the built-in index "
|
"D3D12 primitive processor: Failed to create the built-in index "
|
||||||
"buffer upload resource with {} bytes",
|
"buffer upload resource with {} bytes",
|
||||||
|
|
|
@ -5492,11 +5492,19 @@ void D3D12RenderTargetCache::SetCommandListRenderTargets(
|
||||||
}
|
}
|
||||||
|
|
||||||
// Bind the render targets.
|
// Bind the render targets.
|
||||||
if (are_current_command_list_render_targets_valid_ &&
|
if (are_current_command_list_render_targets_valid_) {
|
||||||
std::memcmp(current_command_list_render_targets_,
|
// chrispy: the small memcmp doesnt get optimized by msvc
|
||||||
depth_and_color_render_targets,
|
|
||||||
sizeof(current_command_list_render_targets_))) {
|
for (unsigned i = 0;
|
||||||
|
i < sizeof(current_command_list_render_targets_) /
|
||||||
|
sizeof(current_command_list_render_targets_[0]);
|
||||||
|
++i) {
|
||||||
|
if ((const void*)current_command_list_render_targets_[i] !=
|
||||||
|
(const void*)depth_and_color_render_targets[i]) {
|
||||||
are_current_command_list_render_targets_valid_ = false;
|
are_current_command_list_render_targets_valid_ = false;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
uint32_t render_targets_are_srgb;
|
uint32_t render_targets_are_srgb;
|
||||||
if (gamma_render_target_as_srgb_) {
|
if (gamma_render_target_as_srgb_) {
|
||||||
|
|
|
@ -467,7 +467,7 @@ void D3D12TextureCache::EndFrame() {
|
||||||
XELOGE("Unsupported texture formats used in the frame:");
|
XELOGE("Unsupported texture formats used in the frame:");
|
||||||
unsupported_header_written = true;
|
unsupported_header_written = true;
|
||||||
}
|
}
|
||||||
XELOGE("* {}{}{}{}", FormatInfo::Get(xenos::TextureFormat(i))->name,
|
XELOGE("* {}{}{}{}", FormatInfo::GetName(xenos::TextureFormat(i)),
|
||||||
unsupported_features & kUnsupportedResourceBit ? " resource" : "",
|
unsupported_features & kUnsupportedResourceBit ? " resource" : "",
|
||||||
unsupported_features & kUnsupportedUnormBit ? " unsigned" : "",
|
unsupported_features & kUnsupportedUnormBit ? " unsigned" : "",
|
||||||
unsupported_features & kUnsupportedSnormBit ? " signed" : "");
|
unsupported_features & kUnsupportedSnormBit ? " signed" : "");
|
||||||
|
@ -523,12 +523,16 @@ void D3D12TextureCache::RequestTextures(uint32_t used_texture_mask) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
// chrispy: optimize this further
|
||||||
bool D3D12TextureCache::AreActiveTextureSRVKeysUpToDate(
|
bool D3D12TextureCache::AreActiveTextureSRVKeysUpToDate(
|
||||||
const TextureSRVKey* keys,
|
const TextureSRVKey* keys,
|
||||||
const D3D12Shader::TextureBinding* host_shader_bindings,
|
const D3D12Shader::TextureBinding* host_shader_bindings,
|
||||||
size_t host_shader_binding_count) const {
|
size_t host_shader_binding_count) const {
|
||||||
for (size_t i = 0; i < host_shader_binding_count; ++i) {
|
for (size_t i = 0; i < host_shader_binding_count; ++i) {
|
||||||
|
if (i + 8 < host_shader_binding_count) {
|
||||||
|
PrefetchTextureBinding<swcache::PrefetchTag::Nontemporal>(
|
||||||
|
host_shader_bindings[i + 8].fetch_constant);
|
||||||
|
}
|
||||||
const TextureSRVKey& key = keys[i];
|
const TextureSRVKey& key = keys[i];
|
||||||
const TextureBinding* binding =
|
const TextureBinding* binding =
|
||||||
GetValidTextureBinding(host_shader_bindings[i].fetch_constant);
|
GetValidTextureBinding(host_shader_bindings[i].fetch_constant);
|
||||||
|
@ -538,8 +542,9 @@ bool D3D12TextureCache::AreActiveTextureSRVKeysUpToDate(
|
||||||
}
|
}
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
if (key.key != binding->key || key.host_swizzle != binding->host_swizzle ||
|
if ((key.key != binding->key) |
|
||||||
key.swizzled_signs != binding->swizzled_signs) {
|
(key.host_swizzle != binding->host_swizzle) |
|
||||||
|
(key.swizzled_signs != binding->swizzled_signs)) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -666,8 +671,12 @@ uint32_t D3D12TextureCache::GetActiveTextureBindlessSRVIndex(
|
||||||
}
|
}
|
||||||
return descriptor_index;
|
return descriptor_index;
|
||||||
}
|
}
|
||||||
|
void D3D12TextureCache::PrefetchSamplerParameters(
|
||||||
D3D12TextureCache::SamplerParameters D3D12TextureCache::GetSamplerParameters(
|
const D3D12Shader::SamplerBinding& binding) const {
|
||||||
|
swcache::PrefetchL1(&register_file()[XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0 +
|
||||||
|
binding.fetch_constant * 6]);
|
||||||
|
}
|
||||||
|
D3D12TextureCache::SamplerParameters D3D12TextureCache::GetSamplerParameters(
|
||||||
const D3D12Shader::SamplerBinding& binding) const {
|
const D3D12Shader::SamplerBinding& binding) const {
|
||||||
const auto& regs = register_file();
|
const auto& regs = register_file();
|
||||||
const auto& fetch = regs.Get<xenos::xe_gpu_texture_fetch_t>(
|
const auto& fetch = regs.Get<xenos::xe_gpu_texture_fetch_t>(
|
||||||
|
@ -694,7 +703,7 @@ D3D12TextureCache::SamplerParameters D3D12TextureCache::GetSamplerParameters(
|
||||||
nullptr, nullptr, nullptr,
|
nullptr, nullptr, nullptr,
|
||||||
&mip_min_level, nullptr);
|
&mip_min_level, nullptr);
|
||||||
parameters.mip_min_level = mip_min_level;
|
parameters.mip_min_level = mip_min_level;
|
||||||
|
//high cache miss count here, prefetch fetch earlier
|
||||||
// TODO(Triang3l): Disable filtering for texture formats not supporting it.
|
// TODO(Triang3l): Disable filtering for texture formats not supporting it.
|
||||||
xenos::AnisoFilter aniso_filter =
|
xenos::AnisoFilter aniso_filter =
|
||||||
binding.aniso_filter == xenos::AnisoFilter::kUseFetchConst
|
binding.aniso_filter == xenos::AnisoFilter::kUseFetchConst
|
||||||
|
|
|
@ -119,7 +119,8 @@ class D3D12TextureCache final : public TextureCache {
|
||||||
D3D12_CPU_DESCRIPTOR_HANDLE handle);
|
D3D12_CPU_DESCRIPTOR_HANDLE handle);
|
||||||
uint32_t GetActiveTextureBindlessSRVIndex(
|
uint32_t GetActiveTextureBindlessSRVIndex(
|
||||||
const D3D12Shader::TextureBinding& host_shader_binding);
|
const D3D12Shader::TextureBinding& host_shader_binding);
|
||||||
|
void PrefetchSamplerParameters(
|
||||||
|
const D3D12Shader::SamplerBinding& binding) const;
|
||||||
SamplerParameters GetSamplerParameters(
|
SamplerParameters GetSamplerParameters(
|
||||||
const D3D12Shader::SamplerBinding& binding) const;
|
const D3D12Shader::SamplerBinding& binding) const;
|
||||||
void WriteSampler(SamplerParameters parameters,
|
void WriteSampler(SamplerParameters parameters,
|
||||||
|
@ -712,7 +713,7 @@ class D3D12TextureCache final : public TextureCache {
|
||||||
}
|
}
|
||||||
|
|
||||||
LoadShaderIndex GetLoadShaderIndex(TextureKey key) const;
|
LoadShaderIndex GetLoadShaderIndex(TextureKey key) const;
|
||||||
|
// chrispy: todo, can use simple branchless tests here
|
||||||
static constexpr bool AreDimensionsCompatible(
|
static constexpr bool AreDimensionsCompatible(
|
||||||
xenos::FetchOpDimension binding_dimension,
|
xenos::FetchOpDimension binding_dimension,
|
||||||
xenos::DataDimension resource_dimension) {
|
xenos::DataDimension resource_dimension) {
|
||||||
|
|
|
@ -1047,8 +1047,7 @@ bool PipelineCache::ConfigurePipeline(
|
||||||
PipelineDescription& description = runtime_description.description;
|
PipelineDescription& description = runtime_description.description;
|
||||||
|
|
||||||
if (current_pipeline_ != nullptr &&
|
if (current_pipeline_ != nullptr &&
|
||||||
!std::memcmp(&current_pipeline_->description.description, &description,
|
current_pipeline_->description.description == description) {
|
||||||
sizeof(description))) {
|
|
||||||
*pipeline_handle_out = current_pipeline_;
|
*pipeline_handle_out = current_pipeline_;
|
||||||
*root_signature_out = runtime_description.root_signature;
|
*root_signature_out = runtime_description.root_signature;
|
||||||
return true;
|
return true;
|
||||||
|
@ -1059,8 +1058,7 @@ bool PipelineCache::ConfigurePipeline(
|
||||||
auto found_range = pipelines_.equal_range(hash);
|
auto found_range = pipelines_.equal_range(hash);
|
||||||
for (auto it = found_range.first; it != found_range.second; ++it) {
|
for (auto it = found_range.first; it != found_range.second; ++it) {
|
||||||
Pipeline* found_pipeline = it->second;
|
Pipeline* found_pipeline = it->second;
|
||||||
if (!std::memcmp(&found_pipeline->description.description, &description,
|
if (found_pipeline->description.description == description) {
|
||||||
sizeof(description))) {
|
|
||||||
current_pipeline_ = found_pipeline;
|
current_pipeline_ = found_pipeline;
|
||||||
*pipeline_handle_out = found_pipeline;
|
*pipeline_handle_out = found_pipeline;
|
||||||
*root_signature_out = found_pipeline->description.root_signature;
|
*root_signature_out = found_pipeline->description.root_signature;
|
||||||
|
|
|
@ -226,6 +226,7 @@ class PipelineCache {
|
||||||
|
|
||||||
PipelineRenderTarget render_targets[xenos::kMaxColorRenderTargets];
|
PipelineRenderTarget render_targets[xenos::kMaxColorRenderTargets];
|
||||||
|
|
||||||
|
inline bool operator==(const PipelineDescription& other) const;
|
||||||
static constexpr uint32_t kVersion = 0x20210425;
|
static constexpr uint32_t kVersion = 0x20210425;
|
||||||
});
|
});
|
||||||
|
|
||||||
|
@ -424,7 +425,34 @@ class PipelineCache {
|
||||||
size_t creation_threads_shutdown_from_ = SIZE_MAX;
|
size_t creation_threads_shutdown_from_ = SIZE_MAX;
|
||||||
std::vector<std::unique_ptr<xe::threading::Thread>> creation_threads_;
|
std::vector<std::unique_ptr<xe::threading::Thread>> creation_threads_;
|
||||||
};
|
};
|
||||||
|
inline bool PipelineCache::PipelineDescription::operator==(
|
||||||
|
const PipelineDescription& other) const {
|
||||||
|
constexpr size_t cmp_size = sizeof(PipelineDescription);
|
||||||
|
#if XE_ARCH_AMD64 == 1
|
||||||
|
if constexpr (cmp_size == 64) {
|
||||||
|
if (vertex_shader_hash != other.vertex_shader_hash ||
|
||||||
|
vertex_shader_modification != other.vertex_shader_modification) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
const __m128i* thiz = (const __m128i*)this;
|
||||||
|
const __m128i* thoze = (const __m128i*)&other;
|
||||||
|
__m128i cmp32 =
|
||||||
|
_mm_cmpeq_epi8(_mm_loadu_si128(thiz + 1), _mm_loadu_si128(thoze + 1));
|
||||||
|
|
||||||
|
cmp32 = _mm_and_si128(cmp32, _mm_cmpeq_epi8(_mm_loadu_si128(thiz + 2),
|
||||||
|
_mm_loadu_si128(thoze + 2)));
|
||||||
|
|
||||||
|
cmp32 = _mm_and_si128(cmp32, _mm_cmpeq_epi8(_mm_loadu_si128(thiz + 3),
|
||||||
|
_mm_loadu_si128(thoze + 3)));
|
||||||
|
|
||||||
|
return _mm_movemask_epi8(cmp32) == 0xFFFF;
|
||||||
|
|
||||||
|
} else
|
||||||
|
#endif
|
||||||
|
{
|
||||||
|
return !memcmp(this, &other, cmp_size);
|
||||||
|
}
|
||||||
|
}
|
||||||
} // namespace d3d12
|
} // namespace d3d12
|
||||||
} // namespace gpu
|
} // namespace gpu
|
||||||
} // namespace xe
|
} // namespace xe
|
||||||
|
|
|
@ -320,22 +320,38 @@ uint32_t DrawExtentEstimator::EstimateMaxY(bool try_to_estimate_vertex_max_y,
|
||||||
// scissor (it's set by Direct3D 9 when a viewport is used), on hosts, it
|
// scissor (it's set by Direct3D 9 when a viewport is used), on hosts, it
|
||||||
// usually exists and can't be disabled.
|
// usually exists and can't be disabled.
|
||||||
auto pa_cl_vte_cntl = regs.Get<reg::PA_CL_VTE_CNTL>();
|
auto pa_cl_vte_cntl = regs.Get<reg::PA_CL_VTE_CNTL>();
|
||||||
|
|
||||||
float viewport_bottom = 0.0f;
|
float viewport_bottom = 0.0f;
|
||||||
|
uint32_t enable_window_offset =
|
||||||
|
regs.Get<reg::PA_SU_SC_MODE_CNTL>().vtx_window_offset_enable;
|
||||||
|
|
||||||
|
bool not_pix_center = !regs.Get<reg::PA_SU_VTX_CNTL>().pix_center;
|
||||||
|
|
||||||
|
float window_y_offset_f = float(window_y_offset);
|
||||||
|
|
||||||
|
float yoffset = regs[XE_GPU_REG_PA_CL_VPORT_YOFFSET].f32;
|
||||||
|
|
||||||
// First calculate all the integer.0 or integer.5 offsetting exactly at full
|
// First calculate all the integer.0 or integer.5 offsetting exactly at full
|
||||||
// precision.
|
// precision.
|
||||||
if (regs.Get<reg::PA_SU_SC_MODE_CNTL>().vtx_window_offset_enable) {
|
// chrispy: branch mispredicts here causing some pain according to vtune
|
||||||
viewport_bottom += float(window_y_offset);
|
float sm1 = .0f, sm2 = .0f, sm3 = .0f, sm4 = .0f;
|
||||||
|
|
||||||
|
if (enable_window_offset) {
|
||||||
|
sm1 = window_y_offset_f;
|
||||||
}
|
}
|
||||||
if (!regs.Get<reg::PA_SU_VTX_CNTL>().pix_center) {
|
if (not_pix_center) {
|
||||||
viewport_bottom += 0.5f;
|
sm2 = 0.5f;
|
||||||
}
|
}
|
||||||
// Then apply the floating-point viewport offset.
|
// Then apply the floating-point viewport offset.
|
||||||
if (pa_cl_vte_cntl.vport_y_offset_ena) {
|
if (pa_cl_vte_cntl.vport_y_offset_ena) {
|
||||||
viewport_bottom += regs[XE_GPU_REG_PA_CL_VPORT_YOFFSET].f32;
|
sm3 = yoffset;
|
||||||
}
|
}
|
||||||
viewport_bottom += pa_cl_vte_cntl.vport_y_scale_ena
|
sm4 = pa_cl_vte_cntl.vport_y_scale_ena
|
||||||
? std::abs(regs[XE_GPU_REG_PA_CL_VPORT_YSCALE].f32)
|
? std::abs(regs[XE_GPU_REG_PA_CL_VPORT_YSCALE].f32)
|
||||||
: 1.0f;
|
: 1.0f;
|
||||||
|
|
||||||
|
viewport_bottom = sm1 + sm2 + sm3 + sm4;
|
||||||
|
|
||||||
// Using floor, or, rather, truncation (because maxing with zero anyway)
|
// Using floor, or, rather, truncation (because maxing with zero anyway)
|
||||||
// similar to how viewport scissoring behaves on real AMD, Intel and Nvidia
|
// similar to how viewport scissoring behaves on real AMD, Intel and Nvidia
|
||||||
// GPUs on Direct3D 12 (but not WARP), also like in
|
// GPUs on Direct3D 12 (but not WARP), also like in
|
||||||
|
|
|
@ -929,8 +929,8 @@ bool GetResolveInfo(const RegisterFile& regs, const Memory& memory,
|
||||||
XELOGW(
|
XELOGW(
|
||||||
"Resolving to format {}, which is untested - treating like {}. "
|
"Resolving to format {}, which is untested - treating like {}. "
|
||||||
"Report the game to Xenia developers!",
|
"Report the game to Xenia developers!",
|
||||||
FormatInfo::Get(dest_format)->name,
|
FormatInfo::GetName(dest_format),
|
||||||
FormatInfo::Get(dest_closest_format)->name);
|
FormatInfo::GetName(dest_closest_format));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1002,7 +1002,7 @@ bool GetResolveInfo(const RegisterFile& regs, const Memory& memory,
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
XELOGE("Tried to resolve to format {}, which is not a ColorFormat",
|
XELOGE("Tried to resolve to format {}, which is not a ColorFormat",
|
||||||
dest_format_info.name);
|
FormatInfo::GetName(dest_format));
|
||||||
copy_dest_extent_start = copy_dest_base_adjusted;
|
copy_dest_extent_start = copy_dest_base_adjusted;
|
||||||
copy_dest_extent_end = copy_dest_base_adjusted;
|
copy_dest_extent_end = copy_dest_base_adjusted;
|
||||||
}
|
}
|
||||||
|
@ -1117,7 +1117,7 @@ bool GetResolveInfo(const RegisterFile& regs, const Memory& memory,
|
||||||
xenos::DepthRenderTargetFormat(depth_edram_info.format))
|
xenos::DepthRenderTargetFormat(depth_edram_info.format))
|
||||||
: xenos::GetColorRenderTargetFormatName(
|
: xenos::GetColorRenderTargetFormatName(
|
||||||
xenos::ColorRenderTargetFormat(color_edram_info.format)),
|
xenos::ColorRenderTargetFormat(color_edram_info.format)),
|
||||||
dest_format_info.name, rb_copy_dest_base, copy_dest_extent_start,
|
FormatInfo::GetName(dest_format), rb_copy_dest_base, copy_dest_extent_start,
|
||||||
copy_dest_extent_end);
|
copy_dest_extent_end);
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
|
|
|
@ -0,0 +1,106 @@
|
||||||
|
|
||||||
|
|
||||||
|
void ExecuteIndirectBuffer(uint32_t ptr, uint32_t count) XE_RESTRICT;
|
||||||
|
|
||||||
|
virtual bool ExecutePacket();
|
||||||
|
XE_NOINLINE
|
||||||
|
bool ExecutePacketType0( uint32_t packet) XE_RESTRICT;
|
||||||
|
XE_NOINLINE
|
||||||
|
bool ExecutePacketType1( uint32_t packet) XE_RESTRICT;
|
||||||
|
|
||||||
|
bool ExecutePacketType2( uint32_t packet) XE_RESTRICT;
|
||||||
|
XE_NOINLINE
|
||||||
|
bool ExecutePacketType3( uint32_t packet) XE_RESTRICT;
|
||||||
|
XE_NOINLINE
|
||||||
|
bool ExecutePacketType3_ME_INIT( uint32_t packet,
|
||||||
|
uint32_t count) XE_RESTRICT;
|
||||||
|
bool ExecutePacketType3_NOP( uint32_t packet,
|
||||||
|
uint32_t count) XE_RESTRICT;
|
||||||
|
XE_NOINLINE
|
||||||
|
bool ExecutePacketType3_INTERRUPT( uint32_t packet,
|
||||||
|
uint32_t count) XE_RESTRICT;
|
||||||
|
XE_NOINLINE
|
||||||
|
bool ExecutePacketType3_XE_SWAP( uint32_t packet,
|
||||||
|
uint32_t count) XE_RESTRICT;
|
||||||
|
|
||||||
|
bool ExecutePacketType3_INDIRECT_BUFFER( uint32_t packet,
|
||||||
|
uint32_t count) XE_RESTRICT;
|
||||||
|
XE_NOINLINE
|
||||||
|
bool ExecutePacketType3_WAIT_REG_MEM( uint32_t packet,
|
||||||
|
uint32_t count) XE_RESTRICT;
|
||||||
|
XE_NOINLINE
|
||||||
|
bool ExecutePacketType3_REG_RMW( uint32_t packet,
|
||||||
|
uint32_t count) XE_RESTRICT;
|
||||||
|
|
||||||
|
bool ExecutePacketType3_REG_TO_MEM( uint32_t packet,
|
||||||
|
uint32_t count) XE_RESTRICT;
|
||||||
|
XE_NOINLINE
|
||||||
|
bool ExecutePacketType3_MEM_WRITE( uint32_t packet,
|
||||||
|
uint32_t count) XE_RESTRICT;
|
||||||
|
XE_NOINLINE
|
||||||
|
bool ExecutePacketType3_COND_WRITE( uint32_t packet,
|
||||||
|
uint32_t count) XE_RESTRICT;
|
||||||
|
|
||||||
|
bool ExecutePacketType3_EVENT_WRITE( uint32_t packet,
|
||||||
|
uint32_t count) XE_RESTRICT;
|
||||||
|
XE_NOINLINE
|
||||||
|
bool ExecutePacketType3_EVENT_WRITE_SHD( uint32_t packet,
|
||||||
|
uint32_t count) XE_RESTRICT;
|
||||||
|
|
||||||
|
bool ExecutePacketType3_EVENT_WRITE_EXT( uint32_t packet,
|
||||||
|
uint32_t count) XE_RESTRICT;
|
||||||
|
XE_NOINLINE
|
||||||
|
bool ExecutePacketType3_EVENT_WRITE_ZPD( uint32_t packet,
|
||||||
|
uint32_t count) XE_RESTRICT;
|
||||||
|
|
||||||
|
bool ExecutePacketType3Draw( uint32_t packet,
|
||||||
|
const char* opcode_name,
|
||||||
|
uint32_t viz_query_condition,
|
||||||
|
uint32_t count_remaining) XE_RESTRICT;
|
||||||
|
|
||||||
|
bool ExecutePacketType3_DRAW_INDX( uint32_t packet,
|
||||||
|
uint32_t count) XE_RESTRICT;
|
||||||
|
|
||||||
|
bool ExecutePacketType3_DRAW_INDX_2( uint32_t packet,
|
||||||
|
uint32_t count) XE_RESTRICT;
|
||||||
|
XE_FORCEINLINE
|
||||||
|
bool ExecutePacketType3_SET_CONSTANT( uint32_t packet,
|
||||||
|
uint32_t count) XE_RESTRICT;
|
||||||
|
XE_NOINLINE
|
||||||
|
bool ExecutePacketType3_SET_CONSTANT2( uint32_t packet,
|
||||||
|
uint32_t count) XE_RESTRICT;
|
||||||
|
XE_FORCEINLINE
|
||||||
|
bool ExecutePacketType3_LOAD_ALU_CONSTANT( uint32_t packet,
|
||||||
|
uint32_t count) XE_RESTRICT;
|
||||||
|
|
||||||
|
bool ExecutePacketType3_SET_SHADER_CONSTANTS(
|
||||||
|
uint32_t packet,
|
||||||
|
uint32_t count) XE_RESTRICT;
|
||||||
|
|
||||||
|
bool ExecutePacketType3_IM_LOAD( uint32_t packet,
|
||||||
|
uint32_t count) XE_RESTRICT;
|
||||||
|
|
||||||
|
bool ExecutePacketType3_IM_LOAD_IMMEDIATE( uint32_t packet,
|
||||||
|
uint32_t count) XE_RESTRICT;
|
||||||
|
|
||||||
|
bool ExecutePacketType3_INVALIDATE_STATE( uint32_t packet,
|
||||||
|
uint32_t count) XE_RESTRICT;
|
||||||
|
|
||||||
|
bool ExecutePacketType3_VIZ_QUERY( uint32_t packet,
|
||||||
|
uint32_t count) XE_RESTRICT;
|
||||||
|
|
||||||
|
|
||||||
|
XE_FORCEINLINE
|
||||||
|
void WriteEventInitiator(uint32_t value) XE_RESTRICT;
|
||||||
|
|
||||||
|
XE_NOINLINE
|
||||||
|
XE_COLD
|
||||||
|
bool HitUnimplementedOpcode(uint32_t opcode, uint32_t count) XE_RESTRICT;
|
||||||
|
|
||||||
|
XE_NOINLINE
|
||||||
|
XE_NOALIAS
|
||||||
|
uint32_t GetCurrentRingReadCount();
|
||||||
|
|
||||||
|
XE_NOINLINE
|
||||||
|
XE_COLD
|
||||||
|
bool ExecutePacketType3_CountOverflow(uint32_t count);
|
File diff suppressed because it is too large
Load Diff
|
@ -233,15 +233,27 @@ void SharedMemory::FireWatches(uint32_t page_first, uint32_t page_last,
|
||||||
// Fire per-range watches.
|
// Fire per-range watches.
|
||||||
for (uint32_t i = bucket_first; i <= bucket_last; ++i) {
|
for (uint32_t i = bucket_first; i <= bucket_last; ++i) {
|
||||||
WatchNode* node = watch_buckets_[i];
|
WatchNode* node = watch_buckets_[i];
|
||||||
|
if (i + 1 <= bucket_last) {
|
||||||
|
WatchNode* nextnode = watch_buckets_[i + 1];
|
||||||
|
if (nextnode) {
|
||||||
|
swcache::PrefetchL1(nextnode->range);
|
||||||
|
}
|
||||||
|
}
|
||||||
while (node != nullptr) {
|
while (node != nullptr) {
|
||||||
WatchRange* range = node->range;
|
WatchRange* range = node->range;
|
||||||
// Store the next node now since when the callback is triggered, the links
|
// Store the next node now since when the callback is triggered, the links
|
||||||
// will be broken.
|
// will be broken.
|
||||||
node = node->bucket_node_next;
|
node = node->bucket_node_next;
|
||||||
|
if (node) {
|
||||||
|
swcache::PrefetchL1(node);
|
||||||
|
}
|
||||||
if (page_first <= range->page_last && page_last >= range->page_first) {
|
if (page_first <= range->page_last && page_last >= range->page_first) {
|
||||||
range->callback(global_lock, range->callback_context,
|
range->callback(global_lock, range->callback_context,
|
||||||
range->callback_data, range->callback_argument,
|
range->callback_data, range->callback_argument,
|
||||||
invalidated_by_gpu);
|
invalidated_by_gpu);
|
||||||
|
if (node && node->range) {
|
||||||
|
swcache::PrefetchL1(node->range);
|
||||||
|
}
|
||||||
UnlinkWatchRange(range);
|
UnlinkWatchRange(range);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -440,7 +440,7 @@ void TextureCache::TextureKey::LogAction(const char* action) const {
|
||||||
"base at 0x{:08X} (pitch {}), mips at 0x{:08X}",
|
"base at 0x{:08X} (pitch {}), mips at 0x{:08X}",
|
||||||
action, tiled ? "tiled" : "linear", scaled_resolve ? "scaled " : "",
|
action, tiled ? "tiled" : "linear", scaled_resolve ? "scaled " : "",
|
||||||
GetWidth(), GetHeight(), GetDepthOrArraySize(), GetLogDimensionName(),
|
GetWidth(), GetHeight(), GetDepthOrArraySize(), GetLogDimensionName(),
|
||||||
FormatInfo::Get(format)->name, mip_max_level + 1, packed_mips ? "" : "un",
|
FormatInfo::GetName(format), mip_max_level + 1, packed_mips ? "" : "un",
|
||||||
mip_max_level != 0 ? "s" : "", base_page << 12, pitch << 5,
|
mip_max_level != 0 ? "s" : "", base_page << 12, pitch << 5,
|
||||||
mip_page << 12);
|
mip_page << 12);
|
||||||
}
|
}
|
||||||
|
@ -453,7 +453,7 @@ void TextureCache::Texture::LogAction(const char* action) const {
|
||||||
action, key_.tiled ? "tiled" : "linear",
|
action, key_.tiled ? "tiled" : "linear",
|
||||||
key_.scaled_resolve ? "scaled " : "", key_.GetWidth(), key_.GetHeight(),
|
key_.scaled_resolve ? "scaled " : "", key_.GetWidth(), key_.GetHeight(),
|
||||||
key_.GetDepthOrArraySize(), key_.GetLogDimensionName(),
|
key_.GetDepthOrArraySize(), key_.GetLogDimensionName(),
|
||||||
FormatInfo::Get(key_.format)->name, key_.mip_max_level + 1,
|
FormatInfo::GetName(key_.format), key_.mip_max_level + 1,
|
||||||
key_.packed_mips ? "" : "un", key_.mip_max_level != 0 ? "s" : "",
|
key_.packed_mips ? "" : "un", key_.mip_max_level != 0 ? "s" : "",
|
||||||
key_.base_page << 12, key_.pitch << 5, GetGuestBaseSize(),
|
key_.base_page << 12, key_.pitch << 5, GetGuestBaseSize(),
|
||||||
key_.mip_page << 12, GetGuestMipsSize());
|
key_.mip_page << 12, GetGuestMipsSize());
|
||||||
|
|
|
@ -128,6 +128,14 @@ class TextureCache {
|
||||||
return (binding->texture && binding->texture->IsResolved()) ||
|
return (binding->texture && binding->texture->IsResolved()) ||
|
||||||
(binding->texture_signed && binding->texture_signed->IsResolved());
|
(binding->texture_signed && binding->texture_signed->IsResolved());
|
||||||
}
|
}
|
||||||
|
template <swcache::PrefetchTag tag>
|
||||||
|
void PrefetchTextureBinding(uint32_t fetch_constant_index) const {
|
||||||
|
swcache::Prefetch<tag>(&texture_bindings_[fetch_constant_index]);
|
||||||
|
swcache::Prefetch<tag>(
|
||||||
|
&texture_bindings_[fetch_constant_index +
|
||||||
|
1]); // we may cross a cache line boundary :( size
|
||||||
|
// of the structure is 0x28
|
||||||
|
}
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
struct TextureKey {
|
struct TextureKey {
|
||||||
|
|
|
@ -85,7 +85,7 @@ void TextureDump(const TextureInfo& src, void* buffer, size_t length) {
|
||||||
assert_unhandled_case(src.format);
|
assert_unhandled_case(src.format);
|
||||||
std::memset(&dds_header.pixel_format, 0xCD,
|
std::memset(&dds_header.pixel_format, 0xCD,
|
||||||
sizeof(dds_header.pixel_format));
|
sizeof(dds_header.pixel_format));
|
||||||
XELOGW("Skipping {} for texture dump.", src.format_info()->name);
|
XELOGW("Skipping {} for texture dump.", src.format_name());
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -96,7 +96,7 @@ void TextureDump(const TextureInfo& src, void* buffer, size_t length) {
|
||||||
std::filesystem::path path = "texture_dumps";
|
std::filesystem::path path = "texture_dumps";
|
||||||
path /= fmt::format("{:05d}_{:08X}_{:08X}_{:08X}.dds", dump_counter++,
|
path /= fmt::format("{:05d}_{:08X}_{:08X}_{:08X}.dds", dump_counter++,
|
||||||
src.memory.base_address, src.memory.mip_address,
|
src.memory.base_address, src.memory.mip_address,
|
||||||
src.format_info()->name);
|
src.format_name());
|
||||||
|
|
||||||
FILE* handle = filesystem::OpenFile(path, "wb");
|
FILE* handle = filesystem::OpenFile(path, "wb");
|
||||||
if (handle) {
|
if (handle) {
|
||||||
|
|
|
@ -159,151 +159,6 @@ void TextureInfo::GetMipSize(uint32_t mip, uint32_t* out_width,
|
||||||
*out_height = std::max(height_pow2 >> mip, 1u);
|
*out_height = std::max(height_pow2 >> mip, 1u);
|
||||||
}
|
}
|
||||||
|
|
||||||
uint32_t TextureInfo::GetMipLocation(uint32_t mip, uint32_t* offset_x,
|
|
||||||
uint32_t* offset_y, bool is_guest) const {
|
|
||||||
if (mip == 0) {
|
|
||||||
// Short-circuit. Mip 0 is always stored in base_address.
|
|
||||||
if (!has_packed_mips) {
|
|
||||||
*offset_x = 0;
|
|
||||||
*offset_y = 0;
|
|
||||||
} else {
|
|
||||||
GetPackedTileOffset(0, offset_x, offset_y);
|
|
||||||
}
|
|
||||||
return memory.base_address;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!memory.mip_address) {
|
|
||||||
// Short-circuit. There is no mip data.
|
|
||||||
*offset_x = 0;
|
|
||||||
*offset_y = 0;
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
uint32_t address_base, address_offset;
|
|
||||||
address_base = memory.mip_address;
|
|
||||||
address_offset = 0;
|
|
||||||
|
|
||||||
auto bytes_per_block = format_info()->bytes_per_block();
|
|
||||||
|
|
||||||
if (!has_packed_mips) {
|
|
||||||
for (uint32_t i = 1; i < mip; i++) {
|
|
||||||
address_offset +=
|
|
||||||
GetMipExtent(i, is_guest).all_blocks() * bytes_per_block;
|
|
||||||
}
|
|
||||||
*offset_x = 0;
|
|
||||||
*offset_y = 0;
|
|
||||||
return address_base + address_offset;
|
|
||||||
}
|
|
||||||
|
|
||||||
uint32_t width_pow2 = xe::next_pow2(width + 1);
|
|
||||||
uint32_t height_pow2 = xe::next_pow2(height + 1);
|
|
||||||
|
|
||||||
// Walk forward to find the address of the mip.
|
|
||||||
uint32_t packed_mip_base = 1;
|
|
||||||
for (uint32_t i = packed_mip_base; i < mip; i++, packed_mip_base++) {
|
|
||||||
uint32_t mip_width = std::max(width_pow2 >> i, 1u);
|
|
||||||
uint32_t mip_height = std::max(height_pow2 >> i, 1u);
|
|
||||||
if (std::min(mip_width, mip_height) <= 16) {
|
|
||||||
// We've reached the point where the mips are packed into a single tile.
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
address_offset += GetMipExtent(i, is_guest).all_blocks() * bytes_per_block;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Now, check if the mip is packed at an offset.
|
|
||||||
GetPackedTileOffset(width_pow2 >> mip, height_pow2 >> mip, format_info(),
|
|
||||||
mip - packed_mip_base, offset_x, offset_y);
|
|
||||||
return address_base + address_offset;
|
|
||||||
}
|
|
||||||
|
|
||||||
bool TextureInfo::GetPackedTileOffset(uint32_t width, uint32_t height,
|
|
||||||
const FormatInfo* format_info,
|
|
||||||
int packed_tile, uint32_t* offset_x,
|
|
||||||
uint32_t* offset_y) {
|
|
||||||
// Tile size is 32x32, and once textures go <=16 they are packed into a
|
|
||||||
// single tile together. The math here is insane. Most sourced
|
|
||||||
// from graph paper and looking at dds dumps.
|
|
||||||
// 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
|
|
||||||
// 0 +.4x4.+ +.....8x8.....+ +............16x16............+
|
|
||||||
// 1 +.4x4.+ +.....8x8.....+ +............16x16............+
|
|
||||||
// 2 +.4x4.+ +.....8x8.....+ +............16x16............+
|
|
||||||
// 3 +.4x4.+ +.....8x8.....+ +............16x16............+
|
|
||||||
// 4 x +.....8x8.....+ +............16x16............+
|
|
||||||
// 5 +.....8x8.....+ +............16x16............+
|
|
||||||
// 6 +.....8x8.....+ +............16x16............+
|
|
||||||
// 7 +.....8x8.....+ +............16x16............+
|
|
||||||
// 8 2x2 +............16x16............+
|
|
||||||
// 9 2x2 +............16x16............+
|
|
||||||
// 0 +............16x16............+
|
|
||||||
// ... .....
|
|
||||||
// This only works for square textures, or textures that are some non-pot
|
|
||||||
// <= square. As soon as the aspect ratio goes weird, the textures start to
|
|
||||||
// stretch across tiles.
|
|
||||||
//
|
|
||||||
// The 2x2 and 1x1 squares are packed in their specific positions because
|
|
||||||
// each square is the size of at least one block (which is 4x4 pixels max)
|
|
||||||
//
|
|
||||||
// if (tile_aligned(w) > tile_aligned(h)) {
|
|
||||||
// // wider than tall, so packed horizontally
|
|
||||||
// } else if (tile_aligned(w) < tile_aligned(h)) {
|
|
||||||
// // taller than wide, so packed vertically
|
|
||||||
// } else {
|
|
||||||
// square
|
|
||||||
// }
|
|
||||||
// It's important to use logical sizes here, as the input sizes will be
|
|
||||||
// for the entire packed tile set, not the actual texture.
|
|
||||||
// The minimum dimension is what matters most: if either width or height
|
|
||||||
// is <= 16 this mode kicks in.
|
|
||||||
|
|
||||||
uint32_t log2_width = xe::log2_ceil(width);
|
|
||||||
uint32_t log2_height = xe::log2_ceil(height);
|
|
||||||
if (std::min(log2_width, log2_height) > 4) {
|
|
||||||
// Too big, not packed.
|
|
||||||
*offset_x = 0;
|
|
||||||
*offset_y = 0;
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Find the block offset of the mip.
|
|
||||||
if (packed_tile < 3) {
|
|
||||||
if (log2_width > log2_height) {
|
|
||||||
// Wider than tall. Laid out vertically.
|
|
||||||
*offset_x = 0;
|
|
||||||
*offset_y = 16 >> packed_tile;
|
|
||||||
} else {
|
|
||||||
// Taller than wide. Laid out horizontally.
|
|
||||||
*offset_x = 16 >> packed_tile;
|
|
||||||
*offset_y = 0;
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
if (log2_width > log2_height) {
|
|
||||||
// Wider than tall. Laid out vertically.
|
|
||||||
*offset_x = 16 >> (packed_tile - 2);
|
|
||||||
*offset_y = 0;
|
|
||||||
} else {
|
|
||||||
// Taller than wide. Laid out horizontally.
|
|
||||||
*offset_x = 0;
|
|
||||||
*offset_y = 16 >> (packed_tile - 2);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
*offset_x /= format_info->block_width;
|
|
||||||
*offset_y /= format_info->block_height;
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
bool TextureInfo::GetPackedTileOffset(int packed_tile, uint32_t* offset_x,
|
|
||||||
uint32_t* offset_y) const {
|
|
||||||
if (!has_packed_mips) {
|
|
||||||
*offset_x = 0;
|
|
||||||
*offset_y = 0;
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
return GetPackedTileOffset(xe::next_pow2(width + 1),
|
|
||||||
xe::next_pow2(height + 1), format_info(),
|
|
||||||
packed_tile, offset_x, offset_y);
|
|
||||||
}
|
|
||||||
|
|
||||||
uint64_t TextureInfo::hash() const {
|
uint64_t TextureInfo::hash() const {
|
||||||
return XXH3_64bits(this, sizeof(TextureInfo));
|
return XXH3_64bits(this, sizeof(TextureInfo));
|
||||||
}
|
}
|
||||||
|
|
|
@ -181,7 +181,7 @@ inline xenos::TextureFormat DepthRenderTargetToTextureFormat(
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
enum class FormatType {
|
enum class FormatType : uint32_t {
|
||||||
// Uncompressed, and is also a ColorFormat.
|
// Uncompressed, and is also a ColorFormat.
|
||||||
kResolvable,
|
kResolvable,
|
||||||
// Uncompressed, but resolve or memory export cannot be done to the format.
|
// Uncompressed, but resolve or memory export cannot be done to the format.
|
||||||
|
@ -190,12 +190,12 @@ enum class FormatType {
|
||||||
};
|
};
|
||||||
|
|
||||||
struct FormatInfo {
|
struct FormatInfo {
|
||||||
xenos::TextureFormat format;
|
const xenos::TextureFormat format;
|
||||||
const char* name;
|
|
||||||
FormatType type;
|
const FormatType type;
|
||||||
uint32_t block_width;
|
const uint32_t block_width;
|
||||||
uint32_t block_height;
|
const uint32_t block_height;
|
||||||
uint32_t bits_per_pixel;
|
const uint32_t bits_per_pixel;
|
||||||
|
|
||||||
uint32_t bytes_per_block() const {
|
uint32_t bytes_per_block() const {
|
||||||
return block_width * block_height * bits_per_pixel / 8;
|
return block_width * block_height * bits_per_pixel / 8;
|
||||||
|
@ -203,6 +203,20 @@ struct FormatInfo {
|
||||||
|
|
||||||
static const FormatInfo* Get(uint32_t gpu_format);
|
static const FormatInfo* Get(uint32_t gpu_format);
|
||||||
|
|
||||||
|
static const char* GetName(uint32_t gpu_format);
|
||||||
|
static const char* GetName(xenos::TextureFormat format) {
|
||||||
|
return GetName(static_cast<uint32_t>(format));
|
||||||
|
}
|
||||||
|
|
||||||
|
static unsigned char GetWidthShift(uint32_t gpu_format);
|
||||||
|
static unsigned char GetHeightShift(uint32_t gpu_format);
|
||||||
|
|
||||||
|
static unsigned char GetWidthShift(xenos::TextureFormat gpu_format) {
|
||||||
|
return GetWidthShift(static_cast<uint32_t>(gpu_format));
|
||||||
|
}
|
||||||
|
static unsigned char GetHeightShift(xenos::TextureFormat gpu_format) {
|
||||||
|
return GetHeightShift(static_cast<uint32_t>(gpu_format));
|
||||||
|
}
|
||||||
static const FormatInfo* Get(xenos::TextureFormat format) {
|
static const FormatInfo* Get(xenos::TextureFormat format) {
|
||||||
return Get(static_cast<uint32_t>(format));
|
return Get(static_cast<uint32_t>(format));
|
||||||
}
|
}
|
||||||
|
@ -259,7 +273,9 @@ struct TextureInfo {
|
||||||
const FormatInfo* format_info() const {
|
const FormatInfo* format_info() const {
|
||||||
return FormatInfo::Get(static_cast<uint32_t>(format));
|
return FormatInfo::Get(static_cast<uint32_t>(format));
|
||||||
}
|
}
|
||||||
|
const char* format_name() const {
|
||||||
|
return FormatInfo::GetName(static_cast<uint32_t>(format));
|
||||||
|
}
|
||||||
bool is_compressed() const {
|
bool is_compressed() const {
|
||||||
return format_info()->type == FormatType::kCompressed;
|
return format_info()->type == FormatType::kCompressed;
|
||||||
}
|
}
|
||||||
|
@ -281,18 +297,6 @@ struct TextureInfo {
|
||||||
|
|
||||||
void GetMipSize(uint32_t mip, uint32_t* width, uint32_t* height) const;
|
void GetMipSize(uint32_t mip, uint32_t* width, uint32_t* height) const;
|
||||||
|
|
||||||
// Get the memory location of a mip. offset_x and offset_y are in blocks.
|
|
||||||
uint32_t GetMipLocation(uint32_t mip, uint32_t* offset_x, uint32_t* offset_y,
|
|
||||||
bool is_guest) const;
|
|
||||||
|
|
||||||
static bool GetPackedTileOffset(uint32_t width, uint32_t height,
|
|
||||||
const FormatInfo* format_info,
|
|
||||||
int packed_tile, uint32_t* offset_x,
|
|
||||||
uint32_t* offset_y);
|
|
||||||
|
|
||||||
bool GetPackedTileOffset(int packed_tile, uint32_t* offset_x,
|
|
||||||
uint32_t* offset_y) const;
|
|
||||||
|
|
||||||
uint64_t hash() const;
|
uint64_t hash() const;
|
||||||
bool operator==(const TextureInfo& other) const {
|
bool operator==(const TextureInfo& other) const {
|
||||||
return std::memcmp(this, &other, sizeof(TextureInfo)) == 0;
|
return std::memcmp(this, &other, sizeof(TextureInfo)) == 0;
|
||||||
|
|
|
@ -17,77 +17,60 @@ namespace gpu {
|
||||||
using namespace xe::gpu::xenos;
|
using namespace xe::gpu::xenos;
|
||||||
|
|
||||||
#define FORMAT_INFO(texture_format, format, block_width, block_height, bits_per_pixel) \
|
#define FORMAT_INFO(texture_format, format, block_width, block_height, bits_per_pixel) \
|
||||||
{xenos::TextureFormat::texture_format, #texture_format, FormatType::format, block_width, block_height, bits_per_pixel}
|
{xenos::TextureFormat::texture_format, FormatType::format, block_width, block_height, bits_per_pixel}
|
||||||
const FormatInfo* FormatInfo::Get(uint32_t gpu_format) {
|
const FormatInfo* FormatInfo::Get(uint32_t gpu_format) {
|
||||||
static const FormatInfo format_infos[64] = {
|
static const FormatInfo format_infos[64] = {
|
||||||
FORMAT_INFO(k_1_REVERSE , kUncompressed, 1, 1, 1),
|
#include "texture_info_formats.inl"
|
||||||
FORMAT_INFO(k_1 , kUncompressed, 1, 1, 1),
|
|
||||||
FORMAT_INFO(k_8 , kResolvable, 1, 1, 8),
|
|
||||||
FORMAT_INFO(k_1_5_5_5 , kResolvable, 1, 1, 16),
|
|
||||||
FORMAT_INFO(k_5_6_5 , kResolvable, 1, 1, 16),
|
|
||||||
FORMAT_INFO(k_6_5_5 , kResolvable, 1, 1, 16),
|
|
||||||
FORMAT_INFO(k_8_8_8_8 , kResolvable, 1, 1, 32),
|
|
||||||
FORMAT_INFO(k_2_10_10_10 , kResolvable, 1, 1, 32),
|
|
||||||
FORMAT_INFO(k_8_A , kResolvable, 1, 1, 8),
|
|
||||||
FORMAT_INFO(k_8_B , kResolvable, 1, 1, 8),
|
|
||||||
FORMAT_INFO(k_8_8 , kResolvable, 1, 1, 16),
|
|
||||||
FORMAT_INFO(k_Cr_Y1_Cb_Y0_REP , kCompressed, 2, 1, 16),
|
|
||||||
FORMAT_INFO(k_Y1_Cr_Y0_Cb_REP , kCompressed, 2, 1, 16),
|
|
||||||
FORMAT_INFO(k_16_16_EDRAM , kUncompressed, 1, 1, 32),
|
|
||||||
FORMAT_INFO(k_8_8_8_8_A , kResolvable, 1, 1, 32),
|
|
||||||
FORMAT_INFO(k_4_4_4_4 , kResolvable, 1, 1, 16),
|
|
||||||
FORMAT_INFO(k_10_11_11 , kResolvable, 1, 1, 32),
|
|
||||||
FORMAT_INFO(k_11_11_10 , kResolvable, 1, 1, 32),
|
|
||||||
FORMAT_INFO(k_DXT1 , kCompressed, 4, 4, 4),
|
|
||||||
FORMAT_INFO(k_DXT2_3 , kCompressed, 4, 4, 8),
|
|
||||||
FORMAT_INFO(k_DXT4_5 , kCompressed, 4, 4, 8),
|
|
||||||
FORMAT_INFO(k_16_16_16_16_EDRAM , kUncompressed, 1, 1, 64),
|
|
||||||
FORMAT_INFO(k_24_8 , kUncompressed, 1, 1, 32),
|
|
||||||
FORMAT_INFO(k_24_8_FLOAT , kUncompressed, 1, 1, 32),
|
|
||||||
FORMAT_INFO(k_16 , kResolvable, 1, 1, 16),
|
|
||||||
FORMAT_INFO(k_16_16 , kResolvable, 1, 1, 32),
|
|
||||||
FORMAT_INFO(k_16_16_16_16 , kResolvable, 1, 1, 64),
|
|
||||||
FORMAT_INFO(k_16_EXPAND , kUncompressed, 1, 1, 16),
|
|
||||||
FORMAT_INFO(k_16_16_EXPAND , kUncompressed, 1, 1, 32),
|
|
||||||
FORMAT_INFO(k_16_16_16_16_EXPAND , kUncompressed, 1, 1, 64),
|
|
||||||
FORMAT_INFO(k_16_FLOAT , kResolvable, 1, 1, 16),
|
|
||||||
FORMAT_INFO(k_16_16_FLOAT , kResolvable, 1, 1, 32),
|
|
||||||
FORMAT_INFO(k_16_16_16_16_FLOAT , kResolvable, 1, 1, 64),
|
|
||||||
FORMAT_INFO(k_32 , kUncompressed, 1, 1, 32),
|
|
||||||
FORMAT_INFO(k_32_32 , kUncompressed, 1, 1, 64),
|
|
||||||
FORMAT_INFO(k_32_32_32_32 , kUncompressed, 1, 1, 128),
|
|
||||||
FORMAT_INFO(k_32_FLOAT , kResolvable, 1, 1, 32),
|
|
||||||
FORMAT_INFO(k_32_32_FLOAT , kResolvable, 1, 1, 64),
|
|
||||||
FORMAT_INFO(k_32_32_32_32_FLOAT , kResolvable, 1, 1, 128),
|
|
||||||
FORMAT_INFO(k_32_AS_8 , kCompressed, 4, 1, 8),
|
|
||||||
FORMAT_INFO(k_32_AS_8_8 , kCompressed, 2, 1, 16),
|
|
||||||
FORMAT_INFO(k_16_MPEG , kUncompressed, 1, 1, 16),
|
|
||||||
FORMAT_INFO(k_16_16_MPEG , kUncompressed, 1, 1, 32),
|
|
||||||
FORMAT_INFO(k_8_INTERLACED , kUncompressed, 1, 1, 8),
|
|
||||||
FORMAT_INFO(k_32_AS_8_INTERLACED , kCompressed, 4, 1, 8),
|
|
||||||
FORMAT_INFO(k_32_AS_8_8_INTERLACED , kCompressed, 1, 1, 16),
|
|
||||||
FORMAT_INFO(k_16_INTERLACED , kUncompressed, 1, 1, 16),
|
|
||||||
FORMAT_INFO(k_16_MPEG_INTERLACED , kUncompressed, 1, 1, 16),
|
|
||||||
FORMAT_INFO(k_16_16_MPEG_INTERLACED , kUncompressed, 1, 1, 32),
|
|
||||||
FORMAT_INFO(k_DXN , kCompressed, 4, 4, 8),
|
|
||||||
FORMAT_INFO(k_8_8_8_8_AS_16_16_16_16 , kResolvable, 1, 1, 32),
|
|
||||||
FORMAT_INFO(k_DXT1_AS_16_16_16_16 , kCompressed, 4, 4, 4),
|
|
||||||
FORMAT_INFO(k_DXT2_3_AS_16_16_16_16 , kCompressed, 4, 4, 8),
|
|
||||||
FORMAT_INFO(k_DXT4_5_AS_16_16_16_16 , kCompressed, 4, 4, 8),
|
|
||||||
FORMAT_INFO(k_2_10_10_10_AS_16_16_16_16, kResolvable, 1, 1, 32),
|
|
||||||
FORMAT_INFO(k_10_11_11_AS_16_16_16_16 , kResolvable, 1, 1, 32),
|
|
||||||
FORMAT_INFO(k_11_11_10_AS_16_16_16_16 , kResolvable, 1, 1, 32),
|
|
||||||
FORMAT_INFO(k_32_32_32_FLOAT , kUncompressed, 1, 1, 96),
|
|
||||||
FORMAT_INFO(k_DXT3A , kCompressed, 4, 4, 4),
|
|
||||||
FORMAT_INFO(k_DXT5A , kCompressed, 4, 4, 4),
|
|
||||||
FORMAT_INFO(k_CTX1 , kCompressed, 4, 4, 4),
|
|
||||||
FORMAT_INFO(k_DXT3A_AS_1_1_1_1 , kCompressed, 4, 4, 4),
|
|
||||||
FORMAT_INFO(k_8_8_8_8_GAMMA_EDRAM , kUncompressed, 1, 1, 32),
|
|
||||||
FORMAT_INFO(k_2_10_10_10_FLOAT_EDRAM , kUncompressed, 1, 1, 32),
|
|
||||||
};
|
};
|
||||||
return &format_infos[gpu_format];
|
return &format_infos[gpu_format];
|
||||||
}
|
}
|
||||||
#undef FORMAT_INFO
|
#undef FORMAT_INFO
|
||||||
|
|
||||||
|
|
||||||
|
// Returns the number of trailing zero bits in `pow`, which for a power of two
// is log2(pow). Used to build compile-time tables that let callers replace a
// division by a block dimension with a right shift.
//
// `pow` is expected to be non-zero. Zero has no set bit, so the scan below
// would never terminate; it is mapped to a shift of 0 instead.
constexpr unsigned char GetShift(unsigned pow) {
  if (pow == 0) {
    return 0;
  }

  unsigned char sh = 0;
  // Drop low zero bits until the lowest set bit reaches bit 0.
  while (!(pow & 1)) {
    pow >>= 1;
    ++sh;
  }

  return sh;
}
|
||||||
|
/*
  TODO: GetWidthShift and GetHeightShift should not need a full 64-byte table
  each. Only 16 of the 64 entries in texture_info_formats.inl have a non-zero
  width shift, and the largest shift is 2. A 64-bit presence mask could
  therefore double as a sparse index (popcnt of the preceding bits), with the
  2-bit shifts packed into a single 32-bit word that is shifted and masked.
*/
// Returns the right-shift amount equivalent to dividing by the block width of
// the given guest texture format. The value indexes a 64-entry table directly,
// so gpu_format must be below 64.
unsigned char FormatInfo::GetWidthShift(uint32_t gpu_format) {
// Expand every table entry to the shift derived from its block width.
#define FORMAT_INFO(texture_format, format, block_width, block_height, \
                    bits_per_pixel)                                    \
  GetShift(block_width)
  // `static` gives the table static storage; a plain block-scope constexpr
  // array is an automatic variable that may be rebuilt on the stack per call.
  alignas(XE_HOST_CACHE_LINE_SIZE) static constexpr unsigned char
      wshift_table[64] = {
#include "texture_info_formats.inl"
      };
#undef FORMAT_INFO

  return wshift_table[gpu_format];
}
|
||||||
|
// Returns the right-shift amount equivalent to dividing by the block height of
// the given guest texture format. The value indexes a 64-entry table directly,
// so gpu_format must be below 64.
unsigned char FormatInfo::GetHeightShift(uint32_t gpu_format) {
// Expand every table entry to the shift derived from its block height.
#define FORMAT_INFO(texture_format, format, block_width, block_height, \
                    bits_per_pixel)                                    \
  GetShift(block_height)
  // `static` gives the table static storage; a plain block-scope constexpr
  // array is an automatic variable that may be rebuilt on the stack per call.
  alignas(XE_HOST_CACHE_LINE_SIZE) static constexpr unsigned char
      hshift_table[64] = {
#include "texture_info_formats.inl"
      };
#undef FORMAT_INFO

  return hshift_table[gpu_format];
}
|
||||||
|
// Table of stringified format identifiers: each entry of
// texture_info_formats.inl is reduced to its first argument as a string
// literal, in the order the entries are listed.
#define FORMAT_INFO(texture_format, ...) #texture_format
static constexpr const char* const format_name_table[64] = {
#include "texture_info_formats.inl"
};
#undef FORMAT_INFO

// Looks up the name string for a guest texture format by its numeric value.
// The value indexes the 64-entry name table directly.
const char* FormatInfo::GetName(uint32_t gpu_format) {
  const char* const name = format_name_table[gpu_format];
  return name;
}
|
||||||
} // namespace gpu
|
} // namespace gpu
|
||||||
} // namespace xe
|
} // namespace xe
|
||||||
|
|
|
@ -0,0 +1,64 @@
|
||||||
|
FORMAT_INFO(k_1_REVERSE, kUncompressed, 1, 1, 1),
|
||||||
|
FORMAT_INFO(k_1, kUncompressed, 1, 1, 1),
|
||||||
|
FORMAT_INFO(k_8, kResolvable, 1, 1, 8),
|
||||||
|
FORMAT_INFO(k_1_5_5_5, kResolvable, 1, 1, 16),
|
||||||
|
FORMAT_INFO(k_5_6_5, kResolvable, 1, 1, 16),
|
||||||
|
FORMAT_INFO(k_6_5_5, kResolvable, 1, 1, 16),
|
||||||
|
FORMAT_INFO(k_8_8_8_8, kResolvable, 1, 1, 32),
|
||||||
|
FORMAT_INFO(k_2_10_10_10, kResolvable, 1, 1, 32),
|
||||||
|
FORMAT_INFO(k_8_A, kResolvable, 1, 1, 8),
|
||||||
|
FORMAT_INFO(k_8_B, kResolvable, 1, 1, 8),
|
||||||
|
FORMAT_INFO(k_8_8, kResolvable, 1, 1, 16),
|
||||||
|
FORMAT_INFO(k_Cr_Y1_Cb_Y0_REP, kCompressed, 2, 1, 16),
|
||||||
|
FORMAT_INFO(k_Y1_Cr_Y0_Cb_REP, kCompressed, 2, 1, 16),
|
||||||
|
FORMAT_INFO(k_16_16_EDRAM, kUncompressed, 1, 1, 32),
|
||||||
|
FORMAT_INFO(k_8_8_8_8_A, kResolvable, 1, 1, 32),
|
||||||
|
FORMAT_INFO(k_4_4_4_4, kResolvable, 1, 1, 16),
|
||||||
|
FORMAT_INFO(k_10_11_11, kResolvable, 1, 1, 32),
|
||||||
|
FORMAT_INFO(k_11_11_10, kResolvable, 1, 1, 32),
|
||||||
|
FORMAT_INFO(k_DXT1, kCompressed, 4, 4, 4),
|
||||||
|
FORMAT_INFO(k_DXT2_3, kCompressed, 4, 4, 8),
|
||||||
|
FORMAT_INFO(k_DXT4_5, kCompressed, 4, 4, 8),
|
||||||
|
FORMAT_INFO(k_16_16_16_16_EDRAM, kUncompressed, 1, 1, 64),
|
||||||
|
FORMAT_INFO(k_24_8, kUncompressed, 1, 1, 32),
|
||||||
|
FORMAT_INFO(k_24_8_FLOAT, kUncompressed, 1, 1, 32),
|
||||||
|
FORMAT_INFO(k_16, kResolvable, 1, 1, 16),
|
||||||
|
FORMAT_INFO(k_16_16, kResolvable, 1, 1, 32),
|
||||||
|
FORMAT_INFO(k_16_16_16_16, kResolvable, 1, 1, 64),
|
||||||
|
FORMAT_INFO(k_16_EXPAND, kUncompressed, 1, 1, 16),
|
||||||
|
FORMAT_INFO(k_16_16_EXPAND, kUncompressed, 1, 1, 32),
|
||||||
|
FORMAT_INFO(k_16_16_16_16_EXPAND, kUncompressed, 1, 1, 64),
|
||||||
|
FORMAT_INFO(k_16_FLOAT, kResolvable, 1, 1, 16),
|
||||||
|
FORMAT_INFO(k_16_16_FLOAT, kResolvable, 1, 1, 32),
|
||||||
|
FORMAT_INFO(k_16_16_16_16_FLOAT, kResolvable, 1, 1, 64),
|
||||||
|
FORMAT_INFO(k_32, kUncompressed, 1, 1, 32),
|
||||||
|
FORMAT_INFO(k_32_32, kUncompressed, 1, 1, 64),
|
||||||
|
FORMAT_INFO(k_32_32_32_32, kUncompressed, 1, 1, 128),
|
||||||
|
FORMAT_INFO(k_32_FLOAT, kResolvable, 1, 1, 32),
|
||||||
|
FORMAT_INFO(k_32_32_FLOAT, kResolvable, 1, 1, 64),
|
||||||
|
FORMAT_INFO(k_32_32_32_32_FLOAT, kResolvable, 1, 1, 128),
|
||||||
|
FORMAT_INFO(k_32_AS_8, kCompressed, 4, 1, 8),
|
||||||
|
FORMAT_INFO(k_32_AS_8_8, kCompressed, 2, 1, 16),
|
||||||
|
FORMAT_INFO(k_16_MPEG, kUncompressed, 1, 1, 16),
|
||||||
|
FORMAT_INFO(k_16_16_MPEG, kUncompressed, 1, 1, 32),
|
||||||
|
FORMAT_INFO(k_8_INTERLACED, kUncompressed, 1, 1, 8),
|
||||||
|
FORMAT_INFO(k_32_AS_8_INTERLACED, kCompressed, 4, 1, 8),
|
||||||
|
FORMAT_INFO(k_32_AS_8_8_INTERLACED, kCompressed, 1, 1, 16),
|
||||||
|
FORMAT_INFO(k_16_INTERLACED, kUncompressed, 1, 1, 16),
|
||||||
|
FORMAT_INFO(k_16_MPEG_INTERLACED, kUncompressed, 1, 1, 16),
|
||||||
|
FORMAT_INFO(k_16_16_MPEG_INTERLACED, kUncompressed, 1, 1, 32),
|
||||||
|
FORMAT_INFO(k_DXN, kCompressed, 4, 4, 8),
|
||||||
|
FORMAT_INFO(k_8_8_8_8_AS_16_16_16_16, kResolvable, 1, 1, 32),
|
||||||
|
FORMAT_INFO(k_DXT1_AS_16_16_16_16, kCompressed, 4, 4, 4),
|
||||||
|
FORMAT_INFO(k_DXT2_3_AS_16_16_16_16, kCompressed, 4, 4, 8),
|
||||||
|
FORMAT_INFO(k_DXT4_5_AS_16_16_16_16, kCompressed, 4, 4, 8),
|
||||||
|
FORMAT_INFO(k_2_10_10_10_AS_16_16_16_16, kResolvable, 1, 1, 32),
|
||||||
|
FORMAT_INFO(k_10_11_11_AS_16_16_16_16, kResolvable, 1, 1, 32),
|
||||||
|
FORMAT_INFO(k_11_11_10_AS_16_16_16_16, kResolvable, 1, 1, 32),
|
||||||
|
FORMAT_INFO(k_32_32_32_FLOAT, kUncompressed, 1, 1, 96),
|
||||||
|
FORMAT_INFO(k_DXT3A, kCompressed, 4, 4, 4),
|
||||||
|
FORMAT_INFO(k_DXT5A, kCompressed, 4, 4, 4),
|
||||||
|
FORMAT_INFO(k_CTX1, kCompressed, 4, 4, 4),
|
||||||
|
FORMAT_INFO(k_DXT3A_AS_1_1_1_1, kCompressed, 4, 4, 4),
|
||||||
|
FORMAT_INFO(k_8_8_8_8_GAMMA_EDRAM, kUncompressed, 1, 1, 32),
|
||||||
|
FORMAT_INFO(k_2_10_10_10_FLOAT_EDRAM, kUncompressed, 1, 1, 32),
|
|
@ -199,9 +199,8 @@ bool GetPackedMipOffset(uint32_t width, uint32_t height, uint32_t depth,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
const FormatInfo* format_info = FormatInfo::Get(format);
|
x_blocks >>= FormatInfo::GetWidthShift(format);
|
||||||
x_blocks /= format_info->block_width;
|
y_blocks >>= FormatInfo::GetHeightShift(format);
|
||||||
y_blocks /= format_info->block_height;
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -273,9 +272,10 @@ TextureGuestLayout GetGuestTextureLayout(
|
||||||
}
|
}
|
||||||
layout.mips_total_extent_bytes = 0;
|
layout.mips_total_extent_bytes = 0;
|
||||||
|
|
||||||
const FormatInfo* format_info = FormatInfo::Get(format);
|
const FormatInfo* const format_info = FormatInfo::Get(format);
|
||||||
uint32_t bytes_per_block = format_info->bytes_per_block();
|
const uint32_t bytes_per_block = format_info->bytes_per_block();
|
||||||
|
const unsigned char block_width_sh = FormatInfo::GetWidthShift(format);
|
||||||
|
const unsigned char block_height_sh = FormatInfo::GetHeightShift(format);
|
||||||
// The loop counter can mean two things depending on whether the packed mip
|
// The loop counter can mean two things depending on whether the packed mip
|
||||||
// tail is stored as mip 0, because in this case, it would be ambiguous since
|
// tail is stored as mip 0, because in this case, it would be ambiguous since
|
||||||
// both the base and the mips would be on "level 0", but stored separately and
|
// both the base and the mips would be on "level 0", but stored separately and
|
||||||
|
@ -320,9 +320,12 @@ TextureGuestLayout GetGuestTextureLayout(
|
||||||
z_slice_stride_texel_rows_unaligned =
|
z_slice_stride_texel_rows_unaligned =
|
||||||
std::max(xe::next_pow2(height_texels) >> level, uint32_t(1));
|
std::max(xe::next_pow2(height_texels) >> level, uint32_t(1));
|
||||||
}
|
}
|
||||||
uint32_t row_pitch_blocks_tile_aligned = xe::align(
|
// maybe do 1 << block_width_sh instead of format_info->block_width, since
|
||||||
xe::align(row_pitch_texels_unaligned, format_info->block_width) /
|
// we'll have cl loaded with the shift anyway
|
||||||
format_info->block_width,
|
uint32_t row_pitch_blocks_tile_aligned =
|
||||||
|
xe::align(xe::align<uint32_t>(row_pitch_texels_unaligned,
|
||||||
|
format_info->block_width) >>
|
||||||
|
block_width_sh,
|
||||||
xenos::kTextureTileWidthHeight);
|
xenos::kTextureTileWidthHeight);
|
||||||
level_layout.row_pitch_bytes =
|
level_layout.row_pitch_bytes =
|
||||||
row_pitch_blocks_tile_aligned * bytes_per_block;
|
row_pitch_blocks_tile_aligned * bytes_per_block;
|
||||||
|
@ -335,9 +338,10 @@ TextureGuestLayout GetGuestTextureLayout(
|
||||||
}
|
}
|
||||||
level_layout.z_slice_stride_block_rows =
|
level_layout.z_slice_stride_block_rows =
|
||||||
dimension != xenos::DataDimension::k1D
|
dimension != xenos::DataDimension::k1D
|
||||||
? xe::align(xe::align(z_slice_stride_texel_rows_unaligned,
|
? xe::align<uint32_t>(
|
||||||
format_info->block_height) /
|
xe::align<uint32_t>(z_slice_stride_texel_rows_unaligned,
|
||||||
format_info->block_height,
|
format_info->block_height) >>
|
||||||
|
block_height_sh,
|
||||||
xenos::kTextureTileWidthHeight)
|
xenos::kTextureTileWidthHeight)
|
||||||
: 1;
|
: 1;
|
||||||
level_layout.array_slice_stride_bytes =
|
level_layout.array_slice_stride_bytes =
|
||||||
|
@ -358,13 +362,13 @@ TextureGuestLayout GetGuestTextureLayout(
|
||||||
// the stride. For tiled textures, this is the dimensions aligned to 32x32x4
|
// the stride. For tiled textures, this is the dimensions aligned to 32x32x4
|
||||||
// blocks (or x1 for the missing dimensions).
|
// blocks (or x1 for the missing dimensions).
|
||||||
uint32_t level_width_blocks =
|
uint32_t level_width_blocks =
|
||||||
xe::align(std::max(width_texels >> level, uint32_t(1)),
|
xe::align<uint32_t>(std::max(width_texels >> level, uint32_t(1)),
|
||||||
format_info->block_width) /
|
format_info->block_width) >>
|
||||||
format_info->block_width;
|
block_width_sh;
|
||||||
uint32_t level_height_blocks =
|
uint32_t level_height_blocks =
|
||||||
xe::align(std::max(height_texels >> level, uint32_t(1)),
|
xe::align<uint32_t>(std::max(height_texels >> level, uint32_t(1)),
|
||||||
format_info->block_height) /
|
format_info->block_height) >>
|
||||||
format_info->block_height;
|
block_height_sh;
|
||||||
uint32_t level_depth = std::max(depth >> level, uint32_t(1));
|
uint32_t level_depth = std::max(depth >> level, uint32_t(1));
|
||||||
if (is_tiled) {
|
if (is_tiled) {
|
||||||
level_layout.x_extent_blocks =
|
level_layout.x_extent_blocks =
|
||||||
|
@ -415,20 +419,20 @@ TextureGuestLayout GetGuestTextureLayout(
|
||||||
GetPackedMipOffset(width_texels, height_texels, depth, format,
|
GetPackedMipOffset(width_texels, height_texels, depth, format,
|
||||||
packed_sublevel, packed_sublevel_x_blocks,
|
packed_sublevel, packed_sublevel_x_blocks,
|
||||||
packed_sublevel_y_blocks, packed_sublevel_z);
|
packed_sublevel_y_blocks, packed_sublevel_z);
|
||||||
level_layout.x_extent_blocks = std::max(
|
level_layout.x_extent_blocks = std::max<uint32_t>(
|
||||||
level_layout.x_extent_blocks,
|
level_layout.x_extent_blocks,
|
||||||
packed_sublevel_x_blocks +
|
packed_sublevel_x_blocks +
|
||||||
xe::align(
|
(xe::align<uint32_t>(
|
||||||
std::max(width_texels >> packed_sublevel, uint32_t(1)),
|
std::max(width_texels >> packed_sublevel, uint32_t(1)),
|
||||||
format_info->block_width) /
|
format_info->block_width) >>
|
||||||
format_info->block_width);
|
block_width_sh));
|
||||||
level_layout.y_extent_blocks = std::max(
|
level_layout.y_extent_blocks = std::max<uint32_t>(
|
||||||
level_layout.y_extent_blocks,
|
level_layout.y_extent_blocks,
|
||||||
packed_sublevel_y_blocks +
|
packed_sublevel_y_blocks +
|
||||||
xe::align(
|
(xe::align<uint32_t>(
|
||||||
std::max(height_texels >> packed_sublevel, uint32_t(1)),
|
std::max(height_texels >> packed_sublevel, uint32_t(1)),
|
||||||
format_info->block_height) /
|
format_info->block_height) >>
|
||||||
format_info->block_height);
|
block_height_sh));
|
||||||
level_layout.z_extent =
|
level_layout.z_extent =
|
||||||
std::max(level_layout.z_extent,
|
std::max(level_layout.z_extent,
|
||||||
packed_sublevel_z +
|
packed_sublevel_z +
|
||||||
|
|
|
@ -743,7 +743,7 @@ void TraceViewer::DrawTextureInfo(
|
||||||
ImGui::NextColumn();
|
ImGui::NextColumn();
|
||||||
ImGui::Text("Fetch Slot: %u", texture_binding.fetch_constant);
|
ImGui::Text("Fetch Slot: %u", texture_binding.fetch_constant);
|
||||||
ImGui::Text("Guest Address: %.8X", texture_info.memory.base_address);
|
ImGui::Text("Guest Address: %.8X", texture_info.memory.base_address);
|
||||||
ImGui::Text("Format: %s", texture_info.format_info()->name);
|
ImGui::Text("Format: %s", texture_info.format_name());
|
||||||
switch (texture_info.dimension) {
|
switch (texture_info.dimension) {
|
||||||
case xenos::DataDimension::k1D:
|
case xenos::DataDimension::k1D:
|
||||||
ImGui::Text("1D: %dpx", texture_info.width + 1);
|
ImGui::Text("1D: %dpx", texture_info.width + 1);
|
||||||
|
|
|
@ -32,10 +32,11 @@
|
||||||
#include "xenia/gpu/vulkan/vulkan_shader.h"
|
#include "xenia/gpu/vulkan/vulkan_shader.h"
|
||||||
#include "xenia/gpu/vulkan/vulkan_shared_memory.h"
|
#include "xenia/gpu/vulkan/vulkan_shared_memory.h"
|
||||||
#include "xenia/gpu/xenos.h"
|
#include "xenia/gpu/xenos.h"
|
||||||
|
#include "xenia/kernel/kernel_state.h"
|
||||||
|
#include "xenia/kernel/user_module.h"
|
||||||
#include "xenia/ui/vulkan/vulkan_presenter.h"
|
#include "xenia/ui/vulkan/vulkan_presenter.h"
|
||||||
#include "xenia/ui/vulkan/vulkan_provider.h"
|
#include "xenia/ui/vulkan/vulkan_provider.h"
|
||||||
#include "xenia/ui/vulkan/vulkan_util.h"
|
#include "xenia/ui/vulkan/vulkan_util.h"
|
||||||
|
|
||||||
namespace xe {
|
namespace xe {
|
||||||
namespace gpu {
|
namespace gpu {
|
||||||
namespace vulkan {
|
namespace vulkan {
|
||||||
|
@ -4171,6 +4172,8 @@ uint32_t VulkanCommandProcessor::WriteTransientTextureBindings(
|
||||||
return descriptor_set_write_count;
|
return descriptor_set_write_count;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#define COMMAND_PROCESSOR VulkanCommandProcessor
|
||||||
|
#include "../pm4_command_processor_implement.h"
|
||||||
} // namespace vulkan
|
} // namespace vulkan
|
||||||
} // namespace gpu
|
} // namespace gpu
|
||||||
} // namespace xe
|
} // namespace xe
|
||||||
|
|
|
@ -53,6 +53,7 @@ class VulkanCommandProcessor final : public CommandProcessor {
|
||||||
kStorageBufferCompute,
|
kStorageBufferCompute,
|
||||||
kCount,
|
kCount,
|
||||||
};
|
};
|
||||||
|
#include "../pm4_command_processor_declare.h"
|
||||||
|
|
||||||
class ScratchBufferAcquisition {
|
class ScratchBufferAcquisition {
|
||||||
public:
|
public:
|
||||||
|
|
|
@ -2020,7 +2020,7 @@ bool VulkanTextureCache::Initialize() {
|
||||||
// Log which formats are not supported or supported via fallbacks.
|
// Log which formats are not supported or supported via fallbacks.
|
||||||
const HostFormatPair& best_host_format = kBestHostFormats[i];
|
const HostFormatPair& best_host_format = kBestHostFormats[i];
|
||||||
const char* guest_format_name =
|
const char* guest_format_name =
|
||||||
FormatInfo::Get(xenos::TextureFormat(i))->name;
|
FormatInfo::GetName(xenos::TextureFormat(i));
|
||||||
if (best_host_format.format_unsigned.format != VK_FORMAT_UNDEFINED) {
|
if (best_host_format.format_unsigned.format != VK_FORMAT_UNDEFINED) {
|
||||||
assert_not_null(guest_format_name);
|
assert_not_null(guest_format_name);
|
||||||
if (host_format.format_unsigned.format != VK_FORMAT_UNDEFINED) {
|
if (host_format.format_unsigned.format != VK_FORMAT_UNDEFINED) {
|
||||||
|
|
|
@ -1045,8 +1045,9 @@ inline uint16_t GpuSwap(uint16_t value, Endian endianness) {
|
||||||
return value;
|
return value;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
XE_NOINLINE
|
||||||
inline uint32_t GpuSwap(uint32_t value, Endian endianness) {
|
XE_NOALIAS
|
||||||
|
static uint32_t GpuSwap(uint32_t value, Endian endianness) {
|
||||||
switch (endianness) {
|
switch (endianness) {
|
||||||
default:
|
default:
|
||||||
case Endian::kNone:
|
case Endian::kNone:
|
||||||
|
|
|
@ -511,7 +511,8 @@ template <size_t I = 0, typename... Ps>
|
||||||
StringBuffer* thread_local_string_buffer();
|
StringBuffer* thread_local_string_buffer();
|
||||||
|
|
||||||
template <typename Tuple>
|
template <typename Tuple>
|
||||||
void PrintKernelCall(cpu::Export* export_entry, const Tuple& params) {
|
XE_NOALIAS void PrintKernelCall(cpu::Export* export_entry,
|
||||||
|
const Tuple& params) {
|
||||||
auto& string_buffer = *thread_local_string_buffer();
|
auto& string_buffer = *thread_local_string_buffer();
|
||||||
string_buffer.Reset();
|
string_buffer.Reset();
|
||||||
string_buffer.Append(export_entry->name);
|
string_buffer.Append(export_entry->name);
|
||||||
|
@ -526,24 +527,31 @@ void PrintKernelCall(cpu::Export* export_entry, const Tuple& params) {
|
||||||
string_buffer.to_string_view());
|
string_buffer.to_string_view());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
/*
|
||||||
|
todo: need faster string formatting/concatenation (all arguments are
|
||||||
|
always turned into strings except if kHighFrequency)
|
||||||
|
|
||||||
|
*/
|
||||||
template <typename F, typename Tuple, std::size_t... I>
|
template <typename F, typename Tuple, std::size_t... I>
|
||||||
auto KernelTrampoline(F&& f, Tuple&& t, std::index_sequence<I...>) {
|
XE_FORCEINLINE static auto KernelTrampoline(F&& f, Tuple&& t,
|
||||||
|
std::index_sequence<I...>) {
|
||||||
return std::forward<F>(f)(std::get<I>(std::forward<Tuple>(t))...);
|
return std::forward<F>(f)(std::get<I>(std::forward<Tuple>(t))...);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <KernelModuleId MODULE, uint16_t ORDINAL, typename R, typename... Ps>
|
template <KernelModuleId MODULE, uint16_t ORDINAL, typename R, typename... Ps>
|
||||||
xe::cpu::Export* RegisterExport(R (*fn)(Ps&...), const char* name,
|
struct ExportRegistrerHelper {
|
||||||
xe::cpu::ExportTag::type tags) {
|
template <R (*fn)(Ps&...), xe::cpu::ExportTag::type tags>
|
||||||
|
static xe::cpu::Export* RegisterExport(const char* name) {
|
||||||
static_assert(
|
static_assert(
|
||||||
std::is_void<R>::value || std::is_base_of<shim::Result, R>::value,
|
std::is_void<R>::value || std::is_base_of<shim::Result, R>::value,
|
||||||
"R must be void or derive from shim::Result");
|
"R must be void or derive from shim::Result");
|
||||||
static_assert((std::is_base_of_v<shim::Param, Ps> && ...),
|
static_assert((std::is_base_of_v<shim::Param, Ps> && ...),
|
||||||
"Ps must derive from shim::Param");
|
"Ps must derive from shim::Param");
|
||||||
static const auto export_entry = new cpu::Export(
|
constexpr auto TAGS =
|
||||||
ORDINAL, xe::cpu::Export::Type::kFunction, name,
|
tags | xe::cpu::ExportTag::kImplemented | xe::cpu::ExportTag::kLog;
|
||||||
tags | xe::cpu::ExportTag::kImplemented | xe::cpu::ExportTag::kLog);
|
|
||||||
static R (*FN)(Ps & ...) = fn;
|
static const auto export_entry =
|
||||||
|
new cpu::Export(ORDINAL, xe::cpu::Export::Type::kFunction, name, TAGS);
|
||||||
struct X {
|
struct X {
|
||||||
static void Trampoline(PPCContext* ppc_context) {
|
static void Trampoline(PPCContext* ppc_context) {
|
||||||
++export_entry->function_data.call_count;
|
++export_entry->function_data.call_count;
|
||||||
|
@ -556,28 +564,52 @@ xe::cpu::Export* RegisterExport(R (*fn)(Ps&...), const char* name,
|
||||||
// The make_tuple order is undefined per the C++ standard and
|
// The make_tuple order is undefined per the C++ standard and
|
||||||
// cause inconsitencies between msvc and clang.
|
// cause inconsitencies between msvc and clang.
|
||||||
std::tuple<Ps...> params = {Ps(init)...};
|
std::tuple<Ps...> params = {Ps(init)...};
|
||||||
if (export_entry->tags & xe::cpu::ExportTag::kLog &&
|
if (TAGS & xe::cpu::ExportTag::kLog &&
|
||||||
(!(export_entry->tags & xe::cpu::ExportTag::kHighFrequency) ||
|
(!(TAGS & xe::cpu::ExportTag::kHighFrequency) ||
|
||||||
cvars::log_high_frequency_kernel_calls)) {
|
cvars::log_high_frequency_kernel_calls)) {
|
||||||
PrintKernelCall(export_entry, params);
|
PrintKernelCall(export_entry, params);
|
||||||
}
|
}
|
||||||
if constexpr (std::is_void<R>::value) {
|
if constexpr (std::is_void<R>::value) {
|
||||||
KernelTrampoline(FN, std::forward<std::tuple<Ps...>>(params),
|
KernelTrampoline(fn, std::forward<std::tuple<Ps...>>(params),
|
||||||
std::make_index_sequence<sizeof...(Ps)>());
|
std::make_index_sequence<sizeof...(Ps)>());
|
||||||
} else {
|
} else {
|
||||||
auto result =
|
auto result =
|
||||||
KernelTrampoline(FN, std::forward<std::tuple<Ps...>>(params),
|
KernelTrampoline(fn, std::forward<std::tuple<Ps...>>(params),
|
||||||
std::make_index_sequence<sizeof...(Ps)>());
|
std::make_index_sequence<sizeof...(Ps)>());
|
||||||
result.Store(ppc_context);
|
result.Store(ppc_context);
|
||||||
if (export_entry->tags &
|
if (TAGS &
|
||||||
(xe::cpu::ExportTag::kLog | xe::cpu::ExportTag::kLogResult)) {
|
(xe::cpu::ExportTag::kLog | xe::cpu::ExportTag::kLogResult)) {
|
||||||
// TODO(benvanik): log result.
|
// TODO(benvanik): log result.
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
struct Y {
|
||||||
|
static void Trampoline(PPCContext* ppc_context) {
|
||||||
|
Param::Init init = {
|
||||||
|
ppc_context,
|
||||||
|
0,
|
||||||
|
};
|
||||||
|
std::tuple<Ps...> params = {Ps(init)...};
|
||||||
|
if constexpr (std::is_void<R>::value) {
|
||||||
|
KernelTrampoline(fn, std::forward<std::tuple<Ps...>>(params),
|
||||||
|
std::make_index_sequence<sizeof...(Ps)>());
|
||||||
|
} else {
|
||||||
|
auto result =
|
||||||
|
KernelTrampoline(fn, std::forward<std::tuple<Ps...>>(params),
|
||||||
|
std::make_index_sequence<sizeof...(Ps)>());
|
||||||
|
result.Store(ppc_context);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
export_entry->function_data.trampoline = &X::Trampoline;
|
export_entry->function_data.trampoline = &X::Trampoline;
|
||||||
return export_entry;
|
return export_entry;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
template <KernelModuleId MODULE, uint16_t ORDINAL, typename R, typename... Ps>
|
||||||
|
auto GetRegister(R (*fngetter)(Ps&...)) {
|
||||||
|
return static_cast<ExportRegistrerHelper<MODULE, ORDINAL, R, Ps...>*>(
|
||||||
|
nullptr);
|
||||||
}
|
}
|
||||||
|
|
||||||
} // namespace shim
|
} // namespace shim
|
||||||
|
@ -585,13 +617,17 @@ xe::cpu::Export* RegisterExport(R (*fn)(Ps&...), const char* name,
|
||||||
using xe::cpu::ExportTag;
|
using xe::cpu::ExportTag;
|
||||||
|
|
||||||
#define DECLARE_EXPORT(module_name, name, category, tags) \
|
#define DECLARE_EXPORT(module_name, name, category, tags) \
|
||||||
|
using _register_##module_name##_##name = \
|
||||||
|
std::remove_cv_t<std::remove_reference_t< \
|
||||||
|
decltype(*xe::kernel::shim::GetRegister< \
|
||||||
|
xe::kernel::shim::KernelModuleId::module_name, \
|
||||||
|
ordinals::name>(&name##_entry))>>; \
|
||||||
const auto EXPORT_##module_name##_##name = RegisterExport_##module_name( \
|
const auto EXPORT_##module_name##_##name = RegisterExport_##module_name( \
|
||||||
xe::kernel::shim::RegisterExport< \
|
_register_##module_name##_##name ::RegisterExport< \
|
||||||
xe::kernel::shim::KernelModuleId::module_name, ordinals::name>( \
|
&name##_entry, tags | (static_cast<xe::cpu::ExportTag::type>( \
|
||||||
&name##_entry, #name, \
|
|
||||||
tags | (static_cast<xe::cpu::ExportTag::type>( \
|
|
||||||
xe::cpu::ExportCategory::category) \
|
xe::cpu::ExportCategory::category) \
|
||||||
<< xe::cpu::ExportTag::CategoryShift)));
|
<< xe::cpu::ExportTag::CategoryShift)>( \
|
||||||
|
#name));
|
||||||
|
|
||||||
#define DECLARE_EMPTY_REGISTER_EXPORTS(module_name, group_name) \
|
#define DECLARE_EMPTY_REGISTER_EXPORTS(module_name, group_name) \
|
||||||
void xe::kernel::module_name::Register##group_name##Exports( \
|
void xe::kernel::module_name::Register##group_name##Exports( \
|
||||||
|
|
|
@ -316,8 +316,46 @@ void Memory::Reset() {
|
||||||
heaps_.v90000000.Reset();
|
heaps_.v90000000.Reset();
|
||||||
heaps_.physical.Reset();
|
heaps_.physical.Reset();
|
||||||
}
|
}
|
||||||
|
XE_NOALIAS
|
||||||
const BaseHeap* Memory::LookupHeap(uint32_t address) const {
|
const BaseHeap* Memory::LookupHeap(uint32_t address) const {
|
||||||
|
#if 1
|
||||||
|
#define HEAP_INDEX(name) \
|
||||||
|
offsetof(Memory, heaps_.name) - offsetof(Memory, heaps_)
|
||||||
|
|
||||||
|
const char* heap_select = (const char*)&this->heaps_;
|
||||||
|
|
||||||
|
unsigned selected_heap_offset = 0;
|
||||||
|
unsigned high_nibble = address >> 28;
|
||||||
|
|
||||||
|
if (high_nibble < 0x4) {
|
||||||
|
selected_heap_offset = HEAP_INDEX(v00000000);
|
||||||
|
} else if (address < 0x7F000000) {
|
||||||
|
selected_heap_offset = HEAP_INDEX(v40000000);
|
||||||
|
} else if (high_nibble < 0x8) {
|
||||||
|
heap_select = nullptr;
|
||||||
|
// return nullptr;
|
||||||
|
} else if (high_nibble < 0x9) {
|
||||||
|
selected_heap_offset = HEAP_INDEX(v80000000);
|
||||||
|
// return &heaps_.v80000000;
|
||||||
|
} else if (high_nibble < 0xA) {
|
||||||
|
// return &heaps_.v90000000;
|
||||||
|
selected_heap_offset = HEAP_INDEX(v90000000);
|
||||||
|
} else if (high_nibble < 0xC) {
|
||||||
|
// return &heaps_.vA0000000;
|
||||||
|
selected_heap_offset = HEAP_INDEX(vA0000000);
|
||||||
|
} else if (high_nibble < 0xE) {
|
||||||
|
// return &heaps_.vC0000000;
|
||||||
|
selected_heap_offset = HEAP_INDEX(vC0000000);
|
||||||
|
} else if (address < 0xFFD00000) {
|
||||||
|
// return &heaps_.vE0000000;
|
||||||
|
selected_heap_offset = HEAP_INDEX(vE0000000);
|
||||||
|
} else {
|
||||||
|
// return nullptr;
|
||||||
|
heap_select = nullptr;
|
||||||
|
}
|
||||||
|
return reinterpret_cast<const BaseHeap*>(selected_heap_offset + heap_select);
|
||||||
|
|
||||||
|
#else
|
||||||
if (address < 0x40000000) {
|
if (address < 0x40000000) {
|
||||||
return &heaps_.v00000000;
|
return &heaps_.v00000000;
|
||||||
} else if (address < 0x7F000000) {
|
} else if (address < 0x7F000000) {
|
||||||
|
@ -337,6 +375,7 @@ const BaseHeap* Memory::LookupHeap(uint32_t address) const {
|
||||||
} else {
|
} else {
|
||||||
return nullptr;
|
return nullptr;
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
BaseHeap* Memory::LookupHeapByType(bool physical, uint32_t page_size) {
|
BaseHeap* Memory::LookupHeapByType(bool physical, uint32_t page_size) {
|
||||||
|
@ -465,8 +504,8 @@ cpu::MMIORange* Memory::LookupVirtualMappedRange(uint32_t virtual_address) {
|
||||||
}
|
}
|
||||||
|
|
||||||
bool Memory::AccessViolationCallback(
|
bool Memory::AccessViolationCallback(
|
||||||
global_unique_lock_type global_lock_locked_once,
|
global_unique_lock_type global_lock_locked_once, void* host_address,
|
||||||
void* host_address, bool is_write) {
|
bool is_write) {
|
||||||
// Access via physical_membase_ is special, when need to bypass everything
|
// Access via physical_membase_ is special, when need to bypass everything
|
||||||
// (for instance, for a data provider to actually write the data) so only
|
// (for instance, for a data provider to actually write the data) so only
|
||||||
// triggering callbacks on virtual memory regions.
|
// triggering callbacks on virtual memory regions.
|
||||||
|
@ -493,16 +532,15 @@ bool Memory::AccessViolationCallback(
|
||||||
}
|
}
|
||||||
|
|
||||||
bool Memory::AccessViolationCallbackThunk(
|
bool Memory::AccessViolationCallbackThunk(
|
||||||
global_unique_lock_type global_lock_locked_once,
|
global_unique_lock_type global_lock_locked_once, void* context,
|
||||||
void* context, void* host_address, bool is_write) {
|
void* host_address, bool is_write) {
|
||||||
return reinterpret_cast<Memory*>(context)->AccessViolationCallback(
|
return reinterpret_cast<Memory*>(context)->AccessViolationCallback(
|
||||||
std::move(global_lock_locked_once), host_address, is_write);
|
std::move(global_lock_locked_once), host_address, is_write);
|
||||||
}
|
}
|
||||||
|
|
||||||
bool Memory::TriggerPhysicalMemoryCallbacks(
|
bool Memory::TriggerPhysicalMemoryCallbacks(
|
||||||
global_unique_lock_type global_lock_locked_once,
|
global_unique_lock_type global_lock_locked_once, uint32_t virtual_address,
|
||||||
uint32_t virtual_address, uint32_t length, bool is_write,
|
uint32_t length, bool is_write, bool unwatch_exact_range, bool unprotect) {
|
||||||
bool unwatch_exact_range, bool unprotect) {
|
|
||||||
BaseHeap* heap = LookupHeap(virtual_address);
|
BaseHeap* heap = LookupHeap(virtual_address);
|
||||||
if (heap->heap_type() == HeapType::kGuestPhysical) {
|
if (heap->heap_type() == HeapType::kGuestPhysical) {
|
||||||
auto physical_heap = static_cast<PhysicalHeap*>(heap);
|
auto physical_heap = static_cast<PhysicalHeap*>(heap);
|
||||||
|
@ -1711,9 +1749,8 @@ void PhysicalHeap::EnableAccessCallbacks(uint32_t physical_address,
|
||||||
}
|
}
|
||||||
|
|
||||||
bool PhysicalHeap::TriggerCallbacks(
|
bool PhysicalHeap::TriggerCallbacks(
|
||||||
global_unique_lock_type global_lock_locked_once,
|
global_unique_lock_type global_lock_locked_once, uint32_t virtual_address,
|
||||||
uint32_t virtual_address, uint32_t length, bool is_write,
|
uint32_t length, bool is_write, bool unwatch_exact_range, bool unprotect) {
|
||||||
bool unwatch_exact_range, bool unprotect) {
|
|
||||||
// TODO(Triang3l): Support read watches.
|
// TODO(Triang3l): Support read watches.
|
||||||
assert_true(is_write);
|
assert_true(is_write);
|
||||||
if (!is_write) {
|
if (!is_write) {
|
||||||
|
|
|
@ -473,8 +473,9 @@ class Memory {
|
||||||
void SystemHeapFree(uint32_t address);
|
void SystemHeapFree(uint32_t address);
|
||||||
|
|
||||||
// Gets the heap for the address space containing the given address.
|
// Gets the heap for the address space containing the given address.
|
||||||
|
XE_NOALIAS
|
||||||
const BaseHeap* LookupHeap(uint32_t address) const;
|
const BaseHeap* LookupHeap(uint32_t address) const;
|
||||||
|
XE_NOALIAS
|
||||||
inline BaseHeap* LookupHeap(uint32_t address) {
|
inline BaseHeap* LookupHeap(uint32_t address) {
|
||||||
return const_cast<BaseHeap*>(
|
return const_cast<BaseHeap*>(
|
||||||
const_cast<const Memory*>(this)->LookupHeap(address));
|
const_cast<const Memory*>(this)->LookupHeap(address));
|
||||||
|
|
|
@ -17,7 +17,7 @@
|
||||||
#include "xenia/base/math.h"
|
#include "xenia/base/math.h"
|
||||||
#include "xenia/ui/d3d12/d3d12_immediate_drawer.h"
|
#include "xenia/ui/d3d12/d3d12_immediate_drawer.h"
|
||||||
#include "xenia/ui/d3d12/d3d12_presenter.h"
|
#include "xenia/ui/d3d12/d3d12_presenter.h"
|
||||||
|
#include "xenia/ui/d3d12/d3d12_util.h"
|
||||||
DEFINE_bool(d3d12_debug, false, "Enable Direct3D 12 and DXGI debug layer.",
|
DEFINE_bool(d3d12_debug, false, "Enable Direct3D 12 and DXGI debug layer.",
|
||||||
"D3D12");
|
"D3D12");
|
||||||
DEFINE_bool(d3d12_break_on_error, false,
|
DEFINE_bool(d3d12_break_on_error, false,
|
||||||
|
@ -35,6 +35,8 @@ DEFINE_int32(
|
||||||
"system responsibility)",
|
"system responsibility)",
|
||||||
"D3D12");
|
"D3D12");
|
||||||
|
|
||||||
|
// Defaults to false. When set, and the adapter is NVIDIA with NVAPI available,
// D3D12Provider::Initialize calls the NVAPI entry point resolved from
// id_NvAPI_D3D12_UseDriverHeapPriorities on the device and logs a message if
// the call returns non-zero.
// NOTE(review): the help text "nvidia stuff" is a placeholder; consider
// describing the flag for users.
DEFINE_bool(d3d12_nvapi_use_driver_heap_priorities, false, "nvidia stuff",
|
||||||
|
"D3D12");
|
||||||
namespace xe {
|
namespace xe {
|
||||||
namespace ui {
|
namespace ui {
|
||||||
namespace d3d12 {
|
namespace d3d12 {
|
||||||
|
@ -61,6 +63,7 @@ std::unique_ptr<D3D12Provider> D3D12Provider::Create() {
|
||||||
"supported GPUs.");
|
"supported GPUs.");
|
||||||
return nullptr;
|
return nullptr;
|
||||||
}
|
}
|
||||||
|
|
||||||
return provider;
|
return provider;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -476,10 +479,69 @@ bool D3D12Provider::Initialize() {
|
||||||
// Get the graphics analysis interface, will silently fail if PIX is not
|
// Get the graphics analysis interface, will silently fail if PIX is not
|
||||||
// attached.
|
// attached.
|
||||||
pfn_dxgi_get_debug_interface1_(0, IID_PPV_ARGS(&graphics_analysis_));
|
pfn_dxgi_get_debug_interface1_(0, IID_PPV_ARGS(&graphics_analysis_));
|
||||||
|
if (GetAdapterVendorID() == ui::GraphicsProvider::GpuVendorID::kNvidia) {
|
||||||
|
nvapi_ = new lightweight_nvapi::nvapi_state_t();
|
||||||
|
if (!nvapi_->is_available()) {
|
||||||
|
delete nvapi_;
|
||||||
|
nvapi_ = nullptr;
|
||||||
|
} else {
|
||||||
|
using namespace lightweight_nvapi;
|
||||||
|
|
||||||
|
nvapi_createcommittedresource_ =
|
||||||
|
(cb_NvAPI_D3D12_CreateCommittedResource)nvapi_->query_interface<void>(
|
||||||
|
id_NvAPI_D3D12_CreateCommittedResource);
|
||||||
|
nvapi_querycpuvisiblevidmem_ =
|
||||||
|
(cb_NvAPI_D3D12_QueryCpuVisibleVidmem)nvapi_->query_interface<void>(
|
||||||
|
id_NvAPI_D3D12_QueryCpuVisibleVidmem);
|
||||||
|
nvapi_usedriverheappriorities_ =
|
||||||
|
(cb_NvAPI_D3D12_UseDriverHeapPriorities)nvapi_->query_interface<void>(
|
||||||
|
id_NvAPI_D3D12_UseDriverHeapPriorities);
|
||||||
|
|
||||||
|
if (nvapi_usedriverheappriorities_) {
|
||||||
|
if (cvars::d3d12_nvapi_use_driver_heap_priorities) {
|
||||||
|
if (nvapi_usedriverheappriorities_(device_) != 0) {
|
||||||
|
XELOGI("Failed to enable driver heap priorities");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
// Creates a committed resource using the upload heap properties
// (util::kHeapPropertiesUpload) and returns it through ppvResource.
//
// If try_create_cpuvisible is set and the NVAPI CreateCommittedResource entry
// point was resolved, creation is first attempted through NVAPI with
// NV_D3D12_RESOURCE_FLAG_CPUVISIBLE_VIDMEM. If that attempt reports failure,
// the regular ID3D12Device::CreateCommittedResource path is used instead.
//
// Returns:
//   UPLOAD_RESULT_CREATE_CPUVISIBLE - the NVAPI path succeeded.
//   UPLOAD_RESULT_CREATE_SUCCESS    - the regular path succeeded.
//   UPLOAD_RESULT_CREATE_FAILED (0) - the resource could not be created.
uint32_t D3D12Provider::CreateUploadResource(
    D3D12_HEAP_FLAGS HeapFlags, _In_ const D3D12_RESOURCE_DESC* pDesc,
    D3D12_RESOURCE_STATES InitialResourceState, REFIID riidResource,
    void** ppvResource, bool try_create_cpuvisible,
    const D3D12_CLEAR_VALUE* pOptimizedClearValue) const {
  auto device = GetDevice();

  if (try_create_cpuvisible && nvapi_createcommittedresource_) {
    lightweight_nvapi::NV_RESOURCE_PARAMS nvrp;
    nvrp.NVResourceFlags =
        lightweight_nvapi::NV_D3D12_RESOURCE_FLAG_CPUVISIBLE_VIDMEM;
    nvrp.version = 0;  // nothing checks the version

    // NVAPI reports success as 0; any other status falls through to the
    // regular Direct3D 12 path below.
    if (nvapi_createcommittedresource_(
            device, &ui::d3d12::util::kHeapPropertiesUpload, HeapFlags, pDesc,
            InitialResourceState, pOptimizedClearValue, &nvrp, riidResource,
            ppvResource, nullptr) != 0) {
      XELOGI(
          "Failed to create CPUVISIBLE_VIDMEM upload resource, will just do "
          "normal CreateCommittedResource");
    } else {
      return UPLOAD_RESULT_CREATE_CPUVISIBLE;
    }
  }
  if (FAILED(device->CreateCommittedResource(
          &ui::d3d12::util::kHeapPropertiesUpload, HeapFlags, pDesc,
          InitialResourceState, pOptimizedClearValue, riidResource,
          ppvResource))) {
    // This helper is generic (it is not specific to the gamma ramp), so the
    // message must not name a particular caller's buffer.
    XELOGE("Failed to create the upload resource");
    return UPLOAD_RESULT_CREATE_FAILED;
  }

  return UPLOAD_RESULT_CREATE_SUCCESS;
}
|
||||||
std::unique_ptr<Presenter> D3D12Provider::CreatePresenter(
|
std::unique_ptr<Presenter> D3D12Provider::CreatePresenter(
|
||||||
Presenter::HostGpuLossCallback host_gpu_loss_callback) {
|
Presenter::HostGpuLossCallback host_gpu_loss_callback) {
|
||||||
return D3D12Presenter::Create(host_gpu_loss_callback, *this);
|
return D3D12Presenter::Create(host_gpu_loss_callback, *this);
|
||||||
|
|
|
@ -14,13 +14,21 @@
|
||||||
|
|
||||||
#include "xenia/ui/d3d12/d3d12_api.h"
|
#include "xenia/ui/d3d12/d3d12_api.h"
|
||||||
#include "xenia/ui/graphics_provider.h"
|
#include "xenia/ui/graphics_provider.h"
|
||||||
|
// chrispy: this is here to prevent clang format from moving d3d12_nvapi above
|
||||||
|
// the headers it depends on
|
||||||
|
#define HEADERFENCE
|
||||||
|
#undef HEADERFENCE
|
||||||
|
#include "xenia/gpu/d3d12/d3d12_nvapi.hpp"
|
||||||
#define XE_UI_D3D12_FINE_GRAINED_DRAW_SCOPES 1
|
#define XE_UI_D3D12_FINE_GRAINED_DRAW_SCOPES 1
|
||||||
|
|
||||||
namespace xe {
|
namespace xe {
|
||||||
namespace ui {
|
namespace ui {
|
||||||
namespace d3d12 {
|
namespace d3d12 {
|
||||||
|
// Result codes returned by D3D12Provider::CreateUploadResource. The failure
// code is 0, so callers can test the result for truthiness.
enum {
|
||||||
|
// The resource could not be created.
UPLOAD_RESULT_CREATE_FAILED = 0,
|
||||||
|
// Created through the regular ID3D12Device::CreateCommittedResource path.
UPLOAD_RESULT_CREATE_SUCCESS = 1,
|
||||||
|
// Created through NVAPI with NV_D3D12_RESOURCE_FLAG_CPUVISIBLE_VIDMEM.
UPLOAD_RESULT_CREATE_CPUVISIBLE = 2
|
||||||
|
};
|
||||||
class D3D12Provider : public GraphicsProvider {
|
class D3D12Provider : public GraphicsProvider {
|
||||||
public:
|
public:
|
||||||
~D3D12Provider();
|
~D3D12Provider();
|
||||||
|
@ -34,6 +42,11 @@ class D3D12Provider : public GraphicsProvider {
|
||||||
Presenter::FatalErrorHostGpuLossCallback) override;
|
Presenter::FatalErrorHostGpuLossCallback) override;
|
||||||
|
|
||||||
std::unique_ptr<ImmediateDrawer> CreateImmediateDrawer() override;
|
std::unique_ptr<ImmediateDrawer> CreateImmediateDrawer() override;
|
||||||
|
uint32_t CreateUploadResource(
|
||||||
|
D3D12_HEAP_FLAGS HeapFlags, _In_ const D3D12_RESOURCE_DESC* pDesc,
|
||||||
|
D3D12_RESOURCE_STATES InitialResourceState, REFIID riidResource,
|
||||||
|
void** ppvResource, bool try_create_cpuvisible = false,
|
||||||
|
const D3D12_CLEAR_VALUE* pOptimizedClearValue = nullptr) const;
|
||||||
|
|
||||||
IDXGIFactory2* GetDXGIFactory() const { return dxgi_factory_; }
|
IDXGIFactory2* GetDXGIFactory() const { return dxgi_factory_; }
|
||||||
// nullptr if PIX not attached.
|
// nullptr if PIX not attached.
|
||||||
|
@ -193,6 +206,14 @@ class D3D12Provider : public GraphicsProvider {
|
||||||
bool ps_specified_stencil_reference_supported_;
|
bool ps_specified_stencil_reference_supported_;
|
||||||
bool rasterizer_ordered_views_supported_;
|
bool rasterizer_ordered_views_supported_;
|
||||||
bool unaligned_block_textures_supported_;
|
bool unaligned_block_textures_supported_;
|
||||||
|
|
||||||
|
lightweight_nvapi::nvapi_state_t* nvapi_;
|
||||||
|
lightweight_nvapi::cb_NvAPI_D3D12_CreateCommittedResource
|
||||||
|
nvapi_createcommittedresource_ = nullptr;
|
||||||
|
lightweight_nvapi::cb_NvAPI_D3D12_UseDriverHeapPriorities
|
||||||
|
nvapi_usedriverheappriorities_ = nullptr;
|
||||||
|
lightweight_nvapi::cb_NvAPI_D3D12_QueryCpuVisibleVidmem
|
||||||
|
nvapi_querycpuvisiblevidmem_ = nullptr;
|
||||||
};
|
};
|
||||||
|
|
||||||
} // namespace d3d12
|
} // namespace d3d12
|
||||||
|
|
|
@ -81,10 +81,10 @@ D3D12UploadBufferPool::CreatePageImplementation() {
|
||||||
util::FillBufferResourceDesc(buffer_desc, page_size_,
|
util::FillBufferResourceDesc(buffer_desc, page_size_,
|
||||||
D3D12_RESOURCE_FLAG_NONE);
|
D3D12_RESOURCE_FLAG_NONE);
|
||||||
Microsoft::WRL::ComPtr<ID3D12Resource> buffer;
|
Microsoft::WRL::ComPtr<ID3D12Resource> buffer;
|
||||||
if (FAILED(provider_.GetDevice()->CreateCommittedResource(
|
|
||||||
&util::kHeapPropertiesUpload, provider_.GetHeapFlagCreateNotZeroed(),
|
if (!provider_.CreateUploadResource(
|
||||||
&buffer_desc, D3D12_RESOURCE_STATE_GENERIC_READ, nullptr,
|
provider_.GetHeapFlagCreateNotZeroed(), &buffer_desc,
|
||||||
IID_PPV_ARGS(&buffer)))) {
|
D3D12_RESOURCE_STATE_GENERIC_READ, IID_PPV_ARGS(&buffer))) {
|
||||||
XELOGE("Failed to create a D3D upload buffer with {} bytes", page_size_);
|
XELOGE("Failed to create a D3D upload buffer with {} bytes", page_size_);
|
||||||
return nullptr;
|
return nullptr;
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue