Merge pull request #66 from chrisps/canary_experimental
Huge boost to readback_memexport/resolve performance by fixing an old bug; miscellaneous optimizations
commit c1d3e35eb9

@@ -0,0 +1,415 @@
#include "dma.h"

template <size_t N, typename... Ts>
static void xedmaloghelper(const char (&fmt)[N], Ts... args) {
  char buffer[1024];
  sprintf_s(buffer, fmt, args...);
  XELOGI("%s", buffer);
}

//#define XEDMALOG(...) XELOGI("XeDma: " __VA_ARGS__)
//#define XEDMALOG(...) xedmaloghelper("XeDma: " __VA_ARGS__)
#define XEDMALOG(...) static_cast<void>(0)

using xe::swcache::CacheLine;
static constexpr unsigned NUM_CACHELINES_IN_PAGE = 4096 / sizeof(CacheLine);
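
// Copies 16 KiB as four interleaved 4 KiB streams: all four lines are read
// first, then written back with non-temporal stores, so the destination does
// not displace useful cache lines during large readback copies.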
XE_FORCEINLINE
static void XeCopy16384Streaming(CacheLine* XE_RESTRICT to,
                                 CacheLine* XE_RESTRICT from) {
  uint32_t num_lines_for_8k = 4096 / XE_HOST_CACHE_LINE_SIZE;

  CacheLine* dest1 = to;
  CacheLine* src1 = from;

  CacheLine* dest2 = to + NUM_CACHELINES_IN_PAGE;
  CacheLine* src2 = from + NUM_CACHELINES_IN_PAGE;

  CacheLine* dest3 = to + (NUM_CACHELINES_IN_PAGE * 2);
  CacheLine* src3 = from + (NUM_CACHELINES_IN_PAGE * 2);

  CacheLine* dest4 = to + (NUM_CACHELINES_IN_PAGE * 3);
  CacheLine* src4 = from + (NUM_CACHELINES_IN_PAGE * 3);
#pragma loop(no_vector)
  for (uint32_t i = 0; i < num_lines_for_8k; ++i) {
    xe::swcache::CacheLine line0, line1, line2, line3;

    xe::swcache::ReadLine(&line0, src1 + i);
    xe::swcache::ReadLine(&line1, src2 + i);
    xe::swcache::ReadLine(&line2, src3 + i);
    xe::swcache::ReadLine(&line3, src4 + i);
    XE_MSVC_REORDER_BARRIER();
    xe::swcache::WriteLineNT(dest1 + i, &line0);
    xe::swcache::WriteLineNT(dest2 + i, &line1);

    xe::swcache::WriteLineNT(dest3 + i, &line2);
    xe::swcache::WriteLineNT(dest4 + i, &line3);
  }
  XE_MSVC_REORDER_BARRIER();
}

namespace xe::dma {
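// Streams written_length bytes from the guest read mapping to the physical
// mapping in 16 KiB blocks, then copies the remaining whole cache lines in
// pairs; written_length must be a multiple of the cache line size.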
XE_FORCEINLINE
static void vastcpy_impl(CacheLine* XE_RESTRICT physaddr,
                         CacheLine* XE_RESTRICT rdmapping,
                         uint32_t written_length) {
  static constexpr unsigned NUM_LINES_FOR_16K = 16384 / XE_HOST_CACHE_LINE_SIZE;

  while (written_length >= 16384) {
    XeCopy16384Streaming(physaddr, rdmapping);

    physaddr += NUM_LINES_FOR_16K;
    rdmapping += NUM_LINES_FOR_16K;

    written_length -= 16384;
  }

  if (!written_length) {
    return;
  }
  uint32_t num_written_lines = written_length / XE_HOST_CACHE_LINE_SIZE;

  uint32_t i = 0;

  for (; i + 1 < num_written_lines; i += 2) {
    xe::swcache::CacheLine line0, line1;

    xe::swcache::ReadLine(&line0, rdmapping + i);
    xe::swcache::ReadLine(&line1, rdmapping + i + 1);
    XE_MSVC_REORDER_BARRIER();
    xe::swcache::WriteLineNT(physaddr + i, &line0);
    xe::swcache::WriteLineNT(physaddr + i + 1, &line1);
  }

  if (i < num_written_lines) {
    xe::swcache::CacheLine line0;

    xe::swcache::ReadLine(&line0, rdmapping + i);
    xe::swcache::WriteLineNT(physaddr + i, &line0);
  }
}

XE_NOINLINE
void vastcpy(uint8_t* XE_RESTRICT physaddr, uint8_t* XE_RESTRICT rdmapping,
             uint32_t written_length) {
  return vastcpy_impl((CacheLine*)physaddr, (CacheLine*)rdmapping,
                      written_length);
}
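
// Generic software "DMA controller": one scheduler thread accepts submitted
// jobs and fans large copies out to a small pool of worker threads.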
#define XEDMA_NUM_WORKERS 4
class alignas(256) XeDMACGeneric : public XeDMAC {
  struct alignas(XE_HOST_CACHE_LINE_SIZE) {
    std::atomic<uint64_t> free_job_slots_;
    std::atomic<uint64_t> jobs_submitted_;
    std::atomic<uint64_t> jobs_completed_;
    std::atomic<uint32_t> num_workers_awoken_;
    std::atomic<uint32_t> current_job_serial_;
  } dma_volatile_;

  alignas(XE_HOST_CACHE_LINE_SIZE) XeDMAJob jobs_[64];

  volatile uint32_t jobserials_[64];

  alignas(XE_HOST_CACHE_LINE_SIZE)
      std::unique_ptr<threading::Event> job_done_signals_[64];
  // really dont like using unique pointer for this...
  std::unique_ptr<threading::Event> job_submitted_signal_;
  std::unique_ptr<threading::Event> job_completed_signal_;

  std::unique_ptr<threading::Thread> scheduler_thread_;
  struct WorkSlice {
    uint8_t* destination;
    uint8_t* source;
    size_t numbytes;
  };
  std::unique_ptr<threading::Thread> workers_[XEDMA_NUM_WORKERS];
  std::unique_ptr<threading::Event> worker_has_work_;  //[XEDMA_NUM_WORKERS];
  std::unique_ptr<threading::Event> worker_has_finished_[XEDMA_NUM_WORKERS];

  threading::WaitHandle* worker_has_finished_nosafeptr_[XEDMA_NUM_WORKERS];
  WorkSlice worker_workslice_[XEDMA_NUM_WORKERS];

  // chrispy: this is bad
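  // Returns the index of the lowest clear bit in the 64-bit slot-occupancy
  // mask, or ~0U if all 64 job slots are in use.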
  static uint32_t find_free_hole_in_dword(uint64_t dw) {
    XEDMALOG("Finding free hole in 0x%llX", dw);

    for (uint32_t i = 0; i < 64; ++i) {
      if (dw & (1ULL << i)) {
        continue;
      }
      return i;
    }
    return ~0U;
  }

  uint32_t allocate_free_dma_slot() {
    XEDMALOG("Allocating free slot");
    uint32_t got_slot = 0;
    uint64_t slots;
    uint64_t allocated_slot;

    do {
      slots = dma_volatile_.free_job_slots_.load();

      got_slot = find_free_hole_in_dword(slots);
      if (!~got_slot) {
        XEDMALOG("Didn't get a slot!");
        return ~0U;
      }
      allocated_slot = slots | (1ULL << got_slot);
    } while (XE_UNLIKELY(!dma_volatile_.free_job_slots_.compare_exchange_strong(
        slots, allocated_slot)));
    XEDMALOG("Allocated slot %d", got_slot);
    return got_slot;
  }
  // chrispy: on x86 this can just be interlockedbittestandreset...
  void free_dma_slot(uint32_t slot) {
    XEDMALOG("Freeing slot %d", slot);
    uint64_t slots;
    uint64_t deallocated_slot;

    do {
      slots = dma_volatile_.free_job_slots_.load();

      deallocated_slot = slots & (~(1ULL << slot));
    } while (XE_UNLIKELY(!dma_volatile_.free_job_slots_.compare_exchange_strong(
        slots, deallocated_slot)));
  }
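
  // Runs one job on the scheduler thread: small copies are done inline with
  // memcpy, larger ones are sliced evenly across the workers; the sub-line
  // remainder is copied here with __movsb before the workers are woken.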
  void DoDMAJob(uint32_t idx) {
    XeDMAJob& job = jobs_[idx];
    if (job.precall) {
      job.precall(&job);
    }
    // memcpy(job.destination, job.source, job.size);

    size_t job_size = job.size;
    size_t job_num_lines = job_size / XE_HOST_CACHE_LINE_SIZE;
    size_t line_rounded = job_num_lines * XE_HOST_CACHE_LINE_SIZE;
    size_t rem = job_size - line_rounded;
    size_t num_per_worker = line_rounded / XEDMA_NUM_WORKERS;

    XEDMALOG(
        "Distributing %d bytes from %p to %p across %d workers, remainder is "
        "%d",
        line_rounded, job.source, job.destination, XEDMA_NUM_WORKERS, rem);
    if (num_per_worker < 2048) {
      XEDMALOG("not distributing across workers, num_per_worker < 2048");
      // not worth splitting up
      memcpy(job.destination, job.source, job.size);
      job.signal_on_done->Set();
    } else {
      for (uint32_t i = 0; i < XEDMA_NUM_WORKERS; ++i) {
        worker_workslice_[i].destination =
            (i * num_per_worker) + job.destination;
        worker_workslice_[i].source = (i * num_per_worker) + job.source;
        worker_workslice_[i].numbytes = num_per_worker;
      }
      if (rem) {
        __movsb(job.destination + line_rounded, job.source + line_rounded, rem);
      }
      // wake them up
      worker_has_work_->Set();
      XEDMALOG("Starting waitall for job");
      threading::WaitAll(worker_has_finished_nosafeptr_, XEDMA_NUM_WORKERS,
                         false);

      XEDMALOG("Waitall for job completed!");
      job.signal_on_done->Set();
    }
    if (job.postcall) {
      job.postcall(&job);
    }
    ++dma_volatile_.jobs_completed_;
  }

  void WorkerIter(uint32_t worker_index) {
    xenia_assert(worker_index < XEDMA_NUM_WORKERS);
    auto [dest, src, size] = worker_workslice_[worker_index];

    // if (++dma_volatile_.num_workers_awoken_ == XEDMA_NUM_WORKERS ) {
    worker_has_work_->Reset();
    //}
    xenia_assert(size < (1ULL << 32));
    // memcpy(dest, src, size);
    dma::vastcpy(dest, src, static_cast<uint32_t>(size));
  }
  XE_NOINLINE
  void WorkerMainLoop(uint32_t worker_index) {
    do {
      XEDMALOG("Worker iter for worker %d", worker_index);
      WorkerIter(worker_index);

      XEDMALOG("Worker %d is done\n", worker_index);
      threading::SignalAndWait(worker_has_finished_[worker_index].get(),
                               worker_has_work_.get(), false);
    } while (true);
  }
  void WorkerMain(uint32_t worker_index) {
    XEDMALOG("Entered worker main loop, index %d", worker_index);
    threading::Wait(worker_has_work_.get(), false);
    XEDMALOG("First wait for worker %d completed, first job ever",
             worker_index);
    WorkerMainLoop(worker_index);
  }

  static void WorkerMainForwarder(void* ptr) {
    // we aligned XeDma to 256 bytes and encode extra info in the low 8
    uintptr_t uptr = (uintptr_t)ptr;

    uint32_t worker_index = (uint8_t)uptr;

    uptr &= ~0xFFULL;

    char name_buffer[64];
    sprintf_s(name_buffer, "dma_worker_%d", worker_index);

    xe::threading::set_name(name_buffer);

    reinterpret_cast<XeDMACGeneric*>(uptr)->WorkerMain(worker_index);
  }

  void DMAMain() {
    XEDMALOG("DmaMain");
    do {
      threading::Wait(job_submitted_signal_.get(), false);

      auto slots = dma_volatile_.free_job_slots_.load();

      for (uint32_t i = 0; i < 64; ++i) {
        if (slots & (1ULL << i)) {
          XEDMALOG("Got new job at index %d in DMAMain", i);
          DoDMAJob(i);

          free_dma_slot(i);

          job_completed_signal_->Set();
          // break;
        }
      }
    } while (true);
  }

  static void DMAMainForwarder(void* ud) {
    xe::threading::set_name("dma_main");
    reinterpret_cast<XeDMACGeneric*>(ud)->DMAMain();
  }

 public:
  virtual DMACJobHandle PushDMAJob(XeDMAJob* job) override {
    XEDMALOG("New job, %p to %p with size %d", job->source, job->destination,
             job->size);
    uint32_t slot;
    do {
      slot = allocate_free_dma_slot();
      if (!~slot) {
        XEDMALOG(
            "Didn't get a free slot, waiting for a job to complete before "
            "resuming.");
        threading::Wait(job_completed_signal_.get(), false);
      } else {
        break;
      }
    } while (true);
    jobs_[slot] = *job;

    jobs_[slot].signal_on_done = job_done_signals_[slot].get();
    jobs_[slot].signal_on_done->Reset();
    XEDMALOG("Setting job submit signal, pushed into slot %d", slot);

    uint32_t new_serial = dma_volatile_.current_job_serial_++;

    jobserials_[slot] = new_serial;

    ++dma_volatile_.jobs_submitted_;
    job_submitted_signal_->Set();
    return (static_cast<uint64_t>(new_serial) << 32) |
           static_cast<uint64_t>(slot);

    // return job_done_signals_[slot].get();
  }

  bool AllJobsDone() {
    return dma_volatile_.jobs_completed_ == dma_volatile_.jobs_submitted_;
  }
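  // A DMACJobHandle packs the job's serial in the high 32 bits and its slot
  // index in the low 32; if the slot's serial no longer matches, the slot has
  // been recycled, meaning the job already completed.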
  virtual void WaitJobDone(DMACJobHandle handle) override {
    uint32_t serial = static_cast<uint32_t>(handle >> 32);
    uint32_t jobid = static_cast<uint32_t>(handle);
    do {
      if (jobserials_[jobid] != serial) {
        return;  // done, our slot was reused
      }

      auto waitres = threading::Wait(job_done_signals_[jobid].get(), false,
                                     std::chrono::milliseconds{1});

      if (waitres == threading::WaitResult::kTimeout) {
        continue;
      } else {
        return;
      }
    } while (true);
  }
  virtual void WaitForIdle() override {
    while (!AllJobsDone()) {
      threading::MaybeYield();
    }
  }
  XeDMACGeneric() {
    XEDMALOG("Constructing xedma at addr %p", this);
    dma_volatile_.free_job_slots_.store(0ULL);
    dma_volatile_.jobs_submitted_.store(0ULL);
    dma_volatile_.jobs_completed_.store(0ULL);
    dma_volatile_.current_job_serial_.store(
        1ULL);  // so that a jobhandle is never 0
    std::memset(jobs_, 0, sizeof(jobs_));
    job_submitted_signal_ = threading::Event::CreateAutoResetEvent(false);
    job_completed_signal_ = threading::Event::CreateAutoResetEvent(false);
    worker_has_work_ = threading::Event::CreateManualResetEvent(false);
    threading::Thread::CreationParameters worker_params{};
    worker_params.create_suspended = false;
    worker_params.initial_priority = threading::ThreadPriority::kBelowNormal;
    worker_params.stack_size = 65536;  // dont need much stack at all

    for (uint32_t i = 0; i < 64; ++i) {
      job_done_signals_[i] = threading::Event::CreateManualResetEvent(false);
    }
    for (uint32_t i = 0; i < XEDMA_NUM_WORKERS; ++i) {
      // worker_has_work_[i] = threading::Event::CreateAutoResetEvent(false);
      worker_has_finished_[i] = threading::Event::CreateAutoResetEvent(false);
      worker_has_finished_nosafeptr_[i] = worker_has_finished_[i].get();

      uintptr_t encoded = reinterpret_cast<uintptr_t>(this);
      xenia_assert(!(encoded & 0xFFULL));
      xenia_assert(i < 256);

      encoded |= i;

      workers_[i] = threading::Thread::Create(worker_params, [encoded]() {
        XeDMACGeneric::WorkerMainForwarder((void*)encoded);
      });
    }
    threading::Thread::CreationParameters scheduler_params{};
    scheduler_params.create_suspended = false;
    scheduler_params.initial_priority = threading::ThreadPriority::kBelowNormal;
    scheduler_params.stack_size = 65536;
    scheduler_thread_ = threading::Thread::Create(scheduler_params, [this]() {
      XeDMACGeneric::DMAMainForwarder((void*)this);
    });
  }
};
XeDMAC* CreateDMAC() { return new XeDMACGeneric(); }
}  // namespace xe::dma

@@ -0,0 +1,46 @@
/**
 ******************************************************************************
 * Xenia : Xbox 360 Emulator Research Project                                 *
 ******************************************************************************
 * Copyright 2020 Ben Vanik. All rights reserved.                             *
 * Released under the BSD license - see LICENSE in the root for more details. *
 ******************************************************************************
 */

#ifndef XENIA_BASE_DMA_H_
#define XENIA_BASE_DMA_H_
#include "memory.h"
#include "threading.h"
namespace xe::dma {
struct XeDMAJob;
using DmaPrecall = void (*)(XeDMAJob* job);
using DmaPostcall = void (*)(XeDMAJob* job);
struct XeDMAJob {
  threading::Event* signal_on_done;
  uint8_t* destination;
  uint8_t* source;
  size_t size;
  DmaPrecall precall;
  DmaPostcall postcall;
  void* userdata1;
  void* userdata2;
};
using DMACJobHandle = uint64_t;

class XeDMAC {
 public:
  virtual ~XeDMAC() {}
  virtual DMACJobHandle PushDMAJob(XeDMAJob* job) = 0;
  virtual void WaitJobDone(DMACJobHandle handle) = 0;
  virtual void WaitForIdle() = 0;
};

XeDMAC* CreateDMAC();
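// Usage sketch (hypothetical caller; the names dst/src/length are
// illustrative): fill out a job, push it, then wait on the returned handle.
//   xe::dma::XeDMAJob job{};
//   job.destination = dst;
//   job.source = src;
//   job.size = length;
//   auto handle = dmac->PushDMAJob(&job);
//   dmac->WaitJobDone(handle);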
// must be divisible by cache line size
XE_NOINLINE
void vastcpy(uint8_t* XE_RESTRICT physaddr, uint8_t* XE_RESTRICT rdmapping,
             uint32_t written_length);

}  // namespace xe::dma

#endif  // XENIA_BASE_DMA_H_

@@ -377,29 +377,45 @@ int64_t m128_i64(const __m128& v) {
   return m128_i64<N>(_mm_castps_pd(v));
 }
 /*
-std::min/max float has handling for nans, where if either argument is nan the first argument is returned
+        std::min/max float has handling for nans, where if either argument is
+   nan the first argument is returned
 
-minss/maxss are different, if either argument is nan the second operand to the instruction is returned
-this is problematic because we have no assurances from the compiler on the argument ordering
+        minss/maxss are different, if either argument is nan the second operand
+   to the instruction is returned this is problematic because we have no
+   assurances from the compiler on the argument ordering
 
-so only use in places where nan handling is not needed
+        so only use in places where nan handling is not needed
 */
-static float xe_minf(float x, float y) {
+XE_FORCEINLINE
+static float ArchMin(float x, float y) {
   return _mm_cvtss_f32(_mm_min_ss(_mm_set_ss(x), _mm_set_ss(y)));
 }
-static float xe_maxf(float x, float y) {
+XE_FORCEINLINE
+static float ArchMax(float x, float y) {
   return _mm_cvtss_f32(_mm_max_ss(_mm_set_ss(x), _mm_set_ss(y)));
 }
-static float xe_rcpf(float den) {
+XE_FORCEINLINE
+static float ArchReciprocal(float den) {
   return _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ss(den)));
 }
 
 #else
-static float xe_minf(float x, float y) { return std::min<float>(x, y); }
-static float xe_maxf(float x, float y) { return std::max<float>(x, y); }
-static float xe_rcpf(float den) { return 1.0f / den; }
+static float ArchMin(float x, float y) { return std::min<float>(x, y); }
+static float ArchMax(float x, float y) { return std::max<float>(x, y); }
+static float ArchReciprocal(float den) { return 1.0f / den; }
 #endif
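+// One Newton-Raphson refinement step for the rcpss estimate r of 1/den:
+// r' = r * (2 - r * den), roughly doubling the number of correct bits.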
+XE_FORCEINLINE
+static float RefineReciprocal(float initial, float den) {
+  float t0 = initial * den;
+  float t1 = t0 * initial;
+  float rcp2 = initial + initial;
+  return rcp2 - t1;
+}
+XE_FORCEINLINE
+static float ArchReciprocalRefined(float den) {
+  return RefineReciprocal(ArchReciprocal(den), den);
+}
+
 // Similar to the C++ implementation of XMConvertFloatToHalf and
 // XMConvertHalfToFloat from DirectXMath 3.00 (pre-3.04, which switched from the
@@ -494,7 +510,101 @@ inline T sat_sub(T a, T b) {
   }
   return T(result);
 }
+namespace divisors {
+union IDivExtraInfo {
+  uint32_t value_;
+  struct {
+    uint32_t shift_ : 31;
+    uint32_t add_ : 1;
+  } info;
+};
+// returns magicnum multiplier
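+// Computes the "magic number" multiplier (plus the shift/add fixup packed
+// into out_extra) so that unsigned division by _denom becomes a multiply and
+// a shift; this appears to follow the classic unsigned magic-number
+// construction from Hacker's Delight.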
+static uint32_t PregenerateUint32Div(uint32_t _denom, uint32_t& out_extra) {
+  IDivExtraInfo extra;
+
+  uint32_t d = _denom;
+  int p;
+  uint32_t nc, delta, q1, r1, q2, r2;
+  struct {
+    unsigned M;
+    int a;
+    int s;
+  } magu;
+  magu.a = 0;
+  nc = -1 - ((uint32_t) - (int32_t)d) % d;
+  p = 31;
+  q1 = 0x80000000 / nc;
+  r1 = 0x80000000 - q1 * nc;
+  q2 = 0x7FFFFFFF / d;
+  r2 = 0x7FFFFFFF - q2 * d;
+  do {
+    p += 1;
+    if (r1 >= nc - r1) {
+      q1 = 2 * q1 + 1;
+      r1 = 2 * r1 - nc;
+    } else {
+      q1 = 2 * q1;
+      r1 = 2 * r1;
+    }
+    if (r2 + 1 >= d - r2) {
+      if (q2 >= 0x7FFFFFFF) {
+        magu.a = 1;
+      }
+      q2 = 2 * q2 + 1;
+      r2 = 2 * r2 + 1 - d;
+    } else {
+      if (q2 >= 0x80000000U) {
+        magu.a = 1;
+      }
+      q2 = 2 * q2;
+      r2 = 2 * r2 + 1;
+    }
+    delta = d - 1 - r2;
+  } while (p < 64 && (q1 < delta || r1 == 0));
+
+  extra.info.add_ = magu.a;
+  extra.info.shift_ = p - 32;
+  out_extra = extra.value_;
+  return static_cast<uint64_t>(q2 + 1);
+}
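+// Divides num by the divisor the magic number was built for: take the high
+// 32 bits of num * mul, optionally add num back in (folding the carry into
+// the top bit), then shift right by the precomputed amount.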
+static inline uint32_t ApplyUint32Div(uint32_t num, uint32_t mul,
+                                      uint32_t extradata) {
+  IDivExtraInfo extra;
+
+  extra.value_ = extradata;
+
+  uint32_t result = ((uint64_t)(num) * (uint64_t)mul) >> 32;
+  if (extra.info.add_) {
+    uint32_t addend = result + num;
+    addend = ((addend < result ? 0x80000000 : 0) | addend);
+    result = addend;
+  }
+  return result >> extra.info.shift_;
+}
+
+static inline uint32_t ApplyUint32UMod(uint32_t num, uint32_t mul,
+                                       uint32_t extradata, uint32_t original) {
+  uint32_t dived = ApplyUint32Div(num, mul, extradata);
+  unsigned result = num - (dived * original);
+
+  return result;
+}
+
+struct MagicDiv {
+  uint32_t multiplier_;
+  uint32_t extradata_;
+  MagicDiv() : multiplier_(0), extradata_(0) {}
+  MagicDiv(uint32_t original) {
+    multiplier_ = PregenerateUint32Div(original, extradata_);
+  }
+
+  uint32_t Apply(uint32_t numerator) const {
+    return ApplyUint32Div(numerator, multiplier_, extradata_);
+  }
+};
+}  // namespace divisors
 }  // namespace xe
 
 #endif  // XENIA_BASE_MATH_H_

@@ -672,25 +672,58 @@ static void Prefetch<PrefetchTag::Level1>(const void* addr) {
 #define XE_MSVC_REORDER_BARRIER() static_cast<void>(0)
 #endif
 #if XE_ARCH_AMD64 == 1
+union alignas(XE_HOST_CACHE_LINE_SIZE) CacheLine {
+  struct {
+    __m256 low32;
+    __m256 high32;
+  };
+  struct {
+    __m128i xmms[4];
+  };
+  float floats[XE_HOST_CACHE_LINE_SIZE / sizeof(float)];
+};
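+// One host cache line, viewable as two 256-bit halves, four 128-bit lanes,
+// or sixteen floats, so line copies can use whichever register width an
+// instruction needs.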
 XE_FORCEINLINE
-static void WriteLineNT(void* destination, const void* source) {
-  assert((reinterpret_cast<uintptr_t>(destination) & 63ULL) == 0);
-  __m256i low = _mm256_loadu_si256((const __m256i*)source);
-  __m256i high = _mm256_loadu_si256(&((const __m256i*)source)[1]);
-  XE_MSVC_REORDER_BARRIER();
-  _mm256_stream_si256((__m256i*)destination, low);
-  _mm256_stream_si256(&((__m256i*)destination)[1], high);
+static void WriteLineNT(CacheLine* XE_RESTRICT destination,
+                        const CacheLine* XE_RESTRICT source) {
+  assert_true((reinterpret_cast<uintptr_t>(destination) & 63ULL) == 0);
+  __m256 low = _mm256_loadu_ps(&source->floats[0]);
+  __m256 high = _mm256_loadu_ps(&source->floats[8]);
+  _mm256_stream_ps(&destination->floats[0], low);
+  _mm256_stream_ps(&destination->floats[8], high);
 }
 
 XE_FORCEINLINE
-static void ReadLineNT(void* destination, const void* source) {
-  assert((reinterpret_cast<uintptr_t>(source) & 63ULL) == 0);
-  __m256i low = _mm256_stream_load_si256((const __m256i*)source);
-  __m256i high = _mm256_stream_load_si256(&((const __m256i*)source)[1]);
-  XE_MSVC_REORDER_BARRIER();
-  _mm256_storeu_si256((__m256i*)destination, low);
-  _mm256_storeu_si256(&((__m256i*)destination)[1], high);
+static void ReadLineNT(CacheLine* XE_RESTRICT destination,
+                       const CacheLine* XE_RESTRICT source) {
+  assert_true((reinterpret_cast<uintptr_t>(source) & 63ULL) == 0);
+
+  __m128i first = _mm_stream_load_si128(&source->xmms[0]);
+  __m128i second = _mm_stream_load_si128(&source->xmms[1]);
+  __m128i third = _mm_stream_load_si128(&source->xmms[2]);
+  __m128i fourth = _mm_stream_load_si128(&source->xmms[3]);
+
+  destination->xmms[0] = first;
+  destination->xmms[1] = second;
+  destination->xmms[2] = third;
+  destination->xmms[3] = fourth;
+}
+XE_FORCEINLINE
+static void ReadLine(CacheLine* XE_RESTRICT destination,
+                     const CacheLine* XE_RESTRICT source) {
+  assert_true((reinterpret_cast<uintptr_t>(source) & 63ULL) == 0);
+  __m256 low = _mm256_loadu_ps(&source->floats[0]);
+  __m256 high = _mm256_loadu_ps(&source->floats[8]);
+  _mm256_storeu_ps(&destination->floats[0], low);
+  _mm256_storeu_ps(&destination->floats[8], high);
+}
+XE_FORCEINLINE
+static void WriteLine(CacheLine* XE_RESTRICT destination,
+                      const CacheLine* XE_RESTRICT source) {
+  assert_true((reinterpret_cast<uintptr_t>(destination) & 63ULL) == 0);
+  __m256 low = _mm256_loadu_ps(&source->floats[0]);
+  __m256 high = _mm256_loadu_ps(&source->floats[8]);
+  _mm256_storeu_ps(&destination->floats[0], low);
+  _mm256_storeu_ps(&destination->floats[8], high);
 }
 
 XE_FORCEINLINE
@@ -699,19 +732,29 @@ XE_FORCEINLINE
 static void ReadFence() { _mm_lfence(); }
 XE_FORCEINLINE
 static void ReadWriteFence() { _mm_mfence(); }
 
 #else
+union alignas(XE_HOST_CACHE_LINE_SIZE) CacheLine {
+  uint8_t bvals[XE_HOST_CACHE_LINE_SIZE];
+};
 XE_FORCEINLINE
-static void WriteLineNT(void* destination, const void* source) {
-  assert((reinterpret_cast<uintptr_t>(destination) & 63ULL) == 0);
-  memcpy(destination, source, 64);
+static void WriteLineNT(CacheLine* destination, const CacheLine* source) {
+  memcpy(destination, source, XE_HOST_CACHE_LINE_SIZE);
 }
 
 XE_FORCEINLINE
-static void ReadLineNT(void* destination, const void* source) {
-  assert((reinterpret_cast<uintptr_t>(source) & 63ULL) == 0);
-  memcpy(destination, source, 64);
+static void ReadLineNT(CacheLine* destination, const CacheLine* source) {
+  memcpy(destination, source, XE_HOST_CACHE_LINE_SIZE);
 }
+XE_FORCEINLINE
+static void WriteLine(CacheLine* destination, const CacheLine* source) {
+  memcpy(destination, source, XE_HOST_CACHE_LINE_SIZE);
+}
+XE_FORCEINLINE
+static void ReadLine(CacheLine* destination, const CacheLine* source) {
+  memcpy(destination, source, XE_HOST_CACHE_LINE_SIZE);
+}
 
 XE_FORCEINLINE
 static void WriteFence() {}
 XE_FORCEINLINE
@@ -720,6 +763,47 @@ XE_FORCEINLINE
 static void ReadWriteFence() {}
 #endif
 }  // namespace swcache
 
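+// Copy/fill helpers for compile-time-constant sizes: on MSVC/x64 these lower
+// to rep movs/stos with the widest element the size allows; elsewhere they
+// fall back to plain memcpy/memset.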
+template <unsigned Size>
+static void smallcpy_const(void* destination, const void* source) {
+#if XE_ARCH_AMD64 == 1 && XE_COMPILER_MSVC == 1
+  if constexpr ((Size & 7) == 0) {
+    __movsq((unsigned long long*)destination,
+            (const unsigned long long*)source, Size / 8);
+  } else if constexpr ((Size & 3) == 0) {
+    __movsd((unsigned long*)destination, (const unsigned long*)source,
+            Size / 4);
+    // dont even bother with movsw, i think the operand size override prefix
+    // slows it down
+  } else {
+    __movsb((unsigned char*)destination, (const unsigned char*)source, Size);
+  }
+#else
+  memcpy(destination, source, Size);
+#endif
+}
+template <unsigned Size>
+static void smallset_const(void* destination, unsigned char fill_value) {
+#if XE_ARCH_AMD64 == 1 && XE_COMPILER_MSVC == 1
+  if constexpr ((Size & 7) == 0) {
+    unsigned long long fill =
+        static_cast<unsigned long long>(fill_value) * 0x0101010101010101ULL;
+
+    __stosq((unsigned long long*)destination, fill, Size / 8);
+  } else if constexpr ((Size & 3) == 0) {
+    unsigned long fill = static_cast<unsigned long>(fill_value) * 0x01010101U;
+    __stosd((unsigned long*)destination, fill, Size / 4);
+    // dont even bother with stosw, i think the operand size override prefix
+    // slows it down
+  } else {
+    __stosb((unsigned char*)destination, fill_value, Size);
+  }
+#else
+  memset(destination, fill_value, Size);
+#endif
+}
+
 }  // namespace xe
 
 #endif  // XENIA_BASE_MEMORY_H_

@@ -59,16 +59,8 @@ class RingBuffer {
   // subtract instead
   void set_read_offset(size_t offset) { read_offset_ = offset % capacity_; }
   ring_size_t read_count() const {
 // chrispy: these branches are unpredictable
-#if 0
-    if (read_offset_ == write_offset_) {
-      return 0;
-    } else if (read_offset_ < write_offset_) {
-      return write_offset_ - read_offset_;
-    } else {
-      return (capacity_ - read_offset_) + write_offset_;
-    }
-#else
     ring_size_t read_offs = read_offset_;
     ring_size_t write_offs = write_offset_;
     ring_size_t cap = capacity_;
@@ -77,14 +69,6 @@ class RingBuffer {
     ring_size_t wrap_read_count = (cap - read_offs) + write_offs;
 
     ring_size_t comparison_value = read_offs <= write_offs;
-#if 0
-    size_t selector =
-        static_cast<size_t>(-static_cast<ptrdiff_t>(comparison_value));
-    offset_delta &= selector;
-
-    wrap_read_count &= ~selector;
-    return offset_delta | wrap_read_count;
-#else
 
     if (XE_LIKELY(read_offs <= write_offs)) {
       return offset_delta;  // will be 0 if they are equal, semantically
@@ -93,8 +77,6 @@ class RingBuffer {
     } else {
       return wrap_read_count;
     }
-#endif
-#endif
   }
 
   ring_size_t write_offset() const { return write_offset_; }
@@ -116,9 +98,9 @@ class RingBuffer {
   void AdvanceWrite(size_t count);
 
   struct ReadRange {
-    const uint8_t* first;
+    const uint8_t* XE_RESTRICT first;
 
-    const uint8_t* second;
+    const uint8_t* XE_RESTRICT second;
     ring_size_t first_length;
     ring_size_t second_length;
   };
@@ -126,9 +108,11 @@ class RingBuffer {
   void EndRead(ReadRange read_range);
 
   /*
-      BeginRead, but if there is a second Range it will prefetch all lines of it
+      BeginRead, but if there is a second Range it will prefetch all lines of
+     it
 
-      this does not prefetch the first range, because software prefetching can do that faster than we can
+      this does not prefetch the first range, because software
+     prefetching can do that faster than we can
   */
   template <swcache::PrefetchTag tag>
   XE_FORCEINLINE ReadRange BeginPrefetchedRead(size_t count) {
@@ -138,7 +122,7 @@ class RingBuffer {
     ring_size_t numlines =
         xe::align<ring_size_t>(range.second_length, XE_HOST_CACHE_LINE_SIZE) /
        XE_HOST_CACHE_LINE_SIZE;
-    //chrispy: maybe unroll?
+    // chrispy: maybe unroll?
     for (ring_size_t i = 0; i < numlines; ++i) {
       swcache::Prefetch<tag>(range.second + (i * XE_HOST_CACHE_LINE_SIZE));
     }
@@ -187,7 +171,7 @@ class RingBuffer {
   }
 
  private:
-  uint8_t* buffer_ = nullptr;
+  uint8_t* XE_RESTRICT buffer_ = nullptr;
   ring_size_t capacity_ = 0;
   ring_size_t read_offset_ = 0;
   ring_size_t write_offset_ = 0;

@@ -0,0 +1,87 @@
/**
 ******************************************************************************
 * Xenia : Xbox 360 Emulator Research Project                                 *
 ******************************************************************************
 * Copyright 2019 Ben Vanik. All rights reserved.                             *
 * Released under the BSD license - see LICENSE in the root for more details. *
 ******************************************************************************
 */

#ifndef XENIA_BASE_SPLIT_MAP_H_
#define XENIA_BASE_SPLIT_MAP_H_
#include <algorithm>
#include <vector>
namespace xe {
/*
    a map structure that is optimized for infrequent
    reallocation/resizing/erasure and frequent searches by key, implemented as
    2 std::vectors, one of the keys and one of the values
*/
template <typename TKey, typename TValue>
class split_map {
  using key_vector = std::vector<TKey>;
  using value_vector = std::vector<TValue>;

  key_vector keys_;
  value_vector values_;

 public:
  using my_type = split_map<TKey, TValue>;

  uint32_t IndexForKey(const TKey& k) {
    auto lbound = std::lower_bound(keys_.begin(), keys_.end(), k);
    return static_cast<uint32_t>(lbound - keys_.begin());
  }

  uint32_t size() const { return static_cast<uint32_t>(keys_.size()); }
  key_vector& Keys() { return keys_; }
  value_vector& Values() { return values_; }
  void clear() {
    keys_.clear();
    values_.clear();
  }
  void resize(uint32_t new_size) {
    keys_.resize(static_cast<size_t>(new_size));
    values_.resize(static_cast<size_t>(new_size));
  }

  void reserve(uint32_t new_size) {
    keys_.reserve(static_cast<size_t>(new_size));
    values_.reserve(static_cast<size_t>(new_size));
  }
  const TKey* KeyAt(uint32_t index) const {
    if (index == size()) {
      return nullptr;
    } else {
      return &keys_[index];
    }
  }
  const TValue* ValueAt(uint32_t index) const {
    if (index == size()) {
      return nullptr;
    } else {
      return &values_[index];
    }
  }

  void InsertAt(TKey k, TValue v, uint32_t idx) {
    uint32_t old_size = size();

    bool needs_shiftup = idx != old_size;

    values_.insert(values_.begin() + idx, v);
    keys_.insert(keys_.begin() + idx, k);
  }
  void EraseAt(uint32_t idx) {
    uint32_t old_size = size();
    if (idx == old_size) {
      return;  // trying to erase nonexistent entry
    } else {
      values_.erase(values_.begin() + idx);
      keys_.erase(keys_.begin() + idx);
    }
  }
};
}  // namespace xe

#endif  // XENIA_BASE_SPLIT_MAP_H_

@@ -57,7 +57,11 @@ DEFINE_bool(enable_incorrect_roundingmode_behavior, false,
             "code. The workaround may cause reduced CPU performance but is a "
             "more accurate emulation",
             "x64");
+DEFINE_uint32(align_all_basic_blocks, 0,
+              "Aligns the start of all basic blocks to N bytes. Only specify a "
+              "power of 2, 16 is the recommended value. Results in larger "
+              "icache usage, but potentially faster loops",
+              "x64");
 #if XE_X64_PROFILER_AVAILABLE == 1
 DEFINE_bool(instrument_call_times, false,
             "Compute time taken for functions, for profiling guest code",
@@ -110,7 +114,6 @@ X64Emitter::X64Emitter(X64Backend* backend, XbyakAllocator* allocator)
   TEST_EMIT_FEATURE(kX64EmitLZCNT, Xbyak::util::Cpu::tLZCNT);
   TEST_EMIT_FEATURE(kX64EmitBMI1, Xbyak::util::Cpu::tBMI1);
   TEST_EMIT_FEATURE(kX64EmitBMI2, Xbyak::util::Cpu::tBMI2);
-  TEST_EMIT_FEATURE(kX64EmitF16C, Xbyak::util::Cpu::tF16C);
   TEST_EMIT_FEATURE(kX64EmitMovbe, Xbyak::util::Cpu::tMOVBE);
   TEST_EMIT_FEATURE(kX64EmitGFNI, Xbyak::util::Cpu::tGFNI);
   TEST_EMIT_FEATURE(kX64EmitAVX512F, Xbyak::util::Cpu::tAVX512F);
@@ -200,7 +203,55 @@ bool X64Emitter::Emit(GuestFunction* function, HIRBuilder* builder,
 
   return true;
 }
+#pragma pack(push, 1)
+struct RGCEmitted {
+  uint8_t ff_;
+  uint32_t rgcid_;
+};
+#pragma pack(pop)
+
+#if 0
+void X64Emitter::InjectCallAddresses(void* new_execute_address) {
+  for (auto&& callsite : call_sites_) {
+    RGCEmitted* hunter = (RGCEmitted*)new_execute_address;
+    while (hunter->ff_ != 0xFF || hunter->rgcid_ != callsite.offset_) {
+      hunter =
+          reinterpret_cast<RGCEmitted*>(reinterpret_cast<char*>(hunter) + 1);
+    }
+
+    hunter->ff_ = callsite.is_jump_ ? 0xE9 : 0xE8;
+    hunter->rgcid_ =
+        static_cast<uint32_t>(static_cast<intptr_t>(callsite.destination_) -
+                              reinterpret_cast<intptr_t>(hunter + 1));
+  }
+}
+
+#else
+void X64Emitter::InjectCallAddresses(void* new_execute_address) {
+#if 0
+  RGCEmitted* hunter = (RGCEmitted*)new_execute_address;
+
+  std::map<uint32_t, ResolvableGuestCall*> id_to_rgc{};
+
+  for (auto&& callsite : call_sites_) {
+    id_to_rgc[callsite.offset_] = &callsite;
+  }
+#else
+  RGCEmitted* hunter = (RGCEmitted*)new_execute_address;
+  for (auto&& callsite : call_sites_) {
+    while (hunter->ff_ != 0xFF || hunter->rgcid_ != callsite.offset_) {
+      hunter =
+          reinterpret_cast<RGCEmitted*>(reinterpret_cast<char*>(hunter) + 1);
+    }
+
+    hunter->ff_ = callsite.is_jump_ ? 0xE9 : 0xE8;
+    hunter->rgcid_ =
+        static_cast<uint32_t>(static_cast<intptr_t>(callsite.destination_) -
+                              reinterpret_cast<intptr_t>(hunter + 1));
+  }
+#endif
+}
+#endif
 void* X64Emitter::Emplace(const EmitFunctionInfo& func_info,
                           GuestFunction* function) {
   // To avoid changing xbyak, we do a switcharoo here.
@@ -218,25 +269,9 @@ void* X64Emitter::Emplace(const EmitFunctionInfo& func_info,
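+// The emitter plants a 5-byte placeholder (an 0xFF byte followed by the
+// 32-bit callsite id) at each resolvable guest call; InjectCallAddresses
+// scans the emitted code for each id and patches the placeholder into a
+// rel32 call (0xE8) or jump (0xE9) aimed at the resolved destination.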
   if (function) {
     code_cache_->PlaceGuestCode(function->address(), top_, func_info, function,
                                 new_execute_address, new_write_address);
-    if (cvars::resolve_rel32_guest_calls) {
-      for (auto&& callsite : call_sites_) {
-#pragma pack(push, 1)
-        struct RGCEmitted {
-          uint8_t ff_;
-          uint32_t rgcid_;
-        };
-#pragma pack(pop)
-        RGCEmitted* hunter = (RGCEmitted*)new_execute_address;
-        while (hunter->ff_ != 0xFF || hunter->rgcid_ != callsite.offset_) {
-          hunter = reinterpret_cast<RGCEmitted*>(
-              reinterpret_cast<char*>(hunter) + 1);
-        }
-
-        hunter->ff_ = callsite.is_jump_ ? 0xE9 : 0xE8;
-        hunter->rgcid_ =
-            static_cast<uint32_t>(static_cast<intptr_t>(callsite.destination_) -
-                                  reinterpret_cast<intptr_t>(hunter + 1));
-      }
+    if (cvars::resolve_rel32_guest_calls) {
+      InjectCallAddresses(new_execute_address);
     }
   } else {
     code_cache_->PlaceHostCode(0, top_, func_info, new_execute_address,
@@ -367,6 +402,9 @@ bool X64Emitter::Emit(HIRBuilder* builder, EmitFunctionInfo& func_info) {
       label = label->next;
     }
 
+    if (cvars::align_all_basic_blocks) {
+      align(cvars::align_all_basic_blocks, true);
+    }
     // Process instructions.
     const Instr* instr = block->instr_head;
     while (instr) {
@@ -1000,12 +1038,6 @@ static const vec128_t xmm_consts[] = {
     vec128i(0x7f800000),
     /* XMMThreeFloatMask */
     vec128i(~0U, ~0U, ~0U, 0U),
-    /*XMMXenosF16ExtRangeStart*/
-    vec128f(65504),
-    /*XMMVSRShlByteshuf*/
-    v128_setr_bytes(13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3, 0x80),
-    // XMMVSRMask
-    vec128b(1),
     /*
     XMMF16UnpackLCPI2
     */
@@ -1036,8 +1068,7 @@ static const vec128_t xmm_consts[] = {
     /*XMMXOPWordShiftMask*/
     vec128s(15),
     /*XMMXOPDwordShiftMask*/
-    vec128i(31)
-};
+    vec128i(31)};
 
 void* X64Emitter::FindByteConstantOffset(unsigned bytevalue) {
   for (auto& vec : xmm_consts) {

@@ -157,9 +157,6 @@ enum XmmConst {
   XMMLVSRTableBase,
   XMMSingleDenormalMask,
   XMMThreeFloatMask,  // for clearing the fourth float prior to DOT_PRODUCT_3
-  XMMXenosF16ExtRangeStart,
-  XMMVSRShlByteshuf,
-  XMMVSRMask,
   XMMF16UnpackLCPI2,  // 0x38000000, 1/ 32768
   XMMF16UnpackLCPI3,  // 0x0x7fe000007fe000
   XMMF16PackLCPI0,
@@ -194,7 +191,7 @@ enum X64EmitterFeatureFlags {
   kX64EmitLZCNT = 1 << 2,  // this is actually ABM and includes popcount
   kX64EmitBMI1 = 1 << 3,
   kX64EmitBMI2 = 1 << 4,
-  kX64EmitF16C = 1 << 5,
+  kX64EmitPrefetchW = 1 << 5,
   kX64EmitMovbe = 1 << 6,
   kX64EmitGFNI = 1 << 7,
@@ -215,11 +212,14 @@ enum X64EmitterFeatureFlags {
   // inc/dec) do not introduce false dependencies on EFLAGS
   // because the individual flags are treated as different vars by
   // the processor. (this applies to zen)
-  kX64EmitPrefetchW = 1 << 16,
-  kX64EmitXOP = 1 << 17,  // chrispy: xop maps really well to many vmx
+  kX64EmitXOP = 1 << 16,  // chrispy: xop maps really well to many vmx
                           // instructions, and FX users need the boost
-  kX64EmitFMA4 = 1 << 18,  // todo: also use on zen1?
-  kX64EmitTBM = 1 << 19
+  kX64EmitFMA4 = 1 << 17,  // todo: also use on zen1?
+  kX64EmitTBM = 1 << 18,
+  // kX64XMMRegisterMergeOptimization = 1 << 19, //section 2.11.5, amd family
+  // 17h/19h optimization manuals. allows us to save 1 byte on certain xmm
+  // instructions by using the legacy sse version if we recently cleared the
+  // high 128 bits of the
 };
 class ResolvableGuestCall {
  public:
@@ -251,6 +251,7 @@ class X64Emitter : public Xbyak::CodeGenerator {
              uint32_t debug_info_flags, FunctionDebugInfo* debug_info,
              void** out_code_address, size_t* out_code_size,
              std::vector<SourceMapEntry>* out_source_map);
+  void InjectCallAddresses(void* new_execute_addr);
 
  public:
   // Reserved: rsp, rsi, rdi

@@ -43,23 +43,23 @@ enum KeyType {
   KEY_TYPE_V_F64 = OPCODE_SIG_TYPE_V + FLOAT64_TYPE,
   KEY_TYPE_V_V128 = OPCODE_SIG_TYPE_V + VEC128_TYPE,
 };
+using InstrKeyValue = uint32_t;
 #pragma pack(push, 1)
 union InstrKey {
-  uint32_t value;
+  InstrKeyValue value;
   struct {
-    uint32_t opcode : 8;
-    uint32_t dest : 5;
-    uint32_t src1 : 5;
-    uint32_t src2 : 5;
-    uint32_t src3 : 5;
-    uint32_t reserved : 4;
+    InstrKeyValue opcode : 8;
+    InstrKeyValue dest : 5;
+    InstrKeyValue src1 : 5;
+    InstrKeyValue src2 : 5;
+    InstrKeyValue src3 : 5;
+    InstrKeyValue reserved : 4;
   };
 
-  operator uint32_t() const { return value; }
+  operator InstrKeyValue() const { return value; }
 
   InstrKey() : value(0) { static_assert_size(*this, sizeof(value)); }
-  InstrKey(uint32_t v) : value(v) {}
+  InstrKey(InstrKeyValue v) : value(v) {}
 
   // this used to take about 1% cpu while precompiling
   // it kept reloading opcode, and also constantly repacking and unpacking the
@@ -67,16 +67,16 @@ union InstrKey {
   InstrKey(const Instr* i) : value(0) {
     const OpcodeInfo* info = i->GetOpcodeInfo();
 
-    uint32_t sig = info->signature;
+    InstrKeyValue sig = info->signature;
 
     OpcodeSignatureType dest_type, src1_type, src2_type, src3_type;
 
     UnpackOpcodeSig(sig, dest_type, src1_type, src2_type, src3_type);
 
-    uint32_t out_desttype = (uint32_t)dest_type;
-    uint32_t out_src1type = (uint32_t)src1_type;
-    uint32_t out_src2type = (uint32_t)src2_type;
-    uint32_t out_src3type = (uint32_t)src3_type;
+    InstrKeyValue out_desttype = (InstrKeyValue)dest_type;
+    InstrKeyValue out_src1type = (InstrKeyValue)src1_type;
+    InstrKeyValue out_src2type = (InstrKeyValue)src2_type;
+    InstrKeyValue out_src3type = (InstrKeyValue)src3_type;
 
     Value* destv = i->dest;
     // pre-deref, even if not value
@@ -105,7 +105,7 @@ union InstrKey {
   template <Opcode OPCODE, KeyType DEST = KEY_TYPE_X, KeyType SRC1 = KEY_TYPE_X,
             KeyType SRC2 = KEY_TYPE_X, KeyType SRC3 = KEY_TYPE_X>
   struct Construct {
-    static const uint32_t value =
+    static const InstrKeyValue value =
         (OPCODE) | (DEST << 8) | (SRC1 << 13) | (SRC2 << 18) | (SRC3 << 23);
   };
 };
@@ -307,8 +307,8 @@ struct I<OPCODE, DEST> : DestField<DEST> {
 protected:
   template <typename SEQ, typename T>
   friend struct Sequence;
-  bool Load(const Instr* i) {
-    if (InstrKey(i).value == key && BASE::LoadDest(i)) {
+  bool Load(const Instr* i, InstrKeyValue kv) {
+    if (kv == key && BASE::LoadDest(i)) {
       instr = i;
       return true;
     }
@@ -329,8 +329,8 @@ struct I<OPCODE, DEST, SRC1> : DestField<DEST> {
 protected:
   template <typename SEQ, typename T>
   friend struct Sequence;
-  bool Load(const Instr* i) {
-    if (InstrKey(i).value == key && BASE::LoadDest(i)) {
+  bool Load(const Instr* i, InstrKeyValue kv) {
+    if (kv == key && BASE::LoadDest(i)) {
       instr = i;
       src1.Load(i->src1);
       return true;
@@ -355,8 +355,8 @@ struct I<OPCODE, DEST, SRC1, SRC2> : DestField<DEST> {
 protected:
   template <typename SEQ, typename T>
   friend struct Sequence;
-  bool Load(const Instr* i) {
-    if (InstrKey(i).value == key && BASE::LoadDest(i)) {
+  bool Load(const Instr* i, InstrKeyValue kv) {
+    if (kv == key && BASE::LoadDest(i)) {
       instr = i;
       src1.Load(i->src1);
       src2.Load(i->src2);
@@ -385,8 +385,8 @@ struct I<OPCODE, DEST, SRC1, SRC2, SRC3> : DestField<DEST> {
 protected:
   template <typename SEQ, typename T>
   friend struct Sequence;
-  bool Load(const Instr* i) {
-    if (InstrKey(i).value == key && BASE::LoadDest(i)) {
+  bool Load(const Instr* i, InstrKeyValue ikey) {
+    if (ikey == key && BASE::LoadDest(i)) {
       instr = i;
       src1.Load(i->src1);
       src2.Load(i->src2);
@@ -422,9 +422,9 @@ struct Sequence {
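+  // The dispatcher computes InstrKey(i) once per instruction and threads it
+  // through Select and Load, instead of each candidate sequence re-deriving
+  // the key from the Instr (which showed up in precompile profiles).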
 
   static constexpr uint32_t head_key() { return T::key; }
 
-  static bool Select(X64Emitter& e, const Instr* i) {
+  static bool Select(X64Emitter& e, const Instr* i, InstrKeyValue ikey) {
     T args;
-    if (!args.Load(i)) {
+    if (!args.Load(i, ikey)) {
       return false;
     }
     SEQ::Emit(e, args);
@ -27,12 +27,6 @@ static void EmitFusedBranch(X64Emitter& e, const T& i) {
|
||||||
if (valid) {
|
if (valid) {
|
||||||
auto name = i.src2.value->name;
|
auto name = i.src2.value->name;
|
||||||
switch (opcode) {
|
switch (opcode) {
|
||||||
case OPCODE_IS_TRUE:
|
|
||||||
e.jnz(name, e.T_NEAR);
|
|
||||||
break;
|
|
||||||
case OPCODE_IS_FALSE:
|
|
||||||
e.jz(name, e.T_NEAR);
|
|
||||||
break;
|
|
||||||
case OPCODE_COMPARE_EQ:
|
case OPCODE_COMPARE_EQ:
|
||||||
e.je(name, e.T_NEAR);
|
e.je(name, e.T_NEAR);
|
||||||
break;
|
break;
|
||||||
|
@@ -299,26 +293,14 @@ struct CALL_TRUE_I64
 struct CALL_TRUE_F32
     : Sequence<CALL_TRUE_F32, I<OPCODE_CALL_TRUE, VoidOp, F32Op, SymbolOp>> {
   static void Emit(X64Emitter& e, const EmitArgType& i) {
-    assert_true(i.src2.value->is_guest());
-    e.vptest(i.src1, i.src1);
-    Xbyak::Label skip;
-    e.jz(skip);
-    e.Call(i.instr, static_cast<GuestFunction*>(i.src2.value));
-    e.L(skip);
-    e.ForgetMxcsrMode();
+    assert_impossible_sequence(CALL_TRUE_F32);
   }
 };

 struct CALL_TRUE_F64
     : Sequence<CALL_TRUE_F64, I<OPCODE_CALL_TRUE, VoidOp, F64Op, SymbolOp>> {
   static void Emit(X64Emitter& e, const EmitArgType& i) {
-    assert_true(i.src2.value->is_guest());
-    e.vptest(i.src1, i.src1);
-    Xbyak::Label skip;
-    e.jz(skip);
-    e.Call(i.instr, static_cast<GuestFunction*>(i.src2.value));
-    e.L(skip);
-    e.ForgetMxcsrMode();
+    assert_impossible_sequence(CALL_TRUE_F64);
   }
 };
 EMITTER_OPCODE_TABLE(OPCODE_CALL_TRUE, CALL_TRUE_I8, CALL_TRUE_I16,
@@ -404,22 +386,14 @@ struct CALL_INDIRECT_TRUE_F32
     : Sequence<CALL_INDIRECT_TRUE_F32,
                I<OPCODE_CALL_INDIRECT_TRUE, VoidOp, F32Op, I64Op>> {
   static void Emit(X64Emitter& e, const EmitArgType& i) {
-    e.vptest(i.src1, i.src1);
-    Xbyak::Label skip;
-    e.jz(skip, CodeGenerator::T_NEAR);
-    e.CallIndirect(i.instr, i.src2);
-    e.L(skip);
+    assert_impossible_sequence(CALL_INDIRECT_TRUE_F32);
   }
 };
 struct CALL_INDIRECT_TRUE_F64
     : Sequence<CALL_INDIRECT_TRUE_F64,
                I<OPCODE_CALL_INDIRECT_TRUE, VoidOp, F64Op, I64Op>> {
   static void Emit(X64Emitter& e, const EmitArgType& i) {
-    e.vptest(i.src1, i.src1);
-    Xbyak::Label skip;
-    e.jz(skip, CodeGenerator::T_NEAR);
-    e.CallIndirect(i.instr, i.src2);
-    e.L(skip);
+    assert_impossible_sequence(CALL_INDIRECT_TRUE_F64);
   }
 };
 EMITTER_OPCODE_TABLE(OPCODE_CALL_INDIRECT_TRUE, CALL_INDIRECT_TRUE_I8,
@@ -486,15 +460,13 @@ struct RETURN_TRUE_I64
 struct RETURN_TRUE_F32
     : Sequence<RETURN_TRUE_F32, I<OPCODE_RETURN_TRUE, VoidOp, F32Op>> {
   static void Emit(X64Emitter& e, const EmitArgType& i) {
-    e.vptest(i.src1, i.src1);
-    e.jnz(e.epilog_label(), CodeGenerator::T_NEAR);
+    assert_impossible_sequence(RETURN_TRUE_F32);
   }
 };
 struct RETURN_TRUE_F64
     : Sequence<RETURN_TRUE_F64, I<OPCODE_RETURN_TRUE, VoidOp, F64Op>> {
   static void Emit(X64Emitter& e, const EmitArgType& i) {
-    e.vptest(i.src1, i.src1);
-    e.jnz(e.epilog_label(), CodeGenerator::T_NEAR);
+    assert_impossible_sequence(RETURN_TRUE_F64);
   }
 };
 EMITTER_OPCODE_TABLE(OPCODE_RETURN_TRUE, RETURN_TRUE_I8, RETURN_TRUE_I16,
@@ -553,33 +525,25 @@ struct BRANCH_TRUE_I64
 struct BRANCH_TRUE_F32
     : Sequence<BRANCH_TRUE_F32, I<OPCODE_BRANCH_TRUE, VoidOp, F32Op, LabelOp>> {
   static void Emit(X64Emitter& e, const EmitArgType& i) {
-    if (i.instr->prev && i.instr->prev->opcode == &OPCODE_IS_TRUE_info &&
-        i.instr->prev->dest == i.src1.value) {
-      e.jnz(i.src2.value->name, e.T_NEAR);
-    } else if (i.instr->prev &&
-               i.instr->prev->opcode == &OPCODE_IS_FALSE_info &&
-               i.instr->prev->dest == i.src1.value) {
-      e.jz(i.src2.value->name, e.T_NEAR);
-    } else {
-      e.vptest(i.src1, i.src1);
-      e.jnz(i.src2.value->name, e.T_NEAR);
-    }
+    /*
+      chrispy: right now, im not confident that we are always clearing
+      the upper 96 bits of registers, making vptest extremely unsafe. many
+      ss/sd operations copy over the upper 96 from the source, and for abs we
+      negate ALL elements, making the top 64 bits contain 0x80000000 etc
+    */
+    Xmm input = GetInputRegOrConstant(e, i.src1, e.xmm0);
+    e.vmovd(e.eax, input);
+    e.test(e.eax, e.eax);
+    e.jnz(i.src2.value->name, e.T_NEAR);
   }
 };
 struct BRANCH_TRUE_F64
     : Sequence<BRANCH_TRUE_F64, I<OPCODE_BRANCH_TRUE, VoidOp, F64Op, LabelOp>> {
   static void Emit(X64Emitter& e, const EmitArgType& i) {
-    if (i.instr->prev && i.instr->prev->opcode == &OPCODE_IS_TRUE_info &&
-        i.instr->prev->dest == i.src1.value) {
-      e.jnz(i.src2.value->name, e.T_NEAR);
-    } else if (i.instr->prev &&
-               i.instr->prev->opcode == &OPCODE_IS_FALSE_info &&
-               i.instr->prev->dest == i.src1.value) {
-      e.jz(i.src2.value->name, e.T_NEAR);
-    } else {
-      e.vptest(i.src1, i.src1);
-      e.jnz(i.src2.value->name, e.T_NEAR);
-    }
+    Xmm input = GetInputRegOrConstant(e, i.src1, e.xmm0);
+    e.vmovq(e.rax, input);
+    e.test(e.rax, e.rax);
+    e.jnz(i.src2.value->name, e.T_NEAR);
   }
 };
 EMITTER_OPCODE_TABLE(OPCODE_BRANCH_TRUE, BRANCH_TRUE_I8, BRANCH_TRUE_I16,
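The comment in the hunk above is the heart of the vptest removal: vptest sets its flags from all 128 bits of the register, so stale data in the upper lanes of an xmm that only logically holds a scalar float can flip a branch. A standalone illustration of the failure mode (plain SSE4.1 intrinsics, not emulator code; compile with -msse4.1):

    #include <immintrin.h>
    #include <cstdio>

    int main() {
      // Scalar value is 0.0f, but the upper lanes hold junk (e.g. sign bits
      // left over from a previous packed negate).
      __m128 v = _mm_set_ps(-0.0f, -0.0f, -0.0f, 0.0f);

      // What "vptest v, v" computes: true if any of the 128 bits are set.
      int all_bits_nonzero = !_mm_testz_si128(_mm_castps_si128(v),
                                              _mm_castps_si128(v));

      // What the new emitter does: move only the low 32 bits to a GPR, test.
      int low32_nonzero = _mm_cvtsi128_si32(_mm_castps_si128(v)) != 0;

      printf("vptest: %d, low lane: %d\n", all_bits_nonzero, low32_nonzero);
      return 0;  // prints "vptest: 1, low lane: 0" - vptest misfires
    }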
@@ -624,7 +588,9 @@ struct BRANCH_FALSE_F32
     : Sequence<BRANCH_FALSE_F32,
                I<OPCODE_BRANCH_FALSE, VoidOp, F32Op, LabelOp>> {
   static void Emit(X64Emitter& e, const EmitArgType& i) {
-    e.vptest(i.src1, i.src1);
+    Xmm input = GetInputRegOrConstant(e, i.src1, e.xmm0);
+    e.vmovd(e.eax, input);
+    e.test(e.eax, e.eax);
     e.jz(i.src2.value->name, e.T_NEAR);
   }
 };
@@ -632,7 +598,9 @@ struct BRANCH_FALSE_F64
     : Sequence<BRANCH_FALSE_F64,
                I<OPCODE_BRANCH_FALSE, VoidOp, F64Op, LabelOp>> {
   static void Emit(X64Emitter& e, const EmitArgType& i) {
-    e.vptest(i.src1, i.src1);
+    Xmm input = GetInputRegOrConstant(e, i.src1, e.xmm0);
+    e.vmovq(e.rax, input);
+    e.test(e.rax, e.rax);
     e.jz(i.src2.value->name, e.T_NEAR);
   }
 };
@@ -975,6 +975,9 @@ static bool IsPossibleMMIOInstruction(X64Emitter& e, const hir::Instr* i) {
   if (!cvars::emit_mmio_aware_stores_for_recorded_exception_addresses) {
     return false;
   }
+  if (IsTracingData()) {  // incompatible with tracing
+    return false;
+  }
   uint32_t guestaddr = i->GuestAddressFor();
   if (!guestaddr) {
     return false;
@@ -984,7 +987,54 @@ static bool IsPossibleMMIOInstruction(X64Emitter& e, const hir::Instr* i) {

   return flags && flags->accessed_mmio;
 }
+template <typename T, bool swap>
+static void MMIOAwareStore(void* _ctx, unsigned int guestaddr, T value) {
+  if (swap) {
+    value = xe::byte_swap(value);
+  }
+  if (guestaddr >= 0xE0000000) {
+    guestaddr += 0x1000;
+  }
+
+  auto ctx = reinterpret_cast<ppc::PPCContext*>(_ctx);
+
+  auto gaddr = ctx->processor->memory()->LookupVirtualMappedRange(guestaddr);
+  if (!gaddr) {
+    *reinterpret_cast<T*>(ctx->virtual_membase + guestaddr) = value;
+  } else {
+    value = xe::byte_swap(value); /*
+         was having issues, found by comparing the values used with exceptions
+         to these that we were reversed...
+      */
+    gaddr->write(nullptr, gaddr->callback_context, guestaddr, value);
+  }
+}
+
+template <typename T, bool swap>
+static T MMIOAwareLoad(void* _ctx, unsigned int guestaddr) {
+  T value;
+
+  if (guestaddr >= 0xE0000000) {
+    guestaddr += 0x1000;
+  }
+
+  auto ctx = reinterpret_cast<ppc::PPCContext*>(_ctx);
+
+  auto gaddr = ctx->processor->memory()->LookupVirtualMappedRange(guestaddr);
+  if (!gaddr) {
+    value = *reinterpret_cast<T*>(ctx->virtual_membase + guestaddr);
+    if (swap) {
+      value = xe::byte_swap(value);
+    }
+  } else {
+    /*
+       was having issues, found by comparing the values used with exceptions
+       to these that we were reversed...
+    */
+    value = gaddr->read(nullptr, gaddr->callback_context, guestaddr);
+  }
+  return value;
+}
 // ============================================================================
 // OPCODE_LOAD_OFFSET
 // ============================================================================
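For readers unfamiliar with the MMIO path: the helpers above choose at runtime between a plain host-memory access (fast path) and a registered device callback (slow path), instead of relying on the old exception-and-patch mechanism. A toy model of that dispatch, with an invented range table standing in for Memory::LookupVirtualMappedRange(); all names here are illustrative only:

    #include <cstdint>
    #include <cstring>
    #include <functional>
    #include <vector>

    struct MmioRange {
      uint32_t base, size;
      std::function<uint32_t(uint32_t)> read;  // device-register callbacks
      std::function<void(uint32_t, uint32_t)> write;
    };

    struct ToyMemory {
      // Assumes all addresses fall inside this small backing store.
      std::vector<uint8_t> backing = std::vector<uint8_t>(64 * 1024);
      std::vector<MmioRange> mmio;

      const MmioRange* Lookup(uint32_t addr) const {
        for (const auto& r : mmio)
          if (addr >= r.base && addr < r.base + r.size) return &r;
        return nullptr;
      }

      uint32_t Load32(uint32_t addr) {
        if (const MmioRange* r = Lookup(addr)) return r->read(addr);
        uint32_t v;  // fast path: ordinary memory, no exception round-trip
        std::memcpy(&v, backing.data() + addr, sizeof(v));
        return v;
      }

      void Store32(uint32_t addr, uint32_t v) {
        if (const MmioRange* r = Lookup(addr)) { r->write(addr, v); return; }
        std::memcpy(backing.data() + addr, &v, sizeof(v));
      }
    };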
@@ -1016,16 +1066,38 @@ struct LOAD_OFFSET_I16
 struct LOAD_OFFSET_I32
     : Sequence<LOAD_OFFSET_I32, I<OPCODE_LOAD_OFFSET, I32Op, I64Op, I64Op>> {
   static void Emit(X64Emitter& e, const EmitArgType& i) {
-    auto addr = ComputeMemoryAddressOffset(e, i.src1, i.src2);
-    if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) {
-      if (e.IsFeatureEnabled(kX64EmitMovbe)) {
-        e.movbe(i.dest, e.dword[addr]);
+    if (IsPossibleMMIOInstruction(e, i.instr)) {
+      void* addrptr = (void*)&MMIOAwareLoad<uint32_t, false>;
+
+      if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) {
+        addrptr = (void*)&MMIOAwareLoad<uint32_t, true>;
+      }
+      if (i.src1.is_constant) {
+        e.mov(e.GetNativeParam(0).cvt32(), (uint32_t)i.src1.constant());
+      } else {
+        e.mov(e.GetNativeParam(0).cvt32(), i.src1.reg().cvt32());
+      }
+
+      if (i.src2.is_constant) {
+        e.add(e.GetNativeParam(0).cvt32(), (uint32_t)i.src2.constant());
+      } else {
+        e.add(e.GetNativeParam(0).cvt32(), i.src2.reg().cvt32());
+      }
+
+      e.CallNativeSafe(addrptr);
+      e.mov(i.dest, e.eax);
+    } else {
+      auto addr = ComputeMemoryAddressOffset(e, i.src1, i.src2);
+      if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) {
+        if (e.IsFeatureEnabled(kX64EmitMovbe)) {
+          e.movbe(i.dest, e.dword[addr]);
+        } else {
+          e.mov(i.dest, e.dword[addr]);
+          e.bswap(i.dest);
+        }
       } else {
         e.mov(i.dest, e.dword[addr]);
-        e.bswap(i.dest);
       }
-    } else {
-      e.mov(i.dest, e.dword[addr]);
     }
   }
 };
@@ -1049,28 +1121,6 @@ struct LOAD_OFFSET_I64
 EMITTER_OPCODE_TABLE(OPCODE_LOAD_OFFSET, LOAD_OFFSET_I8, LOAD_OFFSET_I16,
                      LOAD_OFFSET_I32, LOAD_OFFSET_I64);
-
-template <typename T, bool swap>
-static void MMIOAwareStore(void* _ctx, unsigned int guestaddr, T value) {
-  if (swap) {
-    value = xe::byte_swap(value);
-  }
-  if (guestaddr >= 0xE0000000) {
-    guestaddr += 0x1000;
-  }
-
-  auto ctx = reinterpret_cast<ppc::PPCContext*>(_ctx);
-
-  auto gaddr = ctx->processor->memory()->LookupVirtualMappedRange(guestaddr);
-  if (!gaddr) {
-    *reinterpret_cast<T*>(ctx->virtual_membase + guestaddr) = value;
-  } else {
-    value = xe::byte_swap(value); /*
-         was having issues, found by comparing the values used with exceptions
-         to these that we were reversed...
-      */
-    gaddr->write(nullptr, gaddr->callback_context, guestaddr, value);
-  }
-}
 // ============================================================================
 // OPCODE_STORE_OFFSET
 // ============================================================================
@@ -1225,21 +1275,37 @@ struct LOAD_I16 : Sequence<LOAD_I16, I<OPCODE_LOAD, I16Op, I64Op>> {
 };
 struct LOAD_I32 : Sequence<LOAD_I32, I<OPCODE_LOAD, I32Op, I64Op>> {
   static void Emit(X64Emitter& e, const EmitArgType& i) {
-    auto addr = ComputeMemoryAddress(e, i.src1);
-    if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) {
-      if (e.IsFeatureEnabled(kX64EmitMovbe)) {
-        e.movbe(i.dest, e.dword[addr]);
+    if (IsPossibleMMIOInstruction(e, i.instr)) {
+      void* addrptr = (void*)&MMIOAwareLoad<uint32_t, false>;
+
+      if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) {
+        addrptr = (void*)&MMIOAwareLoad<uint32_t, true>;
+      }
+      if (i.src1.is_constant) {
+        e.mov(e.GetNativeParam(0).cvt32(), (uint32_t)i.src1.constant());
+      } else {
+        e.mov(e.GetNativeParam(0).cvt32(), i.src1.reg().cvt32());
+      }
+
+      e.CallNativeSafe(addrptr);
+      e.mov(i.dest, e.eax);
+    } else {
+      auto addr = ComputeMemoryAddress(e, i.src1);
+      if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) {
+        if (e.IsFeatureEnabled(kX64EmitMovbe)) {
+          e.movbe(i.dest, e.dword[addr]);
+        } else {
+          e.mov(i.dest, e.dword[addr]);
+          e.bswap(i.dest);
+        }
       } else {
         e.mov(i.dest, e.dword[addr]);
-        e.bswap(i.dest);
       }
-    } else {
-      e.mov(i.dest, e.dword[addr]);
-    }
-    if (IsTracingData()) {
-      e.mov(e.GetNativeParam(1).cvt32(), i.dest);
-      e.lea(e.GetNativeParam(0), e.ptr[addr]);
-      e.CallNative(reinterpret_cast<void*>(TraceMemoryLoadI32));
+      if (IsTracingData()) {
+        e.mov(e.GetNativeParam(1).cvt32(), i.dest);
+        e.lea(e.GetNativeParam(0), e.ptr[addr]);
+        e.CallNative(reinterpret_cast<void*>(TraceMemoryLoadI32));
+      }
     }
   }
 };
@@ -1390,14 +1456,13 @@ struct STORE_I32 : Sequence<STORE_I32, I<OPCODE_STORE, VoidOp, I64Op, I32Op>> {
       } else {
         e.mov(e.dword[addr], i.src2);
       }
+      if (IsTracingData()) {
+        e.mov(e.GetNativeParam(1).cvt32(), e.dword[addr]);
+        e.lea(e.GetNativeParam(0), e.ptr[addr]);
+        e.CallNative(reinterpret_cast<void*>(TraceMemoryStoreI32));
+      }
     }
   }
-    if (IsTracingData()) {
-      auto addr = ComputeMemoryAddress(e, i.src1);
-      e.mov(e.GetNativeParam(1).cvt32(), e.dword[addr]);
-      e.lea(e.GetNativeParam(0), e.ptr[addr]);
-      e.CallNative(reinterpret_cast<void*>(TraceMemoryStoreI32));
-    }
-  }
 };
 struct STORE_I64 : Sequence<STORE_I64, I<OPCODE_STORE, VoidOp, I64Op, I64Op>> {
@@ -19,15 +19,15 @@
 #include "xenia/base/cvar.h"
 #include "xenia/cpu/backend/x64/x64_stack_layout.h"

-DEFINE_bool(xop_rotates, false, "rotate via xop", "X64");
+DEFINE_bool(xop_rotates, false, "rotate via xop", "x64");

-DEFINE_bool(xop_left_shifts, false, "shl via xop", "X64");
+DEFINE_bool(xop_left_shifts, false, "shl via xop", "x64");

-DEFINE_bool(xop_right_shifts, false, "shr via xop", "X64");
+DEFINE_bool(xop_right_shifts, false, "shr via xop", "x64");

-DEFINE_bool(xop_arithmetic_right_shifts, false, "sar via xop", "X64");
+DEFINE_bool(xop_arithmetic_right_shifts, false, "sar via xop", "x64");

-DEFINE_bool(xop_compares, true, "compare via xop", "X64");
+DEFINE_bool(xop_compares, true, "compare via xop", "x64");

 namespace xe {
 namespace cpu {
@@ -67,7 +67,7 @@ using namespace xe::cpu::hir;

 using xe::cpu::hir::Instr;

-typedef bool (*SequenceSelectFn)(X64Emitter&, const Instr*);
+typedef bool (*SequenceSelectFn)(X64Emitter&, const Instr*, InstrKeyValue ikey);
 std::unordered_map<uint32_t, SequenceSelectFn> sequence_table;

 // ============================================================================
@@ -868,59 +868,6 @@ static bool MayCombineSetxWithFollowingCtxStore(const hir::Instr* setx_insn,
   }
   return false;
 }
-#define EMITTER_IS_TRUE(typ, tester)                                 \
-  struct IS_TRUE_##typ                                               \
-      : Sequence<IS_TRUE_##typ, I<OPCODE_IS_TRUE, I8Op, typ##Op>> {  \
-    static void Emit(X64Emitter& e, const EmitArgType& i) {          \
-      e.tester(i.src1, i.src1);                                      \
-      unsigned ctxoffset = 0;                                        \
-      if (MayCombineSetxWithFollowingCtxStore(i.instr, ctxoffset)) { \
-        e.setnz(e.byte[e.GetContextReg() + ctxoffset]);              \
-      } else {                                                       \
-        e.setnz(i.dest);                                             \
-      }                                                              \
-    }                                                                \
-  }
-
-#define EMITTER_IS_TRUE_INT(typ) EMITTER_IS_TRUE(typ, test)
-
-EMITTER_IS_TRUE_INT(I8);
-EMITTER_IS_TRUE_INT(I16);
-EMITTER_IS_TRUE_INT(I32);
-EMITTER_IS_TRUE_INT(I64);
-EMITTER_IS_TRUE(F32, vtestps);
-EMITTER_IS_TRUE(F64, vtestpd);
-
-EMITTER_IS_TRUE(V128, vptest);
-
-EMITTER_OPCODE_TABLE(OPCODE_IS_TRUE, IS_TRUE_I8, IS_TRUE_I16, IS_TRUE_I32,
-                     IS_TRUE_I64, IS_TRUE_F32, IS_TRUE_F64, IS_TRUE_V128);
-
-#define EMITTER_IS_FALSE(typ, tester)                                \
-  struct IS_FALSE_##typ                                              \
-      : Sequence<IS_FALSE_##typ, I<OPCODE_IS_FALSE, I8Op, typ##Op>> {\
-    static void Emit(X64Emitter& e, const EmitArgType& i) {          \
-      e.tester(i.src1, i.src1);                                      \
-      unsigned ctxoffset = 0;                                        \
-      if (MayCombineSetxWithFollowingCtxStore(i.instr, ctxoffset)) { \
-        e.setz(e.byte[e.GetContextReg() + ctxoffset]);               \
-      } else {                                                       \
-        e.setz(i.dest);                                              \
-      }                                                              \
-    }                                                                \
-  }
-#define EMITTER_IS_FALSE_INT(typ) EMITTER_IS_FALSE(typ, test)
-EMITTER_IS_FALSE_INT(I8);
-EMITTER_IS_FALSE_INT(I16);
-EMITTER_IS_FALSE_INT(I32);
-EMITTER_IS_FALSE_INT(I64);
-EMITTER_IS_FALSE(F32, vtestps);
-EMITTER_IS_FALSE(F64, vtestpd);
-
-EMITTER_IS_FALSE(V128, vptest);
-
-EMITTER_OPCODE_TABLE(OPCODE_IS_FALSE, IS_FALSE_I8, IS_FALSE_I16, IS_FALSE_I32,
-                     IS_FALSE_I64, IS_FALSE_F32, IS_FALSE_F64, IS_FALSE_V128);
-
 // ============================================================================
 // OPCODE_IS_NAN
 // ============================================================================
@@ -3308,7 +3255,7 @@ bool SelectSequence(X64Emitter* e, const Instr* i, const Instr** new_tail) {

   auto it = sequence_table.find(key);
   if (it != sequence_table.end()) {
-    if (it->second(*e, i)) {
+    if (it->second(*e, i, InstrKey(i))) {
       *new_tail = i->next;
       return true;
     }
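This is the payoff of the InstrKeyValue plumbing in the earlier hunks: the packed key is now built once per instruction here and handed down, rather than rebuilt inside every candidate sequence's Load(). A simplified sketch of the idea with an invented field layout (the real InstrKey packing is not shown in this diff):

    #include <cstdint>

    struct Instr {
      uint8_t opcode;
      uint8_t dest_type, src1_type, src2_type, src3_type;
    };

    using InstrKeyValue = uint32_t;

    // One cheap pack per instruction replaces N per-candidate repacks.
    inline InstrKeyValue MakeInstrKey(const Instr& i) {
      return uint32_t(i.opcode) | uint32_t(i.dest_type) << 8 |
             uint32_t(i.src1_type) << 13 | uint32_t(i.src2_type) << 18 |
             uint32_t(i.src3_type) << 23;
    }

    template <typename Seq>
    bool TryLoad(const Instr& i, InstrKeyValue ikey) {
      (void)i;                   // operands are only read after the key matches
      return ikey == Seq::key;   // was: MakeInstrKey(i) == Seq::key
    }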
@@ -25,7 +25,7 @@ namespace x64 {

 class X64Emitter;

-typedef bool (*SequenceSelectFn)(X64Emitter&, const hir::Instr*);
+typedef bool (*SequenceSelectFn)(X64Emitter&, const hir::Instr*, uint32_t ikey);
 extern std::unordered_map<uint32_t, SequenceSelectFn> sequence_table;

 template <typename T>
@@ -361,28 +361,6 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
           }
         }
         break;
-      case OPCODE_IS_TRUE:
-        if (i->src1.value->IsConstant()) {
-          if (i->src1.value->IsConstantTrue()) {
-            v->set_constant(uint8_t(1));
-          } else {
-            v->set_constant(uint8_t(0));
-          }
-          i->Remove();
-          result = true;
-        }
-        break;
-      case OPCODE_IS_FALSE:
-        if (i->src1.value->IsConstant()) {
-          if (i->src1.value->IsConstantFalse()) {
-            v->set_constant(uint8_t(1));
-          } else {
-            v->set_constant(uint8_t(0));
-          }
-          i->Remove();
-          result = true;
-        }
-        break;
       case OPCODE_IS_NAN:
         if (i->src1.value->IsConstant()) {
           if (i->src1.value->type == FLOAT32_TYPE &&
@@ -602,7 +580,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
         if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
           if (should_skip_because_of_float) {
             break;
           }
           v->set_from(i->src1.value);
           v->Max(i->src2.value);
           i->Remove();
@@ -214,12 +214,7 @@ bool SimplificationPass::CheckBooleanXor1(hir::Instr* i,
   bool need_zx = (tunflags & MOVTUNNEL_MOVZX) != 0;

   Value* new_value = nullptr;
-  if (xorop == OPCODE_IS_FALSE) {
-    new_value = builder->IsTrue(xordef->src1.value);
-
-  } else if (xorop == OPCODE_IS_TRUE) {
-    new_value = builder->IsFalse(xordef->src1.value);
-  } else if (xorop == OPCODE_COMPARE_EQ) {
+  if (xorop == OPCODE_COMPARE_EQ) {
     new_value = builder->CompareNE(xordef->src1.value, xordef->src2.value);

   } else if (xorop == OPCODE_COMPARE_NE) {
@@ -294,7 +289,7 @@ bool SimplificationPass::CheckXor(hir::Instr* i, hir::HIRBuilder* builder) {
   return false;
 }
 bool SimplificationPass::Is1BitOpcode(hir::Opcode def_opcode) {
-  return def_opcode >= OPCODE_IS_TRUE && def_opcode <= OPCODE_DID_SATURATE;
+  return def_opcode >= OPCODE_COMPARE_EQ && def_opcode <= OPCODE_DID_SATURATE;
 }

 inline static uint64_t RotateOfSize(ScalarNZM nzm, unsigned rotation,
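The range shift works because IS_TRUE/IS_FALSE are now just spellings of compares against zero; that identity is what the whole opcode removal in this commit leans on. Checked in plain C++ for the scalar-integer case:

    #include <cassert>
    #include <cstdint>

    int main() {
      for (uint64_t x : {0ull, 1ull, 2ull, ~0ull}) {
        uint8_t is_true = x ? 1 : 0;            // old IS_TRUE semantics
        uint8_t is_false = x ? 0 : 1;           // old IS_FALSE semantics
        assert(is_true == (uint8_t)(x != 0));   // COMPARE_NE(x, 0)
        assert(is_false == (uint8_t)(x == 0));  // COMPARE_EQ(x, 0)
      }
      return 0;
    }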
@@ -804,24 +799,12 @@ bool SimplificationPass::CheckScalarConstCmp(hir::Instr* i,
   if (!var_definition) {
     return false;
   }
-  // x == 0 -> !x
-  if (cmpop == OPCODE_COMPARE_EQ && constant_unpacked == 0) {
-    i->Replace(&OPCODE_IS_FALSE_info, 0);
-    i->set_src1(variable);
-    return true;
-  }
-  // x != 0 -> !!x
-  if (cmpop == OPCODE_COMPARE_NE && constant_unpacked == 0) {
-    i->Replace(&OPCODE_IS_TRUE_info, 0);
-    i->set_src1(variable);
-    return true;
-  }

   if (cmpop == OPCODE_COMPARE_ULE &&
       constant_unpacked ==
           0) {  // less than or equal to zero = (== 0) = IS_FALSE
-    i->Replace(&OPCODE_IS_FALSE_info, 0);
-    i->set_src1(variable);
+    i->opcode = &OPCODE_COMPARE_EQ_info;
     return true;
   }
   // todo: OPCODE_COMPARE_NE too?
@@ -840,15 +823,20 @@ bool SimplificationPass::CheckScalarConstCmp(hir::Instr* i,
   }
   if (cmpop == OPCODE_COMPARE_ULT &&
       constant_unpacked == 1) {  // unsigned lt 1 means == 0
-    i->Replace(&OPCODE_IS_FALSE_info, 0);
-    i->set_src1(variable);
+    // i->Replace(&OPCODE_IS_FALSE_info, 0);
+    i->opcode = &OPCODE_COMPARE_EQ_info;
+
+    // i->set_src1(variable);
+    i->set_src2(builder->LoadZero(variable->type));
     return true;
   }
   if (cmpop == OPCODE_COMPARE_UGT &&
       constant_unpacked == 0) {  // unsigned gt 1 means != 0

-    i->Replace(&OPCODE_IS_TRUE_info, 0);
-    i->set_src1(variable);
+    // i->Replace(&OPCODE_IS_TRUE_info, 0);
+    // i->set_src1(variable);
+    i->opcode = &OPCODE_COMPARE_NE_info;
    return true;
   }
@@ -870,8 +858,11 @@ bool SimplificationPass::CheckScalarConstCmp(hir::Instr* i,
   } else if (cmpop == OPCODE_COMPARE_SGT && signbit_definitely_0 &&
              constant_unpacked == 0) {
     // signbit cant be set, and checking if gt 0, so actually checking != 0
-    i->Replace(&OPCODE_IS_TRUE_info, 0);
-    i->set_src1(variable);
+    // i->Replace(&OPCODE_IS_TRUE_info, 0);
+
+    // i->set_src1(variable);
+    i->opcode = &OPCODE_COMPARE_NE_info;

     return true;
   }
@@ -885,9 +876,9 @@ bool SimplificationPass::CheckScalarConstCmp(hir::Instr* i,
   Value* constant_replacement = nullptr;

   if (cmpop == OPCODE_COMPARE_EQ || cmpop == OPCODE_COMPARE_UGE) {
-    repl = &OPCODE_IS_TRUE_info;
+    repl = &OPCODE_COMPARE_NE_info;
   } else if (cmpop == OPCODE_COMPARE_NE || cmpop == OPCODE_COMPARE_ULT) {
-    repl = &OPCODE_IS_FALSE_info;
+    repl = &OPCODE_COMPARE_EQ_info;

   } else if (cmpop == OPCODE_COMPARE_UGT) {
     // impossible, cannot be greater than mask
@@ -906,6 +897,7 @@ bool SimplificationPass::CheckScalarConstCmp(hir::Instr* i,
   if (repl) {
     i->Replace(repl, 0);
     i->set_src1(variable);
+    i->set_src2(builder->LoadZero(variable->type));
     return true;
   }
   if (constant_replacement) {
@@ -919,10 +911,16 @@ bool SimplificationPass::CheckScalarConstCmp(hir::Instr* i,
 }
 bool SimplificationPass::CheckIsTrueIsFalse(hir::Instr* i,
                                             hir::HIRBuilder* builder) {
-  bool istrue = i->opcode == &OPCODE_IS_TRUE_info;
-  bool isfalse = i->opcode == &OPCODE_IS_FALSE_info;
+  bool istrue = i->opcode == &OPCODE_COMPARE_NE_info;
+  bool isfalse = i->opcode == &OPCODE_COMPARE_EQ_info;

-  Value* input = i->src1.value;
+  auto [input_cosntant, input] = i->BinaryValueArrangeAsConstAndVar();
+
+  if (!input_cosntant || input_cosntant->AsUint64() != 0) {
+    return false;
+  }
+
+  // Value* input = i->src1.value;
   TypeName input_type = input->type;
   if (!IsScalarIntegralType(input_type)) {
     return false;
@@ -1012,8 +1010,10 @@ bool SimplificationPass::CheckSHRByConst(hir::Instr* i,
       i->set_src1(isfalsetest);

     } else {
-      i->Replace(&OPCODE_IS_FALSE_info, 0);
+      // i->Replace(&OPCODE_IS_FALSE_info, 0);
+      i->Replace(&OPCODE_COMPARE_EQ_info, 0);
       i->set_src1(lz_input);
+      i->set_src2(builder->LoadZero(lz_input->type));
     }
     return true;
 }
@@ -1067,7 +1067,7 @@ bool SimplificationPass::SimplifyBitArith(hir::HIRBuilder* builder) {
     while (i) {
       // vector types use the same opcodes as scalar ones for AND/OR/XOR! we
       // don't handle these in our simplifications, so skip
-      if (i->dest && IsScalarIntegralType(i->dest->type)) {
+      if (i->AllScalarIntegral()) {
         Opcode iop = i->opcode->num;

         if (iop == OPCODE_OR) {
|
||||||
result |= CheckAdd(i, builder);
|
result |= CheckAdd(i, builder);
|
||||||
} else if (IsScalarBasicCmp(iop)) {
|
} else if (IsScalarBasicCmp(iop)) {
|
||||||
result |= CheckScalarConstCmp(i, builder);
|
result |= CheckScalarConstCmp(i, builder);
|
||||||
} else if (iop == OPCODE_IS_FALSE || iop == OPCODE_IS_TRUE) {
|
|
||||||
result |= CheckIsTrueIsFalse(i, builder);
|
result |= CheckIsTrueIsFalse(i, builder);
|
||||||
} else if (iop == OPCODE_SHR) {
|
} else if (iop == OPCODE_SHR) {
|
||||||
result |= CheckSHR(i, builder);
|
result |= CheckSHR(i, builder);
|
||||||
|
|
|
@@ -1023,7 +1023,6 @@ Value* HIRBuilder::Truncate(Value* value, TypeName target_type) {

 Value* HIRBuilder::Convert(Value* value, TypeName target_type,
                            RoundMode round_mode) {
-
   Instr* i =
       AppendInstr(OPCODE_CONVERT_info, round_mode, AllocValue(target_type));
   i->set_src1(value);
@@ -1034,7 +1033,6 @@ Value* HIRBuilder::Convert(Value* value, TypeName target_type,
 Value* HIRBuilder::Round(Value* value, RoundMode round_mode) {
   ASSERT_FLOAT_OR_VECTOR_TYPE(value);

-
   Instr* i =
       AppendInstr(OPCODE_ROUND_info, round_mode, AllocValue(value->type));
   i->set_src1(value);
@@ -1248,7 +1246,34 @@ void HIRBuilder::Store(Value* address, Value* value, uint32_t store_flags) {
   i->set_src2(value);
   i->src3.value = NULL;
 }
+Value* HIRBuilder::LoadVectorLeft(Value* address) {
+  ASSERT_ADDRESS_TYPE(address);
+  Instr* i = AppendInstr(OPCODE_LVL_info, 0, AllocValue(VEC128_TYPE));
+  i->set_src1(address);
+  i->src2.value = i->src3.value = NULL;
+  return i->dest;
+}
+Value* HIRBuilder::LoadVectorRight(Value* address) {
+  ASSERT_ADDRESS_TYPE(address);
+  Instr* i = AppendInstr(OPCODE_LVR_info, 0, AllocValue(VEC128_TYPE));
+  i->set_src1(address);
+  i->src2.value = i->src3.value = NULL;
+  return i->dest;
+}
+void HIRBuilder::StoreVectorLeft(Value* address, Value* value) {
+  ASSERT_ADDRESS_TYPE(address);
+  Instr* i = AppendInstr(OPCODE_STVL_info, 0);
+  i->set_src1(address);
+  i->set_src2(value);
+  i->src3.value = NULL;
+}
+void HIRBuilder::StoreVectorRight(Value* address, Value* value) {
+  ASSERT_ADDRESS_TYPE(address);
+  Instr* i = AppendInstr(OPCODE_STVR_info, 0);
+  i->set_src1(address);
+  i->set_src2(value);
+  i->src3.value = NULL;
+}
 void HIRBuilder::Memset(Value* address, Value* value, Value* length) {
   ASSERT_ADDRESS_TYPE(address);
   ASSERT_TYPES_EQUAL(address, length);
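The new LVL/LVR/STVL/STVR opcodes (the renamed LVLX family, per the opcode list below) model PowerPC's lvlx/lvrx-style split accesses, where two partial loads combine into one unaligned vector. A plain C++ model of the usual composition, with little-endian indexing and PPC byte-order details deliberately elided:

    #include <cstdint>
    #include <cstring>

    // Bytes from addr to the end of its 16-byte block, left-justified, rest 0.
    static void LoadVecLeft(const uint8_t* mem, uint32_t addr, uint8_t out[16]) {
      uint32_t offs = addr & 15;
      std::memset(out, 0, 16);
      std::memcpy(out, mem + addr, 16 - offs);
    }

    // Bytes from the start of addr's block up to addr, right-justified, rest 0.
    static void LoadVecRight(const uint8_t* mem, uint32_t addr, uint8_t out[16]) {
      uint32_t offs = addr & 15;
      std::memset(out, 0, 16);
      std::memcpy(out + 16 - offs, mem + addr - offs, offs);
    }

    // How guest code composes an unaligned 16-byte load from the pair.
    static void LoadUnaligned(const uint8_t* mem, uint32_t addr, uint8_t out[16]) {
      uint8_t left[16], right[16];
      LoadVecLeft(mem, addr, left);
      LoadVecRight(mem, addr + 16, right);
      for (int n = 0; n < 16; ++n) out[n] = left[n] | right[n];
    }

The two halves never overlap, so OR-ing them reconstructs exactly the 16 bytes starting at the unaligned address.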
@@ -1283,7 +1308,7 @@ void HIRBuilder::SetNJM(Value* value) {
 Value* HIRBuilder::Max(Value* value1, Value* value2) {
   ASSERT_TYPES_EQUAL(value1, value2);

-  if (IsScalarIntegralType( value1->type) && value1->IsConstant() &&
+  if (IsScalarIntegralType(value1->type) && value1->IsConstant() &&
       value2->IsConstant()) {
     return value1->Compare(OPCODE_COMPARE_SLT, value2) ? value2 : value1;
   }
@@ -1351,27 +1376,51 @@ Value* HIRBuilder::Select(Value* cond, Value* value1, Value* value2) {
   i->set_src3(value2);
   return i->dest;
 }
+static Value* OrLanes32(HIRBuilder& f, Value* value) {
+  hir::Value* v1 = f.Extract(value, (uint8_t)0, INT32_TYPE);
+  hir::Value* v2 = f.Extract(value, (uint8_t)1, INT32_TYPE);
+  hir::Value* v3 = f.Extract(value, (uint8_t)2, INT32_TYPE);
+  hir::Value* ored = f.Or(v1, v2);
+
+  hir::Value* v4 = f.Extract(value, (uint8_t)3, INT32_TYPE);
+  ored = f.Or(ored, v3);
+
+  ored = f.Or(ored, v4);
+  return ored;
+}
 Value* HIRBuilder::IsTrue(Value* value) {
+  assert_true(value);
+  if (value->type == VEC128_TYPE) {
+    // chrispy: this probably doesnt happen often enough to be worth its own
+    // opcode or special code path but this could be optimized to not require as
+    // many extracts, we can shuffle and or v128 and then extract the low
+
+    return CompareEQ(OrLanes32(*this, value), LoadZeroInt32());
+  }
+
   if (value->IsConstant()) {
     return LoadConstantInt8(value->IsConstantTrue() ? 1 : 0);
   }

-  Instr* i = AppendInstr(OPCODE_IS_TRUE_info, 0, AllocValue(INT8_TYPE));
-  i->set_src1(value);
-  i->src2.value = i->src3.value = NULL;
-  return i->dest;
+  return CompareNE(value, LoadZero(value->type));
 }

 Value* HIRBuilder::IsFalse(Value* value) {
+  assert_true(value);
+
+  if (value->type == VEC128_TYPE) {
+    // chrispy: this probably doesnt happen often enough to be worth its own
+    // opcode or special code path but this could be optimized to not require as
+    // many extracts, we can shuffle and or v128 and then extract the low
+
+    return CompareEQ(OrLanes32(*this, value), LoadZeroInt32());
+  }
+
   if (value->IsConstant()) {
     return LoadConstantInt8(value->IsConstantFalse() ? 1 : 0);
   }

-  Instr* i = AppendInstr(OPCODE_IS_FALSE_info, 0, AllocValue(INT8_TYPE));
-  i->set_src1(value);
-  i->src2.value = i->src3.value = NULL;
-  return i->dest;
+  return CompareEQ(value, LoadZero(value->type));
 }

 Value* HIRBuilder::IsNan(Value* value) {
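The chrispy comment inside IsTrue/IsFalse points at the cheaper alternative to four extracts: OR the lanes together inside the vector register and extract once. A sketch of that shuffle-and-OR with SSE2 intrinsics (illustrative only, not actual emitter output):

    #include <emmintrin.h>
    #include <cstdint>

    static uint32_t OrLanes32(__m128i v) {
      // Fold the high 64 bits onto the low 64, then lane 1 onto lane 0.
      __m128i x = _mm_or_si128(v, _mm_shuffle_epi32(v, _MM_SHUFFLE(1, 0, 3, 2)));
      x = _mm_or_si128(x, _mm_shuffle_epi32(x, _MM_SHUFFLE(2, 3, 0, 1)));
      // Low lane is now the OR of all four; nonzero iff any input bit was set.
      return (uint32_t)_mm_cvtsi128_si32(x);
    }

Two shuffles and two ORs replace four extract/OR round-trips through general-purpose registers.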
@@ -166,6 +166,11 @@ class HIRBuilder {
                uint32_t store_flags = 0);
   Value* Load(Value* address, TypeName type, uint32_t load_flags = 0);

+  Value* LoadVectorLeft(Value* address);
+  Value* LoadVectorRight(Value* address);
+  void StoreVectorLeft(Value* address, Value* value);
+  void StoreVectorRight(Value* address, Value* value);
   void Store(Value* address, Value* value, uint32_t store_flags = 0);
   void Memset(Value* address, Value* value, Value* length);
   void CacheControl(Value* address, size_t cache_line_size,
@@ -268,6 +273,7 @@ class HIRBuilder {
                              Value* new_value);
   Value* AtomicAdd(Value* address, Value* value);
   Value* AtomicSub(Value* address, Value* value);
+
   void SetNJM(Value* value);

  protected:
@@ -213,7 +213,19 @@ uint32_t Instr::GuestAddressFor() const {

   return 0;  // eek.
 }
+bool Instr::AllScalarIntegral() {
+  bool result = true;
+  if (dest) {
+    if (!IsScalarIntegralType(dest->type)) {
+      return false;
+    }
+  }
+
+  VisitValueOperands([&result](Value* v, uint32_t idx) {
+    result = result && IsScalarIntegralType(v->type);
+  });
+  return result;
+}
 }  // namespace hir
 }  // namespace cpu
 }  // namespace xe
@@ -171,6 +171,8 @@ if both are constant, return nullptr, nullptr
   const hir::Instr* GetNonFakePrev() const;

   uint32_t GuestAddressFor() const;
+
+  bool AllScalarIntegral();  // dest and all srcs are scalar integral
 };

 }  // namespace hir
@@ -210,10 +210,10 @@ enum Opcode {
   OPCODE_STORE,
   // chrispy: todo: implement, our current codegen for the unaligned loads is
   // very bad
-  OPCODE_LVLX,
-  OPCODE_LVRX,
-  OPCODE_STVLX,
-  OPCODE_STVRX,
+  OPCODE_LVL,
+  OPCODE_LVR,
+  OPCODE_STVL,
+  OPCODE_STVR,
   OPCODE_MEMSET,
   OPCODE_CACHE_CONTROL,
   OPCODE_MEMORY_BARRIER,
@@ -222,8 +222,6 @@ enum Opcode {
   OPCODE_MIN,
   OPCODE_VECTOR_MIN,
   OPCODE_SELECT,
-  OPCODE_IS_TRUE,
-  OPCODE_IS_FALSE,
   OPCODE_IS_NAN,
   OPCODE_COMPARE_EQ,
   OPCODE_COMPARE_NE,
@@ -303,17 +303,6 @@ DEFINE_OPCODE(
     OPCODE_SIG_V_V_V_V,
     0)

-DEFINE_OPCODE(
-    OPCODE_IS_TRUE,
-    "is_true",
-    OPCODE_SIG_V_V,
-    0)
-
-DEFINE_OPCODE(
-    OPCODE_IS_FALSE,
-    "is_false",
-    OPCODE_SIG_V_V,
-    0)
-
 DEFINE_OPCODE(
     OPCODE_IS_NAN,
@@ -706,4 +695,27 @@ DEFINE_OPCODE(
     OPCODE_SIG_X_V,
     0
 )
+DEFINE_OPCODE(
+    OPCODE_LVL,
+    "loadv_left",
+    OPCODE_SIG_V_V,
+    OPCODE_FLAG_MEMORY
+)
+
+DEFINE_OPCODE(
+    OPCODE_LVR,
+    "loadv_right",
+    OPCODE_SIG_V_V,
+    OPCODE_FLAG_MEMORY
+)
+DEFINE_OPCODE(
+    OPCODE_STVL,
+    "storev_left",
+    OPCODE_SIG_X_V_V,
+    OPCODE_FLAG_MEMORY)
+
+DEFINE_OPCODE(
+    OPCODE_STVR,
+    "storev_right",
+    OPCODE_SIG_X_V_V,
+    OPCODE_FLAG_MEMORY)
@@ -418,6 +418,10 @@ void PPCHIRBuilder::UpdateCR6(Value* src_value) {
   // Testing for all 1's and all 0's.
   // if (Rc) CR6 = all_equal | 0 | none_equal | 0
   // TODO(benvanik): efficient instruction?
+
+  // chrispy: nothing seems to write cr6_1, figure out if no documented
+  // instructions write anything other than 0 to it and remove these stores if
+  // so
   StoreContext(offsetof(PPCContext, cr6.cr6_1), LoadZeroInt8());
   StoreContext(offsetof(PPCContext, cr6.cr6_3), LoadZeroInt8());
   StoreContext(offsetof(PPCContext, cr6.cr6_all_equal),
@@ -7,18 +7,17 @@
  ******************************************************************************
  */

+#include "xenia/gpu/d3d12/d3d12_command_processor.h"
 #include <algorithm>
 #include <cstring>
 #include <sstream>
 #include <utility>

 #include "xenia/base/assert.h"
 #include "xenia/base/byte_order.h"
 #include "xenia/base/cvar.h"
 #include "xenia/base/logging.h"
 #include "xenia/base/math.h"
 #include "xenia/base/profiling.h"
-#include "xenia/gpu/d3d12/d3d12_command_processor.h"
 #include "xenia/gpu/d3d12/d3d12_graphics_system.h"
 #include "xenia/gpu/d3d12/d3d12_shader.h"
 #include "xenia/gpu/draw_util.h"
@@ -843,6 +842,7 @@ bool D3D12CommandProcessor::SetupContext() {
   bool draw_resolution_scale_not_clamped =
       TextureCache::GetConfigDrawResolutionScale(draw_resolution_scale_x,
                                                  draw_resolution_scale_y);
+
   if (!D3D12TextureCache::ClampDrawResolutionScaleToMaxSupported(
           draw_resolution_scale_x, draw_resolution_scale_y, provider)) {
     draw_resolution_scale_not_clamped = false;
@@ -1676,37 +1676,52 @@ void D3D12CommandProcessor::ShutdownContext() {

   CommandProcessor::ShutdownContext();
 }
+// todo: bit-pack the bools and use bitarith to reduce branches
 void D3D12CommandProcessor::WriteRegister(uint32_t index, uint32_t value) {
   CommandProcessor::WriteRegister(index, value);

-  if (index >= XE_GPU_REG_SHADER_CONSTANT_000_X &&
-      index <= XE_GPU_REG_SHADER_CONSTANT_511_W) {
-    if (frame_open_) {
-      uint32_t float_constant_index =
-          (index - XE_GPU_REG_SHADER_CONSTANT_000_X) >> 2;
-      if (float_constant_index >= 256) {
-        float_constant_index -= 256;
-        if (current_float_constant_map_pixel_[float_constant_index >> 6] &
-            (1ull << (float_constant_index & 63))) {
-          cbuffer_binding_float_pixel_.up_to_date = false;
-        }
-      } else {
-        if (current_float_constant_map_vertex_[float_constant_index >> 6] &
-            (1ull << (float_constant_index & 63))) {
-          cbuffer_binding_float_vertex_.up_to_date = false;
-        }
+  bool cbuf_binding_float_pixel_utd = cbuffer_binding_float_pixel_.up_to_date;
+  bool cbuf_binding_float_vertex_utd = cbuffer_binding_float_vertex_.up_to_date;
+  bool cbuf_binding_bool_loop_utd = cbuffer_binding_bool_loop_.up_to_date;
+
+  if (index >= XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0 &&
+      index <= XE_GPU_REG_SHADER_CONSTANT_FETCH_31_5) {
+    cbuffer_binding_fetch_.up_to_date = false;
+    // texture cache is never nullptr
+    // if (texture_cache_ != nullptr) {
+    texture_cache_->TextureFetchConstantWritten(
+        (index - XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0) / 6);
+    // }
+  } else {
+    if (!(cbuf_binding_float_pixel_utd | cbuf_binding_float_vertex_utd |
+          cbuf_binding_bool_loop_utd)) {
+      return;
+    }
+
+    if (index >= XE_GPU_REG_SHADER_CONSTANT_000_X &&
+        index <= XE_GPU_REG_SHADER_CONSTANT_511_W) {
+      if (!(cbuf_binding_float_pixel_utd | cbuf_binding_float_vertex_utd)) {
+        return;
+      }
+      if (frame_open_) {
+        uint32_t float_constant_index =
+            (index - XE_GPU_REG_SHADER_CONSTANT_000_X) >> 2;
+        if (float_constant_index >= 256) {
+          float_constant_index -= 256;
+          if (current_float_constant_map_pixel_[float_constant_index >> 6] &
+              (1ull << (float_constant_index & 63))) {
+            cbuffer_binding_float_pixel_.up_to_date = false;
+          }
+        } else {
+          if (current_float_constant_map_vertex_[float_constant_index >> 6] &
+              (1ull << (float_constant_index & 63))) {
+            cbuffer_binding_float_vertex_.up_to_date = false;
+          }
+        }
       }
+    } else if (index >= XE_GPU_REG_SHADER_CONSTANT_BOOL_000_031 &&
+               index <= XE_GPU_REG_SHADER_CONSTANT_LOOP_31) {
+      cbuffer_binding_bool_loop_.up_to_date = false;
     }
-  } else if (index >= XE_GPU_REG_SHADER_CONSTANT_BOOL_000_031 &&
-             index <= XE_GPU_REG_SHADER_CONSTANT_LOOP_31) {
-    cbuffer_binding_bool_loop_.up_to_date = false;
-  } else if (index >= XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0 &&
-             index <= XE_GPU_REG_SHADER_CONSTANT_FETCH_31_5) {
-    cbuffer_binding_fetch_.up_to_date = false;
-    if (texture_cache_ != nullptr) {
-      texture_cache_->TextureFetchConstantWritten(
-          (index - XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0) / 6);
-    }
   }
 }
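The rewritten WriteRegister snapshots the up-to-date flags once and bails early when every binding has already been invalidated, since a register write can then change nothing. The shape of that early-out reduced to a minimal sketch (names invented):

    struct CbufferDirtyFlags {
      bool float_pixel_up_to_date;
      bool float_vertex_up_to_date;
      bool bool_loop_up_to_date;
    };

    // True when a constant-register write may still need to invalidate
    // something. Bitwise | (not ||) keeps this one test with no extra
    // branches, which is the "bitarith" direction the todo above mentions.
    inline bool AnyBindingStillUpToDate(const CbufferDirtyFlags& f) {
      return (f.float_pixel_up_to_date | f.float_vertex_up_to_date |
              f.bool_loop_up_to_date) != 0;
    }

WriteRegister is called for every register in a PM4 write burst, so shaving branches from its common path pays off disproportionately.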
@@ -2301,14 +2316,26 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type,
   uint32_t draw_resolution_scale_x = texture_cache_->draw_resolution_scale_x();
   uint32_t draw_resolution_scale_y = texture_cache_->draw_resolution_scale_y();
   draw_util::ViewportInfo viewport_info;
-  draw_util::GetHostViewportInfo(
-      regs, draw_resolution_scale_x, draw_resolution_scale_y, true,
+  draw_util::GetViewportInfoArgs gviargs{};
+
+  gviargs.Setup(
+      draw_resolution_scale_x, draw_resolution_scale_y,
+      texture_cache_->draw_resolution_scale_x_divisor(),
+      texture_cache_->draw_resolution_scale_y_divisor(), true,
       D3D12_VIEWPORT_BOUNDS_MAX, D3D12_VIEWPORT_BOUNDS_MAX, false,
       normalized_depth_control,
       host_render_targets_used &&
           render_target_cache_->depth_float24_convert_in_pixel_shader(),
-      host_render_targets_used, pixel_shader && pixel_shader->writes_depth(),
-      viewport_info);
+      host_render_targets_used, pixel_shader && pixel_shader->writes_depth());
+  gviargs.SetupRegisterValues(regs);
+
+  if (gviargs == previous_viewport_info_args_) {
+    viewport_info = previous_viewport_info_;
+  } else {
+    draw_util::GetHostViewportInfo(&gviargs, viewport_info);
+    previous_viewport_info_args_ = gviargs;
+    previous_viewport_info_ = viewport_info;
+  }
   draw_util::Scissor scissor;
   draw_util::GetScissor(regs, scissor);
   scissor.offset[0] *= draw_resolution_scale_x;
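The viewport change is single-entry memoization: every input of GetHostViewportInfo moves into an equality-comparable args struct, and the result is recomputed only when the args differ from the previous draw. A generic sketch of the pattern (assumes Args is copyable with operator== and Result is default-constructible):

    template <typename Args, typename Result>
    class LastValueCache {
     public:
      template <typename Fn>
      const Result& Get(const Args& args, Fn&& compute) {
        if (!valid_ || !(args == last_args_)) {
          last_result_ = compute(args);  // miss: recompute and remember
          last_args_ = args;
          valid_ = true;
        }
        return last_result_;  // hit: reuse the previous draw's result
      }

     private:
      bool valid_ = false;
      Args last_args_{};
      Result last_result_{};
    };

previous_viewport_info_args_/previous_viewport_info_ in the hunk above are exactly such a one-entry cache, inlined into the command processor.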
@@ -2711,6 +2738,24 @@ void D3D12CommandProcessor::InitializeTrace() {
     shared_memory_->InitializeTraceCompleteDownloads();
   }
 }
+static void DmaPrefunc(dma::XeDMAJob* job) {
+  D3D12_RANGE readback_range;
+  readback_range.Begin = 0;
+  readback_range.End = job->size;
+  void* readback_mapping;
+  ID3D12Resource* readback_buffer = (ID3D12Resource*)job->userdata1;
+
+  HRESULT mapres = readback_buffer->Map(0, &readback_range, &readback_mapping);
+  xenia_assert(SUCCEEDED(mapres));
+
+  job->source = (uint8_t*)readback_mapping;
+}
+
+static void DmaPostfunc(dma::XeDMAJob* job) {
+  D3D12_RANGE readback_write_range = {};
+  ID3D12Resource* readback_buffer = (ID3D12Resource*)job->userdata1;
+  readback_buffer->Unmap(0, &readback_write_range);
+}
+
 bool D3D12CommandProcessor::IssueCopy() {
 #if XE_UI_D3D12_FINE_GRAINED_DRAW_SCOPES
@@ -2736,17 +2781,35 @@ bool D3D12CommandProcessor::IssueCopy() {
         readback_buffer, 0, shared_memory_buffer, written_address,
         written_length);
     if (AwaitAllQueueOperationsCompletion()) {
+#if 1
       D3D12_RANGE readback_range;
       readback_range.Begin = 0;
       readback_range.End = written_length;
       void* readback_mapping;
       if (SUCCEEDED(
               readback_buffer->Map(0, &readback_range, &readback_mapping))) {
-        std::memcpy(memory_->TranslatePhysical(written_address),
-                    readback_mapping, written_length);
+        // chrispy: this memcpy needs to be optimized as much as possible
+
+        auto physaddr = memory_->TranslatePhysical(written_address);
+        dma::vastcpy(physaddr, (uint8_t*)readback_mapping,
+                     written_length);
+        // XEDmaCpy(physaddr, readback_mapping, written_length);
         D3D12_RANGE readback_write_range = {};
         readback_buffer->Unmap(0, &readback_write_range);
       }
+
+#else
+      dma::XeDMAJob job{};
+      job.destination = memory_->TranslatePhysical(written_address);
+      job.size = written_length;
+      job.source = nullptr;
+      job.userdata1 = (void*)readback_buffer;
+      job.precall = DmaPrefunc;
+      job.postcall = DmaPostfunc;
+
+      readback_available_ = GetDMAC()->PushDMAJob(&job);
+
+#endif
     }
   }
 }
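Why the plain std::memcpy was slow here: a D3D12 readback heap is effectively uncached/write-combined from the CPU's point of view, so a cached scalar copy crawls; dma::vastcpy (added in this PR) instead streams whole cache lines with non-temporal writes. A sketch of the calling pattern, with the declaration approximated from this call site (the real one carries restrict/noinline annotations not shown here):

    #include <cstdint>

    namespace xe::dma {
    // Approximated declaration; see dma.h in this PR for the real signature.
    void vastcpy(uint8_t* physaddr, uint8_t* rdmapping, uint32_t written_length);
    }  // namespace xe::dma

    void CopyReadbackToGuest(uint8_t* guest_physical, uint8_t* mapped_readback,
                             uint32_t written_length) {
      // Streaming copy: each source line is read once and each destination
      // line is written non-temporally, so the copy doesn't evict the rest
      // of the cache on its way through.
      xe::dma::vastcpy(guest_physical, mapped_readback, written_length);
    }

The disabled #else branch sketches the next step, pushing the copy onto a DMA worker thread so IssueCopy never blocks on it at all.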
@@ -3885,9 +3948,10 @@ bool D3D12CommandProcessor::UpdateBindings(
     if (bool_loop_constants == nullptr) {
       return false;
     }
-    std::memcpy(bool_loop_constants,
-                &regs[XE_GPU_REG_SHADER_CONSTANT_BOOL_000_031].u32,
-                kBoolLoopConstantsSize);
+    xe::smallcpy_const<kBoolLoopConstantsSize>(
+        bool_loop_constants,
+        &regs[XE_GPU_REG_SHADER_CONSTANT_BOOL_000_031].u32);
+
     cbuffer_binding_bool_loop_.up_to_date = true;
     current_graphics_root_up_to_date_ &=
         ~(1u << root_parameter_bool_loop_constants);
@@ -3901,9 +3965,9 @@ bool D3D12CommandProcessor::UpdateBindings(
     if (fetch_constants == nullptr) {
       return false;
     }
-    std::memcpy(fetch_constants,
-                &regs[XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0].u32,
-                kFetchConstantsSize);
+    xe::smallcpy_const<kFetchConstantsSize>(
+        fetch_constants, &regs[XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0].u32);
+
     cbuffer_binding_fetch_.up_to_date = true;
     current_graphics_root_up_to_date_ &=
         ~(1u << root_parameter_fetch_constants);
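xe::smallcpy_const itself is not shown in this diff; the name and the template size argument suggest a copy whose byte count is a compile-time constant, which lets the compiler lower it to straight-line vector moves instead of a call into a generic memcpy. A minimal sketch under that assumption:

    #include <cstddef>
    #include <cstring>

    namespace sketch {
    // With N known at compile time, the optimizer expands this memcpy into a
    // fixed sequence of loads/stores sized for N; no length dispatch at runtime.
    template <size_t N>
    inline void smallcpy_const(void* dst, const void* src) {
      std::memcpy(dst, src, N);
    }
    }  // namespace sketch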
@@ -4542,6 +4606,12 @@ ID3D12Resource* D3D12CommandProcessor::RequestReadbackBuffer(uint32_t size) {
   if (size == 0) {
     return nullptr;
   }
+#if 0
+  if (readback_available_) {
+    GetDMAC()->WaitJobDone(readback_available_);
+    readback_available_ = 0;
+  }
+#endif
   size = xe::align(size, kReadbackBufferSizeIncrement);
   if (size > readback_buffer_size_) {
     const ui::d3d12::D3D12Provider& provider = GetD3D12Provider();

@@ -4561,6 +4631,7 @@ ID3D12Resource* D3D12CommandProcessor::RequestReadbackBuffer(uint32_t size) {
       readback_buffer_->Release();
     }
     readback_buffer_ = buffer;
+    readback_buffer_size_ = size;
   }
   return readback_buffer_;
 }
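The one-line `readback_buffer_size_ = size;` addition is presumably the "old bug" fix named in the commit title: without it the cached size stayed 0, the `size > readback_buffer_size_` test always passed, and a committed resource was destroyed and recreated on every readback. The intended grow-only pattern, condensed into a self-contained sketch (malloc/free stand in for the D3D12 resource create/Release calls):

#include <cstdint>
#include <cstdlib>

struct GrowOnlyReadback {
  static constexpr uint32_t kIncrement = 16 * 1024 * 1024;
  void* buffer = nullptr;
  uint32_t cached_size = 0;

  void* Request(uint32_t size) {
    if (!size) return nullptr;
    // xe::align equivalent: round up to the 16 MiB increment.
    size = (size + kIncrement - 1) / kIncrement * kIncrement;
    if (size > cached_size) {
      std::free(buffer);  // stands in for ID3D12Resource Release/create
      buffer = std::malloc(size);
      cached_size = size;  // the line the old code was missing
    }
    return buffer;
  }
};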
@@ -1,3 +1,4 @@
 /**
  ******************************************************************************
  * Xenia : Xbox 360 Emulator Research Project                                 *

@@ -19,6 +20,7 @@
 #include <utility>
 
 #include "xenia/base/assert.h"
+#include "xenia/base/dma.h"
 #include "xenia/gpu/command_processor.h"
 #include "xenia/gpu/d3d12/d3d12_graphics_system.h"
 #include "xenia/gpu/d3d12/d3d12_primitive_processor.h"

@@ -581,6 +583,7 @@ class D3D12CommandProcessor final : public CommandProcessor {
 
   static constexpr uint32_t kReadbackBufferSizeIncrement = 16 * 1024 * 1024;
   ID3D12Resource* readback_buffer_ = nullptr;
+  dma::DMACJobHandle readback_available_ = 0;
   uint32_t readback_buffer_size_ = 0;
 
   std::atomic<bool> pix_capture_requested_ = false;

@@ -614,9 +617,11 @@ class D3D12CommandProcessor final : public CommandProcessor {
   DxbcShaderTranslator::SystemConstants system_constants_;
 
   // Float constant usage masks of the last draw call.
-  uint64_t current_float_constant_map_vertex_[4];
-  uint64_t current_float_constant_map_pixel_[4];
+  // chrispy: make sure accesses to these cant cross cacheline boundaries
+  struct alignas(XE_HOST_CACHE_LINE_SIZE) {
+    uint64_t current_float_constant_map_vertex_[4];
+    uint64_t current_float_constant_map_pixel_[4];
+  };
   // Constant buffer bindings.
   struct ConstantBufferBinding {
     D3D12_GPU_VIRTUAL_ADDRESS address;

@@ -670,6 +675,9 @@ class D3D12CommandProcessor final : public CommandProcessor {
 
   // Current primitive topology.
   D3D_PRIMITIVE_TOPOLOGY primitive_topology_;
+
+  draw_util::GetViewportInfoArgs previous_viewport_info_args_;
+  draw_util::ViewportInfo previous_viewport_info_;
 };
 
 }  // namespace d3d12
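The alignas wrapper around the two constant-usage masks is a size trick as much as an alignment one: each array is 4 x 8 = 32 bytes, so the pair fills exactly one 64-byte x86 cache line, and aligning the pair guarantees neither array straddles two lines. A standalone illustration (the header uses the anonymous-struct extension shown in the diff; a named member is used here to stay within standard C++):

#include <cstdint>

struct Holder {
  struct alignas(64) Masks {  // 64 == XE_HOST_CACHE_LINE_SIZE on x86
    uint64_t vertex_masks[4];  // bytes 0..31 of the line
    uint64_t pixel_masks[4];   // bytes 32..63 of the line
  } masks;
};
static_assert(sizeof(Holder::Masks) == 64,
              "the pair occupies exactly one cache line");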
@@ -406,14 +406,15 @@ bool D3D12SharedMemory::AllocateSparseHostGpuMemoryRange(
 }
 
 bool D3D12SharedMemory::UploadRanges(
-    const std::pair<uint32_t, uint32_t>* upload_page_ranges, unsigned num_upload_page_ranges) {
+    const std::pair<uint32_t, uint32_t>* upload_page_ranges,
+    unsigned num_upload_page_ranges) {
   if (!num_upload_page_ranges) {
     return true;
   }
   CommitUAVWritesAndTransitionBuffer(D3D12_RESOURCE_STATE_COPY_DEST);
   command_processor_.SubmitBarriers();
   auto& command_list = command_processor_.GetDeferredCommandList();
-  //for (auto upload_range : upload_page_ranges) {
+  // for (auto upload_range : upload_page_ranges) {
   for (unsigned int i = 0; i < num_upload_page_ranges; ++i) {
     auto& upload_range = upload_page_ranges[i];
     uint32_t upload_range_start = upload_range.first;

@@ -434,10 +435,20 @@ bool D3D12SharedMemory::UploadRanges(
     }
     MakeRangeValid(upload_range_start << page_size_log2(),
                    uint32_t(upload_buffer_size), false, false);
-    std::memcpy(
-        upload_buffer_mapping,
-        memory().TranslatePhysical(upload_range_start << page_size_log2()),
-        upload_buffer_size);
+    if (upload_buffer_size < (1ULL << 32) && upload_buffer_size > 8192) {
+      dma::vastcpy(
+          upload_buffer_mapping,
+          memory().TranslatePhysical(upload_range_start << page_size_log2()),
+          static_cast<uint32_t>(upload_buffer_size));
+      swcache::WriteFence();
+
+    } else {
+      memcpy(
+          upload_buffer_mapping,
+          memory().TranslatePhysical(upload_range_start << page_size_log2()),
+          upload_buffer_size);
+    }
     command_list.D3DCopyBufferRegion(
         buffer_, upload_range_start << page_size_log2(), upload_buffer,
         UINT64(upload_buffer_offset), UINT64(upload_buffer_size));
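The vastcpy path is gated to copies above 8 KiB (small copies don't amortize the streaming-store setup, and the size must fit the 32-bit length parameter) and ends with swcache::WriteFence(). The fence matters because non-temporal stores are weakly ordered: without it, a later "data is ready" store could become visible to another agent before the payload itself. On x86 the fence is presumably just SFENCE:

#include <immintrin.h>

// Assumed reduction of swcache::WriteFence() on x86: SFENCE orders all prior
// (including non-temporal) stores before any store that follows it.
inline void WriteFenceSketch() { _mm_sfence(); }

// Publish pattern the fence enables:
//   nt_copy(dst, src, size);                          // weakly ordered stores
//   WriteFenceSketch();                               // drain them
//   ready_flag.store(true, std::memory_order_release); // now safe to signal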
@@ -167,17 +167,17 @@ bool IsPixelShaderNeededWithRasterization(const Shader& shader,
   return false;
 }
 
-void GetHostViewportInfo(const RegisterFile& regs,
-                         uint32_t draw_resolution_scale_x,
-                         uint32_t draw_resolution_scale_y,
-                         bool origin_bottom_left, uint32_t x_max,
-                         uint32_t y_max, bool allow_reverse_z,
-                         reg::RB_DEPTHCONTROL normalized_depth_control,
-                         bool convert_z_to_float24, bool full_float24_in_0_to_1,
-                         bool pixel_shader_writes_depth,
+static float ViewportRecip2_0(float f) {
+  float f1 = ArchReciprocalRefined(f);
+  return f1 + f1;
+}
+
+// chrispy: todo, the int/float divides and the nan-checked mins show up
+// relatively high on uprof when i uc to 1.7ghz
+void GetHostViewportInfo(GetViewportInfoArgs* XE_RESTRICT args,
                          ViewportInfo& viewport_info_out) {
-  assert_not_zero(draw_resolution_scale_x);
-  assert_not_zero(draw_resolution_scale_y);
+  assert_not_zero(args->draw_resolution_scale_x);
+  assert_not_zero(args->draw_resolution_scale_y);
 
   // A vertex position goes the following path:
   //

@@ -304,38 +304,32 @@ void GetHostViewportInfo(const RegisterFile& regs,
   // TODO(Triang3l): Investigate the need for clamping of oDepth to 0...1 for
   // D24FS8 as well.
 
-  auto pa_cl_clip_cntl = regs.Get<reg::PA_CL_CLIP_CNTL>();
-  auto pa_cl_vte_cntl = regs.Get<reg::PA_CL_VTE_CNTL>();
-  auto pa_su_sc_mode_cntl = regs.Get<reg::PA_SU_SC_MODE_CNTL>();
-  auto pa_su_vtx_cntl = regs.Get<reg::PA_SU_VTX_CNTL>();
+  auto pa_cl_clip_cntl = args->pa_cl_clip_cntl;
+  auto pa_cl_vte_cntl = args->pa_cl_vte_cntl;
+  auto pa_su_sc_mode_cntl = args->pa_su_sc_mode_cntl;
+  auto pa_su_vtx_cntl = args->pa_su_vtx_cntl;
 
   // Obtain the original viewport values in a normalized way.
   float scale_xy[] = {
-      pa_cl_vte_cntl.vport_x_scale_ena ? regs[XE_GPU_REG_PA_CL_VPORT_XSCALE].f32
-                                       : 1.0f,
-      pa_cl_vte_cntl.vport_y_scale_ena ? regs[XE_GPU_REG_PA_CL_VPORT_YSCALE].f32
-                                       : 1.0f,
+      pa_cl_vte_cntl.vport_x_scale_ena ? args->PA_CL_VPORT_XSCALE : 1.0f,
+      pa_cl_vte_cntl.vport_y_scale_ena ? args->PA_CL_VPORT_YSCALE : 1.0f,
   };
-  float scale_z = pa_cl_vte_cntl.vport_z_scale_ena
-                      ? regs[XE_GPU_REG_PA_CL_VPORT_ZSCALE].f32
-                      : 1.0f;
+  float scale_z =
+      pa_cl_vte_cntl.vport_z_scale_ena ? args->PA_CL_VPORT_ZSCALE : 1.0f;
   float offset_base_xy[] = {
-      pa_cl_vte_cntl.vport_x_offset_ena
-          ? regs[XE_GPU_REG_PA_CL_VPORT_XOFFSET].f32
-          : 0.0f,
-      pa_cl_vte_cntl.vport_y_offset_ena
-          ? regs[XE_GPU_REG_PA_CL_VPORT_YOFFSET].f32
-          : 0.0f,
+      pa_cl_vte_cntl.vport_x_offset_ena ? args->PA_CL_VPORT_XOFFSET : 0.0f,
+      pa_cl_vte_cntl.vport_y_offset_ena ? args->PA_CL_VPORT_YOFFSET : 0.0f,
   };
-  float offset_z = pa_cl_vte_cntl.vport_z_offset_ena
-                       ? regs[XE_GPU_REG_PA_CL_VPORT_ZOFFSET].f32
-                       : 0.0f;
+  float offset_z =
+      pa_cl_vte_cntl.vport_z_offset_ena ? args->PA_CL_VPORT_ZOFFSET : 0.0f;
   // Calculate all the integer.0 or integer.5 offsetting exactly at full
   // precision, separately so it can be used in other integer calculations
   // without double rounding if needed.
   float offset_add_xy[2] = {};
   if (pa_su_sc_mode_cntl.vtx_window_offset_enable) {
-    auto pa_sc_window_offset = regs.Get<reg::PA_SC_WINDOW_OFFSET>();
+    auto pa_sc_window_offset = args->pa_sc_window_offset;
     offset_add_xy[0] += float(pa_sc_window_offset.window_x_offset);
     offset_add_xy[1] += float(pa_sc_window_offset.window_y_offset);
   }

@@ -346,8 +340,11 @@ void GetHostViewportInfo(const RegisterFile& regs,
 
   // The maximum value is at least the maximum host render target size anyway -
   // and a guest pixel is always treated as a whole with resolution scaling.
-  uint32_t xy_max_unscaled[] = {x_max / draw_resolution_scale_x,
-                                y_max / draw_resolution_scale_y};
+  // cbrispy: todo, this integer divides show up high on the profiler somehow
+  // (it was a very long session, too)
+  uint32_t xy_max_unscaled[] = {
+      args->draw_resolution_scale_x_divisor.Apply(args->x_max),
+      args->draw_resolution_scale_y_divisor.Apply(args->y_max)};
   assert_not_zero(xy_max_unscaled[0]);
   assert_not_zero(xy_max_unscaled[1]);
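divisors::MagicDiv, used above to replace the per-draw integer divides, is not defined in this diff. The underlying idea is standard: when the divisor (here the resolution scale) is fixed at setup time, division can be precompiled into a multiply and a shift. A hedged, self-contained sketch of the round-up variant, adequate for small divisors such as the 1..3 scale factors used here (the real class may use a different scheme):

#include <cassert>
#include <cstdint>

class MagicDivSketch {
 public:
  explicit MagicDivSketch(uint32_t d) : mult_(0), shift_(0), identity_(d == 1) {
    if (identity_) return;
    for (shift_ = 0; shift_ < 32; ++shift_) {
      // m = ceil(2^(32+s) / d); valid when m fits 32 bits and the rounding
      // error e = m*d - 2^(32+s) satisfies e <= 2^s (round-up magic theorem).
      uint64_t m = ((uint64_t(1) << (32 + shift_)) + d - 1) / d;
      uint64_t e = m * d - (uint64_t(1) << (32 + shift_));
      if ((m >> 32) == 0 && e <= (uint64_t(1) << shift_)) {
        mult_ = uint32_t(m);
        return;
      }
    }
    assert(false && "no 32-bit round-up magic number for this divisor");
  }
  uint32_t Apply(uint32_t x) const {  // returns x / d for any 32-bit x
    if (identity_) return x;
    return uint32_t((uint64_t(x) * mult_) >> 32 >> shift_);
  }

 private:
  uint32_t mult_;
  uint32_t shift_;
  bool identity_;
};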
@@ -367,9 +364,11 @@ void GetHostViewportInfo(const RegisterFile& regs,
         std::min(xenos::kTexture2DCubeMaxWidthHeight, xy_max_unscaled[i]);
     viewport_info_out.xy_extent[i] =
         extent_axis_unscaled *
-        (i ? draw_resolution_scale_y : draw_resolution_scale_x);
+        (i ? args->draw_resolution_scale_y : args->draw_resolution_scale_x);
     float extent_axis_unscaled_float = float(extent_axis_unscaled);
-    float pixels_to_ndc_axis = 2.0f / extent_axis_unscaled_float;
+
+    float pixels_to_ndc_axis = ViewportRecip2_0(extent_axis_unscaled_float);
+
     ndc_scale[i] = scale_xy[i] * pixels_to_ndc_axis;
     ndc_offset[i] = (offset_base_xy[i] - extent_axis_unscaled_float * 0.5f +
                      offset_add_xy[i]) *

@@ -394,7 +393,7 @@ void GetHostViewportInfo(const RegisterFile& regs,
       // doing truncation for simplicity - since maxing with 0 is done anyway
       // (we only return viewports in the positive quarter-plane).
       uint32_t axis_resolution_scale =
-          i ? draw_resolution_scale_y : draw_resolution_scale_x;
+          i ? args->draw_resolution_scale_y : args->draw_resolution_scale_x;
       float offset_axis = offset_base_xy[i] + offset_add_xy[i];
       float scale_axis = scale_xy[i];
       float scale_axis_abs = std::abs(scale_xy[i]);

@@ -422,11 +421,14 @@ void GetHostViewportInfo(const RegisterFile& regs,
         // space, a region previously outside -W...W should end up within it, so
         // the scale should be < 1.
         float axis_extent_rounded = float(axis_extent_int);
-        ndc_scale_axis = scale_axis * 2.0f / axis_extent_rounded;
+        float inv_axis_extent_rounded =
+            ArchReciprocalRefined(axis_extent_rounded);
+
+        ndc_scale_axis = scale_axis * 2.0f * inv_axis_extent_rounded;
         // Move the origin of the snapped coordinates back to the original one.
         ndc_offset_axis = (float(offset_axis) -
                            (float(axis_0_int) + axis_extent_rounded * 0.5f)) *
-                          2.0f / axis_extent_rounded;
+                          2.0f * inv_axis_extent_rounded;
       } else {
         // Empty viewport (everything outside the viewport scissor).
         ndc_scale_axis = 1.0f;

@@ -497,7 +499,7 @@ void GetHostViewportInfo(const RegisterFile& regs,
       ndc_scale[2] = 0.5f;
       ndc_offset[2] = 0.5f;
     }
-    if (pixel_shader_writes_depth) {
+    if (args->pixel_shader_writes_depth) {
       // Allow the pixel shader to write any depth value since
       // PA_SC_VPORT_ZMIN/ZMAX isn't present on the Adreno 200; guest pixel
       // shaders don't have access to the original Z in the viewport space

@@ -515,7 +517,7 @@ void GetHostViewportInfo(const RegisterFile& regs,
       // Direct3D 12 doesn't allow reverse depth range - on some drivers it
       // works, on some drivers it doesn't, actually, but it was never
       // explicitly allowed by the specification.
-      if (!allow_reverse_z && z_min > z_max) {
+      if (!args->allow_reverse_z && z_min > z_max) {
        std::swap(z_min, z_max);
        ndc_scale[2] = -ndc_scale[2];
        ndc_offset[2] = 1.0f - ndc_offset[2];

@@ -523,10 +525,9 @@ void GetHostViewportInfo(const RegisterFile& regs,
      }
    }
 
-    if (normalized_depth_control.z_enable &&
-        regs.Get<reg::RB_DEPTH_INFO>().depth_format ==
-            xenos::DepthRenderTargetFormat::kD24FS8) {
-      if (convert_z_to_float24) {
+    if (args->normalized_depth_control.z_enable &&
+        args->depth_format == xenos::DepthRenderTargetFormat::kD24FS8) {
+      if (args->convert_z_to_float24) {
        // Need to adjust the bounds that the resulting depth values will be
        // clamped to after the pixel shader. Preferring adding some error to
        // interpolated Z instead if conversion can't be done exactly, without

@@ -537,7 +538,7 @@ void GetHostViewportInfo(const RegisterFile& regs,
        z_min = xenos::Float20e4To32(xenos::Float32To20e4(z_min, true));
        z_max = xenos::Float20e4To32(xenos::Float32To20e4(z_max, true));
      }
-      if (full_float24_in_0_to_1) {
+      if (args->full_float24_in_0_to_1) {
        // Remap the full [0...2) float24 range to [0...1) support data round-trip
        // during render target ownership transfer of EDRAM tiles through depth
        // input without unrestricted depth range.

@@ -548,7 +549,7 @@ void GetHostViewportInfo(const RegisterFile& regs,
    viewport_info_out.z_min = z_min;
    viewport_info_out.z_max = z_max;
 
-    if (origin_bottom_left) {
+    if (args->origin_bottom_left) {
      ndc_scale[1] = -ndc_scale[1];
      ndc_offset[1] = -ndc_offset[1];
    }

@@ -557,7 +558,6 @@ void GetHostViewportInfo(const RegisterFile& regs,
     viewport_info_out.ndc_offset[i] = ndc_offset[i];
   }
 }
 
 void GetScissor(const RegisterFile& regs, Scissor& scissor_out,
                 bool clamp_to_surface_pitch) {
   auto pa_sc_window_scissor_tl = regs.Get<reg::PA_SC_WINDOW_SCISSOR_TL>();

@@ -868,7 +868,7 @@ bool GetResolveInfo(const RegisterFile& regs, const Memory& memory,
                     xenos::kMaxResolveSize);
     y1 = y0 + int32_t(xenos::kMaxResolveSize);
   }
-  //fails in forza horizon 1
+  // fails in forza horizon 1
   assert_true(x0 < x1 && y0 < y1);
   if (x0 >= x1 || y0 >= y1) {
     XELOGE("Resolve region is empty");
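ArchReciprocalRefined, which ViewportRecip2_0 and the ndc_scale_axis computation lean on, is likewise not shown in this diff. The conventional implementation behind such a name is the RCPSS hardware estimate (roughly 12 bits of precision) refined with one Newton-Raphson step, trading about 1 ulp of accuracy against the full latency of DIVSS; a sketch under that assumption:

#include <immintrin.h>

// Hypothetical sketch, assuming a Newton-Raphson-refined RCPSS:
static float ArchReciprocalRefined_sketch(float x) {
  __m128 vx = _mm_set_ss(x);
  __m128 est = _mm_rcp_ss(vx);  // ~12-bit reciprocal estimate
  // One Newton-Raphson step: est' = est * (2 - x * est), roughly doubling
  // the number of correct bits.
  __m128 two = _mm_set_ss(2.0f);
  __m128 refined = _mm_mul_ss(est, _mm_sub_ss(two, _mm_mul_ss(vx, est)));
  return _mm_cvtss_f32(refined);
}
// ViewportRecip2_0(f) then returns the refined 1/f added to itself, i.e.
// 2/f without a divide, matching the replaced `2.0f / x` expressions.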
@@ -277,18 +277,151 @@ struct ViewportInfo {
   float ndc_scale[3];
   float ndc_offset[3];
 };
+static_assert(sizeof(xenos::DepthRenderTargetFormat) == sizeof(uint32_t),
+              "Change in depthrendertargetformat throws off "
+              "getviewportinfoargs by a bit");
+struct GetViewportInfoArgs {
+  union alignas(64) {
+    struct {
+      // group 1
+      uint32_t x_max;
+      uint32_t y_max;
+      union {
+        struct {
+          uint32_t origin_bottom_left : 1;
+          uint32_t allow_reverse_z : 1;
+          uint32_t convert_z_to_float24 : 1;
+          uint32_t full_float24_in_0_to_1 : 1;
+          uint32_t pixel_shader_writes_depth : 1;
+          xenos::DepthRenderTargetFormat depth_format : 1;
+        };
+        uint32_t packed_portions;
+      };
+      reg::RB_DEPTHCONTROL normalized_depth_control;
+      // group 2
+      reg::PA_CL_CLIP_CNTL pa_cl_clip_cntl;
+      reg::PA_CL_VTE_CNTL pa_cl_vte_cntl;
+      reg::PA_SU_SC_MODE_CNTL pa_su_sc_mode_cntl;
+      reg::PA_SU_VTX_CNTL pa_su_vtx_cntl;
+      // group 3
+      reg::PA_SC_WINDOW_OFFSET pa_sc_window_offset;
+      float PA_CL_VPORT_XSCALE;
+      float PA_CL_VPORT_YSCALE;
+      float PA_CL_VPORT_ZSCALE;
+
+      float PA_CL_VPORT_XOFFSET;
+      float PA_CL_VPORT_YOFFSET;
+      float PA_CL_VPORT_ZOFFSET;
+      uint32_t padding_set_to_0;
+    };
+#if XE_ARCH_AMD64 == 1
+    struct {
+      __m128i first4;   // x_max, y_max, packed_portions,
+                        // normalized_depth_control
+      __m128i second4;  // pa_cl_clip_cntl, pa_cl_vte_cntl, pa_su_sc_mode_cntl,
+                        // pa_su_vtx_cntl
+      __m128i third4;   // pa_sc_window_offset, PA_CL_VPORT_XSCALE,
+                        // PA_CL_VPORT_YSCALE, PA_CL_VPORT_ZSCALE
+      __m128i last4;    // PA_CL_VPORT_XOFFSET, PA_CL_VPORT_YOFFSET,
+                        // PA_CL_VPORT_ZOFFSET, padding_set_to_0
+    };
+#endif
+  };
+
+  // everything that follows here does not need to be compared
+  uint32_t draw_resolution_scale_x;
+  uint32_t draw_resolution_scale_y;
+  divisors::MagicDiv draw_resolution_scale_x_divisor;
+  divisors::MagicDiv draw_resolution_scale_y_divisor;
+  void Setup(uint32_t _draw_resolution_scale_x,
+             uint32_t _draw_resolution_scale_y,
+             divisors::MagicDiv _draw_resolution_scale_x_divisor,
+             divisors::MagicDiv _draw_resolution_scale_y_divisor,
+             bool _origin_bottom_left, uint32_t _x_max, uint32_t _y_max,
+             bool _allow_reverse_z,
+             reg::RB_DEPTHCONTROL _normalized_depth_control,
+             bool _convert_z_to_float24, bool _full_float24_in_0_to_1,
+             bool _pixel_shader_writes_depth) {
+    packed_portions = 0;
+    padding_set_to_0 = 0;  // important to zero this
+    draw_resolution_scale_x = _draw_resolution_scale_x;
+    draw_resolution_scale_y = _draw_resolution_scale_y;
+    draw_resolution_scale_x_divisor = _draw_resolution_scale_x_divisor;
+    draw_resolution_scale_y_divisor = _draw_resolution_scale_y_divisor;
+    origin_bottom_left = _origin_bottom_left;
+    x_max = _x_max;
+    y_max = _y_max;
+    allow_reverse_z = _allow_reverse_z;
+    normalized_depth_control = _normalized_depth_control;
+    convert_z_to_float24 = _convert_z_to_float24;
+    full_float24_in_0_to_1 = _full_float24_in_0_to_1;
+    pixel_shader_writes_depth = _pixel_shader_writes_depth;
+  }
+
+  void SetupRegisterValues(const RegisterFile& regs) {
+    pa_cl_clip_cntl = regs.Get<reg::PA_CL_CLIP_CNTL>();
+    pa_cl_vte_cntl = regs.Get<reg::PA_CL_VTE_CNTL>();
+    pa_su_sc_mode_cntl = regs.Get<reg::PA_SU_SC_MODE_CNTL>();
+    pa_su_vtx_cntl = regs.Get<reg::PA_SU_VTX_CNTL>();
+    PA_CL_VPORT_XSCALE = regs[XE_GPU_REG_PA_CL_VPORT_XSCALE].f32;
+    PA_CL_VPORT_YSCALE = regs[XE_GPU_REG_PA_CL_VPORT_YSCALE].f32;
+    PA_CL_VPORT_ZSCALE = regs[XE_GPU_REG_PA_CL_VPORT_ZSCALE].f32;
+    PA_CL_VPORT_XOFFSET = regs[XE_GPU_REG_PA_CL_VPORT_XOFFSET].f32;
+    PA_CL_VPORT_YOFFSET = regs[XE_GPU_REG_PA_CL_VPORT_YOFFSET].f32;
+    PA_CL_VPORT_ZOFFSET = regs[XE_GPU_REG_PA_CL_VPORT_ZOFFSET].f32;
+    pa_sc_window_offset = regs.Get<reg::PA_SC_WINDOW_OFFSET>();
+    depth_format = regs.Get<reg::RB_DEPTH_INFO>().depth_format;
+  }
+  XE_FORCEINLINE
+  bool operator==(const GetViewportInfoArgs& prev) {
+#if XE_ARCH_AMD64 == 0
+    bool result = true;
+
+    auto accum_eq = [&result](auto x, auto y) { result &= (x == y); };
+
+#define EQC(field) accum_eq(field, prev.field)
+    EQC(x_max);
+    EQC(y_max);
+    EQC(packed_portions);
+    EQC(normalized_depth_control.value);
+    EQC(pa_cl_clip_cntl.value);
+    EQC(pa_cl_vte_cntl.value);
+    EQC(pa_su_sc_mode_cntl.value);
+    EQC(pa_su_vtx_cntl.value);
+    EQC(PA_CL_VPORT_XSCALE);
+    EQC(PA_CL_VPORT_YSCALE);
+    EQC(PA_CL_VPORT_ZSCALE);
+    EQC(PA_CL_VPORT_XOFFSET);
+    EQC(PA_CL_VPORT_YOFFSET);
+    EQC(PA_CL_VPORT_ZOFFSET);
+    EQC(pa_sc_window_offset.value);
+#undef EQC
+    return result;
+#else
+    __m128i mask1 = _mm_cmpeq_epi32(first4, prev.first4);
+    __m128i mask2 = _mm_cmpeq_epi32(second4, prev.second4);
+    __m128i mask3 = _mm_cmpeq_epi32(third4, prev.third4);
+    __m128i unified1 = _mm_and_si128(mask1, mask2);
+    __m128i mask4 = _mm_cmpeq_epi32(last4, prev.last4);
+    __m128i unified2 = _mm_and_si128(unified1, mask3);
+    __m128i unified3 = _mm_and_si128(unified2, mask4);
+    return _mm_movemask_epi8(unified3) == 0xFFFF;
+#endif
+  }
+};
+
 // Converts the guest viewport (or fakes one if drawing without a viewport) to
 // a viewport, plus values to multiply-add the returned position by, usable on
 // host graphics APIs such as Direct3D 11+ and Vulkan, also forcing it to the
 // Direct3D clip space with 0...W Z rather than -W...W.
-void GetHostViewportInfo(const RegisterFile& regs,
-                         uint32_t draw_resolution_scale_x,
-                         uint32_t draw_resolution_scale_y,
-                         bool origin_bottom_left, uint32_t x_max,
-                         uint32_t y_max, bool allow_reverse_z,
-                         reg::RB_DEPTHCONTROL normalized_depth_control,
-                         bool convert_z_to_float24, bool full_float24_in_0_to_1,
-                         bool pixel_shader_writes_depth,
+void GetHostViewportInfo(GetViewportInfoArgs* XE_RESTRICT args,
                          ViewportInfo& viewport_info_out);
 
 struct Scissor {
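The layout rules above exist to make operator== cheap: the compared state is packed into exactly 64 bytes, padding_set_to_0 is explicitly zeroed (padding takes part in the lane compares, so it must be deterministic), and the AMD64 path reduces the whole comparison to four PCMPEQD, three PAND and one PMOVMSKB. The same trick in a self-contained form:

#include <emmintrin.h>

// Branch-free equality over a 64-byte, 16-byte-aligned POD: compare four
// 16-byte lanes, AND the masks, test one movemask at the end.
struct Args64 {
  alignas(64) unsigned char bytes[64];
  bool operator==(const Args64& o) const {
    __m128i m = _mm_and_si128(
        _mm_and_si128(
            _mm_cmpeq_epi32(_mm_load_si128((const __m128i*)(bytes + 0)),
                            _mm_load_si128((const __m128i*)(o.bytes + 0))),
            _mm_cmpeq_epi32(_mm_load_si128((const __m128i*)(bytes + 16)),
                            _mm_load_si128((const __m128i*)(o.bytes + 16)))),
        _mm_and_si128(
            _mm_cmpeq_epi32(_mm_load_si128((const __m128i*)(bytes + 32)),
                            _mm_load_si128((const __m128i*)(o.bytes + 32))),
            _mm_cmpeq_epi32(_mm_load_si128((const __m128i*)(bytes + 48)),
                            _mm_load_si128((const __m128i*)(o.bytes + 48)))));
    return _mm_movemask_epi8(m) == 0xFFFF;  // all 16 bytes of mask set
  }
};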
@@ -813,20 +813,22 @@ bool RenderTargetCache::Update(bool is_rasterization_done,
     }
     // Make sure the same render target isn't bound into two different slots
     // over time.
-    for (uint32_t i = 1; are_accumulated_render_targets_valid_ &&
-                         i < 1 + xenos::kMaxColorRenderTargets;
-         ++i) {
-      const RenderTarget* render_target =
-          last_update_accumulated_render_targets_[i];
-      if (!render_target) {
-        continue;
-      }
-      for (uint32_t j = 0; j < i; ++j) {
-        if (last_update_accumulated_render_targets_[j] == render_target) {
-          are_accumulated_render_targets_valid_ = false;
-          break;
+    // chrispy: this needs optimization!
+    if (are_accumulated_render_targets_valid_) {
+      for (uint32_t i = 1; i < 1 + xenos::kMaxColorRenderTargets; ++i) {
+        const RenderTarget* render_target =
+            last_update_accumulated_render_targets_[i];
+        if (!render_target) {
+          continue;
+        }
+        for (uint32_t j = 0; j < i; ++j) {
+          if (last_update_accumulated_render_targets_[j] == render_target) {
+            are_accumulated_render_targets_valid_ = false;
+            goto exit_slot_check_loop;
+          }
         }
       }
+    exit_slot_check_loop:;
     }
   }
   if (!are_accumulated_render_targets_valid_) {
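The restructuring hoists the validity flag out of the outer loop condition and replaces the single-level break with a goto that leaves both nested loops as soon as one duplicate binding is found. The same control shape in miniature (a plain return suffices here because nothing follows the loops):

static bool AnyDuplicateSlot(const void* const* slots, int count) {
  for (int i = 1; i < count; ++i) {
    if (!slots[i]) continue;  // empty slots can't collide
    for (int j = 0; j < i; ++j) {
      // The member function uses goto instead of return because it also has
      // to clear a flag before falling through.
      if (slots[j] == slots[i]) return true;
    }
  }
  return false;
}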
@@ -154,7 +154,9 @@ TextureCache::TextureCache(const RegisterFile& register_file,
     : register_file_(register_file),
       shared_memory_(shared_memory),
       draw_resolution_scale_x_(draw_resolution_scale_x),
-      draw_resolution_scale_y_(draw_resolution_scale_y) {
+      draw_resolution_scale_y_(draw_resolution_scale_y),
+      draw_resolution_scale_x_divisor_(draw_resolution_scale_x),
+      draw_resolution_scale_y_divisor_(draw_resolution_scale_y) {
   assert_true(draw_resolution_scale_x >= 1);
   assert_true(draw_resolution_scale_x <= kMaxDrawResolutionScaleAlongAxis);
   assert_true(draw_resolution_scale_y >= 1);

@@ -187,6 +189,7 @@ bool TextureCache::GetConfigDrawResolutionScale(uint32_t& x_out,
       uint32_t(std::max(INT32_C(1), cvars::draw_resolution_scale_x));
   uint32_t config_y =
       uint32_t(std::max(INT32_C(1), cvars::draw_resolution_scale_y));
+
   uint32_t clamped_x = std::min(kMaxDrawResolutionScaleAlongAxis, config_x);
   uint32_t clamped_y = std::min(kMaxDrawResolutionScaleAlongAxis, config_y);
   x_out = clamped_x;

@@ -552,8 +555,7 @@ void TextureCache::Texture::MarkAsUsed() {
 }
 
 void TextureCache::Texture::WatchCallback(
-    [[maybe_unused]] const global_unique_lock_type& global_lock,
-    bool is_mip) {
+    [[maybe_unused]] const global_unique_lock_type& global_lock, bool is_mip) {
   if (is_mip) {
     assert_not_zero(GetGuestMipsSize());
     mips_outdated_ = true;

@@ -566,8 +568,8 @@ void TextureCache::Texture::WatchCallback(
 }
 
 void TextureCache::WatchCallback(const global_unique_lock_type& global_lock,
-                                 void* context,
-                                 void* data, uint64_t argument, bool invalidated_by_gpu) {
+                                 void* context, void* data, uint64_t argument,
+                                 bool invalidated_by_gpu) {
   Texture& texture = *static_cast<Texture*>(context);
   texture.WatchCallback(global_lock, argument != 0);
   texture.texture_cache().texture_became_outdated_.store(

@@ -910,8 +912,8 @@ void TextureCache::ScaledResolveGlobalWatchCallbackThunk(
 }
 
 void TextureCache::ScaledResolveGlobalWatchCallback(
-    const global_unique_lock_type& global_lock,
-    uint32_t address_first, uint32_t address_last, bool invalidated_by_gpu) {
+    const global_unique_lock_type& global_lock, uint32_t address_first,
+    uint32_t address_last, bool invalidated_by_gpu) {
   assert_true(IsDrawResolutionScaled());
   if (invalidated_by_gpu) {
     // Resolves themselves do exactly the opposite of what this should do.
@@ -19,6 +19,7 @@
 
 #include "xenia/base/assert.h"
 #include "xenia/base/hash.h"
+#include "xenia/base/math.h"
 #include "xenia/base/mutex.h"
 #include "xenia/gpu/register_file.h"
 #include "xenia/gpu/shared_memory.h"

@@ -70,6 +71,14 @@ class TextureCache {
   static bool GetConfigDrawResolutionScale(uint32_t& x_out, uint32_t& y_out);
   uint32_t draw_resolution_scale_x() const { return draw_resolution_scale_x_; }
   uint32_t draw_resolution_scale_y() const { return draw_resolution_scale_y_; }
+
+  divisors::MagicDiv draw_resolution_scale_x_divisor() const {
+    return draw_resolution_scale_x_divisor_;
+  }
+  divisors::MagicDiv draw_resolution_scale_y_divisor() const {
+    return draw_resolution_scale_y_divisor_;
+  }
+
   bool IsDrawResolutionScaled() const {
     return draw_resolution_scale_x_ > 1 || draw_resolution_scale_y_ > 1;
   }

@@ -576,8 +585,8 @@ class TextureCache {
 
   // Shared memory callback for texture data invalidation.
   static void WatchCallback(const global_unique_lock_type& global_lock,
-                            void* context,
-                            void* data, uint64_t argument, bool invalidated_by_gpu);
+                            void* context, void* data, uint64_t argument,
+                            bool invalidated_by_gpu);
 
   // Checks if there are any pages that contain scaled resolve data within the
   // range.

@@ -588,14 +597,15 @@ class TextureCache {
       const global_unique_lock_type& global_lock, void* context,
       uint32_t address_first, uint32_t address_last, bool invalidated_by_gpu);
   void ScaledResolveGlobalWatchCallback(
-      const global_unique_lock_type& global_lock,
-      uint32_t address_first, uint32_t address_last, bool invalidated_by_gpu);
+      const global_unique_lock_type& global_lock, uint32_t address_first,
+      uint32_t address_last, bool invalidated_by_gpu);
 
   const RegisterFile& register_file_;
   SharedMemory& shared_memory_;
   uint32_t draw_resolution_scale_x_;
   uint32_t draw_resolution_scale_y_;
+  divisors::MagicDiv draw_resolution_scale_x_divisor_;
+  divisors::MagicDiv draw_resolution_scale_y_divisor_;
   static const LoadShaderInfo load_shader_info_[kLoadShaderCount];
 
   xe::global_critical_region global_critical_region_;
@@ -2366,6 +2366,7 @@ bool VulkanCommandProcessor::IssueDraw(xenos::PrimitiveType prim_type,
 
   // Get dynamic rasterizer state.
   draw_util::ViewportInfo viewport_info;
+
   // Just handling maxViewportDimensions is enough - viewportBoundsRange[1] must
   // be at least 2 * max(maxViewportDimensions[0...1]) - 1, and
   // maxViewportDimensions must be greater than or equal to the size of the

@@ -2382,11 +2383,16 @@ bool VulkanCommandProcessor::IssueDraw(xenos::PrimitiveType prim_type,
   // life. Or even disregard the viewport bounds range in the fragment shader
   // interlocks case completely - apply the viewport and the scissor offset
   // directly to pixel address and to things like ps_param_gen.
-  draw_util::GetHostViewportInfo(
-      regs, 1, 1, false, device_limits.maxViewportDimensions[0],
-      device_limits.maxViewportDimensions[1], true, normalized_depth_control,
-      false, host_render_targets_used,
-      pixel_shader && pixel_shader->writes_depth(), viewport_info);
+  draw_util::GetViewportInfoArgs gviargs{};
+  gviargs.Setup(1, 1, divisors::MagicDiv{1}, divisors::MagicDiv{1}, false,
+                device_limits.maxViewportDimensions[0],
+                device_limits.maxViewportDimensions[1], true,
+                normalized_depth_control, false, host_render_targets_used,
+                pixel_shader && pixel_shader->writes_depth());
+  gviargs.SetupRegisterValues(regs);
+
+  draw_util::GetHostViewportInfo(&gviargs, viewport_info);
 
   // Update dynamic graphics pipeline state.
   UpdateDynamicState(viewport_info, primitive_polygonal,
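The Vulkan path simply fills the args struct and calls through, but together with the previous_viewport_info_args_ / previous_viewport_info_ members added to the D3D12 header above, the struct is clearly built for memoization: snapshot the inputs, bitwise-compare against the previous snapshot, and recompute the viewport only when something changed. The generic shape of that caching, with illustrative names not taken from the diff:

#include <cstdint>
#include <cstring>

struct Inputs {  // POD with no padding, so bytewise compare is valid
  uint32_t a, b, c, d;
  bool operator==(const Inputs& o) const {
    return std::memcmp(this, &o, sizeof *this) == 0;
  }
};

struct Cached {
  Inputs prev{};
  uint64_t value = 0;
  bool valid = false;
  uint64_t Get(const Inputs& in) {
    if (!valid || !(in == prev)) {
      // stand-in for the expensive recompute (GetHostViewportInfo)
      value = uint64_t(in.a) * in.b + uint64_t(in.c) * in.d;
      prev = in;
      valid = true;
    }
    return value;
  }
};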
@@ -326,7 +326,14 @@ constexpr bool IsColorRenderTargetFormat64bpp(ColorRenderTargetFormat format) {
          format == ColorRenderTargetFormat::k_32_32_FLOAT;
 }
 
-inline uint32_t GetColorRenderTargetFormatComponentCount(
+// if 0, 1
+// if 1, 2
+// if 3, 4
+// 2 bits per entry, shift and add 1
+
+using ColorFormatComponentTable = uint32_t;
+
+static constexpr uint32_t GetComponentCountConst(
     ColorRenderTargetFormat format) {
   switch (format) {
     case ColorRenderTargetFormat::k_8_8_8_8:

@@ -337,19 +344,51 @@ inline uint32_t GetColorRenderTargetFormatComponentCount(
     case ColorRenderTargetFormat::k_16_16_16_16_FLOAT:
     case ColorRenderTargetFormat::k_2_10_10_10_AS_10_10_10_10:
     case ColorRenderTargetFormat::k_2_10_10_10_FLOAT_AS_16_16_16_16:
-      return 4;
+      return 4 - 1;
     case ColorRenderTargetFormat::k_16_16:
     case ColorRenderTargetFormat::k_16_16_FLOAT:
     case ColorRenderTargetFormat::k_32_32_FLOAT:
-      return 2;
+      return 2 - 1;
     case ColorRenderTargetFormat::k_32_FLOAT:
-      return 1;
+      return 1 - 1;
     default:
-      assert_unhandled_case(format);
       return 0;
   }
 }
+namespace detail {
+static constexpr uint32_t encode_format_component_table() {
+  uint32_t result = 0;
+
+#define ADDFORMAT(name)                                           \
+  result |= GetComponentCountConst(ColorRenderTargetFormat::name) \
+            << (static_cast<uint32_t>(ColorRenderTargetFormat::name) * 2)
+  ADDFORMAT(k_8_8_8_8);
+  ADDFORMAT(k_8_8_8_8_GAMMA);
+  ADDFORMAT(k_2_10_10_10);
+  ADDFORMAT(k_2_10_10_10_FLOAT);
+
+  ADDFORMAT(k_16_16_16_16);
+  ADDFORMAT(k_16_16_16_16_FLOAT);
+  ADDFORMAT(k_2_10_10_10_AS_10_10_10_10);
+  ADDFORMAT(k_2_10_10_10_FLOAT_AS_16_16_16_16);
+
+  ADDFORMAT(k_16_16);
+  ADDFORMAT(k_16_16_FLOAT);
+  ADDFORMAT(k_32_32_FLOAT);
+  ADDFORMAT(k_32_FLOAT);
+  return result;
+}
+constexpr uint32_t color_format_component_table =
+    encode_format_component_table();
+
+}  // namespace detail
+constexpr uint32_t GetColorRenderTargetFormatComponentCount(
+    ColorRenderTargetFormat format) {
+  return ((detail::color_format_component_table >>
+           (static_cast<uint32_t>(format) * 2)) &
+          0b11) +
+         1;
+}
 // Returns the version of the format with the same packing and meaning of values
 // stored in it, but without blending precision modifiers.
 constexpr ColorRenderTargetFormat GetStorageColorFormat(
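The rewritten GetColorRenderTargetFormatComponentCount packs (count - 1) for every format into two bits of a single constexpr uint32_t, so each lookup is one shift, one AND and one add instead of a switch over the format. A self-contained miniature of the encode/decode pair, checked at compile time (illustrative enum, not the xenos one):

#include <cstdint>

enum class Fmt : uint32_t { A = 0, B = 1, C = 2, D = 3 };

constexpr uint32_t counts_minus_1[] = {3, 3, 1, 0};  // per-format count - 1

constexpr uint32_t encode_table() {
  uint32_t t = 0;
  for (uint32_t i = 0; i < 4; ++i) t |= counts_minus_1[i] << (i * 2);
  return t;  // folded into one 32-bit literal at compile time
}
constexpr uint32_t kTable = encode_table();

constexpr uint32_t component_count(Fmt f) {
  return ((kTable >> (static_cast<uint32_t>(f) * 2)) & 0b11) + 1;
}
static_assert(component_count(Fmt::A) == 4 && component_count(Fmt::C) == 2 &&
                  component_count(Fmt::D) == 1,
              "decode matches the encode");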