Merge pull request #66 from chrisps/canary_experimental

Huge boost to readback_memexport/resolve performance by fixing an old bug; miscellaneous optimizations
Radosław Gliński 2022-08-29 00:55:24 +02:00 committed by GitHub
commit c1d3e35eb9
33 changed files with 1593 additions and 513 deletions

src/xenia/base/dma.cc (new file, +415 lines)

@ -0,0 +1,415 @@
#include "dma.h"
template <size_t N, typename... Ts>
static void xedmaloghelper(const char (&fmt)[N], Ts... args) {
char buffer[1024];
sprintf_s(buffer, fmt, args...);
XELOGI("%s", buffer);
}
//#define XEDMALOG(...) XELOGI("XeDma: " __VA_ARGS__)
//#define XEDMALOG(...) xedmaloghelper("XeDma: " __VA_ARGS__)
#define XEDMALOG(...) static_cast<void>(0)
using xe::swcache::CacheLine;
static constexpr unsigned NUM_CACHELINES_IN_PAGE = 4096 / sizeof(CacheLine);
XE_FORCEINLINE
static void XeCopy16384Streaming(CacheLine* XE_RESTRICT to,
CacheLine* XE_RESTRICT from) {
uint32_t num_lines_for_8k = 4096 / XE_HOST_CACHE_LINE_SIZE;
CacheLine* dest1 = to;
CacheLine* src1 = from;
CacheLine* dest2 = to + NUM_CACHELINES_IN_PAGE;
CacheLine* src2 = from + NUM_CACHELINES_IN_PAGE;
CacheLine* dest3 = to + (NUM_CACHELINES_IN_PAGE * 2);
CacheLine* src3 = from + (NUM_CACHELINES_IN_PAGE * 2);
CacheLine* dest4 = to + (NUM_CACHELINES_IN_PAGE * 3);
CacheLine* src4 = from + (NUM_CACHELINES_IN_PAGE * 3);
#pragma loop(no_vector)
for (uint32_t i = 0; i < num_lines_for_8k; ++i) {
xe::swcache::CacheLine line0, line1, line2, line3;
xe::swcache::ReadLine(&line0, src1 + i);
xe::swcache::ReadLine(&line1, src2 + i);
xe::swcache::ReadLine(&line2, src3 + i);
xe::swcache::ReadLine(&line3, src4 + i);
XE_MSVC_REORDER_BARRIER();
xe::swcache::WriteLineNT(dest1 + i, &line0);
xe::swcache::WriteLineNT(dest2 + i, &line1);
xe::swcache::WriteLineNT(dest3 + i, &line2);
xe::swcache::WriteLineNT(dest4 + i, &line3);
}
XE_MSVC_REORDER_BARRIER();
}
namespace xe::dma {
XE_FORCEINLINE
static void vastcpy_impl(CacheLine* XE_RESTRICT physaddr,
CacheLine* XE_RESTRICT rdmapping,
uint32_t written_length) {
static constexpr unsigned NUM_LINES_FOR_16K = 16384 / XE_HOST_CACHE_LINE_SIZE;
while (written_length >= 16384) {
XeCopy16384Streaming(physaddr, rdmapping);
physaddr += NUM_LINES_FOR_16K;
rdmapping += NUM_LINES_FOR_16K;
written_length -= 16384;
}
if (!written_length) {
return;
}
uint32_t num_written_lines = written_length / XE_HOST_CACHE_LINE_SIZE;
uint32_t i = 0;
for (; i + 1 < num_written_lines; i += 2) {
xe::swcache::CacheLine line0, line1;
xe::swcache::ReadLine(&line0, rdmapping + i);
xe::swcache::ReadLine(&line1, rdmapping + i + 1);
XE_MSVC_REORDER_BARRIER();
xe::swcache::WriteLineNT(physaddr + i, &line0);
xe::swcache::WriteLineNT(physaddr + i + 1, &line1);
}
if (i < num_written_lines) {
xe::swcache::CacheLine line0;
xe::swcache::ReadLine(&line0, rdmapping + i);
xe::swcache::WriteLineNT(physaddr + i, &line0);
}
}
XE_NOINLINE
void vastcpy(uint8_t* XE_RESTRICT physaddr, uint8_t* XE_RESTRICT rdmapping,
uint32_t written_length) {
return vastcpy_impl((CacheLine*)physaddr, (CacheLine*)rdmapping,
written_length);
}
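Note that vastcpy_impl above issues only non-temporal stores and no trailing store fence, so a caller that hands the destination buffer to another thread will typically want a WriteFence afterwards. A minimal usage sketch, assuming 64-byte-aligned buffers and a length that is a multiple of the cache line size (the requirement stated in dma.h); the allocation scheme and helper name are illustrative:
#include <malloc.h>  // _aligned_malloc / _aligned_free (MSVC)
#include <cstdint>
#include <cstring>
static void VastcpyExample() {
  constexpr size_t kSize = 1 << 16;  // multiple of XE_HOST_CACHE_LINE_SIZE
  auto* dst = static_cast<uint8_t*>(_aligned_malloc(kSize, 64));
  auto* src = static_cast<uint8_t*>(_aligned_malloc(kSize, 64));
  std::memset(src, 0xAB, kSize);
  xe::dma::vastcpy(dst, src, static_cast<uint32_t>(kSize));
  xe::swcache::WriteFence();  // make the streaming stores globally visible
  _aligned_free(src);
  _aligned_free(dst);
}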
#define XEDMA_NUM_WORKERS 4
class alignas(256) XeDMACGeneric : public XeDMAC {
struct alignas(XE_HOST_CACHE_LINE_SIZE) {
std::atomic<uint64_t> free_job_slots_;
std::atomic<uint64_t> jobs_submitted_;
std::atomic<uint64_t> jobs_completed_;
std::atomic<uint32_t> num_workers_awoken_;
std::atomic<uint32_t> current_job_serial_;
} dma_volatile_;
alignas(XE_HOST_CACHE_LINE_SIZE) XeDMAJob jobs_[64];
volatile uint32_t jobserials_[64];
alignas(XE_HOST_CACHE_LINE_SIZE)
std::unique_ptr<threading::Event> job_done_signals_[64];
// really dont like using unique pointer for this...
std::unique_ptr<threading::Event> job_submitted_signal_;
std::unique_ptr<threading::Event> job_completed_signal_;
std::unique_ptr<threading::Thread> scheduler_thread_;
struct WorkSlice {
uint8_t* destination;
uint8_t* source;
size_t numbytes;
};
std::unique_ptr<threading::Thread> workers_[XEDMA_NUM_WORKERS];
std::unique_ptr<threading::Event> worker_has_work_; //[XEDMA_NUM_WORKERS];
std::unique_ptr<threading::Event> worker_has_finished_[XEDMA_NUM_WORKERS];
threading::WaitHandle* worker_has_finished_nosafeptr_[XEDMA_NUM_WORKERS];
WorkSlice worker_workslice_[XEDMA_NUM_WORKERS];
// chrispy: this is bad
static uint32_t find_free_hole_in_dword(uint64_t dw) {
XEDMALOG("Finding free hole in 0x%llX", dw);
for (uint32_t i = 0; i < 64; ++i) {
if (dw & (1ULL << i)) {
continue;
}
return i;
}
return ~0U;
}
uint32_t allocate_free_dma_slot() {
XEDMALOG("Allocating free slot");
uint32_t got_slot = 0;
uint64_t slots;
uint64_t allocated_slot;
do {
slots = dma_volatile_.free_job_slots_.load();
got_slot = find_free_hole_in_dword(slots);
if (!~got_slot) {
XEDMALOG("Didn't get a slot!");
return ~0U;
}
allocated_slot = slots | (1ULL << got_slot);
} while (XE_UNLIKELY(!dma_volatile_.free_job_slots_.compare_exchange_strong(
slots, allocated_slot)));
XEDMALOG("Allocated slot %d", got_slot);
return got_slot;
}
// chrispy: on x86 this can just be interlockedbittestandreset...
void free_dma_slot(uint32_t slot) {
XEDMALOG("Freeing slot %d", slot);
uint64_t slots;
uint64_t deallocated_slot;
do {
slots = dma_volatile_.free_job_slots_.load();
deallocated_slot = slots & (~(1ULL << slot));
} while (XE_UNLIKELY(!dma_volatile_.free_job_slots_.compare_exchange_strong(
slots, deallocated_slot)));
}
void DoDMAJob(uint32_t idx) {
XeDMAJob& job = jobs_[idx];
if (job.precall) {
job.precall(&job);
}
// memcpy(job.destination, job.source, job.size);
size_t job_size = job.size;
size_t job_num_lines = job_size / XE_HOST_CACHE_LINE_SIZE;
size_t line_rounded = job_num_lines * XE_HOST_CACHE_LINE_SIZE;
size_t rem = job_size - line_rounded;
size_t num_per_worker = line_rounded / XEDMA_NUM_WORKERS;
XEDMALOG(
"Distributing %d bytes from %p to %p across %d workers, remainder is "
"%d",
line_rounded, job.source, job.destination, XEDMA_NUM_WORKERS, rem);
if (num_per_worker < 2048) {
XEDMALOG("not distributing across workers, num_per_worker < 2048");
// not worth splitting up
memcpy(job.destination, job.source, job.size);
job.signal_on_done->Set();
} else {
for (uint32_t i = 0; i < XEDMA_NUM_WORKERS; ++i) {
worker_workslice_[i].destination =
(i * num_per_worker) + job.destination;
worker_workslice_[i].source = (i * num_per_worker) + job.source;
worker_workslice_[i].numbytes = num_per_worker;
}
if (rem) {
__movsb(job.destination + line_rounded, job.source + line_rounded, rem);
}
// wake them up
worker_has_work_->Set();
XEDMALOG("Starting waitall for job");
threading::WaitAll(worker_has_finished_nosafeptr_, XEDMA_NUM_WORKERS,
false);
XEDMALOG("Waitall for job completed!");
job.signal_on_done->Set();
}
if (job.postcall) {
job.postcall(&job);
}
++dma_volatile_.jobs_completed_;
}
void WorkerIter(uint32_t worker_index) {
xenia_assert(worker_index < XEDMA_NUM_WORKERS);
auto [dest, src, size] = worker_workslice_[worker_index];
// if (++dma_volatile_.num_workers_awoken_ == XEDMA_NUM_WORKERS ) {
worker_has_work_->Reset();
//}
xenia_assert(size < (1ULL << 32));
// memcpy(dest, src, size);
dma::vastcpy(dest, src, static_cast<uint32_t>(size));
}
XE_NOINLINE
void WorkerMainLoop(uint32_t worker_index) {
do {
XEDMALOG("Worker iter for worker %d", worker_index);
WorkerIter(worker_index);
XEDMALOG("Worker %d is done\n", worker_index);
threading::SignalAndWait(worker_has_finished_[worker_index].get(),
worker_has_work_.get(), false);
} while (true);
}
void WorkerMain(uint32_t worker_index) {
XEDMALOG("Entered worker main loop, index %d", worker_index);
threading::Wait(worker_has_work_.get(), false);
XEDMALOG("First wait for worker %d completed, first job ever",
worker_index);
WorkerMainLoop(worker_index);
}
static void WorkerMainForwarder(void* ptr) {
// we aligned XeDma to 256 bytes and encode extra info in the low 8 bits
uintptr_t uptr = (uintptr_t)ptr;
uint32_t worker_index = (uint8_t)uptr;
uptr &= ~0xFFULL;
char name_buffer[64];
sprintf_s(name_buffer, "dma_worker_%d", worker_index);
xe::threading::set_name(name_buffer);
reinterpret_cast<XeDMACGeneric*>(uptr)->WorkerMain(worker_index);
}
void DMAMain() {
XEDMALOG("DmaMain");
do {
threading::Wait(job_submitted_signal_.get(), false);
auto slots = dma_volatile_.free_job_slots_.load();
for (uint32_t i = 0; i < 64; ++i) {
if (slots & (1ULL << i)) {
XEDMALOG("Got new job at index %d in DMAMain", i);
DoDMAJob(i);
free_dma_slot(i);
job_completed_signal_->Set();
// break;
}
}
} while (true);
}
static void DMAMainForwarder(void* ud) {
xe::threading::set_name("dma_main");
reinterpret_cast<XeDMACGeneric*>(ud)->DMAMain();
}
public:
virtual DMACJobHandle PushDMAJob(XeDMAJob* job) override {
XEDMALOG("New job, %p to %p with size %d", job->source, job->destination,
job->size);
uint32_t slot;
do {
slot = allocate_free_dma_slot();
if (!~slot) {
XEDMALOG(
"Didn't get a free slot, waiting for a job to complete before "
"resuming.");
threading::Wait(job_completed_signal_.get(), false);
} else {
break;
}
} while (true);
jobs_[slot] = *job;
jobs_[slot].signal_on_done = job_done_signals_[slot].get();
jobs_[slot].signal_on_done->Reset();
XEDMALOG("Setting job submit signal, pushed into slot %d", slot);
uint32_t new_serial = dma_volatile_.current_job_serial_++;
jobserials_[slot] = new_serial;
++dma_volatile_.jobs_submitted_;
job_submitted_signal_->Set();
return (static_cast<uint64_t>(new_serial) << 32) |
static_cast<uint64_t>(slot);
// return job_done_signals_[slot].get();
}
bool AllJobsDone() {
return dma_volatile_.jobs_completed_ == dma_volatile_.jobs_submitted_;
}
virtual void WaitJobDone(DMACJobHandle handle) override {
uint32_t serial = static_cast<uint32_t>(handle >> 32);
uint32_t jobid = static_cast<uint32_t>(handle);
do {
if (jobserials_[jobid] != serial) {
return; // done, our slot was reused
}
auto waitres = threading::Wait(job_done_signals_[jobid].get(), false,
std::chrono::milliseconds{1});
if (waitres == threading::WaitResult::kTimeout) {
continue;
} else {
return;
}
} while (true);
}
virtual void WaitForIdle() override {
while (!AllJobsDone()) {
threading::MaybeYield();
}
}
XeDMACGeneric() {
XEDMALOG("Constructing xedma at addr %p", this);
dma_volatile_.free_job_slots_.store(0ULL);
dma_volatile_.jobs_submitted_.store(0ULL);
dma_volatile_.jobs_completed_.store(0ULL);
dma_volatile_.current_job_serial_.store(
1ULL); // so that a jobhandle is never 0
std::memset(jobs_, 0, sizeof(jobs_));
job_submitted_signal_ = threading::Event::CreateAutoResetEvent(false);
job_completed_signal_ = threading::Event::CreateAutoResetEvent(false);
worker_has_work_ = threading::Event::CreateManualResetEvent(false);
threading::Thread::CreationParameters worker_params{};
worker_params.create_suspended = false;
worker_params.initial_priority = threading::ThreadPriority::kBelowNormal;
worker_params.stack_size = 65536; // dont need much stack at all
for (uint32_t i = 0; i < 64; ++i) {
job_done_signals_[i] = threading::Event::CreateManualResetEvent(false);
}
for (uint32_t i = 0; i < XEDMA_NUM_WORKERS; ++i) {
// worker_has_work_[i] = threading::Event::CreateAutoResetEvent(false);
worker_has_finished_[i] = threading::Event::CreateAutoResetEvent(false);
worker_has_finished_nosafeptr_[i] = worker_has_finished_[i].get();
uintptr_t encoded = reinterpret_cast<uintptr_t>(this);
xenia_assert(!(encoded & 0xFFULL));
xenia_assert(i < 256);
encoded |= i;
workers_[i] = threading::Thread::Create(worker_params, [encoded]() {
XeDMACGeneric::WorkerMainForwarder((void*)encoded);
});
}
threading::Thread::CreationParameters scheduler_params{};
scheduler_params.create_suspended = false;
scheduler_params.initial_priority = threading::ThreadPriority::kBelowNormal;
scheduler_params.stack_size = 65536;
scheduler_thread_ = threading::Thread::Create(scheduler_params, [this]() {
XeDMACGeneric::DMAMainForwarder((void*)this);
});
}
};
XeDMAC* CreateDMAC() { return new XeDMACGeneric(); }
} // namespace xe::dma
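For reference, PushDMAJob above packs the job's serial number into the upper 32 bits of the returned DMACJobHandle and the slot index into the lower 32 bits; WaitJobDone uses the serial to notice that a slot has been recycled for a newer job. A standalone sketch of that encoding, with hypothetical helper names:
#include <cstdint>
// Illustrative helpers mirroring the handle layout used by XeDMACGeneric:
// high 32 bits = job serial, low 32 bits = slot index.
static uint64_t MakeJobHandle(uint32_t serial, uint32_t slot) {
  return (static_cast<uint64_t>(serial) << 32) | static_cast<uint64_t>(slot);
}
static uint32_t HandleSerial(uint64_t handle) {
  return static_cast<uint32_t>(handle >> 32);
}
static uint32_t HandleSlot(uint64_t handle) {
  return static_cast<uint32_t>(handle);
}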

src/xenia/base/dma.h (new file, +46 lines)

@ -0,0 +1,46 @@
/**
******************************************************************************
* Xenia : Xbox 360 Emulator Research Project *
******************************************************************************
* Copyright 2020 Ben Vanik. All rights reserved. *
* Released under the BSD license - see LICENSE in the root for more details. *
******************************************************************************
*/
#ifndef XENIA_BASE_DMA_H_
#define XENIA_BASE_DMA_H_
#include "memory.h"
#include "threading.h"
namespace xe::dma {
struct XeDMAJob;
using DmaPrecall = void (*)(XeDMAJob* job);
using DmaPostcall = void (*)(XeDMAJob* job);
struct XeDMAJob {
threading::Event* signal_on_done;
uint8_t* destination;
uint8_t* source;
size_t size;
DmaPrecall precall;
DmaPostcall postcall;
void* userdata1;
void* userdata2;
};
using DMACJobHandle = uint64_t;
class XeDMAC {
public:
virtual ~XeDMAC() {}
virtual DMACJobHandle PushDMAJob(XeDMAJob* job) = 0;
virtual void WaitJobDone(DMACJobHandle handle) = 0;
virtual void WaitForIdle() = 0;
};
XeDMAC* CreateDMAC();
// written_length must be divisible by the cache line size
XE_NOINLINE
void vastcpy(uint8_t* XE_RESTRICT physaddr, uint8_t* XE_RESTRICT rdmapping,
uint32_t written_length);
} // namespace xe::dma
#endif // XENIA_BASE_DMA_H_
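A hedged usage sketch for the interface above: create a DMAC once, fill out a XeDMAJob, push it, then wait on the returned handle. signal_on_done is filled in by the implementation; the include path follows the file's location in the tree and the buffer names are placeholders:
#include "xenia/base/dma.h"
static void CopyWithDmac(uint8_t* dst, uint8_t* src, size_t size) {
  static xe::dma::XeDMAC* dmac = xe::dma::CreateDMAC();
  xe::dma::XeDMAJob job = {};
  job.destination = dst;
  job.source = src;
  job.size = size;
  // precall/postcall/userdata are optional and left null here.
  xe::dma::DMACJobHandle handle = dmac->PushDMAJob(&job);
  // ... overlap other work here ...
  dmac->WaitJobDone(handle);
}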


@ -377,29 +377,45 @@ int64_t m128_i64(const __m128& v) {
return m128_i64<N>(_mm_castps_pd(v)); return m128_i64<N>(_mm_castps_pd(v));
} }
/*
   std::min/max float has handling for nans, where if either argument is nan
   the first argument is returned
   minss/maxss are different, if either argument is nan the second operand to
   the instruction is returned
   this is problematic because we have no assurances from the compiler on the
   argument ordering
   so only use in places where nan handling is not needed
*/
static float xe_minf(float x, float y) { XE_FORCEINLINE
static float ArchMin(float x, float y) {
return _mm_cvtss_f32(_mm_min_ss(_mm_set_ss(x), _mm_set_ss(y))); return _mm_cvtss_f32(_mm_min_ss(_mm_set_ss(x), _mm_set_ss(y)));
} }
static float xe_maxf(float x, float y) { XE_FORCEINLINE
static float ArchMax(float x, float y) {
return _mm_cvtss_f32(_mm_max_ss(_mm_set_ss(x), _mm_set_ss(y))); return _mm_cvtss_f32(_mm_max_ss(_mm_set_ss(x), _mm_set_ss(y)));
} }
static float xe_rcpf(float den) { XE_FORCEINLINE
static float ArchReciprocal(float den) {
return _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ss(den))); return _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ss(den)));
} }
#else #else
static float xe_minf(float x, float y) { return std::min<float>(x, y); } static float ArchMin(float x, float y) { return std::min<float>(x, y); }
static float xe_maxf(float x, float y) { return std::max<float>(x, y); } static float ArchMax(float x, float y) { return std::max<float>(x, y); }
static float xe_rcpf(float den) { return 1.0f / den; } static float ArchReciprocal(float den) { return 1.0f / den; }
#endif #endif
XE_FORCEINLINE
static float RefineReciprocal(float initial, float den) {
float t0 = initial * den;
float t1 = t0 * initial;
float rcp2 = initial + initial;
return rcp2 - t1;
}
XE_FORCEINLINE
static float ArchReciprocalRefined(float den) {
return RefineReciprocal(ArchReciprocal(den), den);
}
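RefineReciprocal above is one Newton-Raphson step: given an estimate r of 1/d it returns 2r - (r*d)*r, which is algebraically r*(2 - d*r) and roughly doubles the accurate bits of the coarse rcpss estimate (about 12 bits to about 23). A small scalar sketch, assuming only the standard library:
#include <cstdio>
// One Newton-Raphson iteration for the reciprocal, same math as
// RefineReciprocal written in the r * (2 - d * r) form.
static float NewtonReciprocalStep(float r, float d) { return r * (2.0f - d * r); }
static void ReciprocalDemo() {
  float d = 3.0f;
  float r0 = 0.333f;                       // stand-in for the coarse rcpss estimate
  float r1 = NewtonReciprocalStep(r0, d);  // refined estimate
  std::printf("%g -> %g (exact %g)\n", r0, r1, 1.0f / d);
}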
// Similar to the C++ implementation of XMConvertFloatToHalf and // Similar to the C++ implementation of XMConvertFloatToHalf and
// XMConvertHalfToFloat from DirectXMath 3.00 (pre-3.04, which switched from the // XMConvertHalfToFloat from DirectXMath 3.00 (pre-3.04, which switched from the
@ -494,7 +510,101 @@ inline T sat_sub(T a, T b) {
} }
return T(result); return T(result);
} }
namespace divisors {
union IDivExtraInfo {
uint32_t value_;
struct {
uint32_t shift_ : 31;
uint32_t add_ : 1;
} info;
};
// returns magicnum multiplier
static uint32_t PregenerateUint32Div(uint32_t _denom, uint32_t& out_extra) {
IDivExtraInfo extra;
uint32_t d = _denom;
int p;
uint32_t nc, delta, q1, r1, q2, r2;
struct {
unsigned M;
int a;
int s;
} magu;
magu.a = 0;
nc = -1 - ((uint32_t) - (int32_t)d) % d;
p = 31;
q1 = 0x80000000 / nc;
r1 = 0x80000000 - q1 * nc;
q2 = 0x7FFFFFFF / d;
r2 = 0x7FFFFFFF - q2 * d;
do {
p += 1;
if (r1 >= nc - r1) {
q1 = 2 * q1 + 1;
r1 = 2 * r1 - nc;
} else {
q1 = 2 * q1;
r1 = 2 * r1;
}
if (r2 + 1 >= d - r2) {
if (q2 >= 0x7FFFFFFF) {
magu.a = 1;
}
q2 = 2 * q2 + 1;
r2 = 2 * r2 + 1 - d;
} else {
if (q2 >= 0x80000000U) {
magu.a = 1;
}
q2 = 2 * q2;
r2 = 2 * r2 + 1;
}
delta = d - 1 - r2;
} while (p < 64 && (q1 < delta || r1 == 0));
extra.info.add_ = magu.a;
extra.info.shift_ = p - 32;
out_extra = extra.value_;
return static_cast<uint32_t>(q2 + 1);
}
static inline uint32_t ApplyUint32Div(uint32_t num, uint32_t mul,
uint32_t extradata) {
IDivExtraInfo extra;
extra.value_ = extradata;
uint32_t result = ((uint64_t)(num) * (uint64_t)mul) >> 32;
if (extra.info.add_) {
uint32_t addend = result + num;
addend = ((addend < result ? 0x80000000 : 0) | addend);
result = addend;
}
return result >> extra.info.shift_;
}
static inline uint32_t ApplyUint32UMod(uint32_t num, uint32_t mul,
uint32_t extradata, uint32_t original) {
uint32_t dived = ApplyUint32Div(num, mul, extradata);
unsigned result = num - (dived * original);
return result;
}
struct MagicDiv {
uint32_t multiplier_;
uint32_t extradata_;
MagicDiv() : multiplier_(0), extradata_(0) {}
MagicDiv(uint32_t original) {
multiplier_ = PregenerateUint32Div(original, extradata_);
}
uint32_t Apply(uint32_t numerator) const {
return ApplyUint32Div(numerator, multiplier_, extradata_);
}
};
} // namespace divisors
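The divisors helpers implement the usual magic-number unsigned division: precompute a multiplier (plus a shift and an add flag) for a fixed divisor, then each division becomes a multiply-high and a shift. A brute-force sanity check of the intended usage, assuming math.h is included:
#include <cassert>
#include <cstdint>
static void MagicDivCheck() {
  // Precompute once for a fixed divisor, then divide many numerators cheaply.
  xe::divisors::MagicDiv div7(7);
  for (uint32_t n = 0; n < 100000; ++n) {
    assert(div7.Apply(n) == n / 7);
  }
}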
} // namespace xe } // namespace xe
#endif // XENIA_BASE_MATH_H_ #endif // XENIA_BASE_MATH_H_


@ -672,25 +672,58 @@ static void Prefetch<PrefetchTag::Level1>(const void* addr) {
#define XE_MSVC_REORDER_BARRIER() static_cast<void>(0) #define XE_MSVC_REORDER_BARRIER() static_cast<void>(0)
#endif #endif
#if XE_ARCH_AMD64 == 1 #if XE_ARCH_AMD64 == 1
union alignas(XE_HOST_CACHE_LINE_SIZE) CacheLine {
struct {
__m256 low32;
__m256 high32;
};
struct {
__m128i xmms[4];
};
float floats[XE_HOST_CACHE_LINE_SIZE / sizeof(float)];
};
XE_FORCEINLINE XE_FORCEINLINE
static void WriteLineNT(void* destination, const void* source) { static void WriteLineNT(CacheLine* XE_RESTRICT destination,
assert((reinterpret_cast<uintptr_t>(destination) & 63ULL) == 0); const CacheLine* XE_RESTRICT source) {
__m256i low = _mm256_loadu_si256((const __m256i*)source); assert_true((reinterpret_cast<uintptr_t>(destination) & 63ULL) == 0);
__m256i high = _mm256_loadu_si256(&((const __m256i*)source)[1]); __m256 low = _mm256_loadu_ps(&source->floats[0]);
XE_MSVC_REORDER_BARRIER(); __m256 high = _mm256_loadu_ps(&source->floats[8]);
_mm256_stream_si256((__m256i*)destination, low); _mm256_stream_ps(&destination->floats[0], low);
_mm256_stream_si256(&((__m256i*)destination)[1], high); _mm256_stream_ps(&destination->floats[8], high);
} }
XE_FORCEINLINE XE_FORCEINLINE
static void ReadLineNT(void* destination, const void* source) { static void ReadLineNT(CacheLine* XE_RESTRICT destination,
assert((reinterpret_cast<uintptr_t>(source) & 63ULL) == 0); const CacheLine* XE_RESTRICT source) {
__m256i low = _mm256_stream_load_si256((const __m256i*)source); assert_true((reinterpret_cast<uintptr_t>(source) & 63ULL) == 0);
__m256i high = _mm256_stream_load_si256(&((const __m256i*)source)[1]);
XE_MSVC_REORDER_BARRIER(); __m128i first = _mm_stream_load_si128(&source->xmms[0]);
_mm256_storeu_si256((__m256i*)destination, low); __m128i second = _mm_stream_load_si128(&source->xmms[1]);
_mm256_storeu_si256(&((__m256i*)destination)[1], high); __m128i third = _mm_stream_load_si128(&source->xmms[2]);
__m128i fourth = _mm_stream_load_si128(&source->xmms[3]);
destination->xmms[0] = first;
destination->xmms[1] = second;
destination->xmms[2] = third;
destination->xmms[3] = fourth;
}
XE_FORCEINLINE
static void ReadLine(CacheLine* XE_RESTRICT destination,
const CacheLine* XE_RESTRICT source) {
assert_true((reinterpret_cast<uintptr_t>(source) & 63ULL) == 0);
__m256 low = _mm256_loadu_ps(&source->floats[0]);
__m256 high = _mm256_loadu_ps(&source->floats[8]);
_mm256_storeu_ps(&destination->floats[0], low);
_mm256_storeu_ps(&destination->floats[8], high);
}
XE_FORCEINLINE
static void WriteLine(CacheLine* XE_RESTRICT destination,
const CacheLine* XE_RESTRICT source) {
assert_true((reinterpret_cast<uintptr_t>(destination) & 63ULL) == 0);
__m256 low = _mm256_loadu_ps(&source->floats[0]);
__m256 high = _mm256_loadu_ps(&source->floats[8]);
_mm256_storeu_ps(&destination->floats[0], low);
_mm256_storeu_ps(&destination->floats[8], high);
} }
XE_FORCEINLINE XE_FORCEINLINE
@ -699,19 +732,29 @@ XE_FORCEINLINE
static void ReadFence() { _mm_lfence(); } static void ReadFence() { _mm_lfence(); }
XE_FORCEINLINE XE_FORCEINLINE
static void ReadWriteFence() { _mm_mfence(); } static void ReadWriteFence() { _mm_mfence(); }
#else #else
union alignas(XE_HOST_CACHE_LINE_SIZE) CacheLine {
uint8_t bvals[XE_HOST_CACHE_LINE_SIZE];
};
XE_FORCEINLINE XE_FORCEINLINE
static void WriteLineNT(void* destination, const void* source) { static void WriteLineNT(CacheLine* destination, const CacheLine* source) {
assert((reinterpret_cast<uintptr_t>(destination) & 63ULL) == 0); memcpy(destination, source, XE_HOST_CACHE_LINE_SIZE);
memcpy(destination, source, 64);
} }
XE_FORCEINLINE XE_FORCEINLINE
static void ReadLineNT(void* destination, const void* source) { static void ReadLineNT(CacheLine* destination, const CacheLine* source) {
assert((reinterpret_cast<uintptr_t>(source) & 63ULL) == 0); memcpy(destination, source, XE_HOST_CACHE_LINE_SIZE);
memcpy(destination, source, 64);
} }
XE_FORCEINLINE
static void WriteLine(CacheLine* destination, const CacheLine* source) {
memcpy(destination, source, XE_HOST_CACHE_LINE_SIZE);
}
XE_FORCEINLINE
static void ReadLine(CacheLine* destination, const CacheLine* source) {
memcpy(destination, source, XE_HOST_CACHE_LINE_SIZE);
}
XE_FORCEINLINE XE_FORCEINLINE
static void WriteFence() {} static void WriteFence() {}
XE_FORCEINLINE XE_FORCEINLINE
@ -720,6 +763,47 @@ XE_FORCEINLINE
static void ReadWriteFence() {} static void ReadWriteFence() {}
#endif #endif
} // namespace swcache } // namespace swcache
template <unsigned Size>
static void smallcpy_const(void* destination, const void* source) {
#if XE_ARCH_AMD64 == 1 && XE_COMPILER_MSVC == 1
if constexpr ((Size & 7) == 0) {
__movsq((unsigned long long*)destination, (const unsigned long long*)source,
Size / 8);
} else if constexpr ((Size & 3) == 0) {
__movsd((unsigned long*)destination, (const unsigned long*)source,
Size / 4);
// dont even bother with movsw, i think the operand size override prefix
// slows it down
} else {
__movsb((unsigned char*)destination, (const unsigned char*)source, Size);
}
#else
memcpy(destination, source, Size);
#endif
}
template <unsigned Size>
static void smallset_const(void* destination, unsigned char fill_value) {
#if XE_ARCH_AMD64 == 1 && XE_COMPILER_MSVC == 1
if constexpr ((Size & 7) == 0) {
unsigned long long fill =
static_cast<unsigned long long>(fill_value) * 0x0101010101010101ULL;
__stosq((unsigned long long*)destination, fill, Size / 8);
} else if constexpr ((Size & 3) == 0) {
unsigned long fill =
static_cast<unsigned long>(fill_value) * 0x01010101U;
__stosd((unsigned long*)destination, fill, Size / 4);
// dont even bother with stosw, i think the operand size override prefix
// slows it down
} else {
__stosb((unsigned char*)destination, fill_value, Size);
}
#else
memset(destination, fill_value, Size);
#endif
}
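smallcpy_const and smallset_const pick the widest rep movs/stos form the compile-time size allows on MSVC/AMD64 and fall back to memcpy/memset elsewhere. A short usage sketch with a hypothetical POD type:
#include <cstdint>
struct GuestRegs {  // illustrative 32-byte POD
  uint64_t r[4];
};
static void CopyRegs(GuestRegs* dst, const GuestRegs* src) {
  // Size is a multiple of 8, so the __movsq path is selected at compile time.
  xe::smallcpy_const<sizeof(GuestRegs)>(dst, src);
}
static void ClearRegs(GuestRegs* regs) {
  xe::smallset_const<sizeof(GuestRegs)>(regs, 0);  // __stosq path
}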
} // namespace xe } // namespace xe
#endif // XENIA_BASE_MEMORY_H_ #endif // XENIA_BASE_MEMORY_H_


@ -59,16 +59,8 @@ class RingBuffer {
// subtract instead // subtract instead
void set_read_offset(size_t offset) { read_offset_ = offset % capacity_; } void set_read_offset(size_t offset) { read_offset_ = offset % capacity_; }
ring_size_t read_count() const { ring_size_t read_count() const {
// chrispy: these branches are unpredictable // chrispy: these branches are unpredictable
#if 0
if (read_offset_ == write_offset_) {
return 0;
} else if (read_offset_ < write_offset_) {
return write_offset_ - read_offset_;
} else {
return (capacity_ - read_offset_) + write_offset_;
}
#else
ring_size_t read_offs = read_offset_; ring_size_t read_offs = read_offset_;
ring_size_t write_offs = write_offset_; ring_size_t write_offs = write_offset_;
ring_size_t cap = capacity_; ring_size_t cap = capacity_;
@ -77,14 +69,6 @@ class RingBuffer {
ring_size_t wrap_read_count = (cap - read_offs) + write_offs; ring_size_t wrap_read_count = (cap - read_offs) + write_offs;
ring_size_t comparison_value = read_offs <= write_offs; ring_size_t comparison_value = read_offs <= write_offs;
#if 0
size_t selector =
static_cast<size_t>(-static_cast<ptrdiff_t>(comparison_value));
offset_delta &= selector;
wrap_read_count &= ~selector;
return offset_delta | wrap_read_count;
#else
if (XE_LIKELY(read_offs <= write_offs)) { if (XE_LIKELY(read_offs <= write_offs)) {
return offset_delta; // will be 0 if they are equal, semantically return offset_delta; // will be 0 if they are equal, semantically
@ -93,8 +77,6 @@ class RingBuffer {
} else { } else {
return wrap_read_count; return wrap_read_count;
} }
#endif
#endif
} }
ring_size_t write_offset() const { return write_offset_; } ring_size_t write_offset() const { return write_offset_; }
@ -116,9 +98,9 @@ class RingBuffer {
void AdvanceWrite(size_t count); void AdvanceWrite(size_t count);
struct ReadRange { struct ReadRange {
const uint8_t* first; const uint8_t* XE_RESTRICT first;
const uint8_t* second; const uint8_t* XE_RESTRICT second;
ring_size_t first_length; ring_size_t first_length;
ring_size_t second_length; ring_size_t second_length;
}; };
@ -126,9 +108,11 @@ class RingBuffer {
void EndRead(ReadRange read_range); void EndRead(ReadRange read_range);
/*
    BeginRead, but if there is a second Range it will prefetch all lines of
    it
    this does not prefetch the first range, because software prefetching
    can do that faster than we can
*/
template <swcache::PrefetchTag tag> template <swcache::PrefetchTag tag>
XE_FORCEINLINE ReadRange BeginPrefetchedRead(size_t count) { XE_FORCEINLINE ReadRange BeginPrefetchedRead(size_t count) {
@ -138,7 +122,7 @@ class RingBuffer {
ring_size_t numlines = ring_size_t numlines =
xe::align<ring_size_t>(range.second_length, XE_HOST_CACHE_LINE_SIZE) / xe::align<ring_size_t>(range.second_length, XE_HOST_CACHE_LINE_SIZE) /
XE_HOST_CACHE_LINE_SIZE; XE_HOST_CACHE_LINE_SIZE;
//chrispy: maybe unroll? // chrispy: maybe unroll?
for (ring_size_t i = 0; i < numlines; ++i) { for (ring_size_t i = 0; i < numlines; ++i) {
swcache::Prefetch<tag>(range.second + (i * XE_HOST_CACHE_LINE_SIZE)); swcache::Prefetch<tag>(range.second + (i * XE_HOST_CACHE_LINE_SIZE));
} }
@ -187,7 +171,7 @@ class RingBuffer {
} }
private: private:
uint8_t* buffer_ = nullptr; uint8_t* XE_RESTRICT buffer_ = nullptr;
ring_size_t capacity_ = 0; ring_size_t capacity_ = 0;
ring_size_t read_offset_ = 0; ring_size_t read_offset_ = 0;
ring_size_t write_offset_ = 0; ring_size_t write_offset_ = 0;
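As a concrete example of the read_count arithmetic above: with capacity_ = 16, read_offset_ = 12 and write_offset_ = 4 the read pointer has to wrap, so the count is (16 - 12) + 4 = 8; when read_offset_ <= write_offset_ it is simply the difference. The same computation as a free function, for illustration only:
#include <cstdint>
static uint32_t RingReadCount(uint32_t read_offs, uint32_t write_offs,
                              uint32_t capacity) {
  return read_offs <= write_offs ? write_offs - read_offs
                                 : (capacity - read_offs) + write_offs;
}
// RingReadCount(12, 4, 16) == 8, RingReadCount(4, 12, 16) == 8,
// RingReadCount(5, 5, 16) == 0.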


@ -0,0 +1,87 @@
/**
******************************************************************************
* Xenia : Xbox 360 Emulator Research Project *
******************************************************************************
* Copyright 2019 Ben Vanik. All rights reserved. *
* Released under the BSD license - see LICENSE in the root for more details. *
******************************************************************************
*/
#ifndef XENIA_BASE_SPLIT_MAP_H_
#define XENIA_BASE_SPLIT_MAP_H_
#include <algorithm>
#include <vector>
namespace xe {
/*
a map structure that is optimized for infrequent
reallocation/resizing/erasure and frequent searches by key; implemented as
two std::vectors, one of the keys and one of the values
*/
template <typename TKey, typename TValue>
class split_map {
using key_vector = std::vector<TKey>;
using value_vector = std::vector<TValue>;
key_vector keys_;
value_vector values_;
public:
using my_type = split_map<TKey, TValue>;
uint32_t IndexForKey(const TKey& k) {
auto lbound = std::lower_bound(keys_.begin(), keys_.end(), k);
return static_cast<uint32_t>(lbound - keys_.begin());
}
uint32_t size() const { return static_cast<uint32_t>(keys_.size()); }
key_vector& Keys() { return keys_; }
value_vector& Values() { return values_; }
void clear() {
keys_.clear();
values_.clear();
}
void resize(uint32_t new_size) {
keys_.resize(static_cast<size_t>(new_size));
values_.resize(static_cast<size_t>(new_size));
}
void reserve(uint32_t new_size) {
keys_.reserve(static_cast<size_t>(new_size));
values_.reserve(static_cast<size_t>(new_size));
}
const TKey* KeyAt(uint32_t index) const {
if (index == size()) {
return nullptr;
} else {
return &keys_[index];
}
}
const TValue* ValueAt(uint32_t index) const {
if (index == size()) {
return nullptr;
} else {
return &values_[index];
}
}
void InsertAt(TKey k, TValue v, uint32_t idx) {
uint32_t old_size = size();
bool needs_shiftup = idx != old_size;
values_.insert(values_.begin() + idx, v);
keys_.insert(keys_.begin() + idx, k);
}
void EraseAt(uint32_t idx) {
uint32_t old_size = size();
if (idx == old_size) {
return; // trying to erase nonexistent entry
} else {
values_.erase(values_.begin() + idx);
keys_.erase(keys_.begin() + idx);
}
}
};
} // namespace xe
#endif // XENIA_BASE_SPLIT_MAP_H_
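A short usage sketch for split_map: the key vector has to stay sorted for IndexForKey's lower_bound to be meaningful, so inserts go through IndexForKey followed by InsertAt. The names and values here are illustrative:
#include <cstdint>
#include <cstdio>
#include "xenia/base/split_map.h"
static void SplitMapDemo() {
  xe::split_map<uint32_t, const char*> addr_to_name;
  auto insert_sorted = [&](uint32_t key, const char* value) {
    addr_to_name.InsertAt(key, value, addr_to_name.IndexForKey(key));
  };
  insert_sorted(0x82000000, "main");
  insert_sorted(0x82100000, "helper");
  uint32_t idx = addr_to_name.IndexForKey(0x82100000);
  const uint32_t* key = addr_to_name.KeyAt(idx);  // nullptr if past the end
  if (key && *key == 0x82100000) {
    std::printf("%s\n", *addr_to_name.ValueAt(idx));  // prints "helper"
  }
}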


@ -57,7 +57,11 @@ DEFINE_bool(enable_incorrect_roundingmode_behavior, false,
"code. The workaround may cause reduced CPU performance but is a " "code. The workaround may cause reduced CPU performance but is a "
"more accurate emulation", "more accurate emulation",
"x64"); "x64");
DEFINE_uint32(align_all_basic_blocks, 0,
"Aligns the start of all basic blocks to N bytes. Only specify a "
"power of 2, 16 is the recommended value. Results in larger "
"icache usage, but potentially faster loops",
"x64");
#if XE_X64_PROFILER_AVAILABLE == 1 #if XE_X64_PROFILER_AVAILABLE == 1
DEFINE_bool(instrument_call_times, false, DEFINE_bool(instrument_call_times, false,
"Compute time taken for functions, for profiling guest code", "Compute time taken for functions, for profiling guest code",
@ -110,7 +114,6 @@ X64Emitter::X64Emitter(X64Backend* backend, XbyakAllocator* allocator)
TEST_EMIT_FEATURE(kX64EmitLZCNT, Xbyak::util::Cpu::tLZCNT); TEST_EMIT_FEATURE(kX64EmitLZCNT, Xbyak::util::Cpu::tLZCNT);
TEST_EMIT_FEATURE(kX64EmitBMI1, Xbyak::util::Cpu::tBMI1); TEST_EMIT_FEATURE(kX64EmitBMI1, Xbyak::util::Cpu::tBMI1);
TEST_EMIT_FEATURE(kX64EmitBMI2, Xbyak::util::Cpu::tBMI2); TEST_EMIT_FEATURE(kX64EmitBMI2, Xbyak::util::Cpu::tBMI2);
TEST_EMIT_FEATURE(kX64EmitF16C, Xbyak::util::Cpu::tF16C);
TEST_EMIT_FEATURE(kX64EmitMovbe, Xbyak::util::Cpu::tMOVBE); TEST_EMIT_FEATURE(kX64EmitMovbe, Xbyak::util::Cpu::tMOVBE);
TEST_EMIT_FEATURE(kX64EmitGFNI, Xbyak::util::Cpu::tGFNI); TEST_EMIT_FEATURE(kX64EmitGFNI, Xbyak::util::Cpu::tGFNI);
TEST_EMIT_FEATURE(kX64EmitAVX512F, Xbyak::util::Cpu::tAVX512F); TEST_EMIT_FEATURE(kX64EmitAVX512F, Xbyak::util::Cpu::tAVX512F);
@ -200,7 +203,55 @@ bool X64Emitter::Emit(GuestFunction* function, HIRBuilder* builder,
return true; return true;
} }
#pragma pack(push, 1)
struct RGCEmitted {
uint8_t ff_;
uint32_t rgcid_;
};
#pragma pack(pop)
#if 0
void X64Emitter::InjectCallAddresses(void* new_execute_address) {
for (auto&& callsite : call_sites_) {
RGCEmitted* hunter = (RGCEmitted*)new_execute_address;
while (hunter->ff_ != 0xFF || hunter->rgcid_ != callsite.offset_) {
hunter =
reinterpret_cast<RGCEmitted*>(reinterpret_cast<char*>(hunter) + 1);
}
hunter->ff_ = callsite.is_jump_ ? 0xE9 : 0xE8;
hunter->rgcid_ =
static_cast<uint32_t>(static_cast<intptr_t>(callsite.destination_) -
reinterpret_cast<intptr_t>(hunter + 1));
}
}
#else
void X64Emitter::InjectCallAddresses(void* new_execute_address) {
#if 0
RGCEmitted* hunter = (RGCEmitted*)new_execute_address;
std::map<uint32_t, ResolvableGuestCall*> id_to_rgc{};
for (auto&& callsite : call_sites_) {
id_to_rgc[callsite.offset_] = &callsite;
}
#else
RGCEmitted* hunter = (RGCEmitted*)new_execute_address;
for (auto&& callsite : call_sites_) {
while (hunter->ff_ != 0xFF || hunter->rgcid_ != callsite.offset_) {
hunter =
reinterpret_cast<RGCEmitted*>(reinterpret_cast<char*>(hunter) + 1);
}
hunter->ff_ = callsite.is_jump_ ? 0xE9 : 0xE8;
hunter->rgcid_ =
static_cast<uint32_t>(static_cast<intptr_t>(callsite.destination_) -
reinterpret_cast<intptr_t>(hunter + 1));
}
#endif
}
#endif
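InjectCallAddresses scans the copied code for the 0xFF opcode byte plus 32-bit id placeholder the emitter left at each resolvable guest call, rewrites the opcode to E8 (call rel32) or E9 (jmp rel32), and overwrites the id with the displacement. The displacement math, as a standalone sketch with hypothetical addresses:
#include <cstdint>
// rel32 in call/jmp is relative to the end of the 5-byte instruction,
// which is what the hunter + 1 expression above computes.
static uint32_t Rel32Displacement(uintptr_t instruction_address,
                                  uintptr_t target_address) {
  uintptr_t next_instruction = instruction_address + 5;  // 1 opcode + 4 imm
  return static_cast<uint32_t>(static_cast<intptr_t>(target_address) -
                               static_cast<intptr_t>(next_instruction));
}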
void* X64Emitter::Emplace(const EmitFunctionInfo& func_info, void* X64Emitter::Emplace(const EmitFunctionInfo& func_info,
GuestFunction* function) { GuestFunction* function) {
// To avoid changing xbyak, we do a switcharoo here. // To avoid changing xbyak, we do a switcharoo here.
@ -218,25 +269,9 @@ void* X64Emitter::Emplace(const EmitFunctionInfo& func_info,
if (function) { if (function) {
code_cache_->PlaceGuestCode(function->address(), top_, func_info, function, code_cache_->PlaceGuestCode(function->address(), top_, func_info, function,
new_execute_address, new_write_address); new_execute_address, new_write_address);
if (cvars::resolve_rel32_guest_calls) {
for (auto&& callsite : call_sites_) {
#pragma pack(push, 1)
struct RGCEmitted {
uint8_t ff_;
uint32_t rgcid_;
};
#pragma pack(pop)
RGCEmitted* hunter = (RGCEmitted*)new_execute_address;
while (hunter->ff_ != 0xFF || hunter->rgcid_ != callsite.offset_) {
hunter = reinterpret_cast<RGCEmitted*>(
reinterpret_cast<char*>(hunter) + 1);
}
hunter->ff_ = callsite.is_jump_ ? 0xE9 : 0xE8; if (cvars::resolve_rel32_guest_calls) {
hunter->rgcid_ = InjectCallAddresses(new_execute_address);
static_cast<uint32_t>(static_cast<intptr_t>(callsite.destination_) -
reinterpret_cast<intptr_t>(hunter + 1));
}
} }
} else { } else {
code_cache_->PlaceHostCode(0, top_, func_info, new_execute_address, code_cache_->PlaceHostCode(0, top_, func_info, new_execute_address,
@ -367,6 +402,9 @@ bool X64Emitter::Emit(HIRBuilder* builder, EmitFunctionInfo& func_info) {
label = label->next; label = label->next;
} }
if (cvars::align_all_basic_blocks) {
align(cvars::align_all_basic_blocks, true);
}
// Process instructions. // Process instructions.
const Instr* instr = block->instr_head; const Instr* instr = block->instr_head;
while (instr) { while (instr) {
@ -1000,12 +1038,6 @@ static const vec128_t xmm_consts[] = {
vec128i(0x7f800000), vec128i(0x7f800000),
/* XMMThreeFloatMask */ /* XMMThreeFloatMask */
vec128i(~0U, ~0U, ~0U, 0U), vec128i(~0U, ~0U, ~0U, 0U),
/*XMMXenosF16ExtRangeStart*/
vec128f(65504),
/*XMMVSRShlByteshuf*/
v128_setr_bytes(13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3, 0x80),
// XMMVSRMask
vec128b(1),
/* /*
XMMF16UnpackLCPI2 XMMF16UnpackLCPI2
*/ */
@ -1036,8 +1068,7 @@ static const vec128_t xmm_consts[] = {
/*XMMXOPWordShiftMask*/ /*XMMXOPWordShiftMask*/
vec128s(15), vec128s(15),
/*XMMXOPDwordShiftMask*/ /*XMMXOPDwordShiftMask*/
vec128i(31) vec128i(31)};
};
void* X64Emitter::FindByteConstantOffset(unsigned bytevalue) { void* X64Emitter::FindByteConstantOffset(unsigned bytevalue) {
for (auto& vec : xmm_consts) { for (auto& vec : xmm_consts) {


@ -157,9 +157,6 @@ enum XmmConst {
XMMLVSRTableBase, XMMLVSRTableBase,
XMMSingleDenormalMask, XMMSingleDenormalMask,
XMMThreeFloatMask, // for clearing the fourth float prior to DOT_PRODUCT_3 XMMThreeFloatMask, // for clearing the fourth float prior to DOT_PRODUCT_3
XMMXenosF16ExtRangeStart,
XMMVSRShlByteshuf,
XMMVSRMask,
XMMF16UnpackLCPI2, // 0x38000000, 1/ 32768 XMMF16UnpackLCPI2, // 0x38000000, 1/ 32768
XMMF16UnpackLCPI3, // 0x0x7fe000007fe000 XMMF16UnpackLCPI3, // 0x0x7fe000007fe000
XMMF16PackLCPI0, XMMF16PackLCPI0,
@ -194,7 +191,7 @@ enum X64EmitterFeatureFlags {
kX64EmitLZCNT = 1 << 2, // this is actually ABM and includes popcount kX64EmitLZCNT = 1 << 2, // this is actually ABM and includes popcount
kX64EmitBMI1 = 1 << 3, kX64EmitBMI1 = 1 << 3,
kX64EmitBMI2 = 1 << 4, kX64EmitBMI2 = 1 << 4,
kX64EmitF16C = 1 << 5, kX64EmitPrefetchW = 1 << 5,
kX64EmitMovbe = 1 << 6, kX64EmitMovbe = 1 << 6,
kX64EmitGFNI = 1 << 7, kX64EmitGFNI = 1 << 7,
@ -215,11 +212,14 @@ enum X64EmitterFeatureFlags {
// inc/dec) do not introduce false dependencies on EFLAGS // inc/dec) do not introduce false dependencies on EFLAGS
// because the individual flags are treated as different vars by // because the individual flags are treated as different vars by
// the processor. (this applies to zen) // the processor. (this applies to zen)
kX64EmitPrefetchW = 1 << 16, kX64EmitXOP = 1 << 16, // chrispy: xop maps really well to many vmx
kX64EmitXOP = 1 << 17, // chrispy: xop maps really well to many vmx
// instructions, and FX users need the boost // instructions, and FX users need the boost
kX64EmitFMA4 = 1 << 18, // todo: also use on zen1? kX64EmitFMA4 = 1 << 17, // todo: also use on zen1?
kX64EmitTBM = 1 << 19 kX64EmitTBM = 1 << 18,
// kX64XMMRegisterMergeOptimization = 1 << 19, //section 2.11.5, amd family
// 17h/19h optimization manuals. allows us to save 1 byte on certain xmm
// instructions by using the legacy sse version if we recently cleared the
// high 128 bits of the
}; };
class ResolvableGuestCall { class ResolvableGuestCall {
public: public:
@ -251,6 +251,7 @@ class X64Emitter : public Xbyak::CodeGenerator {
uint32_t debug_info_flags, FunctionDebugInfo* debug_info, uint32_t debug_info_flags, FunctionDebugInfo* debug_info,
void** out_code_address, size_t* out_code_size, void** out_code_address, size_t* out_code_size,
std::vector<SourceMapEntry>* out_source_map); std::vector<SourceMapEntry>* out_source_map);
void InjectCallAddresses(void* new_execute_addr);
public: public:
// Reserved: rsp, rsi, rdi // Reserved: rsp, rsi, rdi


@ -43,23 +43,23 @@ enum KeyType {
KEY_TYPE_V_F64 = OPCODE_SIG_TYPE_V + FLOAT64_TYPE, KEY_TYPE_V_F64 = OPCODE_SIG_TYPE_V + FLOAT64_TYPE,
KEY_TYPE_V_V128 = OPCODE_SIG_TYPE_V + VEC128_TYPE, KEY_TYPE_V_V128 = OPCODE_SIG_TYPE_V + VEC128_TYPE,
}; };
using InstrKeyValue = uint32_t;
#pragma pack(push, 1) #pragma pack(push, 1)
union InstrKey { union InstrKey {
uint32_t value; InstrKeyValue value;
struct { struct {
uint32_t opcode : 8; InstrKeyValue opcode : 8;
uint32_t dest : 5; InstrKeyValue dest : 5;
uint32_t src1 : 5; InstrKeyValue src1 : 5;
uint32_t src2 : 5; InstrKeyValue src2 : 5;
uint32_t src3 : 5; InstrKeyValue src3 : 5;
uint32_t reserved : 4; InstrKeyValue reserved : 4;
}; };
operator uint32_t() const { return value; } operator InstrKeyValue() const { return value; }
InstrKey() : value(0) { static_assert_size(*this, sizeof(value)); } InstrKey() : value(0) { static_assert_size(*this, sizeof(value)); }
InstrKey(uint32_t v) : value(v) {} InstrKey(InstrKeyValue v) : value(v) {}
// this used to take about 1% cpu while precompiling // this used to take about 1% cpu while precompiling
// it kept reloading opcode, and also constantly repacking and unpacking the // it kept reloading opcode, and also constantly repacking and unpacking the
@ -67,16 +67,16 @@ union InstrKey {
InstrKey(const Instr* i) : value(0) { InstrKey(const Instr* i) : value(0) {
const OpcodeInfo* info = i->GetOpcodeInfo(); const OpcodeInfo* info = i->GetOpcodeInfo();
uint32_t sig = info->signature; InstrKeyValue sig = info->signature;
OpcodeSignatureType dest_type, src1_type, src2_type, src3_type; OpcodeSignatureType dest_type, src1_type, src2_type, src3_type;
UnpackOpcodeSig(sig, dest_type, src1_type, src2_type, src3_type); UnpackOpcodeSig(sig, dest_type, src1_type, src2_type, src3_type);
uint32_t out_desttype = (uint32_t)dest_type; InstrKeyValue out_desttype = (InstrKeyValue)dest_type;
uint32_t out_src1type = (uint32_t)src1_type; InstrKeyValue out_src1type = (InstrKeyValue)src1_type;
uint32_t out_src2type = (uint32_t)src2_type; InstrKeyValue out_src2type = (InstrKeyValue)src2_type;
uint32_t out_src3type = (uint32_t)src3_type; InstrKeyValue out_src3type = (InstrKeyValue)src3_type;
Value* destv = i->dest; Value* destv = i->dest;
// pre-deref, even if not value // pre-deref, even if not value
@ -105,7 +105,7 @@ union InstrKey {
template <Opcode OPCODE, KeyType DEST = KEY_TYPE_X, KeyType SRC1 = KEY_TYPE_X, template <Opcode OPCODE, KeyType DEST = KEY_TYPE_X, KeyType SRC1 = KEY_TYPE_X,
KeyType SRC2 = KEY_TYPE_X, KeyType SRC3 = KEY_TYPE_X> KeyType SRC2 = KEY_TYPE_X, KeyType SRC3 = KEY_TYPE_X>
struct Construct { struct Construct {
static const uint32_t value = static const InstrKeyValue value =
(OPCODE) | (DEST << 8) | (SRC1 << 13) | (SRC2 << 18) | (SRC3 << 23); (OPCODE) | (DEST << 8) | (SRC1 << 13) | (SRC2 << 18) | (SRC3 << 23);
}; };
}; };
@ -307,8 +307,8 @@ struct I<OPCODE, DEST> : DestField<DEST> {
protected: protected:
template <typename SEQ, typename T> template <typename SEQ, typename T>
friend struct Sequence; friend struct Sequence;
bool Load(const Instr* i) { bool Load(const Instr* i, InstrKeyValue kv) {
if (InstrKey(i).value == key && BASE::LoadDest(i)) { if (kv == key && BASE::LoadDest(i)) {
instr = i; instr = i;
return true; return true;
} }
@ -329,8 +329,8 @@ struct I<OPCODE, DEST, SRC1> : DestField<DEST> {
protected: protected:
template <typename SEQ, typename T> template <typename SEQ, typename T>
friend struct Sequence; friend struct Sequence;
bool Load(const Instr* i) { bool Load(const Instr* i, InstrKeyValue kv) {
if (InstrKey(i).value == key && BASE::LoadDest(i)) { if (kv == key && BASE::LoadDest(i)) {
instr = i; instr = i;
src1.Load(i->src1); src1.Load(i->src1);
return true; return true;
@ -355,8 +355,8 @@ struct I<OPCODE, DEST, SRC1, SRC2> : DestField<DEST> {
protected: protected:
template <typename SEQ, typename T> template <typename SEQ, typename T>
friend struct Sequence; friend struct Sequence;
bool Load(const Instr* i) { bool Load(const Instr* i, InstrKeyValue kv) {
if (InstrKey(i).value == key && BASE::LoadDest(i)) { if (kv == key && BASE::LoadDest(i)) {
instr = i; instr = i;
src1.Load(i->src1); src1.Load(i->src1);
src2.Load(i->src2); src2.Load(i->src2);
@ -385,8 +385,8 @@ struct I<OPCODE, DEST, SRC1, SRC2, SRC3> : DestField<DEST> {
protected: protected:
template <typename SEQ, typename T> template <typename SEQ, typename T>
friend struct Sequence; friend struct Sequence;
bool Load(const Instr* i) { bool Load(const Instr* i, InstrKeyValue ikey) {
if (InstrKey(i).value == key && BASE::LoadDest(i)) { if (ikey == key && BASE::LoadDest(i)) {
instr = i; instr = i;
src1.Load(i->src1); src1.Load(i->src1);
src2.Load(i->src2); src2.Load(i->src2);
@ -422,9 +422,9 @@ struct Sequence {
static constexpr uint32_t head_key() { return T::key; } static constexpr uint32_t head_key() { return T::key; }
static bool Select(X64Emitter& e, const Instr* i) { static bool Select(X64Emitter& e, const Instr* i, InstrKeyValue ikey) {
T args; T args;
if (!args.Load(i)) { if (!args.Load(i, ikey)) {
return false; return false;
} }
SEQ::Emit(e, args); SEQ::Emit(e, args);
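InstrKey packs the opcode plus the dest/src1/src2/src3 operand type codes into one 32-bit value (8 + 5 + 5 + 5 + 5 bits, 4 reserved), so selecting a sequence is a single integer compare; passing the precomputed key into Load avoids rebuilding it per candidate. The same packing as a standalone sketch, field offsets taken from Construct above:
#include <cstdint>
static constexpr uint32_t PackInstrKey(uint32_t opcode, uint32_t dest,
                                       uint32_t src1, uint32_t src2,
                                       uint32_t src3) {
  return opcode | (dest << 8) | (src1 << 13) | (src2 << 18) | (src3 << 23);
}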


@ -27,12 +27,6 @@ static void EmitFusedBranch(X64Emitter& e, const T& i) {
if (valid) { if (valid) {
auto name = i.src2.value->name; auto name = i.src2.value->name;
switch (opcode) { switch (opcode) {
case OPCODE_IS_TRUE:
e.jnz(name, e.T_NEAR);
break;
case OPCODE_IS_FALSE:
e.jz(name, e.T_NEAR);
break;
case OPCODE_COMPARE_EQ: case OPCODE_COMPARE_EQ:
e.je(name, e.T_NEAR); e.je(name, e.T_NEAR);
break; break;
@ -299,26 +293,14 @@ struct CALL_TRUE_I64
struct CALL_TRUE_F32 struct CALL_TRUE_F32
: Sequence<CALL_TRUE_F32, I<OPCODE_CALL_TRUE, VoidOp, F32Op, SymbolOp>> { : Sequence<CALL_TRUE_F32, I<OPCODE_CALL_TRUE, VoidOp, F32Op, SymbolOp>> {
static void Emit(X64Emitter& e, const EmitArgType& i) { static void Emit(X64Emitter& e, const EmitArgType& i) {
assert_true(i.src2.value->is_guest()); assert_impossible_sequence(CALL_TRUE_F32);
e.vptest(i.src1, i.src1);
Xbyak::Label skip;
e.jz(skip);
e.Call(i.instr, static_cast<GuestFunction*>(i.src2.value));
e.L(skip);
e.ForgetMxcsrMode();
} }
}; };
struct CALL_TRUE_F64 struct CALL_TRUE_F64
: Sequence<CALL_TRUE_F64, I<OPCODE_CALL_TRUE, VoidOp, F64Op, SymbolOp>> { : Sequence<CALL_TRUE_F64, I<OPCODE_CALL_TRUE, VoidOp, F64Op, SymbolOp>> {
static void Emit(X64Emitter& e, const EmitArgType& i) { static void Emit(X64Emitter& e, const EmitArgType& i) {
assert_true(i.src2.value->is_guest()); assert_impossible_sequence(CALL_TRUE_F64);
e.vptest(i.src1, i.src1);
Xbyak::Label skip;
e.jz(skip);
e.Call(i.instr, static_cast<GuestFunction*>(i.src2.value));
e.L(skip);
e.ForgetMxcsrMode();
} }
}; };
EMITTER_OPCODE_TABLE(OPCODE_CALL_TRUE, CALL_TRUE_I8, CALL_TRUE_I16, EMITTER_OPCODE_TABLE(OPCODE_CALL_TRUE, CALL_TRUE_I8, CALL_TRUE_I16,
@ -404,22 +386,14 @@ struct CALL_INDIRECT_TRUE_F32
: Sequence<CALL_INDIRECT_TRUE_F32, : Sequence<CALL_INDIRECT_TRUE_F32,
I<OPCODE_CALL_INDIRECT_TRUE, VoidOp, F32Op, I64Op>> { I<OPCODE_CALL_INDIRECT_TRUE, VoidOp, F32Op, I64Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) { static void Emit(X64Emitter& e, const EmitArgType& i) {
e.vptest(i.src1, i.src1); assert_impossible_sequence(CALL_INDIRECT_TRUE_F32);
Xbyak::Label skip;
e.jz(skip, CodeGenerator::T_NEAR);
e.CallIndirect(i.instr, i.src2);
e.L(skip);
} }
}; };
struct CALL_INDIRECT_TRUE_F64 struct CALL_INDIRECT_TRUE_F64
: Sequence<CALL_INDIRECT_TRUE_F64, : Sequence<CALL_INDIRECT_TRUE_F64,
I<OPCODE_CALL_INDIRECT_TRUE, VoidOp, F64Op, I64Op>> { I<OPCODE_CALL_INDIRECT_TRUE, VoidOp, F64Op, I64Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) { static void Emit(X64Emitter& e, const EmitArgType& i) {
e.vptest(i.src1, i.src1); assert_impossible_sequence(CALL_INDIRECT_TRUE_F64);
Xbyak::Label skip;
e.jz(skip, CodeGenerator::T_NEAR);
e.CallIndirect(i.instr, i.src2);
e.L(skip);
} }
}; };
EMITTER_OPCODE_TABLE(OPCODE_CALL_INDIRECT_TRUE, CALL_INDIRECT_TRUE_I8, EMITTER_OPCODE_TABLE(OPCODE_CALL_INDIRECT_TRUE, CALL_INDIRECT_TRUE_I8,
@ -486,15 +460,13 @@ struct RETURN_TRUE_I64
struct RETURN_TRUE_F32 struct RETURN_TRUE_F32
: Sequence<RETURN_TRUE_F32, I<OPCODE_RETURN_TRUE, VoidOp, F32Op>> { : Sequence<RETURN_TRUE_F32, I<OPCODE_RETURN_TRUE, VoidOp, F32Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) { static void Emit(X64Emitter& e, const EmitArgType& i) {
e.vptest(i.src1, i.src1); assert_impossible_sequence(RETURN_TRUE_F32);
e.jnz(e.epilog_label(), CodeGenerator::T_NEAR);
} }
}; };
struct RETURN_TRUE_F64 struct RETURN_TRUE_F64
: Sequence<RETURN_TRUE_F64, I<OPCODE_RETURN_TRUE, VoidOp, F64Op>> { : Sequence<RETURN_TRUE_F64, I<OPCODE_RETURN_TRUE, VoidOp, F64Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) { static void Emit(X64Emitter& e, const EmitArgType& i) {
e.vptest(i.src1, i.src1); assert_impossible_sequence(RETURN_TRUE_F64);
e.jnz(e.epilog_label(), CodeGenerator::T_NEAR);
} }
}; };
EMITTER_OPCODE_TABLE(OPCODE_RETURN_TRUE, RETURN_TRUE_I8, RETURN_TRUE_I16, EMITTER_OPCODE_TABLE(OPCODE_RETURN_TRUE, RETURN_TRUE_I8, RETURN_TRUE_I16,
@ -553,33 +525,25 @@ struct BRANCH_TRUE_I64
struct BRANCH_TRUE_F32 struct BRANCH_TRUE_F32
: Sequence<BRANCH_TRUE_F32, I<OPCODE_BRANCH_TRUE, VoidOp, F32Op, LabelOp>> { : Sequence<BRANCH_TRUE_F32, I<OPCODE_BRANCH_TRUE, VoidOp, F32Op, LabelOp>> {
static void Emit(X64Emitter& e, const EmitArgType& i) { static void Emit(X64Emitter& e, const EmitArgType& i) {
if (i.instr->prev && i.instr->prev->opcode == &OPCODE_IS_TRUE_info && /*
i.instr->prev->dest == i.src1.value) { chrispy: right now, im not confident that we are always clearing
e.jnz(i.src2.value->name, e.T_NEAR); the upper 96 bits of registers, making vptest extremely unsafe. many
} else if (i.instr->prev && ss/sd operations copy over the upper 96 from the source, and for abs we
i.instr->prev->opcode == &OPCODE_IS_FALSE_info && negate ALL elements, making the top 64 bits contain 0x80000000 etc
i.instr->prev->dest == i.src1.value) { */
e.jz(i.src2.value->name, e.T_NEAR); Xmm input = GetInputRegOrConstant(e, i.src1, e.xmm0);
} else { e.vmovd(e.eax, input);
e.vptest(i.src1, i.src1); e.test(e.eax, e.eax);
e.jnz(i.src2.value->name, e.T_NEAR); e.jnz(i.src2.value->name, e.T_NEAR);
}
} }
}; };
struct BRANCH_TRUE_F64 struct BRANCH_TRUE_F64
: Sequence<BRANCH_TRUE_F64, I<OPCODE_BRANCH_TRUE, VoidOp, F64Op, LabelOp>> { : Sequence<BRANCH_TRUE_F64, I<OPCODE_BRANCH_TRUE, VoidOp, F64Op, LabelOp>> {
static void Emit(X64Emitter& e, const EmitArgType& i) { static void Emit(X64Emitter& e, const EmitArgType& i) {
if (i.instr->prev && i.instr->prev->opcode == &OPCODE_IS_TRUE_info && Xmm input = GetInputRegOrConstant(e, i.src1, e.xmm0);
i.instr->prev->dest == i.src1.value) { e.vmovq(e.rax, input);
e.jnz(i.src2.value->name, e.T_NEAR); e.test(e.rax, e.rax);
} else if (i.instr->prev && e.jnz(i.src2.value->name, e.T_NEAR);
i.instr->prev->opcode == &OPCODE_IS_FALSE_info &&
i.instr->prev->dest == i.src1.value) {
e.jz(i.src2.value->name, e.T_NEAR);
} else {
e.vptest(i.src1, i.src1);
e.jnz(i.src2.value->name, e.T_NEAR);
}
} }
}; };
EMITTER_OPCODE_TABLE(OPCODE_BRANCH_TRUE, BRANCH_TRUE_I8, BRANCH_TRUE_I16, EMITTER_OPCODE_TABLE(OPCODE_BRANCH_TRUE, BRANCH_TRUE_I8, BRANCH_TRUE_I16,
@ -624,7 +588,9 @@ struct BRANCH_FALSE_F32
: Sequence<BRANCH_FALSE_F32, : Sequence<BRANCH_FALSE_F32,
I<OPCODE_BRANCH_FALSE, VoidOp, F32Op, LabelOp>> { I<OPCODE_BRANCH_FALSE, VoidOp, F32Op, LabelOp>> {
static void Emit(X64Emitter& e, const EmitArgType& i) { static void Emit(X64Emitter& e, const EmitArgType& i) {
e.vptest(i.src1, i.src1); Xmm input = GetInputRegOrConstant(e, i.src1, e.xmm0);
e.vmovd(e.eax, input);
e.test(e.eax, e.eax);
e.jz(i.src2.value->name, e.T_NEAR); e.jz(i.src2.value->name, e.T_NEAR);
} }
}; };
@ -632,7 +598,9 @@ struct BRANCH_FALSE_F64
: Sequence<BRANCH_FALSE_F64, : Sequence<BRANCH_FALSE_F64,
I<OPCODE_BRANCH_FALSE, VoidOp, F64Op, LabelOp>> { I<OPCODE_BRANCH_FALSE, VoidOp, F64Op, LabelOp>> {
static void Emit(X64Emitter& e, const EmitArgType& i) { static void Emit(X64Emitter& e, const EmitArgType& i) {
e.vptest(i.src1, i.src1); Xmm input = GetInputRegOrConstant(e, i.src1, e.xmm0);
e.vmovq(e.rax, input);
e.test(e.rax, e.rax);
e.jz(i.src2.value->name, e.T_NEAR); e.jz(i.src2.value->name, e.T_NEAR);
} }
}; };
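The branch sequences above stop using vptest (which tests all 128 bits of the source register) and instead move just the low scalar lane to a GPR and test that, since the upper lanes of an F32/F64 value are not guaranteed to be zeroed. A scalar sketch of the semantics now being emitted, using SSE intrinsics as stand-ins for the generated vmovd/test/jnz:
#include <cstdint>
#include <immintrin.h>
static bool BranchTrueF32(__m128 value) {
  // Only the low 32 bits participate; garbage in the upper lanes is ignored.
  uint32_t bits =
      static_cast<uint32_t>(_mm_cvtsi128_si32(_mm_castps_si128(value)));
  return bits != 0;  // note: -0.0f (0x80000000) still counts as taken
}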


@ -975,6 +975,9 @@ static bool IsPossibleMMIOInstruction(X64Emitter& e, const hir::Instr* i) {
if (!cvars::emit_mmio_aware_stores_for_recorded_exception_addresses) { if (!cvars::emit_mmio_aware_stores_for_recorded_exception_addresses) {
return false; return false;
} }
if (IsTracingData()) { // incompatible with tracing
return false;
}
uint32_t guestaddr = i->GuestAddressFor(); uint32_t guestaddr = i->GuestAddressFor();
if (!guestaddr) { if (!guestaddr) {
return false; return false;
@ -984,7 +987,54 @@ static bool IsPossibleMMIOInstruction(X64Emitter& e, const hir::Instr* i) {
return flags && flags->accessed_mmio; return flags && flags->accessed_mmio;
} }
template <typename T, bool swap>
static void MMIOAwareStore(void* _ctx, unsigned int guestaddr, T value) {
if (swap) {
value = xe::byte_swap(value);
}
if (guestaddr >= 0xE0000000) {
guestaddr += 0x1000;
}
auto ctx = reinterpret_cast<ppc::PPCContext*>(_ctx);
auto gaddr = ctx->processor->memory()->LookupVirtualMappedRange(guestaddr);
if (!gaddr) {
*reinterpret_cast<T*>(ctx->virtual_membase + guestaddr) = value;
} else {
value = xe::byte_swap(value); /*
was having issues, found by comparing the values used with exceptions
to these that we were reversed...
*/
gaddr->write(nullptr, gaddr->callback_context, guestaddr, value);
}
}
template <typename T, bool swap>
static T MMIOAwareLoad(void* _ctx, unsigned int guestaddr) {
T value;
if (guestaddr >= 0xE0000000) {
guestaddr += 0x1000;
}
auto ctx = reinterpret_cast<ppc::PPCContext*>(_ctx);
auto gaddr = ctx->processor->memory()->LookupVirtualMappedRange(guestaddr);
if (!gaddr) {
value = *reinterpret_cast<T*>(ctx->virtual_membase + guestaddr);
if (swap) {
value = xe::byte_swap(value);
}
} else {
/*
was having issues, found by comparing the values used with exceptions
to these that we were reversed...
*/
value = gaddr->read(nullptr, gaddr->callback_context, guestaddr);
}
return value;
}
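MMIOAwareLoad and MMIOAwareStore are plain helpers the emitter calls through CallNativeSafe when a load/store site has previously faulted on an MMIO range: they re-check LookupVirtualMappedRange at runtime and either invoke the registered callback or fall through to ordinary guest memory. A sketch of the direct-call equivalent of what the generated code does for a byte-swapped 32-bit load; the wrapper name is hypothetical:
static uint32_t LoadGuestU32(void* ppc_context, uint32_t guest_address) {
  // Matches the template instantiation selected when LOAD_STORE_BYTE_SWAP is set.
  return MMIOAwareLoad<uint32_t, true>(ppc_context, guest_address);
}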
// ============================================================================ // ============================================================================
// OPCODE_LOAD_OFFSET // OPCODE_LOAD_OFFSET
// ============================================================================ // ============================================================================
@ -1016,16 +1066,38 @@ struct LOAD_OFFSET_I16
struct LOAD_OFFSET_I32 struct LOAD_OFFSET_I32
: Sequence<LOAD_OFFSET_I32, I<OPCODE_LOAD_OFFSET, I32Op, I64Op, I64Op>> { : Sequence<LOAD_OFFSET_I32, I<OPCODE_LOAD_OFFSET, I32Op, I64Op, I64Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) { static void Emit(X64Emitter& e, const EmitArgType& i) {
auto addr = ComputeMemoryAddressOffset(e, i.src1, i.src2); if (IsPossibleMMIOInstruction(e, i.instr)) {
if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { void* addrptr = (void*)&MMIOAwareLoad<uint32_t, false>;
if (e.IsFeatureEnabled(kX64EmitMovbe)) {
e.movbe(i.dest, e.dword[addr]); if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) {
addrptr = (void*)&MMIOAwareLoad<uint32_t, true>;
}
if (i.src1.is_constant) {
e.mov(e.GetNativeParam(0).cvt32(), (uint32_t)i.src1.constant());
} else {
e.mov(e.GetNativeParam(0).cvt32(), i.src1.reg().cvt32());
}
if (i.src2.is_constant) {
e.add(e.GetNativeParam(0).cvt32(), (uint32_t)i.src2.constant());
} else {
e.add(e.GetNativeParam(0).cvt32(), i.src2.reg().cvt32());
}
e.CallNativeSafe(addrptr);
e.mov(i.dest, e.eax);
} else {
auto addr = ComputeMemoryAddressOffset(e, i.src1, i.src2);
if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) {
if (e.IsFeatureEnabled(kX64EmitMovbe)) {
e.movbe(i.dest, e.dword[addr]);
} else {
e.mov(i.dest, e.dword[addr]);
e.bswap(i.dest);
}
} else { } else {
e.mov(i.dest, e.dword[addr]); e.mov(i.dest, e.dword[addr]);
e.bswap(i.dest);
} }
} else {
e.mov(i.dest, e.dword[addr]);
} }
} }
}; };
@ -1049,28 +1121,6 @@ struct LOAD_OFFSET_I64
EMITTER_OPCODE_TABLE(OPCODE_LOAD_OFFSET, LOAD_OFFSET_I8, LOAD_OFFSET_I16, EMITTER_OPCODE_TABLE(OPCODE_LOAD_OFFSET, LOAD_OFFSET_I8, LOAD_OFFSET_I16,
LOAD_OFFSET_I32, LOAD_OFFSET_I64); LOAD_OFFSET_I32, LOAD_OFFSET_I64);
template <typename T, bool swap>
static void MMIOAwareStore(void* _ctx, unsigned int guestaddr, T value) {
if (swap) {
value = xe::byte_swap(value);
}
if (guestaddr >= 0xE0000000) {
guestaddr += 0x1000;
}
auto ctx = reinterpret_cast<ppc::PPCContext*>(_ctx);
auto gaddr = ctx->processor->memory()->LookupVirtualMappedRange(guestaddr);
if (!gaddr) {
*reinterpret_cast<T*>(ctx->virtual_membase + guestaddr) = value;
} else {
value = xe::byte_swap(value); /*
was having issues, found by comparing the values used with exceptions
to these that we were reversed...
*/
gaddr->write(nullptr, gaddr->callback_context, guestaddr, value);
}
}
// ============================================================================ // ============================================================================
// OPCODE_STORE_OFFSET // OPCODE_STORE_OFFSET
// ============================================================================ // ============================================================================
@ -1225,21 +1275,37 @@ struct LOAD_I16 : Sequence<LOAD_I16, I<OPCODE_LOAD, I16Op, I64Op>> {
}; };
struct LOAD_I32 : Sequence<LOAD_I32, I<OPCODE_LOAD, I32Op, I64Op>> { struct LOAD_I32 : Sequence<LOAD_I32, I<OPCODE_LOAD, I32Op, I64Op>> {
static void Emit(X64Emitter& e, const EmitArgType& i) { static void Emit(X64Emitter& e, const EmitArgType& i) {
auto addr = ComputeMemoryAddress(e, i.src1); if (IsPossibleMMIOInstruction(e, i.instr)) {
if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { void* addrptr = (void*)&MMIOAwareLoad<uint32_t, false>;
if (e.IsFeatureEnabled(kX64EmitMovbe)) {
e.movbe(i.dest, e.dword[addr]); if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) {
addrptr = (void*)&MMIOAwareLoad<uint32_t, true>;
}
if (i.src1.is_constant) {
e.mov(e.GetNativeParam(0).cvt32(), (uint32_t)i.src1.constant());
} else {
e.mov(e.GetNativeParam(0).cvt32(), i.src1.reg().cvt32());
}
e.CallNativeSafe(addrptr);
e.mov(i.dest, e.eax);
} else {
auto addr = ComputeMemoryAddress(e, i.src1);
if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) {
if (e.IsFeatureEnabled(kX64EmitMovbe)) {
e.movbe(i.dest, e.dword[addr]);
} else {
e.mov(i.dest, e.dword[addr]);
e.bswap(i.dest);
}
} else { } else {
e.mov(i.dest, e.dword[addr]); e.mov(i.dest, e.dword[addr]);
e.bswap(i.dest);
} }
} else { if (IsTracingData()) {
e.mov(i.dest, e.dword[addr]); e.mov(e.GetNativeParam(1).cvt32(), i.dest);
} e.lea(e.GetNativeParam(0), e.ptr[addr]);
if (IsTracingData()) { e.CallNative(reinterpret_cast<void*>(TraceMemoryLoadI32));
e.mov(e.GetNativeParam(1).cvt32(), i.dest); }
e.lea(e.GetNativeParam(0), e.ptr[addr]);
e.CallNative(reinterpret_cast<void*>(TraceMemoryLoadI32));
} }
} }
}; };
@ -1390,14 +1456,13 @@ struct STORE_I32 : Sequence<STORE_I32, I<OPCODE_STORE, VoidOp, I64Op, I32Op>> {
      } else {
        e.mov(e.dword[addr], i.src2);
      }
if (IsTracingData()) {
e.mov(e.GetNativeParam(1).cvt32(), e.dword[addr]);
e.lea(e.GetNativeParam(0), e.ptr[addr]);
e.CallNative(reinterpret_cast<void*>(TraceMemoryStoreI32));
}
} }
} }
if (IsTracingData()) {
auto addr = ComputeMemoryAddress(e, i.src1);
e.mov(e.GetNativeParam(1).cvt32(), e.dword[addr]);
e.lea(e.GetNativeParam(0), e.ptr[addr]);
e.CallNative(reinterpret_cast<void*>(TraceMemoryStoreI32));
}
  }
};
struct STORE_I64 : Sequence<STORE_I64, I<OPCODE_STORE, VoidOp, I64Op, I64Op>> {


@ -19,15 +19,15 @@
#include "xenia/base/cvar.h" #include "xenia/base/cvar.h"
#include "xenia/cpu/backend/x64/x64_stack_layout.h" #include "xenia/cpu/backend/x64/x64_stack_layout.h"
DEFINE_bool(xop_rotates, false, "rotate via xop", "X64"); DEFINE_bool(xop_rotates, false, "rotate via xop", "x64");
DEFINE_bool(xop_left_shifts, false, "shl via xop", "X64"); DEFINE_bool(xop_left_shifts, false, "shl via xop", "x64");
DEFINE_bool(xop_right_shifts, false, "shr via xop", "X64"); DEFINE_bool(xop_right_shifts, false, "shr via xop", "x64");
DEFINE_bool(xop_arithmetic_right_shifts, false, "sar via xop", "X64"); DEFINE_bool(xop_arithmetic_right_shifts, false, "sar via xop", "x64");
DEFINE_bool(xop_compares, true, "compare via xop", "X64"); DEFINE_bool(xop_compares, true, "compare via xop", "x64");
namespace xe {
namespace cpu {


@ -67,7 +67,7 @@ using namespace xe::cpu::hir;
using xe::cpu::hir::Instr; using xe::cpu::hir::Instr;
typedef bool (*SequenceSelectFn)(X64Emitter&, const Instr*); typedef bool (*SequenceSelectFn)(X64Emitter&, const Instr*, InstrKeyValue ikey);
std::unordered_map<uint32_t, SequenceSelectFn> sequence_table;
// ============================================================================
@ -868,59 +868,6 @@ static bool MayCombineSetxWithFollowingCtxStore(const hir::Instr* setx_insn,
} }
return false; return false;
} }
#define EMITTER_IS_TRUE(typ, tester) \
struct IS_TRUE_##typ \
: Sequence<IS_TRUE_##typ, I<OPCODE_IS_TRUE, I8Op, typ##Op>> { \
static void Emit(X64Emitter& e, const EmitArgType& i) { \
e.tester(i.src1, i.src1); \
unsigned ctxoffset = 0; \
if (MayCombineSetxWithFollowingCtxStore(i.instr, ctxoffset)) { \
e.setnz(e.byte[e.GetContextReg() + ctxoffset]); \
} else { \
e.setnz(i.dest); \
} \
} \
}
#define EMITTER_IS_TRUE_INT(typ) EMITTER_IS_TRUE(typ, test)
EMITTER_IS_TRUE_INT(I8);
EMITTER_IS_TRUE_INT(I16);
EMITTER_IS_TRUE_INT(I32);
EMITTER_IS_TRUE_INT(I64);
EMITTER_IS_TRUE(F32, vtestps);
EMITTER_IS_TRUE(F64, vtestpd);
EMITTER_IS_TRUE(V128, vptest);
EMITTER_OPCODE_TABLE(OPCODE_IS_TRUE, IS_TRUE_I8, IS_TRUE_I16, IS_TRUE_I32,
IS_TRUE_I64, IS_TRUE_F32, IS_TRUE_F64, IS_TRUE_V128);
#define EMITTER_IS_FALSE(typ, tester) \
struct IS_FALSE_##typ \
: Sequence<IS_FALSE_##typ, I<OPCODE_IS_FALSE, I8Op, typ##Op>> { \
static void Emit(X64Emitter& e, const EmitArgType& i) { \
e.tester(i.src1, i.src1); \
unsigned ctxoffset = 0; \
if (MayCombineSetxWithFollowingCtxStore(i.instr, ctxoffset)) { \
e.setz(e.byte[e.GetContextReg() + ctxoffset]); \
} else { \
e.setz(i.dest); \
} \
} \
}
#define EMITTER_IS_FALSE_INT(typ) EMITTER_IS_FALSE(typ, test)
EMITTER_IS_FALSE_INT(I8);
EMITTER_IS_FALSE_INT(I16);
EMITTER_IS_FALSE_INT(I32);
EMITTER_IS_FALSE_INT(I64);
EMITTER_IS_FALSE(F32, vtestps);
EMITTER_IS_FALSE(F64, vtestpd);
EMITTER_IS_FALSE(V128, vptest);
EMITTER_OPCODE_TABLE(OPCODE_IS_FALSE, IS_FALSE_I8, IS_FALSE_I16, IS_FALSE_I32,
IS_FALSE_I64, IS_FALSE_F32, IS_FALSE_F64, IS_FALSE_V128);
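The IS_TRUE/IS_FALSE emitter tables above are removed because the HIR builder now lowers truth tests into ordinary compares against zero (see the HIRBuilder::IsTrue/IsFalse changes later in this commit), so the generic COMPARE_EQ/COMPARE_NE sequences cover them. Roughly:

// IsTrue(x)  now builds  CompareNE(x, LoadZero(x->type))  -> I8 0/1
// IsFalse(x) now builds  CompareEQ(x, LoadZero(x->type))  -> I8 0/1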
// ============================================================================
// OPCODE_IS_NAN
@ -3308,7 +3255,7 @@ bool SelectSequence(X64Emitter* e, const Instr* i, const Instr** new_tail) {
  auto it = sequence_table.find(key);
  if (it != sequence_table.end()) {
    if (it->second(*e, i)) { if (it->second(*e, i, InstrKey(i))) {
      *new_tail = i->next;
      return true;
    }


@ -25,7 +25,7 @@ namespace x64 {
class X64Emitter;
typedef bool (*SequenceSelectFn)(X64Emitter&, const hir::Instr*); typedef bool (*SequenceSelectFn)(X64Emitter&, const hir::Instr*, uint32_t ikey);
extern std::unordered_map<uint32_t, SequenceSelectFn> sequence_table;
template <typename T>


@ -361,28 +361,6 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
} }
} }
break; break;
case OPCODE_IS_TRUE:
if (i->src1.value->IsConstant()) {
if (i->src1.value->IsConstantTrue()) {
v->set_constant(uint8_t(1));
} else {
v->set_constant(uint8_t(0));
}
i->Remove();
result = true;
}
break;
case OPCODE_IS_FALSE:
if (i->src1.value->IsConstant()) {
if (i->src1.value->IsConstantFalse()) {
v->set_constant(uint8_t(1));
} else {
v->set_constant(uint8_t(0));
}
i->Remove();
result = true;
}
break;
      case OPCODE_IS_NAN:
        if (i->src1.value->IsConstant()) {
          if (i->src1.value->type == FLOAT32_TYPE &&
@ -602,7 +580,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
        if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
          if (should_skip_because_of_float) {
            break;
          }
          v->set_from(i->src1.value);
          v->Max(i->src2.value);
          i->Remove();


@ -214,12 +214,7 @@ bool SimplificationPass::CheckBooleanXor1(hir::Instr* i,
  bool need_zx = (tunflags & MOVTUNNEL_MOVZX) != 0;
  Value* new_value = nullptr;
if (xorop == OPCODE_IS_FALSE) { if (xorop == OPCODE_COMPARE_EQ) {
new_value = builder->IsTrue(xordef->src1.value);
} else if (xorop == OPCODE_IS_TRUE) {
new_value = builder->IsFalse(xordef->src1.value);
} else if (xorop == OPCODE_COMPARE_EQ) {
    new_value = builder->CompareNE(xordef->src1.value, xordef->src2.value);
  } else if (xorop == OPCODE_COMPARE_NE) {
@ -294,7 +289,7 @@ bool SimplificationPass::CheckXor(hir::Instr* i, hir::HIRBuilder* builder) {
  return false;
}
bool SimplificationPass::Is1BitOpcode(hir::Opcode def_opcode) {
  return def_opcode >= OPCODE_IS_TRUE && def_opcode <= OPCODE_DID_SATURATE; return def_opcode >= OPCODE_COMPARE_EQ && def_opcode <= OPCODE_DID_SATURATE;
}
inline static uint64_t RotateOfSize(ScalarNZM nzm, unsigned rotation,
@ -804,24 +799,12 @@ bool SimplificationPass::CheckScalarConstCmp(hir::Instr* i,
  if (!var_definition) {
    return false;
  }
// x == 0 -> !x
if (cmpop == OPCODE_COMPARE_EQ && constant_unpacked == 0) {
i->Replace(&OPCODE_IS_FALSE_info, 0);
i->set_src1(variable);
return true;
}
// x != 0 -> !!x
if (cmpop == OPCODE_COMPARE_NE && constant_unpacked == 0) {
i->Replace(&OPCODE_IS_TRUE_info, 0);
i->set_src1(variable);
return true;
}
  if (cmpop == OPCODE_COMPARE_ULE &&
      constant_unpacked ==
          0) {  // less than or equal to zero = (== 0) = IS_FALSE
    i->Replace(&OPCODE_IS_FALSE_info, 0); i->opcode = &OPCODE_COMPARE_EQ_info;
    i->set_src1(variable);
    return true;
  }
  // todo: OPCODE_COMPARE_NE too?
@ -840,15 +823,20 @@ bool SimplificationPass::CheckScalarConstCmp(hir::Instr* i,
} }
if (cmpop == OPCODE_COMPARE_ULT && if (cmpop == OPCODE_COMPARE_ULT &&
constant_unpacked == 1) { // unsigned lt 1 means == 0 constant_unpacked == 1) { // unsigned lt 1 means == 0
i->Replace(&OPCODE_IS_FALSE_info, 0); // i->Replace(&OPCODE_IS_FALSE_info, 0);
i->set_src1(variable);
i->opcode = &OPCODE_COMPARE_EQ_info;
// i->set_src1(variable);
i->set_src2(builder->LoadZero(variable->type));
    return true;
  }
  if (cmpop == OPCODE_COMPARE_UGT &&
      constant_unpacked == 0) {  // unsigned gt 0 means != 0
i->Replace(&OPCODE_IS_TRUE_info, 0); // i->Replace(&OPCODE_IS_TRUE_info, 0);
i->set_src1(variable); // i->set_src1(variable);
i->opcode = &OPCODE_COMPARE_NE_info;
return true; return true;
} }
@ -870,8 +858,11 @@ bool SimplificationPass::CheckScalarConstCmp(hir::Instr* i,
  } else if (cmpop == OPCODE_COMPARE_SGT && signbit_definitely_0 &&
             constant_unpacked == 0) {
    // signbit cant be set, and checking if gt 0, so actually checking != 0
i->Replace(&OPCODE_IS_TRUE_info, 0); // i->Replace(&OPCODE_IS_TRUE_info, 0);
i->set_src1(variable);
// i->set_src1(variable);
i->opcode = &OPCODE_COMPARE_NE_info;
return true; return true;
} }
@ -885,9 +876,9 @@ bool SimplificationPass::CheckScalarConstCmp(hir::Instr* i,
Value* constant_replacement = nullptr; Value* constant_replacement = nullptr;
if (cmpop == OPCODE_COMPARE_EQ || cmpop == OPCODE_COMPARE_UGE) { if (cmpop == OPCODE_COMPARE_EQ || cmpop == OPCODE_COMPARE_UGE) {
repl = &OPCODE_IS_TRUE_info; repl = &OPCODE_COMPARE_NE_info;
} else if (cmpop == OPCODE_COMPARE_NE || cmpop == OPCODE_COMPARE_ULT) { } else if (cmpop == OPCODE_COMPARE_NE || cmpop == OPCODE_COMPARE_ULT) {
repl = &OPCODE_IS_FALSE_info; repl = &OPCODE_COMPARE_EQ_info;
} else if (cmpop == OPCODE_COMPARE_UGT) { } else if (cmpop == OPCODE_COMPARE_UGT) {
// impossible, cannot be greater than mask // impossible, cannot be greater than mask
@ -906,6 +897,7 @@ bool SimplificationPass::CheckScalarConstCmp(hir::Instr* i,
  if (repl) {
    i->Replace(repl, 0);
    i->set_src1(variable);
    i->set_src2(builder->LoadZero(variable->type));
    return true;
  }
  if (constant_replacement) {
@ -919,10 +911,16 @@ bool SimplificationPass::CheckScalarConstCmp(hir::Instr* i,
} }
bool SimplificationPass::CheckIsTrueIsFalse(hir::Instr* i, bool SimplificationPass::CheckIsTrueIsFalse(hir::Instr* i,
hir::HIRBuilder* builder) { hir::HIRBuilder* builder) {
bool istrue = i->opcode == &OPCODE_IS_TRUE_info; bool istrue = i->opcode == &OPCODE_COMPARE_NE_info;
bool isfalse = i->opcode == &OPCODE_IS_FALSE_info; bool isfalse = i->opcode == &OPCODE_COMPARE_EQ_info;
Value* input = i->src1.value; auto [input_cosntant, input] = i->BinaryValueArrangeAsConstAndVar();
if (!input_cosntant || input_cosntant->AsUint64() != 0) {
return false;
}
// Value* input = i->src1.value;
  TypeName input_type = input->type;
  if (!IsScalarIntegralType(input_type)) {
    return false;
@ -1012,8 +1010,10 @@ bool SimplificationPass::CheckSHRByConst(hir::Instr* i,
i->set_src1(isfalsetest); i->set_src1(isfalsetest);
} else { } else {
i->Replace(&OPCODE_IS_FALSE_info, 0); // i->Replace(&OPCODE_IS_FALSE_info, 0);
i->Replace(&OPCODE_COMPARE_EQ_info, 0);
    i->set_src1(lz_input);
    i->set_src2(builder->LoadZero(lz_input->type));
  }
  return true;
}
@ -1067,7 +1067,7 @@ bool SimplificationPass::SimplifyBitArith(hir::HIRBuilder* builder) {
  while (i) {
    // vector types use the same opcodes as scalar ones for AND/OR/XOR! we
    // don't handle these in our simplifications, so skip
    if (i->dest && IsScalarIntegralType(i->dest->type)) { if (i->AllScalarIntegral()) {
      Opcode iop = i->opcode->num;
      if (iop == OPCODE_OR) {
@ -1080,7 +1080,6 @@ bool SimplificationPass::SimplifyBitArith(hir::HIRBuilder* builder) {
        result |= CheckAdd(i, builder);
      } else if (IsScalarBasicCmp(iop)) {
        result |= CheckScalarConstCmp(i, builder);
} else if (iop == OPCODE_IS_FALSE || iop == OPCODE_IS_TRUE) {
        result |= CheckIsTrueIsFalse(i, builder);
      } else if (iop == OPCODE_SHR) {
        result |= CheckSHR(i, builder);


@ -1023,7 +1023,6 @@ Value* HIRBuilder::Truncate(Value* value, TypeName target_type) {
Value* HIRBuilder::Convert(Value* value, TypeName target_type,
                           RoundMode round_mode) {
  Instr* i =
      AppendInstr(OPCODE_CONVERT_info, round_mode, AllocValue(target_type));
  i->set_src1(value);
@ -1034,7 +1033,6 @@ Value* HIRBuilder::Convert(Value* value, TypeName target_type,
Value* HIRBuilder::Round(Value* value, RoundMode round_mode) {
  ASSERT_FLOAT_OR_VECTOR_TYPE(value);
  Instr* i =
      AppendInstr(OPCODE_ROUND_info, round_mode, AllocValue(value->type));
  i->set_src1(value);
@ -1248,7 +1246,34 @@ void HIRBuilder::Store(Value* address, Value* value, uint32_t store_flags) {
  i->set_src2(value);
  i->src3.value = NULL;
}
Value* HIRBuilder::LoadVectorLeft(Value* address) {
ASSERT_ADDRESS_TYPE(address);
Instr* i = AppendInstr(OPCODE_LVL_info, 0, AllocValue(VEC128_TYPE));
i->set_src1(address);
i->src2.value = i->src3.value = NULL;
return i->dest;
}
Value* HIRBuilder::LoadVectorRight(Value* address) {
ASSERT_ADDRESS_TYPE(address);
Instr* i = AppendInstr(OPCODE_LVR_info, 0, AllocValue(VEC128_TYPE));
i->set_src1(address);
i->src2.value = i->src3.value = NULL;
return i->dest;
}
void HIRBuilder::StoreVectorLeft(Value* address, Value* value) {
ASSERT_ADDRESS_TYPE(address);
Instr* i = AppendInstr(OPCODE_STVL_info, 0);
i->set_src1(address);
i->set_src2(value);
i->src3.value = NULL;
}
void HIRBuilder::StoreVectorRight(Value* address, Value* value) {
ASSERT_ADDRESS_TYPE(address);
Instr* i = AppendInstr(OPCODE_STVR_info, 0);
i->set_src1(address);
i->set_src2(value);
i->src3.value = NULL;
}
void HIRBuilder::Memset(Value* address, Value* value, Value* length) {
  ASSERT_ADDRESS_TYPE(address);
  ASSERT_TYPES_EQUAL(address, length);
@ -1283,7 +1308,7 @@ void HIRBuilder::SetNJM(Value* value) {
Value* HIRBuilder::Max(Value* value1, Value* value2) {
  ASSERT_TYPES_EQUAL(value1, value2);
  if (IsScalarIntegralType(value1->type) && value1->IsConstant() &&
      value2->IsConstant()) {
    return value1->Compare(OPCODE_COMPARE_SLT, value2) ? value2 : value1;
  }
@ -1351,27 +1376,51 @@ Value* HIRBuilder::Select(Value* cond, Value* value1, Value* value2) {
  i->set_src3(value2);
  return i->dest;
}
static Value* OrLanes32(HIRBuilder& f, Value* value) {
hir::Value* v1 = f.Extract(value, (uint8_t)0, INT32_TYPE);
hir::Value* v2 = f.Extract(value, (uint8_t)1, INT32_TYPE);
hir::Value* v3 = f.Extract(value, (uint8_t)2, INT32_TYPE);
hir::Value* ored = f.Or(v1, v2);
hir::Value* v4 = f.Extract(value, (uint8_t)3, INT32_TYPE);
ored = f.Or(ored, v3);
ored = f.Or(ored, v4);
return ored;
}
Value* HIRBuilder::IsTrue(Value* value) { Value* HIRBuilder::IsTrue(Value* value) {
assert_true(value);
if (value->type == VEC128_TYPE) {
// chrispy: this probably doesnt happen often enough to be worth its own
// opcode or special code path but this could be optimized to not require as
// many extracts, we can shuffle and or v128 and then extract the low
return CompareEQ(OrLanes32(*this, value), LoadZeroInt32());
}
  if (value->IsConstant()) {
    return LoadConstantInt8(value->IsConstantTrue() ? 1 : 0);
  }
Instr* i = AppendInstr(OPCODE_IS_TRUE_info, 0, AllocValue(INT8_TYPE)); return CompareNE(value, LoadZero(value->type));
i->set_src1(value);
i->src2.value = i->src3.value = NULL;
return i->dest;
} }
Value* HIRBuilder::IsFalse(Value* value) { Value* HIRBuilder::IsFalse(Value* value) {
assert_true(value);
if (value->type == VEC128_TYPE) {
// chrispy: this probably doesnt happen often enough to be worth its own
// opcode or special code path but this could be optimized to not require as
// many extracts, we can shuffle and or v128 and then extract the low
return CompareEQ(OrLanes32(*this, value), LoadZeroInt32());
}
  if (value->IsConstant()) {
    return LoadConstantInt8(value->IsConstantFalse() ? 1 : 0);
  }
Instr* i = AppendInstr(OPCODE_IS_FALSE_info, 0, AllocValue(INT8_TYPE)); return CompareEQ(value, LoadZero(value->type));
i->set_src1(value);
i->src2.value = i->src3.value = NULL;
return i->dest;
} }
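The comment in IsTrue/IsFalse above notes that the extract-heavy OrLanes32 could be replaced by shuffling and ORing the vector against itself. An editorial sketch of that idea at the host SSE level (not part of this commit; expressing it in HIR would need a vector shuffle path here, which is why the extract-based OrLanes32 is used for now):

// __m128i v01 = _mm_or_si128(v, _mm_shuffle_epi32(v, _MM_SHUFFLE(2, 3, 0, 1)));
// __m128i all = _mm_or_si128(v01, _mm_shuffle_epi32(v01, _MM_SHUFFLE(1, 0, 3, 2)));
// uint32_t any_lane_bits = uint32_t(_mm_cvtsi128_si32(all));  // nonzero iff any lane is nonzero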
Value* HIRBuilder::IsNan(Value* value) { Value* HIRBuilder::IsNan(Value* value) {


@ -166,6 +166,11 @@ class HIRBuilder {
              uint32_t store_flags = 0);
  Value* Load(Value* address, TypeName type, uint32_t load_flags = 0);
Value* LoadVectorLeft(Value* address);
Value* LoadVectorRight(Value* address);
void StoreVectorLeft(Value* address, Value* value);
void StoreVectorRight(Value* address, Value* value);
  void Store(Value* address, Value* value, uint32_t store_flags = 0);
  void Memset(Value* address, Value* value, Value* length);
  void CacheControl(Value* address, size_t cache_line_size,
@ -268,6 +273,7 @@ class HIRBuilder {
                     Value* new_value);
  Value* AtomicAdd(Value* address, Value* value);
  Value* AtomicSub(Value* address, Value* value);
  void SetNJM(Value* value);
 protected:


@ -213,7 +213,19 @@ uint32_t Instr::GuestAddressFor() const {
return 0; // eek. return 0; // eek.
} }
bool Instr::AllScalarIntegral() {
bool result = true;
if (dest) {
if (!IsScalarIntegralType(dest->type)) {
return false;
}
}
VisitValueOperands([&result](Value* v, uint32_t idx) {
result = result && IsScalarIntegralType(v->type);
});
return result;
}
} // namespace hir
} // namespace cpu
} // namespace xe


@ -171,6 +171,8 @@ if both are constant, return nullptr, nullptr
  const hir::Instr* GetNonFakePrev() const;
  uint32_t GuestAddressFor() const;
bool AllScalarIntegral(); // dest and all srcs are scalar integral
}; };
} // namespace hir } // namespace hir


@ -210,10 +210,10 @@ enum Opcode {
  OPCODE_STORE,
  // chrispy: todo: implement, our current codegen for the unaligned loads is
  // very bad
OPCODE_LVLX, OPCODE_LVL,
OPCODE_LVRX, OPCODE_LVR,
OPCODE_STVLX, OPCODE_STVL,
OPCODE_STVRX, OPCODE_STVR,
  OPCODE_MEMSET,
  OPCODE_CACHE_CONTROL,
  OPCODE_MEMORY_BARRIER,
@ -222,8 +222,6 @@ enum Opcode {
  OPCODE_MIN,
  OPCODE_VECTOR_MIN,
  OPCODE_SELECT,
OPCODE_IS_TRUE,
OPCODE_IS_FALSE,
  OPCODE_IS_NAN,
  OPCODE_COMPARE_EQ,
  OPCODE_COMPARE_NE,


@ -303,17 +303,6 @@ DEFINE_OPCODE(
OPCODE_SIG_V_V_V_V, OPCODE_SIG_V_V_V_V,
0) 0)
DEFINE_OPCODE(
OPCODE_IS_TRUE,
"is_true",
OPCODE_SIG_V_V,
0)
DEFINE_OPCODE(
OPCODE_IS_FALSE,
"is_false",
OPCODE_SIG_V_V,
0)
DEFINE_OPCODE(
    OPCODE_IS_NAN,
@ -706,4 +695,27 @@ DEFINE_OPCODE(
OPCODE_SIG_X_V, OPCODE_SIG_X_V,
0 0
) )
DEFINE_OPCODE(
OPCODE_LVL,
"loadv_left",
OPCODE_SIG_V_V,
OPCODE_FLAG_MEMORY
)
DEFINE_OPCODE(
OPCODE_LVR,
"loadv_right",
OPCODE_SIG_V_V,
OPCODE_FLAG_MEMORY
)
DEFINE_OPCODE(
OPCODE_STVL,
"storev_left",
OPCODE_SIG_X_V_V,
OPCODE_FLAG_MEMORY)
DEFINE_OPCODE(
OPCODE_STVR,
"storev_right",
OPCODE_SIG_X_V_V,
OPCODE_FLAG_MEMORY)


@ -418,6 +418,10 @@ void PPCHIRBuilder::UpdateCR6(Value* src_value) {
  // Testing for all 1's and all 0's.
  // if (Rc) CR6 = all_equal | 0 | none_equal | 0
  // TODO(benvanik): efficient instruction?
// chrispy: nothing seems to write cr6_1, figure out if no documented
// instructions write anything other than 0 to it and remove these stores if
// so
  StoreContext(offsetof(PPCContext, cr6.cr6_1), LoadZeroInt8());
  StoreContext(offsetof(PPCContext, cr6.cr6_3), LoadZeroInt8());
  StoreContext(offsetof(PPCContext, cr6.cr6_all_equal),


@ -7,18 +7,17 @@
 ******************************************************************************
 */
#include "xenia/gpu/d3d12/d3d12_command_processor.h"
#include <algorithm>
#include <cstring>
#include <sstream>
#include <utility>
#include "xenia/base/assert.h"
#include "xenia/base/byte_order.h"
#include "xenia/base/cvar.h"
#include "xenia/base/logging.h"
#include "xenia/base/math.h"
#include "xenia/base/profiling.h"
#include "xenia/gpu/d3d12/d3d12_command_processor.h"
#include "xenia/gpu/d3d12/d3d12_graphics_system.h" #include "xenia/gpu/d3d12/d3d12_graphics_system.h"
#include "xenia/gpu/d3d12/d3d12_shader.h" #include "xenia/gpu/d3d12/d3d12_shader.h"
#include "xenia/gpu/draw_util.h" #include "xenia/gpu/draw_util.h"
@ -843,6 +842,7 @@ bool D3D12CommandProcessor::SetupContext() {
bool draw_resolution_scale_not_clamped = bool draw_resolution_scale_not_clamped =
TextureCache::GetConfigDrawResolutionScale(draw_resolution_scale_x, TextureCache::GetConfigDrawResolutionScale(draw_resolution_scale_x,
draw_resolution_scale_y); draw_resolution_scale_y);
if (!D3D12TextureCache::ClampDrawResolutionScaleToMaxSupported( if (!D3D12TextureCache::ClampDrawResolutionScaleToMaxSupported(
draw_resolution_scale_x, draw_resolution_scale_y, provider)) { draw_resolution_scale_x, draw_resolution_scale_y, provider)) {
draw_resolution_scale_not_clamped = false; draw_resolution_scale_not_clamped = false;
@ -1676,37 +1676,52 @@ void D3D12CommandProcessor::ShutdownContext() {
CommandProcessor::ShutdownContext(); CommandProcessor::ShutdownContext();
} }
// todo: bit-pack the bools and use bitarith to reduce branches
void D3D12CommandProcessor::WriteRegister(uint32_t index, uint32_t value) { void D3D12CommandProcessor::WriteRegister(uint32_t index, uint32_t value) {
CommandProcessor::WriteRegister(index, value); CommandProcessor::WriteRegister(index, value);
if (index >= XE_GPU_REG_SHADER_CONSTANT_000_X && bool cbuf_binding_float_pixel_utd = cbuffer_binding_float_pixel_.up_to_date;
index <= XE_GPU_REG_SHADER_CONSTANT_511_W) { bool cbuf_binding_float_vertex_utd = cbuffer_binding_float_vertex_.up_to_date;
if (frame_open_) { bool cbuf_binding_bool_loop_utd = cbuffer_binding_bool_loop_.up_to_date;
uint32_t float_constant_index =
(index - XE_GPU_REG_SHADER_CONSTANT_000_X) >> 2; if (index >= XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0 &&
if (float_constant_index >= 256) { index <= XE_GPU_REG_SHADER_CONSTANT_FETCH_31_5) {
float_constant_index -= 256; cbuffer_binding_fetch_.up_to_date = false;
if (current_float_constant_map_pixel_[float_constant_index >> 6] & // texture cache is never nullptr
(1ull << (float_constant_index & 63))) { // if (texture_cache_ != nullptr) {
cbuffer_binding_float_pixel_.up_to_date = false; texture_cache_->TextureFetchConstantWritten(
} (index - XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0) / 6);
} else { // }
if (current_float_constant_map_vertex_[float_constant_index >> 6] & } else {
(1ull << (float_constant_index & 63))) { if (!(cbuf_binding_float_pixel_utd | cbuf_binding_float_vertex_utd |
cbuffer_binding_float_vertex_.up_to_date = false; cbuf_binding_bool_loop_utd)) {
return;
}
if (index >= XE_GPU_REG_SHADER_CONSTANT_000_X &&
index <= XE_GPU_REG_SHADER_CONSTANT_511_W) {
if (!(cbuf_binding_float_pixel_utd | cbuf_binding_float_vertex_utd)) {
return;
}
if (frame_open_) {
uint32_t float_constant_index =
(index - XE_GPU_REG_SHADER_CONSTANT_000_X) >> 2;
if (float_constant_index >= 256) {
float_constant_index -= 256;
if (current_float_constant_map_pixel_[float_constant_index >> 6] &
(1ull << (float_constant_index & 63))) {
cbuffer_binding_float_pixel_.up_to_date = false;
}
} else {
if (current_float_constant_map_vertex_[float_constant_index >> 6] &
(1ull << (float_constant_index & 63))) {
cbuffer_binding_float_vertex_.up_to_date = false;
}
} }
} }
} } else if (index >= XE_GPU_REG_SHADER_CONSTANT_BOOL_000_031 &&
} else if (index >= XE_GPU_REG_SHADER_CONSTANT_BOOL_000_031 && index <= XE_GPU_REG_SHADER_CONSTANT_LOOP_31) {
index <= XE_GPU_REG_SHADER_CONSTANT_LOOP_31) { cbuffer_binding_bool_loop_.up_to_date = false;
cbuffer_binding_bool_loop_.up_to_date = false;
} else if (index >= XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0 &&
index <= XE_GPU_REG_SHADER_CONSTANT_FETCH_31_5) {
cbuffer_binding_fetch_.up_to_date = false;
if (texture_cache_ != nullptr) {
texture_cache_->TextureFetchConstantWritten(
(index - XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0) / 6);
} }
} }
} }
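The "bit-pack the bools" todo above refers to collapsing the three up_to_date flags into one mask so the early-out is a single test instead of three loads and an OR; a sketch of what that could look like (assumption, not in this commit):

// uint32_t utd_mask = (cbuffer_binding_float_pixel_.up_to_date ? 1u : 0u) |
//                     (cbuffer_binding_float_vertex_.up_to_date ? 2u : 0u) |
//                     (cbuffer_binding_bool_loop_.up_to_date ? 4u : 0u);
// if (!utd_mask) {
//   return;  // all three bindings are already dirty; nothing to invalidate
// }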
@ -2301,14 +2316,26 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type,
uint32_t draw_resolution_scale_x = texture_cache_->draw_resolution_scale_x(); uint32_t draw_resolution_scale_x = texture_cache_->draw_resolution_scale_x();
uint32_t draw_resolution_scale_y = texture_cache_->draw_resolution_scale_y(); uint32_t draw_resolution_scale_y = texture_cache_->draw_resolution_scale_y();
draw_util::ViewportInfo viewport_info; draw_util::ViewportInfo viewport_info;
draw_util::GetHostViewportInfo( draw_util::GetViewportInfoArgs gviargs{};
regs, draw_resolution_scale_x, draw_resolution_scale_y, true,
gviargs.Setup(
draw_resolution_scale_x, draw_resolution_scale_y,
texture_cache_->draw_resolution_scale_x_divisor(),
texture_cache_->draw_resolution_scale_y_divisor(), true,
D3D12_VIEWPORT_BOUNDS_MAX, D3D12_VIEWPORT_BOUNDS_MAX, false, D3D12_VIEWPORT_BOUNDS_MAX, D3D12_VIEWPORT_BOUNDS_MAX, false,
normalized_depth_control, normalized_depth_control,
host_render_targets_used && host_render_targets_used &&
render_target_cache_->depth_float24_convert_in_pixel_shader(), render_target_cache_->depth_float24_convert_in_pixel_shader(),
host_render_targets_used, pixel_shader && pixel_shader->writes_depth(), host_render_targets_used, pixel_shader && pixel_shader->writes_depth());
viewport_info); gviargs.SetupRegisterValues(regs);
if (gviargs == previous_viewport_info_args_) {
viewport_info = previous_viewport_info_;
} else {
draw_util::GetHostViewportInfo(&gviargs, viewport_info);
previous_viewport_info_args_ = gviargs;
previous_viewport_info_ = viewport_info;
}
draw_util::Scissor scissor; draw_util::Scissor scissor;
draw_util::GetScissor(regs, scissor); draw_util::GetScissor(regs, scissor);
scissor.offset[0] *= draw_resolution_scale_x; scissor.offset[0] *= draw_resolution_scale_x;
@ -2711,6 +2738,24 @@ void D3D12CommandProcessor::InitializeTrace() {
shared_memory_->InitializeTraceCompleteDownloads(); shared_memory_->InitializeTraceCompleteDownloads();
} }
} }
static void DmaPrefunc(dma::XeDMAJob* job) {
D3D12_RANGE readback_range;
readback_range.Begin = 0;
readback_range.End = job->size;
void* readback_mapping;
ID3D12Resource* readback_buffer = (ID3D12Resource*)job->userdata1;
HRESULT mapres = readback_buffer->Map(0, &readback_range, &readback_mapping);
xenia_assert(SUCCEEDED(mapres));
job->source = (uint8_t*)readback_mapping;
}
static void DmaPostfunc(dma::XeDMAJob* job) {
D3D12_RANGE readback_write_range = {};
ID3D12Resource* readback_buffer = (ID3D12Resource*)job->userdata1;
readback_buffer->Unmap(0, &readback_write_range);
}
bool D3D12CommandProcessor::IssueCopy() { bool D3D12CommandProcessor::IssueCopy() {
#if XE_UI_D3D12_FINE_GRAINED_DRAW_SCOPES #if XE_UI_D3D12_FINE_GRAINED_DRAW_SCOPES
@ -2736,17 +2781,35 @@ bool D3D12CommandProcessor::IssueCopy() {
readback_buffer, 0, shared_memory_buffer, written_address, readback_buffer, 0, shared_memory_buffer, written_address,
written_length); written_length);
if (AwaitAllQueueOperationsCompletion()) { if (AwaitAllQueueOperationsCompletion()) {
#if 1
D3D12_RANGE readback_range; D3D12_RANGE readback_range;
readback_range.Begin = 0; readback_range.Begin = 0;
readback_range.End = written_length; readback_range.End = written_length;
void* readback_mapping; void* readback_mapping;
if (SUCCEEDED( if (SUCCEEDED(
readback_buffer->Map(0, &readback_range, &readback_mapping))) { readback_buffer->Map(0, &readback_range, &readback_mapping))) {
std::memcpy(memory_->TranslatePhysical(written_address), // chrispy: this memcpy needs to be optimized as much as possible
readback_mapping, written_length);
auto physaddr = memory_->TranslatePhysical(written_address);
dma::vastcpy(physaddr, (uint8_t*)readback_mapping,
written_length);
// XEDmaCpy(physaddr, readback_mapping, written_length);
D3D12_RANGE readback_write_range = {}; D3D12_RANGE readback_write_range = {};
readback_buffer->Unmap(0, &readback_write_range); readback_buffer->Unmap(0, &readback_write_range);
} }
#else
dma::XeDMAJob job{};
job.destination = memory_->TranslatePhysical(written_address);
job.size = written_length;
job.source = nullptr;
job.userdata1 = (void*)readback_buffer;
job.precall = DmaPrefunc;
job.postcall = DmaPostfunc;
readback_available_ = GetDMAC()->PushDMAJob(&job);
#endif
} }
} }
} }
@ -3885,9 +3948,10 @@ bool D3D12CommandProcessor::UpdateBindings(
if (bool_loop_constants == nullptr) { if (bool_loop_constants == nullptr) {
return false; return false;
} }
std::memcpy(bool_loop_constants, xe::smallcpy_const<kBoolLoopConstantsSize>(
&regs[XE_GPU_REG_SHADER_CONSTANT_BOOL_000_031].u32, bool_loop_constants,
kBoolLoopConstantsSize); &regs[XE_GPU_REG_SHADER_CONSTANT_BOOL_000_031].u32);
cbuffer_binding_bool_loop_.up_to_date = true; cbuffer_binding_bool_loop_.up_to_date = true;
current_graphics_root_up_to_date_ &= current_graphics_root_up_to_date_ &=
~(1u << root_parameter_bool_loop_constants); ~(1u << root_parameter_bool_loop_constants);
@ -3901,9 +3965,9 @@ bool D3D12CommandProcessor::UpdateBindings(
if (fetch_constants == nullptr) { if (fetch_constants == nullptr) {
return false; return false;
} }
std::memcpy(fetch_constants, xe::smallcpy_const<kFetchConstantsSize>(
&regs[XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0].u32, fetch_constants, &regs[XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0].u32);
kFetchConstantsSize);
cbuffer_binding_fetch_.up_to_date = true; cbuffer_binding_fetch_.up_to_date = true;
current_graphics_root_up_to_date_ &= current_graphics_root_up_to_date_ &=
~(1u << root_parameter_fetch_constants); ~(1u << root_parameter_fetch_constants);
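xe::smallcpy_const<N> replaces the runtime-sized memcpy for the bool/loop and fetch constant uploads; its definition is not shown in this hunk. A minimal sketch of what such a helper amounts to (assumed shape; the compile-time size lets the compiler expand the copy into straight-line moves):

// template <size_t Size>
// static void smallcpy_const(void* dest, const void* src) {
//   std::memcpy(dest, src, Size);  // Size is a compile-time constant
// }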
@ -4542,6 +4606,12 @@ ID3D12Resource* D3D12CommandProcessor::RequestReadbackBuffer(uint32_t size) {
if (size == 0) { if (size == 0) {
return nullptr; return nullptr;
} }
#if 0
if (readback_available_) {
GetDMAC()->WaitJobDone(readback_available_);
readback_available_ = 0;
}
#endif
size = xe::align(size, kReadbackBufferSizeIncrement); size = xe::align(size, kReadbackBufferSizeIncrement);
if (size > readback_buffer_size_) { if (size > readback_buffer_size_) {
const ui::d3d12::D3D12Provider& provider = GetD3D12Provider(); const ui::d3d12::D3D12Provider& provider = GetD3D12Provider();
@ -4561,6 +4631,7 @@ ID3D12Resource* D3D12CommandProcessor::RequestReadbackBuffer(uint32_t size) {
readback_buffer_->Release(); readback_buffer_->Release();
} }
readback_buffer_ = buffer; readback_buffer_ = buffer;
readback_buffer_size_ = size;
} }
return readback_buffer_; return readback_buffer_;
} }


@ -1,3 +1,4 @@
/**
/**
 ******************************************************************************
 * Xenia : Xbox 360 Emulator Research Project *
@ -19,6 +20,7 @@
#include <utility>
#include "xenia/base/assert.h"
#include "xenia/base/dma.h"
#include "xenia/gpu/command_processor.h" #include "xenia/gpu/command_processor.h"
#include "xenia/gpu/d3d12/d3d12_graphics_system.h" #include "xenia/gpu/d3d12/d3d12_graphics_system.h"
#include "xenia/gpu/d3d12/d3d12_primitive_processor.h" #include "xenia/gpu/d3d12/d3d12_primitive_processor.h"
@ -581,6 +583,7 @@ class D3D12CommandProcessor final : public CommandProcessor {
static constexpr uint32_t kReadbackBufferSizeIncrement = 16 * 1024 * 1024; static constexpr uint32_t kReadbackBufferSizeIncrement = 16 * 1024 * 1024;
ID3D12Resource* readback_buffer_ = nullptr; ID3D12Resource* readback_buffer_ = nullptr;
dma::DMACJobHandle readback_available_ = 0;
uint32_t readback_buffer_size_ = 0; uint32_t readback_buffer_size_ = 0;
std::atomic<bool> pix_capture_requested_ = false; std::atomic<bool> pix_capture_requested_ = false;
@ -614,9 +617,11 @@ class D3D12CommandProcessor final : public CommandProcessor {
  DxbcShaderTranslator::SystemConstants system_constants_;
  // Float constant usage masks of the last draw call.
uint64_t current_float_constant_map_vertex_[4]; // chrispy: make sure accesses to these cant cross cacheline boundaries
uint64_t current_float_constant_map_pixel_[4]; struct alignas(XE_HOST_CACHE_LINE_SIZE) {
uint64_t current_float_constant_map_vertex_[4];
uint64_t current_float_constant_map_pixel_[4];
};
  // Constant buffer bindings.
  struct ConstantBufferBinding {
    D3D12_GPU_VIRTUAL_ADDRESS address;
@ -670,6 +675,9 @@ class D3D12CommandProcessor final : public CommandProcessor {
// Current primitive topology. // Current primitive topology.
D3D_PRIMITIVE_TOPOLOGY primitive_topology_; D3D_PRIMITIVE_TOPOLOGY primitive_topology_;
draw_util::GetViewportInfoArgs previous_viewport_info_args_;
draw_util::ViewportInfo previous_viewport_info_;
}; };
} // namespace d3d12 } // namespace d3d12


@ -406,14 +406,15 @@ bool D3D12SharedMemory::AllocateSparseHostGpuMemoryRange(
} }
bool D3D12SharedMemory::UploadRanges( bool D3D12SharedMemory::UploadRanges(
const std::pair<uint32_t, uint32_t>* upload_page_ranges, unsigned num_upload_page_ranges) { const std::pair<uint32_t, uint32_t>* upload_page_ranges,
unsigned num_upload_page_ranges) {
  if (!num_upload_page_ranges) {
    return true;
  }
  CommitUAVWritesAndTransitionBuffer(D3D12_RESOURCE_STATE_COPY_DEST);
  command_processor_.SubmitBarriers();
  auto& command_list = command_processor_.GetDeferredCommandList();
//for (auto upload_range : upload_page_ranges) { // for (auto upload_range : upload_page_ranges) {
  for (unsigned int i = 0; i < num_upload_page_ranges; ++i) {
    auto& upload_range = upload_page_ranges[i];
    uint32_t upload_range_start = upload_range.first;
@ -434,10 +435,20 @@ bool D3D12SharedMemory::UploadRanges(
} }
MakeRangeValid(upload_range_start << page_size_log2(), MakeRangeValid(upload_range_start << page_size_log2(),
uint32_t(upload_buffer_size), false, false); uint32_t(upload_buffer_size), false, false);
std::memcpy(
upload_buffer_mapping, if (upload_buffer_size < (1ULL << 32) && upload_buffer_size > 8192) {
memory().TranslatePhysical(upload_range_start << page_size_log2()), dma::vastcpy(
upload_buffer_size); upload_buffer_mapping,
memory().TranslatePhysical(upload_range_start << page_size_log2()),
static_cast<uint32_t>(upload_buffer_size));
swcache::WriteFence();
} else {
memcpy(
upload_buffer_mapping,
memory().TranslatePhysical(upload_range_start << page_size_log2()),
upload_buffer_size);
}
    command_list.D3DCopyBufferRegion(
        buffer_, upload_range_start << page_size_log2(), upload_buffer,
        UINT64(upload_buffer_offset), UINT64(upload_buffer_size));


@ -167,17 +167,17 @@ bool IsPixelShaderNeededWithRasterization(const Shader& shader,
    return false;
  }
void GetHostViewportInfo(const RegisterFile& regs, static float ViewportRecip2_0(float f) {
uint32_t draw_resolution_scale_x, float f1 = ArchReciprocalRefined(f);
uint32_t draw_resolution_scale_y, return f1 + f1;
bool origin_bottom_left, uint32_t x_max, }
uint32_t y_max, bool allow_reverse_z,
reg::RB_DEPTHCONTROL normalized_depth_control, // chrispy: todo, the int/float divides and the nan-checked mins show up
bool convert_z_to_float24, bool full_float24_in_0_to_1, // relatively high on uprof when i uc to 1.7ghz
bool pixel_shader_writes_depth, void GetHostViewportInfo(GetViewportInfoArgs* XE_RESTRICT args,
ViewportInfo& viewport_info_out) { ViewportInfo& viewport_info_out) {
assert_not_zero(draw_resolution_scale_x); assert_not_zero(args->draw_resolution_scale_x);
assert_not_zero(draw_resolution_scale_y); assert_not_zero(args->draw_resolution_scale_y);
// A vertex position goes the following path: // A vertex position goes the following path:
// //
@ -304,38 +304,32 @@ void GetHostViewportInfo(const RegisterFile& regs,
// TODO(Triang3l): Investigate the need for clamping of oDepth to 0...1 for // TODO(Triang3l): Investigate the need for clamping of oDepth to 0...1 for
// D24FS8 as well. // D24FS8 as well.
auto pa_cl_clip_cntl = regs.Get<reg::PA_CL_CLIP_CNTL>(); auto pa_cl_clip_cntl = args->pa_cl_clip_cntl;
auto pa_cl_vte_cntl = regs.Get<reg::PA_CL_VTE_CNTL>(); auto pa_cl_vte_cntl = args->pa_cl_vte_cntl;
auto pa_su_sc_mode_cntl = regs.Get<reg::PA_SU_SC_MODE_CNTL>(); auto pa_su_sc_mode_cntl = args->pa_su_sc_mode_cntl;
auto pa_su_vtx_cntl = regs.Get<reg::PA_SU_VTX_CNTL>(); auto pa_su_vtx_cntl = args->pa_su_vtx_cntl;
  // Obtain the original viewport values in a normalized way.
  float scale_xy[] = {
pa_cl_vte_cntl.vport_x_scale_ena ? regs[XE_GPU_REG_PA_CL_VPORT_XSCALE].f32
: 1.0f, pa_cl_vte_cntl.vport_x_scale_ena ? args->PA_CL_VPORT_XSCALE : 1.0f,
pa_cl_vte_cntl.vport_y_scale_ena ? regs[XE_GPU_REG_PA_CL_VPORT_YSCALE].f32 pa_cl_vte_cntl.vport_y_scale_ena ? args->PA_CL_VPORT_YSCALE : 1.0f,
: 1.0f,
}; };
float scale_z = pa_cl_vte_cntl.vport_z_scale_ena float scale_z =
? regs[XE_GPU_REG_PA_CL_VPORT_ZSCALE].f32 pa_cl_vte_cntl.vport_z_scale_ena ? args->PA_CL_VPORT_ZSCALE : 1.0f;
: 1.0f;
float offset_base_xy[] = { float offset_base_xy[] = {
pa_cl_vte_cntl.vport_x_offset_ena pa_cl_vte_cntl.vport_x_offset_ena ? args->PA_CL_VPORT_XOFFSET : 0.0f,
? regs[XE_GPU_REG_PA_CL_VPORT_XOFFSET].f32 pa_cl_vte_cntl.vport_y_offset_ena ? args->PA_CL_VPORT_YOFFSET : 0.0f,
: 0.0f,
pa_cl_vte_cntl.vport_y_offset_ena
? regs[XE_GPU_REG_PA_CL_VPORT_YOFFSET].f32
: 0.0f,
}; };
float offset_z = pa_cl_vte_cntl.vport_z_offset_ena float offset_z =
? regs[XE_GPU_REG_PA_CL_VPORT_ZOFFSET].f32 pa_cl_vte_cntl.vport_z_offset_ena ? args->PA_CL_VPORT_ZOFFSET : 0.0f;
: 0.0f;
// Calculate all the integer.0 or integer.5 offsetting exactly at full // Calculate all the integer.0 or integer.5 offsetting exactly at full
// precision, separately so it can be used in other integer calculations // precision, separately so it can be used in other integer calculations
// without double rounding if needed. // without double rounding if needed.
float offset_add_xy[2] = {}; float offset_add_xy[2] = {};
if (pa_su_sc_mode_cntl.vtx_window_offset_enable) { if (pa_su_sc_mode_cntl.vtx_window_offset_enable) {
auto pa_sc_window_offset = regs.Get<reg::PA_SC_WINDOW_OFFSET>(); auto pa_sc_window_offset = args->pa_sc_window_offset;
offset_add_xy[0] += float(pa_sc_window_offset.window_x_offset); offset_add_xy[0] += float(pa_sc_window_offset.window_x_offset);
offset_add_xy[1] += float(pa_sc_window_offset.window_y_offset); offset_add_xy[1] += float(pa_sc_window_offset.window_y_offset);
} }
@ -346,8 +340,11 @@ void GetHostViewportInfo(const RegisterFile& regs,
  // The maximum value is at least the maximum host render target size anyway -
  // and a guest pixel is always treated as a whole with resolution scaling.
uint32_t xy_max_unscaled[] = {x_max / draw_resolution_scale_x, // cbrispy: todo, this integer divides show up high on the profiler somehow
y_max / draw_resolution_scale_y}; // (it was a very long session, too)
uint32_t xy_max_unscaled[] = {
args->draw_resolution_scale_x_divisor.Apply(args->x_max),
args->draw_resolution_scale_y_divisor.Apply(args->y_max)};
  assert_not_zero(xy_max_unscaled[0]);
  assert_not_zero(xy_max_unscaled[1]);
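draw_resolution_scale_x_divisor.Apply() replaces the per-draw integer divide called out in the comment above. The interface is taken from the call sites here; internally such MagicDiv-style helpers use the usual multiply-by-magic-then-shift reduction, roughly (illustrative names, not the actual divisors::MagicDiv internals):

// struct MagicDivSketch {
//   uint64_t multiplier;  // precomputed from the divisor d
//   uint32_t shift;
//   uint32_t Apply(uint32_t n) const {
//     return uint32_t((uint64_t(n) * multiplier) >> (32 + shift));
//   }
// };
// With multiplier = ceil(2^(32 + shift) / d) and a shift chosen so the result
// is exact for 32-bit inputs, dividing by the small draw-resolution scales
// costs one multiply and one shift instead of a hardware divide.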
@ -367,9 +364,11 @@ void GetHostViewportInfo(const RegisterFile& regs,
std::min(xenos::kTexture2DCubeMaxWidthHeight, xy_max_unscaled[i]); std::min(xenos::kTexture2DCubeMaxWidthHeight, xy_max_unscaled[i]);
viewport_info_out.xy_extent[i] = viewport_info_out.xy_extent[i] =
extent_axis_unscaled * extent_axis_unscaled *
(i ? draw_resolution_scale_y : draw_resolution_scale_x); (i ? args->draw_resolution_scale_y : args->draw_resolution_scale_x);
float extent_axis_unscaled_float = float(extent_axis_unscaled); float extent_axis_unscaled_float = float(extent_axis_unscaled);
float pixels_to_ndc_axis = 2.0f / extent_axis_unscaled_float;
float pixels_to_ndc_axis = ViewportRecip2_0(extent_axis_unscaled_float);
ndc_scale[i] = scale_xy[i] * pixels_to_ndc_axis; ndc_scale[i] = scale_xy[i] * pixels_to_ndc_axis;
ndc_offset[i] = (offset_base_xy[i] - extent_axis_unscaled_float * 0.5f + ndc_offset[i] = (offset_base_xy[i] - extent_axis_unscaled_float * 0.5f +
offset_add_xy[i]) * offset_add_xy[i]) *
@ -394,7 +393,7 @@ void GetHostViewportInfo(const RegisterFile& regs,
// doing truncation for simplicity - since maxing with 0 is done anyway // doing truncation for simplicity - since maxing with 0 is done anyway
// (we only return viewports in the positive quarter-plane). // (we only return viewports in the positive quarter-plane).
uint32_t axis_resolution_scale = uint32_t axis_resolution_scale =
i ? draw_resolution_scale_y : draw_resolution_scale_x; i ? args->draw_resolution_scale_y : args->draw_resolution_scale_x;
float offset_axis = offset_base_xy[i] + offset_add_xy[i]; float offset_axis = offset_base_xy[i] + offset_add_xy[i];
float scale_axis = scale_xy[i]; float scale_axis = scale_xy[i];
float scale_axis_abs = std::abs(scale_xy[i]); float scale_axis_abs = std::abs(scale_xy[i]);
@ -422,11 +421,14 @@ void GetHostViewportInfo(const RegisterFile& regs,
// space, a region previously outside -W...W should end up within it, so // space, a region previously outside -W...W should end up within it, so
// the scale should be < 1. // the scale should be < 1.
float axis_extent_rounded = float(axis_extent_int); float axis_extent_rounded = float(axis_extent_int);
ndc_scale_axis = scale_axis * 2.0f / axis_extent_rounded; float inv_axis_extent_rounded =
ArchReciprocalRefined(axis_extent_rounded);
ndc_scale_axis = scale_axis * 2.0f * inv_axis_extent_rounded;
// Move the origin of the snapped coordinates back to the original one. // Move the origin of the snapped coordinates back to the original one.
ndc_offset_axis = (float(offset_axis) - ndc_offset_axis = (float(offset_axis) -
(float(axis_0_int) + axis_extent_rounded * 0.5f)) * (float(axis_0_int) + axis_extent_rounded * 0.5f)) *
2.0f / axis_extent_rounded; 2.0f * inv_axis_extent_rounded;
} else { } else {
// Empty viewport (everything outside the viewport scissor). // Empty viewport (everything outside the viewport scissor).
ndc_scale_axis = 1.0f; ndc_scale_axis = 1.0f;
@ -497,7 +499,7 @@ void GetHostViewportInfo(const RegisterFile& regs,
ndc_scale[2] = 0.5f; ndc_scale[2] = 0.5f;
ndc_offset[2] = 0.5f; ndc_offset[2] = 0.5f;
} }
if (pixel_shader_writes_depth) { if (args->pixel_shader_writes_depth) {
// Allow the pixel shader to write any depth value since // Allow the pixel shader to write any depth value since
// PA_SC_VPORT_ZMIN/ZMAX isn't present on the Adreno 200; guest pixel // PA_SC_VPORT_ZMIN/ZMAX isn't present on the Adreno 200; guest pixel
// shaders don't have access to the original Z in the viewport space // shaders don't have access to the original Z in the viewport space
@ -515,7 +517,7 @@ void GetHostViewportInfo(const RegisterFile& regs,
// Direct3D 12 doesn't allow reverse depth range - on some drivers it // Direct3D 12 doesn't allow reverse depth range - on some drivers it
// works, on some drivers it doesn't, actually, but it was never // works, on some drivers it doesn't, actually, but it was never
// explicitly allowed by the specification. // explicitly allowed by the specification.
if (!allow_reverse_z && z_min > z_max) { if (!args->allow_reverse_z && z_min > z_max) {
std::swap(z_min, z_max); std::swap(z_min, z_max);
ndc_scale[2] = -ndc_scale[2]; ndc_scale[2] = -ndc_scale[2];
ndc_offset[2] = 1.0f - ndc_offset[2]; ndc_offset[2] = 1.0f - ndc_offset[2];
@ -523,10 +525,9 @@ void GetHostViewportInfo(const RegisterFile& regs,
} }
} }
if (normalized_depth_control.z_enable && if (args->normalized_depth_control.z_enable &&
regs.Get<reg::RB_DEPTH_INFO>().depth_format == args->depth_format == xenos::DepthRenderTargetFormat::kD24FS8) {
xenos::DepthRenderTargetFormat::kD24FS8) { if (args->convert_z_to_float24) {
if (convert_z_to_float24) {
// Need to adjust the bounds that the resulting depth values will be // Need to adjust the bounds that the resulting depth values will be
// clamped to after the pixel shader. Preferring adding some error to // clamped to after the pixel shader. Preferring adding some error to
// interpolated Z instead if conversion can't be done exactly, without // interpolated Z instead if conversion can't be done exactly, without
@ -537,7 +538,7 @@ void GetHostViewportInfo(const RegisterFile& regs,
z_min = xenos::Float20e4To32(xenos::Float32To20e4(z_min, true)); z_min = xenos::Float20e4To32(xenos::Float32To20e4(z_min, true));
z_max = xenos::Float20e4To32(xenos::Float32To20e4(z_max, true)); z_max = xenos::Float20e4To32(xenos::Float32To20e4(z_max, true));
} }
if (full_float24_in_0_to_1) { if (args->full_float24_in_0_to_1) {
// Remap the full [0...2) float24 range to [0...1) support data round-trip // Remap the full [0...2) float24 range to [0...1) support data round-trip
// during render target ownership transfer of EDRAM tiles through depth // during render target ownership transfer of EDRAM tiles through depth
// input without unrestricted depth range. // input without unrestricted depth range.
@ -548,7 +549,7 @@ void GetHostViewportInfo(const RegisterFile& regs,
viewport_info_out.z_min = z_min; viewport_info_out.z_min = z_min;
viewport_info_out.z_max = z_max; viewport_info_out.z_max = z_max;
if (origin_bottom_left) { if (args->origin_bottom_left) {
ndc_scale[1] = -ndc_scale[1]; ndc_scale[1] = -ndc_scale[1];
ndc_offset[1] = -ndc_offset[1]; ndc_offset[1] = -ndc_offset[1];
} }
@ -557,7 +558,6 @@ void GetHostViewportInfo(const RegisterFile& regs,
viewport_info_out.ndc_offset[i] = ndc_offset[i]; viewport_info_out.ndc_offset[i] = ndc_offset[i];
} }
} }
void GetScissor(const RegisterFile& regs, Scissor& scissor_out, void GetScissor(const RegisterFile& regs, Scissor& scissor_out,
bool clamp_to_surface_pitch) { bool clamp_to_surface_pitch) {
auto pa_sc_window_scissor_tl = regs.Get<reg::PA_SC_WINDOW_SCISSOR_TL>(); auto pa_sc_window_scissor_tl = regs.Get<reg::PA_SC_WINDOW_SCISSOR_TL>();
@ -868,7 +868,7 @@ bool GetResolveInfo(const RegisterFile& regs, const Memory& memory,
xenos::kMaxResolveSize); xenos::kMaxResolveSize);
y1 = y0 + int32_t(xenos::kMaxResolveSize); y1 = y0 + int32_t(xenos::kMaxResolveSize);
} }
//fails in forza horizon 1 // fails in forza horizon 1
assert_true(x0 < x1 && y0 < y1); assert_true(x0 < x1 && y0 < y1);
if (x0 >= x1 || y0 >= y1) { if (x0 >= x1 || y0 >= y1) {
XELOGE("Resolve region is empty"); XELOGE("Resolve region is empty");


@ -277,18 +277,151 @@ struct ViewportInfo {
float ndc_scale[3]; float ndc_scale[3];
float ndc_offset[3]; float ndc_offset[3];
}; };
static_assert(sizeof(xenos::DepthRenderTargetFormat) == sizeof(uint32_t),
"Change in depthrendertargetformat throws off "
"getviewportinfoargs by a bit");
struct GetViewportInfoArgs {
union alignas(64) {
struct {
// group 1
uint32_t x_max;
uint32_t y_max;
union {
struct {
uint32_t origin_bottom_left : 1;
uint32_t allow_reverse_z : 1;
uint32_t convert_z_to_float24 : 1;
uint32_t full_float24_in_0_to_1 : 1;
uint32_t pixel_shader_writes_depth : 1;
xenos::DepthRenderTargetFormat depth_format : 1;
};
uint32_t packed_portions;
};
reg::RB_DEPTHCONTROL normalized_depth_control;
// group 2
reg::PA_CL_CLIP_CNTL pa_cl_clip_cntl;
reg::PA_CL_VTE_CNTL pa_cl_vte_cntl;
reg::PA_SU_SC_MODE_CNTL pa_su_sc_mode_cntl;
reg::PA_SU_VTX_CNTL pa_su_vtx_cntl;
// group 3
reg::PA_SC_WINDOW_OFFSET pa_sc_window_offset;
float PA_CL_VPORT_XSCALE;
float PA_CL_VPORT_YSCALE;
float PA_CL_VPORT_ZSCALE;
float PA_CL_VPORT_XOFFSET;
float PA_CL_VPORT_YOFFSET;
float PA_CL_VPORT_ZOFFSET;
uint32_t padding_set_to_0;
};
#if XE_ARCH_AMD64 == 1
struct {
__m128i first4; // x_max, y_max, packed_portions,
// normalized_depth_control
__m128i second4; // pa_cl_clip_cntl, pa_cl_vte_cntl, pa_su_sc_mode_cntl,
// pa_su_vtx_cntl
__m128i third4; // pa_sc_window_offset, PA_CL_VPORT_XSCALE,
// PA_CL_VPORT_YSCALE, PA_CL_VPORT_ZSCALE
__m128i last4; // PA_CL_VPORT_XOFFSET, PA_CL_VPORT_YOFFSET,
// PA_CL_VPORT_ZOFFSET, padding_set_to_0
};
#endif
};
// everything that follows here does not need to be compared
uint32_t draw_resolution_scale_x;
uint32_t draw_resolution_scale_y;
divisors::MagicDiv draw_resolution_scale_x_divisor;
divisors::MagicDiv draw_resolution_scale_y_divisor;
void Setup(uint32_t _draw_resolution_scale_x,
uint32_t _draw_resolution_scale_y,
divisors::MagicDiv _draw_resolution_scale_x_divisor,
divisors::MagicDiv _draw_resolution_scale_y_divisor,
bool _origin_bottom_left, uint32_t _x_max, uint32_t _y_max,
bool _allow_reverse_z,
reg::RB_DEPTHCONTROL _normalized_depth_control,
bool _convert_z_to_float24, bool _full_float24_in_0_to_1,
bool _pixel_shader_writes_depth) {
packed_portions = 0;
padding_set_to_0 = 0; // important to zero this
draw_resolution_scale_x = _draw_resolution_scale_x;
draw_resolution_scale_y = _draw_resolution_scale_y;
draw_resolution_scale_x_divisor = _draw_resolution_scale_x_divisor;
draw_resolution_scale_y_divisor = _draw_resolution_scale_y_divisor;
origin_bottom_left = _origin_bottom_left;
x_max = _x_max;
y_max = _y_max;
allow_reverse_z = _allow_reverse_z;
normalized_depth_control = _normalized_depth_control;
convert_z_to_float24 = _convert_z_to_float24;
full_float24_in_0_to_1 = _full_float24_in_0_to_1;
pixel_shader_writes_depth = _pixel_shader_writes_depth;
}
void SetupRegisterValues(const RegisterFile& regs) {
pa_cl_clip_cntl = regs.Get<reg::PA_CL_CLIP_CNTL>();
pa_cl_vte_cntl = regs.Get<reg::PA_CL_VTE_CNTL>();
pa_su_sc_mode_cntl = regs.Get<reg::PA_SU_SC_MODE_CNTL>();
pa_su_vtx_cntl = regs.Get<reg::PA_SU_VTX_CNTL>();
PA_CL_VPORT_XSCALE = regs[XE_GPU_REG_PA_CL_VPORT_XSCALE].f32;
PA_CL_VPORT_YSCALE = regs[XE_GPU_REG_PA_CL_VPORT_YSCALE].f32;
PA_CL_VPORT_ZSCALE = regs[XE_GPU_REG_PA_CL_VPORT_ZSCALE].f32;
PA_CL_VPORT_XOFFSET = regs[XE_GPU_REG_PA_CL_VPORT_XOFFSET].f32;
PA_CL_VPORT_YOFFSET = regs[XE_GPU_REG_PA_CL_VPORT_YOFFSET].f32;
PA_CL_VPORT_ZOFFSET = regs[XE_GPU_REG_PA_CL_VPORT_ZOFFSET].f32;
pa_sc_window_offset = regs.Get<reg::PA_SC_WINDOW_OFFSET>();
depth_format = regs.Get<reg::RB_DEPTH_INFO>().depth_format;
}
XE_FORCEINLINE
bool operator==(const GetViewportInfoArgs& prev) {
#if XE_ARCH_AMD64 == 0
bool result = true;
auto accum_eq = [&result](auto x, auto y) { result &= (x == y); };
#define EQC(field) accum_eq(field, prev.field)
EQC(x_max);
EQC(y_max);
EQC(packed_portions);
EQC(normalized_depth_control.value);
EQC(pa_cl_clip_cntl.value);
EQC(pa_cl_vte_cntl.value);
EQC(pa_su_sc_mode_cntl.value);
EQC(pa_su_vtx_cntl.value);
EQC(PA_CL_VPORT_XSCALE);
EQC(PA_CL_VPORT_YSCALE);
EQC(PA_CL_VPORT_ZSCALE);
EQC(PA_CL_VPORT_XOFFSET);
EQC(PA_CL_VPORT_YOFFSET);
EQC(PA_CL_VPORT_ZOFFSET);
EQC(pa_sc_window_offset.value);
#undef EQC
return result;
#else
__m128i mask1 = _mm_cmpeq_epi32(first4, prev.first4);
__m128i mask2 = _mm_cmpeq_epi32(second4, prev.second4);
__m128i mask3 = _mm_cmpeq_epi32(third4, prev.third4);
__m128i unified1 = _mm_and_si128(mask1, mask2);
__m128i mask4 = _mm_cmpeq_epi32(last4, prev.last4);
__m128i unified2 = _mm_and_si128(unified1, mask3);
__m128i unified3 = _mm_and_si128(unified2, mask4);
return _mm_movemask_epi8(unified3) == 0xFFFF;
#endif
}
};
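How the cached comparison is used (matching the IssueDraw change earlier in this diff): build the args, compare against the previous draw's args, and only recompute the viewport when something differs. Because the SSE operator== compares every byte of the 64-byte union, Setup() has to zero packed_portions and padding_set_to_0 before the bitfields are written.

// draw_util::GetViewportInfoArgs gviargs{};
// gviargs.Setup(...);                 // fills the compared 64-byte block
// gviargs.SetupRegisterValues(regs);
// if (gviargs == previous_viewport_info_args_) {
//   viewport_info = previous_viewport_info_;
// } else {
//   draw_util::GetHostViewportInfo(&gviargs, viewport_info);
//   previous_viewport_info_args_ = gviargs;
//   previous_viewport_info_ = viewport_info;
// }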
// Converts the guest viewport (or fakes one if drawing without a viewport) to // Converts the guest viewport (or fakes one if drawing without a viewport) to
// a viewport, plus values to multiply-add the returned position by, usable on // a viewport, plus values to multiply-add the returned position by, usable on
// host graphics APIs such as Direct3D 11+ and Vulkan, also forcing it to the // host graphics APIs such as Direct3D 11+ and Vulkan, also forcing it to the
// Direct3D clip space with 0...W Z rather than -W...W. // Direct3D clip space with 0...W Z rather than -W...W.
void GetHostViewportInfo(const RegisterFile& regs, void GetHostViewportInfo(GetViewportInfoArgs* XE_RESTRICT args,
uint32_t draw_resolution_scale_x,
uint32_t draw_resolution_scale_y,
bool origin_bottom_left, uint32_t x_max,
uint32_t y_max, bool allow_reverse_z,
reg::RB_DEPTHCONTROL normalized_depth_control,
bool convert_z_to_float24, bool full_float24_in_0_to_1,
bool pixel_shader_writes_depth,
ViewportInfo& viewport_info_out); ViewportInfo& viewport_info_out);
struct Scissor { struct Scissor {


@ -813,20 +813,22 @@ bool RenderTargetCache::Update(bool is_rasterization_done,
} }
    // Make sure the same render target isn't bound into two different slots
    // over time.
for (uint32_t i = 1; are_accumulated_render_targets_valid_ && // chrispy: this needs optimization!
i < 1 + xenos::kMaxColorRenderTargets; if (are_accumulated_render_targets_valid_) {
++i) { for (uint32_t i = 1; i < 1 + xenos::kMaxColorRenderTargets; ++i) {
const RenderTarget* render_target = const RenderTarget* render_target =
last_update_accumulated_render_targets_[i]; last_update_accumulated_render_targets_[i];
if (!render_target) { if (!render_target) {
continue; continue;
} }
for (uint32_t j = 0; j < i; ++j) { for (uint32_t j = 0; j < i; ++j) {
if (last_update_accumulated_render_targets_[j] == render_target) { if (last_update_accumulated_render_targets_[j] == render_target) {
are_accumulated_render_targets_valid_ = false; are_accumulated_render_targets_valid_ = false;
break; goto exit_slot_check_loop;
}
} }
} }
exit_slot_check_loop:;
} }
} }
if (!are_accumulated_render_targets_valid_) { if (!are_accumulated_render_targets_valid_) {
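For reference, a standalone sketch of the aliasing check the block above performs: scan a small slot array for any non-null pointer bound more than once. The function name, array size and element type are placeholders, not xenia's, and it returns early instead of using the goto since it is a free function rather than inline code.

#include <cstddef>

// Returns true if any non-null pointer occurs in more than one slot.
template <typename T, std::size_t N>
bool HasAliasedSlot(T* const (&slots)[N]) {
  for (std::size_t i = 1; i < N; ++i) {
    if (!slots[i]) {
      continue;
    }
    for (std::size_t j = 0; j < i; ++j) {
      if (slots[j] == slots[i]) {
        return true;  // The same target is bound to two different slots.
      }
    }
  }
  return false;
}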


@ -154,7 +154,9 @@ TextureCache::TextureCache(const RegisterFile& register_file,
    : register_file_(register_file),
      shared_memory_(shared_memory),
      draw_resolution_scale_x_(draw_resolution_scale_x),
-     draw_resolution_scale_y_(draw_resolution_scale_y) {
+     draw_resolution_scale_y_(draw_resolution_scale_y),
+     draw_resolution_scale_x_divisor_(draw_resolution_scale_x),
+     draw_resolution_scale_y_divisor_(draw_resolution_scale_y) {
  assert_true(draw_resolution_scale_x >= 1);
  assert_true(draw_resolution_scale_x <= kMaxDrawResolutionScaleAlongAxis);
  assert_true(draw_resolution_scale_y >= 1);
@ -187,6 +189,7 @@ bool TextureCache::GetConfigDrawResolutionScale(uint32_t& x_out,
      uint32_t(std::max(INT32_C(1), cvars::draw_resolution_scale_x));
  uint32_t config_y =
      uint32_t(std::max(INT32_C(1), cvars::draw_resolution_scale_y));
  uint32_t clamped_x = std::min(kMaxDrawResolutionScaleAlongAxis, config_x);
  uint32_t clamped_y = std::min(kMaxDrawResolutionScaleAlongAxis, config_y);
  x_out = clamped_x;
@ -552,8 +555,7 @@ void TextureCache::Texture::MarkAsUsed() {
}
void TextureCache::Texture::WatchCallback(
-   [[maybe_unused]] const global_unique_lock_type& global_lock,
-   bool is_mip) {
+   [[maybe_unused]] const global_unique_lock_type& global_lock, bool is_mip) {
  if (is_mip) {
    assert_not_zero(GetGuestMipsSize());
    mips_outdated_ = true;
@ -566,8 +568,8 @@ void TextureCache::Texture::WatchCallback(
}
void TextureCache::WatchCallback(const global_unique_lock_type& global_lock,
-                                void* context,
-                                void* data, uint64_t argument, bool invalidated_by_gpu) {
+                                void* context, void* data, uint64_t argument,
+                                bool invalidated_by_gpu) {
  Texture& texture = *static_cast<Texture*>(context);
  texture.WatchCallback(global_lock, argument != 0);
  texture.texture_cache().texture_became_outdated_.store(
@ -910,8 +912,8 @@ void TextureCache::ScaledResolveGlobalWatchCallbackThunk(
}
void TextureCache::ScaledResolveGlobalWatchCallback(
-   const global_unique_lock_type& global_lock,
-   uint32_t address_first, uint32_t address_last, bool invalidated_by_gpu) {
+   const global_unique_lock_type& global_lock, uint32_t address_first,
+   uint32_t address_last, bool invalidated_by_gpu) {
  assert_true(IsDrawResolutionScaled());
  if (invalidated_by_gpu) {
    // Resolves themselves do exactly the opposite of what this should do.


@ -19,6 +19,7 @@
#include "xenia/base/assert.h" #include "xenia/base/assert.h"
#include "xenia/base/hash.h" #include "xenia/base/hash.h"
#include "xenia/base/math.h"
#include "xenia/base/mutex.h" #include "xenia/base/mutex.h"
#include "xenia/gpu/register_file.h" #include "xenia/gpu/register_file.h"
#include "xenia/gpu/shared_memory.h" #include "xenia/gpu/shared_memory.h"
@ -70,6 +71,14 @@ class TextureCache {
  static bool GetConfigDrawResolutionScale(uint32_t& x_out, uint32_t& y_out);
  uint32_t draw_resolution_scale_x() const { return draw_resolution_scale_x_; }
  uint32_t draw_resolution_scale_y() const { return draw_resolution_scale_y_; }
+ divisors::MagicDiv draw_resolution_scale_x_divisor() const {
+   return draw_resolution_scale_x_divisor_;
+ }
+ divisors::MagicDiv draw_resolution_scale_y_divisor() const {
+   return draw_resolution_scale_y_divisor_;
+ }
  bool IsDrawResolutionScaled() const {
    return draw_resolution_scale_x_ > 1 || draw_resolution_scale_y_ > 1;
  }
@ -576,8 +585,8 @@ class TextureCache {
  // Shared memory callback for texture data invalidation.
  static void WatchCallback(const global_unique_lock_type& global_lock,
-                           void* context,
-                           void* data, uint64_t argument, bool invalidated_by_gpu);
+                           void* context, void* data, uint64_t argument,
+                           bool invalidated_by_gpu);
  // Checks if there are any pages that contain scaled resolve data within the
  // range.
@ -588,14 +597,15 @@ class TextureCache {
      const global_unique_lock_type& global_lock, void* context,
      uint32_t address_first, uint32_t address_last, bool invalidated_by_gpu);
  void ScaledResolveGlobalWatchCallback(
-     const global_unique_lock_type& global_lock,
-     uint32_t address_first, uint32_t address_last, bool invalidated_by_gpu);
+     const global_unique_lock_type& global_lock, uint32_t address_first,
+     uint32_t address_last, bool invalidated_by_gpu);
  const RegisterFile& register_file_;
  SharedMemory& shared_memory_;
  uint32_t draw_resolution_scale_x_;
  uint32_t draw_resolution_scale_y_;
+ divisors::MagicDiv draw_resolution_scale_x_divisor_;
+ divisors::MagicDiv draw_resolution_scale_y_divisor_;
  static const LoadShaderInfo load_shader_info_[kLoadShaderCount];
  xe::global_critical_region global_critical_region_;
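The divisors::MagicDiv members evidently precompute a multiply-and-shift replacement for dividing by the resolution scale, so per-texel address math avoids hardware division. The following is a hedged sketch of the general technique only (a Lemire-style fixed-point reciprocal), not xenia's actual MagicDiv API or field layout; unsigned __int128 is a GCC/Clang extension.

#include <cassert>
#include <cstdint>

// Divides 32-bit values by a fixed divisor d >= 2 using one wide multiply and
// a shift. A divisor of 1 would overflow the multiplier and is assumed to be
// special-cased by the caller.
struct MagicDivSketch {
  explicit MagicDivSketch(uint32_t d) {
    assert(d >= 2);
    multiplier_ = UINT64_MAX / d + 1;  // ceil(2^64 / d)
  }
  uint32_t Divide(uint32_t n) const {
    // High 64 bits of the 128-bit product n * multiplier_ give n / d.
    return static_cast<uint32_t>(
        (static_cast<unsigned __int128>(multiplier_) * n) >> 64);
  }
 private:
  uint64_t multiplier_;
};

// Example: MagicDivSketch(3).Divide(10) yields 3.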


@ -2366,6 +2366,7 @@ bool VulkanCommandProcessor::IssueDraw(xenos::PrimitiveType prim_type,
  // Get dynamic rasterizer state.
  draw_util::ViewportInfo viewport_info;
  // Just handling maxViewportDimensions is enough - viewportBoundsRange[1] must
  // be at least 2 * max(maxViewportDimensions[0...1]) - 1, and
  // maxViewportDimensions must be greater than or equal to the size of the
@ -2382,11 +2383,16 @@ bool VulkanCommandProcessor::IssueDraw(xenos::PrimitiveType prim_type,
  // life. Or even disregard the viewport bounds range in the fragment shader
  // interlocks case completely - apply the viewport and the scissor offset
  // directly to pixel address and to things like ps_param_gen.
- draw_util::GetHostViewportInfo(
-     regs, 1, 1, false, device_limits.maxViewportDimensions[0],
-     device_limits.maxViewportDimensions[1], true, normalized_depth_control,
-     false, host_render_targets_used,
-     pixel_shader && pixel_shader->writes_depth(), viewport_info);
+ draw_util::GetViewportInfoArgs gviargs{};
+ gviargs.Setup(1, 1, divisors::MagicDiv{1}, divisors::MagicDiv{1}, false,
+               device_limits.maxViewportDimensions[0],
+               device_limits.maxViewportDimensions[1], true,
+               normalized_depth_control, false, host_render_targets_used,
+               pixel_shader && pixel_shader->writes_depth());
+ gviargs.SetupRegisterValues(regs);
+ draw_util::GetHostViewportInfo(&gviargs, viewport_info);
  // Update dynamic graphics pipeline state.
  UpdateDynamicState(viewport_info, primitive_polygonal,
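Splitting the call into Setup / SetupRegisterValues with a separate args struct pairs with the fast equality operator shown earlier, presumably so callers can skip the viewport math when the packed inputs have not changed since the last draw. A generic, hypothetical sketch of that caching pattern (types and the compute callable are placeholders, not the emulator's code):

#include <optional>
#include <utility>

// Caches the result of an expensive computation keyed on cheaply comparable
// packed arguments; recomputes only when the arguments changed.
template <typename Args, typename Result, typename Compute>
class ChangeCachedCompute {
 public:
  explicit ChangeCachedCompute(Compute compute) : compute_(std::move(compute)) {}
  const Result& Get(const Args& args) {
    if (!last_args_ || !(args == *last_args_)) {
      result_ = compute_(args);
      last_args_ = args;
    }
    return result_;
  }
 private:
  Compute compute_;
  std::optional<Args> last_args_;
  Result result_{};
};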


@ -326,7 +326,14 @@ constexpr bool IsColorRenderTargetFormat64bpp(ColorRenderTargetFormat format) {
         format == ColorRenderTargetFormat::k_32_32_FLOAT;
}

-inline uint32_t GetColorRenderTargetFormatComponentCount(
+// Packed component counts: each format stores (component count - 1) in 2 bits,
+// so stored 0 -> 1, 1 -> 2, 3 -> 4 components; decode with a shift, a mask and
+// an add of 1.
+using ColorFormatComponentTable = uint32_t;
+static constexpr uint32_t GetComponentCountConst(
    ColorRenderTargetFormat format) {
  switch (format) {
    case ColorRenderTargetFormat::k_8_8_8_8:
@ -337,19 +344,51 @@ inline uint32_t GetColorRenderTargetFormatComponentCount(
    case ColorRenderTargetFormat::k_16_16_16_16_FLOAT:
    case ColorRenderTargetFormat::k_2_10_10_10_AS_10_10_10_10:
    case ColorRenderTargetFormat::k_2_10_10_10_FLOAT_AS_16_16_16_16:
-     return 4;
+     return 4 - 1;
    case ColorRenderTargetFormat::k_16_16:
    case ColorRenderTargetFormat::k_16_16_FLOAT:
    case ColorRenderTargetFormat::k_32_32_FLOAT:
-     return 2;
+     return 2 - 1;
    case ColorRenderTargetFormat::k_32_FLOAT:
-     return 1;
+     return 1 - 1;
    default:
-     assert_unhandled_case(format);
      return 0;
  }
}
namespace detail {
static constexpr uint32_t encode_format_component_table() {
uint32_t result = 0;
#define ADDFORMAT(name) \
result |= GetComponentCountConst(ColorRenderTargetFormat::name) \
<< (static_cast<uint32_t>(ColorRenderTargetFormat::name) * 2)
ADDFORMAT(k_8_8_8_8);
ADDFORMAT(k_8_8_8_8_GAMMA);
ADDFORMAT(k_2_10_10_10);
ADDFORMAT(k_2_10_10_10_FLOAT);
ADDFORMAT(k_16_16_16_16);
ADDFORMAT(k_16_16_16_16_FLOAT);
ADDFORMAT(k_2_10_10_10_AS_10_10_10_10);
ADDFORMAT(k_2_10_10_10_FLOAT_AS_16_16_16_16);
ADDFORMAT(k_16_16);
ADDFORMAT(k_16_16_FLOAT);
ADDFORMAT(k_32_32_FLOAT);
ADDFORMAT(k_32_FLOAT);
return result;
}
constexpr uint32_t color_format_component_table =
encode_format_component_table();
} // namespace detail
constexpr uint32_t GetColorRenderTargetFormatComponentCount(
ColorRenderTargetFormat format) {
return ((detail::color_format_component_table >>
(static_cast<uint32_t>(format) * 2)) &
0b11) +
1;
}
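As a quick illustration of the decode path, a few compile-time checks like the following (placed in the same header, using only the enumerators and the constexpr accessor defined above) would confirm that the packed table agrees with the switch-based constants:

static_assert(GetColorRenderTargetFormatComponentCount(
                  ColorRenderTargetFormat::k_8_8_8_8) == 4,
              "stored 3 decodes to 4 components");
static_assert(GetColorRenderTargetFormatComponentCount(
                  ColorRenderTargetFormat::k_16_16) == 2,
              "stored 1 decodes to 2 components");
static_assert(GetColorRenderTargetFormatComponentCount(
                  ColorRenderTargetFormat::k_32_FLOAT) == 1,
              "stored 0 decodes to 1 component");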
// Returns the version of the format with the same packing and meaning of values
// stored in it, but without blending precision modifiers.
constexpr ColorRenderTargetFormat GetStorageColorFormat(