diff --git a/src/xenia/base/dma.cc b/src/xenia/base/dma.cc
new file mode 100644
index 000000000..7d2d9d80c
--- /dev/null
+++ b/src/xenia/base/dma.cc
@@ -0,0 +1,415 @@
+#include "dma.h"
+
+template <size_t N, typename... Ts>
+static void xedmaloghelper(const char (&fmt)[N], Ts... args) {
+  char buffer[1024];
+  sprintf_s(buffer, fmt, args...);
+  XELOGI("%s", buffer);
+}
+
+//#define XEDMALOG(...) XELOGI("XeDma: " __VA_ARGS__)
+//#define XEDMALOG(...) xedmaloghelper("XeDma: " __VA_ARGS__)
+#define XEDMALOG(...) static_cast<void>(0)
+using xe::swcache::CacheLine;
+static constexpr unsigned NUM_CACHELINES_IN_PAGE = 4096 / sizeof(CacheLine);
+
+XE_FORCEINLINE
+static void XeCopy16384Streaming(CacheLine* XE_RESTRICT to,
+                                 CacheLine* XE_RESTRICT from) {
+  uint32_t num_lines_for_8k = 4096 / XE_HOST_CACHE_LINE_SIZE;
+
+  CacheLine* dest1 = to;
+  CacheLine* src1 = from;
+
+  CacheLine* dest2 = to + NUM_CACHELINES_IN_PAGE;
+  CacheLine* src2 = from + NUM_CACHELINES_IN_PAGE;
+
+  CacheLine* dest3 = to + (NUM_CACHELINES_IN_PAGE * 2);
+  CacheLine* src3 = from + (NUM_CACHELINES_IN_PAGE * 2);
+
+  CacheLine* dest4 = to + (NUM_CACHELINES_IN_PAGE * 3);
+  CacheLine* src4 = from + (NUM_CACHELINES_IN_PAGE * 3);
+#pragma loop(no_vector)
+  for (uint32_t i = 0; i < num_lines_for_8k; ++i) {
+    xe::swcache::CacheLine line0, line1, line2, line3;
+
+    xe::swcache::ReadLine(&line0, src1 + i);
+    xe::swcache::ReadLine(&line1, src2 + i);
+    xe::swcache::ReadLine(&line2, src3 + i);
+    xe::swcache::ReadLine(&line3, src4 + i);
+    XE_MSVC_REORDER_BARRIER();
+    xe::swcache::WriteLineNT(dest1 + i, &line0);
+    xe::swcache::WriteLineNT(dest2 + i, &line1);
+
+    xe::swcache::WriteLineNT(dest3 + i, &line2);
+    xe::swcache::WriteLineNT(dest4 + i, &line3);
+  }
+  XE_MSVC_REORDER_BARRIER();
+}
+
+namespace xe::dma {
+XE_FORCEINLINE
+static void vastcpy_impl(CacheLine* XE_RESTRICT physaddr,
+                         CacheLine* XE_RESTRICT rdmapping,
+                         uint32_t written_length) {
+  static constexpr unsigned NUM_LINES_FOR_16K =
+      16384 / XE_HOST_CACHE_LINE_SIZE;
+
+  while (written_length >= 16384) {
+    XeCopy16384Streaming(physaddr, rdmapping);
+
+    physaddr += NUM_LINES_FOR_16K;
+    rdmapping += NUM_LINES_FOR_16K;
+
+    written_length -= 16384;
+  }
+
+  if (!written_length) {
+    return;
+  }
+  uint32_t num_written_lines = written_length / XE_HOST_CACHE_LINE_SIZE;
+
+  uint32_t i = 0;
+
+  for (; i + 1 < num_written_lines; i += 2) {
+    xe::swcache::CacheLine line0, line1;
+
+    xe::swcache::ReadLine(&line0, rdmapping + i);
+
+    xe::swcache::ReadLine(&line1, rdmapping + i + 1);
+    XE_MSVC_REORDER_BARRIER();
+    xe::swcache::WriteLineNT(physaddr + i, &line0);
+    xe::swcache::WriteLineNT(physaddr + i + 1, &line1);
+  }
+
+  if (i < num_written_lines) {
+    xe::swcache::CacheLine line0;
+
+    xe::swcache::ReadLine(&line0, rdmapping + i);
+    xe::swcache::WriteLineNT(physaddr + i, &line0);
+  }
+}
+
+XE_NOINLINE
+void vastcpy(uint8_t* XE_RESTRICT physaddr, uint8_t* XE_RESTRICT rdmapping,
+             uint32_t written_length) {
+  return vastcpy_impl((CacheLine*)physaddr, (CacheLine*)rdmapping,
+                      written_length);
+}
+
+#define XEDMA_NUM_WORKERS 4
+class alignas(256) XeDMACGeneric : public XeDMAC {
+  struct alignas(XE_HOST_CACHE_LINE_SIZE) {
+    std::atomic<uint64_t> free_job_slots_;
+    std::atomic<uint64_t> jobs_submitted_;
+    std::atomic<uint64_t> jobs_completed_;
+    std::atomic<uint32_t> num_workers_awoken_;
+    std::atomic<uint32_t> current_job_serial_;
+
+  } dma_volatile_;
+
+  alignas(XE_HOST_CACHE_LINE_SIZE) XeDMAJob jobs_[64];
+
+  volatile uint32_t jobserials_[64];
+
+  alignas(XE_HOST_CACHE_LINE_SIZE)
+      std::unique_ptr<threading::Event> job_done_signals_[64];
+  // really dont like using unique pointer for this...
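  /* Illustrative note, not part of this patch: vastcpy above assumes both
     pointers are cache-line (64-byte) aligned and, per the comment in dma.h,
     a length divisible by the cache line size. A minimal hypothetical use:

       alignas(64) static uint8_t src[16384], dst[16384];
       xe::dma::vastcpy(dst, src, sizeof(dst));

     XeCopy16384Streaming walks four 4 KiB pages in lockstep (dest1..dest4),
     staging whole lines in registers via ReadLine and draining them with
     non-temporal WriteLineNT stores, so each 16 KiB block is copied on four
     independent read/write streams without displacing cached data. */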
+  std::unique_ptr<threading::Event> job_submitted_signal_;
+  std::unique_ptr<threading::Event> job_completed_signal_;
+
+  std::unique_ptr<threading::Thread> scheduler_thread_;
+  struct WorkSlice {
+    uint8_t* destination;
+    uint8_t* source;
+    size_t numbytes;
+  };
+  std::unique_ptr<threading::Thread> workers_[XEDMA_NUM_WORKERS];
+  std::unique_ptr<threading::Event> worker_has_work_;  //[XEDMA_NUM_WORKERS];
+  std::unique_ptr<threading::Event> worker_has_finished_[XEDMA_NUM_WORKERS];
+
+  threading::WaitHandle* worker_has_finished_nosafeptr_[XEDMA_NUM_WORKERS];
+  WorkSlice worker_workslice_[XEDMA_NUM_WORKERS];
+
+  // chrispy: this is bad
+  static uint32_t find_free_hole_in_dword(uint64_t dw) {
+    XEDMALOG("Finding free hole in 0x%llX", dw);
+
+    for (uint32_t i = 0; i < 64; ++i) {
+      if (dw & (1ULL << i)) {
+        continue;
+      }
+
+      return i;
+    }
+    return ~0U;
+  }
+
+  uint32_t allocate_free_dma_slot() {
+    XEDMALOG("Allocating free slot");
+    uint32_t got_slot = 0;
+    uint64_t slots;
+    uint64_t allocated_slot;
+
+    do {
+      slots = dma_volatile_.free_job_slots_.load();
+
+      got_slot = find_free_hole_in_dword(slots);
+      if (!~got_slot) {
+        XEDMALOG("Didn't get a slot!");
+        return ~0U;
+      }
+      allocated_slot = slots | (1ULL << got_slot);
+
+    } while (
+        XE_UNLIKELY(!dma_volatile_.free_job_slots_.compare_exchange_strong(
+            slots, allocated_slot)));
+    XEDMALOG("Allocated slot %d", got_slot);
+    return got_slot;
+  }
+  // chrispy: on x86 this can just be interlockedbittestandreset...
+  void free_dma_slot(uint32_t slot) {
+    XEDMALOG("Freeing slot %d", slot);
+    uint64_t slots;
+
+    uint64_t deallocated_slot;
+
+    do {
+      slots = dma_volatile_.free_job_slots_.load();
+
+      deallocated_slot = slots & (~(1ULL << slot));
+
+    } while (
+        XE_UNLIKELY(!dma_volatile_.free_job_slots_.compare_exchange_strong(
+            slots, deallocated_slot)));
+  }
+
+  void DoDMAJob(uint32_t idx) {
+    XeDMAJob& job = jobs_[idx];
+    if (job.precall) {
+      job.precall(&job);
+    }
+    // memcpy(job.destination, job.source, job.size);
+
+    size_t job_size = job.size;
+
+    size_t job_num_lines = job_size / XE_HOST_CACHE_LINE_SIZE;
+
+    size_t line_rounded = job_num_lines * XE_HOST_CACHE_LINE_SIZE;
+
+    size_t rem = job_size - line_rounded;
+
+    size_t num_per_worker = line_rounded / XEDMA_NUM_WORKERS;
+
+    XEDMALOG(
+        "Distributing %d bytes from %p to %p across %d workers, remainder is "
+        "%d",
+        line_rounded, job.source, job.destination, XEDMA_NUM_WORKERS, rem);
+    if (num_per_worker < 2048) {
+      XEDMALOG("not distributing across workers, num_per_worker < 2048");
+      // not worth splitting up
+      memcpy(job.destination, job.source, job.size);
+      job.signal_on_done->Set();
+    } else {
+      for (uint32_t i = 0; i < XEDMA_NUM_WORKERS; ++i) {
+        worker_workslice_[i].destination =
+            (i * num_per_worker) + job.destination;
+        worker_workslice_[i].source = (i * num_per_worker) + job.source;
+
+        worker_workslice_[i].numbytes = num_per_worker;
+      }
+      if (rem) {
+        __movsb(job.destination + line_rounded, job.source + line_rounded,
+                rem);
+      }
+      // wake them up
+      worker_has_work_->Set();
+      XEDMALOG("Starting waitall for job");
+      threading::WaitAll(worker_has_finished_nosafeptr_, XEDMA_NUM_WORKERS,
+                         false);
+
+      XEDMALOG("Waitall for job completed!");
+      job.signal_on_done->Set();
+    }
+    if (job.postcall) {
+      job.postcall(&job);
+    }
+    ++dma_volatile_.jobs_completed_;
+  }
+
+  void WorkerIter(uint32_t worker_index) {
+    xenia_assert(worker_index < XEDMA_NUM_WORKERS);
+    auto [dest, src, size] = worker_workslice_[worker_index];
+
+    // if (++dma_volatile_.num_workers_awoken_ == XEDMA_NUM_WORKERS ) {
+    worker_has_work_->Reset();
+    //}
+    xenia_assert(size < (1ULL << 32));
+    // memcpy(dest, src, size);
+    dma::vastcpy(dest, src,
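/* Worked numbers for the split in DoDMAJob above (illustrative): for
   job.size = 100000, job_num_lines = 1562, line_rounded = 99968, rem = 32
   and num_per_worker = 24992, so each of the four workers streams 24992
   bytes through vastcpy while the 32-byte tail is copied inline with
   __movsb before the workers are woken. */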
+                 static_cast<uint32_t>(size));
+  }
+  XE_NOINLINE
+  void WorkerMainLoop(uint32_t worker_index) {
+    do {
+      XEDMALOG("Worker iter for worker %d", worker_index);
+      WorkerIter(worker_index);
+
+      XEDMALOG("Worker %d is done\n", worker_index);
+      threading::SignalAndWait(worker_has_finished_[worker_index].get(),
+                               worker_has_work_.get(), false);
+    } while (true);
+  }
+  void WorkerMain(uint32_t worker_index) {
+    XEDMALOG("Entered worker main loop, index %d", worker_index);
+    threading::Wait(worker_has_work_.get(), false);
+    XEDMALOG("First wait for worker %d completed, first job ever",
+             worker_index);
+    WorkerMainLoop(worker_index);
+  }
+
+  static void WorkerMainForwarder(void* ptr) {
+    // we aligned XeDma to 256 bytes and encode extra info in the low 8
+    uintptr_t uptr = (uintptr_t)ptr;
+
+    uint32_t worker_index = (uint8_t)uptr;
+
+    uptr &= ~0xFFULL;
+
+    char name_buffer[64];
+    sprintf_s(name_buffer, "dma_worker_%d", worker_index);
+
+    xe::threading::set_name(name_buffer);
+
+    reinterpret_cast<XeDMACGeneric*>(uptr)->WorkerMain(worker_index);
+  }
+
+  void DMAMain() {
+    XEDMALOG("DmaMain");
+    do {
+      threading::Wait(job_submitted_signal_.get(), false);
+
+      auto slots = dma_volatile_.free_job_slots_.load();
+
+      for (uint32_t i = 0; i < 64; ++i) {
+        if (slots & (1ULL << i)) {
+          XEDMALOG("Got new job at index %d in DMAMain", i);
+          DoDMAJob(i);
+
+          free_dma_slot(i);
+
+          job_completed_signal_->Set();
+          // break;
+        }
+      }
+
+    } while (true);
+  }
+
+  static void DMAMainForwarder(void* ud) {
+    xe::threading::set_name("dma_main");
+    reinterpret_cast<XeDMACGeneric*>(ud)->DMAMain();
+  }
+
+ public:
+  virtual DMACJobHandle PushDMAJob(XeDMAJob* job) override {
+    XEDMALOG("New job, %p to %p with size %d", job->source, job->destination,
+             job->size);
+    uint32_t slot;
+    do {
+      slot = allocate_free_dma_slot();
+      if (!~slot) {
+        XEDMALOG(
+            "Didn't get a free slot, waiting for a job to complete before "
+            "resuming.");
+        threading::Wait(job_completed_signal_.get(), false);
+
+      } else {
+        break;
+      }
+
+    } while (true);
+    jobs_[slot] = *job;
+
+    jobs_[slot].signal_on_done = job_done_signals_[slot].get();
+    jobs_[slot].signal_on_done->Reset();
+    XEDMALOG("Setting job submit signal, pushed into slot %d", slot);
+
+    uint32_t new_serial = dma_volatile_.current_job_serial_++;
+
+    jobserials_[slot] = new_serial;
+
+    ++dma_volatile_.jobs_submitted_;
+    job_submitted_signal_->Set();
+    return (static_cast<uint64_t>(new_serial) << 32) |
+           static_cast<uint64_t>(slot);
+
+    // return job_done_signals_[slot].get();
+  }
+
+  bool AllJobsDone() {
+    return dma_volatile_.jobs_completed_ == dma_volatile_.jobs_submitted_;
+  }
+  virtual void WaitJobDone(DMACJobHandle handle) override {
+    uint32_t serial = static_cast<uint32_t>(handle >> 32);
+    uint32_t jobid = static_cast<uint32_t>(handle);
+    do {
+      if (jobserials_[jobid] != serial) {
+        return;  // done, our slot was reused
+      }
+
+      auto waitres = threading::Wait(job_done_signals_[jobid].get(), false,
+                                     std::chrono::milliseconds{1});
+
+      if (waitres == threading::WaitResult::kTimeout) {
+        continue;
+      } else {
+        return;
+      }
+    } while (true);
+  }
+  virtual void WaitForIdle() override {
+    while (!AllJobsDone()) {
+      threading::MaybeYield();
+    }
+  }
+  XeDMACGeneric() {
+    XEDMALOG("Constructing xedma at addr %p", this);
+    dma_volatile_.free_job_slots_.store(0ULL);
+    dma_volatile_.jobs_submitted_.store(0ULL);
+    dma_volatile_.jobs_completed_.store(0ULL);
+    dma_volatile_.current_job_serial_.store(
+        1ULL);  // so that a jobhandle is never 0
+    std::memset(jobs_, 0, sizeof(jobs_));
+    job_submitted_signal_ = threading::Event::CreateAutoResetEvent(false);
+    job_completed_signal_ =
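/* Event choreography below (descriptive note, not part of the patch):
   job_submitted_signal_ and job_completed_signal_ are auto-reset, so each
   Set() releases exactly one waiter; worker_has_work_ is manual-reset so a
   single Set() in DoDMAJob releases all four workers (WorkerIter resets it);
   each worker_has_finished_ event is auto-reset and consumed by the WaitAll
   in DoDMAJob. */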
+        threading::Event::CreateAutoResetEvent(false);
+    worker_has_work_ = threading::Event::CreateManualResetEvent(false);
+    threading::Thread::CreationParameters worker_params{};
+    worker_params.create_suspended = false;
+    worker_params.initial_priority = threading::ThreadPriority::kBelowNormal;
+    worker_params.stack_size = 65536;  // dont need much stack at all
+
+    for (uint32_t i = 0; i < 64; ++i) {
+      job_done_signals_[i] = threading::Event::CreateManualResetEvent(false);
+    }
+    for (uint32_t i = 0; i < XEDMA_NUM_WORKERS; ++i) {
+      // worker_has_work_[i] = threading::Event::CreateAutoResetEvent(false);
+      worker_has_finished_[i] = threading::Event::CreateAutoResetEvent(false);
+      worker_has_finished_nosafeptr_[i] = worker_has_finished_[i].get();
+
+      uintptr_t encoded = reinterpret_cast<uintptr_t>(this);
+      xenia_assert(!(encoded & 0xFFULL));
+      xenia_assert(i < 256);
+
+      encoded |= i;
+
+      workers_[i] = threading::Thread::Create(worker_params, [encoded]() {
+        XeDMACGeneric::WorkerMainForwarder((void*)encoded);
+      });
+    }
+    threading::Thread::CreationParameters scheduler_params{};
+    scheduler_params.create_suspended = false;
+    scheduler_params.initial_priority =
+        threading::ThreadPriority::kBelowNormal;
+    scheduler_params.stack_size = 65536;
+    scheduler_thread_ = threading::Thread::Create(scheduler_params, [this]() {
+      XeDMACGeneric::DMAMainForwarder((void*)this);
+    });
+  }
+};
+XeDMAC* CreateDMAC() { return new XeDMACGeneric(); }
+}  // namespace xe::dma
diff --git a/src/xenia/base/dma.h b/src/xenia/base/dma.h
new file mode 100644
index 000000000..e95639753
--- /dev/null
+++ b/src/xenia/base/dma.h
@@ -0,0 +1,46 @@
+/**
+ ******************************************************************************
+ * Xenia : Xbox 360 Emulator Research Project                                 *
+ ******************************************************************************
+ * Copyright 2020 Ben Vanik. All rights reserved.                             *
+ * Released under the BSD license - see LICENSE in the root for more details.
* + ****************************************************************************** + */ + +#ifndef XENIA_BASE_DMA_H_ +#define XENIA_BASE_DMA_H_ +#include "memory.h" +#include "threading.h" +namespace xe::dma { +struct XeDMAJob; +using DmaPrecall = void (*)(XeDMAJob* job); +using DmaPostcall = void (*)(XeDMAJob* job); +struct XeDMAJob { + threading::Event* signal_on_done; + uint8_t* destination; + uint8_t* source; + size_t size; + DmaPrecall precall; + DmaPostcall postcall; + void* userdata1; + void* userdata2; +}; +using DMACJobHandle = uint64_t; + +class XeDMAC { + public: + virtual ~XeDMAC() {} + virtual DMACJobHandle PushDMAJob(XeDMAJob* job) = 0; + virtual void WaitJobDone(DMACJobHandle handle) = 0; + virtual void WaitForIdle() = 0; +}; + +XeDMAC* CreateDMAC(); +// must be divisible by cache line size +XE_NOINLINE +void vastcpy(uint8_t* XE_RESTRICT physaddr, uint8_t* XE_RESTRICT rdmapping, + uint32_t written_length); + +} // namespace xe::dma + +#endif // XENIA_BASE_DMA_H_ diff --git a/src/xenia/base/math.h b/src/xenia/base/math.h index 4cafc7178..6e323ede8 100644 --- a/src/xenia/base/math.h +++ b/src/xenia/base/math.h @@ -377,29 +377,45 @@ int64_t m128_i64(const __m128& v) { return m128_i64(_mm_castps_pd(v)); } /* - - std::min/max float has handling for nans, where if either argument is nan the first argument is returned - minss/maxss are different, if either argument is nan the second operand to the instruction is returned - this is problematic because we have no assurances from the compiler on the argument ordering + std::min/max float has handling for nans, where if either argument is + nan the first argument is returned - so only use in places where nan handling is not needed + minss/maxss are different, if either argument is nan the second operand + to the instruction is returned this is problematic because we have no + assurances from the compiler on the argument ordering + + so only use in places where nan handling is not needed */ -static float xe_minf(float x, float y) { +XE_FORCEINLINE +static float ArchMin(float x, float y) { return _mm_cvtss_f32(_mm_min_ss(_mm_set_ss(x), _mm_set_ss(y))); } -static float xe_maxf(float x, float y) { +XE_FORCEINLINE +static float ArchMax(float x, float y) { return _mm_cvtss_f32(_mm_max_ss(_mm_set_ss(x), _mm_set_ss(y))); } -static float xe_rcpf(float den) { +XE_FORCEINLINE +static float ArchReciprocal(float den) { return _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ss(den))); } #else -static float xe_minf(float x, float y) { return std::min(x, y); } -static float xe_maxf(float x, float y) { return std::max(x, y); } -static float xe_rcpf(float den) { return 1.0f / den; } +static float ArchMin(float x, float y) { return std::min(x, y); } +static float ArchMax(float x, float y) { return std::max(x, y); } +static float ArchReciprocal(float den) { return 1.0f / den; } #endif +XE_FORCEINLINE +static float RefineReciprocal(float initial, float den) { + float t0 = initial * den; + float t1 = t0 * initial; + float rcp2 = initial + initial; + return rcp2 - t1; +} +XE_FORCEINLINE +static float ArchReciprocalRefined(float den) { + return RefineReciprocal(ArchReciprocal(den), den); +} // Similar to the C++ implementation of XMConvertFloatToHalf and // XMConvertHalfToFloat from DirectXMath 3.00 (pre-3.04, which switched from the @@ -494,7 +510,101 @@ inline T sat_sub(T a, T b) { } return T(result); } +namespace divisors { +union IDivExtraInfo { + uint32_t value_; + struct { + uint32_t shift_ : 31; + uint32_t add_ : 1; + } info; +}; +// returns magicnum multiplier 
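For orientation, PregenerateUint32Div below derives a Hacker's Delight style
magic pair (multiplier, shift, add flag) for an unsigned divisor. The no-add
decode path is easy to sanity-check in isolation; in this sketch,
apply_magic_div is a hypothetical stand-in for ApplyUint32Div with
info.add_ == 0, using the well-known pair for dividing by 5 (multiplier
0xCCCCCCCD, shift 2):

#include <cassert>
#include <cstdint>

static uint32_t apply_magic_div(uint32_t num, uint32_t mul, uint32_t shift) {
  // high 32 bits of the 64-bit product, then the encoded post-shift
  return static_cast<uint32_t>((static_cast<uint64_t>(num) * mul) >> 32) >>
         shift;
}

int main() {
  for (uint32_t n : {0u, 1u, 4u, 5u, 99u, 100u, 0xFFFFFFFFu}) {
    assert(apply_magic_div(n, 0xCCCCCCCDu, 2) == n / 5);  // exact for all n
  }
  return 0;
}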
+static uint32_t PregenerateUint32Div(uint32_t _denom, uint32_t& out_extra) { + IDivExtraInfo extra; + uint32_t d = _denom; + int p; + uint32_t nc, delta, q1, r1, q2, r2; + struct { + unsigned M; + int a; + int s; + } magu; + magu.a = 0; + nc = -1 - ((uint32_t) - (int32_t)d) % d; + p = 31; + q1 = 0x80000000 / nc; + r1 = 0x80000000 - q1 * nc; + q2 = 0x7FFFFFFF / d; + r2 = 0x7FFFFFFF - q2 * d; + do { + p += 1; + if (r1 >= nc - r1) { + q1 = 2 * q1 + 1; + r1 = 2 * r1 - nc; + } else { + q1 = 2 * q1; + r1 = 2 * r1; + } + if (r2 + 1 >= d - r2) { + if (q2 >= 0x7FFFFFFF) { + magu.a = 1; + } + q2 = 2 * q2 + 1; + r2 = 2 * r2 + 1 - d; + + } else { + if (q2 >= 0x80000000U) { + magu.a = 1; + } + q2 = 2 * q2; + r2 = 2 * r2 + 1; + } + delta = d - 1 - r2; + } while (p < 64 && (q1 < delta || r1 == 0)); + + extra.info.add_ = magu.a; + extra.info.shift_ = p - 32; + out_extra = extra.value_; + return static_cast(q2 + 1); +} + +static inline uint32_t ApplyUint32Div(uint32_t num, uint32_t mul, + uint32_t extradata) { + IDivExtraInfo extra; + + extra.value_ = extradata; + + uint32_t result = ((uint64_t)(num) * (uint64_t)mul) >> 32; + if (extra.info.add_) { + uint32_t addend = result + num; + addend = ((addend < result ? 0x80000000 : 0) | addend); + result = addend; + } + return result >> extra.info.shift_; +} + +static inline uint32_t ApplyUint32UMod(uint32_t num, uint32_t mul, + uint32_t extradata, uint32_t original) { + uint32_t dived = ApplyUint32Div(num, mul, extradata); + unsigned result = num - (dived * original); + + return result; +} + +struct MagicDiv { + uint32_t multiplier_; + uint32_t extradata_; + MagicDiv() : multiplier_(0), extradata_(0) {} + MagicDiv(uint32_t original) { + multiplier_ = PregenerateUint32Div(original, extradata_); + } + + uint32_t Apply(uint32_t numerator) const { + return ApplyUint32Div(numerator, multiplier_, extradata_); + } +}; +} // namespace divisors } // namespace xe #endif // XENIA_BASE_MATH_H_ diff --git a/src/xenia/base/memory.h b/src/xenia/base/memory.h index 1afeedc14..b924c267c 100644 --- a/src/xenia/base/memory.h +++ b/src/xenia/base/memory.h @@ -672,25 +672,58 @@ static void Prefetch(const void* addr) { #define XE_MSVC_REORDER_BARRIER() static_cast(0) #endif #if XE_ARCH_AMD64 == 1 - +union alignas(XE_HOST_CACHE_LINE_SIZE) CacheLine { + struct { + __m256 low32; + __m256 high32; + }; + struct { + __m128i xmms[4]; + }; + float floats[XE_HOST_CACHE_LINE_SIZE / sizeof(float)]; +}; XE_FORCEINLINE -static void WriteLineNT(void* destination, const void* source) { - assert((reinterpret_cast(destination) & 63ULL) == 0); - __m256i low = _mm256_loadu_si256((const __m256i*)source); - __m256i high = _mm256_loadu_si256(&((const __m256i*)source)[1]); - XE_MSVC_REORDER_BARRIER(); - _mm256_stream_si256((__m256i*)destination, low); - _mm256_stream_si256(&((__m256i*)destination)[1], high); +static void WriteLineNT(CacheLine* XE_RESTRICT destination, + const CacheLine* XE_RESTRICT source) { + assert_true((reinterpret_cast(destination) & 63ULL) == 0); + __m256 low = _mm256_loadu_ps(&source->floats[0]); + __m256 high = _mm256_loadu_ps(&source->floats[8]); + _mm256_stream_ps(&destination->floats[0], low); + _mm256_stream_ps(&destination->floats[8], high); } XE_FORCEINLINE -static void ReadLineNT(void* destination, const void* source) { - assert((reinterpret_cast(source) & 63ULL) == 0); - __m256i low = _mm256_stream_load_si256((const __m256i*)source); - __m256i high = _mm256_stream_load_si256(&((const __m256i*)source)[1]); - XE_MSVC_REORDER_BARRIER(); - 
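/* Producer-side pattern implied by the swcache API above (a sketch assuming
   WriteFence is an sfence on AMD64; its counterparts shown nearby are
   lfence/mfence):

     alignas(64) xe::swcache::CacheLine staging;
     xe::swcache::ReadLine(&staging, src_line);     // ordinary cached load
     xe::swcache::WriteLineNT(dst_line, &staging);  // movnt, bypasses cache
     xe::swcache::WriteFence();                     // order NT stores before
                                                    // publishing dst_line

   Non-temporal stores are weakly ordered, so the fence is what makes the
   copied lines visible to a consumer in a defined order. */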
_mm256_storeu_si256((__m256i*)destination, low); - _mm256_storeu_si256(&((__m256i*)destination)[1], high); +static void ReadLineNT(CacheLine* XE_RESTRICT destination, + const CacheLine* XE_RESTRICT source) { + assert_true((reinterpret_cast(source) & 63ULL) == 0); + + __m128i first = _mm_stream_load_si128(&source->xmms[0]); + __m128i second = _mm_stream_load_si128(&source->xmms[1]); + __m128i third = _mm_stream_load_si128(&source->xmms[2]); + __m128i fourth = _mm_stream_load_si128(&source->xmms[3]); + + destination->xmms[0] = first; + destination->xmms[1] = second; + destination->xmms[2] = third; + destination->xmms[3] = fourth; +} +XE_FORCEINLINE +static void ReadLine(CacheLine* XE_RESTRICT destination, + const CacheLine* XE_RESTRICT source) { + assert_true((reinterpret_cast(source) & 63ULL) == 0); + __m256 low = _mm256_loadu_ps(&source->floats[0]); + __m256 high = _mm256_loadu_ps(&source->floats[8]); + _mm256_storeu_ps(&destination->floats[0], low); + _mm256_storeu_ps(&destination->floats[8], high); +} +XE_FORCEINLINE +static void WriteLine(CacheLine* XE_RESTRICT destination, + const CacheLine* XE_RESTRICT source) { + assert_true((reinterpret_cast(destination) & 63ULL) == 0); + __m256 low = _mm256_loadu_ps(&source->floats[0]); + __m256 high = _mm256_loadu_ps(&source->floats[8]); + _mm256_storeu_ps(&destination->floats[0], low); + _mm256_storeu_ps(&destination->floats[8], high); } XE_FORCEINLINE @@ -699,19 +732,29 @@ XE_FORCEINLINE static void ReadFence() { _mm_lfence(); } XE_FORCEINLINE static void ReadWriteFence() { _mm_mfence(); } + #else - +union alignas(XE_HOST_CACHE_LINE_SIZE) CacheLine { + uint8_t bvals[XE_HOST_CACHE_LINE_SIZE]; +}; XE_FORCEINLINE -static void WriteLineNT(void* destination, const void* source) { - assert((reinterpret_cast(destination) & 63ULL) == 0); - memcpy(destination, source, 64); +static void WriteLineNT(CacheLine* destination, const CacheLine* source) { + memcpy(destination, source, XE_HOST_CACHE_LINE_SIZE); } XE_FORCEINLINE -static void ReadLineNT(void* destination, const void* source) { - assert((reinterpret_cast(source) & 63ULL) == 0); - memcpy(destination, source, 64); +static void ReadLineNT(CacheLine* destination, const CacheLine* source) { + memcpy(destination, source, XE_HOST_CACHE_LINE_SIZE); } +XE_FORCEINLINE +static void WriteLine(CacheLine* destination, const CacheLine* source) { + memcpy(destination, source, XE_HOST_CACHE_LINE_SIZE); +} +XE_FORCEINLINE +static void ReadLine(CacheLine* destination, const CacheLine* source) { + memcpy(destination, source, XE_HOST_CACHE_LINE_SIZE); +} + XE_FORCEINLINE static void WriteFence() {} XE_FORCEINLINE @@ -720,6 +763,47 @@ XE_FORCEINLINE static void ReadWriteFence() {} #endif } // namespace swcache + +template +static void smallcpy_const(void* destination, const void* source) { +#if XE_ARCH_AMD64 == 1 && XE_COMPILER_MSVC == 1 + if constexpr ((Size & 7) == 0) { + __movsq((unsigned long long*)destination, (const unsigned long long*)source, + Size / 8); + } else if constexpr ((Size & 3) == 0) { + __movsd((unsigned long*)destination, (const unsigned long*)source, + Size / 4); + // dont even bother with movsw, i think the operand size override prefix + // slows it down + } else { + __movsb((unsigned char*)destination, (const unsigned char*)source, Size); + } +#else + memcpy(destination, source, Size); +#endif +} +template +static void smallset_const(void* destination, unsigned char fill_value) { +#if XE_ARCH_AMD64 == 1 && XE_COMPILER_MSVC == 1 + if constexpr ((Size & 7) == 0) { + unsigned long long fill = + 
static_cast(fill_value) * 0x0101010101010101ULL; + + __stosq((unsigned long long*)destination, fill, Size / 8); + } else if constexpr ((Size & 3) == 0) { + static constexpr unsigned long fill = + static_cast(fill_value) * 0x01010101U; + __stosd((unsigned long*)destination, fill, Size / 4); + // dont even bother with movsw, i think the operand size override prefix + // slows it down + } else { + __stosb((unsigned char*)destination, fill_value, Size); + } +#else + memset(destination, fill_value, Size); +#endif +} + } // namespace xe #endif // XENIA_BASE_MEMORY_H_ diff --git a/src/xenia/base/ring_buffer.h b/src/xenia/base/ring_buffer.h index a4befb686..84d5893b5 100644 --- a/src/xenia/base/ring_buffer.h +++ b/src/xenia/base/ring_buffer.h @@ -59,16 +59,8 @@ class RingBuffer { // subtract instead void set_read_offset(size_t offset) { read_offset_ = offset % capacity_; } ring_size_t read_count() const { -// chrispy: these branches are unpredictable -#if 0 - if (read_offset_ == write_offset_) { - return 0; - } else if (read_offset_ < write_offset_) { - return write_offset_ - read_offset_; - } else { - return (capacity_ - read_offset_) + write_offset_; - } -#else + // chrispy: these branches are unpredictable + ring_size_t read_offs = read_offset_; ring_size_t write_offs = write_offset_; ring_size_t cap = capacity_; @@ -77,14 +69,6 @@ class RingBuffer { ring_size_t wrap_read_count = (cap - read_offs) + write_offs; ring_size_t comparison_value = read_offs <= write_offs; -#if 0 - size_t selector = - static_cast(-static_cast(comparison_value)); - offset_delta &= selector; - - wrap_read_count &= ~selector; - return offset_delta | wrap_read_count; -#else if (XE_LIKELY(read_offs <= write_offs)) { return offset_delta; // will be 0 if they are equal, semantically @@ -93,8 +77,6 @@ class RingBuffer { } else { return wrap_read_count; } -#endif -#endif } ring_size_t write_offset() const { return write_offset_; } @@ -116,9 +98,9 @@ class RingBuffer { void AdvanceWrite(size_t count); struct ReadRange { - const uint8_t* first; + const uint8_t* XE_RESTRICT first; - const uint8_t* second; + const uint8_t* XE_RESTRICT second; ring_size_t first_length; ring_size_t second_length; }; @@ -126,9 +108,11 @@ class RingBuffer { void EndRead(ReadRange read_range); /* - BeginRead, but if there is a second Range it will prefetch all lines of it + BeginRead, but if there is a second Range it will prefetch all lines of + it - this does not prefetch the first range, because software prefetching can do that faster than we can + this does not prefetch the first range, because software + prefetching can do that faster than we can */ template XE_FORCEINLINE ReadRange BeginPrefetchedRead(size_t count) { @@ -138,7 +122,7 @@ class RingBuffer { ring_size_t numlines = xe::align(range.second_length, XE_HOST_CACHE_LINE_SIZE) / XE_HOST_CACHE_LINE_SIZE; - //chrispy: maybe unroll? + // chrispy: maybe unroll? 
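/* Worked example for the prefetch count above (illustrative): a wrapped
   second range of 200 bytes gives numlines = xe::align(200, 64) / 64 = 4,
   so the loop below issues four Prefetch calls, one per 64-byte host cache
   line of the wrapped region. */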
for (ring_size_t i = 0; i < numlines; ++i) { swcache::Prefetch(range.second + (i * XE_HOST_CACHE_LINE_SIZE)); } @@ -187,7 +171,7 @@ class RingBuffer { } private: - uint8_t* buffer_ = nullptr; + uint8_t* XE_RESTRICT buffer_ = nullptr; ring_size_t capacity_ = 0; ring_size_t read_offset_ = 0; ring_size_t write_offset_ = 0; diff --git a/src/xenia/base/split_map.h b/src/xenia/base/split_map.h new file mode 100644 index 000000000..510c2ed70 --- /dev/null +++ b/src/xenia/base/split_map.h @@ -0,0 +1,87 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2019 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#ifndef XENIA_BASE_SPLIT_MAP_H_ +#define XENIA_BASE_SPLIT_MAP_H_ +#include +#include +namespace xe { +/* + a map structure that is optimized for infrequent + reallocation/resizing/erasure and frequent searches by key implemented as 2 + std::vectors, one of the keys and one of the values +*/ +template +class split_map { + using key_vector = std::vector; + using value_vector = std::vector; + + key_vector keys_; + value_vector values_; + + public: + using my_type = split_map; + + uint32_t IndexForKey(const TKey& k) { + auto lbound = std::lower_bound(keys_.begin(), keys_.end(), k); + return static_cast(lbound - keys_.begin()); + } + + uint32_t size() const { return static_cast(keys_.size()); } + key_vector& Keys() { return keys_; } + value_vector& Values() { return values_; } + void clear() { + keys_.clear(); + values_.clear(); + } + void resize(uint32_t new_size) { + keys_.resize(static_cast(new_size)); + values_.resize(static_cast(new_size)); + } + + void reserve(uint32_t new_size) { + keys_.reserve(static_cast(new_size)); + values_.reserve(static_cast(new_size)); + } + const TKey* KeyAt(uint32_t index) const { + if (index == size()) { + return nullptr; + } else { + return &keys_[index]; + } + } + const TValue* ValueAt(uint32_t index) const { + if (index == size()) { + return nullptr; + } else { + return &values_[index]; + } + } + + void InsertAt(TKey k, TValue v, uint32_t idx) { + uint32_t old_size = size(); + + bool needs_shiftup = idx != old_size; + + values_.insert(values_.begin() + idx, v); + keys_.insert(keys_.begin() + idx, k); + } + void EraseAt(uint32_t idx) { + uint32_t old_size = size(); + if (idx == old_size) { + return; // trying to erase nonexistent entry + } else { + values_.erase(values_.begin() + idx); + keys_.erase(keys_.begin() + idx); + } + } +}; +} // namespace xe + +#endif // XENIA_BASE_SPLIT_MAP_H_ \ No newline at end of file diff --git a/src/xenia/cpu/backend/x64/x64_emitter.cc b/src/xenia/cpu/backend/x64/x64_emitter.cc index d1394d202..17abb72e7 100644 --- a/src/xenia/cpu/backend/x64/x64_emitter.cc +++ b/src/xenia/cpu/backend/x64/x64_emitter.cc @@ -57,7 +57,11 @@ DEFINE_bool(enable_incorrect_roundingmode_behavior, false, "code. The workaround may cause reduced CPU performance but is a " "more accurate emulation", "x64"); - +DEFINE_uint32(align_all_basic_blocks, 0, + "Aligns the start of all basic blocks to N bytes. Only specify a " + "power of 2, 16 is the recommended value. 
Results in larger " + "icache usage, but potentially faster loops", + "x64"); #if XE_X64_PROFILER_AVAILABLE == 1 DEFINE_bool(instrument_call_times, false, "Compute time taken for functions, for profiling guest code", @@ -110,7 +114,6 @@ X64Emitter::X64Emitter(X64Backend* backend, XbyakAllocator* allocator) TEST_EMIT_FEATURE(kX64EmitLZCNT, Xbyak::util::Cpu::tLZCNT); TEST_EMIT_FEATURE(kX64EmitBMI1, Xbyak::util::Cpu::tBMI1); TEST_EMIT_FEATURE(kX64EmitBMI2, Xbyak::util::Cpu::tBMI2); - TEST_EMIT_FEATURE(kX64EmitF16C, Xbyak::util::Cpu::tF16C); TEST_EMIT_FEATURE(kX64EmitMovbe, Xbyak::util::Cpu::tMOVBE); TEST_EMIT_FEATURE(kX64EmitGFNI, Xbyak::util::Cpu::tGFNI); TEST_EMIT_FEATURE(kX64EmitAVX512F, Xbyak::util::Cpu::tAVX512F); @@ -200,7 +203,55 @@ bool X64Emitter::Emit(GuestFunction* function, HIRBuilder* builder, return true; } +#pragma pack(push, 1) +struct RGCEmitted { + uint8_t ff_; + uint32_t rgcid_; +}; +#pragma pack(pop) +#if 0 +void X64Emitter::InjectCallAddresses(void* new_execute_address) { + for (auto&& callsite : call_sites_) { + RGCEmitted* hunter = (RGCEmitted*)new_execute_address; + while (hunter->ff_ != 0xFF || hunter->rgcid_ != callsite.offset_) { + hunter = + reinterpret_cast(reinterpret_cast(hunter) + 1); + } + + hunter->ff_ = callsite.is_jump_ ? 0xE9 : 0xE8; + hunter->rgcid_ = + static_cast(static_cast(callsite.destination_) - + reinterpret_cast(hunter + 1)); + } +} + +#else +void X64Emitter::InjectCallAddresses(void* new_execute_address) { +#if 0 + RGCEmitted* hunter = (RGCEmitted*)new_execute_address; + + std::map id_to_rgc{}; + + for (auto&& callsite : call_sites_) { + id_to_rgc[callsite.offset_] = &callsite; + } +#else + RGCEmitted* hunter = (RGCEmitted*)new_execute_address; + for (auto&& callsite : call_sites_) { + while (hunter->ff_ != 0xFF || hunter->rgcid_ != callsite.offset_) { + hunter = + reinterpret_cast(reinterpret_cast(hunter) + 1); + } + + hunter->ff_ = callsite.is_jump_ ? 0xE9 : 0xE8; + hunter->rgcid_ = + static_cast(static_cast(callsite.destination_) - + reinterpret_cast(hunter + 1)); + } +#endif +} +#endif void* X64Emitter::Emplace(const EmitFunctionInfo& func_info, GuestFunction* function) { // To avoid changing xbyak, we do a switcharoo here. @@ -218,25 +269,9 @@ void* X64Emitter::Emplace(const EmitFunctionInfo& func_info, if (function) { code_cache_->PlaceGuestCode(function->address(), top_, func_info, function, new_execute_address, new_write_address); - if (cvars::resolve_rel32_guest_calls) { - for (auto&& callsite : call_sites_) { -#pragma pack(push, 1) - struct RGCEmitted { - uint8_t ff_; - uint32_t rgcid_; - }; -#pragma pack(pop) - RGCEmitted* hunter = (RGCEmitted*)new_execute_address; - while (hunter->ff_ != 0xFF || hunter->rgcid_ != callsite.offset_) { - hunter = reinterpret_cast( - reinterpret_cast(hunter) + 1); - } - hunter->ff_ = callsite.is_jump_ ? 0xE9 : 0xE8; - hunter->rgcid_ = - static_cast(static_cast(callsite.destination_) - - reinterpret_cast(hunter + 1)); - } + if (cvars::resolve_rel32_guest_calls) { + InjectCallAddresses(new_execute_address); } } else { code_cache_->PlaceHostCode(0, top_, func_info, new_execute_address, @@ -367,6 +402,9 @@ bool X64Emitter::Emit(HIRBuilder* builder, EmitFunctionInfo& func_info) { label = label->next; } + if (cvars::align_all_basic_blocks) { + align(cvars::align_all_basic_blocks, true); + } // Process instructions. 
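/* Sketch of the call-site patching done by InjectCallAddresses above (not
   part of the patch): with resolve_rel32_guest_calls enabled, each guest
   call is emitted as a 5-byte placeholder, 0xFF followed by the 32-bit site
   id (RGCEmitted). Once the final executable address is known, the scan
   rewrites each placeholder in place:

     before:  FF <id32>
     after:   E8 <rel32>   (call)   or   E9 <rel32>   (jmp)

   where rel32 = callsite.destination_ - (uintptr_t)(hunter + 1), i.e. the
   displacement from the end of the patched instruction. */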
const Instr* instr = block->instr_head; while (instr) { @@ -1000,12 +1038,6 @@ static const vec128_t xmm_consts[] = { vec128i(0x7f800000), /* XMMThreeFloatMask */ vec128i(~0U, ~0U, ~0U, 0U), - /*XMMXenosF16ExtRangeStart*/ - vec128f(65504), - /*XMMVSRShlByteshuf*/ - v128_setr_bytes(13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3, 0x80), - // XMMVSRMask - vec128b(1), /* XMMF16UnpackLCPI2 */ @@ -1036,8 +1068,7 @@ static const vec128_t xmm_consts[] = { /*XMMXOPWordShiftMask*/ vec128s(15), /*XMMXOPDwordShiftMask*/ - vec128i(31) -}; + vec128i(31)}; void* X64Emitter::FindByteConstantOffset(unsigned bytevalue) { for (auto& vec : xmm_consts) { diff --git a/src/xenia/cpu/backend/x64/x64_emitter.h b/src/xenia/cpu/backend/x64/x64_emitter.h index 01027cc0c..b20978ea3 100644 --- a/src/xenia/cpu/backend/x64/x64_emitter.h +++ b/src/xenia/cpu/backend/x64/x64_emitter.h @@ -157,9 +157,6 @@ enum XmmConst { XMMLVSRTableBase, XMMSingleDenormalMask, XMMThreeFloatMask, // for clearing the fourth float prior to DOT_PRODUCT_3 - XMMXenosF16ExtRangeStart, - XMMVSRShlByteshuf, - XMMVSRMask, XMMF16UnpackLCPI2, // 0x38000000, 1/ 32768 XMMF16UnpackLCPI3, // 0x0x7fe000007fe000 XMMF16PackLCPI0, @@ -194,7 +191,7 @@ enum X64EmitterFeatureFlags { kX64EmitLZCNT = 1 << 2, // this is actually ABM and includes popcount kX64EmitBMI1 = 1 << 3, kX64EmitBMI2 = 1 << 4, - kX64EmitF16C = 1 << 5, + kX64EmitPrefetchW = 1 << 5, kX64EmitMovbe = 1 << 6, kX64EmitGFNI = 1 << 7, @@ -215,11 +212,14 @@ enum X64EmitterFeatureFlags { // inc/dec) do not introduce false dependencies on EFLAGS // because the individual flags are treated as different vars by // the processor. (this applies to zen) - kX64EmitPrefetchW = 1 << 16, - kX64EmitXOP = 1 << 17, // chrispy: xop maps really well to many vmx + kX64EmitXOP = 1 << 16, // chrispy: xop maps really well to many vmx // instructions, and FX users need the boost - kX64EmitFMA4 = 1 << 18, // todo: also use on zen1? - kX64EmitTBM = 1 << 19 + kX64EmitFMA4 = 1 << 17, // todo: also use on zen1? + kX64EmitTBM = 1 << 18, + // kX64XMMRegisterMergeOptimization = 1 << 19, //section 2.11.5, amd family + // 17h/19h optimization manuals. 
allows us to save 1 byte on certain xmm + // instructions by using the legacy sse version if we recently cleared the + // high 128 bits of the }; class ResolvableGuestCall { public: @@ -251,6 +251,7 @@ class X64Emitter : public Xbyak::CodeGenerator { uint32_t debug_info_flags, FunctionDebugInfo* debug_info, void** out_code_address, size_t* out_code_size, std::vector* out_source_map); + void InjectCallAddresses(void* new_execute_addr); public: // Reserved: rsp, rsi, rdi diff --git a/src/xenia/cpu/backend/x64/x64_op.h b/src/xenia/cpu/backend/x64/x64_op.h index b9257f179..78c459101 100644 --- a/src/xenia/cpu/backend/x64/x64_op.h +++ b/src/xenia/cpu/backend/x64/x64_op.h @@ -43,23 +43,23 @@ enum KeyType { KEY_TYPE_V_F64 = OPCODE_SIG_TYPE_V + FLOAT64_TYPE, KEY_TYPE_V_V128 = OPCODE_SIG_TYPE_V + VEC128_TYPE, }; - +using InstrKeyValue = uint32_t; #pragma pack(push, 1) union InstrKey { - uint32_t value; + InstrKeyValue value; struct { - uint32_t opcode : 8; - uint32_t dest : 5; - uint32_t src1 : 5; - uint32_t src2 : 5; - uint32_t src3 : 5; - uint32_t reserved : 4; + InstrKeyValue opcode : 8; + InstrKeyValue dest : 5; + InstrKeyValue src1 : 5; + InstrKeyValue src2 : 5; + InstrKeyValue src3 : 5; + InstrKeyValue reserved : 4; }; - operator uint32_t() const { return value; } + operator InstrKeyValue() const { return value; } InstrKey() : value(0) { static_assert_size(*this, sizeof(value)); } - InstrKey(uint32_t v) : value(v) {} + InstrKey(InstrKeyValue v) : value(v) {} // this used to take about 1% cpu while precompiling // it kept reloading opcode, and also constantly repacking and unpacking the @@ -67,16 +67,16 @@ union InstrKey { InstrKey(const Instr* i) : value(0) { const OpcodeInfo* info = i->GetOpcodeInfo(); - uint32_t sig = info->signature; + InstrKeyValue sig = info->signature; OpcodeSignatureType dest_type, src1_type, src2_type, src3_type; UnpackOpcodeSig(sig, dest_type, src1_type, src2_type, src3_type); - uint32_t out_desttype = (uint32_t)dest_type; - uint32_t out_src1type = (uint32_t)src1_type; - uint32_t out_src2type = (uint32_t)src2_type; - uint32_t out_src3type = (uint32_t)src3_type; + InstrKeyValue out_desttype = (InstrKeyValue)dest_type; + InstrKeyValue out_src1type = (InstrKeyValue)src1_type; + InstrKeyValue out_src2type = (InstrKeyValue)src2_type; + InstrKeyValue out_src3type = (InstrKeyValue)src3_type; Value* destv = i->dest; // pre-deref, even if not value @@ -105,7 +105,7 @@ union InstrKey { template struct Construct { - static const uint32_t value = + static const InstrKeyValue value = (OPCODE) | (DEST << 8) | (SRC1 << 13) | (SRC2 << 18) | (SRC3 << 23); }; }; @@ -307,8 +307,8 @@ struct I : DestField { protected: template friend struct Sequence; - bool Load(const Instr* i) { - if (InstrKey(i).value == key && BASE::LoadDest(i)) { + bool Load(const Instr* i, InstrKeyValue kv) { + if (kv == key && BASE::LoadDest(i)) { instr = i; return true; } @@ -329,8 +329,8 @@ struct I : DestField { protected: template friend struct Sequence; - bool Load(const Instr* i) { - if (InstrKey(i).value == key && BASE::LoadDest(i)) { + bool Load(const Instr* i, InstrKeyValue kv) { + if (kv == key && BASE::LoadDest(i)) { instr = i; src1.Load(i->src1); return true; @@ -355,8 +355,8 @@ struct I : DestField { protected: template friend struct Sequence; - bool Load(const Instr* i) { - if (InstrKey(i).value == key && BASE::LoadDest(i)) { + bool Load(const Instr* i, InstrKeyValue kv) { + if (kv == key && BASE::LoadDest(i)) { instr = i; src1.Load(i->src1); src2.Load(i->src2); @@ -385,8 +385,8 @@ struct I : 
DestField { protected: template friend struct Sequence; - bool Load(const Instr* i) { - if (InstrKey(i).value == key && BASE::LoadDest(i)) { + bool Load(const Instr* i, InstrKeyValue ikey) { + if (ikey == key && BASE::LoadDest(i)) { instr = i; src1.Load(i->src1); src2.Load(i->src2); @@ -422,9 +422,9 @@ struct Sequence { static constexpr uint32_t head_key() { return T::key; } - static bool Select(X64Emitter& e, const Instr* i) { + static bool Select(X64Emitter& e, const Instr* i, InstrKeyValue ikey) { T args; - if (!args.Load(i)) { + if (!args.Load(i, ikey)) { return false; } SEQ::Emit(e, args); diff --git a/src/xenia/cpu/backend/x64/x64_seq_control.cc b/src/xenia/cpu/backend/x64/x64_seq_control.cc index dc5fa7d3d..54e7ac8a0 100644 --- a/src/xenia/cpu/backend/x64/x64_seq_control.cc +++ b/src/xenia/cpu/backend/x64/x64_seq_control.cc @@ -27,12 +27,6 @@ static void EmitFusedBranch(X64Emitter& e, const T& i) { if (valid) { auto name = i.src2.value->name; switch (opcode) { - case OPCODE_IS_TRUE: - e.jnz(name, e.T_NEAR); - break; - case OPCODE_IS_FALSE: - e.jz(name, e.T_NEAR); - break; case OPCODE_COMPARE_EQ: e.je(name, e.T_NEAR); break; @@ -299,26 +293,14 @@ struct CALL_TRUE_I64 struct CALL_TRUE_F32 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - assert_true(i.src2.value->is_guest()); - e.vptest(i.src1, i.src1); - Xbyak::Label skip; - e.jz(skip); - e.Call(i.instr, static_cast(i.src2.value)); - e.L(skip); - e.ForgetMxcsrMode(); + assert_impossible_sequence(CALL_TRUE_F32); } }; struct CALL_TRUE_F64 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - assert_true(i.src2.value->is_guest()); - e.vptest(i.src1, i.src1); - Xbyak::Label skip; - e.jz(skip); - e.Call(i.instr, static_cast(i.src2.value)); - e.L(skip); - e.ForgetMxcsrMode(); + assert_impossible_sequence(CALL_TRUE_F64); } }; EMITTER_OPCODE_TABLE(OPCODE_CALL_TRUE, CALL_TRUE_I8, CALL_TRUE_I16, @@ -404,22 +386,14 @@ struct CALL_INDIRECT_TRUE_F32 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - e.vptest(i.src1, i.src1); - Xbyak::Label skip; - e.jz(skip, CodeGenerator::T_NEAR); - e.CallIndirect(i.instr, i.src2); - e.L(skip); + assert_impossible_sequence(CALL_INDIRECT_TRUE_F32); } }; struct CALL_INDIRECT_TRUE_F64 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - e.vptest(i.src1, i.src1); - Xbyak::Label skip; - e.jz(skip, CodeGenerator::T_NEAR); - e.CallIndirect(i.instr, i.src2); - e.L(skip); + assert_impossible_sequence(CALL_INDIRECT_TRUE_F64); } }; EMITTER_OPCODE_TABLE(OPCODE_CALL_INDIRECT_TRUE, CALL_INDIRECT_TRUE_I8, @@ -486,15 +460,13 @@ struct RETURN_TRUE_I64 struct RETURN_TRUE_F32 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - e.vptest(i.src1, i.src1); - e.jnz(e.epilog_label(), CodeGenerator::T_NEAR); + assert_impossible_sequence(RETURN_TRUE_F32); } }; struct RETURN_TRUE_F64 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - e.vptest(i.src1, i.src1); - e.jnz(e.epilog_label(), CodeGenerator::T_NEAR); + assert_impossible_sequence(RETURN_TRUE_F64); } }; EMITTER_OPCODE_TABLE(OPCODE_RETURN_TRUE, RETURN_TRUE_I8, RETURN_TRUE_I16, @@ -553,33 +525,25 @@ struct BRANCH_TRUE_I64 struct BRANCH_TRUE_F32 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - if (i.instr->prev && i.instr->prev->opcode == &OPCODE_IS_TRUE_info && - i.instr->prev->dest == i.src1.value) { - e.jnz(i.src2.value->name, e.T_NEAR); - } else if (i.instr->prev && - i.instr->prev->opcode == &OPCODE_IS_FALSE_info && - i.instr->prev->dest == 
i.src1.value) { - e.jz(i.src2.value->name, e.T_NEAR); - } else { - e.vptest(i.src1, i.src1); - e.jnz(i.src2.value->name, e.T_NEAR); - } + /* + chrispy: right now, im not confident that we are always clearing + the upper 96 bits of registers, making vptest extremely unsafe. many + ss/sd operations copy over the upper 96 from the source, and for abs we + negate ALL elements, making the top 64 bits contain 0x80000000 etc + */ + Xmm input = GetInputRegOrConstant(e, i.src1, e.xmm0); + e.vmovd(e.eax, input); + e.test(e.eax, e.eax); + e.jnz(i.src2.value->name, e.T_NEAR); } }; struct BRANCH_TRUE_F64 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - if (i.instr->prev && i.instr->prev->opcode == &OPCODE_IS_TRUE_info && - i.instr->prev->dest == i.src1.value) { - e.jnz(i.src2.value->name, e.T_NEAR); - } else if (i.instr->prev && - i.instr->prev->opcode == &OPCODE_IS_FALSE_info && - i.instr->prev->dest == i.src1.value) { - e.jz(i.src2.value->name, e.T_NEAR); - } else { - e.vptest(i.src1, i.src1); - e.jnz(i.src2.value->name, e.T_NEAR); - } + Xmm input = GetInputRegOrConstant(e, i.src1, e.xmm0); + e.vmovq(e.rax, input); + e.test(e.rax, e.rax); + e.jnz(i.src2.value->name, e.T_NEAR); } }; EMITTER_OPCODE_TABLE(OPCODE_BRANCH_TRUE, BRANCH_TRUE_I8, BRANCH_TRUE_I16, @@ -624,7 +588,9 @@ struct BRANCH_FALSE_F32 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - e.vptest(i.src1, i.src1); + Xmm input = GetInputRegOrConstant(e, i.src1, e.xmm0); + e.vmovd(e.eax, input); + e.test(e.eax, e.eax); e.jz(i.src2.value->name, e.T_NEAR); } }; @@ -632,7 +598,9 @@ struct BRANCH_FALSE_F64 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - e.vptest(i.src1, i.src1); + Xmm input = GetInputRegOrConstant(e, i.src1, e.xmm0); + e.vmovq(e.rax, input); + e.test(e.rax, e.rax); e.jz(i.src2.value->name, e.T_NEAR); } }; diff --git a/src/xenia/cpu/backend/x64/x64_seq_memory.cc b/src/xenia/cpu/backend/x64/x64_seq_memory.cc index 70865c30e..cc6c1cf32 100644 --- a/src/xenia/cpu/backend/x64/x64_seq_memory.cc +++ b/src/xenia/cpu/backend/x64/x64_seq_memory.cc @@ -975,6 +975,9 @@ static bool IsPossibleMMIOInstruction(X64Emitter& e, const hir::Instr* i) { if (!cvars::emit_mmio_aware_stores_for_recorded_exception_addresses) { return false; } + if (IsTracingData()) { // incompatible with tracing + return false; + } uint32_t guestaddr = i->GuestAddressFor(); if (!guestaddr) { return false; @@ -984,7 +987,54 @@ static bool IsPossibleMMIOInstruction(X64Emitter& e, const hir::Instr* i) { return flags && flags->accessed_mmio; } +template +static void MMIOAwareStore(void* _ctx, unsigned int guestaddr, T value) { + if (swap) { + value = xe::byte_swap(value); + } + if (guestaddr >= 0xE0000000) { + guestaddr += 0x1000; + } + auto ctx = reinterpret_cast(_ctx); + + auto gaddr = ctx->processor->memory()->LookupVirtualMappedRange(guestaddr); + if (!gaddr) { + *reinterpret_cast(ctx->virtual_membase + guestaddr) = value; + } else { + value = xe::byte_swap(value); /* + was having issues, found by comparing the values used with exceptions + to these that we were reversed... 
+ */ + gaddr->write(nullptr, gaddr->callback_context, guestaddr, value); + } +} + +template +static T MMIOAwareLoad(void* _ctx, unsigned int guestaddr) { + T value; + + if (guestaddr >= 0xE0000000) { + guestaddr += 0x1000; + } + + auto ctx = reinterpret_cast(_ctx); + + auto gaddr = ctx->processor->memory()->LookupVirtualMappedRange(guestaddr); + if (!gaddr) { + value = *reinterpret_cast(ctx->virtual_membase + guestaddr); + if (swap) { + value = xe::byte_swap(value); + } + } else { + /* + was having issues, found by comparing the values used with exceptions + to these that we were reversed... + */ + value = gaddr->read(nullptr, gaddr->callback_context, guestaddr); + } + return value; +} // ============================================================================ // OPCODE_LOAD_OFFSET // ============================================================================ @@ -1016,16 +1066,38 @@ struct LOAD_OFFSET_I16 struct LOAD_OFFSET_I32 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - auto addr = ComputeMemoryAddressOffset(e, i.src1, i.src2); - if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { - if (e.IsFeatureEnabled(kX64EmitMovbe)) { - e.movbe(i.dest, e.dword[addr]); + if (IsPossibleMMIOInstruction(e, i.instr)) { + void* addrptr = (void*)&MMIOAwareLoad; + + if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { + addrptr = (void*)&MMIOAwareLoad; + } + if (i.src1.is_constant) { + e.mov(e.GetNativeParam(0).cvt32(), (uint32_t)i.src1.constant()); + } else { + e.mov(e.GetNativeParam(0).cvt32(), i.src1.reg().cvt32()); + } + + if (i.src2.is_constant) { + e.add(e.GetNativeParam(0).cvt32(), (uint32_t)i.src2.constant()); + } else { + e.add(e.GetNativeParam(0).cvt32(), i.src2.reg().cvt32()); + } + + e.CallNativeSafe(addrptr); + e.mov(i.dest, e.eax); + } else { + auto addr = ComputeMemoryAddressOffset(e, i.src1, i.src2); + if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { + if (e.IsFeatureEnabled(kX64EmitMovbe)) { + e.movbe(i.dest, e.dword[addr]); + } else { + e.mov(i.dest, e.dword[addr]); + e.bswap(i.dest); + } } else { e.mov(i.dest, e.dword[addr]); - e.bswap(i.dest); } - } else { - e.mov(i.dest, e.dword[addr]); } } }; @@ -1049,28 +1121,6 @@ struct LOAD_OFFSET_I64 EMITTER_OPCODE_TABLE(OPCODE_LOAD_OFFSET, LOAD_OFFSET_I8, LOAD_OFFSET_I16, LOAD_OFFSET_I32, LOAD_OFFSET_I64); -template -static void MMIOAwareStore(void* _ctx, unsigned int guestaddr, T value) { - if (swap) { - value = xe::byte_swap(value); - } - if (guestaddr >= 0xE0000000) { - guestaddr += 0x1000; - } - - auto ctx = reinterpret_cast(_ctx); - - auto gaddr = ctx->processor->memory()->LookupVirtualMappedRange(guestaddr); - if (!gaddr) { - *reinterpret_cast(ctx->virtual_membase + guestaddr) = value; - } else { - value = xe::byte_swap(value); /* - was having issues, found by comparing the values used with exceptions - to these that we were reversed... 
- */ - gaddr->write(nullptr, gaddr->callback_context, guestaddr, value); - } -} // ============================================================================ // OPCODE_STORE_OFFSET // ============================================================================ @@ -1225,21 +1275,37 @@ struct LOAD_I16 : Sequence> { }; struct LOAD_I32 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - auto addr = ComputeMemoryAddress(e, i.src1); - if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { - if (e.IsFeatureEnabled(kX64EmitMovbe)) { - e.movbe(i.dest, e.dword[addr]); + if (IsPossibleMMIOInstruction(e, i.instr)) { + void* addrptr = (void*)&MMIOAwareLoad; + + if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { + addrptr = (void*)&MMIOAwareLoad; + } + if (i.src1.is_constant) { + e.mov(e.GetNativeParam(0).cvt32(), (uint32_t)i.src1.constant()); + } else { + e.mov(e.GetNativeParam(0).cvt32(), i.src1.reg().cvt32()); + } + + e.CallNativeSafe(addrptr); + e.mov(i.dest, e.eax); + } else { + auto addr = ComputeMemoryAddress(e, i.src1); + if (i.instr->flags & LoadStoreFlags::LOAD_STORE_BYTE_SWAP) { + if (e.IsFeatureEnabled(kX64EmitMovbe)) { + e.movbe(i.dest, e.dword[addr]); + } else { + e.mov(i.dest, e.dword[addr]); + e.bswap(i.dest); + } } else { e.mov(i.dest, e.dword[addr]); - e.bswap(i.dest); } - } else { - e.mov(i.dest, e.dword[addr]); - } - if (IsTracingData()) { - e.mov(e.GetNativeParam(1).cvt32(), i.dest); - e.lea(e.GetNativeParam(0), e.ptr[addr]); - e.CallNative(reinterpret_cast(TraceMemoryLoadI32)); + if (IsTracingData()) { + e.mov(e.GetNativeParam(1).cvt32(), i.dest); + e.lea(e.GetNativeParam(0), e.ptr[addr]); + e.CallNative(reinterpret_cast(TraceMemoryLoadI32)); + } } } }; @@ -1390,14 +1456,13 @@ struct STORE_I32 : Sequence> { } else { e.mov(e.dword[addr], i.src2); } + if (IsTracingData()) { + e.mov(e.GetNativeParam(1).cvt32(), e.dword[addr]); + e.lea(e.GetNativeParam(0), e.ptr[addr]); + e.CallNative(reinterpret_cast(TraceMemoryStoreI32)); + } } } - if (IsTracingData()) { - auto addr = ComputeMemoryAddress(e, i.src1); - e.mov(e.GetNativeParam(1).cvt32(), e.dword[addr]); - e.lea(e.GetNativeParam(0), e.ptr[addr]); - e.CallNative(reinterpret_cast(TraceMemoryStoreI32)); - } } }; struct STORE_I64 : Sequence> { diff --git a/src/xenia/cpu/backend/x64/x64_seq_vector.cc b/src/xenia/cpu/backend/x64/x64_seq_vector.cc index 46eb285cf..e5843f37d 100644 --- a/src/xenia/cpu/backend/x64/x64_seq_vector.cc +++ b/src/xenia/cpu/backend/x64/x64_seq_vector.cc @@ -19,15 +19,15 @@ #include "xenia/base/cvar.h" #include "xenia/cpu/backend/x64/x64_stack_layout.h" -DEFINE_bool(xop_rotates, false, "rotate via xop", "X64"); +DEFINE_bool(xop_rotates, false, "rotate via xop", "x64"); -DEFINE_bool(xop_left_shifts, false, "shl via xop", "X64"); +DEFINE_bool(xop_left_shifts, false, "shl via xop", "x64"); -DEFINE_bool(xop_right_shifts, false, "shr via xop", "X64"); +DEFINE_bool(xop_right_shifts, false, "shr via xop", "x64"); -DEFINE_bool(xop_arithmetic_right_shifts, false, "sar via xop", "X64"); +DEFINE_bool(xop_arithmetic_right_shifts, false, "sar via xop", "x64"); -DEFINE_bool(xop_compares, true, "compare via xop", "X64"); +DEFINE_bool(xop_compares, true, "compare via xop", "x64"); namespace xe { namespace cpu { diff --git a/src/xenia/cpu/backend/x64/x64_sequences.cc b/src/xenia/cpu/backend/x64/x64_sequences.cc index 48e340d42..bf5addd3d 100644 --- a/src/xenia/cpu/backend/x64/x64_sequences.cc +++ b/src/xenia/cpu/backend/x64/x64_sequences.cc @@ -67,7 +67,7 @@ using namespace xe::cpu::hir; using 
xe::cpu::hir::Instr; -typedef bool (*SequenceSelectFn)(X64Emitter&, const Instr*); +typedef bool (*SequenceSelectFn)(X64Emitter&, const Instr*, InstrKeyValue ikey); std::unordered_map sequence_table; // ============================================================================ @@ -868,59 +868,6 @@ static bool MayCombineSetxWithFollowingCtxStore(const hir::Instr* setx_insn, } return false; } -#define EMITTER_IS_TRUE(typ, tester) \ - struct IS_TRUE_##typ \ - : Sequence> { \ - static void Emit(X64Emitter& e, const EmitArgType& i) { \ - e.tester(i.src1, i.src1); \ - unsigned ctxoffset = 0; \ - if (MayCombineSetxWithFollowingCtxStore(i.instr, ctxoffset)) { \ - e.setnz(e.byte[e.GetContextReg() + ctxoffset]); \ - } else { \ - e.setnz(i.dest); \ - } \ - } \ - } - -#define EMITTER_IS_TRUE_INT(typ) EMITTER_IS_TRUE(typ, test) - -EMITTER_IS_TRUE_INT(I8); -EMITTER_IS_TRUE_INT(I16); -EMITTER_IS_TRUE_INT(I32); -EMITTER_IS_TRUE_INT(I64); -EMITTER_IS_TRUE(F32, vtestps); -EMITTER_IS_TRUE(F64, vtestpd); - -EMITTER_IS_TRUE(V128, vptest); - -EMITTER_OPCODE_TABLE(OPCODE_IS_TRUE, IS_TRUE_I8, IS_TRUE_I16, IS_TRUE_I32, - IS_TRUE_I64, IS_TRUE_F32, IS_TRUE_F64, IS_TRUE_V128); - -#define EMITTER_IS_FALSE(typ, tester) \ - struct IS_FALSE_##typ \ - : Sequence> { \ - static void Emit(X64Emitter& e, const EmitArgType& i) { \ - e.tester(i.src1, i.src1); \ - unsigned ctxoffset = 0; \ - if (MayCombineSetxWithFollowingCtxStore(i.instr, ctxoffset)) { \ - e.setz(e.byte[e.GetContextReg() + ctxoffset]); \ - } else { \ - e.setz(i.dest); \ - } \ - } \ - } -#define EMITTER_IS_FALSE_INT(typ) EMITTER_IS_FALSE(typ, test) -EMITTER_IS_FALSE_INT(I8); -EMITTER_IS_FALSE_INT(I16); -EMITTER_IS_FALSE_INT(I32); -EMITTER_IS_FALSE_INT(I64); -EMITTER_IS_FALSE(F32, vtestps); -EMITTER_IS_FALSE(F64, vtestpd); - -EMITTER_IS_FALSE(V128, vptest); - -EMITTER_OPCODE_TABLE(OPCODE_IS_FALSE, IS_FALSE_I8, IS_FALSE_I16, IS_FALSE_I32, - IS_FALSE_I64, IS_FALSE_F32, IS_FALSE_F64, IS_FALSE_V128); // ============================================================================ // OPCODE_IS_NAN @@ -3308,7 +3255,7 @@ bool SelectSequence(X64Emitter* e, const Instr* i, const Instr** new_tail) { auto it = sequence_table.find(key); if (it != sequence_table.end()) { - if (it->second(*e, i)) { + if (it->second(*e, i, InstrKey(i))) { *new_tail = i->next; return true; } diff --git a/src/xenia/cpu/backend/x64/x64_sequences.h b/src/xenia/cpu/backend/x64/x64_sequences.h index 0ce6a7f57..05a5b42d4 100644 --- a/src/xenia/cpu/backend/x64/x64_sequences.h +++ b/src/xenia/cpu/backend/x64/x64_sequences.h @@ -25,7 +25,7 @@ namespace x64 { class X64Emitter; -typedef bool (*SequenceSelectFn)(X64Emitter&, const hir::Instr*); +typedef bool (*SequenceSelectFn)(X64Emitter&, const hir::Instr*, uint32_t ikey); extern std::unordered_map sequence_table; template diff --git a/src/xenia/cpu/compiler/passes/constant_propagation_pass.cc b/src/xenia/cpu/compiler/passes/constant_propagation_pass.cc index 26f0fecb4..2d1654030 100644 --- a/src/xenia/cpu/compiler/passes/constant_propagation_pass.cc +++ b/src/xenia/cpu/compiler/passes/constant_propagation_pass.cc @@ -361,28 +361,6 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) { } } break; - case OPCODE_IS_TRUE: - if (i->src1.value->IsConstant()) { - if (i->src1.value->IsConstantTrue()) { - v->set_constant(uint8_t(1)); - } else { - v->set_constant(uint8_t(0)); - } - i->Remove(); - result = true; - } - break; - case OPCODE_IS_FALSE: - if (i->src1.value->IsConstant()) { - if (i->src1.value->IsConstantFalse()) { - 
v->set_constant(uint8_t(1)); - } else { - v->set_constant(uint8_t(0)); - } - i->Remove(); - result = true; - } - break; case OPCODE_IS_NAN: if (i->src1.value->IsConstant()) { if (i->src1.value->type == FLOAT32_TYPE && @@ -602,7 +580,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) { if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) { if (should_skip_because_of_float) { break; - } + } v->set_from(i->src1.value); v->Max(i->src2.value); i->Remove(); diff --git a/src/xenia/cpu/compiler/passes/simplification_pass.cc b/src/xenia/cpu/compiler/passes/simplification_pass.cc index e94b570dc..a8c93c3b6 100644 --- a/src/xenia/cpu/compiler/passes/simplification_pass.cc +++ b/src/xenia/cpu/compiler/passes/simplification_pass.cc @@ -214,12 +214,7 @@ bool SimplificationPass::CheckBooleanXor1(hir::Instr* i, bool need_zx = (tunflags & MOVTUNNEL_MOVZX) != 0; Value* new_value = nullptr; - if (xorop == OPCODE_IS_FALSE) { - new_value = builder->IsTrue(xordef->src1.value); - - } else if (xorop == OPCODE_IS_TRUE) { - new_value = builder->IsFalse(xordef->src1.value); - } else if (xorop == OPCODE_COMPARE_EQ) { + if (xorop == OPCODE_COMPARE_EQ) { new_value = builder->CompareNE(xordef->src1.value, xordef->src2.value); } else if (xorop == OPCODE_COMPARE_NE) { @@ -294,7 +289,7 @@ bool SimplificationPass::CheckXor(hir::Instr* i, hir::HIRBuilder* builder) { return false; } bool SimplificationPass::Is1BitOpcode(hir::Opcode def_opcode) { - return def_opcode >= OPCODE_IS_TRUE && def_opcode <= OPCODE_DID_SATURATE; + return def_opcode >= OPCODE_COMPARE_EQ && def_opcode <= OPCODE_DID_SATURATE; } inline static uint64_t RotateOfSize(ScalarNZM nzm, unsigned rotation, @@ -804,24 +799,12 @@ bool SimplificationPass::CheckScalarConstCmp(hir::Instr* i, if (!var_definition) { return false; } - // x == 0 -> !x - if (cmpop == OPCODE_COMPARE_EQ && constant_unpacked == 0) { - i->Replace(&OPCODE_IS_FALSE_info, 0); - i->set_src1(variable); - return true; - } - // x != 0 -> !!x - if (cmpop == OPCODE_COMPARE_NE && constant_unpacked == 0) { - i->Replace(&OPCODE_IS_TRUE_info, 0); - i->set_src1(variable); - return true; - } if (cmpop == OPCODE_COMPARE_ULE && constant_unpacked == 0) { // less than or equal to zero = (== 0) = IS_FALSE - i->Replace(&OPCODE_IS_FALSE_info, 0); - i->set_src1(variable); + i->opcode = &OPCODE_COMPARE_EQ_info; + return true; } // todo: OPCODE_COMPARE_NE too? 
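The rewrites in this pass lean on simple unsigned identities now that
OPCODE_IS_TRUE/OPCODE_IS_FALSE are gone; a standalone spot-check of the
equivalences used above and below (uint32 case):

#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t v : {0u, 1u, 2u, 0xFFFFFFFFu}) {
    assert((v <= 0u) == (v == 0u));  // COMPARE_ULE 0 -> COMPARE_EQ 0
    assert((v < 1u) == (v == 0u));   // COMPARE_ULT 1 -> COMPARE_EQ 0
    assert((v > 0u) == (v != 0u));   // COMPARE_UGT 0 -> COMPARE_NE 0
  }
  return 0;
}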
@@ -840,15 +823,20 @@ bool SimplificationPass::CheckScalarConstCmp(hir::Instr* i, } if (cmpop == OPCODE_COMPARE_ULT && constant_unpacked == 1) { // unsigned lt 1 means == 0 - i->Replace(&OPCODE_IS_FALSE_info, 0); - i->set_src1(variable); + // i->Replace(&OPCODE_IS_FALSE_info, 0); + + i->opcode = &OPCODE_COMPARE_EQ_info; + + // i->set_src1(variable); + i->set_src2(builder->LoadZero(variable->type)); return true; } if (cmpop == OPCODE_COMPARE_UGT && constant_unpacked == 0) { // unsigned gt 0 means != 0 - i->Replace(&OPCODE_IS_TRUE_info, 0); - i->set_src1(variable); + // i->Replace(&OPCODE_IS_TRUE_info, 0); + // i->set_src1(variable); + i->opcode = &OPCODE_COMPARE_NE_info; return true; } @@ -870,8 +858,11 @@ bool SimplificationPass::CheckScalarConstCmp(hir::Instr* i, } else if (cmpop == OPCODE_COMPARE_SGT && signbit_definitely_0 && constant_unpacked == 0) { // signbit can't be set, and checking if gt 0, so actually checking != 0 - i->Replace(&OPCODE_IS_TRUE_info, 0); - i->set_src1(variable); + // i->Replace(&OPCODE_IS_TRUE_info, 0); + + // i->set_src1(variable); + i->opcode = &OPCODE_COMPARE_NE_info; + return true; } @@ -885,9 +876,9 @@ bool SimplificationPass::CheckScalarConstCmp(hir::Instr* i, Value* constant_replacement = nullptr; if (cmpop == OPCODE_COMPARE_EQ || cmpop == OPCODE_COMPARE_UGE) { - repl = &OPCODE_IS_TRUE_info; + repl = &OPCODE_COMPARE_NE_info; } else if (cmpop == OPCODE_COMPARE_NE || cmpop == OPCODE_COMPARE_ULT) { - repl = &OPCODE_IS_FALSE_info; + repl = &OPCODE_COMPARE_EQ_info; } else if (cmpop == OPCODE_COMPARE_UGT) { // impossible, cannot be greater than mask @@ -906,6 +897,7 @@ bool SimplificationPass::CheckScalarConstCmp(hir::Instr* i, if (repl) { i->Replace(repl, 0); i->set_src1(variable); + i->set_src2(builder->LoadZero(variable->type)); return true; } if (constant_replacement) { @@ -919,10 +911,16 @@ bool SimplificationPass::CheckScalarConstCmp(hir::Instr* i, } bool SimplificationPass::CheckIsTrueIsFalse(hir::Instr* i, hir::HIRBuilder* builder) { - bool istrue = i->opcode == &OPCODE_IS_TRUE_info; - bool isfalse = i->opcode == &OPCODE_IS_FALSE_info; + bool istrue = i->opcode == &OPCODE_COMPARE_NE_info; + bool isfalse = i->opcode == &OPCODE_COMPARE_EQ_info; - Value* input = i->src1.value; + auto [input_constant, input] = i->BinaryValueArrangeAsConstAndVar(); + + if (!input_constant || input_constant->AsUint64() != 0) { + return false; + } + + // Value* input = i->src1.value; TypeName input_type = input->type; if (!IsScalarIntegralType(input_type)) { return false; } @@ -1012,8 +1010,10 @@ bool SimplificationPass::CheckSHRByConst(hir::Instr* i, i->set_src1(isfalsetest); } else { - i->Replace(&OPCODE_IS_FALSE_info, 0); + // i->Replace(&OPCODE_IS_FALSE_info, 0); + i->Replace(&OPCODE_COMPARE_EQ_info, 0); i->set_src1(lz_input); + i->set_src2(builder->LoadZero(lz_input->type)); } return true; } @@ -1067,7 +1067,7 @@ bool SimplificationPass::SimplifyBitArith(hir::HIRBuilder* builder) { while (i) { // vector types use the same opcodes as scalar ones for AND/OR/XOR!
we // don't handle these in our simplifications, so skip - if (i->dest && IsScalarIntegralType(i->dest->type)) { + if (i->AllScalarIntegral()) { Opcode iop = i->opcode->num; if (iop == OPCODE_OR) { @@ -1080,7 +1080,6 @@ bool SimplificationPass::SimplifyBitArith(hir::HIRBuilder* builder) { result |= CheckAdd(i, builder); } else if (IsScalarBasicCmp(iop)) { result |= CheckScalarConstCmp(i, builder); - } else if (iop == OPCODE_IS_FALSE || iop == OPCODE_IS_TRUE) { result |= CheckIsTrueIsFalse(i, builder); } else if (iop == OPCODE_SHR) { result |= CheckSHR(i, builder); diff --git a/src/xenia/cpu/hir/hir_builder.cc b/src/xenia/cpu/hir/hir_builder.cc index fda6812b4..82e3021f9 100644 --- a/src/xenia/cpu/hir/hir_builder.cc +++ b/src/xenia/cpu/hir/hir_builder.cc @@ -1023,7 +1023,6 @@ Value* HIRBuilder::Truncate(Value* value, TypeName target_type) { Value* HIRBuilder::Convert(Value* value, TypeName target_type, RoundMode round_mode) { - Instr* i = AppendInstr(OPCODE_CONVERT_info, round_mode, AllocValue(target_type)); i->set_src1(value); @@ -1034,7 +1033,6 @@ Value* HIRBuilder::Convert(Value* value, TypeName target_type, Value* HIRBuilder::Round(Value* value, RoundMode round_mode) { ASSERT_FLOAT_OR_VECTOR_TYPE(value); - Instr* i = AppendInstr(OPCODE_ROUND_info, round_mode, AllocValue(value->type)); i->set_src1(value); @@ -1248,7 +1246,34 @@ void HIRBuilder::Store(Value* address, Value* value, uint32_t store_flags) { i->set_src2(value); i->src3.value = NULL; } - +Value* HIRBuilder::LoadVectorLeft(Value* address) { + ASSERT_ADDRESS_TYPE(address); + Instr* i = AppendInstr(OPCODE_LVL_info, 0, AllocValue(VEC128_TYPE)); + i->set_src1(address); + i->src2.value = i->src3.value = NULL; + return i->dest; +} +Value* HIRBuilder::LoadVectorRight(Value* address) { + ASSERT_ADDRESS_TYPE(address); + Instr* i = AppendInstr(OPCODE_LVR_info, 0, AllocValue(VEC128_TYPE)); + i->set_src1(address); + i->src2.value = i->src3.value = NULL; + return i->dest; +} +void HIRBuilder::StoreVectorLeft(Value* address, Value* value) { + ASSERT_ADDRESS_TYPE(address); + Instr* i = AppendInstr(OPCODE_STVL_info, 0); + i->set_src1(address); + i->set_src2(value); + i->src3.value = NULL; +} +void HIRBuilder::StoreVectorRight(Value* address, Value* value) { + ASSERT_ADDRESS_TYPE(address); + Instr* i = AppendInstr(OPCODE_STVR_info, 0); + i->set_src1(address); + i->set_src2(value); + i->src3.value = NULL; +} void HIRBuilder::Memset(Value* address, Value* value, Value* length) { ASSERT_ADDRESS_TYPE(address); ASSERT_TYPES_EQUAL(address, length); @@ -1283,7 +1308,7 @@ void HIRBuilder::SetNJM(Value* value) { Value* HIRBuilder::Max(Value* value1, Value* value2) { ASSERT_TYPES_EQUAL(value1, value2); - if (IsScalarIntegralType( value1->type) && value1->IsConstant() && + if (IsScalarIntegralType(value1->type) && value1->IsConstant() && value2->IsConstant()) { return value1->Compare(OPCODE_COMPARE_SLT, value2) ? 
value2 : value1; } @@ -1351,27 +1376,51 @@ Value* HIRBuilder::Select(Value* cond, Value* value1, Value* value2) { i->set_src3(value2); return i->dest; } +static Value* OrLanes32(HIRBuilder& f, Value* value) { + hir::Value* v1 = f.Extract(value, (uint8_t)0, INT32_TYPE); + hir::Value* v2 = f.Extract(value, (uint8_t)1, INT32_TYPE); + hir::Value* v3 = f.Extract(value, (uint8_t)2, INT32_TYPE); + hir::Value* ored = f.Or(v1, v2); + hir::Value* v4 = f.Extract(value, (uint8_t)3, INT32_TYPE); + ored = f.Or(ored, v3); + + ored = f.Or(ored, v4); + return ored; +} Value* HIRBuilder::IsTrue(Value* value) { + assert_true(value); + if (value->type == VEC128_TYPE) { + // chrispy: this probably doesn't happen often enough to be worth its own + // opcode or special code path but this could be optimized to not require as + // many extracts, we can shuffle and or v128 and then extract the low + + return CompareNE(OrLanes32(*this, value), LoadZeroInt32()); + } + if (value->IsConstant()) { return LoadConstantInt8(value->IsConstantTrue() ? 1 : 0); } - Instr* i = AppendInstr(OPCODE_IS_TRUE_info, 0, AllocValue(INT8_TYPE)); - i->set_src1(value); - i->src2.value = i->src3.value = NULL; - return i->dest; + return CompareNE(value, LoadZero(value->type)); } Value* HIRBuilder::IsFalse(Value* value) { + assert_true(value); + + if (value->type == VEC128_TYPE) { + // chrispy: this probably doesn't happen often enough to be worth its own + // opcode or special code path but this could be optimized to not require as + // many extracts, we can shuffle and or v128 and then extract the low + + return CompareEQ(OrLanes32(*this, value), LoadZeroInt32()); + } + if (value->IsConstant()) { return LoadConstantInt8(value->IsConstantFalse() ? 1 : 0); } - Instr* i = AppendInstr(OPCODE_IS_FALSE_info, 0, AllocValue(INT8_TYPE)); - i->set_src1(value); - i->src2.value = i->src3.value = NULL; - return i->dest; + return CompareEQ(value, LoadZero(value->type)); } Value* HIRBuilder::IsNan(Value* value) { diff --git a/src/xenia/cpu/hir/hir_builder.h b/src/xenia/cpu/hir/hir_builder.h index 62164e52d..4a200e0e5 100644 --- a/src/xenia/cpu/hir/hir_builder.h +++ b/src/xenia/cpu/hir/hir_builder.h @@ -166,6 +166,11 @@ class HIRBuilder { uint32_t store_flags = 0); Value* Load(Value* address, TypeName type, uint32_t load_flags = 0); + + Value* LoadVectorLeft(Value* address); + Value* LoadVectorRight(Value* address); + void StoreVectorLeft(Value* address, Value* value); + void StoreVectorRight(Value* address, Value* value); void Store(Value* address, Value* value, uint32_t store_flags = 0); void Memset(Value* address, Value* value, Value* length); void CacheControl(Value* address, size_t cache_line_size, @@ -268,6 +273,7 @@ class HIRBuilder { Value* new_value); Value* AtomicAdd(Value* address, Value* value); Value* AtomicSub(Value* address, Value* value); + void SetNJM(Value* value); protected: diff --git a/src/xenia/cpu/hir/instr.cc b/src/xenia/cpu/hir/instr.cc index 149103d43..0e4a7c2fb 100644 --- a/src/xenia/cpu/hir/instr.cc +++ b/src/xenia/cpu/hir/instr.cc @@ -213,7 +213,19 @@ uint32_t Instr::GuestAddressFor() const { return 0; // eek.
} +bool Instr::AllScalarIntegral() { + bool result = true; + if (dest) { + if (!IsScalarIntegralType(dest->type)) { + return false; + } + } + VisitValueOperands([&result](Value* v, uint32_t idx) { + result = result && IsScalarIntegralType(v->type); + }); + return result; +} } // namespace hir } // namespace cpu } // namespace xe diff --git a/src/xenia/cpu/hir/instr.h b/src/xenia/cpu/hir/instr.h index 38afef241..a50219ceb 100644 --- a/src/xenia/cpu/hir/instr.h +++ b/src/xenia/cpu/hir/instr.h @@ -171,6 +171,8 @@ if both are constant, return nullptr, nullptr const hir::Instr* GetNonFakePrev() const; uint32_t GuestAddressFor() const; + + bool AllScalarIntegral(); // dest and all srcs are scalar integral }; } // namespace hir diff --git a/src/xenia/cpu/hir/opcodes.h b/src/xenia/cpu/hir/opcodes.h index a51fda6d5..a59b1c72e 100644 --- a/src/xenia/cpu/hir/opcodes.h +++ b/src/xenia/cpu/hir/opcodes.h @@ -210,10 +210,10 @@ enum Opcode { OPCODE_STORE, // chrispy: todo: implement, our current codegen for the unaligned loads is // very bad - OPCODE_LVLX, - OPCODE_LVRX, - OPCODE_STVLX, - OPCODE_STVRX, + OPCODE_LVL, + OPCODE_LVR, + OPCODE_STVL, + OPCODE_STVR, OPCODE_MEMSET, OPCODE_CACHE_CONTROL, OPCODE_MEMORY_BARRIER, @@ -222,8 +222,6 @@ enum Opcode { OPCODE_MIN, OPCODE_VECTOR_MIN, OPCODE_SELECT, - OPCODE_IS_TRUE, - OPCODE_IS_FALSE, OPCODE_IS_NAN, OPCODE_COMPARE_EQ, OPCODE_COMPARE_NE, diff --git a/src/xenia/cpu/hir/opcodes.inl b/src/xenia/cpu/hir/opcodes.inl index 46a328903..783e9a439 100644 --- a/src/xenia/cpu/hir/opcodes.inl +++ b/src/xenia/cpu/hir/opcodes.inl @@ -303,17 +303,6 @@ DEFINE_OPCODE( OPCODE_SIG_V_V_V_V, 0) -DEFINE_OPCODE( - OPCODE_IS_TRUE, - "is_true", - OPCODE_SIG_V_V, - 0) - -DEFINE_OPCODE( - OPCODE_IS_FALSE, - "is_false", - OPCODE_SIG_V_V, - 0) DEFINE_OPCODE( OPCODE_IS_NAN, @@ -706,4 +695,27 @@ DEFINE_OPCODE( OPCODE_SIG_X_V, 0 ) +DEFINE_OPCODE( + OPCODE_LVL, + "loadv_left", + OPCODE_SIG_V_V, + OPCODE_FLAG_MEMORY +) +DEFINE_OPCODE( + OPCODE_LVR, + "loadv_right", + OPCODE_SIG_V_V, + OPCODE_FLAG_MEMORY +) +DEFINE_OPCODE( + OPCODE_STVL, + "storev_left", + OPCODE_SIG_X_V_V, + OPCODE_FLAG_MEMORY) + +DEFINE_OPCODE( + OPCODE_STVR, + "storev_right", + OPCODE_SIG_X_V_V, + OPCODE_FLAG_MEMORY) diff --git a/src/xenia/cpu/ppc/ppc_hir_builder.cc b/src/xenia/cpu/ppc/ppc_hir_builder.cc index 4aedfdd26..263d3675a 100644 --- a/src/xenia/cpu/ppc/ppc_hir_builder.cc +++ b/src/xenia/cpu/ppc/ppc_hir_builder.cc @@ -418,6 +418,10 @@ void PPCHIRBuilder::UpdateCR6(Value* src_value) { // Testing for all 1's and all 0's. // if (Rc) CR6 = all_equal | 0 | none_equal | 0 // TODO(benvanik): efficient instruction? 
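The CR6 update below models the PowerPC record form of vector compares (vcmp*.): CR6 reports whether the 128-bit compare mask is all ones or all zeros. A scalar reference of that semantics, as a sketch (illustrative only, not code from the patch):

```cpp
#include <cstdint>

// PPC vcmp*. record form: CR6 reflects the compare mask as a whole.
// all_equal  <=> every byte of the 128-bit result is 0xFF
// none_equal <=> every byte of the 128-bit result is 0x00
struct CR6Result {
  bool all_equal;
  bool none_equal;
};
static CR6Result UpdateCR6Reference(const uint8_t mask[16]) {
  CR6Result r{true, true};
  for (int i = 0; i < 16; ++i) {
    r.all_equal = r.all_equal && mask[i] == 0xFF;
    r.none_equal = r.none_equal && mask[i] == 0x00;
  }
  return r;
}
```

On x64, both flags can come out of a single vptest of the mask, which is presumably what the "efficient instruction?" TODO is after.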
+ + // chrispy: nothing seems to write cr6_1, figure out if no documented + // instructions write anything other than 0 to it and remove these stores if + // so StoreContext(offsetof(PPCContext, cr6.cr6_1), LoadZeroInt8()); StoreContext(offsetof(PPCContext, cr6.cr6_3), LoadZeroInt8()); StoreContext(offsetof(PPCContext, cr6.cr6_all_equal), diff --git a/src/xenia/gpu/d3d12/d3d12_command_processor.cc b/src/xenia/gpu/d3d12/d3d12_command_processor.cc index db8c874f2..a670cc9d6 100644 --- a/src/xenia/gpu/d3d12/d3d12_command_processor.cc +++ b/src/xenia/gpu/d3d12/d3d12_command_processor.cc @@ -7,18 +7,17 @@ ****************************************************************************** */ +#include "xenia/gpu/d3d12/d3d12_command_processor.h" #include #include #include #include - #include "xenia/base/assert.h" #include "xenia/base/byte_order.h" #include "xenia/base/cvar.h" #include "xenia/base/logging.h" #include "xenia/base/math.h" #include "xenia/base/profiling.h" -#include "xenia/gpu/d3d12/d3d12_command_processor.h" #include "xenia/gpu/d3d12/d3d12_graphics_system.h" #include "xenia/gpu/d3d12/d3d12_shader.h" #include "xenia/gpu/draw_util.h" @@ -843,6 +842,7 @@ bool D3D12CommandProcessor::SetupContext() { bool draw_resolution_scale_not_clamped = TextureCache::GetConfigDrawResolutionScale(draw_resolution_scale_x, draw_resolution_scale_y); + if (!D3D12TextureCache::ClampDrawResolutionScaleToMaxSupported( draw_resolution_scale_x, draw_resolution_scale_y, provider)) { draw_resolution_scale_not_clamped = false; @@ -1676,37 +1676,52 @@ void D3D12CommandProcessor::ShutdownContext() { CommandProcessor::ShutdownContext(); } - +// todo: bit-pack the bools and use bitarith to reduce branches void D3D12CommandProcessor::WriteRegister(uint32_t index, uint32_t value) { CommandProcessor::WriteRegister(index, value); - if (index >= XE_GPU_REG_SHADER_CONSTANT_000_X && - index <= XE_GPU_REG_SHADER_CONSTANT_511_W) { - if (frame_open_) { - uint32_t float_constant_index = - (index - XE_GPU_REG_SHADER_CONSTANT_000_X) >> 2; - if (float_constant_index >= 256) { - float_constant_index -= 256; - if (current_float_constant_map_pixel_[float_constant_index >> 6] & - (1ull << (float_constant_index & 63))) { - cbuffer_binding_float_pixel_.up_to_date = false; - } - } else { - if (current_float_constant_map_vertex_[float_constant_index >> 6] & - (1ull << (float_constant_index & 63))) { - cbuffer_binding_float_vertex_.up_to_date = false; + bool cbuf_binding_float_pixel_utd = cbuffer_binding_float_pixel_.up_to_date; + bool cbuf_binding_float_vertex_utd = cbuffer_binding_float_vertex_.up_to_date; + bool cbuf_binding_bool_loop_utd = cbuffer_binding_bool_loop_.up_to_date; + + if (index >= XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0 && + index <= XE_GPU_REG_SHADER_CONSTANT_FETCH_31_5) { + cbuffer_binding_fetch_.up_to_date = false; + // texture cache is never nullptr + // if (texture_cache_ != nullptr) { + texture_cache_->TextureFetchConstantWritten( + (index - XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0) / 6); + // } + } else { + if (!(cbuf_binding_float_pixel_utd | cbuf_binding_float_vertex_utd | + cbuf_binding_bool_loop_utd)) { + return; + } + + if (index >= XE_GPU_REG_SHADER_CONSTANT_000_X && + index <= XE_GPU_REG_SHADER_CONSTANT_511_W) { + if (!(cbuf_binding_float_pixel_utd | cbuf_binding_float_vertex_utd)) { + return; + } + if (frame_open_) { + uint32_t float_constant_index = + (index - XE_GPU_REG_SHADER_CONSTANT_000_X) >> 2; + if (float_constant_index >= 256) { + float_constant_index -= 256; + if 
(current_float_constant_map_pixel_[float_constant_index >> 6] & + (1ull << (float_constant_index & 63))) { + cbuffer_binding_float_pixel_.up_to_date = false; + } + } else { + if (current_float_constant_map_vertex_[float_constant_index >> 6] & + (1ull << (float_constant_index & 63))) { + cbuffer_binding_float_vertex_.up_to_date = false; + } } } - } - } else if (index >= XE_GPU_REG_SHADER_CONSTANT_BOOL_000_031 && - index <= XE_GPU_REG_SHADER_CONSTANT_LOOP_31) { - cbuffer_binding_bool_loop_.up_to_date = false; - } else if (index >= XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0 && - index <= XE_GPU_REG_SHADER_CONSTANT_FETCH_31_5) { - cbuffer_binding_fetch_.up_to_date = false; - if (texture_cache_ != nullptr) { - texture_cache_->TextureFetchConstantWritten( - (index - XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0) / 6); + } else if (index >= XE_GPU_REG_SHADER_CONSTANT_BOOL_000_031 && + index <= XE_GPU_REG_SHADER_CONSTANT_LOOP_31) { + cbuffer_binding_bool_loop_.up_to_date = false; } } } @@ -2301,14 +2316,26 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type, uint32_t draw_resolution_scale_x = texture_cache_->draw_resolution_scale_x(); uint32_t draw_resolution_scale_y = texture_cache_->draw_resolution_scale_y(); draw_util::ViewportInfo viewport_info; - draw_util::GetHostViewportInfo( - regs, draw_resolution_scale_x, draw_resolution_scale_y, true, + draw_util::GetViewportInfoArgs gviargs{}; + + gviargs.Setup( + draw_resolution_scale_x, draw_resolution_scale_y, + texture_cache_->draw_resolution_scale_x_divisor(), + texture_cache_->draw_resolution_scale_y_divisor(), true, D3D12_VIEWPORT_BOUNDS_MAX, D3D12_VIEWPORT_BOUNDS_MAX, false, normalized_depth_control, host_render_targets_used && render_target_cache_->depth_float24_convert_in_pixel_shader(), - host_render_targets_used, pixel_shader && pixel_shader->writes_depth(), - viewport_info); + host_render_targets_used, pixel_shader && pixel_shader->writes_depth()); + gviargs.SetupRegisterValues(regs); + + if (gviargs == previous_viewport_info_args_) { + viewport_info = previous_viewport_info_; + } else { + draw_util::GetHostViewportInfo(&gviargs, viewport_info); + previous_viewport_info_args_ = gviargs; + previous_viewport_info_ = viewport_info; + } draw_util::Scissor scissor; draw_util::GetScissor(regs, scissor); scissor.offset[0] *= draw_resolution_scale_x; @@ -2711,6 +2738,24 @@ void D3D12CommandProcessor::InitializeTrace() { shared_memory_->InitializeTraceCompleteDownloads(); } } +static void DmaPrefunc(dma::XeDMAJob* job) { + D3D12_RANGE readback_range; + readback_range.Begin = 0; + readback_range.End = job->size; + void* readback_mapping; + ID3D12Resource* readback_buffer = (ID3D12Resource*)job->userdata1; + + HRESULT mapres = readback_buffer->Map(0, &readback_range, &readback_mapping); + xenia_assert(SUCCEEDED(mapres)); + + job->source = (uint8_t*)readback_mapping; +} + +static void DmaPostfunc(dma::XeDMAJob* job) { + D3D12_RANGE readback_write_range = {}; + ID3D12Resource* readback_buffer = (ID3D12Resource*)job->userdata1; + readback_buffer->Unmap(0, &readback_write_range); +} bool D3D12CommandProcessor::IssueCopy() { #if XE_UI_D3D12_FINE_GRAINED_DRAW_SCOPES @@ -2736,17 +2781,35 @@ bool D3D12CommandProcessor::IssueCopy() { readback_buffer, 0, shared_memory_buffer, written_address, written_length); if (AwaitAllQueueOperationsCompletion()) { +#if 1 D3D12_RANGE readback_range; readback_range.Begin = 0; readback_range.End = written_length; void* readback_mapping; if (SUCCEEDED( readback_buffer->Map(0, &readback_range, &readback_mapping))) { 
- std::memcpy(memory_->TranslatePhysical(written_address), - readback_mapping, written_length); + // chrispy: this memcpy needs to be optimized as much as possible + + auto physaddr = memory_->TranslatePhysical(written_address); + dma::vastcpy(physaddr, (uint8_t*)readback_mapping, + written_length); + // XEDmaCpy(physaddr, readback_mapping, written_length); D3D12_RANGE readback_write_range = {}; readback_buffer->Unmap(0, &readback_write_range); } + +#else + dma::XeDMAJob job{}; + job.destination = memory_->TranslatePhysical(written_address); + job.size = written_length; + job.source = nullptr; + job.userdata1 = (void*)readback_buffer; + job.precall = DmaPrefunc; + job.postcall = DmaPostfunc; + + readback_available_ = GetDMAC()->PushDMAJob(&job); + +#endif } } } @@ -3885,9 +3948,10 @@ bool D3D12CommandProcessor::UpdateBindings( if (bool_loop_constants == nullptr) { return false; } - std::memcpy(bool_loop_constants, - &regs[XE_GPU_REG_SHADER_CONSTANT_BOOL_000_031].u32, - kBoolLoopConstantsSize); + xe::smallcpy_const<kBoolLoopConstantsSize>( + bool_loop_constants, + &regs[XE_GPU_REG_SHADER_CONSTANT_BOOL_000_031].u32); + cbuffer_binding_bool_loop_.up_to_date = true; current_graphics_root_up_to_date_ &= ~(1u << root_parameter_bool_loop_constants); @@ -3901,9 +3965,9 @@ bool D3D12CommandProcessor::UpdateBindings( if (fetch_constants == nullptr) { return false; } - std::memcpy(fetch_constants, - &regs[XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0].u32, - kFetchConstantsSize); + xe::smallcpy_const<kFetchConstantsSize>( + fetch_constants, &regs[XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0].u32); + cbuffer_binding_fetch_.up_to_date = true; current_graphics_root_up_to_date_ &= ~(1u << root_parameter_fetch_constants); @@ -4542,6 +4606,12 @@ ID3D12Resource* D3D12CommandProcessor::RequestReadbackBuffer(uint32_t size) { if (size == 0) { return nullptr; } + #if 0 + if (readback_available_) { + GetDMAC()->WaitJobDone(readback_available_); + readback_available_ = 0; + } + #endif size = xe::align(size, kReadbackBufferSizeIncrement); if (size > readback_buffer_size_) { const ui::d3d12::D3D12Provider& provider = GetD3D12Provider(); @@ -4561,6 +4631,7 @@ ID3D12Resource* D3D12CommandProcessor::RequestReadbackBuffer(uint32_t size) { readback_buffer_->Release(); } readback_buffer_ = buffer; + readback_buffer_size_ = size; } return readback_buffer_; } diff --git a/src/xenia/gpu/d3d12/d3d12_command_processor.h b/src/xenia/gpu/d3d12/d3d12_command_processor.h index 2b43233b9..e64447a4e 100644 --- a/src/xenia/gpu/d3d12/d3d12_command_processor.h +++ b/src/xenia/gpu/d3d12/d3d12_command_processor.h @@ -1,3 +1,4 @@ +/** /** ****************************************************************************** * Xenia : Xbox 360 Emulator Research Project * @@ -19,6 +20,7 @@ #include #include "xenia/base/assert.h" +#include "xenia/base/dma.h" #include "xenia/gpu/command_processor.h" #include "xenia/gpu/d3d12/d3d12_graphics_system.h" #include "xenia/gpu/d3d12/d3d12_primitive_processor.h" @@ -581,6 +583,7 @@ class D3D12CommandProcessor final : public CommandProcessor { static constexpr uint32_t kReadbackBufferSizeIncrement = 16 * 1024 * 1024; ID3D12Resource* readback_buffer_ = nullptr; + dma::DMACJobHandle readback_available_ = 0; uint32_t readback_buffer_size_ = 0; std::atomic<bool> pix_capture_requested_ = false; @@ -614,9 +617,11 @@ class D3D12CommandProcessor final : public CommandProcessor { DxbcShaderTranslator::SystemConstants system_constants_;
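The xe::smallcpy_const<N> helper that replaces the two memcpys above is not shown in this patch; presumably it is a compile-time-sized copy, so the compiler can emit straight-line moves instead of a generic memcpy dispatch. A sketch of the presumed shape (an assumption, not xenia's actual implementation):

```cpp
#include <cstddef>
#include <cstring>

// Presumed shape of xe::smallcpy_const: N is a compile-time constant
// (kBoolLoopConstantsSize / kFetchConstantsSize at the call sites above),
// so the memcpy lowers to fully unrolled vector moves with no runtime
// size dispatch.
template <size_t N>
void smallcpy_const_sketch(void* dst, const void* src) {
  std::memcpy(dst, src, N);
}
```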
// Float constant usage masks of the last draw call. - uint64_t current_float_constant_map_vertex_[4]; - uint64_t current_float_constant_map_pixel_[4]; - + // chrispy: make sure accesses to these can't cross cacheline boundaries + struct alignas(XE_HOST_CACHE_LINE_SIZE) { + uint64_t current_float_constant_map_vertex_[4]; + uint64_t current_float_constant_map_pixel_[4]; + }; // Constant buffer bindings. struct ConstantBufferBinding { D3D12_GPU_VIRTUAL_ADDRESS address; @@ -670,6 +675,9 @@ class D3D12CommandProcessor final : public CommandProcessor { // Current primitive topology. D3D_PRIMITIVE_TOPOLOGY primitive_topology_; + + draw_util::GetViewportInfoArgs previous_viewport_info_args_; + draw_util::ViewportInfo previous_viewport_info_; }; } // namespace d3d12 diff --git a/src/xenia/gpu/draw_util.cc b/src/xenia/gpu/draw_util.cc index 10017fff1..02d1f6750 100644 --- a/src/xenia/gpu/draw_util.cc +++ b/src/xenia/gpu/draw_util.cc @@ -167,17 +167,17 @@ bool IsPixelShaderNeededWithRasterization(const Shader& shader, return false; } -void GetHostViewportInfo(const RegisterFile& regs, - uint32_t draw_resolution_scale_x, - uint32_t draw_resolution_scale_y, - bool origin_bottom_left, uint32_t x_max, - uint32_t y_max, bool allow_reverse_z, - reg::RB_DEPTHCONTROL normalized_depth_control, - bool convert_z_to_float24, bool full_float24_in_0_to_1, - bool pixel_shader_writes_depth, +static float ViewportRecip2_0(float f) { + float f1 = ArchReciprocalRefined(f); + return f1 + f1; +} + +// chrispy: todo, the int/float divides and the nan-checked mins show up +// relatively high on uprof when I underclock to 1.7 GHz +void GetHostViewportInfo(GetViewportInfoArgs* XE_RESTRICT args, ViewportInfo& viewport_info_out) { - assert_not_zero(draw_resolution_scale_x); - assert_not_zero(draw_resolution_scale_y); + assert_not_zero(args->draw_resolution_scale_x); + assert_not_zero(args->draw_resolution_scale_y); // A vertex position goes the following path: // @@ -304,38 +304,32 @@ void GetHostViewportInfo(const RegisterFile& regs, // TODO(Triang3l): Investigate the need for clamping of oDepth to 0...1 for // D24FS8 as well. - auto pa_cl_clip_cntl = regs.Get<reg::PA_CL_CLIP_CNTL>(); - auto pa_cl_vte_cntl = regs.Get<reg::PA_CL_VTE_CNTL>(); - auto pa_su_sc_mode_cntl = regs.Get<reg::PA_SU_SC_MODE_CNTL>(); - auto pa_su_vtx_cntl = regs.Get<reg::PA_SU_VTX_CNTL>(); + auto pa_cl_clip_cntl = args->pa_cl_clip_cntl; + auto pa_cl_vte_cntl = args->pa_cl_vte_cntl; + auto pa_su_sc_mode_cntl = args->pa_su_sc_mode_cntl; + auto pa_su_vtx_cntl = args->pa_su_vtx_cntl; // Obtain the original viewport values in a normalized way. float scale_xy[] = { - pa_cl_vte_cntl.vport_x_scale_ena ? regs[XE_GPU_REG_PA_CL_VPORT_XSCALE].f32 - : 1.0f, - pa_cl_vte_cntl.vport_y_scale_ena ? regs[XE_GPU_REG_PA_CL_VPORT_YSCALE].f32 - : 1.0f, + + pa_cl_vte_cntl.vport_x_scale_ena ? args->PA_CL_VPORT_XSCALE : 1.0f, + pa_cl_vte_cntl.vport_y_scale_ena ? args->PA_CL_VPORT_YSCALE : 1.0f, }; - float scale_z = pa_cl_vte_cntl.vport_z_scale_ena - ? regs[XE_GPU_REG_PA_CL_VPORT_ZSCALE].f32 - : 1.0f; + float scale_z = + pa_cl_vte_cntl.vport_z_scale_ena ? args->PA_CL_VPORT_ZSCALE : 1.0f; + float offset_base_xy[] = { - pa_cl_vte_cntl.vport_x_offset_ena - ? regs[XE_GPU_REG_PA_CL_VPORT_XOFFSET].f32 - : 0.0f, - pa_cl_vte_cntl.vport_y_offset_ena - ? regs[XE_GPU_REG_PA_CL_VPORT_YOFFSET].f32 - : 0.0f, + pa_cl_vte_cntl.vport_x_offset_ena ? args->PA_CL_VPORT_XOFFSET : 0.0f, + pa_cl_vte_cntl.vport_y_offset_ena ? args->PA_CL_VPORT_YOFFSET : 0.0f, }; - float offset_z = pa_cl_vte_cntl.vport_z_offset_ena ?
args->PA_CL_VPORT_ZOFFSET : 0.0f; // Calculate all the integer.0 or integer.5 offsetting exactly at full // precision, separately so it can be used in other integer calculations // without double rounding if needed. float offset_add_xy[2] = {}; if (pa_su_sc_mode_cntl.vtx_window_offset_enable) { - auto pa_sc_window_offset = regs.Get<reg::PA_SC_WINDOW_OFFSET>(); + auto pa_sc_window_offset = args->pa_sc_window_offset; offset_add_xy[0] += float(pa_sc_window_offset.window_x_offset); offset_add_xy[1] += float(pa_sc_window_offset.window_y_offset); } @@ -346,8 +340,11 @@ void GetHostViewportInfo(const RegisterFile& regs, // The maximum value is at least the maximum host render target size anyway - // and a guest pixel is always treated as a whole with resolution scaling. - uint32_t xy_max_unscaled[] = {x_max / draw_resolution_scale_x, - y_max / draw_resolution_scale_y}; + // chrispy: todo, these integer divides show up high on the profiler somehow + // (it was a very long session, too) + uint32_t xy_max_unscaled[] = { + args->draw_resolution_scale_x_divisor.Apply(args->x_max), + args->draw_resolution_scale_y_divisor.Apply(args->y_max)}; assert_not_zero(xy_max_unscaled[0]); assert_not_zero(xy_max_unscaled[1]); @@ -367,9 +364,11 @@ void GetHostViewportInfo(const RegisterFile& regs, std::min(xenos::kTexture2DCubeMaxWidthHeight, xy_max_unscaled[i]); viewport_info_out.xy_extent[i] = extent_axis_unscaled * - (i ? draw_resolution_scale_y : draw_resolution_scale_x); + (i ? args->draw_resolution_scale_y : args->draw_resolution_scale_x); float extent_axis_unscaled_float = float(extent_axis_unscaled); - float pixels_to_ndc_axis = 2.0f / extent_axis_unscaled_float; + + float pixels_to_ndc_axis = ViewportRecip2_0(extent_axis_unscaled_float); + ndc_scale[i] = scale_xy[i] * pixels_to_ndc_axis; ndc_offset[i] = (offset_base_xy[i] - extent_axis_unscaled_float * 0.5f + offset_add_xy[i]) * @@ -394,7 +393,7 @@ void GetHostViewportInfo(const RegisterFile& regs, // doing truncation for simplicity - since maxing with 0 is done anyway // (we only return viewports in the positive quarter-plane). uint32_t axis_resolution_scale = - i ? draw_resolution_scale_y : draw_resolution_scale_x; + i ? args->draw_resolution_scale_y : args->draw_resolution_scale_x; float offset_axis = offset_base_xy[i] + offset_add_xy[i]; float scale_axis = scale_xy[i]; float scale_axis_abs = std::abs(scale_xy[i]); @@ -422,11 +421,14 @@ void GetHostViewportInfo(const RegisterFile& regs, // space, a region previously outside -W...W should end up within it, so // the scale should be < 1. float axis_extent_rounded = float(axis_extent_int); - ndc_scale_axis = scale_axis * 2.0f / axis_extent_rounded; + float inv_axis_extent_rounded = + ArchReciprocalRefined(axis_extent_rounded); + + ndc_scale_axis = scale_axis * 2.0f * inv_axis_extent_rounded; // Move the origin of the snapped coordinates back to the original one. ndc_offset_axis = (float(offset_axis) - (float(axis_0_int) + axis_extent_rounded * 0.5f)) * - 2.0f / axis_extent_rounded; + 2.0f * inv_axis_extent_rounded; } else { // Empty viewport (everything outside the viewport scissor).
ndc_scale_axis = 1.0f; @@ -497,7 +499,7 @@ void GetHostViewportInfo(const RegisterFile& regs, ndc_scale[2] = 0.5f; ndc_offset[2] = 0.5f; } - if (pixel_shader_writes_depth) { + if (args->pixel_shader_writes_depth) { // Allow the pixel shader to write any depth value since // PA_SC_VPORT_ZMIN/ZMAX isn't present on the Adreno 200; guest pixel // shaders don't have access to the original Z in the viewport space @@ -515,7 +517,7 @@ void GetHostViewportInfo(const RegisterFile& regs, // Direct3D 12 doesn't allow reverse depth range - on some drivers it // works, on some drivers it doesn't, actually, but it was never // explicitly allowed by the specification. - if (!allow_reverse_z && z_min > z_max) { + if (!args->allow_reverse_z && z_min > z_max) { std::swap(z_min, z_max); ndc_scale[2] = -ndc_scale[2]; ndc_offset[2] = 1.0f - ndc_offset[2]; @@ -523,10 +525,9 @@ void GetHostViewportInfo(const RegisterFile& regs, } } - if (normalized_depth_control.z_enable && - regs.Get().depth_format == - xenos::DepthRenderTargetFormat::kD24FS8) { - if (convert_z_to_float24) { + if (args->normalized_depth_control.z_enable && + args->depth_format == xenos::DepthRenderTargetFormat::kD24FS8) { + if (args->convert_z_to_float24) { // Need to adjust the bounds that the resulting depth values will be // clamped to after the pixel shader. Preferring adding some error to // interpolated Z instead if conversion can't be done exactly, without @@ -537,7 +538,7 @@ void GetHostViewportInfo(const RegisterFile& regs, z_min = xenos::Float20e4To32(xenos::Float32To20e4(z_min, true)); z_max = xenos::Float20e4To32(xenos::Float32To20e4(z_max, true)); } - if (full_float24_in_0_to_1) { + if (args->full_float24_in_0_to_1) { // Remap the full [0...2) float24 range to [0...1) support data round-trip // during render target ownership transfer of EDRAM tiles through depth // input without unrestricted depth range. 
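The float24 bound adjustment just above is a round-trip through the guest depth encoding, using the same xenos.h helpers the hunk calls; condensed into a wrapper (the wrapper name is mine, the calls are verbatim from the patch):

```cpp
// Snap a host float32 depth bound to a value exactly representable in the
// guest's 20e4 float24 format, so post-shader clamping matches what the
// Xenos would have stored (the boolean argument mirrors the call sites
// above).
static float SnapDepthBoundToFloat24(float z) {
  return xenos::Float20e4To32(xenos::Float32To20e4(z, true));
}
```

Applying it to both z_min and z_max keeps the clamp bounds consistent with values that survive a depth-buffer store/load cycle.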
@@ -548,7 +549,7 @@ void GetHostViewportInfo(const RegisterFile& regs, viewport_info_out.z_min = z_min; viewport_info_out.z_max = z_max; - if (origin_bottom_left) { + if (args->origin_bottom_left) { ndc_scale[1] = -ndc_scale[1]; ndc_offset[1] = -ndc_offset[1]; } @@ -557,7 +558,6 @@ void GetHostViewportInfo(const RegisterFile& regs, viewport_info_out.ndc_offset[i] = ndc_offset[i]; } } - void GetScissor(const RegisterFile& regs, Scissor& scissor_out, bool clamp_to_surface_pitch) { auto pa_sc_window_scissor_tl = regs.Get<reg::PA_SC_WINDOW_SCISSOR_TL>(); @@ -868,7 +868,7 @@ bool GetResolveInfo(const RegisterFile& regs, const Memory& memory, xenos::kMaxResolveSize); y1 = y0 + int32_t(xenos::kMaxResolveSize); } - //fails in forza horizon 1 + // fails in forza horizon 1 assert_true(x0 < x1 && y0 < y1); if (x0 >= x1 || y0 >= y1) { XELOGE("Resolve region is empty"); diff --git a/src/xenia/gpu/draw_util.h b/src/xenia/gpu/draw_util.h index a365b5436..2f27f6e01 100644 --- a/src/xenia/gpu/draw_util.h +++ b/src/xenia/gpu/draw_util.h @@ -277,18 +277,151 @@ struct ViewportInfo { float ndc_scale[3]; float ndc_offset[3]; }; +static_assert(sizeof(xenos::DepthRenderTargetFormat) == sizeof(uint32_t), + "A size change in DepthRenderTargetFormat throws off the " + "GetViewportInfoArgs bitfield packing"); +struct GetViewportInfoArgs { + union alignas(64) { + struct { + // group 1 + uint32_t x_max; + uint32_t y_max; + union { + struct { + uint32_t origin_bottom_left : 1; + uint32_t allow_reverse_z : 1; + uint32_t convert_z_to_float24 : 1; + uint32_t full_float24_in_0_to_1 : 1; + uint32_t pixel_shader_writes_depth : 1; + xenos::DepthRenderTargetFormat depth_format : 1; + }; + uint32_t packed_portions; + }; + reg::RB_DEPTHCONTROL normalized_depth_control; + // group 2 + reg::PA_CL_CLIP_CNTL pa_cl_clip_cntl; + reg::PA_CL_VTE_CNTL pa_cl_vte_cntl; + reg::PA_SU_SC_MODE_CNTL pa_su_sc_mode_cntl; + reg::PA_SU_VTX_CNTL pa_su_vtx_cntl; + // group 3 + reg::PA_SC_WINDOW_OFFSET pa_sc_window_offset; + float PA_CL_VPORT_XSCALE; + float PA_CL_VPORT_YSCALE; + float PA_CL_VPORT_ZSCALE; + + float PA_CL_VPORT_XOFFSET; + float PA_CL_VPORT_YOFFSET; + float PA_CL_VPORT_ZOFFSET; + uint32_t padding_set_to_0; + }; +#if XE_ARCH_AMD64 == 1 + struct { + __m128i first4; // x_max, y_max, packed_portions, + // normalized_depth_control + __m128i second4; // pa_cl_clip_cntl, pa_cl_vte_cntl, pa_su_sc_mode_cntl, + // pa_su_vtx_cntl + __m128i third4; // pa_sc_window_offset, PA_CL_VPORT_XSCALE, + // PA_CL_VPORT_YSCALE, PA_CL_VPORT_ZSCALE + __m128i last4; // PA_CL_VPORT_XOFFSET, PA_CL_VPORT_YOFFSET, + // PA_CL_VPORT_ZOFFSET, padding_set_to_0 + }; +#endif + }; + + // everything that follows here does not need to be compared + uint32_t draw_resolution_scale_x; + uint32_t draw_resolution_scale_y; + divisors::MagicDiv draw_resolution_scale_x_divisor; + divisors::MagicDiv draw_resolution_scale_y_divisor; + void Setup(uint32_t _draw_resolution_scale_x, + uint32_t _draw_resolution_scale_y, + divisors::MagicDiv _draw_resolution_scale_x_divisor, + divisors::MagicDiv _draw_resolution_scale_y_divisor, + bool _origin_bottom_left, uint32_t _x_max, uint32_t _y_max, + bool _allow_reverse_z, + reg::RB_DEPTHCONTROL _normalized_depth_control, + bool _convert_z_to_float24, bool _full_float24_in_0_to_1, + bool _pixel_shader_writes_depth) { + packed_portions = 0; + padding_set_to_0 = 0; // important to zero this: the AMD64 operator== + // compares every bit of the union, padding included + draw_resolution_scale_x = _draw_resolution_scale_x; + draw_resolution_scale_y = _draw_resolution_scale_y; + draw_resolution_scale_x_divisor = _draw_resolution_scale_x_divisor; + draw_resolution_scale_y_divisor =
_draw_resolution_scale_y_divisor; + origin_bottom_left = _origin_bottom_left; + x_max = _x_max; + y_max = _y_max; + allow_reverse_z = _allow_reverse_z; + normalized_depth_control = _normalized_depth_control; + convert_z_to_float24 = _convert_z_to_float24; + full_float24_in_0_to_1 = _full_float24_in_0_to_1; + pixel_shader_writes_depth = _pixel_shader_writes_depth; + } + + void SetupRegisterValues(const RegisterFile& regs) { + pa_cl_clip_cntl = regs.Get<reg::PA_CL_CLIP_CNTL>(); + pa_cl_vte_cntl = regs.Get<reg::PA_CL_VTE_CNTL>(); + pa_su_sc_mode_cntl = regs.Get<reg::PA_SU_SC_MODE_CNTL>(); + pa_su_vtx_cntl = regs.Get<reg::PA_SU_VTX_CNTL>(); + PA_CL_VPORT_XSCALE = regs[XE_GPU_REG_PA_CL_VPORT_XSCALE].f32; + PA_CL_VPORT_YSCALE = regs[XE_GPU_REG_PA_CL_VPORT_YSCALE].f32; + PA_CL_VPORT_ZSCALE = regs[XE_GPU_REG_PA_CL_VPORT_ZSCALE].f32; + PA_CL_VPORT_XOFFSET = regs[XE_GPU_REG_PA_CL_VPORT_XOFFSET].f32; + PA_CL_VPORT_YOFFSET = regs[XE_GPU_REG_PA_CL_VPORT_YOFFSET].f32; + PA_CL_VPORT_ZOFFSET = regs[XE_GPU_REG_PA_CL_VPORT_ZOFFSET].f32; + pa_sc_window_offset = regs.Get<reg::PA_SC_WINDOW_OFFSET>(); + depth_format = regs.Get<reg::RB_DEPTH_INFO>().depth_format; + } + XE_FORCEINLINE + bool operator==(const GetViewportInfoArgs& prev) { +#if XE_ARCH_AMD64 == 0 + bool result = true; + + auto accum_eq = [&result](auto x, auto y) { result &= (x == y); }; + +#define EQC(field) accum_eq(field, prev.field) + EQC(x_max); + EQC(y_max); + EQC(packed_portions); + EQC(normalized_depth_control.value); + EQC(pa_cl_clip_cntl.value); + EQC(pa_cl_vte_cntl.value); + + EQC(pa_su_sc_mode_cntl.value); + EQC(pa_su_vtx_cntl.value); + EQC(PA_CL_VPORT_XSCALE); + EQC(PA_CL_VPORT_YSCALE); + EQC(PA_CL_VPORT_ZSCALE); + EQC(PA_CL_VPORT_XOFFSET); + EQC(PA_CL_VPORT_YOFFSET); + EQC(PA_CL_VPORT_ZOFFSET); + EQC(pa_sc_window_offset.value); + +#undef EQC + return result; +#else + __m128i mask1 = _mm_cmpeq_epi32(first4, prev.first4); + __m128i mask2 = _mm_cmpeq_epi32(second4, prev.second4); + + __m128i mask3 = _mm_cmpeq_epi32(third4, prev.third4); + __m128i unified1 = _mm_and_si128(mask1, mask2); + __m128i mask4 = _mm_cmpeq_epi32(last4, prev.last4); + + __m128i unified2 = _mm_and_si128(unified1, mask3); + + __m128i unified3 = _mm_and_si128(unified2, mask4); + + return _mm_movemask_epi8(unified3) == 0xFFFF; + +#endif + } +}; + // Converts the guest viewport (or fakes one if drawing without a viewport) to // a viewport, plus values to multiply-add the returned position by, usable on // host graphics APIs such as Direct3D 11+ and Vulkan, also forcing it to the // Direct3D clip space with 0...W Z rather than -W...W. -void GetHostViewportInfo(const RegisterFile& regs, - uint32_t draw_resolution_scale_x, - uint32_t draw_resolution_scale_y, - bool origin_bottom_left, uint32_t x_max, - uint32_t y_max, bool allow_reverse_z, - reg::RB_DEPTHCONTROL normalized_depth_control, - bool convert_z_to_float24, bool full_float24_in_0_to_1, - bool pixel_shader_writes_depth, +void GetHostViewportInfo(GetViewportInfoArgs* XE_RESTRICT args, ViewportInfo& viewport_info_out); struct Scissor { diff --git a/src/xenia/gpu/render_target_cache.cc b/src/xenia/gpu/render_target_cache.cc index 2695b22d9..54c8b3655 100644 --- a/src/xenia/gpu/render_target_cache.cc +++ b/src/xenia/gpu/render_target_cache.cc @@ -813,20 +813,22 @@ bool RenderTargetCache::Update(bool is_rasterization_done, } // Make sure the same render target isn't bound into two different slots // over time.
- for (uint32_t i = 1; are_accumulated_render_targets_valid_ && - i < 1 + xenos::kMaxColorRenderTargets; - ++i) { - const RenderTarget* render_target = - last_update_accumulated_render_targets_[i]; - if (!render_target) { - continue; - } - for (uint32_t j = 0; j < i; ++j) { - if (last_update_accumulated_render_targets_[j] == render_target) { - are_accumulated_render_targets_valid_ = false; - break; + // chrispy: this needs optimization! + if (are_accumulated_render_targets_valid_) { + for (uint32_t i = 1; i < 1 + xenos::kMaxColorRenderTargets; ++i) { + const RenderTarget* render_target = + last_update_accumulated_render_targets_[i]; + if (!render_target) { + continue; + } + for (uint32_t j = 0; j < i; ++j) { + if (last_update_accumulated_render_targets_[j] == render_target) { + are_accumulated_render_targets_valid_ = false; + goto exit_slot_check_loop; + } } } + exit_slot_check_loop:; } } if (!are_accumulated_render_targets_valid_) { diff --git a/src/xenia/gpu/texture_cache.cc b/src/xenia/gpu/texture_cache.cc index 05f6e2090..b697ff73c 100644 --- a/src/xenia/gpu/texture_cache.cc +++ b/src/xenia/gpu/texture_cache.cc @@ -154,7 +154,9 @@ TextureCache::TextureCache(const RegisterFile& register_file, : register_file_(register_file), shared_memory_(shared_memory), draw_resolution_scale_x_(draw_resolution_scale_x), - draw_resolution_scale_y_(draw_resolution_scale_y) { + draw_resolution_scale_y_(draw_resolution_scale_y), + draw_resolution_scale_x_divisor_(draw_resolution_scale_x), + draw_resolution_scale_y_divisor_(draw_resolution_scale_y) { assert_true(draw_resolution_scale_x >= 1); assert_true(draw_resolution_scale_x <= kMaxDrawResolutionScaleAlongAxis); assert_true(draw_resolution_scale_y >= 1); @@ -187,6 +189,7 @@ bool TextureCache::GetConfigDrawResolutionScale(uint32_t& x_out, uint32_t(std::max(INT32_C(1), cvars::draw_resolution_scale_x)); uint32_t config_y = uint32_t(std::max(INT32_C(1), cvars::draw_resolution_scale_y)); + uint32_t clamped_x = std::min(kMaxDrawResolutionScaleAlongAxis, config_x); uint32_t clamped_y = std::min(kMaxDrawResolutionScaleAlongAxis, config_y); x_out = clamped_x; @@ -552,8 +555,7 @@ void TextureCache::Texture::MarkAsUsed() { } void TextureCache::Texture::WatchCallback( - [[maybe_unused]] const global_unique_lock_type& global_lock, - bool is_mip) { + [[maybe_unused]] const global_unique_lock_type& global_lock, bool is_mip) { if (is_mip) { assert_not_zero(GetGuestMipsSize()); mips_outdated_ = true; @@ -566,8 +568,8 @@ void TextureCache::Texture::WatchCallback( } void TextureCache::WatchCallback(const global_unique_lock_type& global_lock, - void* context, - void* data, uint64_t argument, bool invalidated_by_gpu) { + void* context, void* data, uint64_t argument, + bool invalidated_by_gpu) { Texture& texture = *static_cast<Texture*>(context); texture.WatchCallback(global_lock, argument != 0); texture.texture_cache().texture_became_outdated_.store( @@ -910,8 +912,8 @@ void TextureCache::ScaledResolveGlobalWatchCallbackThunk( } void TextureCache::ScaledResolveGlobalWatchCallback( - const global_unique_lock_type& global_lock, - uint32_t address_first, uint32_t address_last, bool invalidated_by_gpu) { + const global_unique_lock_type& global_lock, uint32_t address_first, + uint32_t address_last, bool invalidated_by_gpu) { assert_true(IsDrawResolutionScaled()); if (invalidated_by_gpu) { // Resolves themselves do exactly the opposite of what this should do.
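On the "this needs optimization!" note in the render-target hunk above: with only 1 + xenos::kMaxColorRenderTargets slots, the quadratic scan is at most ten pointer compares, so the realistic cleanup is structural rather than asymptotic. One possible shape that drops the goto (a sketch, not code in the tree):

```cpp
// Returns true if any non-null render target appears in two slots.
// Mirrors the loop above: i starts at 1, j scans every earlier slot
// (including the depth slot at index 0).
static bool HasDuplicateRenderTargetBinding(const RenderTarget* const* rts,
                                            uint32_t count) {
  for (uint32_t i = 1; i < count; ++i) {
    if (!rts[i]) continue;
    for (uint32_t j = 0; j < i; ++j) {
      if (rts[j] == rts[i]) return true;
    }
  }
  return false;
}

// usage sketch:
// are_accumulated_render_targets_valid_ =
//     !HasDuplicateRenderTargetBinding(last_update_accumulated_render_targets_,
//                                      1 + xenos::kMaxColorRenderTargets);
```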
diff --git a/src/xenia/gpu/texture_cache.h b/src/xenia/gpu/texture_cache.h index cb690a286..c028c1be4 100644 --- a/src/xenia/gpu/texture_cache.h +++ b/src/xenia/gpu/texture_cache.h @@ -19,6 +19,7 @@ #include "xenia/base/assert.h" #include "xenia/base/hash.h" +#include "xenia/base/math.h" #include "xenia/base/mutex.h" #include "xenia/gpu/register_file.h" #include "xenia/gpu/shared_memory.h" @@ -70,6 +71,14 @@ class TextureCache { static bool GetConfigDrawResolutionScale(uint32_t& x_out, uint32_t& y_out); uint32_t draw_resolution_scale_x() const { return draw_resolution_scale_x_; } uint32_t draw_resolution_scale_y() const { return draw_resolution_scale_y_; } + + divisors::MagicDiv draw_resolution_scale_x_divisor() const { + return draw_resolution_scale_x_divisor_; + } + divisors::MagicDiv draw_resolution_scale_y_divisor() const { + return draw_resolution_scale_y_divisor_; + } + bool IsDrawResolutionScaled() const { return draw_resolution_scale_x_ > 1 || draw_resolution_scale_y_ > 1; } @@ -576,8 +585,8 @@ class TextureCache { // Shared memory callback for texture data invalidation. static void WatchCallback(const global_unique_lock_type& global_lock, - void* context, - void* data, uint64_t argument, bool invalidated_by_gpu); + void* context, void* data, uint64_t argument, + bool invalidated_by_gpu); // Checks if there are any pages that contain scaled resolve data within the // range. @@ -588,14 +597,15 @@ class TextureCache { const global_unique_lock_type& global_lock, void* context, uint32_t address_first, uint32_t address_last, bool invalidated_by_gpu); void ScaledResolveGlobalWatchCallback( - const global_unique_lock_type& global_lock, - uint32_t address_first, uint32_t address_last, bool invalidated_by_gpu); + const global_unique_lock_type& global_lock, uint32_t address_first, + uint32_t address_last, bool invalidated_by_gpu); const RegisterFile& register_file_; SharedMemory& shared_memory_; uint32_t draw_resolution_scale_x_; uint32_t draw_resolution_scale_y_; - + divisors::MagicDiv draw_resolution_scale_x_divisor_; + divisors::MagicDiv draw_resolution_scale_y_divisor_; static const LoadShaderInfo load_shader_info_[kLoadShaderCount]; xe::global_critical_region global_critical_region_; diff --git a/src/xenia/gpu/vulkan/vulkan_command_processor.cc b/src/xenia/gpu/vulkan/vulkan_command_processor.cc index 47d7506e6..9ac9f13e2 100644 --- a/src/xenia/gpu/vulkan/vulkan_command_processor.cc +++ b/src/xenia/gpu/vulkan/vulkan_command_processor.cc @@ -2366,6 +2366,7 @@ bool VulkanCommandProcessor::IssueDraw(xenos::PrimitiveType prim_type, // Get dynamic rasterizer state. draw_util::ViewportInfo viewport_info; + // Just handling maxViewportDimensions is enough - viewportBoundsRange[1] must // be at least 2 * max(maxViewportDimensions[0...1]) - 1, and // maxViewportDimensions must be greater than or equal to the size of the @@ -2382,11 +2383,16 @@ bool VulkanCommandProcessor::IssueDraw(xenos::PrimitiveType prim_type, // life. Or even disregard the viewport bounds range in the fragment shader // interlocks case completely - apply the viewport and the scissor offset // directly to pixel address and to things like ps_param_gen. 
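A note on divisors::MagicDiv, which this diff threads from the TextureCache accessors above into GetViewportInfoArgs: the draw resolution scale is fixed at texture-cache construction, so the per-draw `x_max / draw_resolution_scale_x` divides can be replaced with a multiply-shift by precomputed constants. A generic sketch of the technique (my illustration; xenia's actual MagicDiv layout and rounding fixups may differ):

```cpp
#include <cstdint>

// Unsigned division by a fixed divisor d via one 64-bit multiply and a
// shift: multiplier = ceil(2^(32+s) / d) with s = ceil(log2(d)). Valid as
// long as x * multiplier fits in 64 bits, which holds for the small
// viewport extents divided here; a fully general version needs an extra
// rounding fixup.
struct MagicDivSketch {
  uint64_t multiplier;
  uint32_t shift;
  explicit MagicDivSketch(uint32_t d) : multiplier(0), shift(0) {
    while ((1ull << shift) < d) ++shift;
    multiplier = ((1ull << (32 + shift)) + d - 1) / d;
  }
  uint32_t Apply(uint32_t x) const {
    return static_cast<uint32_t>((x * multiplier) >> (32 + shift));
  }
};

// e.g. MagicDivSketch{3}.Apply(x) == x / 3 for viewport-sized x.
```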
- draw_util::GetHostViewportInfo( - regs, 1, 1, false, device_limits.maxViewportDimensions[0], - device_limits.maxViewportDimensions[1], true, normalized_depth_control, - false, host_render_targets_used, - pixel_shader && pixel_shader->writes_depth(), viewport_info); + draw_util::GetViewportInfoArgs gviargs{}; + gviargs.Setup(1, 1, divisors::MagicDiv{1}, divisors::MagicDiv{1}, false, + device_limits.maxViewportDimensions[0], + + device_limits.maxViewportDimensions[1], true, + normalized_depth_control, false, host_render_targets_used, + pixel_shader && pixel_shader->writes_depth()); + gviargs.SetupRegisterValues(regs); + + draw_util::GetHostViewportInfo(&gviargs, viewport_info); // Update dynamic graphics pipeline state. UpdateDynamicState(viewport_info, primitive_polygonal, diff --git a/src/xenia/gpu/xenos.h b/src/xenia/gpu/xenos.h index 55a16df1d..df59b561c 100644 --- a/src/xenia/gpu/xenos.h +++ b/src/xenia/gpu/xenos.h @@ -326,7 +326,14 @@ constexpr bool IsColorRenderTargetFormat64bpp(ColorRenderTargetFormat format) { format == ColorRenderTargetFormat::k_32_32_FLOAT; } -inline uint32_t GetColorRenderTargetFormatComponentCount( +// 2 bits per entry, storing (component count - 1): 0 -> 1, 1 -> 2, 3 -> 4 +// (no format has exactly 3 components). Decode by shifting and adding 1. + +using ColorFormatComponentTable = uint32_t; + +static constexpr uint32_t GetComponentCountConst( ColorRenderTargetFormat format) { switch (format) { case ColorRenderTargetFormat::k_8_8_8_8: @@ -337,19 +344,51 @@ inline uint32_t GetColorRenderTargetFormatComponentCount( case ColorRenderTargetFormat::k_16_16_16_16: case ColorRenderTargetFormat::k_16_16_16_16_FLOAT: case ColorRenderTargetFormat::k_2_10_10_10_AS_10_10_10_10: case ColorRenderTargetFormat::k_2_10_10_10_FLOAT_AS_16_16_16_16: - return 4; + return 4 - 1; case ColorRenderTargetFormat::k_16_16: case ColorRenderTargetFormat::k_16_16_FLOAT: case ColorRenderTargetFormat::k_32_32_FLOAT: - return 2; + return 2 - 1; case ColorRenderTargetFormat::k_32_FLOAT: - return 1; + return 1 - 1; default: - assert_unhandled_case(format); return 0; } } +namespace detail { +static constexpr uint32_t encode_format_component_table() { + uint32_t result = 0; +#define ADDFORMAT(name) \ + result |= GetComponentCountConst(ColorRenderTargetFormat::name) \ + << (static_cast<uint32_t>(ColorRenderTargetFormat::name) * 2) + ADDFORMAT(k_8_8_8_8); + ADDFORMAT(k_8_8_8_8_GAMMA); + ADDFORMAT(k_2_10_10_10); + ADDFORMAT(k_2_10_10_10_FLOAT); + + ADDFORMAT(k_16_16_16_16); + ADDFORMAT(k_16_16_16_16_FLOAT); + ADDFORMAT(k_2_10_10_10_AS_10_10_10_10); + ADDFORMAT(k_2_10_10_10_FLOAT_AS_16_16_16_16); + + ADDFORMAT(k_16_16); + ADDFORMAT(k_16_16_FLOAT); + ADDFORMAT(k_32_32_FLOAT); + ADDFORMAT(k_32_FLOAT); + return result; +} constexpr uint32_t color_format_component_table = encode_format_component_table(); } // namespace detail +constexpr uint32_t GetColorRenderTargetFormatComponentCount( ColorRenderTargetFormat format) { return ((detail::color_format_component_table >> (static_cast<uint32_t>(format) * 2)) & 0b11) + 1; } // Returns the version of the format with the same packing and meaning of values // stored in it, but without blending precision modifiers. constexpr ColorRenderTargetFormat GetStorageColorFormat(