Merge pull request #81 from chrisps/canary_experimental
global lock changes, minor kernel changes, premake fix
commit 08d38bdff6
@@ -62,7 +62,9 @@ int XmaContext::Setup(uint32_t id, Memory* memory, uint32_t guest_ptr) {
   // Allocate ffmpeg stuff:
   av_packet_ = av_packet_alloc();
   assert_not_null(av_packet_);
 
+  // chrispy: preallocate this buffer so that ffmpeg isn't reallocating it for every packet,
+  // these allocations were causing RtlSubsegmentInitialize
+  av_packet_->buf = av_buffer_alloc(128 * 1024);
   // find the XMA2 audio decoder
   av_codec_ = avcodec_find_decoder(AV_CODEC_ID_XMAFRAMES);
   if (!av_codec_) {
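For context, a minimal hedged sketch of what the preallocation above buys (FFmpeg's real av_packet_alloc/av_buffer_alloc APIs; the loop, av_context, and helper names are illustrative, not the emulator's code): the packet keeps one reference-counted buffer alive across decode calls instead of allocating per packet.

// Hedged sketch: reuse one preallocated AVBufferRef across submissions so
// the decoder never triggers a per-packet heap allocation.
AVPacket* pkt = av_packet_alloc();
pkt->buf = av_buffer_alloc(128 * 1024);  // single up-front allocation
pkt->data = pkt->buf->data;
while (have_xma_packets()) {               // have_xma_packets(): illustrative
  pkt->size = fill_next_packet(pkt->data); // fill_next_packet(): illustrative
  avcodec_send_packet(av_context, pkt);    // av_context: illustrative AVCodecContext*
}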
@@ -91,18 +93,20 @@ int XmaContext::Setup(uint32_t id, Memory* memory, uint32_t guest_ptr) {
 }
 
 bool XmaContext::Work() {
-  std::lock_guard<xe_mutex> lock(lock_);
-  if (!is_allocated() || !is_enabled()) {
+  if (!is_enabled() || !is_allocated()) {
     return false;
   }
-  set_is_enabled(false);
+  {
+    std::lock_guard<xe_mutex> lock(lock_);
+    set_is_enabled(false);
 
-  auto context_ptr = memory()->TranslateVirtual(guest_ptr());
-  XMA_CONTEXT_DATA data(context_ptr);
-  Decode(&data);
-  data.Store(context_ptr);
-  return true;
+    auto context_ptr = memory()->TranslateVirtual(guest_ptr());
+    XMA_CONTEXT_DATA data(context_ptr);
+    Decode(&data);
+    data.Store(context_ptr);
+    return true;
+  }
 }
 
 void XmaContext::Enable() {
@@ -201,8 +201,8 @@ class XmaContext {
   uint32_t id_ = 0;
   uint32_t guest_ptr_ = 0;
   xe_mutex lock_;
-  bool is_allocated_ = false;
-  bool is_enabled_ = false;
+  volatile bool is_allocated_ = false;
+  volatile bool is_enabled_ = false;
   // bool is_dirty_ = true;
 
   // ffmpeg structures
@@ -1,348 +0,0 @@
-#include "dma.h"
-#include "logging.h"
-#include "mutex.h"
-#include "platform_win.h"
-
-XE_NTDLL_IMPORT(NtDelayExecution, cls_NtDelayExecution,
-                NtDelayExecutionPointer);
-XE_NTDLL_IMPORT(NtAlertThread, cls_NtAlertThread, NtAlertThreadPointer);
-XE_NTDLL_IMPORT(NtAlertThreadByThreadId, cls_NtAlertThreadByThreadId,
-                NtAlertThreadByThreadId);
-
-template <size_t N, typename... Ts>
-static void xedmaloghelper(const char (&fmt)[N], Ts... args) {
-  char buffer[1024];
-  sprintf_s(buffer, fmt, args...);
-  XELOGI("%s", buffer);
-}
-
-//#define XEDMALOG(...) XELOGI("XeDma: " __VA_ARGS__)
-//#define XEDMALOG(...) xedmaloghelper("XeDma: " __VA_ARGS__)
-#define XEDMALOG(...) static_cast<void>(0)
-using xe::swcache::CacheLine;
-static constexpr unsigned NUM_CACHELINES_IN_PAGE = 4096 / sizeof(CacheLine);
-#if defined(__clang__)
-XE_FORCEINLINE
-static void mvdir64b(void* to, const void* from) {
-  __asm__("movdir64b %1, %0" : : "r"(to), "m"(*(char*)from) : "memory");
-}
-#define _movdir64b mvdir64b
-#endif
-XE_FORCEINLINE
-static void XeCopy16384StreamingAVX(CacheLine* XE_RESTRICT to,
-                                    CacheLine* XE_RESTRICT from) {
-  uint32_t num_lines_for_8k = 4096 / XE_HOST_CACHE_LINE_SIZE;
-
-  CacheLine* dest1 = to;
-  CacheLine* src1 = from;
-
-  CacheLine* dest2 = to + NUM_CACHELINES_IN_PAGE;
-  CacheLine* src2 = from + NUM_CACHELINES_IN_PAGE;
-
-  CacheLine* dest3 = to + (NUM_CACHELINES_IN_PAGE * 2);
-  CacheLine* src3 = from + (NUM_CACHELINES_IN_PAGE * 2);
-
-  CacheLine* dest4 = to + (NUM_CACHELINES_IN_PAGE * 3);
-  CacheLine* src4 = from + (NUM_CACHELINES_IN_PAGE * 3);
-#pragma loop(no_vector)
-  for (uint32_t i = 0; i < num_lines_for_8k; ++i) {
-    xe::swcache::CacheLine line0, line1, line2, line3;
-
-    xe::swcache::ReadLine(&line0, src1 + i);
-    xe::swcache::ReadLine(&line1, src2 + i);
-    xe::swcache::ReadLine(&line2, src3 + i);
-    xe::swcache::ReadLine(&line3, src4 + i);
-    XE_MSVC_REORDER_BARRIER();
-    xe::swcache::WriteLineNT(dest1 + i, &line0);
-    xe::swcache::WriteLineNT(dest2 + i, &line1);
-
-    xe::swcache::WriteLineNT(dest3 + i, &line2);
-    xe::swcache::WriteLineNT(dest4 + i, &line3);
-  }
-  XE_MSVC_REORDER_BARRIER();
-}
-XE_FORCEINLINE
-static void XeCopy16384Movdir64M(CacheLine* XE_RESTRICT to,
-                                 CacheLine* XE_RESTRICT from) {
-  uint32_t num_lines_for_8k = 4096 / XE_HOST_CACHE_LINE_SIZE;
-
-  CacheLine* dest1 = to;
-  CacheLine* src1 = from;
-
-  CacheLine* dest2 = to + NUM_CACHELINES_IN_PAGE;
-  CacheLine* src2 = from + NUM_CACHELINES_IN_PAGE;
-
-  CacheLine* dest3 = to + (NUM_CACHELINES_IN_PAGE * 2);
-  CacheLine* src3 = from + (NUM_CACHELINES_IN_PAGE * 2);
-
-  CacheLine* dest4 = to + (NUM_CACHELINES_IN_PAGE * 3);
-  CacheLine* src4 = from + (NUM_CACHELINES_IN_PAGE * 3);
-#pragma loop(no_vector)
-  for (uint32_t i = 0; i < num_lines_for_8k; ++i) {
-#if 0
-    xe::swcache::CacheLine line0, line1, line2, line3;
-
-    xe::swcache::ReadLine(&line0, src1 + i);
-    xe::swcache::ReadLine(&line1, src2 + i);
-    xe::swcache::ReadLine(&line2, src3 + i);
-    xe::swcache::ReadLine(&line3, src4 + i);
-    XE_MSVC_REORDER_BARRIER();
-    xe::swcache::WriteLineNT(dest1 + i, &line0);
-    xe::swcache::WriteLineNT(dest2 + i, &line1);
-
-    xe::swcache::WriteLineNT(dest3 + i, &line2);
-    xe::swcache::WriteLineNT(dest4 + i, &line3);
-#else
-    _movdir64b(dest1 + i, src1 + i);
-    _movdir64b(dest2 + i, src2 + i);
-    _movdir64b(dest3 + i, src3 + i);
-    _movdir64b(dest4 + i, src4 + i);
-#endif
-  }
-  XE_MSVC_REORDER_BARRIER();
-}
-
-namespace xe::dma {
-using VastCpyDispatch = void (*)(CacheLine* XE_RESTRICT physaddr,
-                                 CacheLine* XE_RESTRICT rdmapping,
-                                 uint32_t written_length);
-static void vastcpy_impl_avx(CacheLine* XE_RESTRICT physaddr,
-                             CacheLine* XE_RESTRICT rdmapping,
-                             uint32_t written_length) {
-  static constexpr unsigned NUM_LINES_FOR_16K = 16384 / XE_HOST_CACHE_LINE_SIZE;
-
-  while (written_length >= 16384) {
-    XeCopy16384StreamingAVX(physaddr, rdmapping);
-
-    physaddr += NUM_LINES_FOR_16K;
-    rdmapping += NUM_LINES_FOR_16K;
-
-    written_length -= 16384;
-  }
-
-  if (!written_length) {
-    return;
-  }
-  uint32_t num_written_lines = written_length / XE_HOST_CACHE_LINE_SIZE;
-
-  uint32_t i = 0;
-
-  for (; i + 1 < num_written_lines; i += 2) {
-    xe::swcache::CacheLine line0, line1;
-
-    xe::swcache::ReadLine(&line0, rdmapping + i);
-
-    xe::swcache::ReadLine(&line1, rdmapping + i + 1);
-    XE_MSVC_REORDER_BARRIER();
-    xe::swcache::WriteLineNT(physaddr + i, &line0);
-    xe::swcache::WriteLineNT(physaddr + i + 1, &line1);
-  }
-
-  if (i < num_written_lines) {
-    xe::swcache::CacheLine line0;
-
-    xe::swcache::ReadLine(&line0, rdmapping + i);
-    xe::swcache::WriteLineNT(physaddr + i, &line0);
-  }
-}
-
-static void vastcpy_impl_movdir64m(CacheLine* XE_RESTRICT physaddr,
-                                   CacheLine* XE_RESTRICT rdmapping,
-                                   uint32_t written_length) {
-  static constexpr unsigned NUM_LINES_FOR_16K = 16384 / XE_HOST_CACHE_LINE_SIZE;
-
-  while (written_length >= 16384) {
-    XeCopy16384Movdir64M(physaddr, rdmapping);
-
-    physaddr += NUM_LINES_FOR_16K;
-    rdmapping += NUM_LINES_FOR_16K;
-
-    written_length -= 16384;
-  }
-
-  if (!written_length) {
-    return;
-  }
-  uint32_t num_written_lines = written_length / XE_HOST_CACHE_LINE_SIZE;
-
-  uint32_t i = 0;
-
-  for (; i + 1 < num_written_lines; i += 2) {
-    _movdir64b(physaddr + i, rdmapping + i);
-    _movdir64b(physaddr + i + 1, rdmapping + i + 1);
-  }
-
-  if (i < num_written_lines) {
-    _movdir64b(physaddr + i, rdmapping + i);
-  }
-}
-
-XE_COLD
-static void first_vastcpy(CacheLine* XE_RESTRICT physaddr,
-                          CacheLine* XE_RESTRICT rdmapping,
-                          uint32_t written_length);
-
-static VastCpyDispatch vastcpy_dispatch = first_vastcpy;
-
-XE_COLD
-static void first_vastcpy(CacheLine* XE_RESTRICT physaddr,
-                          CacheLine* XE_RESTRICT rdmapping,
-                          uint32_t written_length) {
-  VastCpyDispatch dispatch_to_use = nullptr;
-  if (amd64::GetFeatureFlags() & amd64::kX64EmitMovdir64M) {
-    XELOGI("Selecting MOVDIR64M vastcpy.");
-    dispatch_to_use = vastcpy_impl_movdir64m;
-  } else {
-    XELOGI("Selecting generic AVX vastcpy.");
-    dispatch_to_use = vastcpy_impl_avx;
-  }
-
-  vastcpy_dispatch =
-      dispatch_to_use;  // all future calls will go through our selected path
-  return vastcpy_dispatch(physaddr, rdmapping, written_length);
-}
-
-XE_NOINLINE
-void vastcpy(uint8_t* XE_RESTRICT physaddr, uint8_t* XE_RESTRICT rdmapping,
-             uint32_t written_length) {
-  return vastcpy_dispatch((CacheLine*)physaddr, (CacheLine*)rdmapping,
-                          written_length);
-}
-
-#define MAX_INFLIGHT_DMAJOBS 65536
-#define INFLICT_DMAJOB_MASK (MAX_INFLIGHT_DMAJOBS - 1)
-class XeDMACGeneric : public XeDMAC {
-  std::unique_ptr<xe::threading::Thread> thrd_;
-  XeDMAJob* jobs_ring_;
-  volatile std::atomic<uintptr_t> write_ptr_;
-
-  struct alignas(XE_HOST_CACHE_LINE_SIZE) {
-    volatile std::atomic<uintptr_t> read_ptr_;
-    xe_mutex push_into_ring_lock_;
-  };
-  HANDLE gotjob_event;
-  void WorkerWait();
-
- public:
-  virtual ~XeDMACGeneric() {}
-  void WorkerThreadMain();
-  XeDMACGeneric() {
-    threading::Thread::CreationParameters crparams;
-    crparams.create_suspended = true;
-    crparams.initial_priority = threading::ThreadPriority::kNormal;
-    crparams.stack_size = 65536;
-    gotjob_event = CreateEventA(nullptr, false, false, nullptr);
-    thrd_ = std::move(threading::Thread::Create(
-        crparams, [this]() { this->WorkerThreadMain(); }));
-
-    jobs_ring_ = (XeDMAJob*)_aligned_malloc(
-        MAX_INFLIGHT_DMAJOBS * sizeof(XeDMAJob), XE_HOST_CACHE_LINE_SIZE);
-
-    write_ptr_ = 0;
-    read_ptr_ = 0;
-
-    thrd_->Resume();
-  }
-
-  virtual DMACJobHandle PushDMAJob(XeDMAJob* job) override {
-    // std::unique_lock<xe_mutex> pushlock{push_into_ring_lock_};
-    HANDLE dmacevent = CreateEventA(nullptr, true, false, nullptr);
-    {
-      job->dmac_specific_ = (uintptr_t)dmacevent;
-
-      jobs_ring_[write_ptr_ % MAX_INFLIGHT_DMAJOBS] = *job;
-      write_ptr_++;
-      SetEvent(gotjob_event);
-    }
-    return (DMACJobHandle)dmacevent;
-  }
-  virtual void WaitJobDone(DMACJobHandle handle) override {
-    while (WaitForSingleObject((HANDLE)handle, 2) == WAIT_TIMEOUT) {
-      // NtAlertThreadByThreadId.invoke<void>(thrd_->system_id());
-      // while (SignalObjectAndWait(gotjob_event, (HANDLE)handle, 2, false) ==
-      //        WAIT_TIMEOUT) {
-      //   ;
-    }
-    //}
-
-    // SignalObjectAndWait(gotjob_event, (HANDLE)handle, INFINITE, false);
-    CloseHandle((HANDLE)handle);
-  }
-  virtual void WaitForIdle() override {
-    while (write_ptr_ != read_ptr_) {
-      threading::MaybeYield();
-    }
-  }
-};
-void XeDMACGeneric::WorkerWait() {
-  constexpr unsigned NUM_PAUSE_SPINS = 2048;
-  constexpr unsigned NUM_YIELD_SPINS = 8;
-#if 0
-  for (unsigned i = 0; i < NUM_PAUSE_SPINS; ++i) {
-    if (write_ptr_ == read_ptr_) {
-      _mm_pause();
-    } else {
-      break;
-    }
-  }
-  for (unsigned i = 0; i < NUM_YIELD_SPINS; ++i) {
-    if (write_ptr_ == read_ptr_) {
-      threading::MaybeYield();
-    } else {
-      break;
-    }
-  }
-  LARGE_INTEGER yield_execution_delay{};
-  yield_execution_delay.QuadPart =
-      -2000;  // -10000 == 1 ms, so -2000 means delay for 0.2 milliseconds
-  while (write_ptr_ == read_ptr_) {
-    NtDelayExecutionPointer.invoke<void>(0, &yield_execution_delay);
-  }
-#else
-  do {
-    if (WaitForSingleObjectEx(gotjob_event, 1, TRUE) == WAIT_OBJECT_0) {
-      while (write_ptr_ == read_ptr_) {
-        _mm_pause();
-      }
-    }
-  } while (write_ptr_ == read_ptr_);
-#endif
-}
-void XeDMACGeneric::WorkerThreadMain() {
-  while (true) {
-    this->WorkerWait();
-
-    XeDMAJob current_job = jobs_ring_[read_ptr_ % MAX_INFLIGHT_DMAJOBS];
-    swcache::ReadFence();
-
-    if (current_job.precall) {
-      current_job.precall(&current_job);
-    }
-
-    size_t num_lines = current_job.size / XE_HOST_CACHE_LINE_SIZE;
-    size_t line_rounded = num_lines * XE_HOST_CACHE_LINE_SIZE;
-
-    size_t line_rem = current_job.size - line_rounded;
-
-    vastcpy(current_job.destination, current_job.source,
-            static_cast<uint32_t>(line_rounded));
-
-    if (line_rem) {
-      __movsb(current_job.destination + line_rounded,
-              current_job.source + line_rounded, line_rem);
-    }
-
-    if (current_job.postcall) {
-      current_job.postcall(&current_job);
-    }
-    read_ptr_++;
-    swcache::WriteFence();
-
-    SetEvent((HANDLE)current_job.dmac_specific_);
-  }
-}
-
-XeDMAC* CreateDMAC() { return new XeDMACGeneric(); }
-}  // namespace xe::dma
@@ -1,47 +0,0 @@
-/**
- ******************************************************************************
- * Xenia : Xbox 360 Emulator Research Project                                 *
- ******************************************************************************
- * Copyright 2020 Ben Vanik. All rights reserved.                             *
- * Released under the BSD license - see LICENSE in the root for more details. *
- ******************************************************************************
- */
-
-#ifndef XENIA_BASE_DMA_H_
-#define XENIA_BASE_DMA_H_
-#include "memory.h"
-#include "threading.h"
-namespace xe::dma {
-struct XeDMAJob;
-using DmaPrecall = void (*)(XeDMAJob* job);
-using DmaPostcall = void (*)(XeDMAJob* job);
-struct XeDMAJob {
-  // threading::Event* signal_on_done;
-  uintptr_t dmac_specific_;
-  uint8_t* destination;
-  uint8_t* source;
-  size_t size;
-  DmaPrecall precall;
-  DmaPostcall postcall;
-  void* userdata1;
-  void* userdata2;
-};
-using DMACJobHandle = uint64_t;
-
-class XeDMAC {
- public:
-  virtual ~XeDMAC() {}
-  virtual DMACJobHandle PushDMAJob(XeDMAJob* job) = 0;
-  virtual void WaitJobDone(DMACJobHandle handle) = 0;
-  virtual void WaitForIdle() = 0;
-};
-
-XeDMAC* CreateDMAC();
-// must be divisible by cache line size
-XE_NOINLINE
-void vastcpy(uint8_t* XE_RESTRICT physaddr, uint8_t* XE_RESTRICT rdmapping,
-             uint32_t written_length);
-
-}  // namespace xe::dma
-
-#endif  // XENIA_BASE_DMA_H_
@@ -44,8 +44,18 @@ template <typename T>
 constexpr bool is_pow2(T value) {
   return (value & (value - 1)) == 0;
 }
+/*
+  Use this in place of the shift + and-not sequence currently used in bit
+  iteration code. This is more efficient because it does not introduce a
+  dependency on the previous bit scanning operation. The shift + and-not
+  sequence does get translated to a single instruction (the bit test and
+  reset instruction), but this code can be executed alongside the scan.
+*/
+template <typename T>
+constexpr T clear_lowest_bit(T value) {
+  static_assert(std::is_integral_v<T>);
+
+  return (value - static_cast<T>(1)) & value;
+}
 // Rounds up the given value to the given alignment.
 template <typename T>
 constexpr T align(T value, T alignment) {
   return (value + alignment - 1) & ~(alignment - 1);
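A short usage sketch, mirroring how later hunks in this commit rewrite their bit-iteration loops (bits and visit() are illustrative stand-ins): the clear depends only on the mask itself, so it can issue in parallel with the next scan.

// Before: mask &= ~(uint32_t(1) << index);  // depends on the bsf result
// After: clear_lowest_bit needs only the mask, not the scanned index.
uint32_t mask = bits;  // bits: whatever bitmap is being walked
uint32_t index;
while (xe::bit_scan_forward(mask, &index)) {
  mask = xe::clear_lowest_bit(mask);
  visit(index);  // visit(): illustrative loop body
}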
@@ -10,6 +10,7 @@
 #include "xenia/base/memory.h"
 #include "xenia/base/cvar.h"
 #include "xenia/base/platform.h"
+#include "xenia/base/logging.h"
 
 #if XE_ARCH_ARM64
 #include <arm_neon.h>
@@ -31,6 +32,180 @@ bool IsWritableExecutableMemoryPreferred() {
          cvars::writable_executable_memory;
 }
 
+using xe::swcache::CacheLine;
+
+static constexpr unsigned NUM_CACHELINES_IN_PAGE = 4096 / sizeof(CacheLine);
+
+#if defined(__clang__)
+XE_FORCEINLINE
+static void mvdir64b(void* to, const void* from) {
+  __asm__("movdir64b %1, %0" : : "r"(to), "m"(*(char*)from) : "memory");
+}
+#define _movdir64b mvdir64b
+#endif
+XE_FORCEINLINE
+static void XeCopy16384StreamingAVX(CacheLine* XE_RESTRICT to,
+                                    CacheLine* XE_RESTRICT from) {
+  uint32_t num_lines_for_8k = 4096 / XE_HOST_CACHE_LINE_SIZE;
+
+  CacheLine* dest1 = to;
+  CacheLine* src1 = from;
+
+  CacheLine* dest2 = to + NUM_CACHELINES_IN_PAGE;
+  CacheLine* src2 = from + NUM_CACHELINES_IN_PAGE;
+
+  CacheLine* dest3 = to + (NUM_CACHELINES_IN_PAGE * 2);
+  CacheLine* src3 = from + (NUM_CACHELINES_IN_PAGE * 2);
+
+  CacheLine* dest4 = to + (NUM_CACHELINES_IN_PAGE * 3);
+  CacheLine* src4 = from + (NUM_CACHELINES_IN_PAGE * 3);
+#pragma loop(no_vector)
+  for (uint32_t i = 0; i < num_lines_for_8k; ++i) {
+    xe::swcache::CacheLine line0, line1, line2, line3;
+
+    xe::swcache::ReadLine(&line0, src1 + i);
+    xe::swcache::ReadLine(&line1, src2 + i);
+    xe::swcache::ReadLine(&line2, src3 + i);
+    xe::swcache::ReadLine(&line3, src4 + i);
+    XE_MSVC_REORDER_BARRIER();
+    xe::swcache::WriteLineNT(dest1 + i, &line0);
+    xe::swcache::WriteLineNT(dest2 + i, &line1);
+
+    xe::swcache::WriteLineNT(dest3 + i, &line2);
+    xe::swcache::WriteLineNT(dest4 + i, &line3);
+  }
+  XE_MSVC_REORDER_BARRIER();
+}
+XE_FORCEINLINE
+static void XeCopy16384Movdir64M(CacheLine* XE_RESTRICT to,
+                                 CacheLine* XE_RESTRICT from) {
+  uint32_t num_lines_for_8k = 4096 / XE_HOST_CACHE_LINE_SIZE;
+
+  CacheLine* dest1 = to;
+  CacheLine* src1 = from;
+
+  CacheLine* dest2 = to + NUM_CACHELINES_IN_PAGE;
+  CacheLine* src2 = from + NUM_CACHELINES_IN_PAGE;
+
+  CacheLine* dest3 = to + (NUM_CACHELINES_IN_PAGE * 2);
+  CacheLine* src3 = from + (NUM_CACHELINES_IN_PAGE * 2);
+
+  CacheLine* dest4 = to + (NUM_CACHELINES_IN_PAGE * 3);
+  CacheLine* src4 = from + (NUM_CACHELINES_IN_PAGE * 3);
+#pragma loop(no_vector)
+  for (uint32_t i = 0; i < num_lines_for_8k; ++i) {
+    _movdir64b(dest1 + i, src1 + i);
+    _movdir64b(dest2 + i, src2 + i);
+    _movdir64b(dest3 + i, src3 + i);
+    _movdir64b(dest4 + i, src4 + i);
+  }
+  XE_MSVC_REORDER_BARRIER();
+}
+using VastCpyDispatch = void (*)(CacheLine* XE_RESTRICT physaddr,
+                                 CacheLine* XE_RESTRICT rdmapping,
+                                 uint32_t written_length);
+static void vastcpy_impl_avx(CacheLine* XE_RESTRICT physaddr,
+                             CacheLine* XE_RESTRICT rdmapping,
+                             uint32_t written_length) {
+  static constexpr unsigned NUM_LINES_FOR_16K = 16384 / XE_HOST_CACHE_LINE_SIZE;
+
+  while (written_length >= 16384) {
+    XeCopy16384StreamingAVX(physaddr, rdmapping);
+
+    physaddr += NUM_LINES_FOR_16K;
+    rdmapping += NUM_LINES_FOR_16K;
+
+    written_length -= 16384;
+  }
+
+  if (!written_length) {
+    return;
+  }
+  uint32_t num_written_lines = written_length / XE_HOST_CACHE_LINE_SIZE;
+
+  uint32_t i = 0;
+
+  for (; i + 1 < num_written_lines; i += 2) {
+    xe::swcache::CacheLine line0, line1;
+
+    xe::swcache::ReadLine(&line0, rdmapping + i);
+
+    xe::swcache::ReadLine(&line1, rdmapping + i + 1);
+    XE_MSVC_REORDER_BARRIER();
+    xe::swcache::WriteLineNT(physaddr + i, &line0);
+    xe::swcache::WriteLineNT(physaddr + i + 1, &line1);
+  }
+
+  if (i < num_written_lines) {
+    xe::swcache::CacheLine line0;
+
+    xe::swcache::ReadLine(&line0, rdmapping + i);
+    xe::swcache::WriteLineNT(physaddr + i, &line0);
+  }
+}
+
+static void vastcpy_impl_movdir64m(CacheLine* XE_RESTRICT physaddr,
+                                   CacheLine* XE_RESTRICT rdmapping,
+                                   uint32_t written_length) {
+  static constexpr unsigned NUM_LINES_FOR_16K = 16384 / XE_HOST_CACHE_LINE_SIZE;
+
+  while (written_length >= 16384) {
+    XeCopy16384Movdir64M(physaddr, rdmapping);
+
+    physaddr += NUM_LINES_FOR_16K;
+    rdmapping += NUM_LINES_FOR_16K;
+
+    written_length -= 16384;
+  }
+
+  if (!written_length) {
+    return;
+  }
+  uint32_t num_written_lines = written_length / XE_HOST_CACHE_LINE_SIZE;
+
+  uint32_t i = 0;
+
+  for (; i + 1 < num_written_lines; i += 2) {
+    _movdir64b(physaddr + i, rdmapping + i);
+    _movdir64b(physaddr + i + 1, rdmapping + i + 1);
+  }
+
+  if (i < num_written_lines) {
+    _movdir64b(physaddr + i, rdmapping + i);
+  }
+}
+
+XE_COLD
+static void first_vastcpy(CacheLine* XE_RESTRICT physaddr,
+                          CacheLine* XE_RESTRICT rdmapping,
+                          uint32_t written_length);
+
+static VastCpyDispatch vastcpy_dispatch = first_vastcpy;
+
+XE_COLD
+static void first_vastcpy(CacheLine* XE_RESTRICT physaddr,
+                          CacheLine* XE_RESTRICT rdmapping,
+                          uint32_t written_length) {
+  VastCpyDispatch dispatch_to_use = nullptr;
+  if (amd64::GetFeatureFlags() & amd64::kX64EmitMovdir64M) {
+    XELOGI("Selecting MOVDIR64M vastcpy.");
+    dispatch_to_use = vastcpy_impl_movdir64m;
+  } else {
+    XELOGI("Selecting generic AVX vastcpy.");
+    dispatch_to_use = vastcpy_impl_avx;
+  }
+
+  vastcpy_dispatch =
+      dispatch_to_use;  // all future calls will go through our selected path
+  return vastcpy_dispatch(physaddr, rdmapping, written_length);
+}
+
+XE_NOINLINE
+void vastcpy(uint8_t* XE_RESTRICT physaddr, uint8_t* XE_RESTRICT rdmapping,
+             uint32_t written_length) {
+  return vastcpy_dispatch((CacheLine*)physaddr, (CacheLine*)rdmapping,
+                          written_length);
+}
 }  // namespace memory
 
 // TODO(benvanik): fancy AVX versions.
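The first_vastcpy idiom above is a one-time dispatch: the function pointer boots on a cold resolver that probes CPU features once, then rewrites the pointer so every later call goes straight to the selected implementation. A minimal self-contained sketch of the pattern (all names illustrative, not the emulator's code):

#include <cstdint>
#include <cstring>

using CopyFn = void (*)(void*, const void*, uint32_t);

static void copy_generic(void* d, const void* s, uint32_t n) {
  std::memcpy(d, s, n);
}
static void copy_fast(void* d, const void* s, uint32_t n) {
  std::memcpy(d, s, n);  // stand-in for a CPU-feature-gated fast path
}
static bool cpu_has_fast_path() { return false; }  // stand-in feature probe

static void copy_first_call(void* d, const void* s, uint32_t n);
static CopyFn copy_dispatch = copy_first_call;  // starts on the cold resolver

static void copy_first_call(void* d, const void* s, uint32_t n) {
  copy_dispatch = cpu_has_fast_path() ? copy_fast : copy_generic;
  copy_dispatch(d, s, n);  // forward the first call; later calls skip the probe
}

void copy(void* d, const void* s, uint32_t n) { copy_dispatch(d, s, n); }

Like the original, this sketch tolerates the benign race of two threads resolving concurrently: both write the same pointer value.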
@@ -17,9 +17,9 @@
 #include <string>
 #include <string_view>
 
 #include "xenia/base/assert.h"
-
 #include "xenia/base/byte_order.h"
+#include "xenia/base/platform.h"
 
 namespace xe {
 namespace memory {
@@ -141,6 +141,10 @@ size_t hash_combine(size_t seed, const T& v, const Ts&... vs) {
   seed ^= hasher(v) + 0x9E3779B9 + (seed << 6) + (seed >> 2);
   return hash_combine(seed, vs...);
 }
+// must be divisible by cache line size
+XE_NOINLINE
+void vastcpy(uint8_t* XE_RESTRICT physaddr, uint8_t* XE_RESTRICT rdmapping,
+             uint32_t written_length);
 
 }  // namespace memory
@@ -133,6 +133,7 @@ using global_unique_lock_type = std::unique_lock<global_mutex_type>;
 // };
 class global_critical_region {
  public:
+  constexpr global_critical_region() {}
   static global_mutex_type& mutex();
 
   // Acquires a lock on the global critical section.
@@ -144,18 +145,18 @@ class global_critical_region {
   }
 
   // Acquires a lock on the global critical section.
-  inline global_unique_lock_type Acquire() {
+  static inline global_unique_lock_type Acquire() {
     return global_unique_lock_type(mutex());
   }
 
   // Acquires a deferred lock on the global critical section.
-  inline global_unique_lock_type AcquireDeferred() {
+  static inline global_unique_lock_type AcquireDeferred() {
     return global_unique_lock_type(mutex(), std::defer_lock);
   }
 
   // Tries to acquire a lock on the global critical section.
   // Check owns_lock() to see if the lock was successfully acquired.
-  inline global_unique_lock_type TryAcquire() {
+  static inline global_unique_lock_type TryAcquire() {
     return global_unique_lock_type(mutex(), std::try_to_lock);
   }
 };
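With the accessors now static, callers no longer need a global_critical_region instance to take the global lock. A hedged usage sketch (the class lives in namespace xe):

auto guard = xe::global_critical_region::Acquire();          // locked until scope exit
auto maybe = xe::global_critical_region::TryAcquire();       // check maybe.owns_lock()
auto later = xe::global_critical_region::AcquireDeferred();  // call later.lock() when needed

This is exactly what the IssueDraw hunk further down relies on when it hoists one Acquire() around its RequestRange loop.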
@@ -183,20 +183,13 @@ inline uint32_t RingBuffer::ReadAndSwap<uint32_t>() {
   xenia_assert(this->capacity_ >= 4);
 
   ring_size_t next_read_offset = read_offset + 4;
-#if 0
-  size_t zerotest = next_read_offset - this->capacity_;
-  // unpredictable branch, use bit arith instead
-  // todo: it would be faster to use lzcnt, but we need to figure out if all
-  // machines we support support it
-  next_read_offset &= -static_cast<ptrdiff_t>(!!zerotest);
-#else
 
   if (XE_UNLIKELY(next_read_offset == this->capacity_)) {
     next_read_offset = 0;
     // todo: maybe prefetch next? or should that happen much earlier?
   }
-#endif
   this->read_offset_ = next_read_offset;
-  unsigned int ring_value = *(uint32_t*)&this->buffer_[read_offset];
+  uint32_t ring_value = *(uint32_t*)&this->buffer_[read_offset];
   return xe::byte_swap(ring_value);
 }
 }  // namespace xe
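For clarity on the #if 0 branch this hunk deletes, the bit-arithmetic version cleared the offset with a mask instead of a branch; worked through with illustrative numbers:

// zerotest = next_read_offset - capacity_
//   capacity_ = 64, next_read_offset = 64 -> zerotest == 0
//     -> !!0 == 0 -> mask = -(ptrdiff_t)0 = 0 -> offset &= 0 -> wrap to 0
//   capacity_ = 64, next_read_offset = 32 -> zerotest != 0
//     -> !!x == 1 -> mask = all ones        -> offset unchanged
// The XE_UNLIKELY branch kept by this commit trades that for readability
// plus a natural spot to prefetch on wrap.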
@@ -271,7 +271,10 @@ inline std::pair<WaitResult, size_t> WaitAny(
   return WaitAny(wait_handles.data(), wait_handles.size(), is_alertable,
                  timeout);
 }
-
+struct EventInfo {
+  uint32_t type;
+  uint32_t state;
+};
 // Models a Win32-like event object.
 // https://msdn.microsoft.com/en-us/library/windows/desktop/ms682396(v=vs.85).aspx
 class Event : public WaitHandle {
@@ -299,6 +302,8 @@ class Event : public WaitHandle {
   // the nonsignaled state after releasing the appropriate number of waiting
   // threads.
   virtual void Pulse() = 0;
+
+  virtual EventInfo Query() = 0;
 #if XE_PLATFORM_WIN32 == 1
   // SetEvent, but if there is a waiter we immediately transfer execution to it
   virtual void SetBoostPriority() = 0;
@@ -55,7 +55,7 @@ XE_NTDLL_IMPORT(NtReleaseSemaphore, cls_NtReleaseSemaphore,
 
 XE_NTDLL_IMPORT(NtDelayExecution, cls_NtDelayExecution,
                 NtDelayExecutionPointer);
-
+XE_NTDLL_IMPORT(NtQueryEvent, cls_NtQueryEvent, NtQueryEventPointer);
 namespace xe {
 namespace threading {
@@ -255,20 +255,20 @@ std::pair<WaitResult, size_t> WaitMultiple(WaitHandle* wait_handles[],
                                            size_t wait_handle_count,
                                            bool wait_all, bool is_alertable,
                                            std::chrono::milliseconds timeout) {
-  std::vector<HANDLE> handles(
-      wait_handle_count);  // max handles is like 64, so it would make more
-                           // sense to just do a fixed size array here
+  xenia_assert(wait_handle_count <= 64);
+  HANDLE handles[64];
 
   for (size_t i = 0; i < wait_handle_count; ++i) {
     handles[i] = wait_handles[i]->native_handle();
   }
   DWORD result = WaitForMultipleObjectsEx(
-      DWORD(handles.size()), handles.data(), wait_all ? TRUE : FALSE,
+      static_cast<DWORD>(wait_handle_count), handles, wait_all ? TRUE : FALSE,
       DWORD(timeout.count()), is_alertable ? TRUE : FALSE);
-  if (result >= WAIT_OBJECT_0 && result < WAIT_OBJECT_0 + handles.size()) {
+  if (result >= WAIT_OBJECT_0 && result < WAIT_OBJECT_0 + wait_handle_count) {
     return std::pair<WaitResult, size_t>(WaitResult::kSuccess,
                                          result - WAIT_OBJECT_0);
   } else if (result >= WAIT_ABANDONED_0 &&
-             result < WAIT_ABANDONED_0 + handles.size()) {
+             result < WAIT_ABANDONED_0 + wait_handle_count) {
     return std::pair<WaitResult, size_t>(WaitResult::kAbandoned,
                                          result - WAIT_ABANDONED_0);
   }
@@ -293,7 +293,15 @@ class Win32Event : public Win32Handle<Event> {
   void Pulse() override { NtPulseEventPointer.invoke(handle_, nullptr); }
   void SetBoostPriority() override {
     // no previous state for boostpriority
-    NtSetEventBoostPriorityPointer.invoke(handle_);
+    // Boost priority is unimplemented under Wine, probably because it's not
+    // used anywhere in user mode except by us. Maybe some Windows internals
+    // use it; see:
+    // https://discord.com/channels/308194948048486401/308207592482668545/1027178776599216228
+    if (NtSetEventBoostPriorityPointer) {
+      NtSetEventBoostPriorityPointer.invoke(handle_);
+    } else {
+      NtSetEventPointer.invoke(handle_, nullptr);
+    }
   }
 #else
   void Set() override { SetEvent(handle_); }
@@ -305,6 +313,11 @@ class Win32Event : public Win32Handle<Event> {
     SetEvent(handle_);
   }
 #endif
+  EventInfo Query() {
+    EventInfo result{};
+    NtQueryEventPointer.invoke(handle_, 0, &result, sizeof(EventInfo), nullptr);
+    return result;
+  }
 };
 
 std::unique_ptr<Event> Event::CreateManualResetEvent(bool initial_state) {
@@ -39,7 +39,7 @@
 #include "xenia/cpu/backend/x64/x64_stack_layout.h"
 #include "xenia/cpu/hir/hir_builder.h"
 #include "xenia/cpu/processor.h"
-
+XE_MSVC_OPTIMIZE_SMALL()
 DEFINE_bool(use_fast_dot_product, false,
             "Experimental optimization, much shorter sequence on dot products, "
             "treating inf as overflow instead of using mcxsr"
@@ -10,11 +10,7 @@ project("xenia-cpu")
     "xenia-base",
     "mspack",
   })
-  filter({"configurations:Release", "platforms:Windows"})
-  buildoptions({
-      "/Os",
-      "/O1"
-  })
 
   includedirs({
     project_root.."/third_party/llvm/include",
   })
@@ -27,3 +23,8 @@ project("xenia-cpu")
 
 include("testing")
 include("ppc/testing")
+filter({"configurations:Release", "platforms:Windows"})
+  buildoptions({
+      "/Os",
+      "/O1"
+  })
@@ -17,6 +17,7 @@ project("xenia-debug-ui")
       "/Os",
       "/O1"
   })
+  filter{}
   defines({
   })
   includedirs({
@@ -9,23 +9,16 @@
 
 #include "xenia/gpu/command_processor.h"
 
 #include <algorithm>
 #include <cinttypes>
 #include <cmath>
 #include <cstring>
 
 #include "third_party/fmt/include/fmt/format.h"
 #include "xenia/base/byte_stream.h"
 #include "xenia/base/cvar.h"
 #include "xenia/base/logging.h"
 #include "xenia/base/math.h"
 #include "xenia/base/profiling.h"
 #include "xenia/base/ring_buffer.h"
 #include "xenia/gpu/gpu_flags.h"
 #include "xenia/gpu/graphics_system.h"
 #include "xenia/gpu/sampler_info.h"
 #include "xenia/gpu/texture_info.h"
 #include "xenia/gpu/xenos.h"
 #include "xenia/kernel/kernel_state.h"
 #include "xenia/kernel/user_module.h"
@@ -46,11 +39,6 @@ CommandProcessor::CommandProcessor(GraphicsSystem* graphics_system,
       write_ptr_index_event_(xe::threading::Event::CreateAutoResetEvent(false)),
       write_ptr_index_(0) {
   assert_not_null(write_ptr_index_event_);
-#if 0
-  dmac_ = dma::CreateDMAC();
-#else
-  dmac_ = nullptr;
-#endif
 }
 
 CommandProcessor::~CommandProcessor() = default;
@@ -625,78 +613,6 @@ void CommandProcessor::PrepareForWait() { trace_writer_.Flush(); }
 
 void CommandProcessor::ReturnFromWait() {}
 
-uint32_t CommandProcessor::ExecutePrimaryBuffer(uint32_t read_index,
-                                                uint32_t write_index) {
-  SCOPE_profile_cpu_f("gpu");
-#if XE_ENABLE_TRACE_WRITER_INSTRUMENTATION == 1
-  // If we have a pending trace stream open it now. That way we ensure we get
-  // all commands.
-  if (!trace_writer_.is_open() && trace_state_ == TraceState::kStreaming) {
-    uint32_t title_id = kernel_state_->GetExecutableModule()
-                            ? kernel_state_->GetExecutableModule()->title_id()
-                            : 0;
-    auto file_name = fmt::format("{:08X}_stream.xtr", title_id);
-    auto path = trace_stream_path_ / file_name;
-    trace_writer_.Open(path, title_id);
-    InitializeTrace();
-  }
-#endif
-  // Adjust pointer base.
-  uint32_t start_ptr = primary_buffer_ptr_ + read_index * sizeof(uint32_t);
-  start_ptr = (primary_buffer_ptr_ & ~0x1FFFFFFF) | (start_ptr & 0x1FFFFFFF);
-  uint32_t end_ptr = primary_buffer_ptr_ + write_index * sizeof(uint32_t);
-  end_ptr = (primary_buffer_ptr_ & ~0x1FFFFFFF) | (end_ptr & 0x1FFFFFFF);
-
-  trace_writer_.WritePrimaryBufferStart(start_ptr, write_index - read_index);
-
-  // Execute commands!
-
-  RingBuffer old_reader = reader_;
-  new (&reader_) RingBuffer(memory_->TranslatePhysical(primary_buffer_ptr_),
-                            primary_buffer_size_);
-
-  reader_.set_read_offset(read_index * sizeof(uint32_t));
-  reader_.set_write_offset(write_index * sizeof(uint32_t));
-  // prefetch the wraparound range
-  // it likely is already in L3 cache, but in a zen system it may be another
-  // chiplets l3
-  reader_.BeginPrefetchedRead<swcache::PrefetchTag::Level2>(
-      GetCurrentRingReadCount());
-  do {
-    if (!ExecutePacket()) {
-      // This probably should be fatal - but we're going to continue anyways.
-      XELOGE("**** PRIMARY RINGBUFFER: Failed to execute packet.");
-      assert_always();
-      break;
-    }
-  } while (reader_.read_count());
-
-  OnPrimaryBufferEnd();
-
-  trace_writer_.WritePrimaryBufferEnd();
-
-  reader_ = old_reader;
-  return write_index;
-}
-
-void CommandProcessor::ExecutePacket(uint32_t ptr, uint32_t count) {
-  // Execute commands!
-  RingBuffer old_reader = reader_;
-
-  new (&reader_)
-      RingBuffer{memory_->TranslatePhysical(ptr), count * sizeof(uint32_t)};
-
-  reader_.set_write_offset(count * sizeof(uint32_t));
-
-  do {
-    if (!ExecutePacket()) {
-      XELOGE("**** ExecutePacket: Failed to execute packet.");
-      assert_always();
-      break;
-    }
-  } while (reader_.read_count());
-  reader_ = old_reader;
-}
-
 void CommandProcessor::InitializeTrace() {
   // Write the initial register values, to be loaded directly into the
@@ -19,11 +19,8 @@
 #include <string>
 #include <vector>
 
-#include "xenia/base/dma.h"
 #include "xenia/base/ring_buffer.h"
 #include "xenia/base/threading.h"
 #include "xenia/gpu/register_file.h"
 #include "xenia/gpu/registers.h"
 #include "xenia/gpu/trace_writer.h"
 #include "xenia/gpu/xenos.h"
 #include "xenia/kernel/xthread.h"
@@ -82,7 +79,6 @@ class CommandProcessor {
   CommandProcessor(GraphicsSystem* graphics_system,
                    kernel::KernelState* kernel_state);
   virtual ~CommandProcessor();
-  dma::XeDMAC* GetDMAC() const { return dmac_; }
   uint32_t counter() const { return counter_; }
   void increment_counter() { counter_++; }
@@ -135,7 +131,7 @@ class CommandProcessor {
 
   void UpdateWritePointer(uint32_t value);
 
-  void ExecutePacket(uint32_t ptr, uint32_t count);
 
   bool is_paused() const { return paused_; }
   void Pause();
@@ -172,7 +168,7 @@ class CommandProcessor {
                                uint32_t num_registers);
 
   XE_NOINLINE
-  virtual void WriteOneRegisterFromRing(
+  void WriteOneRegisterFromRing(
       uint32_t base,
       uint32_t
          num_times);  // repeatedly write a value to one register, presumably a
@@ -221,7 +217,7 @@ class CommandProcessor {
   virtual void PrepareForWait();
   virtual void ReturnFromWait();
 
-  uint32_t ExecutePrimaryBuffer(uint32_t start_index, uint32_t end_index);
 
   virtual void OnPrimaryBufferEnd() {}
 
 #include "pm4_command_processor_declare.h"
@@ -300,7 +296,6 @@ class CommandProcessor {
   reg::DC_LUT_30_COLOR gamma_ramp_256_entry_table_[256] = {};
   reg::DC_LUT_PWL_DATA gamma_ramp_pwl_rgb_[128][3] = {};
   uint32_t gamma_ramp_rw_component_ = 0;
-  dma::XeDMAC* dmac_ = nullptr;
 };
 
 }  // namespace gpu
@@ -2672,45 +2672,66 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type,
   // validity is tracked.
   const Shader::ConstantRegisterMap& constant_map_vertex =
       vertex_shader->constant_register_map();
-  for (uint32_t i = 0; i < xe::countof(constant_map_vertex.vertex_fetch_bitmap);
-       ++i) {
-    uint32_t vfetch_bits_remaining = constant_map_vertex.vertex_fetch_bitmap[i];
-    uint32_t j;
-    while (xe::bit_scan_forward(vfetch_bits_remaining, &j)) {
-      vfetch_bits_remaining &= ~(uint32_t(1) << j);
-      uint32_t vfetch_index = i * 32 + j;
-      const auto& vfetch_constant = regs.Get<xenos::xe_gpu_vertex_fetch_t>(
-          XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0 + vfetch_index * 2);
-      switch (vfetch_constant.type) {
-        case xenos::FetchConstantType::kVertex:
-          break;
-        case xenos::FetchConstantType::kInvalidVertex:
-          if (cvars::gpu_allow_invalid_fetch_constants) {
-            break;
-          }
-          XELOGW(
-              "Vertex fetch constant {} ({:08X} {:08X}) has \"invalid\" type! "
-              "This is incorrect behavior, but you can try bypassing this by "
-              "launching Xenia with --gpu_allow_invalid_fetch_constants=true.",
-              vfetch_index, vfetch_constant.dword_0, vfetch_constant.dword_1);
-          return false;
-        default:
-          XELOGW(
-              "Vertex fetch constant {} ({:08X} {:08X}) is completely invalid!",
-              vfetch_index, vfetch_constant.dword_0, vfetch_constant.dword_1);
-          return false;
-      }
-      if (!shared_memory_->RequestRange(vfetch_constant.address << 2,
-                                        vfetch_constant.size << 2)) {
-        XELOGE(
-            "Failed to request vertex buffer at 0x{:08X} (size {}) in the "
-            "shared memory",
-            vfetch_constant.address << 2, vfetch_constant.size << 2);
-        return false;
-      }
-    }
-  }
+  {
+    uint32_t vfetch_addresses[96];
+    uint32_t vfetch_sizes[96];
+    uint32_t vfetch_current_queued = 0;
+    for (uint32_t i = 0;
+         i < xe::countof(constant_map_vertex.vertex_fetch_bitmap); ++i) {
+      uint32_t vfetch_bits_remaining =
+          constant_map_vertex.vertex_fetch_bitmap[i];
+      uint32_t j;
+      while (xe::bit_scan_forward(vfetch_bits_remaining, &j)) {
+        vfetch_bits_remaining = xe::clear_lowest_bit(vfetch_bits_remaining);
+        uint32_t vfetch_index = i * 32 + j;
+        const auto& vfetch_constant = regs.Get<xenos::xe_gpu_vertex_fetch_t>(
+            XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0 + vfetch_index * 2);
+        switch (vfetch_constant.type) {
+          case xenos::FetchConstantType::kVertex:
+            break;
+          case xenos::FetchConstantType::kInvalidVertex:
+            if (cvars::gpu_allow_invalid_fetch_constants) {
+              break;
+            }
+            XELOGW(
+                "Vertex fetch constant {} ({:08X} {:08X}) has \"invalid\" "
+                "type! "
+                "This is incorrect behavior, but you can try bypassing this "
+                "by launching Xenia with "
+                "--gpu_allow_invalid_fetch_constants=true.",
+                vfetch_index, vfetch_constant.dword_0, vfetch_constant.dword_1);
+            return false;
+          default:
+            XELOGW(
+                "Vertex fetch constant {} ({:08X} {:08X}) is completely "
+                "invalid!",
+                vfetch_index, vfetch_constant.dword_0, vfetch_constant.dword_1);
+            return false;
+        }
+        vfetch_addresses[vfetch_current_queued] = vfetch_constant.address;
+        vfetch_sizes[vfetch_current_queued++] = vfetch_constant.size;
+      }
+    }
+
+    if (vfetch_current_queued) {
+      // so far, i have never seen vfetch_current_queued > 4. 1 is most
+      // common, 2 happens occasionally. did not test many games though
+      // pre-acquire the critical region so we're not repeatedly re-acquiring
+      // it in requestrange
+      auto shared_memory_request_range_hoisted =
+          global_critical_region::Acquire();
+
+      for (uint32_t i = 0; i < vfetch_current_queued; ++i) {
+        if (!shared_memory_->RequestRange(vfetch_addresses[i] << 2,
+                                          vfetch_sizes[i] << 2)) {
+          XELOGE(
+              "Failed to request vertex buffer at 0x{:08X} (size {}) in the "
+              "shared memory",
+              vfetch_addresses[i] << 2, vfetch_sizes[i] << 2);
+          return false;
+        }
+      }
+    }
+  }
 
   // Gather memexport ranges and ensure the heaps for them are resident, and
   // also load the data surrounding the export and to fill the regions that
   // won't be modified by the shaders.
|
|||
shared_memory_->InitializeTraceCompleteDownloads();
|
||||
}
|
||||
}
|
||||
static void DmaPrefunc(dma::XeDMAJob* job) {
|
||||
D3D12_RANGE readback_range;
|
||||
readback_range.Begin = 0;
|
||||
readback_range.End = job->size;
|
||||
void* readback_mapping;
|
||||
ID3D12Resource* readback_buffer = (ID3D12Resource*)job->userdata1;
|
||||
|
||||
HRESULT mapres = readback_buffer->Map(0, &readback_range, &readback_mapping);
|
||||
xenia_assert(SUCCEEDED(mapres));
|
||||
|
||||
job->source = (uint8_t*)readback_mapping;
|
||||
}
|
||||
|
||||
static void DmaPostfunc(dma::XeDMAJob* job) {
|
||||
D3D12_RANGE readback_write_range = {};
|
||||
ID3D12Resource* readback_buffer = (ID3D12Resource*)job->userdata1;
|
||||
readback_buffer->Unmap(0, &readback_write_range);
|
||||
}
|
||||
|
||||
bool D3D12CommandProcessor::IssueCopy() {
|
||||
#if XE_UI_D3D12_FINE_GRAINED_DRAW_SCOPES
|
||||
|
@@ -3102,7 +3105,6 @@ bool D3D12CommandProcessor::IssueCopy() {
   if (!BeginSubmission(true)) {
     return false;
   }
-
   if (!cvars::d3d12_readback_resolve) {
     uint32_t written_address, written_length;
     return render_target_cache_->Resolve(*memory_, *shared_memory_,
@@ -3129,34 +3131,21 @@ bool D3D12CommandProcessor::IssueCopy_ReadbackResolvePath() {
         readback_buffer, 0, shared_memory_buffer, written_address,
         written_length);
     if (AwaitAllQueueOperationsCompletion()) {
-#if 1
       D3D12_RANGE readback_range;
       readback_range.Begin = 0;
       readback_range.End = written_length;
       void* readback_mapping;
-      if (SUCCEEDED(
-              readback_buffer->Map(0, &readback_range, &readback_mapping))) {
+      if (SUCCEEDED(readback_buffer->Map(0, &readback_range,
+                                         &readback_mapping))) {
         // chrispy: this memcpy needs to be optimized as much as possible
         auto physaddr = memory_->TranslatePhysical(written_address);
-        dma::vastcpy(physaddr, (uint8_t*)readback_mapping, written_length);
+        memory::vastcpy(physaddr, (uint8_t*)readback_mapping,
+                        written_length);
         // XEDmaCpy(physaddr, readback_mapping, written_length);
         D3D12_RANGE readback_write_range = {};
         readback_buffer->Unmap(0, &readback_write_range);
       }
-
-#else
-      dma::XeDMAJob job{};
-      job.destination = memory_->TranslatePhysical(written_address);
-      job.size = written_length;
-      job.source = nullptr;
-      job.userdata1 = (void*)readback_buffer;
-      job.precall = DmaPrefunc;
-      job.postcall = DmaPostfunc;
-
-      readback_available_ = GetDMAC()->PushDMAJob(&job);
-
-#endif
     }
   }
@@ -3833,7 +3822,8 @@ XE_NOINLINE void D3D12CommandProcessor::UpdateSystemConstantValues_Impl(
     uint32_t user_clip_plane_index;
     while (xe::bit_scan_forward(user_clip_planes_remaining,
                                 &user_clip_plane_index)) {
-      user_clip_planes_remaining &= ~(UINT32_C(1) << user_clip_plane_index);
+      user_clip_planes_remaining =
+          xe::clear_lowest_bit(user_clip_planes_remaining);
       const float* user_clip_plane =
          &regs[XE_GPU_REG_PA_CL_UCP_0_X + user_clip_plane_index * 4].f32;
       if (std::memcmp(user_clip_plane_write_ptr, user_clip_plane,
@@ -3917,7 +3907,7 @@ XE_NOINLINE void D3D12CommandProcessor::UpdateSystemConstantValues_Impl(
     uint32_t textures_remaining = used_texture_mask;
     uint32_t texture_index;
     while (xe::bit_scan_forward(textures_remaining, &texture_index)) {
-      textures_remaining &= ~(uint32_t(1) << texture_index);
+      textures_remaining = xe::clear_lowest_bit(textures_remaining);
       uint32_t& texture_signs_uint =
           system_constants_.texture_swizzled_signs[texture_index >> 2];
       uint32_t texture_signs_shift = (texture_index & 3) * 8;
@@ -5116,12 +5106,6 @@ ID3D12Resource* D3D12CommandProcessor::RequestReadbackBuffer(uint32_t size) {
   if (size == 0) {
     return nullptr;
   }
-#if 1
-  if (readback_available_) {
-    GetDMAC()->WaitJobDone(readback_available_);
-    readback_available_ = 0;
-  }
-#endif
   size = xe::align(size, kReadbackBufferSizeIncrement);
   if (size > readback_buffer_size_) {
     const ui::d3d12::D3D12Provider& provider = GetD3D12Provider();
@@ -21,7 +21,6 @@
 #include <utility>
 
 #include "xenia/base/assert.h"
-#include "xenia/base/dma.h"
 #include "xenia/gpu/command_processor.h"
 #include "xenia/gpu/d3d12/d3d12_graphics_system.h"
 #include "xenia/gpu/d3d12/d3d12_primitive_processor.h"
@@ -50,8 +49,10 @@ struct MemExportRange {
   uint32_t size_dwords;
 };
 class D3D12CommandProcessor final : public CommandProcessor {
+ protected:
+#include "../pm4_command_processor_declare.h"
+
  public:
   explicit D3D12CommandProcessor(D3D12GraphicsSystem* graphics_system,
                                  kernel::KernelState* kernel_state);
   ~D3D12CommandProcessor();
@@ -232,8 +233,8 @@ class D3D12CommandProcessor final : public CommandProcessor {
                                uint32_t base,
                                uint32_t num_registers);
   XE_NOINLINE
-  virtual void WriteOneRegisterFromRing(uint32_t base,
-                                        uint32_t num_times) override;
+  void WriteOneRegisterFromRing(uint32_t base,
+                                uint32_t num_times);
 
   XE_FORCEINLINE
   void WriteALURangeFromRing(xe::RingBuffer* ring, uint32_t base,
@@ -677,7 +678,6 @@ class D3D12CommandProcessor final : public CommandProcessor {
 
   static constexpr uint32_t kReadbackBufferSizeIncrement = 16 * 1024 * 1024;
   ID3D12Resource* readback_buffer_ = nullptr;
-  dma::DMACJobHandle readback_available_ = 0;
   uint32_t readback_buffer_size_ = 0;
 
   std::atomic<bool> pix_capture_requested_ = false;
@@ -407,15 +407,14 @@ bool D3D12SharedMemory::AllocateSparseHostGpuMemoryRange(
 
 bool D3D12SharedMemory::UploadRanges(
     const std::pair<uint32_t, uint32_t>* upload_page_ranges,
-    unsigned num_upload_page_ranges) {
+    uint32_t num_upload_page_ranges) {
   if (!num_upload_page_ranges) {
     return true;
   }
   CommitUAVWritesAndTransitionBuffer(D3D12_RESOURCE_STATE_COPY_DEST);
   command_processor_.SubmitBarriers();
   auto& command_list = command_processor_.GetDeferredCommandList();
   // for (auto upload_range : upload_page_ranges) {
-  for (unsigned int i = 0; i < num_upload_page_ranges; ++i) {
+  for (uint32_t i = 0; i < num_upload_page_ranges; ++i) {
     auto& upload_range = upload_page_ranges[i];
     uint32_t upload_range_start = upload_range.first;
     uint32_t upload_range_length = upload_range.second;
@@ -437,7 +436,7 @@ bool D3D12SharedMemory::UploadRanges(
         uint32_t(upload_buffer_size), false, false);
 
     if (upload_buffer_size < (1ULL << 32) && upload_buffer_size > 8192) {
-      dma::vastcpy(
+      memory::vastcpy(
           upload_buffer_mapping,
           memory().TranslatePhysical(upload_range_start << page_size_log2()),
          static_cast<uint32_t>(upload_buffer_size));
@@ -91,8 +91,8 @@ class D3D12SharedMemory : public SharedMemory {
   bool AllocateSparseHostGpuMemoryRange(uint32_t offset_allocations,
                                         uint32_t length_allocations) override;
 
-  bool UploadRanges(const std::pair<uint32_t, uint32_t>*
-                        upload_page_ranges, unsigned num_ranges) override;
+  bool UploadRanges(const std::pair<uint32_t, uint32_t>* upload_page_ranges,
+                    uint32_t num_ranges) override;
 
  private:
   D3D12CommandProcessor& command_processor_;
@@ -491,7 +491,7 @@ void D3D12TextureCache::RequestTextures(uint32_t used_texture_mask) {
   uint32_t textures_remaining = used_texture_mask;
   uint32_t index;
   while (xe::bit_scan_forward(textures_remaining, &index)) {
-    textures_remaining &= ~(uint32_t(1) << index);
+    textures_remaining = xe::clear_lowest_bit(textures_remaining);
     const TextureBinding* binding = GetValidTextureBinding(index);
     if (!binding) {
       continue;
@@ -9,21 +9,14 @@
 
 #include "xenia/gpu/draw_util.h"
 
 #include <algorithm>
 #include <cmath>
 #include <cstring>
 
 #include "xenia/base/assert.h"
 #include "xenia/base/cvar.h"
 #include "xenia/base/logging.h"
 #include "xenia/base/math.h"
 #include "xenia/base/memory.h"
 #include "xenia/gpu/gpu_flags.h"
 #include "xenia/gpu/registers.h"
 #include "xenia/gpu/texture_cache.h"
 #include "xenia/gpu/texture_info.h"
 #include "xenia/gpu/texture_util.h"
 #include "xenia/gpu/xenos.h"
 #include "xenia/ui/graphics_util.h"
 
 // Very prominent in 545407F2.
@@ -16,7 +16,6 @@
 
 #include "xenia/base/assert.h"
 #include "xenia/gpu/register_file.h"
 #include "xenia/gpu/registers.h"
 #include "xenia/gpu/shader.h"
 #include "xenia/gpu/trace_writer.h"
 #include "xenia/gpu/xenos.h"
@@ -1,9 +1,14 @@
 
 void ExecuteIndirectBuffer(uint32_t ptr, uint32_t count) XE_RESTRICT;
 
+virtual uint32_t ExecutePrimaryBuffer(uint32_t start_index, uint32_t end_index) XE_RESTRICT;
 virtual bool ExecutePacket();
+
+public:
+void ExecutePacket(uint32_t ptr, uint32_t count);
+
+protected:
 XE_NOINLINE
 bool ExecutePacketType0(uint32_t packet) XE_RESTRICT;
 XE_NOINLINE
 bool ExecutePacketType1(uint32_t packet) XE_RESTRICT;
@@ -103,4 +108,7 @@ uint32_t GetCurrentRingReadCount();
 
 XE_NOINLINE
 XE_COLD
 bool ExecutePacketType3_CountOverflow(uint32_t count);
+XE_NOINLINE
+XE_COLD
+bool ExecutePacketType0_CountOverflow(uint32_t count);
@@ -75,33 +75,45 @@ bool COMMAND_PROCESSOR::ExecutePacket() {
   }
 }
+XE_NOINLINE
+XE_COLD
+bool COMMAND_PROCESSOR::ExecutePacketType0_CountOverflow(uint32_t count) {
+  XELOGE("ExecutePacketType0 overflow (read count {:08X}, packet count {:08X})",
+         COMMAND_PROCESSOR::GetCurrentRingReadCount(),
+         count * sizeof(uint32_t));
+  return false;
+}
+/*
+  Todo: optimize this function; this one, along with ExecutePacketType3, is
+  among the most frequently called functions for PM4.
+*/
 XE_NOINLINE
 bool COMMAND_PROCESSOR::ExecutePacketType0(uint32_t packet) XE_RESTRICT {
   // Type-0 packet.
   // Write count registers in sequence to the registers starting at
   // (base_index << 2).
 
   uint32_t count = ((packet >> 16) & 0x3FFF) + 1;
-  if (COMMAND_PROCESSOR::GetCurrentRingReadCount() < count * sizeof(uint32_t)) {
-    XELOGE(
-        "ExecutePacketType0 overflow (read count {:08X}, packet count {:08X})",
-        COMMAND_PROCESSOR::GetCurrentRingReadCount(), count * sizeof(uint32_t));
-    return false;
-  }
-
-  trace_writer_.WritePacketStart(uint32_t(reader_.read_ptr() - 4), 1 + count);
-
-  uint32_t base_index = (packet & 0x7FFF);
-  uint32_t write_one_reg = (packet >> 15) & 0x1;
+  if (COMMAND_PROCESSOR::GetCurrentRingReadCount() >=
+      count * sizeof(uint32_t)) {
+    trace_writer_.WritePacketStart(uint32_t(reader_.read_ptr() - 4), 1 + count);
 
-  if (!write_one_reg) {
-    COMMAND_PROCESSOR::WriteRegisterRangeFromRing(&reader_, base_index, count);
+    uint32_t base_index = (packet & 0x7FFF);
+    uint32_t write_one_reg = (packet >> 15) & 0x1;
+
+    if (!write_one_reg) {
+      COMMAND_PROCESSOR::WriteRegisterRangeFromRing(&reader_, base_index,
+                                                    count);
+    } else {
+      COMMAND_PROCESSOR::WriteOneRegisterFromRing(base_index, count);
+    }
+
+    trace_writer_.WritePacketEnd();
+    return true;
   } else {
-    COMMAND_PROCESSOR::WriteOneRegisterFromRing(base_index, count);
+    return COMMAND_PROCESSOR::ExecutePacketType0_CountOverflow(count);
   }
-
-  trace_writer_.WritePacketEnd();
-  return true;
 }
 XE_NOINLINE
 bool COMMAND_PROCESSOR::ExecutePacketType1(uint32_t packet) XE_RESTRICT {
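The shape of that refactor, as a minimal hedged sketch (helper names illustrative): the rare failure report moves into a separate XE_NOINLINE/XE_COLD function, so the hot packet path compiles down to a compare plus a rarely taken forwarding call.

XE_NOINLINE XE_COLD static bool ReportOverflow(uint32_t count) {
  // logging lives out of line; the hot loop never inlines this
  return false;
}
static bool ExecuteTypeN(uint32_t packet, uint32_t readable_bytes) {
  uint32_t count = ((packet >> 16) & 0x3FFF) + 1;
  if (readable_bytes >= count * sizeof(uint32_t)) {
    // ... hot path: write registers, trace, return true ...
    return true;
  }
  return ReportOverflow(count);  // cold, out of line
}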
@@ -430,6 +442,11 @@ bool COMMAND_PROCESSOR::ExecutePacketType3_INDIRECT_BUFFER(
 
 XE_NOINLINE
 static bool MatchValueAndRef(uint32_t value, uint32_t ref, uint32_t wait_info) {
+  /*
+    Todo: should subtract the values from each other twice, with the sides
+    inverted the second time, and then create a mask from the sign bits; then
+    use the wait_info value to select the bits that correctly implement the
+    condition. If neither subtraction has the sign bit set, the values are equal.
+  */
   bool matched = false;
   switch (wait_info & 0x7) {
     case 0x0:  // Never.
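A hedged sketch of what that Todo describes (not the committed code): two widened subtractions yield "less" and "greater" sign bits, equality is "neither set", and a small table remaps the condition codes handled by the switch above (never, <, <=, ==, !=, >=, >, always) onto a relation mask.

static bool MatchValueAndRefBranchless(uint32_t value, uint32_t ref,
                                       uint32_t wait_info) {
  // 64-bit subtraction keeps the sign bit meaningful for unsigned 32-bit
  // inputs, sidestepping wraparound.
  uint64_t lt = (uint64_t(value) - uint64_t(ref)) >> 63;  // 1 iff value < ref
  uint64_t gt = (uint64_t(ref) - uint64_t(value)) >> 63;  // 1 iff value > ref
  uint32_t relation = uint32_t(lt | ((1 ^ (lt | gt)) << 1) | (gt << 2));
  // Condition code -> {lt=1, eq=2, gt=4} mask, in the switch's order:
  // never, <, <=, ==, !=, >=, >, always.
  static constexpr uint8_t kMasks[8] = {0, 1, 3, 2, 5, 6, 4, 7};
  return (kMasks[wait_info & 0x7] & relation) != 0;
}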
@@ -1058,6 +1075,10 @@ bool COMMAND_PROCESSOR::ExecutePacketType3_IM_LOAD_IMMEDIATE(
   return true;
 }
 
+/*
+  todo: shouldn't this do something?
+*/
+
 bool COMMAND_PROCESSOR::ExecutePacketType3_INVALIDATE_STATE(
     uint32_t packet, uint32_t count) XE_RESTRICT {
   // selective invalidation of state pointers
@ -1099,3 +1120,76 @@ bool COMMAND_PROCESSOR::ExecutePacketType3_VIZ_QUERY(
|
|||
|
||||
  return true;
}

uint32_t COMMAND_PROCESSOR::ExecutePrimaryBuffer(uint32_t read_index,
                                                 uint32_t write_index) {
  SCOPE_profile_cpu_f("gpu");
#if XE_ENABLE_TRACE_WRITER_INSTRUMENTATION == 1
  // If we have a pending trace stream, open it now. That way we ensure we get
  // all commands.
  if (!trace_writer_.is_open() && trace_state_ == TraceState::kStreaming) {
    uint32_t title_id = kernel_state_->GetExecutableModule()
                            ? kernel_state_->GetExecutableModule()->title_id()
                            : 0;
    auto file_name = fmt::format("{:08X}_stream.xtr", title_id);
    auto path = trace_stream_path_ / file_name;
    trace_writer_.Open(path, title_id);
    InitializeTrace();
  }
#endif
  // Adjust pointer base.
  uint32_t start_ptr = primary_buffer_ptr_ + read_index * sizeof(uint32_t);
  start_ptr = (primary_buffer_ptr_ & ~0x1FFFFFFF) | (start_ptr & 0x1FFFFFFF);
  uint32_t end_ptr = primary_buffer_ptr_ + write_index * sizeof(uint32_t);
  end_ptr = (primary_buffer_ptr_ & ~0x1FFFFFFF) | (end_ptr & 0x1FFFFFFF);

  trace_writer_.WritePrimaryBufferStart(start_ptr, write_index - read_index);

  // Execute commands!
  RingBuffer old_reader = reader_;
  new (&reader_) RingBuffer(memory_->TranslatePhysical(primary_buffer_ptr_),
                            primary_buffer_size_);
  reader_.set_read_offset(read_index * sizeof(uint32_t));
  reader_.set_write_offset(write_index * sizeof(uint32_t));
  // Prefetch the wraparound range; it is likely already in L3 cache, but on a
  // Zen system it may be in another chiplet's L3.
  reader_.BeginPrefetchedRead<swcache::PrefetchTag::Level2>(
      GetCurrentRingReadCount());
  do {
    if (!COMMAND_PROCESSOR::ExecutePacket()) {
      // This probably should be fatal - but we're going to continue anyways.
      XELOGE("**** PRIMARY RINGBUFFER: Failed to execute packet.");
      assert_always();
      break;
    }
  } while (reader_.read_count());

  COMMAND_PROCESSOR::OnPrimaryBufferEnd();

  trace_writer_.WritePrimaryBufferEnd();

  reader_ = old_reader;
  return write_index;
}
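The "Adjust pointer base" step above pins the high bits of the computed start/end pointers to the ring base so the low 29 bits (a 512 MB physical window) wrap instead of carrying upward. A worked example with made-up values:

uint32_t primary_buffer_ptr = 0x3FF00000;  // hypothetical ring base
uint32_t read_index = 0x10;
uint32_t start_ptr = primary_buffer_ptr + read_index * sizeof(uint32_t);
// start_ptr == 0x3FF00040
start_ptr = (primary_buffer_ptr & ~0x1FFFFFFF) | (start_ptr & 0x1FFFFFFF);
// == 0x20000000 | 0x1FF00040 == 0x3FF00040: the high bits stay pinned to the
// ring base while the low 29 bits advance (and would wrap) with the index.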

void COMMAND_PROCESSOR::ExecutePacket(uint32_t ptr, uint32_t count) {
  // Execute commands!
  RingBuffer old_reader = reader_;

  new (&reader_)
      RingBuffer{memory_->TranslatePhysical(ptr), count * sizeof(uint32_t)};

  reader_.set_write_offset(count * sizeof(uint32_t));

  do {
    if (!COMMAND_PROCESSOR::ExecutePacket()) {
      XELOGE("**** ExecutePacket: Failed to execute packet.");
      assert_always();
      break;
    }
  } while (reader_.read_count());
  reader_ = old_reader;
}

@@ -11,7 +11,6 @@
#define XENIA_GPU_SAMPLER_INFO_H_

#include "xenia/gpu/shader.h"
#include "xenia/gpu/xenos.h"

namespace xe {
namespace gpu {

@@ -24,7 +24,6 @@
#include "xenia/base/string_buffer.h"
#include "xenia/gpu/registers.h"
#include "xenia/gpu/ucode.h"
#include "xenia/gpu/xenos.h"

namespace xe {
namespace gpu {

@@ -13,13 +13,6 @@
#include <cmath>
#include <cstring>

#include "xenia/base/assert.h"
#include "xenia/base/byte_order.h"
#include "xenia/base/math.h"
#include "xenia/gpu/registers.h"
#include "xenia/gpu/trace_writer.h"
#include "xenia/gpu/xenos.h"

namespace xe {
namespace gpu {

@@ -14,12 +14,9 @@
#include <cstddef>
#include <cstdint>

#include "xenia/base/assert.h"
#include "xenia/gpu/register_file.h"
#include "xenia/gpu/shader.h"
#include "xenia/gpu/trace_writer.h"
#include "xenia/gpu/ucode.h"
#include "xenia/gpu/xenos.h"
#include "xenia/memory.h"

namespace xe {

@@ -9,15 +9,9 @@

#include "xenia/gpu/shader_translator.h"

#include <algorithm>
#include <cstdarg>
#include <cstring>
#include <set>
#include <string>

#include "xenia/base/assert.h"
#include "xenia/base/logging.h"
#include "xenia/base/math.h"
#include "xenia/gpu/gpu_flags.h"

namespace xe {

@@ -11,16 +11,7 @@
#define XENIA_GPU_SHADER_TRANSLATOR_H_

#include <memory>
#include <set>
#include <string>
#include <vector>

#include "xenia/base/math.h"
#include "xenia/base/string_buffer.h"
#include "xenia/gpu/registers.h"
#include "xenia/gpu/shader.h"
#include "xenia/gpu/ucode.h"
#include "xenia/gpu/xenos.h"

namespace xe {
namespace gpu {

@@ -10,15 +10,11 @@
#include "xenia/gpu/shared_memory.h"

#include <algorithm>
#include <utility>

#include "xenia/base/assert.h"
#include "xenia/base/bit_range.h"
#include "xenia/base/logging.h"
#include "xenia/base/math.h"
#include "xenia/base/memory.h"
#include "xenia/base/profiling.h"
#include "xenia/memory.h"

namespace xe {
namespace gpu {

@@ -10,12 +10,6 @@
#ifndef XENIA_GPU_SHARED_MEMORY_H_
#define XENIA_GPU_SHARED_MEMORY_H_

#include <cstdint>
#include <mutex>
#include <utility>
#include <vector>

#include "xenia/base/mutex.h"
#include "xenia/memory.h"

namespace xe {
@@ -141,7 +135,7 @@ class SharedMemory {
  // overall bounds of pages to be uploaded.
  virtual bool UploadRanges(
      const std::pair<uint32_t, uint32_t>* upload_page_ranges,
      unsigned num_upload_ranges) = 0;
      uint32_t num_upload_ranges) = 0;

  const std::vector<std::pair<uint32_t, uint32_t>>& trace_download_ranges() {
    return trace_download_ranges_;

@@ -183,14 +177,10 @@ class SharedMemory {
  FixedVMemVector<MAX_UPLOAD_RANGES * sizeof(std::pair<uint32_t, uint32_t>)>
      upload_ranges_;

  // GPU-written memory downloading for traces. <Start address, length>.
  std::vector<std::pair<uint32_t, uint32_t>> trace_download_ranges_;
  uint32_t trace_download_page_count_ = 0;

  // Mutex between the guest memory subsystem and the command processor, to be
  // locked when checking or updating validity of pages/ranges and when firing
  // watches.
  xe::global_critical_region global_critical_region_;
  static constexpr xe::global_critical_region global_critical_region_{};

  // ***************************************************************************
  // Things below should be fully protected by global_critical_region.

@@ -266,6 +256,11 @@ class SharedMemory {
  uint32_t watch_node_current_pool_allocated_ = 0;
  WatchRange* watch_range_first_free_ = nullptr;
  WatchNode* watch_node_first_free_ = nullptr;

  // GPU-written memory downloading for traces. <Start address, length>.
  std::vector<std::pair<uint32_t, uint32_t>> trace_download_ranges_;
  uint32_t trace_download_page_count_ = 0;

  // Triggers the watches (global and per-range), removing triggered range
  // watches.
  void FireWatches(uint32_t page_first, uint32_t page_last,
@@ -9,21 +9,11 @@

#include "xenia/gpu/texture_cache.h"

#include <algorithm>
#include <cstdint>
#include <utility>

#include "xenia/base/assert.h"
#include "xenia/base/clock.h"
#include "xenia/base/cvar.h"
#include "xenia/base/logging.h"
#include "xenia/base/math.h"
#include "xenia/base/profiling.h"
#include "xenia/gpu/gpu_flags.h"
#include "xenia/gpu/register_file.h"
#include "xenia/gpu/texture_info.h"
#include "xenia/gpu/texture_util.h"
#include "xenia/gpu/xenos.h"

DEFINE_int32(
    draw_resolution_scale_x, 1,
@@ -332,9 +322,13 @@ void TextureCache::RequestTextures(uint32_t used_texture_mask) {
  uint32_t bindings_changed = 0;
  uint32_t textures_remaining = used_texture_mask & ~texture_bindings_in_sync_;
  uint32_t index = 0;

  // The mask has at most 32 bits set, and each slot may bind both an unsigned
  // and a signed texture, so the array never needs more than 64 entries.
  Texture* textures_to_load[64];
  uint32_t num_textures_to_load = 0;
  while (xe::bit_scan_forward(textures_remaining, &index)) {
    uint32_t index_bit = UINT32_C(1) << index;
    textures_remaining &= ~index_bit;
    textures_remaining = xe::clear_lowest_bit(textures_remaining);
    TextureBinding& binding = texture_bindings_[index];
    const auto& fetch = regs.Get<xenos::xe_gpu_texture_fetch_t>(
        XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0 + index * 6);

@@ -406,12 +400,15 @@ void TextureCache::RequestTextures(uint32_t used_texture_mask) {
      binding.texture_signed = nullptr;
    }
    if (load_unsigned_data && binding.texture != nullptr) {
      LoadTextureData(*binding.texture);
      textures_to_load[num_textures_to_load++] = binding.texture;
    }
    if (load_signed_data && binding.texture_signed != nullptr) {
      LoadTextureData(*binding.texture_signed);
      textures_to_load[num_textures_to_load++] = binding.texture_signed;
    }
  }

  LoadTexturesData(textures_to_load, num_textures_to_load);

  if (bindings_changed) {
    UpdateTextureBindingsImpl(bindings_changed);
  }
@@ -643,7 +640,134 @@ TextureCache::Texture* TextureCache::FindOrCreateTexture(TextureKey key) {
  texture->LogAction("Created");
  return texture;
}

void TextureCache::LoadTexturesData(Texture** textures, uint32_t n_textures) {
  assert_true(n_textures <= 64);
  if (n_textures < 2) {
    if (!n_textures) {
      return;
    } else {
      LoadTextureData(*textures[0]);
      return;
    }
  }

  uint64_t index_base_outdated = 0;
  uint64_t index_mips_outdated = 0;
  uint32_t nkept = 0;
  {
    auto global_lock = global_critical_region_.Acquire();
    for (uint32_t i = 0; i < n_textures; ++i) {
      Texture* current = textures[i];

      auto base_outdated = current->base_outdated(global_lock);
      auto mips_outdated = current->mips_outdated(global_lock);

      index_base_outdated |= static_cast<uint64_t>(base_outdated) << i;
      index_mips_outdated |= static_cast<uint64_t>(mips_outdated) << i;
      if (!base_outdated && !mips_outdated) {
        textures[i] = nullptr;
      } else {
        nkept++;
      }
    }
  }

  if (nkept == 0) {
    return;
  }

  for (uint32_t i = 0; i < n_textures; ++i) {
    Texture* p_texture = textures[i];
    if (!p_texture) {
      continue;
    }
    textures[i] = nullptr;
    Texture& texture = *p_texture;

    TextureKey texture_key = texture.key();
    // The implementation may load multiple blocks at once via accesses of up
    // to 128 bits (R32G32B32A32_UINT), so the size is aligned to this value to
    // make sure that if the texture is small (especially if it's linear), the
    // last blocks won't be cut off (hosts may return 0, 0, 0, 0 for the whole
    // R32G32B32A32_UINT access for the non-16-aligned tail even if 1...15
    // bytes are actually provided for it).

    // Request uploading of the texture data to the shared memory.
    // This is also necessary when resolution scaling is used - the texture
    // cache relies on shared memory for invalidation of both unscaled and
    // scaled textures. Plus a texture may be unscaled partially, when only a
    // portion of its pages is invalidated, in this case we'll need the texture
    // from the shared memory to load the unscaled parts.
    // TODO(Triang3l): Load unscaled parts.
    bool base_resolved = texture.GetBaseResolved();
    if (index_base_outdated & (1ULL << i)) {
      if (!shared_memory().RequestRange(
              texture_key.base_page << 12,
              xe::align(texture.GetGuestBaseSize(), UINT32_C(16)),
              texture_key.scaled_resolve ? nullptr : &base_resolved)) {
        continue;
      }
    }
    bool mips_resolved = texture.GetMipsResolved();
    if (index_mips_outdated & (1ULL << i)) {
      if (!shared_memory().RequestRange(
              texture_key.mip_page << 12,
              xe::align(texture.GetGuestMipsSize(), UINT32_C(16)),
              texture_key.scaled_resolve ? nullptr : &mips_resolved)) {
        continue;
      }
    }
    if (texture_key.scaled_resolve) {
      // Make sure all the scaled resolve memory is resident and accessible
      // from the shader, including any possible padding that hasn't yet been
      // touched by an actual resolve, but is still included in the texture
      // size, so the GPU won't be trying to access unmapped memory.
      if (!EnsureScaledResolveMemoryCommitted(texture_key.base_page << 12,
                                              texture.GetGuestBaseSize(), 4)) {
        continue;
      }
      if (!EnsureScaledResolveMemoryCommitted(texture_key.mip_page << 12,
                                              texture.GetGuestMipsSize(), 4)) {
        continue;
      }
    }

    // Actually load the texture data.
    if (!LoadTextureDataFromResidentMemoryImpl(
            texture, (index_base_outdated & (1ULL << i)) != 0,
            (index_mips_outdated & (1ULL << i)) != 0)) {
      continue;
    }

    // Update the source of the texture (resolve vs. CPU or memexport) for
    // purposes of handling piecewise gamma emulation via sRGB and for
    // resolution scale in sampling offsets.
    if (!texture_key.scaled_resolve) {
      texture.SetBaseResolved(base_resolved);
      texture.SetMipsResolved(mips_resolved);
    }
    // Re-queue the texture for MakeUpToDateAndWatch below.
    textures[i] = &texture;
  }
  {
    auto crit = global_critical_region_.Acquire();

    for (uint32_t i = 0; i < n_textures; ++i) {
      auto texture = textures[i];
      if (!texture) {
        continue;
      }
      // Mark the ranges as uploaded and watch them. This is needed for scaled
      // resolves as well to detect when the CPU wants to reuse the memory for
      // a regular texture or a vertex buffer, and thus the scaled resolve
      // version is not up to date anymore.
      texture->MakeUpToDateAndWatch(crit);

      texture->LogAction("Loaded");
    }
  }
}
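LoadTexturesData above splits the batch into three phases so the global critical region is held only for the cheap outdated-check and the final MakeUpToDateAndWatch, not for the uploads in between. A minimal generic sketch of that locking pattern (the types and names here are illustrative, not the emulator's API):

#include <cstdint>
#include <mutex>

// Hypothetical item standing in for TextureCache::Texture.
struct Item {
  bool dirty = true;
  bool NeedsWork() const { return dirty; }  // cheap check (under the lock)
  bool DoExpensiveWork() { return true; }   // slow part (lock released)
  void Publish() { dirty = false; }         // mark up to date (under the lock)
};

void ProcessBatch(Item** items, uint32_t n, std::recursive_mutex& mutex) {
  {  // Phase 1: classify under the lock, drop items with nothing to do.
    std::lock_guard<std::recursive_mutex> lock(mutex);
    for (uint32_t i = 0; i < n; ++i) {
      if (items[i] && !items[i]->NeedsWork()) items[i] = nullptr;
    }
  }
  // Phase 2: expensive work with the lock released.
  for (uint32_t i = 0; i < n; ++i) {
    if (items[i] && !items[i]->DoExpensiveWork()) items[i] = nullptr;
  }
  {  // Phase 3: publish results under the lock again.
    std::lock_guard<std::recursive_mutex> lock(mutex);
    for (uint32_t i = 0; i < n; ++i) {
      if (items[i]) items[i]->Publish();
    }
  }
}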

bool TextureCache::LoadTextureData(Texture& texture) {
  // Check what needs to be uploaded.
  bool base_outdated, mips_outdated;

@@ -559,6 +559,7 @@ class TextureCache {
    return load_shader_info_[load_shader_index];
  }
  bool LoadTextureData(Texture& texture);
  void LoadTexturesData(Texture** textures, uint32_t n_textures);
  // Writes the texture data (for base, mips or both - but not neither) from the
  // shared memory or the scaled resolve memory. The shared memory management is
  // done outside this function, the implementation just needs to load the data
@@ -8,14 +8,7 @@
 */

#include "xenia/gpu/texture_info.h"

#include <algorithm>
#include <cmath>
#include <cstring>

#include "xenia/base/logging.h"
#include "xenia/base/math.h"
#include "xenia/base/memory.h"
#include "xenia/base/xxhash.h"

namespace xe {

@@ -13,7 +13,6 @@
#include <array>
#include <cstring>
#include <memory>
#include "xenia/base/assert.h"
#include "xenia/gpu/xenos.h"

namespace xe {

@@ -8,13 +8,6 @@
 */

#include "xenia/gpu/texture_util.h"

#include <algorithm>
#include <cstring>

#include "xenia/base/assert.h"
#include "xenia/base/math.h"

namespace xe {
namespace gpu {
namespace texture_util {

@@ -10,11 +10,6 @@
#ifndef XENIA_GPU_UCODE_H_
#define XENIA_GPU_UCODE_H_

#include <cstdint>

#include "xenia/base/assert.h"
#include "xenia/base/math.h"
#include "xenia/base/platform.h"
#include "xenia/gpu/xenos.h"

// The XNA Game Studio 3.1 contains Graphics.ShaderCompiler.AssembleFromSource,

@@ -46,6 +46,9 @@ namespace gpu {
namespace vulkan {

class VulkanCommandProcessor final : public CommandProcessor {
 protected:
#include "../pm4_command_processor_declare.h"

 public:
  // Single-descriptor layouts for use within a single frame.
  enum class SingleTransientDescriptorLayout {

@@ -53,7 +56,6 @@ class VulkanCommandProcessor final : public CommandProcessor {
    kStorageBufferCompute,
    kCount,
  };
#include "../pm4_command_processor_declare.h"

  class ScratchBufferAcquisition {
   public:

@@ -377,7 +377,7 @@ bool VulkanSharedMemory::AllocateSparseHostGpuMemoryRange(

bool VulkanSharedMemory::UploadRanges(
    const std::pair<uint32_t, uint32_t>* upload_page_ranges,
    unsigned num_upload_ranges) {
    uint32_t num_upload_ranges) {
  if (!num_upload_ranges) {
    return true;
  }

@@ -62,8 +62,8 @@ class VulkanSharedMemory : public SharedMemory {
  bool AllocateSparseHostGpuMemoryRange(uint32_t offset_allocations,
                                        uint32_t length_allocations) override;

  bool UploadRanges(const std::pair<uint32_t, uint32_t>* upload_page_ranges,
                    unsigned num_ranges) override;
  bool UploadRanges(const std::pair<uint32_t, uint32_t>* upload_page_ranges,
                    uint32_t num_ranges) override;

 private:
  void GetUsageMasks(Usage usage, VkPipelineStageFlags& stage_mask,

@@ -9,10 +9,6 @@

#include "xenia/gpu/xenos.h"

#include <cmath>

#include "xenia/base/math.h"

namespace xe {
namespace gpu {
namespace xenos {

@@ -10,13 +10,10 @@
#ifndef XENIA_GPU_XENOS_H_
#define XENIA_GPU_XENOS_H_

#include <algorithm>

#include "xenia/base/assert.h"
#include "xenia/base/byte_order.h"
#include "xenia/base/math.h"
#include "xenia/base/memory.h"
#include "xenia/base/platform.h"
#include "xenia/base/math.h"

namespace xe {
namespace gpu {
@@ -264,8 +264,9 @@ ObjectTable::ObjectTableEntry* ObjectTable::LookupTable(X_HANDLE handle) {

// Generic lookup
template <>
object_ref<XObject> ObjectTable::LookupObject<XObject>(X_HANDLE handle) {
  auto object = ObjectTable::LookupObject(handle, false);
object_ref<XObject> ObjectTable::LookupObject<XObject>(
    X_HANDLE handle, bool already_locked) {
  auto object = ObjectTable::LookupObject(handle, already_locked);
  auto result = object_ref<XObject>(reinterpret_cast<XObject*>(object));
  return result;
}

@@ -46,10 +46,9 @@ class ObjectTable {
  // Restores a XObject reference with a handle. Mainly for internal use - do
  // not use.
  X_STATUS RestoreHandle(X_HANDLE handle, XObject* object);

  template <typename T>
  object_ref<T> LookupObject(X_HANDLE handle) {
    auto object = LookupObject(handle, false);
  object_ref<T> LookupObject(X_HANDLE handle, bool already_locked = false) {
    auto object = LookupObject(handle, already_locked);
    if (T::kObjectType == XObject::Type::Socket) {
      object = LookupObject((handle | 0xF8000000), false);
    }

@@ -110,7 +109,8 @@ class ObjectTable {

// Generic lookup
template <>
object_ref<XObject> ObjectTable::LookupObject<XObject>(X_HANDLE handle);
object_ref<XObject> ObjectTable::LookupObject<XObject>(
    X_HANDLE handle, bool already_locked);

}  // namespace util
}  // namespace kernel
@@ -54,7 +54,15 @@ DECLARE_XBOXKRNL_EXPORT1(XAudioEnableDucker, kAudio, kStub);

dword_result_t XAudioRegisterRenderDriverClient_entry(lpdword_t callback_ptr,
                                                      lpdword_t driver_ptr) {
  if (!callback_ptr) {
    return X_E_INVALIDARG;
  }

  uint32_t callback = callback_ptr[0];

  if (!callback) {
    return X_E_INVALIDARG;
  }
  uint32_t callback_arg = callback_ptr[1];

  auto audio_system = kernel_state()->emulator()->audio_system();
@@ -515,7 +515,26 @@ dword_result_t NtPulseEvent_entry(dword_t handle,
}
DECLARE_XBOXKRNL_EXPORT2(NtPulseEvent, kThreading, kImplemented,
                         kHighFrequency);
dword_result_t NtQueryEvent_entry(dword_t handle, lpdword_t out_struc) {
  X_STATUS result = X_STATUS_SUCCESS;

  auto ev = kernel_state()->object_table()->LookupObject<XEvent>(handle);
  if (ev) {
    uint32_t type_tmp, state_tmp;

    ev->Query(&type_tmp, &state_tmp);

    out_struc[0] = type_tmp;
    out_struc[1] = state_tmp;
  } else {
    result = X_STATUS_INVALID_HANDLE;
  }

  return result;
}
DECLARE_XBOXKRNL_EXPORT2(NtQueryEvent, kThreading, kImplemented,
                         kHighFrequency);
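The two dwords NtQueryEvent_entry stores through out_struc mirror NT's EVENT_BASIC_INFORMATION; a guest-side sketch of that layout (the struct name and field comments are assumptions, not taken from this commit):

struct X_EVENT_BASIC_INFORMATION {
  xe::be<uint32_t> event_type;   // 0 = NotificationEvent, 1 = SynchronizationEvent
  xe::be<uint32_t> event_state;  // nonzero while the event is signaled
};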
uint32_t xeNtClearEvent(uint32_t handle) {
  X_STATUS result = X_STATUS_SUCCESS;
@@ -832,23 +851,25 @@ dword_result_t KeWaitForMultipleObjects_entry(
    lpqword_t timeout_ptr, lpvoid_t wait_block_array_ptr) {
  assert_true(wait_type <= 1);

  std::vector<object_ref<XObject>> objects;
  for (uint32_t n = 0; n < count; n++) {
    auto object_ptr = kernel_memory()->TranslateVirtual(objects_ptr[n]);
    auto object_ref =
        XObject::GetNativeObject<XObject>(kernel_state(), object_ptr);
    if (!object_ref) {
      return X_STATUS_INVALID_PARAMETER;
  assert_true(count <= 64);
  object_ref<XObject> objects[64];
  {
    auto crit = global_critical_region::AcquireDirect();
    for (uint32_t n = 0; n < count; n++) {
      auto object_ptr = kernel_memory()->TranslateVirtual(objects_ptr[n]);
      auto object_ref = XObject::GetNativeObject<XObject>(kernel_state(),
                                                          object_ptr, -1, true);
      if (!object_ref) {
        return X_STATUS_INVALID_PARAMETER;
      }

      objects[n] = std::move(object_ref);
    }

    objects.push_back(std::move(object_ref));
  }

  uint64_t timeout = timeout_ptr ? static_cast<uint64_t>(*timeout_ptr) : 0u;
  return XObject::WaitMultiple(uint32_t(objects.size()),
                               reinterpret_cast<XObject**>(objects.data()),
                               wait_type, wait_reason, processor_mode,
                               alertable, timeout_ptr ? &timeout : nullptr);
  return XObject::WaitMultiple(
      uint32_t(count), reinterpret_cast<XObject**>(&objects[0]), wait_type,
      wait_reason, processor_mode, alertable, timeout_ptr ? &timeout : nullptr);
}
DECLARE_XBOXKRNL_EXPORT3(KeWaitForMultipleObjects, kThreading, kImplemented,
                         kBlocking, kHighFrequency);
@@ -859,19 +880,32 @@ uint32_t xeNtWaitForMultipleObjectsEx(uint32_t count, xe::be<uint32_t>* handles,
                                      uint64_t* timeout_ptr) {
  assert_true(wait_type <= 1);

  std::vector<object_ref<XObject>> objects;
  for (uint32_t n = 0; n < count; n++) {
    uint32_t object_handle = handles[n];
    auto object =
        kernel_state()->object_table()->LookupObject<XObject>(object_handle);
    if (!object) {
      return X_STATUS_INVALID_PARAMETER;
  assert_true(count <= 64);
  object_ref<XObject> objects[64];

  /*
    Originally reserved the vector to squash the constant reallocations: in a
    benchmark of one particular game over a period of five minutes, roughly
    11% of CPU time was spent inside a helper for Windows' heap allocation
    function, and 7% of that traced back to here.

    Edit: switched to a fixed-size array instead, as there can never be more
    than 64 events specified.
  */
  {
    auto crit = global_critical_region::AcquireDirect();
    for (uint32_t n = 0; n < count; n++) {
      uint32_t object_handle = handles[n];
      auto object = kernel_state()->object_table()->LookupObject<XObject>(
          object_handle, true);
      if (!object) {
        return X_STATUS_INVALID_PARAMETER;
      }
      objects[n] = std::move(object);
    }
    objects.push_back(std::move(object));
  }

  return XObject::WaitMultiple(count,
                               reinterpret_cast<XObject**>(objects.data()),
  return XObject::WaitMultiple(count, reinterpret_cast<XObject**>(&objects[0]),
                               wait_type, 6, wait_mode, alertable, timeout_ptr);
}
@@ -879,6 +913,9 @@ dword_result_t NtWaitForMultipleObjectsEx_entry(
    dword_t count, lpdword_t handles, dword_t wait_type, dword_t wait_mode,
    dword_t alertable, lpqword_t timeout_ptr) {
  uint64_t timeout = timeout_ptr ? static_cast<uint64_t>(*timeout_ptr) : 0u;
  if (!count || count > 64 || (wait_type && wait_type != 1)) {
    return X_STATUS_INVALID_PARAMETER;
  }
  return xeNtWaitForMultipleObjectsEx(count, handles, wait_type, wait_mode,
                                      alertable,
                                      timeout_ptr ? &timeout : nullptr);
@@ -892,11 +929,14 @@ dword_result_t NtSignalAndWaitForSingleObjectEx_entry(dword_t signal_handle,
                                                      dword_t r6,
                                                      lpqword_t timeout_ptr) {
  X_STATUS result = X_STATUS_SUCCESS;
  // Pre-lock for these two handle lookups.
  global_critical_region::mutex().lock();

  auto signal_object =
      kernel_state()->object_table()->LookupObject<XObject>(signal_handle);
  auto signal_object = kernel_state()->object_table()->LookupObject<XObject>(
      signal_handle, true);
  auto wait_object =
      kernel_state()->object_table()->LookupObject<XObject>(wait_handle);
      kernel_state()->object_table()->LookupObject<XObject>(wait_handle, true);
  global_critical_region::mutex().unlock();
  if (signal_object && wait_object) {
    uint64_t timeout = timeout_ptr ? static_cast<uint64_t>(*timeout_ptr) : 0u;
    result =
@@ -71,7 +71,12 @@ int32_t XEvent::Reset() {
  event_->Reset();
  return 1;
}
void XEvent::Query(uint32_t* out_type, uint32_t* out_state) {
  auto [type, state] = event_->Query();

  *out_type = type;
  *out_state = state;
}
void XEvent::Clear() { event_->Reset(); }

bool XEvent::Save(ByteStream* stream) {

@@ -36,6 +36,7 @@ class XEvent : public XObject {
  int32_t Set(uint32_t priority_increment, bool wait);
  int32_t Pulse(uint32_t priority_increment, bool wait);
  int32_t Reset();
  void Query(uint32_t* out_type, uint32_t* out_state);
  void Clear();

  bool Save(ByteStream* stream) override;
@@ -255,7 +255,8 @@ X_STATUS XObject::WaitMultiple(uint32_t count, XObject** objects,
                               uint32_t wait_type, uint32_t wait_reason,
                               uint32_t processor_mode, uint32_t alertable,
                               uint64_t* opt_timeout) {
  std::vector<xe::threading::WaitHandle*> wait_handles(count);
  xe::threading::WaitHandle* wait_handles[64];

  for (size_t i = 0; i < count; ++i) {
    wait_handles[i] = objects[i]->GetWaitHandle();
    assert_not_null(wait_handles[i]);

@@ -267,7 +268,7 @@ X_STATUS XObject::WaitMultiple(uint32_t count, XObject** objects,
                             : std::chrono::milliseconds::max();

  if (wait_type) {
    auto result = xe::threading::WaitAny(std::move(wait_handles),
    auto result = xe::threading::WaitAny(wait_handles, count,
                                         alertable ? true : false, timeout_ms);
    switch (result.first) {
      case xe::threading::WaitResult::kSuccess:

@@ -287,7 +288,7 @@ X_STATUS XObject::WaitMultiple(uint32_t count, XObject** objects,
        return X_STATUS_UNSUCCESSFUL;
    }
  } else {
    auto result = xe::threading::WaitAll(std::move(wait_handles),
    auto result = xe::threading::WaitAll(wait_handles, count,
                                         alertable ? true : false, timeout_ms);
    switch (result) {
      case xe::threading::WaitResult::kSuccess:

@@ -360,8 +361,8 @@ void XObject::SetNativePointer(uint32_t native_ptr, bool uninitialized) {
}

object_ref<XObject> XObject::GetNativeObject(KernelState* kernel_state,
                                             void* native_ptr,
                                             int32_t as_type) {
                                             void* native_ptr, int32_t as_type,
                                             bool already_locked) {
  assert_not_null(native_ptr);

  // Unfortunately the XDK seems to inline some KeInitialize calls, meaning
@@ -373,10 +374,12 @@ object_ref<XObject> XObject::GetNativeObject(KernelState* kernel_state,
  // We identify this by setting wait_list_flink to a magic value. When set,
  // wait_list_blink will hold a handle to our object.

  auto global_lock = xe::global_critical_region::AcquireDirect();
  if (!already_locked) {
    global_critical_region::mutex().lock();
  }

  auto header = reinterpret_cast<X_DISPATCH_HEADER*>(native_ptr);

  XObject* result;
  if (as_type == -1) {
    as_type = header->type;
  }

@@ -385,10 +388,12 @@ object_ref<XObject> XObject::GetNativeObject(KernelState* kernel_state,
    // Already initialized.
    // TODO: assert if the type of the object != as_type
    uint32_t handle = header->wait_list_blink;
    auto object = kernel_state->object_table()->LookupObject<XObject>(handle);

    result = kernel_state->object_table()
                 ->LookupObject<XObject>(handle, true)
                 .release();
    goto return_result;
    // TODO(benvanik): assert nothing has been changed in the struct.
    return object;
    // return object;
  } else {
    // First use, create new.
    // https://www.nirsoft.net/kernel_struct/vista/KOBJECTS.html

@@ -430,14 +435,22 @@ object_ref<XObject> XObject::GetNativeObject(KernelState* kernel_state,
      case 24:  // ThreadedDpcObject
      default:
        assert_always();
        return NULL;
        result = nullptr;
        goto return_result;
        // return NULL;
    }

    // Stash pointer in struct.
    // FIXME: This assumes the object contains a dispatch header (some don't!)
    StashHandle(header, object->handle());
    result = object;

    return object_ref<XObject>(object);
  return_result:
    if (!already_locked) {
      global_critical_region::mutex().unlock();
    }
    return object_ref<XObject>(result);
  }
}
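GetNativeObject now pairs a conditional lock() with a single unlock() behind the return_result label. The same conditional locking can be expressed with a small RAII guard, sketched here under the assumption that the global critical region is backed by a recursive mutex (the guard class is hypothetical, not part of this commit):

#include <mutex>

// Acquires the mutex only when the caller does not already hold it, so the
// same helper can be called from locked and unlocked paths.
class OptionalLockGuard {
 public:
  OptionalLockGuard(std::recursive_mutex& mutex, bool already_locked)
      : mutex_(mutex), owns_(!already_locked) {
    if (owns_) mutex_.lock();
  }
  ~OptionalLockGuard() {
    if (owns_) mutex_.unlock();
  }
  OptionalLockGuard(const OptionalLockGuard&) = delete;
  OptionalLockGuard& operator=(const OptionalLockGuard&) = delete;

 private:
  std::recursive_mutex& mutex_;
  bool owns_;
};

With such a guard, each early return releases the lock automatically and the goto label becomes unnecessary.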
@@ -192,10 +192,12 @@ class XObject {

  static object_ref<XObject> GetNativeObject(KernelState* kernel_state,
                                             void* native_ptr,
                                             int32_t as_type = -1);
                                             int32_t as_type = -1,
                                             bool already_locked = false);
  template <typename T>
  static object_ref<T> GetNativeObject(KernelState* kernel_state,
                                       void* native_ptr, int32_t as_type = -1);
                                       void* native_ptr, int32_t as_type = -1,
                                       bool already_locked = false);

 protected:
  bool SaveObject(ByteStream* stream);

@@ -362,9 +364,11 @@ object_ref<T> retain_object(T* ptr) {

template <typename T>
object_ref<T> XObject::GetNativeObject(KernelState* kernel_state,
                                       void* native_ptr, int32_t as_type) {
                                       void* native_ptr, int32_t as_type,
                                       bool already_locked) {
  return object_ref<T>(reinterpret_cast<T*>(
      GetNativeObject(kernel_state, native_ptr, as_type).release()));
      GetNativeObject(kernel_state, native_ptr, as_type, already_locked)
          .release()));
}

}  // namespace kernel
@@ -15,6 +15,7 @@ project("xenia-ui-d3d12")
      "/Os",
      "/O1"
    })
  filter {}
  local_platform_files()
  files({
    "../shaders/bytecode/d3d12_5_1/*.h",

@@ -15,6 +15,7 @@ project("xenia-ui-vulkan")
      "/Os",
      "/O1"
    })
  filter {}
  includedirs({
    project_root.."/third_party/Vulkan-Headers/include",
  })

@@ -16,6 +16,8 @@ project("xenia-vfs")
      "/Os",
      "/O1"
    })
  filter {}

  recursive_platform_files()
  removefiles({"vfs_dump.cc"})

@@ -16,6 +16,8 @@ project("capstone")
      "/Os",
      "/O1"
    })
  filter {}

  includedirs({
    "capstone",
    "capstone/include",

@@ -13,6 +13,8 @@ project("fmt")
      "/Os",
      "/O1"
    })
  filter {}

  includedirs({
    "fmt/include",
  })

@@ -15,6 +15,8 @@ project("glslang-spirv")
      "/Os",
      "/O1"
    })
  filter {}

  files({
    "glslang/SPIRV/bitutils.h",
    "glslang/SPIRV/disassemble.cpp",

@@ -16,6 +16,7 @@ project("imgui")
      "/Os",
      "/O1"
    })
  filter{}
  files({
    "imgui/imconfig.h",
    "imgui/imgui.cpp",