Merge pull request #81 from chrisps/canary_experimental

global lock changes, minor kernel changes, premake fix
This commit is contained in:
chrisps 2022-10-08 12:04:43 -07:00 committed by GitHub
commit 08d38bdff6
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
59 changed files with 740 additions and 793 deletions

View File

@ -62,7 +62,9 @@ int XmaContext::Setup(uint32_t id, Memory* memory, uint32_t guest_ptr) {
// Allocate ffmpeg stuff:
av_packet_ = av_packet_alloc();
assert_not_null(av_packet_);
//chrispy: preallocate this buffer so that ffmpeg isn't reallocating it for every packet,
//these allocations were causing RtlSubsegmentInitialize
av_packet_->buf = av_buffer_alloc(128 * 1024);
// find the XMA2 audio decoder
av_codec_ = avcodec_find_decoder(AV_CODEC_ID_XMAFRAMES);
if (!av_codec_) {
@ -91,18 +93,20 @@ int XmaContext::Setup(uint32_t id, Memory* memory, uint32_t guest_ptr) {
}
bool XmaContext::Work() {
std::lock_guard<xe_mutex> lock(lock_);
if (!is_allocated() || !is_enabled()) {
if (!is_enabled() || !is_allocated()) {
return false;
}
{
std::lock_guard<xe_mutex> lock(lock_);
set_is_enabled(false);
set_is_enabled(false);
auto context_ptr = memory()->TranslateVirtual(guest_ptr());
XMA_CONTEXT_DATA data(context_ptr);
Decode(&data);
data.Store(context_ptr);
return true;
auto context_ptr = memory()->TranslateVirtual(guest_ptr());
XMA_CONTEXT_DATA data(context_ptr);
Decode(&data);
data.Store(context_ptr);
return true;
}
}
void XmaContext::Enable() {

View File

@ -201,8 +201,8 @@ class XmaContext {
uint32_t id_ = 0;
uint32_t guest_ptr_ = 0;
xe_mutex lock_;
bool is_allocated_ = false;
bool is_enabled_ = false;
volatile bool is_allocated_ = false;
volatile bool is_enabled_ = false;
// bool is_dirty_ = true;
// ffmpeg structures

View File

@ -1,348 +0,0 @@
#include "dma.h"
#include "logging.h"
#include "mutex.h"
#include "platform_win.h"
XE_NTDLL_IMPORT(NtDelayExecution, cls_NtDelayExecution,
NtDelayExecutionPointer);
XE_NTDLL_IMPORT(NtAlertThread, cls_NtAlertThread, NtAlertThreadPointer);
XE_NTDLL_IMPORT(NtAlertThreadByThreadId, cls_NtAlertThreadByThreadId,
NtAlertThreadByThreadId);
template <size_t N, typename... Ts>
static void xedmaloghelper(const char (&fmt)[N], Ts... args) {
char buffer[1024];
sprintf_s(buffer, fmt, args...);
XELOGI("%s", buffer);
}
//#define XEDMALOG(...) XELOGI("XeDma: " __VA_ARGS__)
//#define XEDMALOG(...) xedmaloghelper("XeDma: " __VA_ARGS__)
#define XEDMALOG(...) static_cast<void>(0)
using xe::swcache::CacheLine;
static constexpr unsigned NUM_CACHELINES_IN_PAGE = 4096 / sizeof(CacheLine);
#if defined(__clang__)
XE_FORCEINLINE
static void mvdir64b(void* to, const void* from) {
__asm__("movdir64b %1, %0" : : "r"(to), "m"(*(char*)from) : "memory");
}
#define _movdir64b mvdir64b
#endif
XE_FORCEINLINE
static void XeCopy16384StreamingAVX(CacheLine* XE_RESTRICT to,
CacheLine* XE_RESTRICT from) {
uint32_t num_lines_for_8k = 4096 / XE_HOST_CACHE_LINE_SIZE;
CacheLine* dest1 = to;
CacheLine* src1 = from;
CacheLine* dest2 = to + NUM_CACHELINES_IN_PAGE;
CacheLine* src2 = from + NUM_CACHELINES_IN_PAGE;
CacheLine* dest3 = to + (NUM_CACHELINES_IN_PAGE * 2);
CacheLine* src3 = from + (NUM_CACHELINES_IN_PAGE * 2);
CacheLine* dest4 = to + (NUM_CACHELINES_IN_PAGE * 3);
CacheLine* src4 = from + (NUM_CACHELINES_IN_PAGE * 3);
#pragma loop(no_vector)
for (uint32_t i = 0; i < num_lines_for_8k; ++i) {
xe::swcache::CacheLine line0, line1, line2, line3;
xe::swcache::ReadLine(&line0, src1 + i);
xe::swcache::ReadLine(&line1, src2 + i);
xe::swcache::ReadLine(&line2, src3 + i);
xe::swcache::ReadLine(&line3, src4 + i);
XE_MSVC_REORDER_BARRIER();
xe::swcache::WriteLineNT(dest1 + i, &line0);
xe::swcache::WriteLineNT(dest2 + i, &line1);
xe::swcache::WriteLineNT(dest3 + i, &line2);
xe::swcache::WriteLineNT(dest4 + i, &line3);
}
XE_MSVC_REORDER_BARRIER();
}
XE_FORCEINLINE
static void XeCopy16384Movdir64M(CacheLine* XE_RESTRICT to,
CacheLine* XE_RESTRICT from) {
uint32_t num_lines_for_8k = 4096 / XE_HOST_CACHE_LINE_SIZE;
CacheLine* dest1 = to;
CacheLine* src1 = from;
CacheLine* dest2 = to + NUM_CACHELINES_IN_PAGE;
CacheLine* src2 = from + NUM_CACHELINES_IN_PAGE;
CacheLine* dest3 = to + (NUM_CACHELINES_IN_PAGE * 2);
CacheLine* src3 = from + (NUM_CACHELINES_IN_PAGE * 2);
CacheLine* dest4 = to + (NUM_CACHELINES_IN_PAGE * 3);
CacheLine* src4 = from + (NUM_CACHELINES_IN_PAGE * 3);
#pragma loop(no_vector)
for (uint32_t i = 0; i < num_lines_for_8k; ++i) {
#if 0
xe::swcache::CacheLine line0, line1, line2, line3;
xe::swcache::ReadLine(&line0, src1 + i);
xe::swcache::ReadLine(&line1, src2 + i);
xe::swcache::ReadLine(&line2, src3 + i);
xe::swcache::ReadLine(&line3, src4 + i);
XE_MSVC_REORDER_BARRIER();
xe::swcache::WriteLineNT(dest1 + i, &line0);
xe::swcache::WriteLineNT(dest2 + i, &line1);
xe::swcache::WriteLineNT(dest3 + i, &line2);
xe::swcache::WriteLineNT(dest4 + i, &line3);
#else
_movdir64b(dest1 + i, src1 + i);
_movdir64b(dest2 + i, src2 + i);
_movdir64b(dest3 + i, src3 + i);
_movdir64b(dest4 + i, src4 + i);
#endif
}
XE_MSVC_REORDER_BARRIER();
}
namespace xe::dma {
using VastCpyDispatch = void (*)(CacheLine* XE_RESTRICT physaddr,
CacheLine* XE_RESTRICT rdmapping,
uint32_t written_length);
static void vastcpy_impl_avx(CacheLine* XE_RESTRICT physaddr,
CacheLine* XE_RESTRICT rdmapping,
uint32_t written_length) {
static constexpr unsigned NUM_LINES_FOR_16K = 16384 / XE_HOST_CACHE_LINE_SIZE;
while (written_length >= 16384) {
XeCopy16384StreamingAVX(physaddr, rdmapping);
physaddr += NUM_LINES_FOR_16K;
rdmapping += NUM_LINES_FOR_16K;
written_length -= 16384;
}
if (!written_length) {
return;
}
uint32_t num_written_lines = written_length / XE_HOST_CACHE_LINE_SIZE;
uint32_t i = 0;
for (; i + 1 < num_written_lines; i += 2) {
xe::swcache::CacheLine line0, line1;
xe::swcache::ReadLine(&line0, rdmapping + i);
xe::swcache::ReadLine(&line1, rdmapping + i + 1);
XE_MSVC_REORDER_BARRIER();
xe::swcache::WriteLineNT(physaddr + i, &line0);
xe::swcache::WriteLineNT(physaddr + i + 1, &line1);
}
if (i < num_written_lines) {
xe::swcache::CacheLine line0;
xe::swcache::ReadLine(&line0, rdmapping + i);
xe::swcache::WriteLineNT(physaddr + i, &line0);
}
}
static void vastcpy_impl_movdir64m(CacheLine* XE_RESTRICT physaddr,
CacheLine* XE_RESTRICT rdmapping,
uint32_t written_length) {
static constexpr unsigned NUM_LINES_FOR_16K = 16384 / XE_HOST_CACHE_LINE_SIZE;
while (written_length >= 16384) {
XeCopy16384Movdir64M(physaddr, rdmapping);
physaddr += NUM_LINES_FOR_16K;
rdmapping += NUM_LINES_FOR_16K;
written_length -= 16384;
}
if (!written_length) {
return;
}
uint32_t num_written_lines = written_length / XE_HOST_CACHE_LINE_SIZE;
uint32_t i = 0;
for (; i + 1 < num_written_lines; i += 2) {
_movdir64b(physaddr + i, rdmapping + i);
_movdir64b(physaddr + i + 1, rdmapping + i + 1);
}
if (i < num_written_lines) {
_movdir64b(physaddr + i, rdmapping + i);
}
}
XE_COLD
static void first_vastcpy(CacheLine* XE_RESTRICT physaddr,
CacheLine* XE_RESTRICT rdmapping,
uint32_t written_length);
static VastCpyDispatch vastcpy_dispatch = first_vastcpy;
XE_COLD
static void first_vastcpy(CacheLine* XE_RESTRICT physaddr,
CacheLine* XE_RESTRICT rdmapping,
uint32_t written_length) {
VastCpyDispatch dispatch_to_use = nullptr;
if (amd64::GetFeatureFlags() & amd64::kX64EmitMovdir64M) {
XELOGI("Selecting MOVDIR64M vastcpy.");
dispatch_to_use = vastcpy_impl_movdir64m;
} else {
XELOGI("Selecting generic AVX vastcpy.");
dispatch_to_use = vastcpy_impl_avx;
}
vastcpy_dispatch =
dispatch_to_use; // all future calls will go through our selected path
return vastcpy_dispatch(physaddr, rdmapping, written_length);
}
XE_NOINLINE
void vastcpy(uint8_t* XE_RESTRICT physaddr, uint8_t* XE_RESTRICT rdmapping,
uint32_t written_length) {
return vastcpy_dispatch((CacheLine*)physaddr, (CacheLine*)rdmapping,
written_length);
}
#define MAX_INFLIGHT_DMAJOBS 65536
#define INFLICT_DMAJOB_MASK (MAX_INFLIGHT_DMAJOBS - 1)
class XeDMACGeneric : public XeDMAC {
std::unique_ptr<xe::threading::Thread> thrd_;
XeDMAJob* jobs_ring_;
volatile std::atomic<uintptr_t> write_ptr_;
struct alignas(XE_HOST_CACHE_LINE_SIZE) {
volatile std::atomic<uintptr_t> read_ptr_;
xe_mutex push_into_ring_lock_;
};
HANDLE gotjob_event;
void WorkerWait();
public:
virtual ~XeDMACGeneric() {}
void WorkerThreadMain();
XeDMACGeneric() {
threading::Thread::CreationParameters crparams;
crparams.create_suspended = true;
crparams.initial_priority = threading::ThreadPriority::kNormal;
crparams.stack_size = 65536;
gotjob_event = CreateEventA(nullptr, false, false, nullptr);
thrd_ = std::move(threading::Thread::Create(
crparams, [this]() { this->WorkerThreadMain(); }));
jobs_ring_ = (XeDMAJob*)_aligned_malloc(
MAX_INFLIGHT_DMAJOBS * sizeof(XeDMAJob), XE_HOST_CACHE_LINE_SIZE);
write_ptr_ = 0;
read_ptr_ = 0;
thrd_->Resume();
}
virtual DMACJobHandle PushDMAJob(XeDMAJob* job) override {
// std::unique_lock<xe_mutex> pushlock{push_into_ring_lock_};
HANDLE dmacevent = CreateEventA(nullptr, true, false, nullptr);
{
job->dmac_specific_ = (uintptr_t)dmacevent;
jobs_ring_[write_ptr_ % MAX_INFLIGHT_DMAJOBS] = *job;
write_ptr_++;
SetEvent(gotjob_event);
}
return (DMACJobHandle)dmacevent;
}
virtual void WaitJobDone(DMACJobHandle handle) override {
while (WaitForSingleObject((HANDLE)handle, 2) == WAIT_TIMEOUT) {
// NtAlertThreadByThreadId.invoke<void>(thrd_->system_id());
// while (SignalObjectAndWait(gotjob_event, (HANDLE)handle, 2, false) ==
// WAIT_TIMEOUT) {
// ;
}
//}
// SignalObjectAndWait(gotjob_event, (HANDLE)handle, INFINITE, false);
CloseHandle((HANDLE)handle);
}
virtual void WaitForIdle() override {
while (write_ptr_ != read_ptr_) {
threading::MaybeYield();
}
}
};
void XeDMACGeneric::WorkerWait() {
constexpr unsigned NUM_PAUSE_SPINS = 2048;
constexpr unsigned NUM_YIELD_SPINS = 8;
#if 0
for (unsigned i = 0; i < NUM_PAUSE_SPINS; ++i) {
if (write_ptr_ == read_ptr_) {
_mm_pause();
} else {
break;
}
}
for (unsigned i = 0; i < NUM_YIELD_SPINS; ++i) {
if (write_ptr_ == read_ptr_) {
threading::MaybeYield();
} else {
break;
}
}
LARGE_INTEGER yield_execution_delay{};
yield_execution_delay.QuadPart =
-2000; //-10000 == 1 ms, so -2000 means delay for 0.2 milliseconds
while (write_ptr_ == read_ptr_) {
NtDelayExecutionPointer.invoke<void>(0, &yield_execution_delay);
}
#else
do {
if (WaitForSingleObjectEx(gotjob_event, 1, TRUE) == WAIT_OBJECT_0) {
while (write_ptr_ == read_ptr_) {
_mm_pause();
}
}
} while (write_ptr_ == read_ptr_);
#endif
}
void XeDMACGeneric::WorkerThreadMain() {
while (true) {
this->WorkerWait();
XeDMAJob current_job = jobs_ring_[read_ptr_ % MAX_INFLIGHT_DMAJOBS];
swcache::ReadFence();
if (current_job.precall) {
current_job.precall(&current_job);
}
size_t num_lines = current_job.size / XE_HOST_CACHE_LINE_SIZE;
size_t line_rounded = num_lines * XE_HOST_CACHE_LINE_SIZE;
size_t line_rem = current_job.size - line_rounded;
vastcpy(current_job.destination, current_job.source,
static_cast<uint32_t>(line_rounded));
if (line_rem) {
__movsb(current_job.destination + line_rounded,
current_job.source + line_rounded, line_rem);
}
if (current_job.postcall) {
current_job.postcall(&current_job);
}
read_ptr_++;
swcache::WriteFence();
SetEvent((HANDLE)current_job.dmac_specific_);
}
}
XeDMAC* CreateDMAC() { return new XeDMACGeneric(); }
} // namespace xe::dma

View File

@ -1,47 +0,0 @@
/**
******************************************************************************
* Xenia : Xbox 360 Emulator Research Project *
******************************************************************************
* Copyright 2020 Ben Vanik. All rights reserved. *
* Released under the BSD license - see LICENSE in the root for more details. *
******************************************************************************
*/
#ifndef XENIA_BASE_DMA_H_
#define XENIA_BASE_DMA_H_
#include "memory.h"
#include "threading.h"
namespace xe::dma {
struct XeDMAJob;
using DmaPrecall = void (*)(XeDMAJob* job);
using DmaPostcall = void (*)(XeDMAJob* job);
struct XeDMAJob {
//threading::Event* signal_on_done;
uintptr_t dmac_specific_;
uint8_t* destination;
uint8_t* source;
size_t size;
DmaPrecall precall;
DmaPostcall postcall;
void* userdata1;
void* userdata2;
};
using DMACJobHandle = uint64_t;
class XeDMAC {
public:
virtual ~XeDMAC() {}
virtual DMACJobHandle PushDMAJob(XeDMAJob* job) = 0;
virtual void WaitJobDone(DMACJobHandle handle) = 0;
virtual void WaitForIdle() = 0;
};
XeDMAC* CreateDMAC();
// must be divisible by cache line size
XE_NOINLINE
void vastcpy(uint8_t* XE_RESTRICT physaddr, uint8_t* XE_RESTRICT rdmapping,
uint32_t written_length);
} // namespace xe::dma
#endif // XENIA_BASE_DMA_H_

View File

@ -44,8 +44,18 @@ template <typename T>
constexpr bool is_pow2(T value) {
return (value & (value - 1)) == 0;
}
/*
Use this in place of the shift + and not sequence that is being used currently in bit iteration code. This is more efficient
because it does not introduce a dependency on to the previous bit scanning operation. The shift and not sequence does get translated to a single instruction (the bit test and reset instruction),
but this code can be executed alongside the scan
*/
template<typename T>
constexpr T clear_lowest_bit(T value) {
static_assert(std::is_integral_v<T>);
// Rounds up the given value to the given alignment.
return (value - static_cast<T>(1)) & value;
}
// Rounds up the given value to the given alignment.
template <typename T>
constexpr T align(T value, T alignment) {
return (value + alignment - 1) & ~(alignment - 1);

View File

@ -10,6 +10,7 @@
#include "xenia/base/memory.h"
#include "xenia/base/cvar.h"
#include "xenia/base/platform.h"
#include "xenia/base/logging.h"
#if XE_ARCH_ARM64
#include <arm_neon.h>
@ -31,6 +32,180 @@ bool IsWritableExecutableMemoryPreferred() {
cvars::writable_executable_memory;
}
using xe::swcache::CacheLine;
static constexpr unsigned NUM_CACHELINES_IN_PAGE = 4096 / sizeof(CacheLine);
#if defined(__clang__)
XE_FORCEINLINE
static void mvdir64b(void* to, const void* from) {
__asm__("movdir64b %1, %0" : : "r"(to), "m"(*(char*)from) : "memory");
}
#define _movdir64b mvdir64b
#endif
XE_FORCEINLINE
static void XeCopy16384StreamingAVX(CacheLine* XE_RESTRICT to,
CacheLine* XE_RESTRICT from) {
uint32_t num_lines_for_8k = 4096 / XE_HOST_CACHE_LINE_SIZE;
CacheLine* dest1 = to;
CacheLine* src1 = from;
CacheLine* dest2 = to + NUM_CACHELINES_IN_PAGE;
CacheLine* src2 = from + NUM_CACHELINES_IN_PAGE;
CacheLine* dest3 = to + (NUM_CACHELINES_IN_PAGE * 2);
CacheLine* src3 = from + (NUM_CACHELINES_IN_PAGE * 2);
CacheLine* dest4 = to + (NUM_CACHELINES_IN_PAGE * 3);
CacheLine* src4 = from + (NUM_CACHELINES_IN_PAGE * 3);
#pragma loop(no_vector)
for (uint32_t i = 0; i < num_lines_for_8k; ++i) {
xe::swcache::CacheLine line0, line1, line2, line3;
xe::swcache::ReadLine(&line0, src1 + i);
xe::swcache::ReadLine(&line1, src2 + i);
xe::swcache::ReadLine(&line2, src3 + i);
xe::swcache::ReadLine(&line3, src4 + i);
XE_MSVC_REORDER_BARRIER();
xe::swcache::WriteLineNT(dest1 + i, &line0);
xe::swcache::WriteLineNT(dest2 + i, &line1);
xe::swcache::WriteLineNT(dest3 + i, &line2);
xe::swcache::WriteLineNT(dest4 + i, &line3);
}
XE_MSVC_REORDER_BARRIER();
}
XE_FORCEINLINE
static void XeCopy16384Movdir64M(CacheLine* XE_RESTRICT to,
CacheLine* XE_RESTRICT from) {
uint32_t num_lines_for_8k = 4096 / XE_HOST_CACHE_LINE_SIZE;
CacheLine* dest1 = to;
CacheLine* src1 = from;
CacheLine* dest2 = to + NUM_CACHELINES_IN_PAGE;
CacheLine* src2 = from + NUM_CACHELINES_IN_PAGE;
CacheLine* dest3 = to + (NUM_CACHELINES_IN_PAGE * 2);
CacheLine* src3 = from + (NUM_CACHELINES_IN_PAGE * 2);
CacheLine* dest4 = to + (NUM_CACHELINES_IN_PAGE * 3);
CacheLine* src4 = from + (NUM_CACHELINES_IN_PAGE * 3);
#pragma loop(no_vector)
for (uint32_t i = 0; i < num_lines_for_8k; ++i) {
_movdir64b(dest1 + i, src1 + i);
_movdir64b(dest2 + i, src2 + i);
_movdir64b(dest3 + i, src3 + i);
_movdir64b(dest4 + i, src4 + i);
}
XE_MSVC_REORDER_BARRIER();
}
using VastCpyDispatch = void (*)(CacheLine* XE_RESTRICT physaddr,
CacheLine* XE_RESTRICT rdmapping,
uint32_t written_length);
static void vastcpy_impl_avx(CacheLine* XE_RESTRICT physaddr,
CacheLine* XE_RESTRICT rdmapping,
uint32_t written_length) {
static constexpr unsigned NUM_LINES_FOR_16K = 16384 / XE_HOST_CACHE_LINE_SIZE;
while (written_length >= 16384) {
XeCopy16384StreamingAVX(physaddr, rdmapping);
physaddr += NUM_LINES_FOR_16K;
rdmapping += NUM_LINES_FOR_16K;
written_length -= 16384;
}
if (!written_length) {
return;
}
uint32_t num_written_lines = written_length / XE_HOST_CACHE_LINE_SIZE;
uint32_t i = 0;
for (; i + 1 < num_written_lines; i += 2) {
xe::swcache::CacheLine line0, line1;
xe::swcache::ReadLine(&line0, rdmapping + i);
xe::swcache::ReadLine(&line1, rdmapping + i + 1);
XE_MSVC_REORDER_BARRIER();
xe::swcache::WriteLineNT(physaddr + i, &line0);
xe::swcache::WriteLineNT(physaddr + i + 1, &line1);
}
if (i < num_written_lines) {
xe::swcache::CacheLine line0;
xe::swcache::ReadLine(&line0, rdmapping + i);
xe::swcache::WriteLineNT(physaddr + i, &line0);
}
}
static void vastcpy_impl_movdir64m(CacheLine* XE_RESTRICT physaddr,
CacheLine* XE_RESTRICT rdmapping,
uint32_t written_length) {
static constexpr unsigned NUM_LINES_FOR_16K = 16384 / XE_HOST_CACHE_LINE_SIZE;
while (written_length >= 16384) {
XeCopy16384Movdir64M(physaddr, rdmapping);
physaddr += NUM_LINES_FOR_16K;
rdmapping += NUM_LINES_FOR_16K;
written_length -= 16384;
}
if (!written_length) {
return;
}
uint32_t num_written_lines = written_length / XE_HOST_CACHE_LINE_SIZE;
uint32_t i = 0;
for (; i + 1 < num_written_lines; i += 2) {
_movdir64b(physaddr + i, rdmapping + i);
_movdir64b(physaddr + i + 1, rdmapping + i + 1);
}
if (i < num_written_lines) {
_movdir64b(physaddr + i, rdmapping + i);
}
}
XE_COLD
static void first_vastcpy(CacheLine* XE_RESTRICT physaddr,
CacheLine* XE_RESTRICT rdmapping,
uint32_t written_length);
static VastCpyDispatch vastcpy_dispatch = first_vastcpy;
XE_COLD
static void first_vastcpy(CacheLine* XE_RESTRICT physaddr,
CacheLine* XE_RESTRICT rdmapping,
uint32_t written_length) {
VastCpyDispatch dispatch_to_use = nullptr;
if (amd64::GetFeatureFlags() & amd64::kX64EmitMovdir64M) {
XELOGI("Selecting MOVDIR64M vastcpy.");
dispatch_to_use = vastcpy_impl_movdir64m;
} else {
XELOGI("Selecting generic AVX vastcpy.");
dispatch_to_use = vastcpy_impl_avx;
}
vastcpy_dispatch =
dispatch_to_use; // all future calls will go through our selected path
return vastcpy_dispatch(physaddr, rdmapping, written_length);
}
XE_NOINLINE
void vastcpy(uint8_t* XE_RESTRICT physaddr, uint8_t* XE_RESTRICT rdmapping,
uint32_t written_length) {
return vastcpy_dispatch((CacheLine*)physaddr, (CacheLine*)rdmapping,
written_length);
}
} // namespace memory
// TODO(benvanik): fancy AVX versions.

View File

@ -17,9 +17,9 @@
#include <string>
#include <string_view>
#include "xenia/base/assert.h"
#include "xenia/base/byte_order.h"
#include "xenia/base/platform.h"
namespace xe {
namespace memory {
@ -141,6 +141,10 @@ size_t hash_combine(size_t seed, const T& v, const Ts&... vs) {
seed ^= hasher(v) + 0x9E3779B9 + (seed << 6) + (seed >> 2);
return hash_combine(seed, vs...);
}
// must be divisible by cache line size
XE_NOINLINE
void vastcpy(uint8_t* XE_RESTRICT physaddr, uint8_t* XE_RESTRICT rdmapping,
uint32_t written_length);
} // namespace memory

View File

@ -133,6 +133,7 @@ using global_unique_lock_type = std::unique_lock<global_mutex_type>;
// };
class global_critical_region {
public:
constexpr global_critical_region() {}
static global_mutex_type& mutex();
// Acquires a lock on the global critical section.
@ -144,18 +145,18 @@ class global_critical_region {
}
// Acquires a lock on the global critical section.
inline global_unique_lock_type Acquire() {
static inline global_unique_lock_type Acquire() {
return global_unique_lock_type(mutex());
}
// Acquires a deferred lock on the global critical section.
inline global_unique_lock_type AcquireDeferred() {
static inline global_unique_lock_type AcquireDeferred() {
return global_unique_lock_type(mutex(), std::defer_lock);
}
// Tries to acquire a lock on the glboal critical section.
// Check owns_lock() to see if the lock was successfully acquired.
inline global_unique_lock_type TryAcquire() {
static inline global_unique_lock_type TryAcquire() {
return global_unique_lock_type(mutex(), std::try_to_lock);
}
};

View File

@ -183,20 +183,13 @@ inline uint32_t RingBuffer::ReadAndSwap<uint32_t>() {
xenia_assert(this->capacity_ >= 4);
ring_size_t next_read_offset = read_offset + 4;
#if 0
size_t zerotest = next_read_offset - this->capacity_;
// unpredictable branch, use bit arith instead
// todo: it would be faster to use lzcnt, but we need to figure out if all
// machines we support support it
next_read_offset &= -static_cast<ptrdiff_t>(!!zerotest);
#else
if (XE_UNLIKELY(next_read_offset == this->capacity_)) {
next_read_offset = 0;
// todo: maybe prefetch next? or should that happen much earlier?
}
#endif
this->read_offset_ = next_read_offset;
unsigned int ring_value = *(uint32_t*)&this->buffer_[read_offset];
uint32_t ring_value = *(uint32_t*)&this->buffer_[read_offset];
return xe::byte_swap(ring_value);
}
} // namespace xe

View File

@ -271,7 +271,10 @@ inline std::pair<WaitResult, size_t> WaitAny(
return WaitAny(wait_handles.data(), wait_handles.size(), is_alertable,
timeout);
}
struct EventInfo {
uint32_t type;
uint32_t state;
};
// Models a Win32-like event object.
// https://msdn.microsoft.com/en-us/library/windows/desktop/ms682396(v=vs.85).aspx
class Event : public WaitHandle {
@ -299,6 +302,8 @@ class Event : public WaitHandle {
// the nonsignaled state after releasing the appropriate number of waiting
// threads.
virtual void Pulse() = 0;
virtual EventInfo Query() = 0;
#if XE_PLATFORM_WIN32 ==1
//SetEvent, but if there is a waiter we immediately transfer execution to it
virtual void SetBoostPriority() = 0;

View File

@ -55,7 +55,7 @@ XE_NTDLL_IMPORT(NtReleaseSemaphore, cls_NtReleaseSemaphore,
XE_NTDLL_IMPORT(NtDelayExecution, cls_NtDelayExecution,
NtDelayExecutionPointer);
XE_NTDLL_IMPORT(NtQueryEvent, cls_NtQueryEvent, NtQueryEventPointer);
namespace xe {
namespace threading {
@ -255,20 +255,20 @@ std::pair<WaitResult, size_t> WaitMultiple(WaitHandle* wait_handles[],
size_t wait_handle_count,
bool wait_all, bool is_alertable,
std::chrono::milliseconds timeout) {
std::vector<HANDLE> handles(
wait_handle_count); // max handles is like 64, so it would make more
// sense to just do a fixed size array here
xenia_assert(wait_handle_count <= 64);
HANDLE handles[64];
for (size_t i = 0; i < wait_handle_count; ++i) {
handles[i] = wait_handles[i]->native_handle();
}
DWORD result = WaitForMultipleObjectsEx(
DWORD(handles.size()), handles.data(), wait_all ? TRUE : FALSE,
static_cast<DWORD>(wait_handle_count), handles, wait_all ? TRUE : FALSE,
DWORD(timeout.count()), is_alertable ? TRUE : FALSE);
if (result >= WAIT_OBJECT_0 && result < WAIT_OBJECT_0 + handles.size()) {
if (result >= WAIT_OBJECT_0 && result < WAIT_OBJECT_0 + wait_handle_count) {
return std::pair<WaitResult, size_t>(WaitResult::kSuccess,
result - WAIT_OBJECT_0);
} else if (result >= WAIT_ABANDONED_0 &&
result < WAIT_ABANDONED_0 + handles.size()) {
result < WAIT_ABANDONED_0 + wait_handle_count) {
return std::pair<WaitResult, size_t>(WaitResult::kAbandoned,
result - WAIT_ABANDONED_0);
}
@ -293,7 +293,15 @@ class Win32Event : public Win32Handle<Event> {
void Pulse() override { NtPulseEventPointer.invoke(handle_, nullptr); }
void SetBoostPriority() override {
// no previous state for boostpriority
NtSetEventBoostPriorityPointer.invoke(handle_);
// Boost priority is unimplemented under wine probably because it's not used
// anywhere in user mode except by us. Maybe some Windows internals uses it
// see:
// https://discord.com/channels/308194948048486401/308207592482668545/1027178776599216228
if (NtSetEventBoostPriorityPointer) {
NtSetEventBoostPriorityPointer.invoke(handle_);
} else {
NtSetEventPointer.invoke(handle_, nullptr);
}
}
#else
void Set() override { SetEvent(handle_); }
@ -305,6 +313,11 @@ class Win32Event : public Win32Handle<Event> {
SetEvent(handle_);
}
#endif
EventInfo Query() { EventInfo result{};
NtQueryEventPointer.invoke(handle_, 0, &result, sizeof(EventInfo), nullptr);
return result;
}
};
std::unique_ptr<Event> Event::CreateManualResetEvent(bool initial_state) {

View File

@ -39,7 +39,7 @@
#include "xenia/cpu/backend/x64/x64_stack_layout.h"
#include "xenia/cpu/hir/hir_builder.h"
#include "xenia/cpu/processor.h"
XE_MSVC_OPTIMIZE_SMALL()
DEFINE_bool(use_fast_dot_product, false,
"Experimental optimization, much shorter sequence on dot products, "
"treating inf as overflow instead of using mcxsr"

View File

@ -10,11 +10,7 @@ project("xenia-cpu")
"xenia-base",
"mspack",
})
filter({"configurations:Release", "platforms:Windows"})
buildoptions({
"/Os",
"/O1"
})
includedirs({
project_root.."/third_party/llvm/include",
})
@ -27,3 +23,8 @@ project("xenia-cpu")
include("testing")
include("ppc/testing")
filter({"configurations:Release", "platforms:Windows"})
buildoptions({
"/Os",
"/O1"
})

View File

@ -17,6 +17,7 @@ project("xenia-debug-ui")
"/Os",
"/O1"
})
filter{}
defines({
})
includedirs({

View File

@ -9,23 +9,16 @@
#include "xenia/gpu/command_processor.h"
#include <algorithm>
#include <cinttypes>
#include <cmath>
#include <cstring>
#include "third_party/fmt/include/fmt/format.h"
#include "xenia/base/byte_stream.h"
#include "xenia/base/cvar.h"
#include "xenia/base/logging.h"
#include "xenia/base/math.h"
#include "xenia/base/profiling.h"
#include "xenia/base/ring_buffer.h"
#include "xenia/gpu/gpu_flags.h"
#include "xenia/gpu/graphics_system.h"
#include "xenia/gpu/sampler_info.h"
#include "xenia/gpu/texture_info.h"
#include "xenia/gpu/xenos.h"
#include "xenia/kernel/kernel_state.h"
#include "xenia/kernel/user_module.h"
@ -46,11 +39,6 @@ CommandProcessor::CommandProcessor(GraphicsSystem* graphics_system,
write_ptr_index_event_(xe::threading::Event::CreateAutoResetEvent(false)),
write_ptr_index_(0) {
assert_not_null(write_ptr_index_event_);
#if 0
dmac_ = dma::CreateDMAC();
#else
dmac_ = nullptr;
#endif
}
CommandProcessor::~CommandProcessor() = default;
@ -625,78 +613,6 @@ void CommandProcessor::PrepareForWait() { trace_writer_.Flush(); }
void CommandProcessor::ReturnFromWait() {}
uint32_t CommandProcessor::ExecutePrimaryBuffer(uint32_t read_index,
uint32_t write_index) {
SCOPE_profile_cpu_f("gpu");
#if XE_ENABLE_TRACE_WRITER_INSTRUMENTATION == 1
// If we have a pending trace stream open it now. That way we ensure we get
// all commands.
if (!trace_writer_.is_open() && trace_state_ == TraceState::kStreaming) {
uint32_t title_id = kernel_state_->GetExecutableModule()
? kernel_state_->GetExecutableModule()->title_id()
: 0;
auto file_name = fmt::format("{:08X}_stream.xtr", title_id);
auto path = trace_stream_path_ / file_name;
trace_writer_.Open(path, title_id);
InitializeTrace();
}
#endif
// Adjust pointer base.
uint32_t start_ptr = primary_buffer_ptr_ + read_index * sizeof(uint32_t);
start_ptr = (primary_buffer_ptr_ & ~0x1FFFFFFF) | (start_ptr & 0x1FFFFFFF);
uint32_t end_ptr = primary_buffer_ptr_ + write_index * sizeof(uint32_t);
end_ptr = (primary_buffer_ptr_ & ~0x1FFFFFFF) | (end_ptr & 0x1FFFFFFF);
trace_writer_.WritePrimaryBufferStart(start_ptr, write_index - read_index);
// Execute commands!
RingBuffer old_reader = reader_;
new (&reader_) RingBuffer(memory_->TranslatePhysical(primary_buffer_ptr_),
primary_buffer_size_);
reader_.set_read_offset(read_index * sizeof(uint32_t));
reader_.set_write_offset(write_index * sizeof(uint32_t));
// prefetch the wraparound range
// it likely is already in L3 cache, but in a zen system it may be another
// chiplets l3
reader_.BeginPrefetchedRead<swcache::PrefetchTag::Level2>(
GetCurrentRingReadCount());
do {
if (!ExecutePacket()) {
// This probably should be fatal - but we're going to continue anyways.
XELOGE("**** PRIMARY RINGBUFFER: Failed to execute packet.");
assert_always();
break;
}
} while (reader_.read_count());
OnPrimaryBufferEnd();
trace_writer_.WritePrimaryBufferEnd();
reader_ = old_reader;
return write_index;
}
void CommandProcessor::ExecutePacket(uint32_t ptr, uint32_t count) {
// Execute commands!
RingBuffer old_reader = reader_;
new (&reader_)
RingBuffer{memory_->TranslatePhysical(ptr), count * sizeof(uint32_t)};
reader_.set_write_offset(count * sizeof(uint32_t));
do {
if (!ExecutePacket()) {
XELOGE("**** ExecutePacket: Failed to execute packet.");
assert_always();
break;
}
} while (reader_.read_count());
reader_ = old_reader;
}
void CommandProcessor::InitializeTrace() {
// Write the initial register values, to be loaded directly into the

View File

@ -19,11 +19,8 @@
#include <string>
#include <vector>
#include "xenia/base/dma.h"
#include "xenia/base/ring_buffer.h"
#include "xenia/base/threading.h"
#include "xenia/gpu/register_file.h"
#include "xenia/gpu/registers.h"
#include "xenia/gpu/trace_writer.h"
#include "xenia/gpu/xenos.h"
#include "xenia/kernel/xthread.h"
@ -82,7 +79,6 @@ class CommandProcessor {
CommandProcessor(GraphicsSystem* graphics_system,
kernel::KernelState* kernel_state);
virtual ~CommandProcessor();
dma::XeDMAC* GetDMAC() const { return dmac_; }
uint32_t counter() const { return counter_; }
void increment_counter() { counter_++; }
@ -135,7 +131,7 @@ class CommandProcessor {
void UpdateWritePointer(uint32_t value);
void ExecutePacket(uint32_t ptr, uint32_t count);
bool is_paused() const { return paused_; }
void Pause();
@ -172,7 +168,7 @@ class CommandProcessor {
uint32_t num_registers);
XE_NOINLINE
virtual void WriteOneRegisterFromRing(
void WriteOneRegisterFromRing(
uint32_t base,
uint32_t
num_times); // repeatedly write a value to one register, presumably a
@ -221,7 +217,7 @@ class CommandProcessor {
virtual void PrepareForWait();
virtual void ReturnFromWait();
uint32_t ExecutePrimaryBuffer(uint32_t start_index, uint32_t end_index);
virtual void OnPrimaryBufferEnd() {}
#include "pm4_command_processor_declare.h"
@ -300,7 +296,6 @@ class CommandProcessor {
reg::DC_LUT_30_COLOR gamma_ramp_256_entry_table_[256] = {};
reg::DC_LUT_PWL_DATA gamma_ramp_pwl_rgb_[128][3] = {};
uint32_t gamma_ramp_rw_component_ = 0;
dma::XeDMAC* dmac_ = nullptr;
};
} // namespace gpu

View File

@ -2672,45 +2672,66 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type,
// validity is tracked.
const Shader::ConstantRegisterMap& constant_map_vertex =
vertex_shader->constant_register_map();
for (uint32_t i = 0; i < xe::countof(constant_map_vertex.vertex_fetch_bitmap);
++i) {
uint32_t vfetch_bits_remaining = constant_map_vertex.vertex_fetch_bitmap[i];
uint32_t j;
while (xe::bit_scan_forward(vfetch_bits_remaining, &j)) {
vfetch_bits_remaining &= ~(uint32_t(1) << j);
uint32_t vfetch_index = i * 32 + j;
const auto& vfetch_constant = regs.Get<xenos::xe_gpu_vertex_fetch_t>(
XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0 + vfetch_index * 2);
switch (vfetch_constant.type) {
case xenos::FetchConstantType::kVertex:
break;
case xenos::FetchConstantType::kInvalidVertex:
if (cvars::gpu_allow_invalid_fetch_constants) {
{
uint32_t vfetch_addresses[96];
uint32_t vfetch_sizes[96];
uint32_t vfetch_current_queued = 0;
for (uint32_t i = 0;
i < xe::countof(constant_map_vertex.vertex_fetch_bitmap); ++i) {
uint32_t vfetch_bits_remaining =
constant_map_vertex.vertex_fetch_bitmap[i];
uint32_t j;
while (xe::bit_scan_forward(vfetch_bits_remaining, &j)) {
vfetch_bits_remaining = xe::clear_lowest_bit(vfetch_bits_remaining);
uint32_t vfetch_index = i * 32 + j;
const auto& vfetch_constant = regs.Get<xenos::xe_gpu_vertex_fetch_t>(
XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0 + vfetch_index * 2);
switch (vfetch_constant.type) {
case xenos::FetchConstantType::kVertex:
break;
}
XELOGW(
"Vertex fetch constant {} ({:08X} {:08X}) has \"invalid\" type! "
"This is incorrect behavior, but you can try bypassing this by "
"launching Xenia with --gpu_allow_invalid_fetch_constants=true.",
vfetch_index, vfetch_constant.dword_0, vfetch_constant.dword_1);
return false;
default:
XELOGW(
"Vertex fetch constant {} ({:08X} {:08X}) is completely invalid!",
vfetch_index, vfetch_constant.dword_0, vfetch_constant.dword_1);
return false;
case xenos::FetchConstantType::kInvalidVertex:
if (cvars::gpu_allow_invalid_fetch_constants) {
break;
}
XELOGW(
"Vertex fetch constant {} ({:08X} {:08X}) has \"invalid\" "
"type! "
"This is incorrect behavior, but you can try bypassing this by "
"launching Xenia with "
"--gpu_allow_invalid_fetch_constants=true.",
vfetch_index, vfetch_constant.dword_0, vfetch_constant.dword_1);
return false;
default:
XELOGW(
"Vertex fetch constant {} ({:08X} {:08X}) is completely "
"invalid!",
vfetch_index, vfetch_constant.dword_0, vfetch_constant.dword_1);
return false;
}
vfetch_addresses[vfetch_current_queued] = vfetch_constant.address;
vfetch_sizes[vfetch_current_queued++] = vfetch_constant.size;
}
if (!shared_memory_->RequestRange(vfetch_constant.address << 2,
vfetch_constant.size << 2)) {
XELOGE(
"Failed to request vertex buffer at 0x{:08X} (size {}) in the "
"shared memory",
vfetch_constant.address << 2, vfetch_constant.size << 2);
return false;
}
if (vfetch_current_queued) {
// so far, i have never seen vfetch_current_queued > 4. 1 is most common, 2 happens occasionally. did not test many games though
// pre-acquire the critical region so we're not repeatedly re-acquiring it
// in requestrange
auto shared_memory_request_range_hoisted =
global_critical_region::Acquire();
for (uint32_t i = 0; i < vfetch_current_queued; ++i) {
if (!shared_memory_->RequestRange(vfetch_addresses[i] << 2,
vfetch_sizes[i] << 2)) {
XELOGE(
"Failed to request vertex buffer at 0x{:08X} (size {}) in the "
"shared memory",
vfetch_addresses[i] << 2, vfetch_sizes[i] << 2);
return false;
}
}
}
}
// Gather memexport ranges and ensure the heaps for them are resident, and
// also load the data surrounding the export and to fill the regions that
// won't be modified by the shaders.
@ -3076,24 +3097,6 @@ void D3D12CommandProcessor::InitializeTrace() {
shared_memory_->InitializeTraceCompleteDownloads();
}
}
static void DmaPrefunc(dma::XeDMAJob* job) {
D3D12_RANGE readback_range;
readback_range.Begin = 0;
readback_range.End = job->size;
void* readback_mapping;
ID3D12Resource* readback_buffer = (ID3D12Resource*)job->userdata1;
HRESULT mapres = readback_buffer->Map(0, &readback_range, &readback_mapping);
xenia_assert(SUCCEEDED(mapres));
job->source = (uint8_t*)readback_mapping;
}
static void DmaPostfunc(dma::XeDMAJob* job) {
D3D12_RANGE readback_write_range = {};
ID3D12Resource* readback_buffer = (ID3D12Resource*)job->userdata1;
readback_buffer->Unmap(0, &readback_write_range);
}
bool D3D12CommandProcessor::IssueCopy() {
#if XE_UI_D3D12_FINE_GRAINED_DRAW_SCOPES
@ -3102,7 +3105,6 @@ bool D3D12CommandProcessor::IssueCopy() {
if (!BeginSubmission(true)) {
return false;
}
if (!cvars::d3d12_readback_resolve) {
uint32_t written_address, written_length;
return render_target_cache_->Resolve(*memory_, *shared_memory_,
@ -3129,34 +3131,21 @@ bool D3D12CommandProcessor::IssueCopy_ReadbackResolvePath() {
readback_buffer, 0, shared_memory_buffer, written_address,
written_length);
if (AwaitAllQueueOperationsCompletion()) {
#if 1
D3D12_RANGE readback_range;
readback_range.Begin = 0;
readback_range.End = written_length;
void* readback_mapping;
if (SUCCEEDED(
readback_buffer->Map(0, &readback_range, &readback_mapping))) {
// chrispy: this memcpy needs to be optimized as much as possible
D3D12_RANGE readback_range;
readback_range.Begin = 0;
readback_range.End = written_length;
void* readback_mapping;
if (SUCCEEDED(readback_buffer->Map(0, &readback_range,
&readback_mapping))) {
// chrispy: this memcpy needs to be optimized as much as possible
auto physaddr = memory_->TranslatePhysical(written_address);
dma::vastcpy(physaddr, (uint8_t*)readback_mapping, written_length);
// XEDmaCpy(physaddr, readback_mapping, written_length);
D3D12_RANGE readback_write_range = {};
readback_buffer->Unmap(0, &readback_write_range);
}
#else
dma::XeDMAJob job{};
job.destination = memory_->TranslatePhysical(written_address);
job.size = written_length;
job.source = nullptr;
job.userdata1 = (void*)readback_buffer;
job.precall = DmaPrefunc;
job.postcall = DmaPostfunc;
readback_available_ = GetDMAC()->PushDMAJob(&job);
#endif
auto physaddr = memory_->TranslatePhysical(written_address);
memory::vastcpy(physaddr, (uint8_t*)readback_mapping,
written_length);
// XEDmaCpy(physaddr, readback_mapping, written_length);
D3D12_RANGE readback_write_range = {};
readback_buffer->Unmap(0, &readback_write_range);
}
}
}
}
@ -3833,7 +3822,8 @@ XE_NOINLINE void D3D12CommandProcessor::UpdateSystemConstantValues_Impl(
uint32_t user_clip_plane_index;
while (xe::bit_scan_forward(user_clip_planes_remaining,
&user_clip_plane_index)) {
user_clip_planes_remaining &= ~(UINT32_C(1) << user_clip_plane_index);
user_clip_planes_remaining =
xe::clear_lowest_bit(user_clip_planes_remaining);
const float* user_clip_plane =
&regs[XE_GPU_REG_PA_CL_UCP_0_X + user_clip_plane_index * 4].f32;
if (std::memcmp(user_clip_plane_write_ptr, user_clip_plane,
@ -3917,7 +3907,7 @@ XE_NOINLINE void D3D12CommandProcessor::UpdateSystemConstantValues_Impl(
uint32_t textures_remaining = used_texture_mask;
uint32_t texture_index;
while (xe::bit_scan_forward(textures_remaining, &texture_index)) {
textures_remaining &= ~(uint32_t(1) << texture_index);
textures_remaining = xe::clear_lowest_bit(textures_remaining);
uint32_t& texture_signs_uint =
system_constants_.texture_swizzled_signs[texture_index >> 2];
uint32_t texture_signs_shift = (texture_index & 3) * 8;
@ -5116,12 +5106,6 @@ ID3D12Resource* D3D12CommandProcessor::RequestReadbackBuffer(uint32_t size) {
if (size == 0) {
return nullptr;
}
#if 1
if (readback_available_) {
GetDMAC()->WaitJobDone(readback_available_);
readback_available_ = 0;
}
#endif
size = xe::align(size, kReadbackBufferSizeIncrement);
if (size > readback_buffer_size_) {
const ui::d3d12::D3D12Provider& provider = GetD3D12Provider();

View File

@ -21,7 +21,6 @@
#include <utility>
#include "xenia/base/assert.h"
#include "xenia/base/dma.h"
#include "xenia/gpu/command_processor.h"
#include "xenia/gpu/d3d12/d3d12_graphics_system.h"
#include "xenia/gpu/d3d12/d3d12_primitive_processor.h"
@ -50,8 +49,10 @@ struct MemExportRange {
uint32_t size_dwords;
};
class D3D12CommandProcessor final : public CommandProcessor {
public:
protected:
#include "../pm4_command_processor_declare.h"
public:
explicit D3D12CommandProcessor(D3D12GraphicsSystem* graphics_system,
kernel::KernelState* kernel_state);
~D3D12CommandProcessor();
@ -232,8 +233,8 @@ class D3D12CommandProcessor final : public CommandProcessor {
uint32_t base,
uint32_t num_registers);
XE_NOINLINE
virtual void WriteOneRegisterFromRing(uint32_t base,
uint32_t num_times) override;
void WriteOneRegisterFromRing(uint32_t base,
uint32_t num_times);
XE_FORCEINLINE
void WriteALURangeFromRing(xe::RingBuffer* ring, uint32_t base,
@ -677,7 +678,6 @@ class D3D12CommandProcessor final : public CommandProcessor {
static constexpr uint32_t kReadbackBufferSizeIncrement = 16 * 1024 * 1024;
ID3D12Resource* readback_buffer_ = nullptr;
dma::DMACJobHandle readback_available_ = 0;
uint32_t readback_buffer_size_ = 0;
std::atomic<bool> pix_capture_requested_ = false;

View File

@ -407,15 +407,14 @@ bool D3D12SharedMemory::AllocateSparseHostGpuMemoryRange(
bool D3D12SharedMemory::UploadRanges(
const std::pair<uint32_t, uint32_t>* upload_page_ranges,
unsigned num_upload_page_ranges) {
uint32_t num_upload_page_ranges) {
if (!num_upload_page_ranges) {
return true;
}
CommitUAVWritesAndTransitionBuffer(D3D12_RESOURCE_STATE_COPY_DEST);
command_processor_.SubmitBarriers();
auto& command_list = command_processor_.GetDeferredCommandList();
// for (auto upload_range : upload_page_ranges) {
for (unsigned int i = 0; i < num_upload_page_ranges; ++i) {
for (uint32_t i = 0; i < num_upload_page_ranges; ++i) {
auto& upload_range = upload_page_ranges[i];
uint32_t upload_range_start = upload_range.first;
uint32_t upload_range_length = upload_range.second;
@ -437,7 +436,7 @@ bool D3D12SharedMemory::UploadRanges(
uint32_t(upload_buffer_size), false, false);
if (upload_buffer_size < (1ULL << 32) && upload_buffer_size > 8192) {
dma::vastcpy(
memory::vastcpy(
upload_buffer_mapping,
memory().TranslatePhysical(upload_range_start << page_size_log2()),
static_cast<uint32_t>(upload_buffer_size));

View File

@ -91,8 +91,8 @@ class D3D12SharedMemory : public SharedMemory {
bool AllocateSparseHostGpuMemoryRange(uint32_t offset_allocations,
uint32_t length_allocations) override;
bool UploadRanges(const std::pair<uint32_t, uint32_t>*
upload_page_ranges, unsigned num_ranges) override;
bool UploadRanges(const std::pair<uint32_t, uint32_t>* upload_page_ranges,
uint32_t num_ranges) override;
private:
D3D12CommandProcessor& command_processor_;

View File

@ -491,7 +491,7 @@ void D3D12TextureCache::RequestTextures(uint32_t used_texture_mask) {
uint32_t textures_remaining = used_texture_mask;
uint32_t index;
while (xe::bit_scan_forward(textures_remaining, &index)) {
textures_remaining &= ~(uint32_t(1) << index);
textures_remaining = xe::clear_lowest_bit(textures_remaining);
const TextureBinding* binding = GetValidTextureBinding(index);
if (!binding) {
continue;

View File

@ -9,21 +9,14 @@
#include "xenia/gpu/draw_util.h"
#include <algorithm>
#include <cmath>
#include <cstring>
#include "xenia/base/assert.h"
#include "xenia/base/cvar.h"
#include "xenia/base/logging.h"
#include "xenia/base/math.h"
#include "xenia/base/memory.h"
#include "xenia/gpu/gpu_flags.h"
#include "xenia/gpu/registers.h"
#include "xenia/gpu/texture_cache.h"
#include "xenia/gpu/texture_info.h"
#include "xenia/gpu/texture_util.h"
#include "xenia/gpu/xenos.h"
#include "xenia/ui/graphics_util.h"
// Very prominent in 545407F2.

View File

@ -16,7 +16,6 @@
#include "xenia/base/assert.h"
#include "xenia/gpu/register_file.h"
#include "xenia/gpu/registers.h"
#include "xenia/gpu/shader.h"
#include "xenia/gpu/trace_writer.h"
#include "xenia/gpu/xenos.h"

View File

@ -1,9 +1,14 @@
void ExecuteIndirectBuffer(uint32_t ptr, uint32_t count) XE_RESTRICT;
virtual uint32_t ExecutePrimaryBuffer(uint32_t start_index, uint32_t end_index) XE_RESTRICT;
virtual bool ExecutePacket();
XE_NOINLINE
public:
void ExecutePacket(uint32_t ptr, uint32_t count);
protected:
XE_NOINLINE
bool ExecutePacketType0( uint32_t packet) XE_RESTRICT;
XE_NOINLINE
bool ExecutePacketType1( uint32_t packet) XE_RESTRICT;
@ -103,4 +108,7 @@ uint32_t GetCurrentRingReadCount();
XE_NOINLINE
XE_COLD
bool ExecutePacketType3_CountOverflow(uint32_t count);
bool ExecutePacketType3_CountOverflow(uint32_t count);
XE_NOINLINE
XE_COLD
bool ExecutePacketType0_CountOverflow(uint32_t count);

View File

@ -75,33 +75,45 @@ bool COMMAND_PROCESSOR::ExecutePacket() {
}
}
XE_NOINLINE
XE_COLD
bool COMMAND_PROCESSOR::ExecutePacketType0_CountOverflow(uint32_t count) {
XELOGE("ExecutePacketType0 overflow (read count {:08X}, packet count {:08X})",
COMMAND_PROCESSOR::GetCurrentRingReadCount(),
count * sizeof(uint32_t));
return false;
}
/*
Todo: optimize this function this one along with execute packet type III are the most frequently called functions for PM4
*/
XE_NOINLINE
bool COMMAND_PROCESSOR::ExecutePacketType0(uint32_t packet) XE_RESTRICT {
// Type-0 packet.
// Write count registers in sequence to the registers starting at
// (base_index << 2).
uint32_t count = ((packet >> 16) & 0x3FFF) + 1;
if (COMMAND_PROCESSOR::GetCurrentRingReadCount() < count * sizeof(uint32_t)) {
XELOGE(
"ExecutePacketType0 overflow (read count {:08X}, packet count {:08X})",
COMMAND_PROCESSOR::GetCurrentRingReadCount(), count * sizeof(uint32_t));
return false;
}
trace_writer_.WritePacketStart(uint32_t(reader_.read_ptr() - 4), 1 + count);
uint32_t base_index = (packet & 0x7FFF);
uint32_t write_one_reg = (packet >> 15) & 0x1;
if (COMMAND_PROCESSOR::GetCurrentRingReadCount() >=
count * sizeof(uint32_t)) {
trace_writer_.WritePacketStart(uint32_t(reader_.read_ptr() - 4), 1 + count);
if (!write_one_reg) {
COMMAND_PROCESSOR::WriteRegisterRangeFromRing(&reader_, base_index, count);
uint32_t base_index = (packet & 0x7FFF);
uint32_t write_one_reg = (packet >> 15) & 0x1;
if (!write_one_reg) {
COMMAND_PROCESSOR::WriteRegisterRangeFromRing(&reader_, base_index,
count);
} else {
COMMAND_PROCESSOR::WriteOneRegisterFromRing(base_index, count);
}
trace_writer_.WritePacketEnd();
return true;
} else {
COMMAND_PROCESSOR::WriteOneRegisterFromRing(base_index, count);
return COMMAND_PROCESSOR::ExecutePacketType0_CountOverflow(count);
}
trace_writer_.WritePacketEnd();
return true;
}
XE_NOINLINE
bool COMMAND_PROCESSOR::ExecutePacketType1(uint32_t packet) XE_RESTRICT {
@ -430,6 +442,11 @@ bool COMMAND_PROCESSOR::ExecutePacketType3_INDIRECT_BUFFER(
XE_NOINLINE
static bool MatchValueAndRef(uint32_t value, uint32_t ref, uint32_t wait_info) {
/*
Todo: should subtract values from each other twice with the sides inverted and then create a mask from the sign bits
then use the wait_info value in order to select the bits that correctly implement the condition
If neither subtraction has the signbit set then that means the value is equal
*/
bool matched = false;
switch (wait_info & 0x7) {
case 0x0: // Never.
@ -1058,6 +1075,10 @@ bool COMMAND_PROCESSOR::ExecutePacketType3_IM_LOAD_IMMEDIATE(
return true;
}
/*
todo: shouldn't this do something?
*/
bool COMMAND_PROCESSOR::ExecutePacketType3_INVALIDATE_STATE(
uint32_t packet, uint32_t count) XE_RESTRICT {
// selective invalidation of state pointers
@ -1099,3 +1120,76 @@ bool COMMAND_PROCESSOR::ExecutePacketType3_VIZ_QUERY(
return true;
}
uint32_t COMMAND_PROCESSOR::ExecutePrimaryBuffer(uint32_t read_index,
uint32_t write_index) {
SCOPE_profile_cpu_f("gpu");
#if XE_ENABLE_TRACE_WRITER_INSTRUMENTATION == 1
// If we have a pending trace stream open it now. That way we ensure we get
// all commands.
if (!trace_writer_.is_open() && trace_state_ == TraceState::kStreaming) {
uint32_t title_id = kernel_state_->GetExecutableModule()
? kernel_state_->GetExecutableModule()->title_id()
: 0;
auto file_name = fmt::format("{:08X}_stream.xtr", title_id);
auto path = trace_stream_path_ / file_name;
trace_writer_.Open(path, title_id);
InitializeTrace();
}
#endif
// Adjust pointer base.
uint32_t start_ptr = primary_buffer_ptr_ + read_index * sizeof(uint32_t);
start_ptr = (primary_buffer_ptr_ & ~0x1FFFFFFF) | (start_ptr & 0x1FFFFFFF);
uint32_t end_ptr = primary_buffer_ptr_ + write_index * sizeof(uint32_t);
end_ptr = (primary_buffer_ptr_ & ~0x1FFFFFFF) | (end_ptr & 0x1FFFFFFF);
trace_writer_.WritePrimaryBufferStart(start_ptr, write_index - read_index);
// Execute commands!
RingBuffer old_reader = reader_;
new (&reader_) RingBuffer(memory_->TranslatePhysical(primary_buffer_ptr_),
primary_buffer_size_);
reader_.set_read_offset(read_index * sizeof(uint32_t));
reader_.set_write_offset(write_index * sizeof(uint32_t));
// prefetch the wraparound range
// it likely is already in L3 cache, but in a zen system it may be another
// chiplets l3
reader_.BeginPrefetchedRead<swcache::PrefetchTag::Level2>(
GetCurrentRingReadCount());
do {
if (!COMMAND_PROCESSOR::ExecutePacket()) {
// This probably should be fatal - but we're going to continue anyways.
XELOGE("**** PRIMARY RINGBUFFER: Failed to execute packet.");
assert_always();
break;
}
} while (reader_.read_count());
COMMAND_PROCESSOR::OnPrimaryBufferEnd();
trace_writer_.WritePrimaryBufferEnd();
reader_ = old_reader;
return write_index;
}
void COMMAND_PROCESSOR::ExecutePacket(uint32_t ptr, uint32_t count) {
// Execute commands!
RingBuffer old_reader = reader_;
new (&reader_)
RingBuffer{memory_->TranslatePhysical(ptr), count * sizeof(uint32_t)};
reader_.set_write_offset(count * sizeof(uint32_t));
do {
if (!COMMAND_PROCESSOR::ExecutePacket()) {
XELOGE("**** ExecutePacket: Failed to execute packet.");
assert_always();
break;
}
} while (reader_.read_count());
reader_ = old_reader;
}

View File

@ -11,7 +11,6 @@
#define XENIA_GPU_SAMPLER_INFO_H_
#include "xenia/gpu/shader.h"
#include "xenia/gpu/xenos.h"
namespace xe {
namespace gpu {

View File

@ -24,7 +24,6 @@
#include "xenia/base/string_buffer.h"
#include "xenia/gpu/registers.h"
#include "xenia/gpu/ucode.h"
#include "xenia/gpu/xenos.h"
namespace xe {
namespace gpu {

View File

@ -13,13 +13,6 @@
#include <cmath>
#include <cstring>
#include "xenia/base/assert.h"
#include "xenia/base/byte_order.h"
#include "xenia/base/math.h"
#include "xenia/gpu/registers.h"
#include "xenia/gpu/trace_writer.h"
#include "xenia/gpu/xenos.h"
namespace xe {
namespace gpu {

View File

@ -14,12 +14,9 @@
#include <cstddef>
#include <cstdint>
#include "xenia/base/assert.h"
#include "xenia/gpu/register_file.h"
#include "xenia/gpu/shader.h"
#include "xenia/gpu/trace_writer.h"
#include "xenia/gpu/ucode.h"
#include "xenia/gpu/xenos.h"
#include "xenia/memory.h"
namespace xe {

View File

@ -9,15 +9,9 @@
#include "xenia/gpu/shader_translator.h"
#include <algorithm>
#include <cstdarg>
#include <cstring>
#include <set>
#include <string>
#include "xenia/base/assert.h"
#include "xenia/base/logging.h"
#include "xenia/base/math.h"
#include "xenia/gpu/gpu_flags.h"
namespace xe {

View File

@ -11,16 +11,7 @@
#define XENIA_GPU_SHADER_TRANSLATOR_H_
#include <memory>
#include <set>
#include <string>
#include <vector>
#include "xenia/base/math.h"
#include "xenia/base/string_buffer.h"
#include "xenia/gpu/registers.h"
#include "xenia/gpu/shader.h"
#include "xenia/gpu/ucode.h"
#include "xenia/gpu/xenos.h"
namespace xe {
namespace gpu {

View File

@ -10,15 +10,11 @@
#include "xenia/gpu/shared_memory.h"
#include <algorithm>
#include <utility>
#include "xenia/base/assert.h"
#include "xenia/base/bit_range.h"
#include "xenia/base/logging.h"
#include "xenia/base/math.h"
#include "xenia/base/memory.h"
#include "xenia/base/profiling.h"
#include "xenia/memory.h"
namespace xe {
namespace gpu {

View File

@ -10,12 +10,6 @@
#ifndef XENIA_GPU_SHARED_MEMORY_H_
#define XENIA_GPU_SHARED_MEMORY_H_
#include <cstdint>
#include <mutex>
#include <utility>
#include <vector>
#include "xenia/base/mutex.h"
#include "xenia/memory.h"
namespace xe {
@ -141,7 +135,7 @@ class SharedMemory {
// overall bounds of pages to be uploaded.
virtual bool UploadRanges(
const std::pair<uint32_t, uint32_t>* upload_page_ranges,
unsigned num_upload_ranges) = 0;
uint32_t num_upload_ranges) = 0;
const std::vector<std::pair<uint32_t, uint32_t>>& trace_download_ranges() {
return trace_download_ranges_;
@ -183,14 +177,10 @@ class SharedMemory {
FixedVMemVector<MAX_UPLOAD_RANGES * sizeof(std::pair<uint32_t, uint32_t>)>
upload_ranges_;
// GPU-written memory downloading for traces. <Start address, length>.
std::vector<std::pair<uint32_t, uint32_t>> trace_download_ranges_;
uint32_t trace_download_page_count_ = 0;
// Mutex between the guest memory subsystem and the command processor, to be
// locked when checking or updating validity of pages/ranges and when firing
// watches.
xe::global_critical_region global_critical_region_;
static constexpr xe::global_critical_region global_critical_region_{};
// ***************************************************************************
// Things below should be fully protected by global_critical_region.
@ -266,6 +256,11 @@ class SharedMemory {
uint32_t watch_node_current_pool_allocated_ = 0;
WatchRange* watch_range_first_free_ = nullptr;
WatchNode* watch_node_first_free_ = nullptr;
// GPU-written memory downloading for traces. <Start address, length>.
std::vector<std::pair<uint32_t, uint32_t>> trace_download_ranges_;
uint32_t trace_download_page_count_ = 0;
// Triggers the watches (global and per-range), removing triggered range
// watches.
void FireWatches(uint32_t page_first, uint32_t page_last,

View File

@ -9,21 +9,11 @@
#include "xenia/gpu/texture_cache.h"
#include <algorithm>
#include <cstdint>
#include <utility>
#include "xenia/base/assert.h"
#include "xenia/base/clock.h"
#include "xenia/base/cvar.h"
#include "xenia/base/logging.h"
#include "xenia/base/math.h"
#include "xenia/base/profiling.h"
#include "xenia/gpu/gpu_flags.h"
#include "xenia/gpu/register_file.h"
#include "xenia/gpu/texture_info.h"
#include "xenia/gpu/texture_util.h"
#include "xenia/gpu/xenos.h"
DEFINE_int32(
draw_resolution_scale_x, 1,
@ -332,9 +322,13 @@ void TextureCache::RequestTextures(uint32_t used_texture_mask) {
uint32_t bindings_changed = 0;
uint32_t textures_remaining = used_texture_mask & ~texture_bindings_in_sync_;
uint32_t index = 0;
Texture* textures_to_load[64]; // max bits = 32, can be unsigned + signed
// means max array size = 64
uint32_t num_textures_to_load = 0;
while (xe::bit_scan_forward(textures_remaining, &index)) {
uint32_t index_bit = UINT32_C(1) << index;
textures_remaining &= ~index_bit;
textures_remaining = xe::clear_lowest_bit(textures_remaining);
TextureBinding& binding = texture_bindings_[index];
const auto& fetch = regs.Get<xenos::xe_gpu_texture_fetch_t>(
XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0 + index * 6);
@ -406,12 +400,15 @@ void TextureCache::RequestTextures(uint32_t used_texture_mask) {
binding.texture_signed = nullptr;
}
if (load_unsigned_data && binding.texture != nullptr) {
LoadTextureData(*binding.texture);
textures_to_load[num_textures_to_load++] = binding.texture;
}
if (load_signed_data && binding.texture_signed != nullptr) {
LoadTextureData(*binding.texture_signed);
textures_to_load[num_textures_to_load++] = binding.texture_signed;
}
}
LoadTexturesData(textures_to_load, num_textures_to_load);
if (bindings_changed) {
UpdateTextureBindingsImpl(bindings_changed);
}
@ -643,7 +640,134 @@ TextureCache::Texture* TextureCache::FindOrCreateTexture(TextureKey key) {
texture->LogAction("Created");
return texture;
}
void TextureCache::LoadTexturesData(Texture** textures, uint32_t n_textures) {
assert_true(n_textures <= 64);
if (n_textures < 2) {
if (!n_textures) {
return;
} else {
LoadTextureData(*textures[0]);
return;
}
}
uint64_t index_base_outdated = 0;
uint64_t index_mips_outdated = 0;
uint32_t nkept = 0;
{
auto global_lock = global_critical_region_.Acquire();
for (uint32_t i = 0; i < n_textures; ++i) {
Texture* current = textures[i];
auto base_outdated = current->base_outdated(global_lock);
auto mips_outdated = current->mips_outdated(global_lock);
index_base_outdated |= static_cast<uint64_t>(base_outdated) << i;
index_mips_outdated |= static_cast<uint64_t>(mips_outdated) << i;
if (!base_outdated && !mips_outdated) {
textures[i] = nullptr;
} else {
nkept++;
}
}
}
if (nkept == 0) {
return;
}
for (uint32_t i = 0; i < n_textures; ++i) {
Texture* p_texture = textures[i];
if (!p_texture) {
continue;
}
textures[i] = nullptr;
Texture& texture = *p_texture;
TextureKey texture_key = texture.key();
// Implementation may load multiple blocks at once via accesses of up to 128
// bits (R32G32B32A32_UINT), so aligning the size to this value to make sure
// if the texture is small (especially if it's linear), the last blocks
// won't be cut off (hosts may return 0, 0, 0, 0 for the whole
// R32G32B32A32_UINT access for the non-16-aligned tail even if 1...15 bytes
// are actually provided for it).
// Request uploading of the texture data to the shared memory.
// This is also necessary when resolution scaling is used - the texture
// cache relies on shared memory for invalidation of both unscaled and
// scaled textures. Plus a texture may be unscaled partially, when only a
// portion of its pages is invalidated, in this case we'll need the texture
// from the shared memory to load the unscaled parts.
// TODO(Triang3l): Load unscaled parts.
bool base_resolved = texture.GetBaseResolved();
if (index_base_outdated & (1ULL << i)) {
if (!shared_memory().RequestRange(
texture_key.base_page << 12,
xe::align(texture.GetGuestBaseSize(), UINT32_C(16)),
texture_key.scaled_resolve ? nullptr : &base_resolved)) {
continue;
}
}
bool mips_resolved = texture.GetMipsResolved();
if (index_mips_outdated & (1ULL << i)) {
if (!shared_memory().RequestRange(
texture_key.mip_page << 12,
xe::align(texture.GetGuestMipsSize(), UINT32_C(16)),
texture_key.scaled_resolve ? nullptr : &mips_resolved)) {
continue;
}
}
if (texture_key.scaled_resolve) {
// Make sure all the scaled resolve memory is resident and accessible from
// the shader, including any possible padding that hasn't yet been touched
// by an actual resolve, but is still included in the texture size, so the
// GPU won't be trying to access unmapped memory.
if (!EnsureScaledResolveMemoryCommitted(texture_key.base_page << 12,
texture.GetGuestBaseSize(), 4)) {
continue;
}
if (!EnsureScaledResolveMemoryCommitted(texture_key.mip_page << 12,
texture.GetGuestMipsSize(), 4)) {
continue;
}
}
// Actually load the texture data.
if (!LoadTextureDataFromResidentMemoryImpl(
texture, (index_base_outdated & (1ULL << i)) != 0,
(index_mips_outdated & (1ULL << i)) != 0)) {
continue;
}
// Update the source of the texture (resolve vs. CPU or memexport) for
// purposes of handling piecewise gamma emulation via sRGB and for
// resolution scale in sampling offsets.
if (!texture_key.scaled_resolve) {
texture.SetBaseResolved(base_resolved);
texture.SetMipsResolved(mips_resolved);
}
// reque for makeuptodatandwatch
textures[i] = &texture;
}
{
auto crit = global_critical_region_.Acquire();
for (uint32_t i = 0; i < n_textures; ++i) {
auto texture = textures[i];
if (!texture) {
continue;
}
// Mark the ranges as uploaded and watch them. This is needed for scaled
// resolves as well to detect when the CPU wants to reuse the memory for a
// regular texture or a vertex buffer, and thus the scaled resolve version
// is not up to date anymore.
texture->MakeUpToDateAndWatch(crit);
texture->LogAction("Loaded");
}
}
}
bool TextureCache::LoadTextureData(Texture& texture) {
// Check what needs to be uploaded.
bool base_outdated, mips_outdated;

View File

@ -559,6 +559,7 @@ class TextureCache {
return load_shader_info_[load_shader_index];
}
bool LoadTextureData(Texture& texture);
void LoadTexturesData(Texture** textures, uint32_t n_textures);
// Writes the texture data (for base, mips or both - but not neither) from the
// shared memory or the scaled resolve memory. The shared memory management is
// done outside this function, the implementation just needs to load the data

View File

@ -8,14 +8,7 @@
*/
#include "xenia/gpu/texture_info.h"
#include <algorithm>
#include <cmath>
#include <cstring>
#include "xenia/base/logging.h"
#include "xenia/base/math.h"
#include "xenia/base/memory.h"
#include "xenia/base/xxhash.h"
namespace xe {

View File

@ -13,7 +13,6 @@
#include <array>
#include <cstring>
#include <memory>
#include "xenia/base/assert.h"
#include "xenia/gpu/xenos.h"
namespace xe {

View File

@ -8,13 +8,6 @@
*/
#include "xenia/gpu/texture_util.h"
#include <algorithm>
#include <cstring>
#include "xenia/base/assert.h"
#include "xenia/base/math.h"
namespace xe {
namespace gpu {
namespace texture_util {

View File

@ -10,11 +10,6 @@
#ifndef XENIA_GPU_UCODE_H_
#define XENIA_GPU_UCODE_H_
#include <cstdint>
#include "xenia/base/assert.h"
#include "xenia/base/math.h"
#include "xenia/base/platform.h"
#include "xenia/gpu/xenos.h"
// The XNA Game Studio 3.1 contains Graphics.ShaderCompiler.AssembleFromSource,

View File

@ -46,6 +46,9 @@ namespace gpu {
namespace vulkan {
class VulkanCommandProcessor final : public CommandProcessor {
protected:
#include "../pm4_command_processor_declare.h"
public:
// Single-descriptor layouts for use within a single frame.
enum class SingleTransientDescriptorLayout {
@ -53,7 +56,6 @@ class VulkanCommandProcessor final : public CommandProcessor {
kStorageBufferCompute,
kCount,
};
#include "../pm4_command_processor_declare.h"
class ScratchBufferAcquisition {
public:

View File

@ -377,7 +377,7 @@ bool VulkanSharedMemory::AllocateSparseHostGpuMemoryRange(
bool VulkanSharedMemory::UploadRanges(
const std::pair<uint32_t, uint32_t>* upload_page_ranges,
unsigned num_upload_ranges) {
uint32_t num_upload_ranges) {
if (!num_upload_ranges) {
return true;
}

View File

@ -62,8 +62,8 @@ class VulkanSharedMemory : public SharedMemory {
bool AllocateSparseHostGpuMemoryRange(uint32_t offset_allocations,
uint32_t length_allocations) override;
bool UploadRanges(const std::pair<uint32_t, uint32_t>*
upload_page_ranges, unsigned num_ranges) override;
bool UploadRanges(const std::pair<uint32_t, uint32_t>* upload_page_ranges,
uint32_t num_ranges) override;
private:
void GetUsageMasks(Usage usage, VkPipelineStageFlags& stage_mask,

View File

@ -9,10 +9,6 @@
#include "xenia/gpu/xenos.h"
#include <cmath>
#include "xenia/base/math.h"
namespace xe {
namespace gpu {
namespace xenos {

View File

@ -10,13 +10,10 @@
#ifndef XENIA_GPU_XENOS_H_
#define XENIA_GPU_XENOS_H_
#include <algorithm>
#include "xenia/base/assert.h"
#include "xenia/base/byte_order.h"
#include "xenia/base/math.h"
#include "xenia/base/memory.h"
#include "xenia/base/platform.h"
#include "xenia/base/math.h"
namespace xe {
namespace gpu {

View File

@ -264,8 +264,9 @@ ObjectTable::ObjectTableEntry* ObjectTable::LookupTable(X_HANDLE handle) {
// Generic lookup
template <>
object_ref<XObject> ObjectTable::LookupObject<XObject>(X_HANDLE handle) {
auto object = ObjectTable::LookupObject(handle, false);
object_ref<XObject> ObjectTable::LookupObject<XObject>(
X_HANDLE handle, bool already_locked) {
auto object = ObjectTable::LookupObject(handle, already_locked);
auto result = object_ref<XObject>(reinterpret_cast<XObject*>(object));
return result;
}

View File

@ -46,10 +46,9 @@ class ObjectTable {
// Restores a XObject reference with a handle. Mainly for internal use - do
// not use.
X_STATUS RestoreHandle(X_HANDLE handle, XObject* object);
template <typename T>
object_ref<T> LookupObject(X_HANDLE handle) {
auto object = LookupObject(handle, false);
object_ref<T> LookupObject(X_HANDLE handle, bool already_locked = false) {
auto object = LookupObject(handle, already_locked);
if (T::kObjectType == XObject::Type::Socket) {
object = LookupObject((handle | 0xF8000000), false);
}
@ -110,7 +109,8 @@ class ObjectTable {
// Generic lookup
template <>
object_ref<XObject> ObjectTable::LookupObject<XObject>(X_HANDLE handle);
object_ref<XObject> ObjectTable::LookupObject<XObject>(
X_HANDLE handle, bool already_locked);
} // namespace util
} // namespace kernel

View File

@ -54,7 +54,15 @@ DECLARE_XBOXKRNL_EXPORT1(XAudioEnableDucker, kAudio, kStub);
dword_result_t XAudioRegisterRenderDriverClient_entry(lpdword_t callback_ptr,
lpdword_t driver_ptr) {
if (!callback_ptr) {
return X_E_INVALIDARG;
}
uint32_t callback = callback_ptr[0];
if (!callback) {
return X_E_INVALIDARG;
}
uint32_t callback_arg = callback_ptr[1];
auto audio_system = kernel_state()->emulator()->audio_system();

View File

@ -515,7 +515,26 @@ dword_result_t NtPulseEvent_entry(dword_t handle,
}
DECLARE_XBOXKRNL_EXPORT2(NtPulseEvent, kThreading, kImplemented,
kHighFrequency);
dword_result_t NtQueryEvent_entry(dword_t handle, lpdword_t out_struc) {
X_STATUS result = X_STATUS_SUCCESS;
auto ev = kernel_state()->object_table()->LookupObject<XEvent>(handle);
if (ev) {
uint32_t type_tmp, state_tmp;
ev->Query(&type_tmp, &state_tmp);
out_struc[0] = type_tmp;
out_struc[1] = state_tmp;
} else {
result = X_STATUS_INVALID_HANDLE;
}
return result;
}
DECLARE_XBOXKRNL_EXPORT2(NtQueryEvent, kThreading, kImplemented,
kHighFrequency);
uint32_t xeNtClearEvent(uint32_t handle) {
X_STATUS result = X_STATUS_SUCCESS;
@ -832,23 +851,25 @@ dword_result_t KeWaitForMultipleObjects_entry(
lpqword_t timeout_ptr, lpvoid_t wait_block_array_ptr) {
assert_true(wait_type <= 1);
std::vector<object_ref<XObject>> objects;
for (uint32_t n = 0; n < count; n++) {
auto object_ptr = kernel_memory()->TranslateVirtual(objects_ptr[n]);
auto object_ref =
XObject::GetNativeObject<XObject>(kernel_state(), object_ptr);
if (!object_ref) {
return X_STATUS_INVALID_PARAMETER;
assert_true(count <= 64);
object_ref<XObject> objects[64];
{
auto crit = global_critical_region::AcquireDirect();
for (uint32_t n = 0; n < count; n++) {
auto object_ptr = kernel_memory()->TranslateVirtual(objects_ptr[n]);
auto object_ref = XObject::GetNativeObject<XObject>(kernel_state(),
object_ptr, -1, true);
if (!object_ref) {
return X_STATUS_INVALID_PARAMETER;
}
objects[n] = std::move(object_ref);
}
objects.push_back(std::move(object_ref));
}
uint64_t timeout = timeout_ptr ? static_cast<uint64_t>(*timeout_ptr) : 0u;
return XObject::WaitMultiple(uint32_t(objects.size()),
reinterpret_cast<XObject**>(objects.data()),
wait_type, wait_reason, processor_mode,
alertable, timeout_ptr ? &timeout : nullptr);
return XObject::WaitMultiple(
uint32_t(count), reinterpret_cast<XObject**>(&objects[0]), wait_type,
wait_reason, processor_mode, alertable, timeout_ptr ? &timeout : nullptr);
}
DECLARE_XBOXKRNL_EXPORT3(KeWaitForMultipleObjects, kThreading, kImplemented,
kBlocking, kHighFrequency);
@ -859,19 +880,32 @@ uint32_t xeNtWaitForMultipleObjectsEx(uint32_t count, xe::be<uint32_t>* handles,
uint64_t* timeout_ptr) {
assert_true(wait_type <= 1);
std::vector<object_ref<XObject>> objects;
for (uint32_t n = 0; n < count; n++) {
uint32_t object_handle = handles[n];
auto object =
kernel_state()->object_table()->LookupObject<XObject>(object_handle);
if (!object) {
return X_STATUS_INVALID_PARAMETER;
assert_true(count <= 64);
object_ref<XObject> objects[64];
/*
Reserving to squash the constant reallocations, in a benchmark of one
particular game over a period of five minutes roughly 11% of CPU time was
spent inside a helper function to Windows' heap allocation function. 7% of
that time was traced back to here
edit: actually switched to fixed size array, as there can never be more
than 64 events specified
*/
{
auto crit = global_critical_region::AcquireDirect();
for (uint32_t n = 0; n < count; n++) {
uint32_t object_handle = handles[n];
auto object = kernel_state()->object_table()->LookupObject<XObject>(
object_handle, true);
if (!object) {
return X_STATUS_INVALID_PARAMETER;
}
objects[n] = std::move(object);
}
objects.push_back(std::move(object));
}
return XObject::WaitMultiple(count,
reinterpret_cast<XObject**>(objects.data()),
return XObject::WaitMultiple(count, reinterpret_cast<XObject**>(&objects[0]),
wait_type, 6, wait_mode, alertable, timeout_ptr);
}
@ -879,6 +913,9 @@ dword_result_t NtWaitForMultipleObjectsEx_entry(
dword_t count, lpdword_t handles, dword_t wait_type, dword_t wait_mode,
dword_t alertable, lpqword_t timeout_ptr) {
uint64_t timeout = timeout_ptr ? static_cast<uint64_t>(*timeout_ptr) : 0u;
if (!count || count > 64 || wait_type != 1 && wait_type) {
return X_STATUS_INVALID_PARAMETER;
}
return xeNtWaitForMultipleObjectsEx(count, handles, wait_type, wait_mode,
alertable,
timeout_ptr ? &timeout : nullptr);
@ -892,11 +929,14 @@ dword_result_t NtSignalAndWaitForSingleObjectEx_entry(dword_t signal_handle,
dword_t r6,
lpqword_t timeout_ptr) {
X_STATUS result = X_STATUS_SUCCESS;
// pre-lock for these two handle lookups
global_critical_region::mutex().lock();
auto signal_object =
kernel_state()->object_table()->LookupObject<XObject>(signal_handle);
auto signal_object = kernel_state()->object_table()->LookupObject<XObject>(
signal_handle, true);
auto wait_object =
kernel_state()->object_table()->LookupObject<XObject>(wait_handle);
kernel_state()->object_table()->LookupObject<XObject>(wait_handle, true);
global_critical_region::mutex().unlock();
if (signal_object && wait_object) {
uint64_t timeout = timeout_ptr ? static_cast<uint64_t>(*timeout_ptr) : 0u;
result =

View File

@ -71,7 +71,12 @@ int32_t XEvent::Reset() {
event_->Reset();
return 1;
}
void XEvent::Query(uint32_t* out_type, uint32_t* out_state) {
auto [type, state] = event_->Query();
*out_type = type;
*out_state = state;
}
void XEvent::Clear() { event_->Reset(); }
bool XEvent::Save(ByteStream* stream) {

View File

@ -36,6 +36,7 @@ class XEvent : public XObject {
int32_t Set(uint32_t priority_increment, bool wait);
int32_t Pulse(uint32_t priority_increment, bool wait);
int32_t Reset();
void Query(uint32_t* out_type, uint32_t* out_state);
void Clear();
bool Save(ByteStream* stream) override;

View File

@ -255,7 +255,8 @@ X_STATUS XObject::WaitMultiple(uint32_t count, XObject** objects,
uint32_t wait_type, uint32_t wait_reason,
uint32_t processor_mode, uint32_t alertable,
uint64_t* opt_timeout) {
std::vector<xe::threading::WaitHandle*> wait_handles(count);
xe::threading::WaitHandle* wait_handles[64];
for (size_t i = 0; i < count; ++i) {
wait_handles[i] = objects[i]->GetWaitHandle();
assert_not_null(wait_handles[i]);
@ -267,7 +268,7 @@ X_STATUS XObject::WaitMultiple(uint32_t count, XObject** objects,
: std::chrono::milliseconds::max();
if (wait_type) {
auto result = xe::threading::WaitAny(std::move(wait_handles),
auto result = xe::threading::WaitAny(wait_handles, count,
alertable ? true : false, timeout_ms);
switch (result.first) {
case xe::threading::WaitResult::kSuccess:
@ -287,7 +288,7 @@ X_STATUS XObject::WaitMultiple(uint32_t count, XObject** objects,
return X_STATUS_UNSUCCESSFUL;
}
} else {
auto result = xe::threading::WaitAll(std::move(wait_handles),
auto result = xe::threading::WaitAll(wait_handles, count,
alertable ? true : false, timeout_ms);
switch (result) {
case xe::threading::WaitResult::kSuccess:
@ -360,8 +361,8 @@ void XObject::SetNativePointer(uint32_t native_ptr, bool uninitialized) {
}
object_ref<XObject> XObject::GetNativeObject(KernelState* kernel_state,
void* native_ptr,
int32_t as_type) {
void* native_ptr, int32_t as_type,
bool already_locked) {
assert_not_null(native_ptr);
// Unfortunately the XDK seems to inline some KeInitialize calls, meaning
@ -373,10 +374,12 @@ object_ref<XObject> XObject::GetNativeObject(KernelState* kernel_state,
// We identify this by setting wait_list_flink to a magic value. When set,
// wait_list_blink will hold a handle to our object.
auto global_lock = xe::global_critical_region::AcquireDirect();
if (!already_locked) {
global_critical_region::mutex().lock();
}
auto header = reinterpret_cast<X_DISPATCH_HEADER*>(native_ptr);
XObject* result;
if (as_type == -1) {
as_type = header->type;
}
@ -385,10 +388,12 @@ object_ref<XObject> XObject::GetNativeObject(KernelState* kernel_state,
// Already initialized.
// TODO: assert if the type of the object != as_type
uint32_t handle = header->wait_list_blink;
auto object = kernel_state->object_table()->LookupObject<XObject>(handle);
result = kernel_state->object_table()
->LookupObject<XObject>(handle, true)
.release();
goto return_result;
// TODO(benvanik): assert nothing has been changed in the struct.
return object;
// return object;
} else {
// First use, create new.
// https://www.nirsoft.net/kernel_struct/vista/KOBJECTS.html
@ -430,14 +435,22 @@ object_ref<XObject> XObject::GetNativeObject(KernelState* kernel_state,
case 24: // ThreadedDpcObject
default:
assert_always();
return NULL;
result = nullptr;
goto return_result;
// return NULL;
}
// Stash pointer in struct.
// FIXME: This assumes the object contains a dispatch header (some don't!)
StashHandle(header, object->handle());
result = object;
return object_ref<XObject>(object);
return_result:
if (!already_locked) {
global_critical_region::mutex().unlock();
}
return object_ref<XObject>(result);
}
}

View File

@ -192,10 +192,12 @@ class XObject {
static object_ref<XObject> GetNativeObject(KernelState* kernel_state,
void* native_ptr,
int32_t as_type = -1);
int32_t as_type = -1,
bool already_locked = false);
template <typename T>
static object_ref<T> GetNativeObject(KernelState* kernel_state,
void* native_ptr, int32_t as_type = -1);
void* native_ptr, int32_t as_type = -1,
bool already_locked = false);
protected:
bool SaveObject(ByteStream* stream);
@ -362,9 +364,11 @@ object_ref<T> retain_object(T* ptr) {
template <typename T>
object_ref<T> XObject::GetNativeObject(KernelState* kernel_state,
void* native_ptr, int32_t as_type) {
void* native_ptr, int32_t as_type,
bool already_locked) {
return object_ref<T>(reinterpret_cast<T*>(
GetNativeObject(kernel_state, native_ptr, as_type).release()));
GetNativeObject(kernel_state, native_ptr, as_type, already_locked)
.release()));
}
} // namespace kernel

View File

@ -15,6 +15,7 @@ project("xenia-ui-d3d12")
"/Os",
"/O1"
})
filter {}
local_platform_files()
files({
"../shaders/bytecode/d3d12_5_1/*.h",

View File

@ -15,6 +15,7 @@ project("xenia-ui-vulkan")
"/Os",
"/O1"
})
filter {}
includedirs({
project_root.."/third_party/Vulkan-Headers/include",
})

View File

@ -16,6 +16,8 @@ project("xenia-vfs")
"/Os",
"/O1"
})
filter {}
recursive_platform_files()
removefiles({"vfs_dump.cc"})

View File

@ -16,6 +16,8 @@ project("capstone")
"/Os",
"/O1"
})
filter {}
includedirs({
"capstone",
"capstone/include",

2
third_party/fmt.lua vendored
View File

@ -13,6 +13,8 @@ project("fmt")
"/Os",
"/O1"
})
filter {}
includedirs({
"fmt/include",
})

View File

@ -15,6 +15,8 @@ project("glslang-spirv")
"/Os",
"/O1"
})
filter {}
files({
"glslang/SPIRV/bitutils.h",
"glslang/SPIRV/disassemble.cpp",

View File

@ -16,6 +16,7 @@ project("imgui")
"/Os",
"/O1"
})
filter{}
files({
"imgui/imconfig.h",
"imgui/imgui.cpp",