Proper(ish) VdSwap - fixes a bunch of things.
Caching is working a bit better now.
This commit is contained in:
parent
8337820500
commit
6e76c169d6
|
@ -30,10 +30,9 @@ int BufferResource::Prepare() {
|
|||
}
|
||||
}
|
||||
|
||||
// DISABLED
|
||||
//if (!dirtied_) {
|
||||
// return 0;
|
||||
//}
|
||||
if (!dirtied_) {
|
||||
return 0;
|
||||
}
|
||||
dirtied_ = false;
|
||||
|
||||
// pass dirty regions?
|
||||
|
|
|
@ -300,6 +300,16 @@ uint32_t CommandProcessor::ExecutePacket(PacketArgs& args) {
|
|||
}
|
||||
break;
|
||||
|
||||
case PM4_XE_SWAP:
|
||||
// Xenia-specific VdSwap hook.
|
||||
// VdSwap will post this to tell us we need to swap the screen/fire an interrupt.
|
||||
XETRACECP("[%.8X] Packet(%.8X): PM4_XE_SWAP",
|
||||
packet_ptr, packet);
|
||||
LOG_DATA(count);
|
||||
ADVANCE_PTR(count);
|
||||
graphics_system_->Swap();
|
||||
break;
|
||||
|
||||
case PM4_INDIRECT_BUFFER:
|
||||
// indirect buffer dispatch
|
||||
{
|
||||
|
@ -334,14 +344,11 @@ uint32_t CommandProcessor::ExecutePacket(PacketArgs& args) {
|
|||
} else {
|
||||
// Register.
|
||||
XEASSERT(poll_reg_addr < RegisterFile::kRegisterCount);
|
||||
|
||||
if (poll_reg_addr == XE_GPU_REG_COHER_STATUS_HOST) {
|
||||
// Waiting for coherency. We should have all the info we need
|
||||
// now (base+size+mode), so kick it off.
|
||||
MakeCoherent();
|
||||
}
|
||||
|
||||
value = regs->values[poll_reg_addr].u32;
|
||||
if (poll_reg_addr == XE_GPU_REG_COHER_STATUS_HOST) {
|
||||
MakeCoherent();
|
||||
value = regs->values[poll_reg_addr].u32;
|
||||
}
|
||||
}
|
||||
switch (wait_info & 0x7) {
|
||||
case 0x0: // Never.
|
||||
|
@ -768,16 +775,23 @@ void CommandProcessor::WriteRegister(
|
|||
}
|
||||
|
||||
void CommandProcessor::MakeCoherent() {
|
||||
RegisterFile* regs = driver_->register_file();
|
||||
auto status_host = regs->values[XE_GPU_REG_COHER_STATUS_HOST].u32;
|
||||
auto base_host = regs->values[XE_GPU_REG_COHER_BASE_HOST].u32;
|
||||
auto size_host = regs->values[XE_GPU_REG_COHER_SIZE_HOST].u32;
|
||||
|
||||
// Status host often has 0x01000000 or 0x03000000.
|
||||
// This is likely toggling VC (vertex cache) or TC (texture cache).
|
||||
// Or, it also has a direction in here maybe - there is probably
|
||||
// some way to check for dest coherency (what all the COHER_DEST_BASE_*
|
||||
// registers are for).
|
||||
// Best docs I've found on this are here:
|
||||
// http://amd-dev.wpengine.netdna-cdn.com/wordpress/media/2013/10/R6xx_R7xx_3D.pdf
|
||||
// http://cgit.freedesktop.org/xorg/driver/xf86-video-radeonhd/tree/src/r6xx_accel.c?id=3f8b6eccd9dba116cc4801e7f80ce21a879c67d2#n454
|
||||
|
||||
RegisterFile* regs = driver_->register_file();
|
||||
auto status_host = regs->values[XE_GPU_REG_COHER_STATUS_HOST].u32;
|
||||
auto base_host = regs->values[XE_GPU_REG_COHER_BASE_HOST].u32;
|
||||
auto size_host = regs->values[XE_GPU_REG_COHER_SIZE_HOST].u32;
|
||||
|
||||
if (!(status_host & 0x80000000ul)) {
|
||||
return;
|
||||
}
|
||||
|
||||
// TODO(benvanik): notify resource cache of base->size and type.
|
||||
XETRACECP("Make %.8X -> %.8X (%db) coherent",
|
||||
|
|
|
@ -24,7 +24,7 @@ D3D11GraphicsSystem::D3D11GraphicsSystem(Emulator* emulator)
|
|||
: GraphicsSystem(emulator),
|
||||
window_(nullptr), dxgi_factory_(nullptr), device_(nullptr),
|
||||
timer_queue_(nullptr), vsync_timer_(nullptr),
|
||||
interrupt_pending_(true) {
|
||||
last_swap_time_(0.0) {
|
||||
}
|
||||
|
||||
D3D11GraphicsSystem::~D3D11GraphicsSystem() {
|
||||
|
@ -141,36 +141,26 @@ void D3D11GraphicsSystem::Initialize() {
|
|||
void D3D11GraphicsSystem::Pump() {
|
||||
SCOPE_profile_cpu_f("gpu");
|
||||
|
||||
if (swap_pending_) {
|
||||
swap_pending_ = false;
|
||||
|
||||
// TODO(benvanik): remove this when commands are understood.
|
||||
driver_->Resolve();
|
||||
|
||||
// Swap window.
|
||||
// If we are set to vsync this will block.
|
||||
window_->Swap();
|
||||
|
||||
DispatchInterruptCallback(0);
|
||||
interrupt_pending_ = false;
|
||||
} else if (interrupt_pending_) {
|
||||
DispatchInterruptCallback(0);
|
||||
interrupt_pending_ = false;
|
||||
} else {
|
||||
double time_since_last_interrupt = xe_pal_now() - last_interrupt_time_;
|
||||
if (time_since_last_interrupt > 0.5) {
|
||||
// If we have gone too long without an interrupt, fire one.
|
||||
DispatchInterruptCallback(0);
|
||||
}
|
||||
if (time_since_last_interrupt > 0.3) {
|
||||
// Force a swap when profiling.
|
||||
if (Profiler::is_enabled()) {
|
||||
window_->Swap();
|
||||
}
|
||||
double time_since_last_swap = xe_pal_now() - last_swap_time_;
|
||||
if (time_since_last_swap > 1.0) {
|
||||
// Force a swap when profiling.
|
||||
if (Profiler::is_enabled()) {
|
||||
window_->Swap();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Performs a swap: runs the driver's Resolve(), swaps the window, then stores
// the current xe_pal_now() value in last_swap_time_.
void D3D11GraphicsSystem::Swap() {
|
||||
// TODO(benvanik): remove this when commands are understood.
|
||||
driver_->Resolve();
|
||||
|
||||
// Swap window.
|
||||
// If we are set to vsync this will block.
|
||||
window_->Swap();
|
||||
|
||||
// Record when this swap happened; last_swap_time_ is compared against
// xe_pal_now() elsewhere to measure the time since the last swap.
last_swap_time_ = xe_pal_now();
|
||||
}
|
||||
|
||||
void __stdcall D3D11GraphicsSystem::VsyncCallback(D3D11GraphicsSystem* gs,
|
||||
BOOLEAN) {
|
||||
static bool thread_name_set = false;
|
||||
|
@ -185,7 +175,6 @@ void __stdcall D3D11GraphicsSystem::VsyncCallback(D3D11GraphicsSystem* gs,
|
|||
// TODO(benvanik): we shouldn't need to do the dispatch here, but there's
|
||||
// something wrong and the CP will block waiting for code that
|
||||
// needs to be run in the interrupt.
|
||||
// gs->interrupt_pending_ = true;
|
||||
gs->DispatchInterruptCallback(0);
|
||||
}
|
||||
|
||||
|
|
|
@ -35,6 +35,8 @@ public:
|
|||
|
||||
virtual void Shutdown();
|
||||
|
||||
void Swap() override;
|
||||
|
||||
protected:
|
||||
virtual void Initialize();
|
||||
virtual void Pump();
|
||||
|
@ -49,7 +51,7 @@ private:
|
|||
HANDLE timer_queue_;
|
||||
HANDLE vsync_timer_;
|
||||
|
||||
bool interrupt_pending_;
|
||||
double last_swap_time_;
|
||||
};
|
||||
|
||||
|
||||
|
|
|
@ -28,7 +28,7 @@ GraphicsSystem::GraphicsSystem(Emulator* emulator) :
|
|||
thread_(nullptr), running_(false), driver_(nullptr),
|
||||
command_processor_(nullptr),
|
||||
interrupt_callback_(0), interrupt_callback_data_(0),
|
||||
last_interrupt_time_(0), swap_pending_(false), thread_wait_(nullptr) {
|
||||
last_interrupt_time_(0), thread_wait_(nullptr) {
|
||||
// Create the run loop used for any windows/etc.
|
||||
// This must be done on the thread we create the driver.
|
||||
run_loop_ = xe_run_loop_create();
|
||||
|
|
|
@ -45,8 +45,7 @@ public:
|
|||
|
||||
void MarkVblank();
|
||||
void DispatchInterruptCallback(uint32_t source, uint32_t cpu = 0xFFFFFFFF);
|
||||
bool swap_pending() const { return swap_pending_; }
|
||||
void set_swap_pending(bool value) { swap_pending_ = value; }
|
||||
virtual void Swap() = 0;
|
||||
|
||||
protected:
|
||||
virtual void Initialize();
|
||||
|
@ -83,7 +82,6 @@ protected:
|
|||
uint32_t interrupt_callback_;
|
||||
uint32_t interrupt_callback_data_;
|
||||
double last_interrupt_time_;
|
||||
bool swap_pending_;
|
||||
HANDLE thread_wait_;
|
||||
};
|
||||
|
||||
|
|
|
@ -28,6 +28,8 @@ public:
|
|||
|
||||
virtual void Shutdown();
|
||||
|
||||
// Intentionally empty: this implementation does no work when a swap is requested.
void Swap() override {}
|
||||
|
||||
protected:
|
||||
virtual void Initialize();
|
||||
virtual void Pump();
|
||||
|
|
|
@ -9,6 +9,8 @@
|
|||
|
||||
#include <xenia/gpu/resource_cache.h>
|
||||
|
||||
#include <algorithm>
|
||||
|
||||
|
||||
using namespace std;
|
||||
using namespace xe;
|
||||
|
@ -110,6 +112,8 @@ uint64_t ResourceCache::HashRange(const MemoryRange& memory_range) {
|
|||
}
|
||||
|
||||
void ResourceCache::SyncRange(uint32_t address, int length) {
|
||||
SCOPE_profile_cpu_f("gpu");
|
||||
|
||||
// Scan the page table in sync with our resource list. This means
|
||||
// we have O(n) complexity for updates, though we could definitely
|
||||
// make this faster/cleaner.
|
||||
|
@ -118,15 +122,12 @@ void ResourceCache::SyncRange(uint32_t address, int length) {
|
|||
// will not be changing, which allows us to do a foreach(res) and reload
|
||||
// and then clear the table.
|
||||
|
||||
// DISABLED
|
||||
return;
|
||||
|
||||
// total bytes = (512 * 1024 * 1024) / (16 * 1024) = 32768
|
||||
// each byte = 1 page
|
||||
// Walk as qwords so we can clear things up faster.
|
||||
uint64_t* page_table = reinterpret_cast<uint64_t*>(
|
||||
memory_->Translate(memory_->page_table()));
|
||||
int page_size = 16 * 1024; // 16KB pages
|
||||
uint32_t page_size = 16 * 1024; // 16KB pages
|
||||
|
||||
uint32_t lo_address = address % 0x20000000;
|
||||
uint32_t hi_address = lo_address + length;
|
||||
|
@ -134,24 +135,38 @@ void ResourceCache::SyncRange(uint32_t address, int length) {
|
|||
int start_page = lo_address / page_size;
|
||||
int end_page = hi_address / page_size;
|
||||
|
||||
auto it = paged_resources_.upper_bound(lo_address);
|
||||
auto end_it = paged_resources_.lower_bound(hi_address);
|
||||
while (it != end_it) {
|
||||
const auto& memory_range = it->second->memory_range();
|
||||
int lo_page = (memory_range.guest_base % 0x20000000) / page_size;
|
||||
int hi_page = lo_page + (memory_range.length / page_size);
|
||||
for (int i = lo_page / 8; i <= hi_page / 8; ++i) {
|
||||
uint64_t page_flags = page_table[i];
|
||||
if (page_flags) {
|
||||
// Dirty!
|
||||
it->second->MarkDirty(i * 8 * page_size, (i * 8 + 7) * page_size);
|
||||
{
|
||||
SCOPE_profile_cpu_i("gpu", "SyncRange:mark");
|
||||
auto it = lo_address > page_size ?
|
||||
paged_resources_.upper_bound(lo_address - page_size) :
|
||||
paged_resources_.begin();
|
||||
auto end_it = paged_resources_.lower_bound(hi_address + page_size);
|
||||
while (it != end_it) {
|
||||
const auto& memory_range = it->second->memory_range();
|
||||
int lo_page = (memory_range.guest_base % 0x20000000) / page_size;
|
||||
int hi_page = lo_page + (memory_range.length / page_size);
|
||||
lo_page = std::max(lo_page, start_page);
|
||||
hi_page = std::min(hi_page, end_page);
|
||||
if (lo_page > hi_page) {
|
||||
++it;
|
||||
continue;
|
||||
}
|
||||
for (int i = lo_page / 8; i <= hi_page / 8; ++i) {
|
||||
uint64_t page_flags = page_table[i];
|
||||
if (page_flags) {
|
||||
// Dirty!
|
||||
it->second->MarkDirty(i * 8 * page_size, (i * 8 + 7) * page_size);
|
||||
}
|
||||
}
|
||||
++it;
|
||||
}
|
||||
++it;
|
||||
}
|
||||
|
||||
// Reset page table.
|
||||
for (auto i = start_page / 8; i <= end_page / 8; ++i) {
|
||||
page_table[i] = 0;
|
||||
{
|
||||
SCOPE_profile_cpu_i("gpu", "SyncRange:reset");
|
||||
for (auto i = start_page / 8; i <= end_page / 8; ++i) {
|
||||
page_table[i] = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -291,11 +291,10 @@ int TextureResource::Prepare() {
|
|||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
// DISABLED
|
||||
//if (!dirtied_) {
|
||||
// return 0;
|
||||
//}
|
||||
|
||||
if (!dirtied_) {
|
||||
return 0;
|
||||
}
|
||||
dirtied_ = false;
|
||||
|
||||
// pass dirty regions?
|
||||
|
|
|
@ -70,6 +70,8 @@ enum Type3Opcode {
|
|||
PM4_CONTEXT_UPDATE = 0x5e, // updates the current context, if needed
|
||||
PM4_INTERRUPT = 0x54, // generate interrupt from the command stream
|
||||
|
||||
PM4_XE_SWAP = 0x55, // Xenia only: VdSwap uses this to trigger a swap.
|
||||
|
||||
PM4_IM_STORE = 0x2c, // copy sequencer instruction memory to system memory
|
||||
|
||||
// Tiled rendering:
|
||||
|
|
|
@ -12,6 +12,7 @@
|
|||
#include <xenia/emulator.h>
|
||||
#include <xenia/cpu/cpu.h>
|
||||
#include <xenia/gpu/gpu.h>
|
||||
#include <xenia/gpu/xenos/packets.h>
|
||||
#include <xenia/kernel/kernel_state.h>
|
||||
#include <xenia/kernel/xboxkrnl_private.h>
|
||||
#include <xenia/kernel/xboxkrnl_rtl.h>
|
||||
|
@ -422,19 +423,16 @@ SHIM_CALL VdSwap_shim(
|
|||
unk6,
|
||||
unk7);
|
||||
|
||||
KernelState* kernel_state = shared_kernel_state_;
|
||||
XEASSERTNOTNULL(kernel_state);
|
||||
GraphicsSystem* gs = kernel_state->emulator()->graphics_system();
|
||||
if (!gs) {
|
||||
return;
|
||||
}
|
||||
|
||||
gs->set_swap_pending(true);
|
||||
|
||||
// The caller seems to reserve 64 words (256b) in the primary ringbuffer
|
||||
// for this method to do what it needs. We just zero them out. We could
|
||||
// encode the parameters in the stream for the ringbuffer, if needed.
|
||||
// for this method to do what it needs. We just zero them out and send a
|
||||
// token value. It'd be nice to figure out what this is really doing so
|
||||
// that we could simulate it, though due to TCR I bet all games need to
|
||||
// use this method.
|
||||
xe_zero_struct(SHIM_MEM_ADDR(unk0), 64 * 4);
|
||||
auto dwords = reinterpret_cast<uint32_t*>(SHIM_MEM_ADDR(unk0));
|
||||
dwords[0] = XESWAP32((0x03 << 30) |
|
||||
((1 - 1) << 16) |
|
||||
(xenos::PM4_XE_SWAP << 8));
|
||||
|
||||
SHIM_SET_RETURN_64(0);
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue