Proper(ish) VdSwap - fixes a bunch of things.

Caching is working a bit better now.
This commit is contained in:
Ben Vanik 2014-06-08 21:24:29 -07:00
parent 8337820500
commit 6e76c169d6
11 changed files with 101 additions and 83 deletions

View File

@ -30,10 +30,9 @@ int BufferResource::Prepare() {
}
}
// DISABLED
//if (!dirtied_) {
// return 0;
//}
if (!dirtied_) {
return 0;
}
dirtied_ = false;
// pass dirty regions?

View File

@ -300,6 +300,16 @@ uint32_t CommandProcessor::ExecutePacket(PacketArgs& args) {
}
break;
case PM4_XE_SWAP:
// Xenia-specific VdSwap hook.
// VdSwap will post this to tell us we need to swap the screen/fire an interrupt.
XETRACECP("[%.8X] Packet(%.8X): PM4_XE_SWAP",
packet_ptr, packet);
LOG_DATA(count);
ADVANCE_PTR(count);
graphics_system_->Swap();
break;
case PM4_INDIRECT_BUFFER:
// indirect buffer dispatch
{
@ -334,14 +344,11 @@ uint32_t CommandProcessor::ExecutePacket(PacketArgs& args) {
} else {
// Register.
XEASSERT(poll_reg_addr < RegisterFile::kRegisterCount);
if (poll_reg_addr == XE_GPU_REG_COHER_STATUS_HOST) {
// Waiting for coherency. We should have all the info we need
// now (base+size+mode), so kick it off.
MakeCoherent();
}
value = regs->values[poll_reg_addr].u32;
if (poll_reg_addr == XE_GPU_REG_COHER_STATUS_HOST) {
MakeCoherent();
value = regs->values[poll_reg_addr].u32;
}
}
switch (wait_info & 0x7) {
case 0x0: // Never.
@ -768,16 +775,23 @@ void CommandProcessor::WriteRegister(
}
void CommandProcessor::MakeCoherent() {
RegisterFile* regs = driver_->register_file();
auto status_host = regs->values[XE_GPU_REG_COHER_STATUS_HOST].u32;
auto base_host = regs->values[XE_GPU_REG_COHER_BASE_HOST].u32;
auto size_host = regs->values[XE_GPU_REG_COHER_SIZE_HOST].u32;
// Status host often has 0x01000000 or 0x03000000.
// This is likely toggling VC (vertex cache) or TC (texture cache).
// Or, it also has a direction in here maybe - there is probably
// some way to check for dest coherency (what all the COHER_DEST_BASE_*
// registers are for).
// Best docs I've found on this are here:
// http://amd-dev.wpengine.netdna-cdn.com/wordpress/media/2013/10/R6xx_R7xx_3D.pdf
// http://cgit.freedesktop.org/xorg/driver/xf86-video-radeonhd/tree/src/r6xx_accel.c?id=3f8b6eccd9dba116cc4801e7f80ce21a879c67d2#n454
RegisterFile* regs = driver_->register_file();
auto status_host = regs->values[XE_GPU_REG_COHER_STATUS_HOST].u32;
auto base_host = regs->values[XE_GPU_REG_COHER_BASE_HOST].u32;
auto size_host = regs->values[XE_GPU_REG_COHER_SIZE_HOST].u32;
if (!(status_host & 0x80000000ul)) {
return;
}
// TODO(benvanik): notify resource cache of base->size and type.
XETRACECP("Make %.8X -> %.8X (%db) coherent",

View File

@ -24,7 +24,7 @@ D3D11GraphicsSystem::D3D11GraphicsSystem(Emulator* emulator)
: GraphicsSystem(emulator),
window_(nullptr), dxgi_factory_(nullptr), device_(nullptr),
timer_queue_(nullptr), vsync_timer_(nullptr),
interrupt_pending_(true) {
last_swap_time_(0.0) {
}
D3D11GraphicsSystem::~D3D11GraphicsSystem() {
@ -141,36 +141,26 @@ void D3D11GraphicsSystem::Initialize() {
void D3D11GraphicsSystem::Pump() {
SCOPE_profile_cpu_f("gpu");
if (swap_pending_) {
swap_pending_ = false;
// TODO(benvanik): remove this when commands are understood.
driver_->Resolve();
// Swap window.
// If we are set to vsync this will block.
window_->Swap();
DispatchInterruptCallback(0);
interrupt_pending_ = false;
} else if (interrupt_pending_) {
DispatchInterruptCallback(0);
interrupt_pending_ = false;
} else {
double time_since_last_interrupt = xe_pal_now() - last_interrupt_time_;
if (time_since_last_interrupt > 0.5) {
// If we have gone too long without an interrupt, fire one.
DispatchInterruptCallback(0);
}
if (time_since_last_interrupt > 0.3) {
// Force a swap when profiling.
if (Profiler::is_enabled()) {
window_->Swap();
}
double time_since_last_swap = xe_pal_now() - last_swap_time_;
if (time_since_last_swap > 1.0) {
// Force a swap when profiling.
if (Profiler::is_enabled()) {
window_->Swap();
}
}
}
// Performs a swap: asks the driver to resolve, swaps the window, and then
// records the time at which the swap completed.
void D3D11GraphicsSystem::Swap() {
// TODO(benvanik): remove this when commands are understood.
driver_->Resolve();
// Swap window.
// If we are set to vsync this will block.
window_->Swap();
// Remember when this swap happened; Pump() compares the current time against
// this value to decide whether to force a swap while profiling.
last_swap_time_ = xe_pal_now();
}
void __stdcall D3D11GraphicsSystem::VsyncCallback(D3D11GraphicsSystem* gs,
BOOLEAN) {
static bool thread_name_set = false;
@ -185,7 +175,6 @@ void __stdcall D3D11GraphicsSystem::VsyncCallback(D3D11GraphicsSystem* gs,
// TODO(benvanik): we shouldn't need to do the dispatch here, but there's
// something wrong and the CP will block waiting for code that
// needs to be run in the interrupt.
// gs->interrupt_pending_ = true;
gs->DispatchInterruptCallback(0);
}

View File

@ -35,6 +35,8 @@ public:
virtual void Shutdown();
void Swap() override;
protected:
virtual void Initialize();
virtual void Pump();
@ -49,7 +51,7 @@ private:
HANDLE timer_queue_;
HANDLE vsync_timer_;
bool interrupt_pending_;
double last_swap_time_;
};

View File

@ -28,7 +28,7 @@ GraphicsSystem::GraphicsSystem(Emulator* emulator) :
thread_(nullptr), running_(false), driver_(nullptr),
command_processor_(nullptr),
interrupt_callback_(0), interrupt_callback_data_(0),
last_interrupt_time_(0), swap_pending_(false), thread_wait_(nullptr) {
last_interrupt_time_(0), thread_wait_(nullptr) {
// Create the run loop used for any windows/etc.
// This must be done on the thread we create the driver.
run_loop_ = xe_run_loop_create();

View File

@ -45,8 +45,7 @@ public:
void MarkVblank();
void DispatchInterruptCallback(uint32_t source, uint32_t cpu = 0xFFFFFFFF);
bool swap_pending() const { return swap_pending_; }
void set_swap_pending(bool value) { swap_pending_ = value; }
virtual void Swap() = 0;
protected:
virtual void Initialize();
@ -83,7 +82,6 @@ protected:
uint32_t interrupt_callback_;
uint32_t interrupt_callback_data_;
double last_interrupt_time_;
bool swap_pending_;
HANDLE thread_wait_;
};

View File

@ -28,6 +28,8 @@ public:
virtual void Shutdown();
void Swap() override {}
protected:
virtual void Initialize();
virtual void Pump();

View File

@ -9,6 +9,8 @@
#include <xenia/gpu/resource_cache.h>
#include <algorithm>
using namespace std;
using namespace xe;
@ -110,6 +112,8 @@ uint64_t ResourceCache::HashRange(const MemoryRange& memory_range) {
}
void ResourceCache::SyncRange(uint32_t address, int length) {
SCOPE_profile_cpu_f("gpu");
// Scan the page table in sync with our resource list. This means
// we have O(n) complexity for updates, though we could definitely
// make this faster/cleaner.
@ -118,15 +122,12 @@ void ResourceCache::SyncRange(uint32_t address, int length) {
// will not be changing, which allows us to do a foreach(res) and reload
// and then clear the table.
// DISABLED
return;
// total bytes = (512 * 1024 * 1024) / (16 * 1024) = 32768
// each byte = 1 page
// Walk as qwords so we can clear things up faster.
uint64_t* page_table = reinterpret_cast<uint64_t*>(
memory_->Translate(memory_->page_table()));
int page_size = 16 * 1024; // 16KB pages
uint32_t page_size = 16 * 1024; // 16KB pages
uint32_t lo_address = address % 0x20000000;
uint32_t hi_address = lo_address + length;
@ -134,24 +135,38 @@ void ResourceCache::SyncRange(uint32_t address, int length) {
int start_page = lo_address / page_size;
int end_page = hi_address / page_size;
auto it = paged_resources_.upper_bound(lo_address);
auto end_it = paged_resources_.lower_bound(hi_address);
while (it != end_it) {
const auto& memory_range = it->second->memory_range();
int lo_page = (memory_range.guest_base % 0x20000000) / page_size;
int hi_page = lo_page + (memory_range.length / page_size);
for (int i = lo_page / 8; i <= hi_page / 8; ++i) {
uint64_t page_flags = page_table[i];
if (page_flags) {
// Dirty!
it->second->MarkDirty(i * 8 * page_size, (i * 8 + 7) * page_size);
{
SCOPE_profile_cpu_i("gpu", "SyncRange:mark");
auto it = lo_address > page_size ?
paged_resources_.upper_bound(lo_address - page_size) :
paged_resources_.begin();
auto end_it = paged_resources_.lower_bound(hi_address + page_size);
while (it != end_it) {
const auto& memory_range = it->second->memory_range();
int lo_page = (memory_range.guest_base % 0x20000000) / page_size;
int hi_page = lo_page + (memory_range.length / page_size);
lo_page = std::max(lo_page, start_page);
hi_page = std::min(hi_page, end_page);
if (lo_page > hi_page) {
++it;
continue;
}
for (int i = lo_page / 8; i <= hi_page / 8; ++i) {
uint64_t page_flags = page_table[i];
if (page_flags) {
// Dirty!
it->second->MarkDirty(i * 8 * page_size, (i * 8 + 7) * page_size);
}
}
++it;
}
++it;
}
// Reset page table.
for (auto i = start_page / 8; i <= end_page / 8; ++i) {
page_table[i] = 0;
{
SCOPE_profile_cpu_i("gpu", "SyncRange:reset");
for (auto i = start_page / 8; i <= end_page / 8; ++i) {
page_table[i] = 0;
}
}
}

View File

@ -292,10 +292,9 @@ int TextureResource::Prepare() {
}
}
// DISABLED
//if (!dirtied_) {
// return 0;
//}
if (!dirtied_) {
return 0;
}
dirtied_ = false;
// pass dirty regions?

View File

@ -70,6 +70,8 @@ enum Type3Opcode {
PM4_CONTEXT_UPDATE = 0x5e, // updates the current context, if needed
PM4_INTERRUPT = 0x54, // generate interrupt from the command stream
PM4_XE_SWAP = 0x55, // Xenia only: VdSwap uses this to trigger a swap.
PM4_IM_STORE = 0x2c, // copy sequencer instruction memory to system memory
// Tiled rendering:

View File

@ -12,6 +12,7 @@
#include <xenia/emulator.h>
#include <xenia/cpu/cpu.h>
#include <xenia/gpu/gpu.h>
#include <xenia/gpu/xenos/packets.h>
#include <xenia/kernel/kernel_state.h>
#include <xenia/kernel/xboxkrnl_private.h>
#include <xenia/kernel/xboxkrnl_rtl.h>
@ -422,19 +423,16 @@ SHIM_CALL VdSwap_shim(
unk6,
unk7);
KernelState* kernel_state = shared_kernel_state_;
XEASSERTNOTNULL(kernel_state);
GraphicsSystem* gs = kernel_state->emulator()->graphics_system();
if (!gs) {
return;
}
gs->set_swap_pending(true);
// The caller seems to reserve 64 words (256b) in the primary ringbuffer
// for this method to do what it needs. We just zero them out. We could
// encode the parameters in the stream for the ringbuffer, if needed.
// for this method to do what it needs. We just zero them out and send a
// token value. It'd be nice to figure out what this is really doing so
// that we could simulate it, though due to TCR I bet all games need to
// use this method.
xe_zero_struct(SHIM_MEM_ADDR(unk0), 64 * 4);
auto dwords = reinterpret_cast<uint32_t*>(SHIM_MEM_ADDR(unk0));
dwords[0] = XESWAP32((0x03 << 30) |
((1 - 1) << 16) |
(xenos::PM4_XE_SWAP << 8));
SHIM_SET_RETURN_64(0);
}