Proper(ish) VdSwap - fixes a bunch of things.
Caching is working a bit better now.
This commit is contained in:
parent
8337820500
commit
6e76c169d6
|
@ -30,10 +30,9 @@ int BufferResource::Prepare() {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// DISABLED
|
if (!dirtied_) {
|
||||||
//if (!dirtied_) {
|
return 0;
|
||||||
// return 0;
|
}
|
||||||
//}
|
|
||||||
dirtied_ = false;
|
dirtied_ = false;
|
||||||
|
|
||||||
// pass dirty regions?
|
// pass dirty regions?
|
||||||
|
|
|
@ -300,6 +300,16 @@ uint32_t CommandProcessor::ExecutePacket(PacketArgs& args) {
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
|
||||||
|
case PM4_XE_SWAP:
|
||||||
|
// Xenia-specific VdSwap hook.
|
||||||
|
// VdSwap will post this to tell us we need to swap the screen/fire an interrupt.
|
||||||
|
XETRACECP("[%.8X] Packet(%.8X): PM4_XE_SWAP",
|
||||||
|
packet_ptr, packet);
|
||||||
|
LOG_DATA(count);
|
||||||
|
ADVANCE_PTR(count);
|
||||||
|
graphics_system_->Swap();
|
||||||
|
break;
|
||||||
|
|
||||||
case PM4_INDIRECT_BUFFER:
|
case PM4_INDIRECT_BUFFER:
|
||||||
// indirect buffer dispatch
|
// indirect buffer dispatch
|
||||||
{
|
{
|
||||||
|
@ -334,14 +344,11 @@ uint32_t CommandProcessor::ExecutePacket(PacketArgs& args) {
|
||||||
} else {
|
} else {
|
||||||
// Register.
|
// Register.
|
||||||
XEASSERT(poll_reg_addr < RegisterFile::kRegisterCount);
|
XEASSERT(poll_reg_addr < RegisterFile::kRegisterCount);
|
||||||
|
|
||||||
if (poll_reg_addr == XE_GPU_REG_COHER_STATUS_HOST) {
|
|
||||||
// Waiting for coherency. We should have all the info we need
|
|
||||||
// now (base+size+mode), so kick it off.
|
|
||||||
MakeCoherent();
|
|
||||||
}
|
|
||||||
|
|
||||||
value = regs->values[poll_reg_addr].u32;
|
value = regs->values[poll_reg_addr].u32;
|
||||||
|
if (poll_reg_addr == XE_GPU_REG_COHER_STATUS_HOST) {
|
||||||
|
MakeCoherent();
|
||||||
|
value = regs->values[poll_reg_addr].u32;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
switch (wait_info & 0x7) {
|
switch (wait_info & 0x7) {
|
||||||
case 0x0: // Never.
|
case 0x0: // Never.
|
||||||
|
@ -768,16 +775,23 @@ void CommandProcessor::WriteRegister(
|
||||||
}
|
}
|
||||||
|
|
||||||
void CommandProcessor::MakeCoherent() {
|
void CommandProcessor::MakeCoherent() {
|
||||||
RegisterFile* regs = driver_->register_file();
|
|
||||||
auto status_host = regs->values[XE_GPU_REG_COHER_STATUS_HOST].u32;
|
|
||||||
auto base_host = regs->values[XE_GPU_REG_COHER_BASE_HOST].u32;
|
|
||||||
auto size_host = regs->values[XE_GPU_REG_COHER_SIZE_HOST].u32;
|
|
||||||
|
|
||||||
// Status host often has 0x01000000 or 0x03000000.
|
// Status host often has 0x01000000 or 0x03000000.
|
||||||
// This is likely toggling VC (vertex cache) or TC (texture cache).
|
// This is likely toggling VC (vertex cache) or TC (texture cache).
|
||||||
// Or, it also has a direction in here maybe - there is probably
|
// Or, it also has a direction in here maybe - there is probably
|
||||||
// some way to check for dest coherency (what all the COHER_DEST_BASE_*
|
// some way to check for dest coherency (what all the COHER_DEST_BASE_*
|
||||||
// registers are for).
|
// registers are for).
|
||||||
|
// Best docs I've found on this are here:
|
||||||
|
// http://amd-dev.wpengine.netdna-cdn.com/wordpress/media/2013/10/R6xx_R7xx_3D.pdf
|
||||||
|
// http://cgit.freedesktop.org/xorg/driver/xf86-video-radeonhd/tree/src/r6xx_accel.c?id=3f8b6eccd9dba116cc4801e7f80ce21a879c67d2#n454
|
||||||
|
|
||||||
|
RegisterFile* regs = driver_->register_file();
|
||||||
|
auto status_host = regs->values[XE_GPU_REG_COHER_STATUS_HOST].u32;
|
||||||
|
auto base_host = regs->values[XE_GPU_REG_COHER_BASE_HOST].u32;
|
||||||
|
auto size_host = regs->values[XE_GPU_REG_COHER_SIZE_HOST].u32;
|
||||||
|
|
||||||
|
if (!(status_host & 0x80000000ul)) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
// TODO(benvanik): notify resource cache of base->size and type.
|
// TODO(benvanik): notify resource cache of base->size and type.
|
||||||
XETRACECP("Make %.8X -> %.8X (%db) coherent",
|
XETRACECP("Make %.8X -> %.8X (%db) coherent",
|
||||||
|
|
|
@ -24,7 +24,7 @@ D3D11GraphicsSystem::D3D11GraphicsSystem(Emulator* emulator)
|
||||||
: GraphicsSystem(emulator),
|
: GraphicsSystem(emulator),
|
||||||
window_(nullptr), dxgi_factory_(nullptr), device_(nullptr),
|
window_(nullptr), dxgi_factory_(nullptr), device_(nullptr),
|
||||||
timer_queue_(nullptr), vsync_timer_(nullptr),
|
timer_queue_(nullptr), vsync_timer_(nullptr),
|
||||||
interrupt_pending_(true) {
|
last_swap_time_(0.0) {
|
||||||
}
|
}
|
||||||
|
|
||||||
D3D11GraphicsSystem::~D3D11GraphicsSystem() {
|
D3D11GraphicsSystem::~D3D11GraphicsSystem() {
|
||||||
|
@ -141,36 +141,26 @@ void D3D11GraphicsSystem::Initialize() {
|
||||||
void D3D11GraphicsSystem::Pump() {
|
void D3D11GraphicsSystem::Pump() {
|
||||||
SCOPE_profile_cpu_f("gpu");
|
SCOPE_profile_cpu_f("gpu");
|
||||||
|
|
||||||
if (swap_pending_) {
|
double time_since_last_swap = xe_pal_now() - last_swap_time_;
|
||||||
swap_pending_ = false;
|
if (time_since_last_swap > 1.0) {
|
||||||
|
// Force a swap when profiling.
|
||||||
// TODO(benvanik): remove this when commands are understood.
|
if (Profiler::is_enabled()) {
|
||||||
driver_->Resolve();
|
window_->Swap();
|
||||||
|
|
||||||
// Swap window.
|
|
||||||
// If we are set to vsync this will block.
|
|
||||||
window_->Swap();
|
|
||||||
|
|
||||||
DispatchInterruptCallback(0);
|
|
||||||
interrupt_pending_ = false;
|
|
||||||
} else if (interrupt_pending_) {
|
|
||||||
DispatchInterruptCallback(0);
|
|
||||||
interrupt_pending_ = false;
|
|
||||||
} else {
|
|
||||||
double time_since_last_interrupt = xe_pal_now() - last_interrupt_time_;
|
|
||||||
if (time_since_last_interrupt > 0.5) {
|
|
||||||
// If we have gone too long without an interrupt, fire one.
|
|
||||||
DispatchInterruptCallback(0);
|
|
||||||
}
|
|
||||||
if (time_since_last_interrupt > 0.3) {
|
|
||||||
// Force a swap when profiling.
|
|
||||||
if (Profiler::is_enabled()) {
|
|
||||||
window_->Swap();
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void D3D11GraphicsSystem::Swap() {
|
||||||
|
// TODO(benvanik): remove this when commands are understood.
|
||||||
|
driver_->Resolve();
|
||||||
|
|
||||||
|
// Swap window.
|
||||||
|
// If we are set to vsync this will block.
|
||||||
|
window_->Swap();
|
||||||
|
|
||||||
|
last_swap_time_ = xe_pal_now();
|
||||||
|
}
|
||||||
|
|
||||||
void __stdcall D3D11GraphicsSystem::VsyncCallback(D3D11GraphicsSystem* gs,
|
void __stdcall D3D11GraphicsSystem::VsyncCallback(D3D11GraphicsSystem* gs,
|
||||||
BOOLEAN) {
|
BOOLEAN) {
|
||||||
static bool thread_name_set = false;
|
static bool thread_name_set = false;
|
||||||
|
@ -185,7 +175,6 @@ void __stdcall D3D11GraphicsSystem::VsyncCallback(D3D11GraphicsSystem* gs,
|
||||||
// TODO(benvanik): we shouldn't need to do the dispatch here, but there's
|
// TODO(benvanik): we shouldn't need to do the dispatch here, but there's
|
||||||
// something wrong and the CP will block waiting for code that
|
// something wrong and the CP will block waiting for code that
|
||||||
// needs to be run in the interrupt.
|
// needs to be run in the interrupt.
|
||||||
// gs->interrupt_pending_ = true;
|
|
||||||
gs->DispatchInterruptCallback(0);
|
gs->DispatchInterruptCallback(0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -35,6 +35,8 @@ public:
|
||||||
|
|
||||||
virtual void Shutdown();
|
virtual void Shutdown();
|
||||||
|
|
||||||
|
void Swap() override;
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
virtual void Initialize();
|
virtual void Initialize();
|
||||||
virtual void Pump();
|
virtual void Pump();
|
||||||
|
@ -49,7 +51,7 @@ private:
|
||||||
HANDLE timer_queue_;
|
HANDLE timer_queue_;
|
||||||
HANDLE vsync_timer_;
|
HANDLE vsync_timer_;
|
||||||
|
|
||||||
bool interrupt_pending_;
|
double last_swap_time_;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -28,7 +28,7 @@ GraphicsSystem::GraphicsSystem(Emulator* emulator) :
|
||||||
thread_(nullptr), running_(false), driver_(nullptr),
|
thread_(nullptr), running_(false), driver_(nullptr),
|
||||||
command_processor_(nullptr),
|
command_processor_(nullptr),
|
||||||
interrupt_callback_(0), interrupt_callback_data_(0),
|
interrupt_callback_(0), interrupt_callback_data_(0),
|
||||||
last_interrupt_time_(0), swap_pending_(false), thread_wait_(nullptr) {
|
last_interrupt_time_(0), thread_wait_(nullptr) {
|
||||||
// Create the run loop used for any windows/etc.
|
// Create the run loop used for any windows/etc.
|
||||||
// This must be done on the thread we create the driver.
|
// This must be done on the thread we create the driver.
|
||||||
run_loop_ = xe_run_loop_create();
|
run_loop_ = xe_run_loop_create();
|
||||||
|
|
|
@ -45,8 +45,7 @@ public:
|
||||||
|
|
||||||
void MarkVblank();
|
void MarkVblank();
|
||||||
void DispatchInterruptCallback(uint32_t source, uint32_t cpu = 0xFFFFFFFF);
|
void DispatchInterruptCallback(uint32_t source, uint32_t cpu = 0xFFFFFFFF);
|
||||||
bool swap_pending() const { return swap_pending_; }
|
virtual void Swap() = 0;
|
||||||
void set_swap_pending(bool value) { swap_pending_ = value; }
|
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
virtual void Initialize();
|
virtual void Initialize();
|
||||||
|
@ -83,7 +82,6 @@ protected:
|
||||||
uint32_t interrupt_callback_;
|
uint32_t interrupt_callback_;
|
||||||
uint32_t interrupt_callback_data_;
|
uint32_t interrupt_callback_data_;
|
||||||
double last_interrupt_time_;
|
double last_interrupt_time_;
|
||||||
bool swap_pending_;
|
|
||||||
HANDLE thread_wait_;
|
HANDLE thread_wait_;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
|
@ -28,6 +28,8 @@ public:
|
||||||
|
|
||||||
virtual void Shutdown();
|
virtual void Shutdown();
|
||||||
|
|
||||||
|
void Swap() override {}
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
virtual void Initialize();
|
virtual void Initialize();
|
||||||
virtual void Pump();
|
virtual void Pump();
|
||||||
|
|
|
@ -9,6 +9,8 @@
|
||||||
|
|
||||||
#include <xenia/gpu/resource_cache.h>
|
#include <xenia/gpu/resource_cache.h>
|
||||||
|
|
||||||
|
#include <algorithm>
|
||||||
|
|
||||||
|
|
||||||
using namespace std;
|
using namespace std;
|
||||||
using namespace xe;
|
using namespace xe;
|
||||||
|
@ -110,6 +112,8 @@ uint64_t ResourceCache::HashRange(const MemoryRange& memory_range) {
|
||||||
}
|
}
|
||||||
|
|
||||||
void ResourceCache::SyncRange(uint32_t address, int length) {
|
void ResourceCache::SyncRange(uint32_t address, int length) {
|
||||||
|
SCOPE_profile_cpu_f("gpu");
|
||||||
|
|
||||||
// Scan the page table in sync with our resource list. This means
|
// Scan the page table in sync with our resource list. This means
|
||||||
// we have O(n) complexity for updates, though we could definitely
|
// we have O(n) complexity for updates, though we could definitely
|
||||||
// make this faster/cleaner.
|
// make this faster/cleaner.
|
||||||
|
@ -118,15 +122,12 @@ void ResourceCache::SyncRange(uint32_t address, int length) {
|
||||||
// will not be changing, which allows us to do a foreach(res) and reload
|
// will not be changing, which allows us to do a foreach(res) and reload
|
||||||
// and then clear the table.
|
// and then clear the table.
|
||||||
|
|
||||||
// DISABLED
|
|
||||||
return;
|
|
||||||
|
|
||||||
// total bytes = (512 * 1024 * 1024) / (16 * 1024) = 32768
|
// total bytes = (512 * 1024 * 1024) / (16 * 1024) = 32768
|
||||||
// each byte = 1 page
|
// each byte = 1 page
|
||||||
// Walk as qwords so we can clear things up faster.
|
// Walk as qwords so we can clear things up faster.
|
||||||
uint64_t* page_table = reinterpret_cast<uint64_t*>(
|
uint64_t* page_table = reinterpret_cast<uint64_t*>(
|
||||||
memory_->Translate(memory_->page_table()));
|
memory_->Translate(memory_->page_table()));
|
||||||
int page_size = 16 * 1024; // 16KB pages
|
uint32_t page_size = 16 * 1024; // 16KB pages
|
||||||
|
|
||||||
uint32_t lo_address = address % 0x20000000;
|
uint32_t lo_address = address % 0x20000000;
|
||||||
uint32_t hi_address = lo_address + length;
|
uint32_t hi_address = lo_address + length;
|
||||||
|
@ -134,24 +135,38 @@ void ResourceCache::SyncRange(uint32_t address, int length) {
|
||||||
int start_page = lo_address / page_size;
|
int start_page = lo_address / page_size;
|
||||||
int end_page = hi_address / page_size;
|
int end_page = hi_address / page_size;
|
||||||
|
|
||||||
auto it = paged_resources_.upper_bound(lo_address);
|
{
|
||||||
auto end_it = paged_resources_.lower_bound(hi_address);
|
SCOPE_profile_cpu_i("gpu", "SyncRange:mark");
|
||||||
while (it != end_it) {
|
auto it = lo_address > page_size ?
|
||||||
const auto& memory_range = it->second->memory_range();
|
paged_resources_.upper_bound(lo_address - page_size) :
|
||||||
int lo_page = (memory_range.guest_base % 0x20000000) / page_size;
|
paged_resources_.begin();
|
||||||
int hi_page = lo_page + (memory_range.length / page_size);
|
auto end_it = paged_resources_.lower_bound(hi_address + page_size);
|
||||||
for (int i = lo_page / 8; i <= hi_page / 8; ++i) {
|
while (it != end_it) {
|
||||||
uint64_t page_flags = page_table[i];
|
const auto& memory_range = it->second->memory_range();
|
||||||
if (page_flags) {
|
int lo_page = (memory_range.guest_base % 0x20000000) / page_size;
|
||||||
// Dirty!
|
int hi_page = lo_page + (memory_range.length / page_size);
|
||||||
it->second->MarkDirty(i * 8 * page_size, (i * 8 + 7) * page_size);
|
lo_page = std::max(lo_page, start_page);
|
||||||
|
hi_page = std::min(hi_page, end_page);
|
||||||
|
if (lo_page > hi_page) {
|
||||||
|
++it;
|
||||||
|
continue;
|
||||||
}
|
}
|
||||||
|
for (int i = lo_page / 8; i <= hi_page / 8; ++i) {
|
||||||
|
uint64_t page_flags = page_table[i];
|
||||||
|
if (page_flags) {
|
||||||
|
// Dirty!
|
||||||
|
it->second->MarkDirty(i * 8 * page_size, (i * 8 + 7) * page_size);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
++it;
|
||||||
}
|
}
|
||||||
++it;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Reset page table.
|
// Reset page table.
|
||||||
for (auto i = start_page / 8; i <= end_page / 8; ++i) {
|
{
|
||||||
page_table[i] = 0;
|
SCOPE_profile_cpu_i("gpu", "SyncRange:reset");
|
||||||
|
for (auto i = start_page / 8; i <= end_page / 8; ++i) {
|
||||||
|
page_table[i] = 0;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -291,11 +291,10 @@ int TextureResource::Prepare() {
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// DISABLED
|
if (!dirtied_) {
|
||||||
//if (!dirtied_) {
|
return 0;
|
||||||
// return 0;
|
}
|
||||||
//}
|
|
||||||
dirtied_ = false;
|
dirtied_ = false;
|
||||||
|
|
||||||
// pass dirty regions?
|
// pass dirty regions?
|
||||||
|
|
|
@ -70,6 +70,8 @@ enum Type3Opcode {
|
||||||
PM4_CONTEXT_UPDATE = 0x5e, // updates the current context, if needed
|
PM4_CONTEXT_UPDATE = 0x5e, // updates the current context, if needed
|
||||||
PM4_INTERRUPT = 0x54, // generate interrupt from the command stream
|
PM4_INTERRUPT = 0x54, // generate interrupt from the command stream
|
||||||
|
|
||||||
|
PM4_XE_SWAP = 0x55, // Xenia only: VdSwap uses this to trigger a swap.
|
||||||
|
|
||||||
PM4_IM_STORE = 0x2c, // copy sequencer instruction memory to system memory
|
PM4_IM_STORE = 0x2c, // copy sequencer instruction memory to system memory
|
||||||
|
|
||||||
// Tiled rendering:
|
// Tiled rendering:
|
||||||
|
|
|
@ -12,6 +12,7 @@
|
||||||
#include <xenia/emulator.h>
|
#include <xenia/emulator.h>
|
||||||
#include <xenia/cpu/cpu.h>
|
#include <xenia/cpu/cpu.h>
|
||||||
#include <xenia/gpu/gpu.h>
|
#include <xenia/gpu/gpu.h>
|
||||||
|
#include <xenia/gpu/xenos/packets.h>
|
||||||
#include <xenia/kernel/kernel_state.h>
|
#include <xenia/kernel/kernel_state.h>
|
||||||
#include <xenia/kernel/xboxkrnl_private.h>
|
#include <xenia/kernel/xboxkrnl_private.h>
|
||||||
#include <xenia/kernel/xboxkrnl_rtl.h>
|
#include <xenia/kernel/xboxkrnl_rtl.h>
|
||||||
|
@ -422,19 +423,16 @@ SHIM_CALL VdSwap_shim(
|
||||||
unk6,
|
unk6,
|
||||||
unk7);
|
unk7);
|
||||||
|
|
||||||
KernelState* kernel_state = shared_kernel_state_;
|
|
||||||
XEASSERTNOTNULL(kernel_state);
|
|
||||||
GraphicsSystem* gs = kernel_state->emulator()->graphics_system();
|
|
||||||
if (!gs) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
gs->set_swap_pending(true);
|
|
||||||
|
|
||||||
// The caller seems to reserve 64 words (256b) in the primary ringbuffer
|
// The caller seems to reserve 64 words (256b) in the primary ringbuffer
|
||||||
// for this method to do what it needs. We just zero them out. We could
|
// for this method to do what it needs. We just zero them out and send a
|
||||||
// encode the parameters in the stream for the ringbuffer, if needed.
|
// token value. It'd be nice to figure out what this is really doing so
|
||||||
|
// that we could simulate it, though due to TCR I bet all games need to
|
||||||
|
// use this method.
|
||||||
xe_zero_struct(SHIM_MEM_ADDR(unk0), 64 * 4);
|
xe_zero_struct(SHIM_MEM_ADDR(unk0), 64 * 4);
|
||||||
|
auto dwords = reinterpret_cast<uint32_t*>(SHIM_MEM_ADDR(unk0));
|
||||||
|
dwords[0] = XESWAP32((0x03 << 30) |
|
||||||
|
((1 - 1) << 16) |
|
||||||
|
(xenos::PM4_XE_SWAP << 8));
|
||||||
|
|
||||||
SHIM_SET_RETURN_64(0);
|
SHIM_SET_RETURN_64(0);
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue