diff --git a/src/xenia/gpu/buffer_resource.cc b/src/xenia/gpu/buffer_resource.cc index 949bfe02b..9f9accb9b 100644 --- a/src/xenia/gpu/buffer_resource.cc +++ b/src/xenia/gpu/buffer_resource.cc @@ -30,10 +30,9 @@ int BufferResource::Prepare() { } } - // DISABLED - //if (!dirtied_) { - // return 0; - //} + if (!dirtied_) { + return 0; + } dirtied_ = false; // pass dirty regions? diff --git a/src/xenia/gpu/command_processor.cc b/src/xenia/gpu/command_processor.cc index c7a6a166b..23c27c5a9 100644 --- a/src/xenia/gpu/command_processor.cc +++ b/src/xenia/gpu/command_processor.cc @@ -300,6 +300,16 @@ uint32_t CommandProcessor::ExecutePacket(PacketArgs& args) { } break; + case PM4_XE_SWAP: + // Xenia-specific VdSwap hook. + // VdSwap will post this to tell us we need to swap the screen/fire an interrupt. + XETRACECP("[%.8X] Packet(%.8X): PM4_XE_SWAP", + packet_ptr, packet); + LOG_DATA(count); + ADVANCE_PTR(count); + graphics_system_->Swap(); + break; + case PM4_INDIRECT_BUFFER: // indirect buffer dispatch { @@ -334,14 +344,11 @@ uint32_t CommandProcessor::ExecutePacket(PacketArgs& args) { } else { // Register. XEASSERT(poll_reg_addr < RegisterFile::kRegisterCount); - - if (poll_reg_addr == XE_GPU_REG_COHER_STATUS_HOST) { - // Waiting for coherency. We should have all the info we need - // now (base+size+mode), so kick it off. - MakeCoherent(); - } - value = regs->values[poll_reg_addr].u32; + if (poll_reg_addr == XE_GPU_REG_COHER_STATUS_HOST) { + MakeCoherent(); + value = regs->values[poll_reg_addr].u32; + } } switch (wait_info & 0x7) { case 0x0: // Never. @@ -768,16 +775,23 @@ void CommandProcessor::WriteRegister( } void CommandProcessor::MakeCoherent() { - RegisterFile* regs = driver_->register_file(); - auto status_host = regs->values[XE_GPU_REG_COHER_STATUS_HOST].u32; - auto base_host = regs->values[XE_GPU_REG_COHER_BASE_HOST].u32; - auto size_host = regs->values[XE_GPU_REG_COHER_SIZE_HOST].u32; - // Status host often has 0x01000000 or 0x03000000. // This is likely toggling VC (vertex cache) or TC (texture cache). // Or, it also has a direction in here maybe - there is probably // some way to check for dest coherency (what all the COHER_DEST_BASE_* // registers are for). + // Best docs I've found on this are here: + // http://amd-dev.wpengine.netdna-cdn.com/wordpress/media/2013/10/R6xx_R7xx_3D.pdf + // http://cgit.freedesktop.org/xorg/driver/xf86-video-radeonhd/tree/src/r6xx_accel.c?id=3f8b6eccd9dba116cc4801e7f80ce21a879c67d2#n454 + + RegisterFile* regs = driver_->register_file(); + auto status_host = regs->values[XE_GPU_REG_COHER_STATUS_HOST].u32; + auto base_host = regs->values[XE_GPU_REG_COHER_BASE_HOST].u32; + auto size_host = regs->values[XE_GPU_REG_COHER_SIZE_HOST].u32; + + if (!(status_host & 0x80000000ul)) { + return; + } // TODO(benvanik): notify resource cache of base->size and type. XETRACECP("Make %.8X -> %.8X (%db) coherent", diff --git a/src/xenia/gpu/d3d11/d3d11_graphics_system.cc b/src/xenia/gpu/d3d11/d3d11_graphics_system.cc index ba20e5797..8e6fc5a7e 100644 --- a/src/xenia/gpu/d3d11/d3d11_graphics_system.cc +++ b/src/xenia/gpu/d3d11/d3d11_graphics_system.cc @@ -24,7 +24,7 @@ D3D11GraphicsSystem::D3D11GraphicsSystem(Emulator* emulator) : GraphicsSystem(emulator), window_(nullptr), dxgi_factory_(nullptr), device_(nullptr), timer_queue_(nullptr), vsync_timer_(nullptr), - interrupt_pending_(true) { + last_swap_time_(0.0) { } D3D11GraphicsSystem::~D3D11GraphicsSystem() { @@ -141,36 +141,26 @@ void D3D11GraphicsSystem::Initialize() { void D3D11GraphicsSystem::Pump() { SCOPE_profile_cpu_f("gpu"); - if (swap_pending_) { - swap_pending_ = false; - - // TODO(benvanik): remove this when commands are understood. - driver_->Resolve(); - - // Swap window. - // If we are set to vsync this will block. - window_->Swap(); - - DispatchInterruptCallback(0); - interrupt_pending_ = false; - } else if (interrupt_pending_) { - DispatchInterruptCallback(0); - interrupt_pending_ = false; - } else { - double time_since_last_interrupt = xe_pal_now() - last_interrupt_time_; - if (time_since_last_interrupt > 0.5) { - // If we have gone too long without an interrupt, fire one. - DispatchInterruptCallback(0); - } - if (time_since_last_interrupt > 0.3) { - // Force a swap when profiling. - if (Profiler::is_enabled()) { - window_->Swap(); - } + double time_since_last_swap = xe_pal_now() - last_swap_time_; + if (time_since_last_swap > 1.0) { + // Force a swap when profiling. + if (Profiler::is_enabled()) { + window_->Swap(); } } } +void D3D11GraphicsSystem::Swap() { + // TODO(benvanik): remove this when commands are understood. + driver_->Resolve(); + + // Swap window. + // If we are set to vsync this will block. + window_->Swap(); + + last_swap_time_ = xe_pal_now(); +} + void __stdcall D3D11GraphicsSystem::VsyncCallback(D3D11GraphicsSystem* gs, BOOLEAN) { static bool thread_name_set = false; @@ -185,7 +175,6 @@ void __stdcall D3D11GraphicsSystem::VsyncCallback(D3D11GraphicsSystem* gs, // TODO(benvanik): we shouldn't need to do the dispatch here, but there's // something wrong and the CP will block waiting for code that // needs to be run in the interrupt. - // gs->interrupt_pending_ = true; gs->DispatchInterruptCallback(0); } diff --git a/src/xenia/gpu/d3d11/d3d11_graphics_system.h b/src/xenia/gpu/d3d11/d3d11_graphics_system.h index 00ca43e76..7bd641667 100644 --- a/src/xenia/gpu/d3d11/d3d11_graphics_system.h +++ b/src/xenia/gpu/d3d11/d3d11_graphics_system.h @@ -35,6 +35,8 @@ public: virtual void Shutdown(); + void Swap() override; + protected: virtual void Initialize(); virtual void Pump(); @@ -49,7 +51,7 @@ private: HANDLE timer_queue_; HANDLE vsync_timer_; - bool interrupt_pending_; + double last_swap_time_; }; diff --git a/src/xenia/gpu/graphics_system.cc b/src/xenia/gpu/graphics_system.cc index be3e4e0de..212074168 100644 --- a/src/xenia/gpu/graphics_system.cc +++ b/src/xenia/gpu/graphics_system.cc @@ -28,7 +28,7 @@ GraphicsSystem::GraphicsSystem(Emulator* emulator) : thread_(nullptr), running_(false), driver_(nullptr), command_processor_(nullptr), interrupt_callback_(0), interrupt_callback_data_(0), - last_interrupt_time_(0), swap_pending_(false), thread_wait_(nullptr) { + last_interrupt_time_(0), thread_wait_(nullptr) { // Create the run loop used for any windows/etc. // This must be done on the thread we create the driver. run_loop_ = xe_run_loop_create(); diff --git a/src/xenia/gpu/graphics_system.h b/src/xenia/gpu/graphics_system.h index 8c0a542c8..3b8fdabb1 100644 --- a/src/xenia/gpu/graphics_system.h +++ b/src/xenia/gpu/graphics_system.h @@ -45,8 +45,7 @@ public: void MarkVblank(); void DispatchInterruptCallback(uint32_t source, uint32_t cpu = 0xFFFFFFFF); - bool swap_pending() const { return swap_pending_; } - void set_swap_pending(bool value) { swap_pending_ = value; } + virtual void Swap() = 0; protected: virtual void Initialize(); @@ -83,7 +82,6 @@ protected: uint32_t interrupt_callback_; uint32_t interrupt_callback_data_; double last_interrupt_time_; - bool swap_pending_; HANDLE thread_wait_; }; diff --git a/src/xenia/gpu/nop/nop_graphics_system.h b/src/xenia/gpu/nop/nop_graphics_system.h index 54f77e04e..cf5f43b8a 100644 --- a/src/xenia/gpu/nop/nop_graphics_system.h +++ b/src/xenia/gpu/nop/nop_graphics_system.h @@ -28,6 +28,8 @@ public: virtual void Shutdown(); + void Swap() override {} + protected: virtual void Initialize(); virtual void Pump(); diff --git a/src/xenia/gpu/resource_cache.cc b/src/xenia/gpu/resource_cache.cc index 46eec8f0b..5641c8318 100644 --- a/src/xenia/gpu/resource_cache.cc +++ b/src/xenia/gpu/resource_cache.cc @@ -9,6 +9,8 @@ #include +#include + using namespace std; using namespace xe; @@ -110,6 +112,8 @@ uint64_t ResourceCache::HashRange(const MemoryRange& memory_range) { } void ResourceCache::SyncRange(uint32_t address, int length) { + SCOPE_profile_cpu_f("gpu"); + // Scan the page table in sync with our resource list. This means // we have O(n) complexity for updates, though we could definitely // make this faster/cleaner. @@ -118,15 +122,12 @@ void ResourceCache::SyncRange(uint32_t address, int length) { // will not be changing, which allows us to do a foreach(res) and reload // and then clear the table. - // DISABLED - return; - // total bytes = (512 * 1024 * 1024) / (16 * 1024) = 32768 // each byte = 1 page // Walk as qwords so we can clear things up faster. uint64_t* page_table = reinterpret_cast( memory_->Translate(memory_->page_table())); - int page_size = 16 * 1024; // 16KB pages + uint32_t page_size = 16 * 1024; // 16KB pages uint32_t lo_address = address % 0x20000000; uint32_t hi_address = lo_address + length; @@ -134,24 +135,38 @@ void ResourceCache::SyncRange(uint32_t address, int length) { int start_page = lo_address / page_size; int end_page = hi_address / page_size; - auto it = paged_resources_.upper_bound(lo_address); - auto end_it = paged_resources_.lower_bound(hi_address); - while (it != end_it) { - const auto& memory_range = it->second->memory_range(); - int lo_page = (memory_range.guest_base % 0x20000000) / page_size; - int hi_page = lo_page + (memory_range.length / page_size); - for (int i = lo_page / 8; i <= hi_page / 8; ++i) { - uint64_t page_flags = page_table[i]; - if (page_flags) { - // Dirty! - it->second->MarkDirty(i * 8 * page_size, (i * 8 + 7) * page_size); + { + SCOPE_profile_cpu_i("gpu", "SyncRange:mark"); + auto it = lo_address > page_size ? + paged_resources_.upper_bound(lo_address - page_size) : + paged_resources_.begin(); + auto end_it = paged_resources_.lower_bound(hi_address + page_size); + while (it != end_it) { + const auto& memory_range = it->second->memory_range(); + int lo_page = (memory_range.guest_base % 0x20000000) / page_size; + int hi_page = lo_page + (memory_range.length / page_size); + lo_page = std::max(lo_page, start_page); + hi_page = std::min(hi_page, end_page); + if (lo_page > hi_page) { + ++it; + continue; } + for (int i = lo_page / 8; i <= hi_page / 8; ++i) { + uint64_t page_flags = page_table[i]; + if (page_flags) { + // Dirty! + it->second->MarkDirty(i * 8 * page_size, (i * 8 + 7) * page_size); + } + } + ++it; } - ++it; } // Reset page table. - for (auto i = start_page / 8; i <= end_page / 8; ++i) { - page_table[i] = 0; + { + SCOPE_profile_cpu_i("gpu", "SyncRange:reset"); + for (auto i = start_page / 8; i <= end_page / 8; ++i) { + page_table[i] = 0; + } } } diff --git a/src/xenia/gpu/texture_resource.cc b/src/xenia/gpu/texture_resource.cc index e7cfdee0d..531796c11 100644 --- a/src/xenia/gpu/texture_resource.cc +++ b/src/xenia/gpu/texture_resource.cc @@ -291,11 +291,10 @@ int TextureResource::Prepare() { return 1; } } - - // DISABLED - //if (!dirtied_) { - // return 0; - //} + + if (!dirtied_) { + return 0; + } dirtied_ = false; // pass dirty regions? diff --git a/src/xenia/gpu/xenos/packets.h b/src/xenia/gpu/xenos/packets.h index 4b7124310..459ab7e6e 100644 --- a/src/xenia/gpu/xenos/packets.h +++ b/src/xenia/gpu/xenos/packets.h @@ -70,6 +70,8 @@ enum Type3Opcode { PM4_CONTEXT_UPDATE = 0x5e, // updates the current context, if needed PM4_INTERRUPT = 0x54, // generate interrupt from the command stream + PM4_XE_SWAP = 0x55, // Xenia only: VdSwap uses this to trigger a swap. + PM4_IM_STORE = 0x2c, // copy sequencer instruction memory to system memory // Tiled rendering: diff --git a/src/xenia/kernel/xboxkrnl_video.cc b/src/xenia/kernel/xboxkrnl_video.cc index 951606bde..6519a067c 100644 --- a/src/xenia/kernel/xboxkrnl_video.cc +++ b/src/xenia/kernel/xboxkrnl_video.cc @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -422,19 +423,16 @@ SHIM_CALL VdSwap_shim( unk6, unk7); - KernelState* kernel_state = shared_kernel_state_; - XEASSERTNOTNULL(kernel_state); - GraphicsSystem* gs = kernel_state->emulator()->graphics_system(); - if (!gs) { - return; - } - - gs->set_swap_pending(true); - // The caller seems to reserve 64 words (256b) in the primary ringbuffer - // for this method to do what it needs. We just zero them out. We could - // encode the parameters in the stream for the ringbuffer, if needed. + // for this method to do what it needs. We just zero them out and send a + // token value. It'd be nice to figure out what this is really doing so + // that we could simulate it, though due to TCR I bet all games need to + // use this method. xe_zero_struct(SHIM_MEM_ADDR(unk0), 64 * 4); + auto dwords = reinterpret_cast(SHIM_MEM_ADDR(unk0)); + dwords[0] = XESWAP32((0x03 << 30) | + ((1 - 1) << 16) | + (xenos::PM4_XE_SWAP << 8)); SHIM_SET_RETURN_64(0); }