diff --git a/rpcs3/Emu/Cell/PPUThread.cpp b/rpcs3/Emu/Cell/PPUThread.cpp index 04d5bb5f26..3d17622535 100644 --- a/rpcs3/Emu/Cell/PPUThread.cpp +++ b/rpcs3/Emu/Cell/PPUThread.cpp @@ -9,7 +9,7 @@ #include "Emu/perf_meter.hpp" #include "Emu/Memory/vm_reservation.h" #include "Emu/Memory/vm_locking.h" -#include "Emu/RSX/RSXThread.h" +#include "Emu/RSX/Core/RSXReservationLock.hpp" #include "Emu/VFS.h" #include "Emu/system_progress.hpp" #include "Emu/system_utils.hpp" diff --git a/rpcs3/Emu/Cell/SPUThread.cpp b/rpcs3/Emu/Cell/SPUThread.cpp index c2615d7cd4..525491bf73 100644 --- a/rpcs3/Emu/Cell/SPUThread.cpp +++ b/rpcs3/Emu/Cell/SPUThread.cpp @@ -9,7 +9,6 @@ #include "Emu/VFS.h" #include "Emu/IdManager.h" #include "Emu/perf_meter.hpp" -#include "Emu/RSX/RSXThread.h" #include "Emu/Cell/PPUThread.h" #include "Emu/Cell/ErrorCodes.h" #include "Emu/Cell/lv2/sys_spu.h" @@ -23,6 +22,9 @@ #include "Emu/Cell/SPURecompiler.h" #include "Emu/Cell/timers.hpp" +#include "Emu/RSX/Core/RSXReservationLock.hpp" +#include "Emu/RSX/RSXThread.h" + #include #include #include diff --git a/rpcs3/Emu/Cell/lv2/sys_rsx.cpp b/rpcs3/Emu/Cell/lv2/sys_rsx.cpp index 31c7b93e2b..67557aa6d3 100644 --- a/rpcs3/Emu/Cell/lv2/sys_rsx.cpp +++ b/rpcs3/Emu/Cell/lv2/sys_rsx.cpp @@ -5,6 +5,8 @@ #include "Emu/Cell/ErrorCodes.h" #include "Emu/Cell/timers.hpp" #include "Emu/Memory/vm_locking.h" +#include "Emu/RSX/Core/RSXEngLock.hpp" +#include "Emu/RSX/Core/RSXReservationLock.hpp" #include "Emu/RSX/RSXThread.h" #include "util/asm.hpp" #include "sys_event.h" diff --git a/rpcs3/Emu/RSX/Core/RSXDisplay.h b/rpcs3/Emu/RSX/Core/RSXDisplay.h new file mode 100644 index 0000000000..c2527c926a --- /dev/null +++ b/rpcs3/Emu/RSX/Core/RSXDisplay.h @@ -0,0 +1,73 @@ +#pragma once + +#include +#include +#include + +namespace rsx +{ + struct frame_statistics_t + { + u32 draw_calls; + u32 submit_count; + + s64 setup_time; + s64 vertex_upload_time; + s64 textures_upload_time; + s64 draw_exec_time; + s64 flip_time; + }; + + struct 
display_flip_info_t + { + std::deque buffer_queue; + u32 buffer; + bool skip_frame; + bool emu_flip; + bool in_progress; + frame_statistics_t stats; + + inline void push(u32 _buffer) + { + buffer_queue.push_back(_buffer); + } + + inline bool pop(u32 _buffer) + { + if (buffer_queue.empty()) + { + return false; + } + + do + { + const auto index = buffer_queue.front(); + buffer_queue.pop_front(); + + if (index == _buffer) + { + buffer = _buffer; + return true; + } + } while (!buffer_queue.empty()); + + // Need to observe this happening in the wild + rsx_log.error("Display queue was discarded while not empty!"); + return false; + } + }; + + class vblank_thread + { + std::shared_ptr>> m_thread; + + public: + vblank_thread() = default; + vblank_thread(const vblank_thread&) = delete; + + void set_thread(std::shared_ptr>> thread); + + vblank_thread& operator=(thread_state); + vblank_thread& operator=(const vblank_thread&) = delete; + }; +} diff --git a/rpcs3/Emu/RSX/Core/RSXEngLock.hpp b/rpcs3/Emu/RSX/Core/RSXEngLock.hpp new file mode 100644 index 0000000000..8c50e38f15 --- /dev/null +++ b/rpcs3/Emu/RSX/Core/RSXEngLock.hpp @@ -0,0 +1,31 @@ +#pragma once + +#include +#include "../RSXThread.h" + +namespace rsx +{ + class eng_lock + { + rsx::thread* pthr; + + public: + eng_lock(rsx::thread* target) + :pthr(target) + { + if (pthr->is_current_thread()) + { + pthr = nullptr; + } + else + { + pthr->pause(); + } + } + + ~eng_lock() + { + if (pthr) pthr->unpause(); + } + }; +} diff --git a/rpcs3/Emu/RSX/Core/RSXFrameBuffer.h b/rpcs3/Emu/RSX/Core/RSXFrameBuffer.h new file mode 100644 index 0000000000..fc623fd360 --- /dev/null +++ b/rpcs3/Emu/RSX/Core/RSXFrameBuffer.h @@ -0,0 +1,42 @@ +#pragma once + +#include +#include "../gcm_enums.h" +#include "../GCM.h" + +namespace rsx +{ + struct tiled_region + { + u32 address; + u32 base; + GcmTileInfo* tile; + u8* ptr; + + void write(const void* src, u32 width, u32 height, u32 pitch); + void read(void* dst, u32 width, u32 height, u32 pitch); 
+ }; + + struct framebuffer_layout + { + ENABLE_BITWISE_SERIALIZATION; + + u16 width; + u16 height; + std::array color_addresses; + std::array color_pitch; + std::array actual_color_pitch; + std::array color_write_enabled; + u32 zeta_address; + u32 zeta_pitch; + u32 actual_zeta_pitch; + bool zeta_write_enabled; + rsx::surface_target target; + rsx::surface_color_format color_format; + rsx::surface_depth_format2 depth_format; + rsx::surface_antialiasing aa_mode; + rsx::surface_raster_type raster_type; + u32 aa_factors[2]; + bool ignore_change; + }; +} diff --git a/rpcs3/Emu/RSX/Core/RSXIOMap.hpp b/rpcs3/Emu/RSX/Core/RSXIOMap.hpp new file mode 100644 index 0000000000..b4ea4b5bc8 --- /dev/null +++ b/rpcs3/Emu/RSX/Core/RSXIOMap.hpp @@ -0,0 +1,86 @@ +#pragma once + +#include +#include "Utilities/mutex.h" +#include "Emu/CPU/CPUThread.h" + +namespace rsx +{ + struct rsx_iomap_table + { + static constexpr u32 c_lock_stride = 8192; + + std::array, 4096> ea; + std::array, 4096> io; + std::array rs; + + rsx_iomap_table() noexcept; + + // Try to get the real address given a mapped address + // Returns -1 on failure + u32 get_addr(u32 offs) const noexcept + { + return this->ea[offs >> 20] | (offs & 0xFFFFF); + } + + template + bool lock(u32 addr, u32 len, cpu_thread* self = nullptr) noexcept + { + if (len <= 1) return false; + const u32 end = addr + len - 1; + + bool added_wait = false; + + for (u32 block = addr / c_lock_stride; block <= (end / c_lock_stride); block += Stride) + { + auto& mutex_ = rs[block]; + + if (IsFullLock ? !mutex_.try_lock() : !mutex_.try_lock_shared()) [[ unlikely ]] + { + if (self) + { + added_wait |= !self->state.test_and_set(cpu_flag::wait); + } + + if (!self || self->id_type() != 0x55u) + { + IsFullLock ? mutex_.lock() : mutex_.lock_shared(); + } + else + { + while (IsFullLock ? 
!mutex_.try_lock() : !mutex_.try_lock_shared()) + { + self->cpu_wait({}); + } + } + } + } + + if (added_wait) + { + self->check_state(); + } + + return true; + } + + template + void unlock(u32 addr, u32 len) noexcept + { + ensure(len >= 1); + const u32 end = addr + len - 1; + + for (u32 block = (addr / 8192); block <= (end / 8192); block += Stride) + { + if constexpr (IsFullLock) + { + rs[block].unlock(); + } + else + { + rs[block].unlock_shared(); + } + } + } + }; +} diff --git a/rpcs3/Emu/RSX/Core/RSXReservationLock.hpp b/rpcs3/Emu/RSX/Core/RSXReservationLock.hpp new file mode 100644 index 0000000000..2e7df6a9a5 --- /dev/null +++ b/rpcs3/Emu/RSX/Core/RSXReservationLock.hpp @@ -0,0 +1,106 @@ +#pragma once + +#include +#include "../RSXThread.h" + +namespace rsx +{ + template + class reservation_lock + { + u32 addr = 0; + u32 length = 0; + + inline void lock_range(u32 addr, u32 length) + { + if (!get_current_renderer()->iomap_table.lock(addr, length, get_current_cpu_thread())) + { + length = 0; + } + + this->addr = addr; + this->length = length; + } + + public: + reservation_lock(u32 addr, u32 length) + { + if (g_cfg.core.rsx_accurate_res_access && + addr < constants::local_mem_base) + { + lock_range(addr, length); + } + } + + reservation_lock(u32 addr, u32 length, bool setting) + { + if (setting) + { + lock_range(addr, length); + } + } + + // Multi-range lock. If ranges overlap, the combined range will be acquired. + // If ranges do not overlap, the first range that is in main memory will be acquired. + reservation_lock(u32 dst_addr, u32 dst_length, u32 src_addr, u32 src_length) + { + if (g_cfg.core.rsx_accurate_res_access) + { + const auto range1 = utils::address_range::start_length(dst_addr, dst_length); + const auto range2 = utils::address_range::start_length(src_addr, src_length); + utils::address_range target_range; + + if (!range1.overlaps(range2)) [[likely]] + { + target_range = (dst_addr < constants::local_mem_base) ? 
range1 : range2; + } + else + { + // Very unlikely + target_range = range1.get_min_max(range2); + } + + if (target_range.start < constants::local_mem_base) + { + lock_range(target_range.start, target_range.length()); + } + } + } + + // Very special utility for batched transfers (SPU related) + template + void update_if_enabled(u32 addr, u32 _length, const std::add_pointer_t& lock_release = std::add_pointer_t{}) + { + // This check is not perfect but it covers the important cases fast (this check is only an optimization - forcing true disables it) + if (length && (this->addr / rsx_iomap_table::c_lock_stride != addr / rsx_iomap_table::c_lock_stride || (addr % rsx_iomap_table::c_lock_stride + _length) > rsx_iomap_table::c_lock_stride) && _length > 1) + { + if constexpr (!std::is_void_v) + { + // See SPUThread.cpp + lock_release->release(0); + } + + unlock(); + lock_range(addr, _length); + } + } + + void unlock(bool destructor = false) + { + if (length) + { + get_current_renderer()->iomap_table.unlock(addr, length); + + if (!destructor) + { + length = 0; + } + } + } + + ~reservation_lock() + { + unlock(true); + } + }; +} diff --git a/rpcs3/Emu/RSX/Core/RSXVertexTypes.h b/rpcs3/Emu/RSX/Core/RSXVertexTypes.h new file mode 100644 index 0000000000..e65c168a09 --- /dev/null +++ b/rpcs3/Emu/RSX/Core/RSXVertexTypes.h @@ -0,0 +1,168 @@ +#pragma once + +#include +#include "../Common/simple_array.hpp" +#include "../gcm_enums.h" + +#include + +namespace rsx +{ + struct vertex_array_buffer + { + rsx::vertex_base_type type; + u8 attribute_size; + u8 stride; + std::span data; + u8 index; + bool is_be; + }; + + struct vertex_array_register + { + rsx::vertex_base_type type; + u8 attribute_size; + std::array data; + u8 index; + }; + + struct empty_vertex_array + { + u8 index; + }; + + struct draw_array_command + { + u32 __dummy; + }; + + struct draw_indexed_array_command + { + std::span raw_index_buffer; + }; + + struct draw_inlined_array + { + u32 __dummy; + u32 __dummy2; + }; + + 
struct interleaved_attribute_t + { + u8 index; + bool modulo; + u16 frequency; + }; + + struct interleaved_range_info + { + bool interleaved = false; + bool single_vertex = false; + u32 base_offset = 0; + u32 real_offset_address = 0; + u8 memory_location = 0; + u8 attribute_stride = 0; + + rsx::simple_array locations; + + // Check if we need to upload a full unoptimized range, i.e [0-max_index] + std::pair calculate_required_range(u32 first, u32 count) const; + }; + + enum attribute_buffer_placement : u8 + { + none = 0, + persistent = 1, + transient = 2 + }; + + class vertex_input_layout + { + int m_num_used_blocks = 0; + std::array m_blocks_data{}; + + public: + rsx::simple_array interleaved_blocks{}; // Interleaved blocks to be uploaded as-is + std::vector> volatile_blocks{}; // Volatile data blocks (immediate draw vertex data for example) + rsx::simple_array referenced_registers{}; // Volatile register data + + std::array attribute_placement = fill_array(attribute_buffer_placement::none); + + vertex_input_layout() = default; + + interleaved_range_info* alloc_interleaved_block() + { + auto result = &m_blocks_data[m_num_used_blocks++]; + result->attribute_stride = 0; + result->base_offset = 0; + result->memory_location = 0; + result->real_offset_address = 0; + result->single_vertex = false; + result->locations.clear(); + result->interleaved = true; + return result; + } + + void clear() + { + m_num_used_blocks = 0; + interleaved_blocks.clear(); + volatile_blocks.clear(); + referenced_registers.clear(); + } + + bool validate() const + { + // Criteria: At least one array stream has to be defined to feed vertex positions + // This stream cannot be a const register as the vertices cannot create a zero-area primitive + + if (!interleaved_blocks.empty() && interleaved_blocks[0]->attribute_stride != 0) + return true; + + if (!volatile_blocks.empty()) + return true; + + for (u8 index = 0; index < limits::vertex_count; ++index) + { + switch (attribute_placement[index]) + { 
+ case attribute_buffer_placement::transient: + { + // Ignore register reference + if (std::find(referenced_registers.begin(), referenced_registers.end(), index) != referenced_registers.end()) + continue; + + // The source is inline array or immediate draw push buffer + return true; + } + case attribute_buffer_placement::persistent: + { + return true; + } + case attribute_buffer_placement::none: + { + continue; + } + default: + { + fmt::throw_exception("Unreachable"); + } + } + } + + return false; + } + + u32 calculate_interleaved_memory_requirements(u32 first_vertex, u32 vertex_count) const + { + u32 mem = 0; + for (auto& block : interleaved_blocks) + { + const auto range = block->calculate_required_range(first_vertex, vertex_count); + mem += range.second * block->attribute_stride; + } + + return mem; + } + }; +} diff --git a/rpcs3/Emu/RSX/GL/GLGSRender.cpp b/rpcs3/Emu/RSX/GL/GLGSRender.cpp index fdc7bb1747..09633c7c5d 100644 --- a/rpcs3/Emu/RSX/GL/GLGSRender.cpp +++ b/rpcs3/Emu/RSX/GL/GLGSRender.cpp @@ -1037,11 +1037,11 @@ void GLGSRender::on_semaphore_acquire_wait() if (!work_queue.empty() || (async_flip_requested & flip_request::emu_requested)) { - do_local_task(rsx::FIFO_state::lock_wait); + do_local_task(rsx::FIFO::state::lock_wait); } } -void GLGSRender::do_local_task(rsx::FIFO_state state) +void GLGSRender::do_local_task(rsx::FIFO::state state) { if (!work_queue.empty()) { @@ -1058,7 +1058,7 @@ void GLGSRender::do_local_task(rsx::FIFO_state state) q.processed = true; } } - else if (!in_begin_end && state != rsx::FIFO_state::lock_wait) + else if (!in_begin_end && state != rsx::FIFO::state::lock_wait) { if (m_graphics_state & rsx::pipeline_state::framebuffer_reads_dirty) { @@ -1071,7 +1071,7 @@ void GLGSRender::do_local_task(rsx::FIFO_state state) rsx::thread::do_local_task(state); - if (state == rsx::FIFO_state::lock_wait) + if (state == rsx::FIFO::state::lock_wait) { // Critical check finished return; diff --git a/rpcs3/Emu/RSX/GL/GLGSRender.h 
b/rpcs3/Emu/RSX/GL/GLGSRender.h index 068a7aacb5..0950124634 100644 --- a/rpcs3/Emu/RSX/GL/GLGSRender.h +++ b/rpcs3/Emu/RSX/GL/GLGSRender.h @@ -193,7 +193,7 @@ protected: void on_exit() override; void flip(const rsx::display_flip_info_t& info) override; - void do_local_task(rsx::FIFO_state state) override; + void do_local_task(rsx::FIFO::state state) override; bool on_access_violation(u32 address, bool is_writing) override; void on_invalidate_memory_range(const utils::address_range &range, rsx::invalidation_cause cause) override; diff --git a/rpcs3/Emu/RSX/RSXFIFO.cpp b/rpcs3/Emu/RSX/RSXFIFO.cpp index dcf0a88e5f..0dfc0ccabf 100644 --- a/rpcs3/Emu/RSX/RSXFIFO.cpp +++ b/rpcs3/Emu/RSX/RSXFIFO.cpp @@ -4,6 +4,7 @@ #include "RSXThread.h" #include "Capture/rsx_capture.h" #include "Common/time.hpp" +#include "Core/RSXReservationLock.hpp" #include "Emu/Memory/vm_reservation.h" #include "Emu/Cell/lv2/sys_rsx.h" #include "util/asm.hpp" @@ -613,20 +614,20 @@ namespace rsx { case FIFO::FIFO_NOP: { - if (performance_counters.state == FIFO_state::running) + if (performance_counters.state == FIFO::state::running) { performance_counters.FIFO_idle_timestamp = rsx::uclock(); - performance_counters.state = FIFO_state::nop; + performance_counters.state = FIFO::state::nop; } return; } case FIFO::FIFO_EMPTY: { - if (performance_counters.state == FIFO_state::running) + if (performance_counters.state == FIFO::state::running) { performance_counters.FIFO_idle_timestamp = rsx::uclock(); - performance_counters.state = FIFO_state::empty; + performance_counters.state = FIFO::state::empty; } else { @@ -658,13 +659,13 @@ namespace rsx if (offs == fifo_ctrl->get_pos()) { //Jump to self. 
Often preceded by NOP - if (performance_counters.state == FIFO_state::running) + if (performance_counters.state == FIFO::state::running) { performance_counters.FIFO_idle_timestamp = rsx::uclock(); sync_point_request.release(true); } - performance_counters.state = FIFO_state::spinning; + performance_counters.state = FIFO::state::spinning; } else { @@ -710,14 +711,14 @@ namespace rsx } if (const auto state = performance_counters.state; - state != FIFO_state::running) + state != FIFO::state::running) { - performance_counters.state = FIFO_state::running; + performance_counters.state = FIFO::state::running; // Hack: Delay FIFO wake-up according to setting // NOTE: The typical spin setup is a NOP followed by a jump-to-self // NOTE: There is a small delay when the jump address is dynamically edited by cell - if (state != FIFO_state::nop) + if (state != FIFO::state::nop) { fifo_wake_delay(); } diff --git a/rpcs3/Emu/RSX/RSXFIFO.h b/rpcs3/Emu/RSX/RSXFIFO.h index 1dc0d1edad..7cb91239e1 100644 --- a/rpcs3/Emu/RSX/RSXFIFO.h +++ b/rpcs3/Emu/RSX/RSXFIFO.h @@ -32,6 +32,22 @@ namespace rsx EMIT_BARRIER = 2 }; + enum class state : u8 + { + running = 0, + empty = 1, // PUT == GET + spinning = 2, // Puller continuously jumps to self addr (synchronization technique) + nop = 3, // Puller is processing a NOP command + lock_wait = 4,// Puller is processing a lock acquire + paused = 5, // Puller is paused externally + }; + + enum class interrupt_hint : u8 + { + conditional_render_eval = 1, + zcull_sync = 2 + }; + struct register_pair { u32 reg; diff --git a/rpcs3/Emu/RSX/RSXOffload.cpp b/rpcs3/Emu/RSX/RSXOffload.cpp index 413acf2025..97a27ae0ee 100644 --- a/rpcs3/Emu/RSX/RSXOffload.cpp +++ b/rpcs3/Emu/RSX/RSXOffload.cpp @@ -2,6 +2,7 @@ #include "Emu/Memory/vm.h" #include "Common/BufferUtils.h" +#include "Core/RSXReservationLock.hpp" #include "RSXOffload.h" #include "RSXThread.h" diff --git a/rpcs3/Emu/RSX/RSXThread.cpp b/rpcs3/Emu/RSX/RSXThread.cpp index 0fd0d5ccc8..01909e97fe 100644 --- 
a/rpcs3/Emu/RSX/RSXThread.cpp +++ b/rpcs3/Emu/RSX/RSXThread.cpp @@ -5,12 +5,14 @@ #include "Emu/Cell/SPUThread.h" #include "Emu/Cell/timers.hpp" +#include "Capture/rsx_capture.h" #include "Common/BufferUtils.h" #include "Common/buffer_stream.hpp" #include "Common/texture_cache.h" #include "Common/surface_store.h" #include "Common/time.hpp" -#include "Capture/rsx_capture.h" +#include "Core/RSXReservationLock.hpp" +#include "Core/RSXEngLock.hpp" #include "rsx_methods.h" #include "gcm_printing.h" #include "RSXDisAsm.h" @@ -733,7 +735,7 @@ namespace rsx if ((state & (cpu_flag::dbg_global_pause + cpu_flag::exit)) == cpu_flag::dbg_global_pause) { // Wait 16ms during emulation pause. This reduces cpu load while still giving us the chance to render overlays. - do_local_task(rsx::FIFO_state::paused); + do_local_task(rsx::FIFO::state::paused); thread_ctrl::wait_on(state, old, 16000); } else @@ -803,7 +805,7 @@ namespace rsx check_zcull_status(false); nv4097::set_render_mode(this, 0, method_registers.registers[NV4097_SET_RENDER_ENABLE]); - performance_counters.state = FIFO_state::empty; + performance_counters.state = FIFO::state::empty; const u64 event_flags = unsent_gcm_events.exchange(0); @@ -832,7 +834,7 @@ namespace rsx thread_ctrl::wait_for(1000); } - performance_counters.state = FIFO_state::running; + performance_counters.state = FIFO::state::running; fifo_ctrl = std::make_unique<::rsx::FIFO::FIFO_control>(this); fifo_ctrl->set_get(ctrl->get); @@ -994,7 +996,7 @@ namespace rsx // Clear any pending flush requests to release threads std::this_thread::sleep_for(10ms); - do_local_task(rsx::FIFO_state::lock_wait); + do_local_task(rsx::FIFO::state::lock_wait); g_fxo->get().join(); g_fxo->get() = thread_state::finished; @@ -1261,7 +1263,7 @@ namespace rsx fmt::throw_exception("ill-formed draw command"); } - void thread::do_local_task(FIFO_state state) + void thread::do_local_task(FIFO::state state) { m_eng_interrupt_mask.clear(rsx::backend_interrupt); @@ -1272,7 +1274,7 @@ 
namespace rsx handle_emu_flip(async_flip_buffer); } - if (!in_begin_end && state != FIFO_state::lock_wait) + if (!in_begin_end && state != FIFO::state::lock_wait) { if (atomic_storage::load(m_invalidated_memory_range.end) != 0) { @@ -2845,7 +2847,7 @@ namespace rsx if (!result.queries.empty()) { cond_render_ctrl.set_eval_sources(result.queries); - sync_hint(FIFO_hint::hint_conditional_render_eval, { .query = cond_render_ctrl.eval_sources.front(), .address = ref }); + sync_hint(FIFO::interrupt_hint::conditional_render_eval, { .query = cond_render_ctrl.eval_sources.front(), .address = ref }); } else { @@ -2895,7 +2897,7 @@ namespace rsx //ensure(async_tasks_pending.load() == 0); } - void thread::sync_hint(FIFO_hint /*hint*/, rsx::reports::sync_hint_payload_t payload) + void thread::sync_hint(FIFO::interrupt_hint /*hint*/, rsx::reports::sync_hint_payload_t payload) { zcull_ctrl->on_sync_hint(payload); } diff --git a/rpcs3/Emu/RSX/RSXThread.h b/rpcs3/Emu/RSX/RSXThread.h index 6caf20b12c..caa74768a0 100644 --- a/rpcs3/Emu/RSX/RSXThread.h +++ b/rpcs3/Emu/RSX/RSXThread.h @@ -28,6 +28,11 @@ #include "Emu/IdManager.h" #include "Emu/system_config.h" +#include "Core/RSXDisplay.h" +#include "Core/RSXFrameBuffer.h" +#include "Core/RSXIOMap.hpp" +#include "Core/RSXVertexTypes.h" + extern atomic_t g_user_asked_for_frame_capture; extern atomic_t g_disable_frame_limit; extern rsx::frame_trace_data frame_debug; @@ -40,84 +45,6 @@ namespace rsx class display_manager; } - struct rsx_iomap_table - { - static constexpr u32 c_lock_stride = 8192; - - std::array, 4096> ea; - std::array, 4096> io; - std::array rs; - - rsx_iomap_table() noexcept; - - // Try to get the real address given a mapped address - // Returns -1 on failure - u32 get_addr(u32 offs) const noexcept - { - return this->ea[offs >> 20] | (offs & 0xFFFFF); - } - - template - bool lock(u32 addr, u32 len, cpu_thread* self = nullptr) noexcept - { - if (len <= 1) return false; - const u32 end = addr + len - 1; - - bool added_wait 
= false; - - for (u32 block = addr / c_lock_stride; block <= (end / c_lock_stride); block += Stride) - { - auto& mutex_ = rs[block]; - - if (IsFullLock ? !mutex_.try_lock() : !mutex_.try_lock_shared()) [[ unlikely ]] - { - if (self) - { - added_wait |= !self->state.test_and_set(cpu_flag::wait); - } - - if (!self || self->id_type() != 0x55u) - { - IsFullLock ? mutex_.lock() : mutex_.lock_shared(); - } - else - { - while (IsFullLock ? !mutex_.try_lock() : !mutex_.try_lock_shared()) - { - self->cpu_wait({}); - } - } - } - } - - if (added_wait) - { - self->check_state(); - } - - return true; - } - - template - void unlock(u32 addr, u32 len) noexcept - { - ensure(len >= 1); - const u32 end = addr + len - 1; - - for (u32 block = (addr / 8192); block <= (end / 8192); block += Stride) - { - if constexpr (IsFullLock) - { - rs[block].unlock(); - } - else - { - rs[block].unlock_shared(); - } - } - } - }; - enum framebuffer_creation_context : u8 { context_draw = 0, @@ -175,22 +102,6 @@ namespace rsx all_interrupt_bits = memory_config_interrupt | backend_interrupt | display_interrupt | pipe_flush_interrupt }; - enum FIFO_state : u8 - { - running = 0, - empty = 1, // PUT == GET - spinning = 2, // Puller continuously jumps to self addr (synchronization technique) - nop = 3, // Puller is processing a NOP command - lock_wait = 4,// Puller is processing a lock acquire - paused = 5, // Puller is paused externallly - }; - - enum FIFO_hint : u8 - { - hint_conditional_render_eval = 1, - hint_zcull_sync = 2 - }; - enum result_flags: u8 { result_none = 0, @@ -206,264 +117,6 @@ namespace rsx const char* file = __builtin_FILE(), const char* func = __builtin_FUNCTION()); - struct tiled_region - { - u32 address; - u32 base; - GcmTileInfo *tile; - u8 *ptr; - - void write(const void *src, u32 width, u32 height, u32 pitch); - void read(void *dst, u32 width, u32 height, u32 pitch); - }; - - struct vertex_array_buffer - { - rsx::vertex_base_type type; - u8 attribute_size; - u8 stride; - std::span 
data; - u8 index; - bool is_be; - }; - - struct vertex_array_register - { - rsx::vertex_base_type type; - u8 attribute_size; - std::array data; - u8 index; - }; - - struct empty_vertex_array - { - u8 index; - }; - - struct draw_array_command - { - u32 __dummy; - }; - - struct draw_indexed_array_command - { - std::span raw_index_buffer; - }; - - struct draw_inlined_array - { - u32 __dummy; - u32 __dummy2; - }; - - struct interleaved_attribute_t - { - u8 index; - bool modulo; - u16 frequency; - }; - - struct interleaved_range_info - { - bool interleaved = false; - bool single_vertex = false; - u32 base_offset = 0; - u32 real_offset_address = 0; - u8 memory_location = 0; - u8 attribute_stride = 0; - - rsx::simple_array locations; - - // Check if we need to upload a full unoptimized range, i.e [0-max_index] - std::pair calculate_required_range(u32 first, u32 count) const; - }; - - enum attribute_buffer_placement : u8 - { - none = 0, - persistent = 1, - transient = 2 - }; - - class vertex_input_layout - { - int m_num_used_blocks = 0; - std::array m_blocks_data{}; - - public: - rsx::simple_array interleaved_blocks{}; // Interleaved blocks to be uploaded as-is - std::vector> volatile_blocks{}; // Volatile data blocks (immediate draw vertex data for example) - rsx::simple_array referenced_registers{}; // Volatile register data - - std::array attribute_placement = fill_array(attribute_buffer_placement::none); - - vertex_input_layout() = default; - - interleaved_range_info* alloc_interleaved_block() - { - auto result = &m_blocks_data[m_num_used_blocks++]; - result->attribute_stride = 0; - result->base_offset = 0; - result->memory_location = 0; - result->real_offset_address = 0; - result->single_vertex = false; - result->locations.clear(); - result->interleaved = true; - return result; - } - - void clear() - { - m_num_used_blocks = 0; - interleaved_blocks.clear(); - volatile_blocks.clear(); - referenced_registers.clear(); - } - - bool validate() const - { - // Criteria: At 
least one array stream has to be defined to feed vertex positions - // This stream cannot be a const register as the vertices cannot create a zero-area primitive - - if (!interleaved_blocks.empty() && interleaved_blocks[0]->attribute_stride != 0) - return true; - - if (!volatile_blocks.empty()) - return true; - - for (u8 index = 0; index < limits::vertex_count; ++index) - { - switch (attribute_placement[index]) - { - case attribute_buffer_placement::transient: - { - // Ignore register reference - if (std::find(referenced_registers.begin(), referenced_registers.end(), index) != referenced_registers.end()) - continue; - - // The source is inline array or immediate draw push buffer - return true; - } - case attribute_buffer_placement::persistent: - { - return true; - } - case attribute_buffer_placement::none: - { - continue; - } - default: - { - fmt::throw_exception("Unreachable"); - } - } - } - - return false; - } - - u32 calculate_interleaved_memory_requirements(u32 first_vertex, u32 vertex_count) const - { - u32 mem = 0; - for (auto &block : interleaved_blocks) - { - const auto range = block->calculate_required_range(first_vertex, vertex_count); - mem += range.second * block->attribute_stride; - } - - return mem; - } - }; - - struct framebuffer_layout - { - ENABLE_BITWISE_SERIALIZATION; - - u16 width; - u16 height; - std::array color_addresses; - std::array color_pitch; - std::array actual_color_pitch; - std::array color_write_enabled; - u32 zeta_address; - u32 zeta_pitch; - u32 actual_zeta_pitch; - bool zeta_write_enabled; - rsx::surface_target target; - rsx::surface_color_format color_format; - rsx::surface_depth_format2 depth_format; - rsx::surface_antialiasing aa_mode; - rsx::surface_raster_type raster_type; - u32 aa_factors[2]; - bool ignore_change; - }; - - struct frame_statistics_t - { - u32 draw_calls; - u32 submit_count; - - s64 setup_time; - s64 vertex_upload_time; - s64 textures_upload_time; - s64 draw_exec_time; - s64 flip_time; - }; - - struct 
display_flip_info_t - { - std::deque buffer_queue; - u32 buffer; - bool skip_frame; - bool emu_flip; - bool in_progress; - frame_statistics_t stats; - - inline void push(u32 _buffer) - { - buffer_queue.push_back(_buffer); - } - - inline bool pop(u32 _buffer) - { - if (buffer_queue.empty()) - { - return false; - } - - do - { - const auto index = buffer_queue.front(); - buffer_queue.pop_front(); - - if (index == _buffer) - { - buffer = _buffer; - return true; - } - } - while (!buffer_queue.empty()); - - // Need to observe this happening in the wild - rsx_log.error("Display queue was discarded while not empty!"); - return false; - } - }; - - class vblank_thread - { - std::shared_ptr>> m_thread; - - public: - vblank_thread() = default; - vblank_thread(const vblank_thread&) = delete; - - void set_thread(std::shared_ptr>> thread); - - vblank_thread& operator=(thread_state); - vblank_thread& operator=(const vblank_thread&) = delete; - }; - struct backend_configuration { bool supports_multidraw; // Draw call batching @@ -493,6 +146,7 @@ namespace rsx u64 tsc; }; + // TODO: This class is a mess, this needs to be broken into smaller chunks, like I did for RSXFIFO and RSXZCULL (kd) class thread : public cpu_thread { u64 timestamp_ctrl = 0; @@ -586,7 +240,7 @@ namespace rsx atomic_t idle_time{ 0 }; // Time spent idling in microseconds u64 last_update_timestamp = 0; // Timestamp of last load update u64 FIFO_idle_timestamp = 0; // Timestamp of when FIFO queue becomes idle - FIFO_state state = FIFO_state::running; + FIFO::state state = FIFO::state::running; u32 approximate_load = 0; u32 sampled_frames = 0; } @@ -736,7 +390,7 @@ namespace rsx /** * Execute a backend local task queue */ - virtual void do_local_task(FIFO_state state); + virtual void do_local_task(FIFO::state state); virtual void emit_geometry(u32) {} @@ -778,7 +432,7 @@ namespace rsx // sync void sync(); flags32_t read_barrier(u32 memory_address, u32 memory_range, bool unconditional); - virtual void 
sync_hint(FIFO_hint hint, reports::sync_hint_payload_t payload); + virtual void sync_hint(FIFO::interrupt_hint hint, reports::sync_hint_payload_t payload); virtual bool release_GCM_label(u32 /*address*/, u32 /*value*/) { return false; } std::span get_raw_index_array(const draw_clause& draw_indexed_clause) const; @@ -899,126 +553,4 @@ namespace rsx { return g_fxo->try_get(); } - - template - class reservation_lock - { - u32 addr = 0; - u32 length = 0; - - inline void lock_range(u32 addr, u32 length) - { - if (!get_current_renderer()->iomap_table.lock(addr, length, get_current_cpu_thread())) - { - length = 0; - } - - this->addr = addr; - this->length = length; - } - - public: - reservation_lock(u32 addr, u32 length) - { - if (g_cfg.core.rsx_accurate_res_access && - addr < constants::local_mem_base) - { - lock_range(addr, length); - } - } - - reservation_lock(u32 addr, u32 length, bool setting) - { - if (setting) - { - lock_range(addr, length); - } - } - - // Multi-range lock. If ranges overlap, the combined range will be acquired. - // If ranges do not overlap, the first range that is in main memory will be acquired. - reservation_lock(u32 dst_addr, u32 dst_length, u32 src_addr, u32 src_length) - { - if (g_cfg.core.rsx_accurate_res_access) - { - const auto range1 = utils::address_range::start_length(dst_addr, dst_length); - const auto range2 = utils::address_range::start_length(src_addr, src_length); - utils::address_range target_range; - - if (!range1.overlaps(range2)) [[likely]] - { - target_range = (dst_addr < constants::local_mem_base) ? 
range1 : range2; - } - else - { - // Very unlikely - target_range = range1.get_min_max(range2); - } - - if (target_range.start < constants::local_mem_base) - { - lock_range(target_range.start, target_range.length()); - } - } - } - - // Very special utility for batched transfers (SPU related) - template - void update_if_enabled(u32 addr, u32 _length, const std::add_pointer_t& lock_release = std::add_pointer_t{}) - { - // This check is not perfect but it covers the important cases fast (this check is only an optimization - forcing true disables it) - if (length && (this->addr / rsx_iomap_table::c_lock_stride != addr / rsx_iomap_table::c_lock_stride || (addr % rsx_iomap_table::c_lock_stride + _length) > rsx_iomap_table::c_lock_stride) && _length > 1) - { - if constexpr (!std::is_void_v) - { - // See SPUThread.cpp - lock_release->release(0); - } - - unlock(); - lock_range(addr, _length); - } - } - - void unlock(bool destructor = false) - { - if (length) - { - get_current_renderer()->iomap_table.unlock(addr, length); - - if (!destructor) - { - length = 0; - } - } - } - - ~reservation_lock() - { - unlock(true); - } - }; - - class eng_lock - { - rsx::thread* pthr; - public: - eng_lock(rsx::thread* target) - :pthr(target) - { - if (pthr->is_current_thread()) - { - pthr = nullptr; - } - else - { - pthr->pause(); - } - } - - ~eng_lock() - { - if (pthr) pthr->unpause(); - } - }; } diff --git a/rpcs3/Emu/RSX/RSXZCULL.cpp b/rpcs3/Emu/RSX/RSXZCULL.cpp index 0d41f26d0c..cdc1d14b0a 100644 --- a/rpcs3/Emu/RSX/RSXZCULL.cpp +++ b/rpcs3/Emu/RSX/RSXZCULL.cpp @@ -1,4 +1,6 @@ #include "stdafx.h" +#include "Core/RSXEngLock.hpp" +#include "Core/RSXReservationLock.hpp" #include "RSXThread.h" namespace rsx @@ -422,7 +424,7 @@ namespace rsx if (It->query->sync_tag > m_sync_tag) { // rsx_log.trace("[Performance warning] Query hint emit during sync command."); - ptimer->sync_hint(FIFO_hint::hint_zcull_sync, { .query = It->query }); + ptimer->sync_hint(FIFO::interrupt_hint::zcull_sync, { .query 
= It->query }); } break; @@ -531,7 +533,7 @@ namespace rsx { if (It->query->num_draws && It->query->sync_tag > m_sync_tag) { - ptimer->sync_hint(FIFO_hint::hint_zcull_sync, { .query = It->query }); + ptimer->sync_hint(FIFO::interrupt_hint::zcull_sync, { .query = It->query }); ensure(It->query->sync_tag <= m_sync_tag); } @@ -556,7 +558,7 @@ namespace rsx const auto elapsed = m_tsc - front.query->timestamp; if (elapsed > max_zcull_delay_us) { - ptimer->sync_hint(FIFO_hint::hint_zcull_sync, { .query = front.query }); + ptimer->sync_hint(FIFO::interrupt_hint::zcull_sync, { .query = front.query }); ensure(front.query->sync_tag <= m_sync_tag); } @@ -704,7 +706,7 @@ namespace rsx { if (query->sync_tag > m_sync_tag) [[unlikely]] { - ptimer->sync_hint(FIFO_hint::hint_zcull_sync, { .query = query }); + ptimer->sync_hint(FIFO::interrupt_hint::zcull_sync, { .query = query }); ensure(m_sync_tag >= query->sync_tag); } } diff --git a/rpcs3/Emu/RSX/VK/VKGSRender.cpp b/rpcs3/Emu/RSX/VK/VKGSRender.cpp index aa476a4b12..bb993b221c 100644 --- a/rpcs3/Emu/RSX/VK/VKGSRender.cpp +++ b/rpcs3/Emu/RSX/VK/VKGSRender.cpp @@ -673,7 +673,7 @@ VKGSRender::~VKGSRender() // Flush DMA queue while (!g_fxo->get().sync()) { - do_local_task(rsx::FIFO_state::lock_wait); + do_local_task(rsx::FIFO::state::lock_wait); } //Wait for device to finish up with resources @@ -895,7 +895,7 @@ void VKGSRender::on_semaphore_acquire_wait() (async_flip_requested & flip_request::emu_requested) || (m_queue_status & flush_queue_state::deadlock)) { - do_local_task(rsx::FIFO_state::lock_wait); + do_local_task(rsx::FIFO::state::lock_wait); } } @@ -1602,7 +1602,7 @@ bool VKGSRender::release_GCM_label(u32 address, u32 args) return true; } -void VKGSRender::sync_hint(rsx::FIFO_hint hint, rsx::reports::sync_hint_payload_t payload) +void VKGSRender::sync_hint(rsx::FIFO::interrupt_hint hint, rsx::reports::sync_hint_payload_t payload) { rsx::thread::sync_hint(hint, payload); @@ -1615,7 +1615,7 @@ void 
VKGSRender::sync_hint(rsx::FIFO_hint hint, rsx::reports::sync_hint_payload_ // Occlusion test result evaluation is coming up, avoid a hard sync switch (hint) { - case rsx::FIFO_hint::hint_conditional_render_eval: + case rsx::FIFO::interrupt_hint::conditional_render_eval: { // If a flush request is already enqueued, do nothing if (m_flush_requests.pending()) @@ -1645,7 +1645,7 @@ void VKGSRender::sync_hint(rsx::FIFO_hint hint, rsx::reports::sync_hint_payload_ m_last_cond_render_eval_hint = now; break; } - case rsx::FIFO_hint::hint_zcull_sync: + case rsx::FIFO::interrupt_hint::zcull_sync: { // Check if the required report is synced to this CB auto& data = m_occlusion_map[payload.query->driver_handle]; @@ -1672,7 +1672,7 @@ void VKGSRender::sync_hint(rsx::FIFO_hint hint, rsx::reports::sync_hint_payload_ } } -void VKGSRender::do_local_task(rsx::FIFO_state state) +void VKGSRender::do_local_task(rsx::FIFO::state state) { if (m_queue_status & flush_queue_state::deadlock) { @@ -1702,7 +1702,7 @@ void VKGSRender::do_local_task(rsx::FIFO_state state) m_flush_queue_mutex.unlock(); } } - else if (!in_begin_end && state != rsx::FIFO_state::lock_wait) + else if (!in_begin_end && state != rsx::FIFO::state::lock_wait) { if (m_graphics_state & rsx::pipeline_state::framebuffer_reads_dirty) { @@ -1717,11 +1717,11 @@ void VKGSRender::do_local_task(rsx::FIFO_state state) switch (state) { - case rsx::FIFO_state::lock_wait: + case rsx::FIFO::state::lock_wait: // Critical check finished return; - //case rsx::FIFO_state::spinning: - //case rsx::FIFO_state::empty: + //case rsx::FIFO::state::spinning: + //case rsx::FIFO::state::empty: // We have some time, check the present queue //check_present_status(); //break; diff --git a/rpcs3/Emu/RSX/VK/VKGSRender.h b/rpcs3/Emu/RSX/VK/VKGSRender.h index 383d9f7d79..4af1d1d80f 100644 --- a/rpcs3/Emu/RSX/VK/VKGSRender.h +++ b/rpcs3/Emu/RSX/VK/VKGSRender.h @@ -248,7 +248,7 @@ public: void set_scissor(bool clip_viewport); void bind_viewport(); - void 
sync_hint(rsx::FIFO_hint hint, rsx::reports::sync_hint_payload_t payload) override; + void sync_hint(rsx::FIFO::interrupt_hint hint, rsx::reports::sync_hint_payload_t payload) override; bool release_GCM_label(u32 address, u32 data) override; void begin_occlusion_query(rsx::reports::occlusion_query_info* query) override; @@ -282,7 +282,7 @@ protected: void renderctl(u32 request_code, void* args) override; - void do_local_task(rsx::FIFO_state state) override; + void do_local_task(rsx::FIFO::state state) override; bool scaled_image_from_memory(rsx::blit_src_info& src, rsx::blit_dst_info& dst, bool interpolate) override; void notify_tile_unbound(u32 tile) override; diff --git a/rpcs3/Emu/RSX/rsx_methods.cpp b/rpcs3/Emu/RSX/rsx_methods.cpp index e588697a8c..87d1764944 100644 --- a/rpcs3/Emu/RSX/rsx_methods.cpp +++ b/rpcs3/Emu/RSX/rsx_methods.cpp @@ -4,6 +4,7 @@ #include "rsx_utils.h" #include "rsx_decode.h" #include "Common/time.hpp" +#include "Core/RSXReservationLock.hpp" #include "Emu/Cell/PPUCallback.h" #include "Emu/Cell/lv2/sys_rsx.h" #include "Emu/RSX/Common/BufferUtils.h" @@ -1278,6 +1279,11 @@ namespace rsx out_pitch = out_bpp * out_w; } + if (in_pitch == 0) + { + in_pitch = in_bpp * in_w; + } + if (in_bpp != out_bpp) { is_block_transfer = false; @@ -1680,12 +1686,6 @@ namespace rsx const u8 in_format = method_registers.nv0039_input_format(); const u32 notify = arg; - // The existing GCM commands use only the value 0x1 for inFormat and outFormat - if (in_format != 0x01 || out_format != 0x01) - { - rsx_log.error("NV0039_BUFFER_NOTIFY: Unsupported format: inFormat=%d, outFormat=%d", in_format, out_format); - } - if (!line_count || !line_length) { rsx_log.warning("NV0039_BUFFER_NOTIFY NOPed out: pitch(in=0x%x, out=0x%x), line(len=0x%x, cnt=0x%x), fmt(in=0x%x, out=0x%x), notify=0x%x", @@ -1734,7 +1734,28 @@ namespace rsx (dst_offset >= src_offset && dst_offset < src_max); }(); - if (is_overlapping) + if (in_format > 1 || out_format > 1) [[ unlikely ]] + { + // The 
formats are just input channel strides. You can use this to do cool tricks like gathering channels + // Very rare, only seen in use by Destiny + // TODO: Hw accel + for (u32 row = 0; row < line_count; ++row) + { + auto dst_ptr = dst; + auto src_ptr = src; + while (src_ptr < src + line_length) + { + *dst_ptr = *src_ptr; + + src_ptr += in_format; + dst_ptr += out_format; + } + + dst += out_pitch; + src += in_pitch; + } + } + else if (is_overlapping) [[ unlikely ]] { if (is_block_transfer) { diff --git a/rpcs3/emucore.vcxproj b/rpcs3/emucore.vcxproj index fa8fb35f1f..9ff45e96e5 100644 --- a/rpcs3/emucore.vcxproj +++ b/rpcs3/emucore.vcxproj @@ -525,6 +525,12 @@ + + + + + + diff --git a/rpcs3/emucore.vcxproj.filters b/rpcs3/emucore.vcxproj.filters index 0bfea0134d..6002b65491 100644 --- a/rpcs3/emucore.vcxproj.filters +++ b/rpcs3/emucore.vcxproj.filters @@ -76,6 +76,9 @@ {bc97b324-1eea-445a-8fa9-6fc49e3df47c} + + {99b3a1c9-93ea-4498-86b0-1000793013fa} + @@ -2206,6 +2209,24 @@ Emu\Io + + Emu\GPU\RSX\Core + + + Emu\GPU\RSX\Core + + + Emu\GPU\RSX\Core + + + Emu\GPU\RSX\Core + + + Emu\GPU\RSX\Core + + + Emu\GPU\RSX\Core +