diff --git a/rpcs3/Emu/RSX/Common/BufferUtils.cpp b/rpcs3/Emu/RSX/Common/BufferUtils.cpp index a31a6b3411..f5a95095ab 100644 --- a/rpcs3/Emu/RSX/Common/BufferUtils.cpp +++ b/rpcs3/Emu/RSX/Common/BufferUtils.cpp @@ -77,8 +77,10 @@ namespace X = X << 5; return{ X, Y, Z, 1 }; } +} - inline void stream_data_to_memory_swapped_u32(void *dst, const void *src, u32 vertex_count, u8 stride) + template + void stream_data_to_memory_swapped_u32(void *dst, const void *src, u32 vertex_count, u8 stride) { const __m128i mask = _mm_set_epi8( 0xC, 0xD, 0xE, 0xF, @@ -99,7 +101,15 @@ namespace { const __m128i vector = _mm_loadu_si128(src_ptr); const __m128i shuffled_vector = ssse3_shuffle_epi8(vector, mask); - _mm_stream_si128(dst_ptr, shuffled_vector); + + if constexpr (!unaligned) + { + _mm_stream_si128(dst_ptr, shuffled_vector); + } + else + { + _mm_storeu_si128(dst_ptr, shuffled_vector); + } src_ptr++; dst_ptr++; @@ -112,7 +122,15 @@ namespace const __m128i vec0 = _mm_loadu_si128(src_ptr); const __m128i vec1 = _mm_or_si128(_mm_slli_epi16(vec0, 8), _mm_srli_epi16(vec0, 8)); const __m128i vec2 = _mm_or_si128(_mm_slli_epi32(vec1, 16), _mm_srli_epi32(vec1, 16)); - _mm_stream_si128(dst_ptr, vec2); + + if constexpr (!unaligned) + { + _mm_stream_si128(dst_ptr, vec2); + } + else + { + _mm_storeu_si128(dst_ptr, vec2); + } src_ptr++; dst_ptr++; @@ -129,6 +147,11 @@ namespace } } + template void stream_data_to_memory_swapped_u32(void *, const void *, u32, u8); + template void stream_data_to_memory_swapped_u32(void*, const void*, u32, u8); + +namespace +{ inline void stream_data_to_memory_swapped_u16(void *dst, const void *src, u32 vertex_count, u8 stride) { const __m128i mask = _mm_set_epi8( diff --git a/rpcs3/Emu/RSX/Common/BufferUtils.h b/rpcs3/Emu/RSX/Common/BufferUtils.h index 5cf04e4134..a6b56f711e 100644 --- a/rpcs3/Emu/RSX/Common/BufferUtils.h +++ b/rpcs3/Emu/RSX/Common/BufferUtils.h @@ -55,3 +55,11 @@ void stream_vector(void *dst, u32 x, u32 y, u32 z, u32 w); * Stream a 128 bits vector from src to dst. */ void stream_vector_from_memory(void *dst, void *src); + +/** + * Stream and swap data in u32 units. + */ +template +void stream_data_to_memory_swapped_u32(void *dst, const void *src, u32 vertex_count, u8 stride); + + diff --git a/rpcs3/Emu/RSX/RSXFIFO.cpp b/rpcs3/Emu/RSX/RSXFIFO.cpp index 1190b44aff..480dbb0fc9 100644 --- a/rpcs3/Emu/RSX/RSXFIFO.cpp +++ b/rpcs3/Emu/RSX/RSXFIFO.cpp @@ -524,32 +524,44 @@ namespace rsx rsx::frame_capture_data::replay_command replay_cmd; replay_cmd.rsx_command = std::make_pair((reg << 2) | (1u << 18), value); - frame_capture.replay_commands.push_back(replay_cmd); - auto it = frame_capture.replay_commands.back(); + auto& commands = frame_capture.replay_commands; + commands.push_back(replay_cmd); switch (reg) { case NV3089_IMAGE_IN: - capture::capture_image_in(this, it); + capture::capture_image_in(this, commands.back()); break; case NV0039_BUFFER_NOTIFY: - capture::capture_buffer_notify(this, it); + capture::capture_buffer_notify(this, commands.back()); break; default: { - // Use legacy logic for NV308A_COLOR - enqueue leading command with count + static constexpr std::array, 2> ranges + {{ + {NV308A_COLOR, 0x700}, + {NV4097_SET_TRANSFORM_PROGRAM, 32} + }}; + + // Use legacy logic - enqueue leading command with count // Then enqueue each command arg alone with a no-op command - if (reg >= NV308A_COLOR && reg < NV308A_COLOR + 0x700) + for (const auto& range : ranges) { - const u32 remaining = std::min(fifo_ctrl->get_remaining_args_count(), (NV308A_COLOR + 0x700) - reg); - - it.rsx_command.first = (fifo_ctrl->last_cmd() & RSX_METHOD_NON_INCREMENT_CMD_MASK) | (reg << 2) | (remaining << 18); - - for (u32 i = 0; i < remaining && fifo_ctrl->get_pos() + (i + 1) * 4 != (ctrl->put & ~3); i++) + if (reg >= range.first && reg < range.first + range.second) { - replay_cmd.rsx_command = std::make_pair(0, vm::read32(fifo_ctrl->get_current_arg_ptr() + (i + 1) * 4)); + const u32 remaining = std::min(fifo_ctrl->get_remaining_args_count() + 1, + (fifo_ctrl->last_cmd() & RSX_METHOD_NON_INCREMENT_CMD_MASK) ? UINT32_MAX : (range.first + range.second) - reg); - frame_capture.replay_commands.push_back(replay_cmd); + commands.back().rsx_command.first = (fifo_ctrl->last_cmd() & RSX_METHOD_NON_INCREMENT_CMD_MASK) | (reg << 2) | (remaining << 18); + + for (u32 i = 1; i < remaining && fifo_ctrl->get_pos() + (i - 1) * 4 != (ctrl->put & ~3); i++) + { + replay_cmd.rsx_command = std::make_pair(0, vm::read32(fifo_ctrl->get_current_arg_ptr() + (i * 4))); + + commands.push_back(replay_cmd); + } + + break; } } diff --git a/rpcs3/Emu/RSX/rsx_methods.cpp b/rpcs3/Emu/RSX/rsx_methods.cpp index 5b51f8943c..495b2dd426 100644 --- a/rpcs3/Emu/RSX/rsx_methods.cpp +++ b/rpcs3/Emu/RSX/rsx_methods.cpp @@ -6,6 +6,7 @@ #include "rsx_decode.h" #include "Emu/Cell/PPUCallback.h" #include "Emu/Cell/lv2/sys_rsx.h" +#include "Emu/RSX/Common/BufferUtils.h" #include #include @@ -450,17 +451,30 @@ namespace rsx { static void impl(thread* rsx, u32 _reg, u32 arg) { - if (rsx::method_registers.transform_program_load() >= 512) + // Get real args count + const u32 count = std::min({rsx->fifo_ctrl->get_remaining_args_count() + 1, + static_cast(((rsx->ctrl->put & ~3ull) - (rsx->fifo_ctrl->get_pos() - 4)) / 4), 32 - index}); + + const u32 load_pos = rsx::method_registers.transform_program_load(); + + u32 rcount = count; + + if (const u32 max = load_pos * 4 + rcount + (index % 4); + max > 512 * 4) { // PS3 seems to allow exceeding the program buffer by upto 32 instructions before crashing // Discard the "excess" instructions to not overflow our transform program buffer // TODO: Check if the instructions in the overflow area are executed by PS3 rsx_log.warning("Program buffer overflow!"); - return; + rcount -= max - (512 * 4); } - method_registers.commit_4_transform_program_instructions(index); + stream_data_to_memory_swapped_u32(&rsx::method_registers.transform_program[load_pos * 4 + index % 4] + , vm::base(rsx->fifo_ctrl->get_current_arg_ptr()), rcount, 4); + rsx->m_graphics_state |= rsx::pipeline_state::vertex_program_dirty; + rsx::method_registers.transform_program_load_set(load_pos + ((rcount + index % 4) / 4)); + rsx->fifo_ctrl->skip_methods(count - 1); } }; @@ -2994,7 +3008,7 @@ namespace rsx bind_range(); bind_range(); bind_range(); - bind_range(); + bind_range(); bind(); bind(); bind(); diff --git a/rpcs3/Emu/RSX/rsx_methods.h b/rpcs3/Emu/RSX/rsx_methods.h index 890767aa17..4ea03957b9 100644 --- a/rpcs3/Emu/RSX/rsx_methods.h +++ b/rpcs3/Emu/RSX/rsx_methods.h @@ -1610,20 +1610,14 @@ namespace rsx return u16(registers[NV308A_SIZE_OUT] & 0xFFFF); } - u32 transform_program_load() + u32 transform_program_load() const { return registers[NV4097_SET_TRANSFORM_PROGRAM_LOAD]; } - void commit_4_transform_program_instructions(u32 index) + void transform_program_load_set(u32 value) { - u32& load = registers[NV4097_SET_TRANSFORM_PROGRAM_LOAD]; - - transform_program[load * 4] = registers[NV4097_SET_TRANSFORM_PROGRAM + index * 4]; - transform_program[load * 4 + 1] = registers[NV4097_SET_TRANSFORM_PROGRAM + index * 4 + 1]; - transform_program[load * 4 + 2] = registers[NV4097_SET_TRANSFORM_PROGRAM + index * 4 + 2]; - transform_program[load * 4 + 3] = registers[NV4097_SET_TRANSFORM_PROGRAM + index * 4 + 3]; - load++; + registers[NV4097_SET_TRANSFORM_PROGRAM_LOAD] = value; } u32 transform_constant_load()