rsx: Batch vertex program load methods

This commit is contained in:
Eladash 2020-03-28 12:53:30 +03:00 committed by kd-11
parent 85c4321c24
commit d97e9f7b4a
5 changed files with 80 additions and 29 deletions

View File

@ -77,8 +77,10 @@ namespace
X = X << 5;
return{ X, Y, Z, 1 };
}
}
inline void stream_data_to_memory_swapped_u32(void *dst, const void *src, u32 vertex_count, u8 stride)
template <bool unaligned>
void stream_data_to_memory_swapped_u32(void *dst, const void *src, u32 vertex_count, u8 stride)
{
const __m128i mask = _mm_set_epi8(
0xC, 0xD, 0xE, 0xF,
@ -99,7 +101,15 @@ namespace
{
const __m128i vector = _mm_loadu_si128(src_ptr);
const __m128i shuffled_vector = ssse3_shuffle_epi8(vector, mask);
if constexpr (!unaligned)
{
_mm_stream_si128(dst_ptr, shuffled_vector);
}
else
{
_mm_storeu_si128(dst_ptr, shuffled_vector);
}
src_ptr++;
dst_ptr++;
@ -112,7 +122,15 @@ namespace
const __m128i vec0 = _mm_loadu_si128(src_ptr);
const __m128i vec1 = _mm_or_si128(_mm_slli_epi16(vec0, 8), _mm_srli_epi16(vec0, 8));
const __m128i vec2 = _mm_or_si128(_mm_slli_epi32(vec1, 16), _mm_srli_epi32(vec1, 16));
if constexpr (!unaligned)
{
_mm_stream_si128(dst_ptr, vec2);
}
else
{
_mm_storeu_si128(dst_ptr, vec2);
}
src_ptr++;
dst_ptr++;
@ -129,6 +147,11 @@ namespace
}
}
template void stream_data_to_memory_swapped_u32<false>(void *, const void *, u32, u8);
template void stream_data_to_memory_swapped_u32<true>(void*, const void*, u32, u8);
namespace
{
inline void stream_data_to_memory_swapped_u16(void *dst, const void *src, u32 vertex_count, u8 stride)
{
const __m128i mask = _mm_set_epi8(

View File

@ -55,3 +55,11 @@ void stream_vector(void *dst, u32 x, u32 y, u32 z, u32 w);
* Stream a 128 bits vector from src to dst.
*/
void stream_vector_from_memory(void *dst, void *src);
/**
* Stream and swap data in u32 units.
*/
template <bool unaligned = false>
void stream_data_to_memory_swapped_u32(void *dst, const void *src, u32 vertex_count, u8 stride);

View File

@ -524,32 +524,44 @@ namespace rsx
rsx::frame_capture_data::replay_command replay_cmd;
replay_cmd.rsx_command = std::make_pair((reg << 2) | (1u << 18), value);
frame_capture.replay_commands.push_back(replay_cmd);
auto it = frame_capture.replay_commands.back();
auto& commands = frame_capture.replay_commands;
commands.push_back(replay_cmd);
switch (reg)
{
case NV3089_IMAGE_IN:
capture::capture_image_in(this, it);
capture::capture_image_in(this, commands.back());
break;
case NV0039_BUFFER_NOTIFY:
capture::capture_buffer_notify(this, it);
capture::capture_buffer_notify(this, commands.back());
break;
default:
{
// Use legacy logic for NV308A_COLOR - enqueue leading command with count
static constexpr std::array<std::pair<u32, u32>, 2> ranges
{{
{NV308A_COLOR, 0x700},
{NV4097_SET_TRANSFORM_PROGRAM, 32}
}};
// Use legacy logic - enqueue leading command with count
// Then enqueue each command arg alone with a no-op command
if (reg >= NV308A_COLOR && reg < NV308A_COLOR + 0x700)
for (const auto& range : ranges)
{
const u32 remaining = std::min<u32>(fifo_ctrl->get_remaining_args_count(), (NV308A_COLOR + 0x700) - reg);
it.rsx_command.first = (fifo_ctrl->last_cmd() & RSX_METHOD_NON_INCREMENT_CMD_MASK) | (reg << 2) | (remaining << 18);
for (u32 i = 0; i < remaining && fifo_ctrl->get_pos() + (i + 1) * 4 != (ctrl->put & ~3); i++)
if (reg >= range.first && reg < range.first + range.second)
{
replay_cmd.rsx_command = std::make_pair(0, vm::read32(fifo_ctrl->get_current_arg_ptr() + (i + 1) * 4));
const u32 remaining = std::min<u32>(fifo_ctrl->get_remaining_args_count() + 1,
(fifo_ctrl->last_cmd() & RSX_METHOD_NON_INCREMENT_CMD_MASK) ? UINT32_MAX : (range.first + range.second) - reg);
frame_capture.replay_commands.push_back(replay_cmd);
commands.back().rsx_command.first = (fifo_ctrl->last_cmd() & RSX_METHOD_NON_INCREMENT_CMD_MASK) | (reg << 2) | (remaining << 18);
for (u32 i = 1; i < remaining && fifo_ctrl->get_pos() + (i - 1) * 4 != (ctrl->put & ~3); i++)
{
replay_cmd.rsx_command = std::make_pair(0, vm::read32(fifo_ctrl->get_current_arg_ptr() + (i * 4)));
commands.push_back(replay_cmd);
}
break;
}
}

View File

@ -6,6 +6,7 @@
#include "rsx_decode.h"
#include "Emu/Cell/PPUCallback.h"
#include "Emu/Cell/lv2/sys_rsx.h"
#include "Emu/RSX/Common/BufferUtils.h"
#include <thread>
#include <atomic>
@ -450,17 +451,30 @@ namespace rsx
{
static void impl(thread* rsx, u32 _reg, u32 arg)
{
if (rsx::method_registers.transform_program_load() >= 512)
// Get real args count
const u32 count = std::min<u32>({rsx->fifo_ctrl->get_remaining_args_count() + 1,
static_cast<u32>(((rsx->ctrl->put & ~3ull) - (rsx->fifo_ctrl->get_pos() - 4)) / 4), 32 - index});
const u32 load_pos = rsx::method_registers.transform_program_load();
u32 rcount = count;
if (const u32 max = load_pos * 4 + rcount + (index % 4);
max > 512 * 4)
{
// PS3 seems to allow exceeding the program buffer by upto 32 instructions before crashing
// Discard the "excess" instructions to not overflow our transform program buffer
// TODO: Check if the instructions in the overflow area are executed by PS3
rsx_log.warning("Program buffer overflow!");
return;
rcount -= max - (512 * 4);
}
method_registers.commit_4_transform_program_instructions(index);
stream_data_to_memory_swapped_u32<true>(&rsx::method_registers.transform_program[load_pos * 4 + index % 4]
, vm::base(rsx->fifo_ctrl->get_current_arg_ptr()), rcount, 4);
rsx->m_graphics_state |= rsx::pipeline_state::vertex_program_dirty;
rsx::method_registers.transform_program_load_set(load_pos + ((rcount + index % 4) / 4));
rsx->fifo_ctrl->skip_methods(count - 1);
}
};
@ -2994,7 +3008,7 @@ namespace rsx
bind_range<NV4097_SET_VERTEX_DATA2S_M, 1, 16, nv4097::set_vertex_data2s_m>();
bind_range<NV4097_SET_VERTEX_DATA4S_M, 1, 32, nv4097::set_vertex_data4s_m>();
bind_range<NV4097_SET_TRANSFORM_CONSTANT, 1, 32, nv4097::set_transform_constant>();
bind_range<NV4097_SET_TRANSFORM_PROGRAM + 3, 4, 32 / 4, nv4097::set_transform_program>();
bind_range<NV4097_SET_TRANSFORM_PROGRAM, 1, 32, nv4097::set_transform_program>();
bind<NV4097_GET_REPORT, nv4097::get_report>();
bind<NV4097_CLEAR_REPORT_VALUE, nv4097::clear_report_value>();
bind<NV4097_SET_SURFACE_CLIP_HORIZONTAL, nv4097::set_surface_dirty_bit>();

View File

@ -1610,20 +1610,14 @@ namespace rsx
return u16(registers[NV308A_SIZE_OUT] & 0xFFFF);
}
u32 transform_program_load()
u32 transform_program_load() const
{
return registers[NV4097_SET_TRANSFORM_PROGRAM_LOAD];
}
void commit_4_transform_program_instructions(u32 index)
void transform_program_load_set(u32 value)
{
u32& load = registers[NV4097_SET_TRANSFORM_PROGRAM_LOAD];
transform_program[load * 4] = registers[NV4097_SET_TRANSFORM_PROGRAM + index * 4];
transform_program[load * 4 + 1] = registers[NV4097_SET_TRANSFORM_PROGRAM + index * 4 + 1];
transform_program[load * 4 + 2] = registers[NV4097_SET_TRANSFORM_PROGRAM + index * 4 + 2];
transform_program[load * 4 + 3] = registers[NV4097_SET_TRANSFORM_PROGRAM + index * 4 + 3];
load++;
registers[NV4097_SET_TRANSFORM_PROGRAM_LOAD] = value;
}
u32 transform_constant_load()