From 39b2854b981aadec1576254ad5e4d91dc21c9709 Mon Sep 17 00:00:00 2001 From: Pokechu22 Date: Thu, 14 Apr 2022 12:01:57 -0700 Subject: [PATCH] VertexLoader: Convert count register to remaining register This more accurately represents what's going on, and also ends at 0 instead of 1, making some indexing operations easier. This also changes it so that position_matrix_index_cache actually starts from index 0 instead of index 1. --- Source/Core/VideoCommon/VertexLoader.cpp | 6 ++-- Source/Core/VideoCommon/VertexLoader.h | 2 +- Source/Core/VideoCommon/VertexLoaderARM64.cpp | 25 +++++++-------- .../Core/VideoCommon/VertexLoaderManager.cpp | 10 +++--- Source/Core/VideoCommon/VertexLoaderManager.h | 2 +- Source/Core/VideoCommon/VertexLoaderX64.cpp | 31 +++++++++++-------- .../VideoCommon/VertexLoader_Position.cpp | 8 ++--- Source/Core/VideoCommon/VertexManagerBase.cpp | 2 +- 8 files changed, 44 insertions(+), 42 deletions(-) diff --git a/Source/Core/VideoCommon/VertexLoader.cpp b/Source/Core/VideoCommon/VertexLoader.cpp index c703c96508..751778dd24 100644 --- a/Source/Core/VideoCommon/VertexLoader.cpp +++ b/Source/Core/VideoCommon/VertexLoader.cpp @@ -22,8 +22,8 @@ u8* g_vertex_manager_write_ptr; static void PosMtx_ReadDirect_UByte(VertexLoader* loader) { u32 posmtx = DataRead() & 0x3f; - if (loader->m_counter < 3) - VertexLoaderManager::position_matrix_index_cache[loader->m_counter + 1] = posmtx; + if (loader->m_remaining < 3) + VertexLoaderManager::position_matrix_index_cache[loader->m_remaining] = posmtx; DataWrite(posmtx); PRIM_LOG("posmtx: {}, ", posmtx); } @@ -257,7 +257,7 @@ int VertexLoader::RunVertices(DataReader src, DataReader dst, int count) m_numLoadedVertices += count; m_skippedVertices = 0; - for (m_counter = count - 1; m_counter >= 0; m_counter--) + for (m_remaining = count - 1; m_remaining >= 0; m_remaining--) { m_tcIndex = 0; m_colIndex = 0; diff --git a/Source/Core/VideoCommon/VertexLoader.h b/Source/Core/VideoCommon/VertexLoader.h index d1f80f80c1..b3bb0b270a 100644 --- a/Source/Core/VideoCommon/VertexLoader.h +++ b/Source/Core/VideoCommon/VertexLoader.h @@ -35,7 +35,7 @@ public: int m_texmtxread; bool m_vertexSkip; int m_skippedVertices; - int m_counter; + int m_remaining; private: // Pipeline. diff --git a/Source/Core/VideoCommon/VertexLoaderARM64.cpp b/Source/Core/VideoCommon/VertexLoaderARM64.cpp index a191dbb2c8..ab9ced5d8a 100644 --- a/Source/Core/VideoCommon/VertexLoaderARM64.cpp +++ b/Source/Core/VideoCommon/VertexLoaderARM64.cpp @@ -14,7 +14,7 @@ using namespace Arm64Gen; constexpr ARM64Reg src_reg = ARM64Reg::X0; constexpr ARM64Reg dst_reg = ARM64Reg::X1; -constexpr ARM64Reg count_reg = ARM64Reg::W2; +constexpr ARM64Reg remaining_reg = ARM64Reg::W2; constexpr ARM64Reg skipped_reg = ARM64Reg::W17; constexpr ARM64Reg scratch1_reg = ARM64Reg::W16; constexpr ARM64Reg scratch2_reg = ARM64Reg::W15; @@ -209,13 +209,10 @@ int VertexLoaderARM64::ReadVertex(VertexComponentFormat attribute, ComponentForm // Z-Freeze if (native_format == &m_native_vtx_decl.position) { - CMP(count_reg, 3); - FixupBranch dont_store = B(CC_GT); + CMP(remaining_reg, 3); + FixupBranch dont_store = B(CC_GE); MOVP2R(EncodeRegTo64(scratch2_reg), VertexLoaderManager::position_cache.data()); - ADD(EncodeRegTo64(scratch1_reg), EncodeRegTo64(scratch2_reg), EncodeRegTo64(count_reg), - ArithOption(EncodeRegTo64(count_reg), ShiftType::LSL, 4)); - m_float_emit.STUR(write_size, coords, EncodeRegTo64(scratch1_reg), - -int(sizeof(decltype(VertexLoaderManager::position_cache[0])))); + m_float_emit.STR(128, coords, EncodeRegTo64(scratch2_reg), ArithOption(remaining_reg, true)); SetJumpTarget(dont_store); } @@ -404,7 +401,7 @@ void VertexLoaderARM64::GenerateVertexLoader() AlignCode16(); if (IsIndexed(m_VtxDesc.low.Position)) MOV(skipped_reg, ARM64Reg::WZR); - MOV(saved_count, count_reg); + ADD(saved_count, remaining_reg, 1); MOVP2R(stride_reg, g_main_cp_state.array_strides.data()); MOVP2R(arraybase_reg, VertexLoaderManager::cached_arraybases.data()); @@ -421,10 +418,10 @@ void VertexLoaderARM64::GenerateVertexLoader() STR(IndexType::Unsigned, scratch1_reg, dst_reg, m_dst_ofs); // Z-Freeze - CMP(count_reg, 3); - FixupBranch dont_store = B(CC_GT); + CMP(remaining_reg, 3); + FixupBranch dont_store = B(CC_GE); MOVP2R(EncodeRegTo64(scratch2_reg), VertexLoaderManager::position_matrix_index_cache.data()); - STR(scratch1_reg, EncodeRegTo64(scratch2_reg), ArithOption(count_reg, true)); + STR(scratch1_reg, EncodeRegTo64(scratch2_reg), ArithOption(remaining_reg, true)); SetJumpTarget(dont_store); m_native_vtx_decl.posmtx.components = 4; @@ -584,8 +581,8 @@ void VertexLoaderARM64::GenerateVertexLoader() const u8* cont = GetCodePtr(); ADD(src_reg, src_reg, m_src_ofs); - SUB(count_reg, count_reg, 1); - CBNZ(count_reg, loop_start); + SUBS(remaining_reg, remaining_reg, 1); + B(CCFlags::CC_GE, loop_start); if (IsIndexed(m_VtxDesc.low.Position)) { @@ -612,5 +609,5 @@ int VertexLoaderARM64::RunVertices(DataReader src, DataReader dst, int count) { m_numLoadedVertices += count; return ((int (*)(u8 * src, u8 * dst, int count)) region)(src.GetPointer(), dst.GetPointer(), - count); + count - 1); } diff --git a/Source/Core/VideoCommon/VertexLoaderManager.cpp b/Source/Core/VideoCommon/VertexLoaderManager.cpp index e2881a8e3d..f5c9578a70 100644 --- a/Source/Core/VideoCommon/VertexLoaderManager.cpp +++ b/Source/Core/VideoCommon/VertexLoaderManager.cpp @@ -32,10 +32,9 @@ namespace VertexLoaderManager { // Used by zfreeze -std::array, 3> position_cache; -// The counter added to the address of the array is 1, 2, or 3, but never zero. -// So only index 1 - 3 are used. -std::array position_matrix_index_cache; +std::array position_matrix_index_cache; +// 3 vertices, 4 floats each to allow SIMD overwrite +alignas(sizeof(std::array)) std::array, 3> position_cache; static NativeVertexFormatMap s_native_vertex_map; static NativeVertexFormat* s_current_vtx_fmt; @@ -251,8 +250,9 @@ static VertexLoaderBase* RefreshLoader(int vtx_attr_group, bool preprocess = fal int RunVertices(int vtx_attr_group, OpcodeDecoder::Primitive primitive, int count, DataReader src, bool is_preprocess) { - if (!count) + if (count == 0) return 0; + ASSERT(count > 0); VertexLoaderBase* loader = RefreshLoader(vtx_attr_group, is_preprocess); diff --git a/Source/Core/VideoCommon/VertexLoaderManager.h b/Source/Core/VideoCommon/VertexLoaderManager.h index 2ba724d75d..5573e08103 100644 --- a/Source/Core/VideoCommon/VertexLoaderManager.h +++ b/Source/Core/VideoCommon/VertexLoaderManager.h @@ -54,7 +54,7 @@ void UpdateVertexArrayPointers(); // Position cache for zfreeze (3 vertices, 4 floats each to allow SIMD overwrite). // These arrays are in reverse order. extern std::array, 3> position_cache; -extern std::array position_matrix_index_cache; +extern std::array position_matrix_index_cache; // VB_HAS_X. Bitmask telling what vertex components are present. extern u32 g_current_components; diff --git a/Source/Core/VideoCommon/VertexLoaderX64.cpp b/Source/Core/VideoCommon/VertexLoaderX64.cpp index 91889c83d9..da52788d3c 100644 --- a/Source/Core/VideoCommon/VertexLoaderX64.cpp +++ b/Source/Core/VideoCommon/VertexLoaderX64.cpp @@ -26,7 +26,9 @@ static const X64Reg dst_reg = ABI_PARAM2; static const X64Reg scratch1 = RAX; static const X64Reg scratch2 = ABI_PARAM3; static const X64Reg scratch3 = ABI_PARAM4; -static const X64Reg count_reg = R10; +// The remaining number of vertices to be processed. Starts at count - 1, and the final loop has it +// at 0. +static const X64Reg remaining_reg = R10; static const X64Reg skipped_reg = R11; static const X64Reg base_reg = RBX; @@ -117,10 +119,11 @@ int VertexLoaderX64::ReadVertex(OpArg data, VertexComponentFormat attribute, Com const auto write_zfreeze = [&]() { // zfreeze if (native_format == &m_native_vtx_decl.position) { - CMP(32, R(count_reg), Imm8(3)); - FixupBranch dont_store = J_CC(CC_A); - LEA(32, scratch3, - MScaled(count_reg, SCALE_4, -int(VertexLoaderManager::position_cache[0].size()))); + CMP(32, R(remaining_reg), Imm8(3)); + FixupBranch dont_store = J_CC(CC_AE); + // The position cache is composed of 3 rows of 4 floats each; since each float is 4 bytes, + // we need to scale by 4 twice to cover the 4 floats. + LEA(32, scratch3, MScaled(remaining_reg, SCALE_4, 0)); MOVUPS(MPIC(VertexLoaderManager::position_cache.data(), scratch3, SCALE_4), coords); SetJumpTarget(dont_store); } @@ -380,8 +383,8 @@ void VertexLoaderX64::ReadColor(OpArg data, VertexComponentFormat attribute, Col void VertexLoaderX64::GenerateVertexLoader() { - BitSet32 regs = {src_reg, dst_reg, scratch1, scratch2, - scratch3, count_reg, skipped_reg, base_reg}; + BitSet32 regs = {src_reg, dst_reg, scratch1, scratch2, + scratch3, remaining_reg, skipped_reg, base_reg}; regs &= ABI_ALL_CALLEE_SAVED; ABI_PushRegistersAndAdjustStack(regs, 0); @@ -389,7 +392,9 @@ void VertexLoaderX64::GenerateVertexLoader() PUSH(32, R(ABI_PARAM3)); // ABI_PARAM3 is one of the lower registers, so free it for scratch2. - MOV(32, R(count_reg), R(ABI_PARAM3)); + // We also have it end at a value of 0, to simplify indexing for zfreeze; + // this requires subtracting 1 at the start. + LEA(32, remaining_reg, MDisp(ABI_PARAM3, -1)); MOV(64, R(base_reg), R(ABI_PARAM4)); @@ -407,9 +412,9 @@ void VertexLoaderX64::GenerateVertexLoader() MOV(32, MDisp(dst_reg, m_dst_ofs), R(scratch1)); // zfreeze - CMP(32, R(count_reg), Imm8(3)); - FixupBranch dont_store = J_CC(CC_A); - MOV(32, MPIC(VertexLoaderManager::position_matrix_index_cache.data(), count_reg, SCALE_4), + CMP(32, R(remaining_reg), Imm8(3)); + FixupBranch dont_store = J_CC(CC_AE); + MOV(32, MPIC(VertexLoaderManager::position_matrix_index_cache.data(), remaining_reg, SCALE_4), R(scratch1)); SetJumpTarget(dont_store); @@ -509,8 +514,8 @@ void VertexLoaderX64::GenerateVertexLoader() const u8* cont = GetCodePtr(); ADD(64, R(src_reg), Imm32(m_src_ofs)); - SUB(32, R(count_reg), Imm8(1)); - J_CC(CC_NZ, loop_start); + SUB(32, R(remaining_reg), Imm8(1)); + J_CC(CC_AE, loop_start); // Get the original count. POP(32, R(ABI_RETURN)); diff --git a/Source/Core/VideoCommon/VertexLoader_Position.cpp b/Source/Core/VideoCommon/VertexLoader_Position.cpp index 0fe8e7ba72..15d2f6d94e 100644 --- a/Source/Core/VideoCommon/VertexLoader_Position.cpp +++ b/Source/Core/VideoCommon/VertexLoader_Position.cpp @@ -41,8 +41,8 @@ void Pos_ReadDirect(VertexLoader* loader) for (int i = 0; i < N; ++i) { const float value = PosScale(src.Read(), scale); - if (loader->m_counter < 3) - VertexLoaderManager::position_cache[loader->m_counter][i] = value; + if (loader->m_remaining < 3) + VertexLoaderManager::position_cache[loader->m_remaining][i] = value; dst.Write(value); } @@ -68,8 +68,8 @@ void Pos_ReadIndex(VertexLoader* loader) for (int i = 0; i < N; ++i) { const float value = PosScale(Common::FromBigEndian(data[i]), scale); - if (loader->m_counter < 3) - VertexLoaderManager::position_cache[loader->m_counter][i] = value; + if (loader->m_remaining < 3) + VertexLoaderManager::position_cache[loader->m_remaining][i] = value; dst.Write(value); } diff --git a/Source/Core/VideoCommon/VertexManagerBase.cpp b/Source/Core/VideoCommon/VertexManagerBase.cpp index 4ffebaadf3..a93a9b34f1 100644 --- a/Source/Core/VideoCommon/VertexManagerBase.cpp +++ b/Source/Core/VideoCommon/VertexManagerBase.cpp @@ -558,7 +558,7 @@ void VertexManagerBase::CalculateZSlope(NativeVertexFormat* format) { // If this vertex format has per-vertex position matrix IDs, look it up. if (vert_decl.posmtx.enable) - mtxIdx = VertexLoaderManager::position_matrix_index_cache[3 - i]; + mtxIdx = VertexLoaderManager::position_matrix_index_cache[2 - i]; if (vert_decl.position.components == 2) VertexLoaderManager::position_cache[2 - i][2] = 0;