From 49ae99ffc9dd11695485d6f2fedc805080e5ff06 Mon Sep 17 00:00:00 2001 From: Tillmann Karras Date: Thu, 12 Mar 2015 11:27:22 +0100 Subject: [PATCH 1/6] UnitTests: rewrite vertex loader tests The position attribute now has complete coverage. --- .../VideoCommon/VertexLoaderTest.cpp | 375 ++++++++++-------- 1 file changed, 204 insertions(+), 171 deletions(-) diff --git a/Source/UnitTests/VideoCommon/VertexLoaderTest.cpp b/Source/UnitTests/VideoCommon/VertexLoaderTest.cpp index 966b037312..d3a5bc7b82 100644 --- a/Source/UnitTests/VideoCommon/VertexLoaderTest.cpp +++ b/Source/UnitTests/VideoCommon/VertexLoaderTest.cpp @@ -1,13 +1,18 @@ +#include +#include +#include +#include #include -#include "Common/Common.h" -#include "VideoCommon/DataReader.h" -#include "VideoCommon/VertexLoaderBase.h" - -// Needs to be included later because it defines a TEST macro that conflicts -// with a TEST method definition in x64Emitter.h. #include // NOLINT +#include "Common/Common.h" +#include "Common/MathUtil.h" +#include "VideoCommon/CPMemory.h" +#include "VideoCommon/DataReader.h" +#include "VideoCommon/OpcodeDecoding.h" +#include "VideoCommon/VertexLoaderBase.h" + TEST(VertexLoaderUID, UniqueEnough) { std::unordered_set uids; @@ -38,181 +43,209 @@ protected: void SetUp() override { - memset(&input_memory[0], 0, sizeof(input_memory)); - memset(&output_memory[0], 0, sizeof(input_memory)); + memset(input_memory, 0, sizeof(input_memory)); + memset(output_memory, 0xFF, sizeof(input_memory)); memset(&m_vtx_desc, 0, sizeof(m_vtx_desc)); memset(&m_vtx_attr, 0, sizeof(m_vtx_attr)); + m_loader = nullptr; + ResetPointers(); } - // Pushes a value to the input stream. + void CreateAndCheckSizes(size_t input_size, size_t output_size = 3 * sizeof(float)) + { + m_loader.reset(VertexLoaderBase::CreateVertexLoader(m_vtx_desc, m_vtx_attr)); + ASSERT_EQ((int)input_size, m_loader->m_VertexSize); + ASSERT_EQ((int)output_size, m_loader->m_native_vtx_decl.stride); + } + template void Input(T val) { - // Converts *to* big endian, not from. - *(T*)(&input_memory[m_input_pos]) = Common::FromBigEndian(val); - m_input_pos += sizeof(val); + // Write swapped. + m_src.Write(val); } - // Reads a value from the output stream. - template - T Output() + void ExpectOut(float val) { - T out = *(T*)&output_memory[m_output_pos]; - m_output_pos += sizeof(out); - return out; + // Read unswapped. + MathUtil::IntFloat expected(val), actual(m_dst.Read()); + if (!actual.f || actual.f != actual.f) + EXPECT_EQ(expected.i, actual.i); + else + EXPECT_EQ(expected.f, actual.f); } - // Combination of EXPECT_EQ and Output. - template - void ExpectOut(T val) + void RunVertices(int count, int expected_count = -1) { - EXPECT_EQ(val, Output()); + if (expected_count == -1) + expected_count = count; + ResetPointers(); + int actual_count = m_loader->RunVertices(m_src, m_dst, count, GX_DRAW_POINTS); + EXPECT_EQ(actual_count, expected_count); } void ResetPointers() { - m_input_pos = m_output_pos = 0; - src = DataReader(input_memory, input_memory + sizeof(input_memory)); - dst = DataReader(output_memory, output_memory + sizeof(output_memory)); + m_src = DataReader(input_memory, input_memory + sizeof(input_memory)); + m_dst = DataReader(output_memory, output_memory + sizeof(output_memory)); } - u32 m_input_pos, m_output_pos; - DataReader src; - DataReader dst; + DataReader m_src; + DataReader m_dst; TVtxDesc m_vtx_desc; VAT m_vtx_attr; + std::unique_ptr m_loader; }; -TEST_F(VertexLoaderTest, PositionDirectFloatXYZ) +class VertexLoaderParamTest : public VertexLoaderTest, public ::testing::WithParamInterface> {}; +extern int gtest_AllCombinationsVertexLoaderParamTest_dummy_; +INSTANTIATE_TEST_CASE_P( + AllCombinations, VertexLoaderParamTest, + ::testing::Combine( + ::testing::Values(DIRECT, INDEX8, INDEX16), + ::testing::Values(FORMAT_UBYTE, FORMAT_BYTE, FORMAT_USHORT, FORMAT_SHORT, FORMAT_FLOAT), + ::testing::Values(0, 1), // elements + ::testing::Values(0, 1, 31) // frac + ) +); + +TEST_P(VertexLoaderParamTest, PositionAll) { - m_vtx_desc.Position = 1; // Direct - m_vtx_attr.g0.PosElements = 1; // XYZ - m_vtx_attr.g0.PosFormat = 4; // Float + int addr, format, elements, frac; + std::tie(addr, format, elements, frac) = GetParam(); + this->m_vtx_desc.Position = addr; + this->m_vtx_attr.g0.PosFormat = format; + this->m_vtx_attr.g0.PosElements = elements; + this->m_vtx_attr.g0.PosFrac = frac; + this->m_vtx_attr.g0.ByteDequant = true; + elements += 2; - VertexLoaderBase* loader = VertexLoaderBase::CreateVertexLoader(m_vtx_desc, m_vtx_attr); + std::vector values = { + std::numeric_limits::lowest(), + std::numeric_limits::denorm_min(), + std::numeric_limits::min(), + std::numeric_limits::max(), + std::numeric_limits::quiet_NaN(), + std::numeric_limits::infinity(), + -0x8000, -0x80, -1, -0, 0, 1, 123, 0x7F, 0xFF, 0x7FFF, 0xFFFF, 12345678, + }; + ASSERT_EQ(0u, values.size() % 2); + ASSERT_EQ(0u, values.size() % 3); - ASSERT_EQ(3 * sizeof(float), (u32)loader->m_native_vtx_decl.stride); - ASSERT_EQ(3 * sizeof(float), (u32)loader->m_VertexSize); - - // Write some vertices. - Input(0.0f); Input(0.0f); Input(0.0f); - Input(1.0f); Input(0.0f); Input(0.0f); - Input(0.0f); Input(1.0f); Input(0.0f); - Input(0.0f); Input(0.0f); Input(1.0f); - - // Convert 4 points. "7" -> primitive are points. - int count = loader->RunVertices(src, dst, 4, 7); - src.Skip(4 * loader->m_VertexSize); - dst.Skip(count * loader->m_native_vtx_decl.stride); - delete loader; - - ExpectOut(0.0f); ExpectOut(0.0f); ExpectOut(0.0f); - ExpectOut(1.0f); ExpectOut(0.0f); ExpectOut(0.0f); - ExpectOut(0.0f); ExpectOut(1.0f); ExpectOut(0.0f); - ExpectOut(0.0f); ExpectOut(0.0f); ExpectOut(1.0f); - - // Test that scale does nothing for floating point inputs. - Input(1.0f); Input(2.0f); Input(4.0f); - m_vtx_attr.g0.PosFrac = 1; - loader = VertexLoaderBase::CreateVertexLoader(m_vtx_desc, m_vtx_attr); - count = loader->RunVertices(src, dst, 1, 7); - src.Skip(1 * loader->m_VertexSize); - dst.Skip(count * loader->m_native_vtx_decl.stride); - ExpectOut(1.0f); ExpectOut(2.0f); ExpectOut(4.0f); - delete loader; -} - -TEST_F(VertexLoaderTest, PositionDirectU16XY) -{ - m_vtx_desc.Position = 1; // Direct - m_vtx_attr.g0.PosElements = 0; // XY - m_vtx_attr.g0.PosFormat = 2; // U16 - - VertexLoaderBase* loader = VertexLoaderBase::CreateVertexLoader(m_vtx_desc, m_vtx_attr); - - ASSERT_EQ(3 * sizeof(float), (u32)loader->m_native_vtx_decl.stride); - ASSERT_EQ(2 * sizeof(u16), (u32)loader->m_VertexSize); - - // Write some vertices. - Input(0); Input(0); - Input(1); Input(2); - Input(256); Input(257); - Input(65535); Input(65534); - Input(12345); Input(54321); - - // Convert 5 points. "7" -> primitive are points. - int count = loader->RunVertices(src, dst, 5, 7); - src.Skip(5 * loader->m_VertexSize); - dst.Skip(count * loader->m_native_vtx_decl.stride); - delete loader; - - ExpectOut(0.0f); ExpectOut(0.0f); ExpectOut(0.0f); - ExpectOut(1.0f); ExpectOut(2.0f); ExpectOut(0.0f); - ExpectOut(256.0f); ExpectOut(257.0f); ExpectOut(0.0f); - ExpectOut(65535.0f); ExpectOut(65534.0f); ExpectOut(0.0f); - ExpectOut(12345.0f); ExpectOut(54321.0f); ExpectOut(0.0f); - - // Test that scale works on U16 inputs. - Input(42); Input(24); - m_vtx_attr.g0.PosFrac = 1; - m_vtx_attr.g0.ByteDequant = 1; - loader = VertexLoaderBase::CreateVertexLoader(m_vtx_desc, m_vtx_attr); - count = loader->RunVertices(src, dst, 1, 7); - src.Skip(1 * loader->m_VertexSize); - dst.Skip(count * loader->m_native_vtx_decl.stride); - ExpectOut(21.0f); ExpectOut(12.0f); ExpectOut(0.0f); - delete loader; -} - -TEST_F(VertexLoaderTest, PositionDirectFloatXYZSpeed) -{ - m_vtx_desc.Position = 1; // Direct - m_vtx_attr.g0.PosElements = 1; // XYZ - m_vtx_attr.g0.PosFormat = 4; // Float - - VertexLoaderBase* loader = VertexLoaderBase::CreateVertexLoader(m_vtx_desc, m_vtx_attr); - - ASSERT_EQ(3 * sizeof(float), (u32)loader->m_native_vtx_decl.stride); - ASSERT_EQ(3 * sizeof(float), (u32)loader->m_VertexSize); - - for (int i = 0; i < 1000; ++i) + int count = (int)values.size() / elements; + u32 elem_size = 1 << (format / 2); + size_t input_size = elements * elem_size; + if (addr & MASK_INDEXED) { - ResetPointers(); - int count = loader->RunVertices(src, dst, 100000, 7); - src.Skip(100000 * loader->m_VertexSize); - dst.Skip(count * loader->m_native_vtx_decl.stride); + input_size = addr - 1; + for (int i = 0; i < count; i++) + if (addr == INDEX8) + Input(i); + else + Input(i); + cached_arraybases[ARRAY_POSITION] = m_src.GetPointer(); + g_main_cp_state.array_strides[ARRAY_POSITION] = elements * elem_size; + } + CreateAndCheckSizes(input_size); + for (float value : values) + { + switch (format) + { + case FORMAT_UBYTE: Input((u8)value); break; + case FORMAT_BYTE: Input((s8)value); break; + case FORMAT_USHORT: Input((u16)value); break; + case FORMAT_SHORT: Input((s16)value); break; + case FORMAT_FLOAT: Input(value); break; + } + } + + RunVertices(count); + + float scale = 1.f / (1u << (format == FORMAT_FLOAT ? 0 : frac)); + for (auto iter = values.begin(); iter != values.end();) + { + float f, g; + switch (format) + { + case FORMAT_UBYTE: f = (u8)*iter++; g = (u8)*iter++; break; + case FORMAT_BYTE: f = (s8)*iter++; g = (s8)*iter++; break; + case FORMAT_USHORT: f = (u16)*iter++; g = (u16)*iter++; break; + case FORMAT_SHORT: f = (s16)*iter++; g = (s16)*iter++; break; + case FORMAT_FLOAT: f = *iter++; g = *iter++; break; + } + ExpectOut(f * scale); + ExpectOut(g * scale); + if (elements == 2) + m_dst.Skip(sizeof(float)); } - delete loader; } -TEST_F(VertexLoaderTest, PositionDirectU16XYSpeed) +TEST_F(VertexLoaderTest, PositionIndex16FloatXY) { - m_vtx_desc.Position = 1; // Direct - m_vtx_attr.g0.PosElements = 0; // XY - m_vtx_attr.g0.PosFormat = 2; // U16 + m_vtx_desc.Position = INDEX16; + m_vtx_attr.g0.PosFormat = FORMAT_FLOAT; + CreateAndCheckSizes(sizeof(u16)); + Input(1); Input(0); + cached_arraybases[ARRAY_POSITION] = m_src.GetPointer(); + g_main_cp_state.array_strides[ARRAY_POSITION] = sizeof(float); // ;) + Input(1.f); Input(2.f); Input(3.f); + RunVertices(2); + ExpectOut(2); ExpectOut(3); m_dst.Skip(sizeof(float)); + ExpectOut(1); ExpectOut(2); m_dst.Skip(sizeof(float)); +} - VertexLoaderBase* loader = VertexLoaderBase::CreateVertexLoader(m_vtx_desc, m_vtx_attr); - - ASSERT_EQ(3 * sizeof(float), (u32)loader->m_native_vtx_decl.stride); - ASSERT_EQ(2 * sizeof(u16), (u32)loader->m_VertexSize); +class VertexLoaderSpeedTest : public VertexLoaderTest, public ::testing::WithParamInterface> {}; +extern int gtest_FormatsAndElementsVertexLoaderSpeedTest_dummy_; +INSTANTIATE_TEST_CASE_P( + FormatsAndElements, VertexLoaderSpeedTest, + ::testing::Combine( + ::testing::Values(FORMAT_UBYTE, FORMAT_BYTE, FORMAT_USHORT, FORMAT_SHORT, FORMAT_FLOAT), + ::testing::Values(0, 1) // elements + ) +); +TEST_P(VertexLoaderSpeedTest, PositionDirectAll) +{ + int format, elements; + std::tie(format, elements) = GetParam(); + const char* map[] = { "u8", "s8", "u16", "s16", "float" }; + printf("format: %s, elements: %d\n", map[format], elements); + m_vtx_desc.Position = DIRECT; + m_vtx_attr.g0.PosFormat = format; + m_vtx_attr.g0.PosElements = elements; + elements += 2; + size_t elem_size = 1 << (format / 2); + CreateAndCheckSizes(elements * elem_size); for (int i = 0; i < 1000; ++i) - { - ResetPointers(); - int count = loader->RunVertices(src, dst, 100000, 7); - src.Skip(100000 * loader->m_VertexSize); - dst.Skip(count * loader->m_native_vtx_decl.stride); - } - delete loader; + RunVertices(100000); +} + +TEST_P(VertexLoaderSpeedTest, TexCoordSingleElement) +{ + int format, elements; + std::tie(format, elements) = GetParam(); + const char* map[] = { "u8", "s8", "u16", "s16", "float" }; + printf("format: %s, elements: %d\n", map[format], elements); + m_vtx_desc.Position = DIRECT; + m_vtx_attr.g0.PosFormat = FORMAT_BYTE; + m_vtx_desc.Tex0Coord = DIRECT; + m_vtx_attr.g0.Tex0CoordFormat = format; + m_vtx_attr.g0.Tex0CoordElements = elements; + elements += 1; + size_t elem_size = 1 << (format / 2); + CreateAndCheckSizes(2 * sizeof(s8) + elements * elem_size, + 2 * sizeof(float) + elements * sizeof(float)); + for (int i = 0; i < 1000; ++i) + RunVertices(100000); } TEST_F(VertexLoaderTest, LargeFloatVertexSpeed) { - // Enables most attributes in floating point direct mode to test speed. + // Enables most attributes in floating point indexed mode to test speed. m_vtx_desc.PosMatIdx = 1; m_vtx_desc.Tex0MatIdx = 1; m_vtx_desc.Tex1MatIdx = 1; @@ -222,54 +255,54 @@ TEST_F(VertexLoaderTest, LargeFloatVertexSpeed) m_vtx_desc.Tex5MatIdx = 1; m_vtx_desc.Tex6MatIdx = 1; m_vtx_desc.Tex7MatIdx = 1; - m_vtx_desc.Position = 1; - m_vtx_desc.Normal = 1; - m_vtx_desc.Color0 = 1; - m_vtx_desc.Color1 = 1; - m_vtx_desc.Tex0Coord = 1; - m_vtx_desc.Tex1Coord = 1; - m_vtx_desc.Tex2Coord = 1; - m_vtx_desc.Tex3Coord = 1; - m_vtx_desc.Tex4Coord = 1; - m_vtx_desc.Tex5Coord = 1; - m_vtx_desc.Tex6Coord = 1; - m_vtx_desc.Tex7Coord = 1; + m_vtx_desc.Position = INDEX16; + m_vtx_desc.Normal = INDEX16; + m_vtx_desc.Color0 = INDEX16; + m_vtx_desc.Color1 = INDEX16; + m_vtx_desc.Tex0Coord = INDEX16; + m_vtx_desc.Tex1Coord = INDEX16; + m_vtx_desc.Tex2Coord = INDEX16; + m_vtx_desc.Tex3Coord = INDEX16; + m_vtx_desc.Tex4Coord = INDEX16; + m_vtx_desc.Tex5Coord = INDEX16; + m_vtx_desc.Tex6Coord = INDEX16; + m_vtx_desc.Tex7Coord = INDEX16; m_vtx_attr.g0.PosElements = 1; // XYZ - m_vtx_attr.g0.PosFormat = 4; // Float + m_vtx_attr.g0.PosFormat = FORMAT_FLOAT; m_vtx_attr.g0.NormalElements = 1; // NBT - m_vtx_attr.g0.NormalFormat = 4; // Float + m_vtx_attr.g0.NormalFormat = FORMAT_FLOAT; m_vtx_attr.g0.Color0Elements = 1; // Has Alpha - m_vtx_attr.g0.Color0Comp = 5; // RGBA8888 + m_vtx_attr.g0.Color0Comp = FORMAT_32B_8888; m_vtx_attr.g0.Color1Elements = 1; // Has Alpha - m_vtx_attr.g0.Color1Comp = 5; // RGBA8888 + m_vtx_attr.g0.Color1Comp = FORMAT_32B_8888; m_vtx_attr.g0.Tex0CoordElements = 1; // ST - m_vtx_attr.g0.Tex0CoordFormat = 4; // Float + m_vtx_attr.g0.Tex0CoordFormat = FORMAT_FLOAT; m_vtx_attr.g1.Tex1CoordElements = 1; // ST - m_vtx_attr.g1.Tex1CoordFormat = 4; // Float + m_vtx_attr.g1.Tex1CoordFormat = FORMAT_FLOAT; m_vtx_attr.g1.Tex2CoordElements = 1; // ST - m_vtx_attr.g1.Tex2CoordFormat = 4; // Float + m_vtx_attr.g1.Tex2CoordFormat = FORMAT_FLOAT; m_vtx_attr.g1.Tex3CoordElements = 1; // ST - m_vtx_attr.g1.Tex3CoordFormat = 4; // Float + m_vtx_attr.g1.Tex3CoordFormat = FORMAT_FLOAT; m_vtx_attr.g1.Tex4CoordElements = 1; // ST - m_vtx_attr.g1.Tex4CoordFormat = 4; // Float + m_vtx_attr.g1.Tex4CoordFormat = FORMAT_FLOAT; m_vtx_attr.g2.Tex5CoordElements = 1; // ST - m_vtx_attr.g2.Tex5CoordFormat = 4; // Float + m_vtx_attr.g2.Tex5CoordFormat = FORMAT_FLOAT; m_vtx_attr.g2.Tex6CoordElements = 1; // ST - m_vtx_attr.g2.Tex6CoordFormat = 4; // Float + m_vtx_attr.g2.Tex6CoordFormat = FORMAT_FLOAT; m_vtx_attr.g2.Tex7CoordElements = 1; // ST - m_vtx_attr.g2.Tex7CoordFormat = 4; // Float + m_vtx_attr.g2.Tex7CoordFormat = FORMAT_FLOAT; - VertexLoaderBase* loader = VertexLoaderBase::CreateVertexLoader(m_vtx_desc, m_vtx_attr); + CreateAndCheckSizes(33, 156); + + for (int i = 0; i < 16; i++) + { + cached_arraybases[i] = m_src.GetPointer(); + g_main_cp_state.array_strides[i] = 129; + } // This test is only done 100x in a row since it's ~20x slower using the // current vertex loader implementation. for (int i = 0; i < 100; ++i) - { - ResetPointers(); - int count = loader->RunVertices(src, dst, 100000, 7); - src.Skip(100000 * loader->m_VertexSize); - dst.Skip(count * loader->m_native_vtx_decl.stride); - } - delete loader; + RunVertices(100000); } From 5a51bc10e56c8709b55fe53cb438f7a42ac09ad4 Mon Sep 17 00:00:00 2001 From: Tillmann Karras Date: Sat, 14 Mar 2015 10:15:19 +0100 Subject: [PATCH 2/6] SWVertexLoader: fix truncated components --- .../Core/VideoBackends/Software/SWVertexLoader.cpp | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/Source/Core/VideoBackends/Software/SWVertexLoader.cpp b/Source/Core/VideoBackends/Software/SWVertexLoader.cpp index d824893243..859cf593b4 100644 --- a/Source/Core/VideoBackends/Software/SWVertexLoader.cpp +++ b/Source/Core/VideoBackends/Software/SWVertexLoader.cpp @@ -100,16 +100,17 @@ static T ReadNormalized(I value) } template -static void ReadVertexAttribute(T* dst, DataReader src, const AttributeFormat& format, int base_component, int max_components, bool reverse) +static void ReadVertexAttribute(T* dst, DataReader src, const AttributeFormat& format, int base_component, int components, bool reverse) { if (format.enable) { src.Skip(format.offset); src.Skip(base_component * (1<<(format.type>>1))); - for (int i = 0; i < std::min(format.components - base_component, max_components); i++) + int i; + for (i = 0; i < std::min(format.components - base_component, components); i++) { - int i_dst = reverse ? max_components - i - 1 : i; + int i_dst = reverse ? components - i - 1 : i; switch (format.type) { case VAR_UNSIGNED_BYTE: @@ -131,6 +132,11 @@ static void ReadVertexAttribute(T* dst, DataReader src, const AttributeFormat& f _assert_msg_(VIDEO, !format.integer || format.type != VAR_FLOAT, "only non-float values are allowed to be streamed as integer"); } + for (; i < components; i++) + { + int i_dst = reverse ? components - i - 1 : i; + dst[i_dst] = i == 3; + } } } From 8d90ecda7f8e08a37964f8664aed571dbc6e9051 Mon Sep 17 00:00:00 2001 From: Tillmann Karras Date: Tue, 17 Mar 2015 07:05:19 +0100 Subject: [PATCH 3/6] VertexLoaders: make positions more compact --- Source/Core/VideoCommon/VertexLoader.cpp | 5 +++-- Source/Core/VideoCommon/VertexLoaderARM64.cpp | 5 +++-- Source/Core/VideoCommon/VertexLoaderX64.cpp | 5 +++-- Source/Core/VideoCommon/VertexLoader_Position.cpp | 8 ++++---- Source/UnitTests/VideoCommon/VertexLoaderTest.cpp | 14 ++++++-------- 5 files changed, 19 insertions(+), 18 deletions(-) diff --git a/Source/Core/VideoCommon/VertexLoader.cpp b/Source/Core/VideoCommon/VertexLoader.cpp index d27d5287ad..49b6d23e1a 100644 --- a/Source/Core/VideoCommon/VertexLoader.cpp +++ b/Source/Core/VideoCommon/VertexLoader.cpp @@ -131,12 +131,13 @@ void VertexLoader::CompileVertexTranslator() WriteCall(VertexLoader_Position::GetFunction(m_VtxDesc.Position, m_VtxAttr.PosFormat, m_VtxAttr.PosElements)); m_VertexSize += VertexLoader_Position::GetSize(m_VtxDesc.Position, m_VtxAttr.PosFormat, m_VtxAttr.PosElements); - m_native_vtx_decl.position.components = 3; + int pos_elements = m_VtxAttr.PosElements + 2; + m_native_vtx_decl.position.components = pos_elements; m_native_vtx_decl.position.enable = true; m_native_vtx_decl.position.offset = nat_offset; m_native_vtx_decl.position.type = VAR_FLOAT; m_native_vtx_decl.position.integer = false; - nat_offset += 12; + nat_offset += pos_elements * sizeof(float); // Normals if (m_VtxDesc.Normal != NOT_PRESENT) diff --git a/Source/Core/VideoCommon/VertexLoaderARM64.cpp b/Source/Core/VideoCommon/VertexLoaderARM64.cpp index 4a404e9a75..397a179e9e 100644 --- a/Source/Core/VideoCommon/VertexLoaderARM64.cpp +++ b/Source/Core/VideoCommon/VertexLoaderARM64.cpp @@ -370,8 +370,9 @@ void VertexLoaderARM64::GenerateVertexLoader() load_size <<= 3; s32 offset = GetAddressImm(ARRAY_POSITION, m_VtxDesc.Position, EncodeRegTo64(scratch1_reg), load_size); - ReadVertex(m_VtxDesc.Position, m_VtxAttr.PosFormat, m_VtxAttr.PosElements + 2, 3, - m_VtxAttr.ByteDequant, m_VtxAttr.PosFrac, &m_native_vtx_decl.position, offset); + int pos_elements = m_VtxAttr.PosElements + 2; + ReadVertex(m_VtxDesc.Position, m_VtxAttr.PosFormat, pos_elements, pos_elements, + m_VtxAttr.ByteDequant, m_VtxAttr.PosFrac, &m_native_vtx_decl.position, offset); } if (m_VtxDesc.Normal) diff --git a/Source/Core/VideoCommon/VertexLoaderX64.cpp b/Source/Core/VideoCommon/VertexLoaderX64.cpp index 8d01a5d598..df1a5580b7 100644 --- a/Source/Core/VideoCommon/VertexLoaderX64.cpp +++ b/Source/Core/VideoCommon/VertexLoaderX64.cpp @@ -332,7 +332,8 @@ void VertexLoaderX64::GenerateVertexLoader() } OpArg data = GetVertexAddr(ARRAY_POSITION, m_VtxDesc.Position); - ReadVertex(data, m_VtxDesc.Position, m_VtxAttr.PosFormat, m_VtxAttr.PosElements + 2, 3, + int pos_elements = 2 + m_VtxAttr.PosElements; + ReadVertex(data, m_VtxDesc.Position, m_VtxAttr.PosFormat, pos_elements, pos_elements, m_VtxAttr.ByteDequant, m_VtxAttr.PosFrac, &m_native_vtx_decl.position); if (m_VtxDesc.Normal) @@ -408,7 +409,7 @@ void VertexLoaderX64::GenerateVertexLoader() m_native_vtx_decl.texcoords[i].offset = m_dst_ofs; PXOR(XMM0, R(XMM0)); CVTSI2SS(XMM0, R(scratch1)); - SHUFPS(XMM0, R(XMM0), 0x45); + SHUFPS(XMM0, R(XMM0), 0x45); // 000X -> 0X00 MOVUPS(MDisp(dst_reg, m_dst_ofs), XMM0); m_dst_ofs += sizeof(float) * 3; } diff --git a/Source/Core/VideoCommon/VertexLoader_Position.cpp b/Source/Core/VideoCommon/VertexLoader_Position.cpp index 18460ff1e1..3c30d05ed6 100644 --- a/Source/Core/VideoCommon/VertexLoader_Position.cpp +++ b/Source/Core/VideoCommon/VertexLoader_Position.cpp @@ -30,8 +30,8 @@ void LOADERDECL Pos_ReadDirect(VertexLoader* loader) DataReader dst(g_vertex_manager_write_ptr, nullptr); DataReader src(g_video_buffer_read_ptr, nullptr); - for (int i = 0; i < 3; ++i) - dst.Write(i < N ? PosScale(src.Read(), scale) : 0.f); + for (int i = 0; i < N; ++i) + dst.Write(PosScale(src.Read(), scale)); g_vertex_manager_write_ptr = dst.GetPointer(); g_video_buffer_read_ptr = src.GetPointer(); @@ -50,8 +50,8 @@ void LOADERDECL Pos_ReadIndex(VertexLoader* loader) auto const scale = loader->m_posScale; DataReader dst(g_vertex_manager_write_ptr, nullptr); - for (int i = 0; i < 3; ++i) - dst.Write(i < N ? PosScale(Common::FromBigEndian(data[i]), scale) : 0.f); + for (int i = 0; i < N; ++i) + dst.Write(PosScale(Common::FromBigEndian(data[i]), scale)); g_vertex_manager_write_ptr = dst.GetPointer(); LOG_VTX(); diff --git a/Source/UnitTests/VideoCommon/VertexLoaderTest.cpp b/Source/UnitTests/VideoCommon/VertexLoaderTest.cpp index d3a5bc7b82..1c6579b795 100644 --- a/Source/UnitTests/VideoCommon/VertexLoaderTest.cpp +++ b/Source/UnitTests/VideoCommon/VertexLoaderTest.cpp @@ -54,7 +54,7 @@ protected: ResetPointers(); } - void CreateAndCheckSizes(size_t input_size, size_t output_size = 3 * sizeof(float)) + void CreateAndCheckSizes(size_t input_size, size_t output_size) { m_loader.reset(VertexLoaderBase::CreateVertexLoader(m_vtx_desc, m_vtx_attr)); ASSERT_EQ((int)input_size, m_loader->m_VertexSize); @@ -150,7 +150,7 @@ TEST_P(VertexLoaderParamTest, PositionAll) cached_arraybases[ARRAY_POSITION] = m_src.GetPointer(); g_main_cp_state.array_strides[ARRAY_POSITION] = elements * elem_size; } - CreateAndCheckSizes(input_size); + CreateAndCheckSizes(input_size, elements * sizeof(float)); for (float value : values) { switch (format) @@ -179,8 +179,6 @@ TEST_P(VertexLoaderParamTest, PositionAll) } ExpectOut(f * scale); ExpectOut(g * scale); - if (elements == 2) - m_dst.Skip(sizeof(float)); } } @@ -188,14 +186,14 @@ TEST_F(VertexLoaderTest, PositionIndex16FloatXY) { m_vtx_desc.Position = INDEX16; m_vtx_attr.g0.PosFormat = FORMAT_FLOAT; - CreateAndCheckSizes(sizeof(u16)); + CreateAndCheckSizes(sizeof(u16), 2 * sizeof(float)); Input(1); Input(0); cached_arraybases[ARRAY_POSITION] = m_src.GetPointer(); g_main_cp_state.array_strides[ARRAY_POSITION] = sizeof(float); // ;) Input(1.f); Input(2.f); Input(3.f); RunVertices(2); - ExpectOut(2); ExpectOut(3); m_dst.Skip(sizeof(float)); - ExpectOut(1); ExpectOut(2); m_dst.Skip(sizeof(float)); + ExpectOut(2); ExpectOut(3); + ExpectOut(1); ExpectOut(2); } class VertexLoaderSpeedTest : public VertexLoaderTest, public ::testing::WithParamInterface> {}; @@ -219,7 +217,7 @@ TEST_P(VertexLoaderSpeedTest, PositionDirectAll) m_vtx_attr.g0.PosElements = elements; elements += 2; size_t elem_size = 1 << (format / 2); - CreateAndCheckSizes(elements * elem_size); + CreateAndCheckSizes(elements * elem_size, elements * sizeof(float)); for (int i = 0; i < 1000; ++i) RunVertices(100000); } From 9793fed7426b3f17bdfd301a71bc1e701c1be92b Mon Sep 17 00:00:00 2001 From: Tillmann Karras Date: Sun, 8 Mar 2015 15:19:59 +0100 Subject: [PATCH 4/6] XEmitter: add PUNPCKLQDQ --- Source/Core/Common/x64Emitter.cpp | 1 + Source/Core/Common/x64Emitter.h | 1 + Source/UnitTests/Common/x64EmitterTest.cpp | 1 + 3 files changed, 3 insertions(+) diff --git a/Source/Core/Common/x64Emitter.cpp b/Source/Core/Common/x64Emitter.cpp index e7ab88396d..0f37c547d2 100644 --- a/Source/Core/Common/x64Emitter.cpp +++ b/Source/Core/Common/x64Emitter.cpp @@ -1638,6 +1638,7 @@ void XEmitter::PACKUSWB(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0x67, dest, ar void XEmitter::PUNPCKLBW(X64Reg dest, const OpArg &arg) {WriteSSEOp(0x66, 0x60, dest, arg);} void XEmitter::PUNPCKLWD(X64Reg dest, const OpArg &arg) {WriteSSEOp(0x66, 0x61, dest, arg);} void XEmitter::PUNPCKLDQ(X64Reg dest, const OpArg &arg) {WriteSSEOp(0x66, 0x62, dest, arg);} +void XEmitter::PUNPCKLQDQ(X64Reg dest, const OpArg &arg) {WriteSSEOp(0x66, 0x6C, dest, arg);} void XEmitter::PSRLW(X64Reg reg, int shift) { diff --git a/Source/Core/Common/x64Emitter.h b/Source/Core/Common/x64Emitter.h index 1c32c54969..962f685108 100644 --- a/Source/Core/Common/x64Emitter.h +++ b/Source/Core/Common/x64Emitter.h @@ -680,6 +680,7 @@ public: void PUNPCKLBW(X64Reg dest, const OpArg &arg); void PUNPCKLWD(X64Reg dest, const OpArg &arg); void PUNPCKLDQ(X64Reg dest, const OpArg &arg); + void PUNPCKLQDQ(X64Reg dest, const OpArg &arg); void PTEST(X64Reg dest, OpArg arg); void PAND(X64Reg dest, OpArg arg); diff --git a/Source/UnitTests/Common/x64EmitterTest.cpp b/Source/UnitTests/Common/x64EmitterTest.cpp index 614847e955..1f7d08f114 100644 --- a/Source/UnitTests/Common/x64EmitterTest.cpp +++ b/Source/UnitTests/Common/x64EmitterTest.cpp @@ -810,6 +810,7 @@ TWO_OP_SSE_TEST(PACKUSWB, "dqword") TWO_OP_SSE_TEST(PUNPCKLBW, "dqword") TWO_OP_SSE_TEST(PUNPCKLWD, "dqword") TWO_OP_SSE_TEST(PUNPCKLDQ, "dqword") +TWO_OP_SSE_TEST(PUNPCKLQDQ, "dqword") TWO_OP_SSE_TEST(PTEST, "dqword") TWO_OP_SSE_TEST(PAND, "dqword") From 70305425462843af192489da00389559cd6f5834 Mon Sep 17 00:00:00 2001 From: Tillmann Karras Date: Wed, 18 Mar 2015 11:26:21 +0100 Subject: [PATCH 5/6] VertexLoaderX64: support SSE2 as a fallback With suggestions by Fiora and magumagu. --- Source/Core/VideoCommon/VertexLoaderX64.cpp | 137 ++++++++++++++------ Source/Core/VideoCommon/VertexLoaderX64.h | 2 +- 2 files changed, 99 insertions(+), 40 deletions(-) diff --git a/Source/Core/VideoCommon/VertexLoaderX64.cpp b/Source/Core/VideoCommon/VertexLoaderX64.cpp index df1a5580b7..a71daba28e 100644 --- a/Source/Core/VideoCommon/VertexLoaderX64.cpp +++ b/Source/Core/VideoCommon/VertexLoaderX64.cpp @@ -1,3 +1,4 @@ +#include "Common/BitSet.h" #include "Common/CPUDetect.h" #include "Common/Intrinsics.h" #include "Common/JitRegister.h" @@ -6,8 +7,6 @@ using namespace Gen; -#define VERTEX_LOADER_REGS {XMM0+16} - static const X64Reg src_reg = ABI_PARAM1; static const X64Reg dst_reg = ABI_PARAM2; static const X64Reg scratch1 = RAX; @@ -98,47 +97,110 @@ int VertexLoaderX64::ReadVertex(OpArg data, u64 attribute, int format, int count int elem_size = 1 << (format / 2); int load_bytes = elem_size * count_in; - if (load_bytes > 8) - MOVDQU(coords, data); - else if (load_bytes > 4) - MOVQ_xmm(coords, data); - else - MOVD_xmm(coords, data); - - PSHUFB(coords, M(&shuffle_lut[format][count_in - 1])); - - if (format != FORMAT_FLOAT) - { - // Sign extend - if (format == FORMAT_BYTE) - PSRAD(coords, 24); - if (format == FORMAT_SHORT) - PSRAD(coords, 16); - - CVTDQ2PS(coords, R(coords)); - - if (dequantize && scaling_exponent) - MULPS(coords, M(&scale_factors[scaling_exponent])); - } - OpArg dest = MDisp(dst_reg, m_dst_ofs); - switch (count_out) - { - case 1: MOVSS(dest, coords); break; - case 2: MOVLPS(dest, coords); break; - case 3: MOVUPS(dest, coords); break; - } native_format->components = count_out; native_format->enable = true; native_format->offset = m_dst_ofs; native_format->type = VAR_FLOAT; native_format->integer = false; + m_dst_ofs += sizeof(float) * count_out; if (attribute == DIRECT) m_src_ofs += load_bytes; + if (cpu_info.bSSSE3) + { + if (load_bytes > 8) + MOVDQU(coords, data); + else if (load_bytes > 4) + MOVQ_xmm(coords, data); + else + MOVD_xmm(coords, data); + + PSHUFB(coords, M(&shuffle_lut[format][count_in - 1])); + + // Sign-extend. + if (format == FORMAT_BYTE) + PSRAD(coords, 24); + if (format == FORMAT_SHORT) + PSRAD(coords, 16); + } + else + { + // SSE2 + X64Reg temp = XMM1; + switch (format) + { + case FORMAT_UBYTE: + MOVD_xmm(coords, data); + PXOR(temp, R(temp)); + PUNPCKLBW(coords, R(temp)); + PUNPCKLWD(coords, R(temp)); + break; + case FORMAT_BYTE: + MOVD_xmm(coords, data); + PUNPCKLBW(coords, R(coords)); + PUNPCKLWD(coords, R(coords)); + PSRAD(coords, 24); + break; + case FORMAT_USHORT: + case FORMAT_SHORT: + switch (count_in) + { + case 1: + LoadAndSwap(32, scratch3, data); + MOVD_xmm(coords, R(scratch3)); // ......X. + break; + case 2: + LoadAndSwap(32, scratch3, data); + MOVD_xmm(coords, R(scratch3)); // ......XY + PSHUFLW(coords, R(coords), 0x24); // ....Y.X. + break; + case 3: + LoadAndSwap(64, scratch3, data); + MOVQ_xmm(coords, R(scratch3)); // ....XYZ. + PUNPCKLQDQ(coords, R(coords)); // ..Z.XYZ. + PSHUFLW(coords, R(coords), 0xAC); // ..Z.Y.X. + break; + } + if (format == FORMAT_SHORT) + PSRAD(coords, 16); + else + PSRLD(coords, 16); + break; + case FORMAT_FLOAT: + // Floats don't need to be scaled or converted, + // so we can just load/swap/store them directly + // and return early. + // (In SSSE3 we still need to store them.) + for (int i = 0; i < count_in; i++) + { + LoadAndSwap(32, scratch3, data); + MOV(32, dest, R(scratch3)); + data.offset += sizeof(float); + dest.offset += sizeof(float); + } + return load_bytes; + } + } + + if (format != FORMAT_FLOAT) + { + CVTDQ2PS(coords, R(coords)); + + if (dequantize && scaling_exponent) + MULPS(coords, M(&scale_factors[scaling_exponent])); + } + + switch (count_out) + { + case 1: MOVSS(dest, coords); break; + case 2: MOVLPS(dest, coords); break; + case 3: MOVUPS(dest, coords); break; + } + return load_bytes; } @@ -290,7 +352,10 @@ void VertexLoaderX64::ReadColor(OpArg data, u64 attribute, int format) void VertexLoaderX64::GenerateVertexLoader() { - ABI_PushRegistersAndAdjustStack(VERTEX_LOADER_REGS, 8); + BitSet32 xmm_regs; + xmm_regs[XMM0+16] = true; + xmm_regs[XMM1+16] = !cpu_info.bSSSE3; + ABI_PushRegistersAndAdjustStack(xmm_regs, 8); // Backup count since we're going to count it down. PUSH(32, R(ABI_PARAM3)); @@ -427,7 +492,7 @@ void VertexLoaderX64::GenerateVertexLoader() // Get the original count. POP(32, R(ABI_RETURN)); - ABI_PopRegistersAndAdjustStack(VERTEX_LOADER_REGS, 8); + ABI_PopRegistersAndAdjustStack(xmm_regs, 8); if (m_VtxDesc.Position & MASK_INDEXED) { @@ -447,12 +512,6 @@ void VertexLoaderX64::GenerateVertexLoader() m_native_vtx_decl.stride = m_dst_ofs; } -bool VertexLoaderX64::IsInitialized() -{ - // Uses PSHUFB. - return cpu_info.bSSSE3; -} - int VertexLoaderX64::RunVertices(DataReader src, DataReader dst, int count, int primitive) { m_numLoadedVertices += count; diff --git a/Source/Core/VideoCommon/VertexLoaderX64.h b/Source/Core/VideoCommon/VertexLoaderX64.h index 0139d392b7..53b7b58d9e 100644 --- a/Source/Core/VideoCommon/VertexLoaderX64.h +++ b/Source/Core/VideoCommon/VertexLoaderX64.h @@ -8,7 +8,7 @@ public: protected: std::string GetName() const override { return "VertexLoaderX64"; } - bool IsInitialized() override; + bool IsInitialized() override { return true; } int RunVertices(DataReader src, DataReader dst, int count, int primitive) override; private: From 9da86092aeb1fda7470a661a36b794d94aa1e38b Mon Sep 17 00:00:00 2001 From: Tillmann Karras Date: Wed, 18 Mar 2015 11:44:23 +0100 Subject: [PATCH 6/6] VertexLoaderX64: use common code for FORMAT_FLOAT --- Source/Core/VideoCommon/VertexLoaderX64.cpp | 42 ++++++++++----------- 1 file changed, 19 insertions(+), 23 deletions(-) diff --git a/Source/Core/VideoCommon/VertexLoaderX64.cpp b/Source/Core/VideoCommon/VertexLoaderX64.cpp index a71daba28e..f218befa39 100644 --- a/Source/Core/VideoCommon/VertexLoaderX64.cpp +++ b/Source/Core/VideoCommon/VertexLoaderX64.cpp @@ -65,7 +65,7 @@ OpArg VertexLoaderX64::GetVertexAddr(int array, u64 attribute) int VertexLoaderX64::ReadVertex(OpArg data, u64 attribute, int format, int count_in, int count_out, bool dequantize, u8 scaling_exponent, AttributeFormat* native_format) { - static const __m128i shuffle_lut[5][3] = { + static const __m128i shuffle_lut[4][3] = { {_mm_set_epi32(0xFFFFFFFFL, 0xFFFFFFFFL, 0xFFFFFFFFL, 0xFFFFFF00L), // 1x u8 _mm_set_epi32(0xFFFFFFFFL, 0xFFFFFFFFL, 0xFFFFFF01L, 0xFFFFFF00L), // 2x u8 _mm_set_epi32(0xFFFFFFFFL, 0xFFFFFF02L, 0xFFFFFF01L, 0xFFFFFF00L)}, // 3x u8 @@ -78,9 +78,6 @@ int VertexLoaderX64::ReadVertex(OpArg data, u64 attribute, int format, int count {_mm_set_epi32(0xFFFFFFFFL, 0xFFFFFFFFL, 0xFFFFFFFFL, 0x0001FFFFL), // 1x s16 _mm_set_epi32(0xFFFFFFFFL, 0xFFFFFFFFL, 0x0203FFFFL, 0x0001FFFFL), // 2x s16 _mm_set_epi32(0xFFFFFFFFL, 0x0405FFFFL, 0x0203FFFFL, 0x0001FFFFL)}, // 3x s16 - {_mm_set_epi32(0xFFFFFFFFL, 0xFFFFFFFFL, 0xFFFFFFFFL, 0x00010203L), // 1x float - _mm_set_epi32(0xFFFFFFFFL, 0xFFFFFFFFL, 0x04050607L, 0x00010203L), // 2x float - _mm_set_epi32(0xFFFFFFFFL, 0x08090A0BL, 0x04050607L, 0x00010203L)}, // 3x float }; static const __m128 scale_factors[32] = { _mm_set_ps1(1./(1u<< 0)), _mm_set_ps1(1./(1u<< 1)), _mm_set_ps1(1./(1u<< 2)), _mm_set_ps1(1./(1u<< 3)), @@ -110,6 +107,21 @@ int VertexLoaderX64::ReadVertex(OpArg data, u64 attribute, int format, int count if (attribute == DIRECT) m_src_ofs += load_bytes; + if (format == FORMAT_FLOAT) + { + // Floats don't need to be scaled or converted, + // so we can just load/swap/store them directly + // and return early. + for (int i = 0; i < count_in; i++) + { + LoadAndSwap(32, scratch3, data); + MOV(32, dest, R(scratch3)); + data.offset += sizeof(float); + dest.offset += sizeof(float); + } + return load_bytes; + } + if (cpu_info.bSSSE3) { if (load_bytes > 8) @@ -170,29 +182,13 @@ int VertexLoaderX64::ReadVertex(OpArg data, u64 attribute, int format, int count else PSRLD(coords, 16); break; - case FORMAT_FLOAT: - // Floats don't need to be scaled or converted, - // so we can just load/swap/store them directly - // and return early. - // (In SSSE3 we still need to store them.) - for (int i = 0; i < count_in; i++) - { - LoadAndSwap(32, scratch3, data); - MOV(32, dest, R(scratch3)); - data.offset += sizeof(float); - dest.offset += sizeof(float); - } - return load_bytes; } } - if (format != FORMAT_FLOAT) - { - CVTDQ2PS(coords, R(coords)); + CVTDQ2PS(coords, R(coords)); - if (dequantize && scaling_exponent) - MULPS(coords, M(&scale_factors[scaling_exponent])); - } + if (dequantize && scaling_exponent) + MULPS(coords, M(&scale_factors[scaling_exponent])); switch (count_out) {