From 358169507cbe806afe0b33f91e124cf0022b61e4 Mon Sep 17 00:00:00 2001
From: kd-11
Date: Mon, 17 Jun 2019 15:20:21 +0300
Subject: [PATCH] rsx: Use SSE to accelerate index buffer uploads

---
 Review notes (this section is ignored by git am):
  * upload_u32_swapped: the horizontal min/max reduction used _mm_min_epu16 /
    _mm_max_epu16 on 32-bit lanes. That compares the two 16-bit halves of each
    index independently and can produce a value that is not in the buffer.
    It now uses _mm_min_epu32 / _mm_max_epu32.
  * Template argument lists were missing from this copy of the patch and have
    been reconstructed from how the values are used. to_be_t<const T> and
    index_limit<T>() in particular must be checked against the tree.
  * Comments were added, so the line counts in the diffstat and in the hunk
    header were updated. The post-image blob id on the index line is stale.
  * Not changed: the fast path is gated on s_use_ssse3 but uses SSE4.1
    instructions, and written/min_index/max_index are only assigned on that
    path when T is u16 or u32.

 rpcs3/Emu/RSX/Common/BufferUtils.cpp | 158 ++++++++++++++++++++++++++-
 1 file changed, 153 insertions(+), 5 deletions(-)

diff --git a/rpcs3/Emu/RSX/Common/BufferUtils.cpp b/rpcs3/Emu/RSX/Common/BufferUtils.cpp
index 53e20aba99..8445b8c794 100644
--- a/rpcs3/Emu/RSX/Common/BufferUtils.cpp
+++ b/rpcs3/Emu/RSX/Common/BufferUtils.cpp
@@ -561,19 +561,167 @@ namespace
 
 	struct untouched_impl
 	{
+		// Byte-swaps `count` 16-bit indices from src into dst, eight per 128-bit vector,
+		// and returns (smallest index, largest index, count).
+		// Only count / 8 whole vectors are processed; a remainder is neither copied nor scanned.
+		// NOTE(review): _mm_min_epu16/_mm_max_epu16 are SSE4.1 instructions, while the caller
+		// below is gated on s_use_ssse3 - confirm the CPU feature check is strict enough.
+		static
+		std::tuple<u16, u16, u32> upload_u16_swapped(const void *src, void *dst, u32 count)
+		{
+			// Shuffle control that exchanges the two bytes of every 16-bit lane
+			const __m128i mask = _mm_set_epi8(
+				0xE, 0xF, 0xC, 0xD,
+				0xA, 0xB, 0x8, 0x9,
+				0x6, 0x7, 0x4, 0x5,
+				0x2, 0x3, 0x0, 0x1);
+
+			auto src_stream = (const __m128i*)src;
+			auto dst_stream = (__m128i*)dst;
+
+			__m128i min = _mm_set1_epi16(0xFFFF);
+			__m128i max = _mm_set1_epi16(0);
+
+			const auto iterations = count / 8;
+			for (unsigned n = 0; n < iterations; ++n)
+			{
+				const __m128i raw = _mm_loadu_si128(src_stream++);
+				const __m128i value = _mm_shuffle_epi8(raw, mask);
+				max = _mm_max_epu16(max, value);
+				min = _mm_min_epu16(min, value);
+				_mm_storeu_si128(dst_stream++, value);
+			}
+
+			// Horizontal reduction: fold the upper part onto the lower part until lane 0 holds the result
+			const __m128i mask_step1 = _mm_set_epi8(
+				0, 0, 0, 0, 0, 0, 0, 0,
+				0xF, 0xE, 0xD, 0xC, 0xB, 0xA, 0x9, 0x8);
+
+			const __m128i mask_step2 = _mm_set_epi8(
+				0, 0, 0, 0, 0, 0, 0, 0,
+				0, 0, 0, 0, 0x7, 0x6, 0x5, 0x4);
+
+			const __m128i mask_step3 = _mm_set_epi8(
+				0, 0, 0, 0, 0, 0, 0, 0,
+				0, 0, 0, 0, 0, 0, 0x3, 0x2);
+
+			__m128i tmp = _mm_shuffle_epi8(min, mask_step1);
+			min = _mm_min_epu16(min, tmp);
+			tmp = _mm_shuffle_epi8(min, mask_step2);
+			min = _mm_min_epu16(min, tmp);
+			tmp = _mm_shuffle_epi8(min, mask_step3);
+			min = _mm_min_epu16(min, tmp);
+
+			tmp = _mm_shuffle_epi8(max, mask_step1);
+			max = _mm_max_epu16(max, tmp);
+			tmp = _mm_shuffle_epi8(max, mask_step2);
+			max = _mm_max_epu16(max, tmp);
+			tmp = _mm_shuffle_epi8(max, mask_step3);
+			max = _mm_max_epu16(max, tmp);
+
+			const u16 min_index = u16(_mm_cvtsi128_si32(min) & 0xFFFF);
+			const u16 max_index = u16(_mm_cvtsi128_si32(max) & 0xFFFF);
+
+			return std::make_tuple(min_index, max_index, count);
+		}
+
+		// Byte-swaps `count` 32-bit indices from src into dst, four per 128-bit vector,
+		// and returns (smallest index, largest index, count).
+		// Only count / 4 whole vectors are processed; a remainder is neither copied nor scanned.
+		// NOTE(review): _mm_min_epu32/_mm_max_epu32 are SSE4.1 instructions, see the note on the u16 variant.
+		static
+		std::tuple<u32, u32, u32> upload_u32_swapped(const void *src, void *dst, u32 count)
+		{
+			// Shuffle control that reverses the four bytes of every 32-bit lane
+			const __m128i mask = _mm_set_epi8(
+				0xC, 0xD, 0xE, 0xF,
+				0x8, 0x9, 0xA, 0xB,
+				0x4, 0x5, 0x6, 0x7,
+				0x0, 0x1, 0x2, 0x3);
+
+			auto src_stream = (const __m128i*)src;
+			auto dst_stream = (__m128i*)dst;
+
+			__m128i min = _mm_set1_epi32(~0u);
+			__m128i max = _mm_set1_epi32(0);
+
+			const auto iterations = count / 4;
+			for (unsigned n = 0; n < iterations; ++n)
+			{
+				const __m128i raw = _mm_loadu_si128(src_stream++);
+				const __m128i value = _mm_shuffle_epi8(raw, mask);
+				max = _mm_max_epu32(max, value);
+				min = _mm_min_epu32(min, value);
+				_mm_storeu_si128(dst_stream++, value);
+			}
+
+			// Aggregate min-max
+			// The lanes are 32 bits wide, so every fold has to use the 32-bit compares
+			const __m128i mask_step1 = _mm_set_epi8(
+				0, 0, 0, 0, 0, 0, 0, 0,
+				0xF, 0xE, 0xD, 0xC, 0xB, 0xA, 0x9, 0x8);
+
+			const __m128i mask_step2 = _mm_set_epi8(
+				0, 0, 0, 0, 0, 0, 0, 0,
+				0, 0, 0, 0, 0x7, 0x6, 0x5, 0x4);
+
+			__m128i tmp = _mm_shuffle_epi8(min, mask_step1);
+			min = _mm_min_epu32(min, tmp);
+			tmp = _mm_shuffle_epi8(min, mask_step2);
+			min = _mm_min_epu32(min, tmp);
+
+			tmp = _mm_shuffle_epi8(max, mask_step1);
+			max = _mm_max_epu32(max, tmp);
+			tmp = _mm_shuffle_epi8(max, mask_step2);
+			max = _mm_max_epu32(max, tmp);
+
+			const u32 min_index = u32(_mm_cvtsi128_si32(min));
+			const u32 max_index = u32(_mm_cvtsi128_si32(max));
+
+			return std::make_tuple(min_index, max_index, count);
+		}
+
+		// Copies every index from src to dst and returns (min index, max index, indices written).
+		// min_max() is defined elsewhere; presumably it updates both bounds and returns the index.
 		template<typename T>
 		static
 		std::tuple<T, T, u32> upload_untouched(gsl::span<to_be_t<const T>> src, gsl::span<T> dst)
 		{
-			T min_index = index_limit<T>(), max_index = 0;
-			u32 dst_index = 0;
+			T min_index, max_index;
+			u32 written;
+			u32 remaining = src.size();
 
-			for (const T index : src)
+			// Vector path covers the largest whole number of vectors; the loop at the end finishes the rest
+			if (s_use_ssse3 && remaining >= 32)
 			{
-				dst[dst_index++] = min_max(min_index, max_index, index);
+				if constexpr (std::is_same<T, u32>::value)
+				{
+					const auto count = (remaining & ~0x3);
+					std::tie(min_index, max_index, written) = upload_u32_swapped(src.data(), dst.data(), count);
+				}
+				else if constexpr (std::is_same<T, u16>::value)
+				{
+					const auto count = (remaining & ~0x7);
+					std::tie(min_index, max_index, written) = upload_u16_swapped(src.data(), dst.data(), count);
+				}
+
+				remaining -= written;
+			}
+			else
+			{
+				min_index = index_limit<T>();
+				max_index = 0;
+				written = 0;
 			}
 
-			return std::make_tuple(min_index, max_index, dst_index);
+			// Scalar path: the whole buffer when the vector path was skipped, otherwise the leftover tail
+			while (remaining--)
+			{
+				T index = src[written];
+				dst[written++] = min_max(min_index, max_index, index);
+			}
+
+			return std::make_tuple(min_index, max_index, written);
 		}
 	};
 