From 57c53a86b4a0309ff5fa2327b946f36415261df1 Mon Sep 17 00:00:00 2001 From: byuu <2107894+byuu@users.noreply.github.com> Date: Wed, 2 Oct 2019 09:56:13 +0900 Subject: [PATCH] v110.5 Merged Alcaro's AVX2 mode 7 renderer. Enable by editing mode7hd.cpp: USE_AVX2=1 --- bsnes/emulator/emulator.hpp | 10 ++- bsnes/sfc/ppu-fast/mode7hd-avx2.cpp | 107 ++++++++++++++++++++++++++++ bsnes/sfc/ppu-fast/mode7hd.cpp | 11 ++- bsnes/sfc/ppu-fast/ppu.cpp | 1 + bsnes/sfc/ppu-fast/ppu.hpp | 9 +++ 5 files changed, 136 insertions(+), 2 deletions(-) create mode 100644 bsnes/sfc/ppu-fast/mode7hd-avx2.cpp diff --git a/bsnes/emulator/emulator.hpp b/bsnes/emulator/emulator.hpp index e83554a0..23739eeb 100644 --- a/bsnes/emulator/emulator.hpp +++ b/bsnes/emulator/emulator.hpp @@ -22,6 +22,14 @@ using namespace nall; #include +#ifdef __SSE2__ + #include +#endif + +#ifdef __AVX2__ + #include +#endif + #include #include #include @@ -29,7 +37,7 @@ using namespace nall; namespace Emulator { static const string Name = "bsnes"; - static const string Version = "110.4"; + static const string Version = "110.5"; static const string Author = "byuu"; static const string License = "GPLv3"; static const string Website = "https://byuu.org"; diff --git a/bsnes/sfc/ppu-fast/mode7hd-avx2.cpp b/bsnes/sfc/ppu-fast/mode7hd-avx2.cpp new file mode 100644 index 00000000..078b1832 --- /dev/null +++ b/bsnes/sfc/ppu-fast/mode7hd-avx2.cpp @@ -0,0 +1,107 @@ +#if USE_AVX2 == 1 +__attribute__((__target__("avx2"))) +auto PPU::Line::renderMode7HD_AVX2( + PPU::IO::Background& self, uint8 source, + Pixel* above, Pixel* below, + bool* windowAbove, bool* windowBelow, + float originX, float a, + float originY, float c +) -> void { + const bool extbg = source == Source::BG2; + const uint scale = ppu.hdScale(); + + //8 pixels at the time, one int32 per pixel + for (int x : range(256 * scale / 8)) { + //ensure everything has the expected types. + //some accesses are outside the buffers; the unwanted bytes are masked out, + //but it's still undefined behavior, and it will crash if those bytes are unmapped or if compiler becomes too clever. + static_assert(std::is_same_v); + static_assert(std::is_same_v); + static_assert(std::is_same_v); + static_assert(sizeof(Pixel) == 4); + static_assert(offsetof(Pixel, source) == 0); + static_assert(std::is_same_v); + static_assert(offsetof(Pixel, priority) == 1); + static_assert(std::is_same_v); + static_assert(offsetof(Pixel, color) == 2); + static_assert(std::is_same_v); + + __m256 offset_f32 = _mm256_setr_ps(0, 1, 2, 3, 4, 5, 6, 7); + __m256 posi_f32 = _mm256_add_ps(_mm256_set1_ps(x * 8), offset_f32); + __m256 pos_f32 = _mm256_div_ps(posi_f32, _mm256_set1_ps(scale)); + + __m256i doAbove = _mm256_setzero_si256(); //0x00000000 - ignore, 0xFFFFFFFF - render + if(self.aboveEnable) { + doAbove = _mm256_i32gather_epi32((int*)windowAbove, _mm256_cvtps_epi32(pos_f32), 1); + doAbove = _mm256_and_si256(doAbove, _mm256_set1_epi32(0x00000001)); + doAbove = _mm256_sub_epi32(doAbove, _mm256_set1_epi32(0x00000001)); + } + __m256i doBelow = _mm256_setzero_si256(); + if(self.belowEnable) { + doBelow = _mm256_i32gather_epi32((int*)windowBelow, _mm256_cvtps_epi32(pos_f32), 1); + doBelow = _mm256_and_si256(doBelow, _mm256_set1_epi32(0x00000001)); + doBelow = _mm256_sub_epi32(doBelow, _mm256_set1_epi32(0x00000001)); + } + + __m256 xf = _mm256_add_ps(pos_f32, _mm256_set1_ps(-0.5)); + if(io.mode7.hflip) xf = _mm256_sub_ps(_mm256_set1_ps(255), xf); + + __m256i pixelXx256 = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_set1_ps(originX), _mm256_mul_ps(_mm256_set1_ps(a), xf))); + __m256i pixelYx256 = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_set1_ps(originY), _mm256_mul_ps(_mm256_set1_ps(c), xf))); + + __m256i tile_addr = _mm256_or_si256(_mm256_and_si256(_mm256_srai_epi32(pixelYx256, 4), _mm256_set1_epi32(127 << 7)), + _mm256_and_si256(_mm256_srai_epi32(pixelXx256, 11), _mm256_set1_epi32(127))); + __m256i tile = _mm256_and_si256(_mm256_i32gather_epi32((int*)ppu.vram, tile_addr, sizeof(uint16_t)), _mm256_set1_epi32(0x000000FF)); + __m256i outOfBounds = _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_or_si256(pixelXx256, pixelYx256), _mm256_set1_epi32((~1023) * 256)), _mm256_setzero_si256()); + if(io.mode7.repeat == 3) + tile = _mm256_blendv_epi8(_mm256_setzero_si256(), tile, outOfBounds); + __m256i palette_addr = _mm256_or_si256(_mm256_or_si256( + _mm256_and_si256(_mm256_srli_epi32(pixelYx256, 5), _mm256_set1_epi32(7 << 3)), + _mm256_and_si256(_mm256_srli_epi32(pixelXx256, 8), _mm256_set1_epi32(7))), + _mm256_and_si256(_mm256_slli_epi32(tile, 6), _mm256_set1_epi32(0xFF << 6))); + __m256i palette = _mm256_and_si256(_mm256_srai_epi32(_mm256_i32gather_epi32((int*)ppu.vram, palette_addr, sizeof(uint16_t)), 8), _mm256_set1_epi32(0x000000FF)); + if(io.mode7.repeat == 2) + palette = _mm256_blendv_epi8(_mm256_setzero_si256(), palette, outOfBounds); + + __m256i priority; + if(!extbg) { + priority = _mm256_set1_epi32(self.priority[0]); + } else { + priority = _mm256_blendv_epi8(_mm256_set1_epi32(self.priority[0]), _mm256_set1_epi32(self.priority[1]), + _mm256_shuffle_epi8(palette, _mm256_set_epi32(0x00000000, 0x04040404, 0x08080808, 0x0C0C0C0C, + 0x00000000, 0x04040404, 0x08080808, 0x0C0C0C0C))); + palette = _mm256_and_si256(palette, _mm256_set1_epi32(0x0000007F)); + } + + __m256i color; + if(io.col.directColor && !extbg) { + color = _mm256_or_si256(_mm256_or_si256( + _mm256_and_si256(_mm256_slli_epi32(palette, 2 + 16), _mm256_set1_epi32(0x001C0000)), + _mm256_and_si256(_mm256_slli_epi32(palette, 4 + 16), _mm256_set1_epi32(0x03800000))), + _mm256_and_si256(_mm256_slli_epi32(palette, 7 + 16), _mm256_set1_epi32(0x60000000))); + } else { + //cgram-1 gives me a free <<16 + color = _mm256_and_si256(_mm256_i32gather_epi32((int*)(cgram - 1), palette, sizeof(uint16_t)), _mm256_set1_epi32(0xFFFF0000)); + } + + __m256i pixels = _mm256_or_si256(_mm256_or_si256(color, _mm256_set1_epi32(source)), _mm256_slli_epi32(priority, 8)); + __m256i prevAbove = _mm256_loadu_si256((__m256i*)above); + __m256i prevBelow = _mm256_loadu_si256((__m256i*)below); + + if(extbg) { + doAbove = _mm256_and_si256(doAbove, _mm256_shuffle_epi8(_mm256_cmpgt_epi8(pixels, prevAbove), + _mm256_set_epi32(0x01010101, 0x05050505, 0x09090909, 0x0D0D0D0D, + 0x01010101, 0x05050505, 0x09090909, 0x0D0D0D0D))); + doBelow = _mm256_and_si256(doBelow, _mm256_shuffle_epi8(_mm256_cmpgt_epi8(pixels, prevBelow), + _mm256_set_epi32(0x01010101, 0x05050505, 0x09090909, 0x0D0D0D0D, + 0x01010101, 0x05050505, 0x09090909, 0x0D0D0D0D))); + } + + _mm256_storeu_si256((__m256i*)above, _mm256_blendv_epi8(prevAbove, pixels, doAbove)); + _mm256_storeu_si256((__m256i*)below, _mm256_blendv_epi8(prevBelow, pixels, doBelow)); + + above += 8; + below += 8; + } +} +#endif diff --git a/bsnes/sfc/ppu-fast/mode7hd.cpp b/bsnes/sfc/ppu-fast/mode7hd.cpp index 868504a1..9c247494 100644 --- a/bsnes/sfc/ppu-fast/mode7hd.cpp +++ b/bsnes/sfc/ppu-fast/mode7hd.cpp @@ -1,3 +1,5 @@ +#define USE_AVX2 0 + //determine mode 7 line groups for perspective correction auto PPU::Line::cacheMode7HD() -> void { ppu.mode7LineGroups.count = 0; @@ -168,7 +170,14 @@ auto PPU::Line::renderMode7HD(PPU::IO::Background& self, uint8 source) -> void { int ht = (hoffset - hcenter) % 1024; float vty = ((voffset - vcenter) % 1024) + yf; float originX = (a * ht) + (b * vty) + (hcenter << 8); - float originY = (c * ht) + (d * vty) + (vcenter << 8); + float originY = (c * ht) + (d * vty) + (vcenter << 8); + + if(USE_AVX2 == 1) { //__builtin_cpu_supports("avx2")) { + renderMode7HD_AVX2(self, source, above + 1, below + 1, windowAbove, windowBelow, originX, a, originY, c); + above += 256 * scale; + below += 256 * scale; + continue; + } int pixelXp = INT_MIN; for(int x : range(256)) { diff --git a/bsnes/sfc/ppu-fast/ppu.cpp b/bsnes/sfc/ppu-fast/ppu.cpp index 861669b5..28ec091f 100644 --- a/bsnes/sfc/ppu-fast/ppu.cpp +++ b/bsnes/sfc/ppu-fast/ppu.cpp @@ -13,6 +13,7 @@ PPU ppu; #include "background.cpp" #include "mode7.cpp" #include "mode7hd.cpp" +#include "mode7hd-avx2.cpp" #include "object.cpp" #include "window.cpp" #include "serialization.cpp" diff --git a/bsnes/sfc/ppu-fast/ppu.hpp b/bsnes/sfc/ppu-fast/ppu.hpp index 65621d9d..8e2e9b57 100644 --- a/bsnes/sfc/ppu-fast/ppu.hpp +++ b/bsnes/sfc/ppu-fast/ppu.hpp @@ -307,6 +307,15 @@ public: auto renderMode7HD(PPU::IO::Background&, uint8 source) -> void; alwaysinline auto lerp(float pa, float va, float pb, float vb, float pr) -> float; + //mode7hd-avx2.cpp + auto renderMode7HD_AVX2( + PPU::IO::Background&, uint8 source, + Pixel* above, Pixel* below, + bool* windowAbove, bool* windowBelow, + float originX, float a, + float originY, float c + ) -> void; + //object.cpp auto renderObject(PPU::IO::Object&) -> void;