mirror of https://github.com/bsnes-emu/bsnes.git
v110.5
Merged Alcaro's AVX2 mode 7 renderer. Enable by editing mode7hd.cpp: USE_AVX2=1
This commit is contained in:
parent
f19f31938b
commit
57c53a86b4
|
@ -22,6 +22,14 @@ using namespace nall;
|
|||
|
||||
#include <libco/libco.h>
|
||||
|
||||
#ifdef __SSE2__
|
||||
#include <emmintrin.h>
|
||||
#endif
|
||||
|
||||
#ifdef __AVX2__
|
||||
#include <immintrin.h>
|
||||
#endif
|
||||
|
||||
#include <emulator/types.hpp>
|
||||
#include <emulator/memory/readable.hpp>
|
||||
#include <emulator/memory/writable.hpp>
|
||||
|
@ -29,7 +37,7 @@ using namespace nall;
|
|||
|
||||
namespace Emulator {
|
||||
static const string Name = "bsnes";
|
||||
static const string Version = "110.4";
|
||||
static const string Version = "110.5";
|
||||
static const string Author = "byuu";
|
||||
static const string License = "GPLv3";
|
||||
static const string Website = "https://byuu.org";
|
||||
|
|
|
@ -0,0 +1,107 @@
|
|||
#if USE_AVX2 == 1
|
||||
__attribute__((__target__("avx2")))
|
||||
auto PPU::Line::renderMode7HD_AVX2(
|
||||
PPU::IO::Background& self, uint8 source,
|
||||
Pixel* above, Pixel* below,
|
||||
bool* windowAbove, bool* windowBelow,
|
||||
float originX, float a,
|
||||
float originY, float c
|
||||
) -> void {
|
||||
const bool extbg = source == Source::BG2;
|
||||
const uint scale = ppu.hdScale();
|
||||
|
||||
//8 pixels at the time, one int32 per pixel
|
||||
for (int x : range(256 * scale / 8)) {
|
||||
//ensure everything has the expected types.
|
||||
//some accesses are outside the buffers; the unwanted bytes are masked out,
|
||||
//but it's still undefined behavior, and it will crash if those bytes are unmapped or if compiler becomes too clever.
|
||||
static_assert(std::is_same_v<decltype(ppu.vram), uint16_t[32768]>);
|
||||
static_assert(std::is_same_v<decltype(cgram), uint16_t[256]>);
|
||||
static_assert(std::is_same_v<decltype(self.priority), uint8_t[2]>);
|
||||
static_assert(sizeof(Pixel) == 4);
|
||||
static_assert(offsetof(Pixel, source) == 0);
|
||||
static_assert(std::is_same_v<decltype(Pixel::source), uint8_t>);
|
||||
static_assert(offsetof(Pixel, priority) == 1);
|
||||
static_assert(std::is_same_v<decltype(Pixel::priority), uint8_t>);
|
||||
static_assert(offsetof(Pixel, color) == 2);
|
||||
static_assert(std::is_same_v<decltype(Pixel::color), uint16_t>);
|
||||
|
||||
__m256 offset_f32 = _mm256_setr_ps(0, 1, 2, 3, 4, 5, 6, 7);
|
||||
__m256 posi_f32 = _mm256_add_ps(_mm256_set1_ps(x * 8), offset_f32);
|
||||
__m256 pos_f32 = _mm256_div_ps(posi_f32, _mm256_set1_ps(scale));
|
||||
|
||||
__m256i doAbove = _mm256_setzero_si256(); //0x00000000 - ignore, 0xFFFFFFFF - render
|
||||
if(self.aboveEnable) {
|
||||
doAbove = _mm256_i32gather_epi32((int*)windowAbove, _mm256_cvtps_epi32(pos_f32), 1);
|
||||
doAbove = _mm256_and_si256(doAbove, _mm256_set1_epi32(0x00000001));
|
||||
doAbove = _mm256_sub_epi32(doAbove, _mm256_set1_epi32(0x00000001));
|
||||
}
|
||||
__m256i doBelow = _mm256_setzero_si256();
|
||||
if(self.belowEnable) {
|
||||
doBelow = _mm256_i32gather_epi32((int*)windowBelow, _mm256_cvtps_epi32(pos_f32), 1);
|
||||
doBelow = _mm256_and_si256(doBelow, _mm256_set1_epi32(0x00000001));
|
||||
doBelow = _mm256_sub_epi32(doBelow, _mm256_set1_epi32(0x00000001));
|
||||
}
|
||||
|
||||
__m256 xf = _mm256_add_ps(pos_f32, _mm256_set1_ps(-0.5));
|
||||
if(io.mode7.hflip) xf = _mm256_sub_ps(_mm256_set1_ps(255), xf);
|
||||
|
||||
__m256i pixelXx256 = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_set1_ps(originX), _mm256_mul_ps(_mm256_set1_ps(a), xf)));
|
||||
__m256i pixelYx256 = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_set1_ps(originY), _mm256_mul_ps(_mm256_set1_ps(c), xf)));
|
||||
|
||||
__m256i tile_addr = _mm256_or_si256(_mm256_and_si256(_mm256_srai_epi32(pixelYx256, 4), _mm256_set1_epi32(127 << 7)),
|
||||
_mm256_and_si256(_mm256_srai_epi32(pixelXx256, 11), _mm256_set1_epi32(127)));
|
||||
__m256i tile = _mm256_and_si256(_mm256_i32gather_epi32((int*)ppu.vram, tile_addr, sizeof(uint16_t)), _mm256_set1_epi32(0x000000FF));
|
||||
__m256i outOfBounds = _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_or_si256(pixelXx256, pixelYx256), _mm256_set1_epi32((~1023) * 256)), _mm256_setzero_si256());
|
||||
if(io.mode7.repeat == 3)
|
||||
tile = _mm256_blendv_epi8(_mm256_setzero_si256(), tile, outOfBounds);
|
||||
__m256i palette_addr = _mm256_or_si256(_mm256_or_si256(
|
||||
_mm256_and_si256(_mm256_srli_epi32(pixelYx256, 5), _mm256_set1_epi32(7 << 3)),
|
||||
_mm256_and_si256(_mm256_srli_epi32(pixelXx256, 8), _mm256_set1_epi32(7))),
|
||||
_mm256_and_si256(_mm256_slli_epi32(tile, 6), _mm256_set1_epi32(0xFF << 6)));
|
||||
__m256i palette = _mm256_and_si256(_mm256_srai_epi32(_mm256_i32gather_epi32((int*)ppu.vram, palette_addr, sizeof(uint16_t)), 8), _mm256_set1_epi32(0x000000FF));
|
||||
if(io.mode7.repeat == 2)
|
||||
palette = _mm256_blendv_epi8(_mm256_setzero_si256(), palette, outOfBounds);
|
||||
|
||||
__m256i priority;
|
||||
if(!extbg) {
|
||||
priority = _mm256_set1_epi32(self.priority[0]);
|
||||
} else {
|
||||
priority = _mm256_blendv_epi8(_mm256_set1_epi32(self.priority[0]), _mm256_set1_epi32(self.priority[1]),
|
||||
_mm256_shuffle_epi8(palette, _mm256_set_epi32(0x00000000, 0x04040404, 0x08080808, 0x0C0C0C0C,
|
||||
0x00000000, 0x04040404, 0x08080808, 0x0C0C0C0C)));
|
||||
palette = _mm256_and_si256(palette, _mm256_set1_epi32(0x0000007F));
|
||||
}
|
||||
|
||||
__m256i color;
|
||||
if(io.col.directColor && !extbg) {
|
||||
color = _mm256_or_si256(_mm256_or_si256(
|
||||
_mm256_and_si256(_mm256_slli_epi32(palette, 2 + 16), _mm256_set1_epi32(0x001C0000)),
|
||||
_mm256_and_si256(_mm256_slli_epi32(palette, 4 + 16), _mm256_set1_epi32(0x03800000))),
|
||||
_mm256_and_si256(_mm256_slli_epi32(palette, 7 + 16), _mm256_set1_epi32(0x60000000)));
|
||||
} else {
|
||||
//cgram-1 gives me a free <<16
|
||||
color = _mm256_and_si256(_mm256_i32gather_epi32((int*)(cgram - 1), palette, sizeof(uint16_t)), _mm256_set1_epi32(0xFFFF0000));
|
||||
}
|
||||
|
||||
__m256i pixels = _mm256_or_si256(_mm256_or_si256(color, _mm256_set1_epi32(source)), _mm256_slli_epi32(priority, 8));
|
||||
__m256i prevAbove = _mm256_loadu_si256((__m256i*)above);
|
||||
__m256i prevBelow = _mm256_loadu_si256((__m256i*)below);
|
||||
|
||||
if(extbg) {
|
||||
doAbove = _mm256_and_si256(doAbove, _mm256_shuffle_epi8(_mm256_cmpgt_epi8(pixels, prevAbove),
|
||||
_mm256_set_epi32(0x01010101, 0x05050505, 0x09090909, 0x0D0D0D0D,
|
||||
0x01010101, 0x05050505, 0x09090909, 0x0D0D0D0D)));
|
||||
doBelow = _mm256_and_si256(doBelow, _mm256_shuffle_epi8(_mm256_cmpgt_epi8(pixels, prevBelow),
|
||||
_mm256_set_epi32(0x01010101, 0x05050505, 0x09090909, 0x0D0D0D0D,
|
||||
0x01010101, 0x05050505, 0x09090909, 0x0D0D0D0D)));
|
||||
}
|
||||
|
||||
_mm256_storeu_si256((__m256i*)above, _mm256_blendv_epi8(prevAbove, pixels, doAbove));
|
||||
_mm256_storeu_si256((__m256i*)below, _mm256_blendv_epi8(prevBelow, pixels, doBelow));
|
||||
|
||||
above += 8;
|
||||
below += 8;
|
||||
}
|
||||
}
|
||||
#endif
|
|
@ -1,3 +1,5 @@
|
|||
#define USE_AVX2 0
|
||||
|
||||
//determine mode 7 line groups for perspective correction
|
||||
auto PPU::Line::cacheMode7HD() -> void {
|
||||
ppu.mode7LineGroups.count = 0;
|
||||
|
@ -168,7 +170,14 @@ auto PPU::Line::renderMode7HD(PPU::IO::Background& self, uint8 source) -> void {
|
|||
int ht = (hoffset - hcenter) % 1024;
|
||||
float vty = ((voffset - vcenter) % 1024) + yf;
|
||||
float originX = (a * ht) + (b * vty) + (hcenter << 8);
|
||||
float originY = (c * ht) + (d * vty) + (vcenter << 8);
|
||||
float originY = (c * ht) + (d * vty) + (vcenter << 8);
|
||||
|
||||
if(USE_AVX2 == 1) { //__builtin_cpu_supports("avx2")) {
|
||||
renderMode7HD_AVX2(self, source, above + 1, below + 1, windowAbove, windowBelow, originX, a, originY, c);
|
||||
above += 256 * scale;
|
||||
below += 256 * scale;
|
||||
continue;
|
||||
}
|
||||
|
||||
int pixelXp = INT_MIN;
|
||||
for(int x : range(256)) {
|
||||
|
|
|
@ -13,6 +13,7 @@ PPU ppu;
|
|||
#include "background.cpp"
|
||||
#include "mode7.cpp"
|
||||
#include "mode7hd.cpp"
|
||||
#include "mode7hd-avx2.cpp"
|
||||
#include "object.cpp"
|
||||
#include "window.cpp"
|
||||
#include "serialization.cpp"
|
||||
|
|
|
@ -307,6 +307,15 @@ public:
|
|||
auto renderMode7HD(PPU::IO::Background&, uint8 source) -> void;
|
||||
alwaysinline auto lerp(float pa, float va, float pb, float vb, float pr) -> float;
|
||||
|
||||
//mode7hd-avx2.cpp
|
||||
auto renderMode7HD_AVX2(
|
||||
PPU::IO::Background&, uint8 source,
|
||||
Pixel* above, Pixel* below,
|
||||
bool* windowAbove, bool* windowBelow,
|
||||
float originX, float a,
|
||||
float originY, float c
|
||||
) -> void;
|
||||
|
||||
//object.cpp
|
||||
auto renderObject(PPU::IO::Object&) -> void;
|
||||
|
||||
|
|
Loading…
Reference in New Issue