mirror of https://github.com/bsnes-emu/bsnes.git
parent
57c53a86b4
commit
b5301b7ea8
|
@ -22,14 +22,6 @@ using namespace nall;
|
||||||
|
|
||||||
#include <libco/libco.h>
|
#include <libco/libco.h>
|
||||||
|
|
||||||
#ifdef __SSE2__
|
|
||||||
#include <emmintrin.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifdef __AVX2__
|
|
||||||
#include <immintrin.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#include <emulator/types.hpp>
|
#include <emulator/types.hpp>
|
||||||
#include <emulator/memory/readable.hpp>
|
#include <emulator/memory/readable.hpp>
|
||||||
#include <emulator/memory/writable.hpp>
|
#include <emulator/memory/writable.hpp>
|
||||||
|
@ -37,7 +29,7 @@ using namespace nall;
|
||||||
|
|
||||||
namespace Emulator {
|
namespace Emulator {
|
||||||
static const string Name = "bsnes";
|
static const string Name = "bsnes";
|
||||||
static const string Version = "110.5";
|
static const string Version = "110.6";
|
||||||
static const string Author = "byuu";
|
static const string Author = "byuu";
|
||||||
static const string License = "GPLv3";
|
static const string License = "GPLv3";
|
||||||
static const string Website = "https://byuu.org";
|
static const string Website = "https://byuu.org";
|
||||||
|
|
|
@ -1,107 +0,0 @@
|
||||||
#if USE_AVX2 == 1
|
|
||||||
__attribute__((__target__("avx2")))
|
|
||||||
auto PPU::Line::renderMode7HD_AVX2(
|
|
||||||
PPU::IO::Background& self, uint8 source,
|
|
||||||
Pixel* above, Pixel* below,
|
|
||||||
bool* windowAbove, bool* windowBelow,
|
|
||||||
float originX, float a,
|
|
||||||
float originY, float c
|
|
||||||
) -> void {
|
|
||||||
const bool extbg = source == Source::BG2;
|
|
||||||
const uint scale = ppu.hdScale();
|
|
||||||
|
|
||||||
//8 pixels at the time, one int32 per pixel
|
|
||||||
for (int x : range(256 * scale / 8)) {
|
|
||||||
//ensure everything has the expected types.
|
|
||||||
//some accesses are outside the buffers; the unwanted bytes are masked out,
|
|
||||||
//but it's still undefined behavior, and it will crash if those bytes are unmapped or if compiler becomes too clever.
|
|
||||||
static_assert(std::is_same_v<decltype(ppu.vram), uint16_t[32768]>);
|
|
||||||
static_assert(std::is_same_v<decltype(cgram), uint16_t[256]>);
|
|
||||||
static_assert(std::is_same_v<decltype(self.priority), uint8_t[2]>);
|
|
||||||
static_assert(sizeof(Pixel) == 4);
|
|
||||||
static_assert(offsetof(Pixel, source) == 0);
|
|
||||||
static_assert(std::is_same_v<decltype(Pixel::source), uint8_t>);
|
|
||||||
static_assert(offsetof(Pixel, priority) == 1);
|
|
||||||
static_assert(std::is_same_v<decltype(Pixel::priority), uint8_t>);
|
|
||||||
static_assert(offsetof(Pixel, color) == 2);
|
|
||||||
static_assert(std::is_same_v<decltype(Pixel::color), uint16_t>);
|
|
||||||
|
|
||||||
__m256 offset_f32 = _mm256_setr_ps(0, 1, 2, 3, 4, 5, 6, 7);
|
|
||||||
__m256 posi_f32 = _mm256_add_ps(_mm256_set1_ps(x * 8), offset_f32);
|
|
||||||
__m256 pos_f32 = _mm256_div_ps(posi_f32, _mm256_set1_ps(scale));
|
|
||||||
|
|
||||||
__m256i doAbove = _mm256_setzero_si256(); //0x00000000 - ignore, 0xFFFFFFFF - render
|
|
||||||
if(self.aboveEnable) {
|
|
||||||
doAbove = _mm256_i32gather_epi32((int*)windowAbove, _mm256_cvtps_epi32(pos_f32), 1);
|
|
||||||
doAbove = _mm256_and_si256(doAbove, _mm256_set1_epi32(0x00000001));
|
|
||||||
doAbove = _mm256_sub_epi32(doAbove, _mm256_set1_epi32(0x00000001));
|
|
||||||
}
|
|
||||||
__m256i doBelow = _mm256_setzero_si256();
|
|
||||||
if(self.belowEnable) {
|
|
||||||
doBelow = _mm256_i32gather_epi32((int*)windowBelow, _mm256_cvtps_epi32(pos_f32), 1);
|
|
||||||
doBelow = _mm256_and_si256(doBelow, _mm256_set1_epi32(0x00000001));
|
|
||||||
doBelow = _mm256_sub_epi32(doBelow, _mm256_set1_epi32(0x00000001));
|
|
||||||
}
|
|
||||||
|
|
||||||
__m256 xf = _mm256_add_ps(pos_f32, _mm256_set1_ps(-0.5));
|
|
||||||
if(io.mode7.hflip) xf = _mm256_sub_ps(_mm256_set1_ps(255), xf);
|
|
||||||
|
|
||||||
__m256i pixelXx256 = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_set1_ps(originX), _mm256_mul_ps(_mm256_set1_ps(a), xf)));
|
|
||||||
__m256i pixelYx256 = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_set1_ps(originY), _mm256_mul_ps(_mm256_set1_ps(c), xf)));
|
|
||||||
|
|
||||||
__m256i tile_addr = _mm256_or_si256(_mm256_and_si256(_mm256_srai_epi32(pixelYx256, 4), _mm256_set1_epi32(127 << 7)),
|
|
||||||
_mm256_and_si256(_mm256_srai_epi32(pixelXx256, 11), _mm256_set1_epi32(127)));
|
|
||||||
__m256i tile = _mm256_and_si256(_mm256_i32gather_epi32((int*)ppu.vram, tile_addr, sizeof(uint16_t)), _mm256_set1_epi32(0x000000FF));
|
|
||||||
__m256i outOfBounds = _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_or_si256(pixelXx256, pixelYx256), _mm256_set1_epi32((~1023) * 256)), _mm256_setzero_si256());
|
|
||||||
if(io.mode7.repeat == 3)
|
|
||||||
tile = _mm256_blendv_epi8(_mm256_setzero_si256(), tile, outOfBounds);
|
|
||||||
__m256i palette_addr = _mm256_or_si256(_mm256_or_si256(
|
|
||||||
_mm256_and_si256(_mm256_srli_epi32(pixelYx256, 5), _mm256_set1_epi32(7 << 3)),
|
|
||||||
_mm256_and_si256(_mm256_srli_epi32(pixelXx256, 8), _mm256_set1_epi32(7))),
|
|
||||||
_mm256_and_si256(_mm256_slli_epi32(tile, 6), _mm256_set1_epi32(0xFF << 6)));
|
|
||||||
__m256i palette = _mm256_and_si256(_mm256_srai_epi32(_mm256_i32gather_epi32((int*)ppu.vram, palette_addr, sizeof(uint16_t)), 8), _mm256_set1_epi32(0x000000FF));
|
|
||||||
if(io.mode7.repeat == 2)
|
|
||||||
palette = _mm256_blendv_epi8(_mm256_setzero_si256(), palette, outOfBounds);
|
|
||||||
|
|
||||||
__m256i priority;
|
|
||||||
if(!extbg) {
|
|
||||||
priority = _mm256_set1_epi32(self.priority[0]);
|
|
||||||
} else {
|
|
||||||
priority = _mm256_blendv_epi8(_mm256_set1_epi32(self.priority[0]), _mm256_set1_epi32(self.priority[1]),
|
|
||||||
_mm256_shuffle_epi8(palette, _mm256_set_epi32(0x00000000, 0x04040404, 0x08080808, 0x0C0C0C0C,
|
|
||||||
0x00000000, 0x04040404, 0x08080808, 0x0C0C0C0C)));
|
|
||||||
palette = _mm256_and_si256(palette, _mm256_set1_epi32(0x0000007F));
|
|
||||||
}
|
|
||||||
|
|
||||||
__m256i color;
|
|
||||||
if(io.col.directColor && !extbg) {
|
|
||||||
color = _mm256_or_si256(_mm256_or_si256(
|
|
||||||
_mm256_and_si256(_mm256_slli_epi32(palette, 2 + 16), _mm256_set1_epi32(0x001C0000)),
|
|
||||||
_mm256_and_si256(_mm256_slli_epi32(palette, 4 + 16), _mm256_set1_epi32(0x03800000))),
|
|
||||||
_mm256_and_si256(_mm256_slli_epi32(palette, 7 + 16), _mm256_set1_epi32(0x60000000)));
|
|
||||||
} else {
|
|
||||||
//cgram-1 gives me a free <<16
|
|
||||||
color = _mm256_and_si256(_mm256_i32gather_epi32((int*)(cgram - 1), palette, sizeof(uint16_t)), _mm256_set1_epi32(0xFFFF0000));
|
|
||||||
}
|
|
||||||
|
|
||||||
__m256i pixels = _mm256_or_si256(_mm256_or_si256(color, _mm256_set1_epi32(source)), _mm256_slli_epi32(priority, 8));
|
|
||||||
__m256i prevAbove = _mm256_loadu_si256((__m256i*)above);
|
|
||||||
__m256i prevBelow = _mm256_loadu_si256((__m256i*)below);
|
|
||||||
|
|
||||||
if(extbg) {
|
|
||||||
doAbove = _mm256_and_si256(doAbove, _mm256_shuffle_epi8(_mm256_cmpgt_epi8(pixels, prevAbove),
|
|
||||||
_mm256_set_epi32(0x01010101, 0x05050505, 0x09090909, 0x0D0D0D0D,
|
|
||||||
0x01010101, 0x05050505, 0x09090909, 0x0D0D0D0D)));
|
|
||||||
doBelow = _mm256_and_si256(doBelow, _mm256_shuffle_epi8(_mm256_cmpgt_epi8(pixels, prevBelow),
|
|
||||||
_mm256_set_epi32(0x01010101, 0x05050505, 0x09090909, 0x0D0D0D0D,
|
|
||||||
0x01010101, 0x05050505, 0x09090909, 0x0D0D0D0D)));
|
|
||||||
}
|
|
||||||
|
|
||||||
_mm256_storeu_si256((__m256i*)above, _mm256_blendv_epi8(prevAbove, pixels, doAbove));
|
|
||||||
_mm256_storeu_si256((__m256i*)below, _mm256_blendv_epi8(prevBelow, pixels, doBelow));
|
|
||||||
|
|
||||||
above += 8;
|
|
||||||
below += 8;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
#endif
|
|
|
@ -1,5 +1,3 @@
|
||||||
#define USE_AVX2 0
|
|
||||||
|
|
||||||
//determine mode 7 line groups for perspective correction
|
//determine mode 7 line groups for perspective correction
|
||||||
auto PPU::Line::cacheMode7HD() -> void {
|
auto PPU::Line::cacheMode7HD() -> void {
|
||||||
ppu.mode7LineGroups.count = 0;
|
ppu.mode7LineGroups.count = 0;
|
||||||
|
@ -172,13 +170,6 @@ auto PPU::Line::renderMode7HD(PPU::IO::Background& self, uint8 source) -> void {
|
||||||
float originX = (a * ht) + (b * vty) + (hcenter << 8);
|
float originX = (a * ht) + (b * vty) + (hcenter << 8);
|
||||||
float originY = (c * ht) + (d * vty) + (vcenter << 8);
|
float originY = (c * ht) + (d * vty) + (vcenter << 8);
|
||||||
|
|
||||||
if(USE_AVX2 == 1) { //__builtin_cpu_supports("avx2")) {
|
|
||||||
renderMode7HD_AVX2(self, source, above + 1, below + 1, windowAbove, windowBelow, originX, a, originY, c);
|
|
||||||
above += 256 * scale;
|
|
||||||
below += 256 * scale;
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
int pixelXp = INT_MIN;
|
int pixelXp = INT_MIN;
|
||||||
for(int x : range(256)) {
|
for(int x : range(256)) {
|
||||||
bool doAbove = self.aboveEnable && !windowAbove[x];
|
bool doAbove = self.aboveEnable && !windowAbove[x];
|
||||||
|
|
|
@ -13,7 +13,6 @@ PPU ppu;
|
||||||
#include "background.cpp"
|
#include "background.cpp"
|
||||||
#include "mode7.cpp"
|
#include "mode7.cpp"
|
||||||
#include "mode7hd.cpp"
|
#include "mode7hd.cpp"
|
||||||
#include "mode7hd-avx2.cpp"
|
|
||||||
#include "object.cpp"
|
#include "object.cpp"
|
||||||
#include "window.cpp"
|
#include "window.cpp"
|
||||||
#include "serialization.cpp"
|
#include "serialization.cpp"
|
||||||
|
|
Loading…
Reference in New Issue