Video Filters: Rework the Scanline and Nearest2X filters to modern standards, and also add an SSE2 version of the Scanline filter.

This commit is contained in:
rogerman 2017-01-23 12:31:52 -08:00
parent 48bef75fd9
commit 3b0920d7f0
1 changed file with 114 additions and 61 deletions

View File

@ -1,5 +1,5 @@
/* /*
Copyright (C) 2009-2015 DeSmuME team Copyright (C) 2009-2017 DeSmuME team
This file is free software: you can redistribute it and/or modify This file is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by it under the terms of the GNU General Public License as published by
@ -20,53 +20,75 @@
#include <stdio.h> #include <stdio.h>
#include <string.h> #include <string.h>
typedef u64 uint64;
extern int scanline_filter_a, scanline_filter_b, scanline_filter_c, scanline_filter_d; extern int scanline_filter_a, scanline_filter_b, scanline_filter_c, scanline_filter_d;
static int fac_a, fac_b, fac_c, fac_d; static int fac_a, fac_b, fac_c, fac_d;
FORCEINLINE void ScanLine32( uint32 *lpDst, uint32 *lpSrc, unsigned int Width, int fac_left, int fac_right) #if defined(ENABLE_SSE2)
template <size_t LINEWIDTH>
FORCEINLINE static void ScanLine32_FastSSE2(u32 *__restrict lpDst, const u32 *__restrict lpSrc, const int fac_left, const int fac_right)
{ {
while(Width--) const v128u16 weight = _mm_set_epi16(16, fac_right, fac_right, fac_right, 16, fac_left, fac_left, fac_left);
for (size_t i = 0; i < LINEWIDTH; i+=4, lpSrc+=4, lpDst+=8)
{
const v128u32 src = _mm_load_si128((v128u32 *__restrict)lpSrc);
const v128u32 srcLo = _mm_unpacklo_epi32(src, src);
const v128u32 srcHi = _mm_unpackhi_epi32(src, src);
const v128u32 srcLo0 = _mm_srli_epi16( _mm_mullo_epi16(_mm_unpacklo_epi8(srcLo, _mm_setzero_si128()), weight), 4 );
const v128u32 srcLo1 = _mm_srli_epi16( _mm_mullo_epi16(_mm_unpackhi_epi8(srcLo, _mm_setzero_si128()), weight), 4 );
const v128u32 srcHi0 = _mm_srli_epi16( _mm_mullo_epi16(_mm_unpacklo_epi8(srcHi, _mm_setzero_si128()), weight), 4 );
const v128u32 srcHi1 = _mm_srli_epi16( _mm_mullo_epi16(_mm_unpackhi_epi8(srcHi, _mm_setzero_si128()), weight), 4 );
_mm_stream_si128( (v128u32 *__restrict)(lpDst + 0), _mm_packus_epi16(srcLo0, srcLo1) );
_mm_stream_si128( (v128u32 *__restrict)(lpDst + 4), _mm_packus_epi16(srcHi0, srcHi1) );
}
}
#endif
FORCEINLINE static void ScanLine32(u32 *__restrict lpDst, const u32 *__restrict lpSrc, const size_t lineWidth, const int fac_left, const int fac_right)
{
const u8 *__restrict src8 = (const u8 *__restrict)lpSrc;
u8 *__restrict dst8 = (u8 *__restrict)lpDst;
for (size_t i = 0; i < lineWidth; i++, lpSrc++, lpDst+=2)
{ {
#ifdef MSB_FIRST #ifdef MSB_FIRST
u8* u8dst = (u8*)lpDst; dst8[(i*8)+1] = src8[(i*4)+1] * fac_left / 16;
u8* u8src = (u8*)lpSrc; dst8[(i*8)+2] = src8[(i*4)+2] * fac_left / 16;
u8dst++; u8src++; dst8[(i*8)+3] = src8[(i*4)+3] * fac_left / 16;
*u8dst++ = *u8src++ * fac_left / 16;
*u8dst++ = *u8src++ * fac_left / 16;
*u8dst++ = *u8src * fac_left / 16;
u8src = (u8*)lpSrc; dst8[(i*8)+5] = src8[(i*4)+1] * fac_right / 16;
u8dst++; u8src++; dst8[(i*8)+6] = src8[(i*4)+2] * fac_right / 16;
*u8dst++ = *u8src++ * fac_right / 16; dst8[(i*8)+7] = src8[(i*4)+3] * fac_right / 16;
*u8dst++ = *u8src++ * fac_right / 16;
*u8dst++ = *u8src++ * fac_right / 16;
lpDst+=2;
lpSrc++;
#else #else
u8* u8dst = (u8*)lpDst; dst8[(i*8)+0] = src8[(i*4)+0] * fac_left / 16;
u8* u8src = (u8*)lpSrc; dst8[(i*8)+1] = src8[(i*4)+1] * fac_left / 16;
*u8dst++ = *u8src++ * fac_left / 16; dst8[(i*8)+2] = src8[(i*4)+2] * fac_left / 16;
*u8dst++ = *u8src++ * fac_left / 16;
*u8dst++ = *u8src++ * fac_left / 16;
u8dst++;
u8src = (u8*)lpSrc; dst8[(i*8)+4] = src8[(i*4)+0] * fac_right / 16;
*u8dst++ = *u8src++ * fac_right / 16; dst8[(i*8)+5] = src8[(i*4)+1] * fac_right / 16;
*u8dst++ = *u8src++ * fac_right / 16; dst8[(i*8)+6] = src8[(i*4)+2] * fac_right / 16;
*u8dst++ = *u8src++ * fac_right / 16;
u8dst++; u8src++;
lpDst+=2;
lpSrc++;
#endif #endif
} }
} }
FORCEINLINE void DoubleLine32( uint32 *lpDst, uint32 *lpSrc, unsigned int Width){ template <size_t LINEWIDTH>
while(Width--){ FORCEINLINE static void DoubleLine32_Fast(u32 *__restrict lpDst, const u32 *__restrict lpSrc)
*lpDst++ = *lpSrc; {
*lpDst++ = *lpSrc++; for (size_t i = 0; i < LINEWIDTH; i++)
{
lpDst[(i*2)+0] = lpSrc[i];
lpDst[(i*2)+1] = lpSrc[i];
}
}
FORCEINLINE static void DoubleLine32(u32 *__restrict lpDst, const u32 *__restrict lpSrc, const size_t lineWidth)
{
for (size_t i = 0; i < lineWidth; i++)
{
lpDst[(i*2)+0] = lpSrc[i];
lpDst[(i*2)+1] = lpSrc[i];
} }
} }
@ -76,16 +98,30 @@ void RenderScanline( SSurface Src, SSurface Dst)
fac_b = (16-scanline_filter_b); fac_b = (16-scanline_filter_b);
fac_c = (16-scanline_filter_c); fac_c = (16-scanline_filter_c);
fac_d = (16-scanline_filter_d); fac_d = (16-scanline_filter_d);
unsigned int H; size_t dstLineIndex = 0;
const uint32 srcHeight = Src.Height; const size_t srcHeight = Src.Height;
const size_t srcPitch = Src.Pitch >> 1;
const unsigned int srcPitch = Src.Pitch >> 1;
u32 *lpSrc = (u32 *)Src.Surface; u32 *lpSrc = (u32 *)Src.Surface;
const unsigned int dstPitch = Dst.Pitch >> 1; const size_t dstPitch = Dst.Pitch >> 1;
u32 *lpDst = (u32 *)Dst.Surface; u32 *lpDst = (u32 *)Dst.Surface;
for (H = 0; H < srcHeight; H++, lpSrc += srcPitch)
#ifdef ENABLE_SSE2
if (Src.Width == 256)
{
for (; dstLineIndex < srcHeight; dstLineIndex++, lpSrc += srcPitch)
{
ScanLine32_FastSSE2<256>(lpDst, lpSrc, fac_a, fac_b);
lpDst += dstPitch;
ScanLine32_FastSSE2<256>(lpDst, lpSrc, fac_c, fac_d);
lpDst += dstPitch;
}
}
else
#endif
{
for (; dstLineIndex < srcHeight; dstLineIndex++, lpSrc += srcPitch)
{ {
ScanLine32(lpDst, lpSrc, Src.Width, fac_a, fac_b); ScanLine32(lpDst, lpSrc, Src.Width, fac_a, fac_b);
lpDst += dstPitch; lpDst += dstPitch;
@ -93,20 +129,37 @@ void RenderScanline( SSurface Src, SSurface Dst)
lpDst += dstPitch; lpDst += dstPitch;
} }
} }
}
void RenderNearest2X(SSurface Src, SSurface Dst) void RenderNearest2X(SSurface Src, SSurface Dst)
{ {
uint32 *lpSrc; size_t dstLineIndex = 0;
unsigned int H;
const uint32 srcHeight = Src.Height; const size_t srcHeight = Src.Height;
const size_t srcPitch = Src.Pitch >> 1;
const u32 *lpSrc = (u32 *)Src.Surface;
const unsigned int srcPitch = Src.Pitch >> 1; const size_t dstPitch = Dst.Pitch >> 1;
lpSrc = reinterpret_cast<uint32 *>(Src.Surface); u32 *lpDst = (u32 *)Dst.Surface;
const unsigned int dstPitch = Dst.Pitch >> 1; if (Src.Width == 256)
uint32 *lpDst = (uint32*)Dst.Surface; {
for (H = 0; H < srcHeight; H++, lpSrc += srcPitch) for (; dstLineIndex < srcHeight; dstLineIndex++, lpSrc += srcPitch)
DoubleLine32 (lpDst, lpSrc, Src.Width), lpDst += dstPitch, {
DoubleLine32 (lpDst, lpSrc, Src.Width), lpDst += dstPitch; DoubleLine32_Fast<256>(lpDst, lpSrc);
lpDst += dstPitch;
DoubleLine32_Fast<256>(lpDst, lpSrc);
lpDst += dstPitch;
}
}
else
{
for (; dstLineIndex < srcHeight; dstLineIndex++, lpSrc += srcPitch)
{
DoubleLine32(lpDst, lpSrc, Src.Width);
lpDst += dstPitch;
DoubleLine32(lpDst, lpSrc, Src.Width);
lpDst += dstPitch;
}
}
} }