Video Filters: Rework the Scanline and Nearest2X filters to modern standards, and also add an SSE2 version of the Scanline filter.
This commit is contained in:
parent
48bef75fd9
commit
3b0920d7f0
|
@ -1,5 +1,5 @@
|
||||||
/*
|
/*
|
||||||
Copyright (C) 2009-2015 DeSmuME team
|
Copyright (C) 2009-2017 DeSmuME team
|
||||||
|
|
||||||
This file is free software: you can redistribute it and/or modify
|
This file is free software: you can redistribute it and/or modify
|
||||||
it under the terms of the GNU General Public License as published by
|
it under the terms of the GNU General Public License as published by
|
||||||
|
@ -20,93 +20,146 @@
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
|
|
||||||
typedef u64 uint64;
|
|
||||||
|
|
||||||
extern int scanline_filter_a, scanline_filter_b, scanline_filter_c, scanline_filter_d;
|
extern int scanline_filter_a, scanline_filter_b, scanline_filter_c, scanline_filter_d;
|
||||||
static int fac_a, fac_b, fac_c, fac_d;
|
static int fac_a, fac_b, fac_c, fac_d;
|
||||||
|
|
||||||
FORCEINLINE void ScanLine32( uint32 *lpDst, uint32 *lpSrc, unsigned int Width, int fac_left, int fac_right)
|
#if defined(ENABLE_SSE2)
|
||||||
|
template <size_t LINEWIDTH>
|
||||||
|
FORCEINLINE static void ScanLine32_FastSSE2(u32 *__restrict lpDst, const u32 *__restrict lpSrc, const int fac_left, const int fac_right)
|
||||||
{
|
{
|
||||||
while(Width--)
|
const v128u16 weight = _mm_set_epi16(16, fac_right, fac_right, fac_right, 16, fac_left, fac_left, fac_left);
|
||||||
|
|
||||||
|
for (size_t i = 0; i < LINEWIDTH; i+=4, lpSrc+=4, lpDst+=8)
|
||||||
|
{
|
||||||
|
const v128u32 src = _mm_load_si128((v128u32 *__restrict)lpSrc);
|
||||||
|
const v128u32 srcLo = _mm_unpacklo_epi32(src, src);
|
||||||
|
const v128u32 srcHi = _mm_unpackhi_epi32(src, src);
|
||||||
|
|
||||||
|
const v128u32 srcLo0 = _mm_srli_epi16( _mm_mullo_epi16(_mm_unpacklo_epi8(srcLo, _mm_setzero_si128()), weight), 4 );
|
||||||
|
const v128u32 srcLo1 = _mm_srli_epi16( _mm_mullo_epi16(_mm_unpackhi_epi8(srcLo, _mm_setzero_si128()), weight), 4 );
|
||||||
|
const v128u32 srcHi0 = _mm_srli_epi16( _mm_mullo_epi16(_mm_unpacklo_epi8(srcHi, _mm_setzero_si128()), weight), 4 );
|
||||||
|
const v128u32 srcHi1 = _mm_srli_epi16( _mm_mullo_epi16(_mm_unpackhi_epi8(srcHi, _mm_setzero_si128()), weight), 4 );
|
||||||
|
|
||||||
|
_mm_stream_si128( (v128u32 *__restrict)(lpDst + 0), _mm_packus_epi16(srcLo0, srcLo1) );
|
||||||
|
_mm_stream_si128( (v128u32 *__restrict)(lpDst + 4), _mm_packus_epi16(srcHi0, srcHi1) );
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
FORCEINLINE static void ScanLine32(u32 *__restrict lpDst, const u32 *__restrict lpSrc, const size_t lineWidth, const int fac_left, const int fac_right)
|
||||||
|
{
|
||||||
|
const u8 *__restrict src8 = (const u8 *__restrict)lpSrc;
|
||||||
|
u8 *__restrict dst8 = (u8 *__restrict)lpDst;
|
||||||
|
|
||||||
|
for (size_t i = 0; i < lineWidth; i++, lpSrc++, lpDst+=2)
|
||||||
{
|
{
|
||||||
#ifdef MSB_FIRST
|
#ifdef MSB_FIRST
|
||||||
u8* u8dst = (u8*)lpDst;
|
dst8[(i*8)+1] = src8[(i*4)+1] * fac_left / 16;
|
||||||
u8* u8src = (u8*)lpSrc;
|
dst8[(i*8)+2] = src8[(i*4)+2] * fac_left / 16;
|
||||||
u8dst++; u8src++;
|
dst8[(i*8)+3] = src8[(i*4)+3] * fac_left / 16;
|
||||||
*u8dst++ = *u8src++ * fac_left / 16;
|
|
||||||
*u8dst++ = *u8src++ * fac_left / 16;
|
|
||||||
*u8dst++ = *u8src * fac_left / 16;
|
|
||||||
|
|
||||||
u8src = (u8*)lpSrc;
|
dst8[(i*8)+5] = src8[(i*4)+1] * fac_right / 16;
|
||||||
u8dst++; u8src++;
|
dst8[(i*8)+6] = src8[(i*4)+2] * fac_right / 16;
|
||||||
*u8dst++ = *u8src++ * fac_right / 16;
|
dst8[(i*8)+7] = src8[(i*4)+3] * fac_right / 16;
|
||||||
*u8dst++ = *u8src++ * fac_right / 16;
|
|
||||||
*u8dst++ = *u8src++ * fac_right / 16;
|
|
||||||
lpDst+=2;
|
|
||||||
lpSrc++;
|
|
||||||
#else
|
#else
|
||||||
u8* u8dst = (u8*)lpDst;
|
dst8[(i*8)+0] = src8[(i*4)+0] * fac_left / 16;
|
||||||
u8* u8src = (u8*)lpSrc;
|
dst8[(i*8)+1] = src8[(i*4)+1] * fac_left / 16;
|
||||||
*u8dst++ = *u8src++ * fac_left / 16;
|
dst8[(i*8)+2] = src8[(i*4)+2] * fac_left / 16;
|
||||||
*u8dst++ = *u8src++ * fac_left / 16;
|
|
||||||
*u8dst++ = *u8src++ * fac_left / 16;
|
|
||||||
u8dst++;
|
|
||||||
|
|
||||||
u8src = (u8*)lpSrc;
|
dst8[(i*8)+4] = src8[(i*4)+0] * fac_right / 16;
|
||||||
*u8dst++ = *u8src++ * fac_right / 16;
|
dst8[(i*8)+5] = src8[(i*4)+1] * fac_right / 16;
|
||||||
*u8dst++ = *u8src++ * fac_right / 16;
|
dst8[(i*8)+6] = src8[(i*4)+2] * fac_right / 16;
|
||||||
*u8dst++ = *u8src++ * fac_right / 16;
|
|
||||||
u8dst++; u8src++;
|
|
||||||
lpDst+=2;
|
|
||||||
lpSrc++;
|
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
FORCEINLINE void DoubleLine32( uint32 *lpDst, uint32 *lpSrc, unsigned int Width){
|
template <size_t LINEWIDTH>
|
||||||
while(Width--){
|
FORCEINLINE static void DoubleLine32_Fast(u32 *__restrict lpDst, const u32 *__restrict lpSrc)
|
||||||
*lpDst++ = *lpSrc;
|
{
|
||||||
*lpDst++ = *lpSrc++;
|
for (size_t i = 0; i < LINEWIDTH; i++)
|
||||||
|
{
|
||||||
|
lpDst[(i*2)+0] = lpSrc[i];
|
||||||
|
lpDst[(i*2)+1] = lpSrc[i];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void RenderScanline( SSurface Src, SSurface Dst)
|
FORCEINLINE static void DoubleLine32(u32 *__restrict lpDst, const u32 *__restrict lpSrc, const size_t lineWidth)
|
||||||
|
{
|
||||||
|
for (size_t i = 0; i < lineWidth; i++)
|
||||||
|
{
|
||||||
|
lpDst[(i*2)+0] = lpSrc[i];
|
||||||
|
lpDst[(i*2)+1] = lpSrc[i];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void RenderScanline(SSurface Src, SSurface Dst)
|
||||||
{
|
{
|
||||||
fac_a = (16-scanline_filter_a);
|
fac_a = (16-scanline_filter_a);
|
||||||
fac_b = (16-scanline_filter_b);
|
fac_b = (16-scanline_filter_b);
|
||||||
fac_c = (16-scanline_filter_c);
|
fac_c = (16-scanline_filter_c);
|
||||||
fac_d = (16-scanline_filter_d);
|
fac_d = (16-scanline_filter_d);
|
||||||
unsigned int H;
|
size_t dstLineIndex = 0;
|
||||||
|
|
||||||
const uint32 srcHeight = Src.Height;
|
const size_t srcHeight = Src.Height;
|
||||||
|
const size_t srcPitch = Src.Pitch >> 1;
|
||||||
|
u32 *lpSrc = (u32 *)Src.Surface;
|
||||||
|
|
||||||
const unsigned int srcPitch = Src.Pitch >> 1;
|
const size_t dstPitch = Dst.Pitch >> 1;
|
||||||
u32* lpSrc = (u32*)Src.Surface;
|
u32 *lpDst = (u32 *)Dst.Surface;
|
||||||
|
|
||||||
const unsigned int dstPitch = Dst.Pitch >> 1;
|
#ifdef ENABLE_SSE2
|
||||||
u32 *lpDst = (u32*)Dst.Surface;
|
if (Src.Width == 256)
|
||||||
for (H = 0; H < srcHeight; H++, lpSrc += srcPitch)
|
{
|
||||||
|
for (; dstLineIndex < srcHeight; dstLineIndex++, lpSrc += srcPitch)
|
||||||
|
{
|
||||||
|
ScanLine32_FastSSE2<256>(lpDst, lpSrc, fac_a, fac_b);
|
||||||
|
lpDst += dstPitch;
|
||||||
|
ScanLine32_FastSSE2<256>(lpDst, lpSrc, fac_c, fac_d);
|
||||||
|
lpDst += dstPitch;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
#endif
|
||||||
|
{
|
||||||
|
for (; dstLineIndex < srcHeight; dstLineIndex++, lpSrc += srcPitch)
|
||||||
{
|
{
|
||||||
ScanLine32(lpDst, lpSrc, Src.Width, fac_a, fac_b);
|
ScanLine32(lpDst, lpSrc, Src.Width, fac_a, fac_b);
|
||||||
lpDst += dstPitch;
|
lpDst += dstPitch;
|
||||||
ScanLine32(lpDst, lpSrc, Src.Width, fac_c, fac_d);
|
ScanLine32(lpDst, lpSrc, Src.Width, fac_c, fac_d);
|
||||||
lpDst += dstPitch;
|
lpDst += dstPitch;
|
||||||
}
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void RenderNearest2X (SSurface Src, SSurface Dst)
|
void RenderNearest2X(SSurface Src, SSurface Dst)
|
||||||
{
|
{
|
||||||
uint32 *lpSrc;
|
size_t dstLineIndex = 0;
|
||||||
unsigned int H;
|
|
||||||
|
|
||||||
const uint32 srcHeight = Src.Height;
|
const size_t srcHeight = Src.Height;
|
||||||
|
const size_t srcPitch = Src.Pitch >> 1;
|
||||||
|
const u32 *lpSrc = (u32 *)Src.Surface;
|
||||||
|
|
||||||
const unsigned int srcPitch = Src.Pitch >> 1;
|
const size_t dstPitch = Dst.Pitch >> 1;
|
||||||
lpSrc = reinterpret_cast<uint32 *>(Src.Surface);
|
u32 *lpDst = (u32 *)Dst.Surface;
|
||||||
|
|
||||||
const unsigned int dstPitch = Dst.Pitch >> 1;
|
if (Src.Width == 256)
|
||||||
uint32 *lpDst = (uint32*)Dst.Surface;
|
{
|
||||||
for (H = 0; H < srcHeight; H++, lpSrc += srcPitch)
|
for (; dstLineIndex < srcHeight; dstLineIndex++, lpSrc += srcPitch)
|
||||||
DoubleLine32 (lpDst, lpSrc, Src.Width), lpDst += dstPitch,
|
{
|
||||||
DoubleLine32 (lpDst, lpSrc, Src.Width), lpDst += dstPitch;
|
DoubleLine32_Fast<256>(lpDst, lpSrc);
|
||||||
|
lpDst += dstPitch;
|
||||||
|
DoubleLine32_Fast<256>(lpDst, lpSrc);
|
||||||
|
lpDst += dstPitch;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
for (; dstLineIndex < srcHeight; dstLineIndex++, lpSrc += srcPitch)
|
||||||
|
{
|
||||||
|
DoubleLine32(lpDst, lpSrc, Src.Width);
|
||||||
|
lpDst += dstPitch;
|
||||||
|
DoubleLine32(lpDst, lpSrc, Src.Width);
|
||||||
|
lpDst += dstPitch;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue