The C99 `restrict` keyword does not exist in C++ (yet, anyway). MSVC inline assembly should not be floating around by itself. Checking a define for MMX is stupid, but I won't worry about it right now. The link code is severely MSVC-only and should be cleaned up or moved.

git-svn-id: https://svn.code.sf.net/p/vbam/code/trunk@62 a31d4220-a93d-0410-bf67-fe4944624d44
This commit is contained in:
Nach 2007-11-14 09:16:23 +00:00
parent f188d0727f
commit bc661af3b9
7 changed files with 381 additions and 362 deletions

View File

@ -16,6 +16,7 @@
// along with this program; if not, write to the Free Software Foundation,
// Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
#include "Util.h"
#define __STDC_CONSTANT_MACROS
#include <stdint.h>
extern "C"

View File

@ -42,7 +42,7 @@
* This effect is a rewritten implementation of the hq4x effect made by Maxim Stepin
*/
void hq4x_16_def(interp_uint16* restrict dst0, interp_uint16* restrict dst1, interp_uint16* restrict dst2, interp_uint16* restrict dst3, const interp_uint16* restrict src0, const interp_uint16* restrict src1, const interp_uint16* restrict src2, unsigned count)
void hq4x_16_def(interp_uint16* dst0, interp_uint16* dst1, interp_uint16* dst2, interp_uint16* dst3, const interp_uint16* src0, const interp_uint16* src1, const interp_uint16* src2, unsigned count)
{
unsigned i;
@ -126,7 +126,7 @@ void hq4x_16_def(interp_uint16* restrict dst0, interp_uint16* restrict dst1, int
}
}
void hq4x_32_def(interp_uint32* restrict dst0, interp_uint32* restrict dst1, interp_uint32* restrict dst2, interp_uint32* restrict dst3, const interp_uint32* restrict src0, const interp_uint32* restrict src1, const interp_uint32* restrict src2, unsigned count)
void hq4x_32_def(interp_uint32* dst0, interp_uint32* dst1, interp_uint32* dst2, interp_uint32* dst3, const interp_uint32* src0, const interp_uint32* src1, const interp_uint32* src2, unsigned count)
{
unsigned i;

View File

@ -17,359 +17,371 @@
// Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
#include "hq_shared32.h"
#define __STDC_CONSTANT_MACROS
#include <stdint.h>
// 64-bit operands referenced by name from the MMX inline assembly in this file.
// All-zero quadword: second operand of punpcklbw/packuswb in Interp3/Interp4.
const unsigned __int64 reg_blank = 0x0000000000000000;
// The word 7 in the three low 16-bit lanes (top lane is 0); pmullw multiplier.
const unsigned __int64 const7 = 0x0000000700070007;
// Bytes 0x30/0x07/0x06 match the trY/trU/trV values quoted in Diff's comments;
// not referenced by the code visible here.
const unsigned __int64 treshold = 0x0000000000300706;
// Blends c1 and c2 with weights 3:1 and stores the 32-bit result at pc.
// Both variants are MSVC-style inline assembly; the MMX macro selects one.
// The arithmetic works on the whole 32-bit word, without per-channel masking.
void Interp1(unsigned char * pc, unsigned int c1, unsigned int c2)
{
// Equivalent C expression:
//*((int*)pc) = (c1*3+c2)/4;
#ifdef MMX
__asm
{
mov eax, pc
movd mm1, c1
movd mm2, c2
movq mm0, mm1
// mm0 = c1*4 - c1 = c1*3
pslld mm0, 2
psubd mm0, mm1
paddd mm0, mm2
// logical shift right by 2 == unsigned divide by 4
psrld mm0, 2
movd [eax], mm0
EMMS
}
#else
__asm
{
mov eax, pc
mov edx, c1
// edx = c1*4 + c2 - c1 = c1*3 + c2
shl edx, 2
add edx, c2
sub edx, c1
shr edx, 2
mov [eax], edx
}
#endif
}
// Blends c1, c2 and c3 with weights 2:1:1 and stores the 32-bit result at pc.
// Both variants are MSVC-style inline assembly; the MMX macro selects one.
void Interp2(unsigned char * pc, unsigned int c1, unsigned int c2, unsigned int c3)
{
// Equivalent C expression:
//*((int*)pc) = (c1*2+c2+c3)/4;
#ifdef MMX
__asm
{
mov eax, pc
movd mm0, c1
movd mm1, c2
movd mm2, c3
// mm0 = c1*2 + c2 + c3
pslld mm0, 1
paddd mm0, mm1
paddd mm0, mm2
// NOTE(review): psrad is an arithmetic shift, while the scalar variant below
// uses the logical shr; the two differ when bit 31 of the sum is set.
psrad mm0, 2
movd [eax], mm0
EMMS
}
#else
__asm
{
mov eax, pc
mov edx, c1
shl edx, 1
add edx, c2
add edx, c3
shr edx, 2
mov [eax], edx
}
#endif
}
// Blends c1 and c2 with weights 7:1 and stores the 32-bit result at pc.
// Both variants are MSVC-style inline assembly; the MMX macro selects one.
// NOTE(review): the MMX variant works per byte lane, whereas the scalar variant
// applies the unmasked whole-word formula (first comment below), so the two
// can produce different pixels.
void Interp3(unsigned char * pc, unsigned int c1, unsigned int c2)
{
//*((int*)pc) = (c1*7+c2)/8;
//*((int*)pc) = ((((c1 & 0x00FF00)*7 + (c2 & 0x00FF00) ) & 0x0007F800) +
// (((c1 & 0xFF00FF)*7 + (c2 & 0xFF00FF) ) & 0x07F807F8)) >> 3;
#ifdef MMX
__asm
{
mov eax, pc
movd mm1, c1
movd mm2, c2
// widen each byte to a 16-bit word by interleaving with zero
punpcklbw mm1, reg_blank
punpcklbw mm2, reg_blank
// per lane: (c1*7 + c2) >> 3
pmullw mm1, const7
paddw mm1, mm2
psrlw mm1, 3
// narrow the words back to bytes
packuswb mm1, reg_blank
movd [eax], mm1
EMMS
}
#else
__asm
{
mov eax, c1
mov ebx, c2
mov ecx, eax
// ecx = c1*8 - c1 + c2 = c1*7 + c2
shl ecx, 3
sub ecx, eax
add ecx, ebx
shr ecx, 3
mov eax, pc
mov [eax], ecx
}
#endif
}
// Blends c1, c2 and c3 with weights 2:7:7 (sum 16) and stores the 32-bit
// result at pc. Both variants are MSVC-style inline assembly; the MMX macro
// selects one. The scalar variant follows the masked formula in the second
// comment below.
void Interp4(unsigned char * pc, unsigned int c1, unsigned int c2, unsigned int c3)
{
//*((int*)pc) = (c1*2+(c2+c3)*7)/16;
//*((int*)pc) = ((((c1 & 0x00FF00)*2 + ((c2 & 0x00FF00) + (c3 & 0x00FF00))*7 ) & 0x000FF000) +
// (((c1 & 0xFF00FF)*2 + ((c2 & 0xFF00FF) + (c3 & 0xFF00FF))*7 ) & 0x0FF00FF0)) >> 4;
#ifdef MMX
__asm
{
mov eax, pc
movd mm1, c1
movd mm2, c2
movd mm3, c3
// widen each byte to a 16-bit word by interleaving with zero
punpcklbw mm1, reg_blank
punpcklbw mm2, reg_blank
punpcklbw mm3, reg_blank
// per lane: (c1*2 + (c2 + c3)*7) >> 4
psllw mm1, 1
paddw mm2, mm3
pmullw mm2, const7
paddw mm1, mm2
psrlw mm1, 4
// narrow the words back to bytes
packuswb mm1, reg_blank
movd [eax], mm1
EMMS
}
#else
__asm
{
// eax = weighted sum of the 0x00FF00 channel, masked to 0x0FF000
mov eax, [c1]
and eax, 0FF00h
shl eax, 1
mov ecx, [c2]
and ecx, 0FF00h
mov edx, [c3]
and edx, 0FF00h
add ecx, edx
imul ecx, ecx,7
add eax, ecx
and eax, 0FF000h
// ebx = weighted sum of the 0xFF00FF channels, masked to 0x0FF00FF0
mov ebx, [c1]
and ebx, 0FF00FFh
shl ebx, 1
mov ecx, [c2]
and ecx, 0FF00FFh
mov edx, [c3]
and edx, 0FF00FFh
add ecx, edx
imul ecx, ecx,7
add ebx, ecx
and ebx, 0FF00FF0h
// combine both halves and divide by 16
add eax, ebx
shr eax, 4
mov ebx, pc
mov [ebx], eax
}
#endif
}
// Averages c1 and c2 (weights 1:1) and stores the 32-bit result at pc.
// Both variants are MSVC-style inline assembly; the MMX macro selects one.
void Interp5(unsigned char * pc, unsigned int c1, unsigned int c2)
{
// Equivalent C expression:
//*((int*)pc) = (c1+c2)/2;
#ifdef MMX
__asm
{
mov eax, pc
movd mm0, c1
movd mm1, c2
paddd mm0, mm1
// NOTE(review): psrad is an arithmetic shift, while the scalar variant below
// uses the logical shr; the two differ when bit 31 of the sum is set.
psrad mm0, 1
movd [eax], mm0
EMMS
}
#else
__asm
{
mov eax, pc
mov edx, c1
add edx, c2
shr edx, 1
mov [eax], edx
}
#endif
}
// Returns true when c1 and c2 differ by more than a per-component threshold
// after conversion with RGBtoYUV; returns false immediately when the two
// converted values are equal.
// The non-MMX path compares abs32() of each masked component difference with
// trY/trU/trV (abs32, the masks and the thresholds are defined outside this view).
bool Diff(unsigned int c1, unsigned int c2)
{
unsigned int
YUV1 = RGBtoYUV(c1),
YUV2 = RGBtoYUV(c2);
if (YUV1 == YUV2) return false; // Save some processing power
#ifdef MMX
unsigned int retval;
// NOTE(review): this path clears bit 31 of each difference (pand with mm7)
// instead of negating it, so a negative difference becomes a large positive
// value rather than its magnitude - confirm this matches abs32().
// NOTE(review): mm5..mm7 are loaded with the thresholds below, but the
// pcmpgtd instructions name trY/trU/trV as operands instead - confirm intent.
__asm
{
mov eax, 0x7FFFFFFF
movd mm7, eax ;mm7 = ABS_MASK = 0x7FFFFFFF
; Copy source colors in first reg
movd mm0, YUV1
movd mm1, YUV2
mov eax, 0x00FF0000
movd mm6, eax ;mm6 = Ymask = 0x00FF0000
; Calculate color Y difference
movq mm2, mm0
movq mm3, mm1
pand mm2, mm6
pand mm3, mm6
psubd mm2, mm3
pand mm2, mm7
mov eax, 0x0000FF00
movd mm6, eax ;mm6 = Umask = 0x0000FF00
; Calculate color U difference
movq mm3, mm0
movq mm4, mm1
pand mm3, mm6
pand mm4, mm6
psubd mm3, mm4
pand mm3, mm7
mov eax, 0x000000FF
movd mm6, eax ;mm6 = Vmask = 0x000000FF
; Calculate color V difference
movq mm4, mm0
movq mm5, mm1
pand mm4, mm6
pand mm5, mm6
psubd mm4, mm5
pand mm4, mm7
mov eax, 0x00300000
movd mm5, eax ;mm5 = trY = 0x00300000
mov eax, 0x00000700
movd mm6, eax ;mm6 = trU = 0x00000700
mov eax, 0x00000006
movd mm7, eax ;mm7 = trV = 0x00000006
; Compare the results
pcmpgtd mm2, trY
pcmpgtd mm3, trU
pcmpgtd mm4, trV
por mm2, mm3
por mm2, mm4
movd retval, mm2
EMMS
}
// any compare that succeeded left a non-zero mask in retval
return (retval != 0);
#else
return
( abs32((YUV1 & Ymask) - (YUV2 & Ymask)) > trY ) ||
( abs32((YUV1 & Umask) - (YUV2 & Umask)) > trU ) ||
( abs32((YUV1 & Vmask) - (YUV2 & Vmask)) > trV );
#endif
}
// Converts a packed colour (bits 0-7 = r, 8-15 = g, 16-23 = b, per the masks
// below) into a packed value with Y in bits 16-23, u in bits 8-15 and v in
// bits 0-7, using shift-based approximations instead of exact coefficients.
unsigned int RGBtoYUV(unsigned int c)
{ // Division through 3 slows down the emulation about 10% !!!
#ifdef MMX
unsigned int retval;
// NOTE(review): after masking, G and B are not shifted down to bit 0 before
// they are added/subtracted, and the differences use the logical psrld, so
// this path does not appear to produce the same value as the C path below -
// confirm before relying on the MMX build.
__asm
{
movd mm0, c
movq mm1, mm0
movq mm2, mm0 ;mm0=mm1=mm2=c
mov eax, 0x000000FF
movd mm5, eax ;mm5 = REDMASK = 0x000000FF
mov eax, 0x0000FF00
movd mm6, eax ;mm6 = GREENMASK = 0x0000FF00
mov eax, 0x00FF0000
movd mm7, eax ;mm7 = BLUEMASK = 0x00FF0000
pand mm0, mm5
pand mm1, mm6
pand mm2, mm7 ;mm0=R mm1=G mm2=B
movq mm3, mm0
paddd mm3, mm1
paddd mm3, mm2
; psrld mm3, 2 ;mm3=Y
; pslld mm3, 16
pslld mm3, 14 ;mm3=Y<<16
mov eax, 512
movd mm7, eax ;mm7 = 128 << 2 = 512
movq mm4, mm0
psubd mm4, mm2
; psrld mm4, 2
; paddd mm4, mm7 ;mm4=U
; pslld mm4, 8 ;mm4=U<<8
paddd mm4, mm7
pslld mm4, 6
mov eax, 128
movd mm7, eax ;mm7 = 128
movq mm5, mm1
pslld mm5, 1
psubd mm5, mm0
psubd mm5, mm2
psrld mm5, 3
paddd mm5, mm7 ;mm5=V
paddd mm5, mm4
paddd mm5, mm3
movd retval, mm5
EMMS
}
return retval;
#else
unsigned char r, g, b, Y, u, v;
// unpack the three 8-bit channels
r = (c & 0x000000FF);
g = (c & 0x0000FF00) >> 8;
b = (c & 0x00FF0000) >> 16;
// Y = sum/4 (cheaper than /3, see the note at the top of the function)
Y = (r + g + b) >> 2;
// u and v are centred on 128
u = 128 + ((r - b) >> 2);
v = 128 + ((-r + 2*g -b)>>3);
return (Y<<16) + (u<<8) + v;
// Extremely High Quality Code
//unsigned char r, g, b;
//r = c & 0xFF;
//g = (c >> 8) & 0xFF;
//b = (c >> 16) & 0xFF;
//unsigned char y, u, v;
//y = (0.256788 * r + 0.504129 * g + 0.097906 * b) + 16;
//u = (-0.148223 * r - 0.290993 * g + 0.439216 * b) + 128;
//v = (0.439216 * r - 0.367788 * g - 0.071427 * b) + 128;
//return (y << 16) + (u << 8) + v;
#endif
}
// 64-bit operands referenced by name from the MMX inline assembly in this file.
// All-zero quadword: second operand of punpcklbw/packuswb in Interp3/Interp4.
const uint64_t reg_blank = UINT64_C(0x0000000000000000);
// The word 7 in the three low 16-bit lanes (top lane is 0); pmullw multiplier.
const uint64_t const7 = UINT64_C(0x0000000700070007);
// Bytes 0x30/0x07/0x06 match the trY/trU/trV values quoted in Diff's comments;
// not referenced by the code visible here.
const uint64_t treshold = UINT64_C(0x0000000000300706);
// 3:1 blend of two packed pixels, written as one 32-bit value through pc.
// The formula is (c1*3 + c2) / 4 applied to the whole word (modulo 2^32);
// no per-channel masking is performed.
//   pc - destination; a 32-bit value is stored through it
//   c1 - pixel weighted by 3
//   c2 - pixel weighted by 1
void Interp1(unsigned char * pc, unsigned int c1, unsigned int c2)
{
#if defined(_MSC_VER) && defined(MMX)
  // MSVC inline assembly, MMX registers.
  __asm
  {
    movd mm1, c1
    movd mm2, c2
    movq mm0, mm1
    pslld mm0, 2          ; mm0 = c1*4
    psubd mm0, mm1        ; mm0 = c1*3
    paddd mm0, mm2        ; mm0 = c1*3 + c2
    psrld mm0, 2          ; unsigned divide by 4
    mov eax, pc
    movd [eax], mm0
    EMMS
  }
#elif defined(_MSC_VER)
  // MSVC inline assembly, general-purpose registers only.
  __asm
  {
    mov edx, c1
    shl edx, 2            ; edx = c1*4
    sub edx, c1           ; edx = c1*3
    add edx, c2           ; edx = c1*3 + c2
    shr edx, 2            ; unsigned divide by 4
    mov eax, pc
    mov [eax], edx
  }
#else
  // Portable fallback for every other compiler.
  const unsigned int weighted = c1 * 3 + c2;
  *((int*)pc) = weighted / 4;
#endif
}
// Blends three packed pixels with weights 2:1:1 and stores the 32-bit result
// through pc: result = (c1*2 + c2 + c3) / 4, evaluated on the whole 32-bit
// word (modulo 2^32, no per-channel masking).
//   pc     - destination; a 32-bit value is stored through it
//   c1     - pixel weighted by 2
//   c2, c3 - pixels weighted by 1
void Interp2(unsigned char * pc, unsigned int c1, unsigned int c2, unsigned int c3)
{
#ifdef _MSC_VER
#ifdef MMX
  __asm
  {
    mov eax, pc
    movd mm0, c1
    movd mm1, c2
    movd mm2, c3
    pslld mm0, 1          ; mm0 = c1*2
    paddd mm0, mm1
    paddd mm0, mm2        ; mm0 = c1*2 + c2 + c3
    ; Logical shift: the sum is unsigned. An arithmetic shift (psrad) would
    ; smear bit 31 into the result and disagree with the two paths below.
    psrld mm0, 2
    movd [eax], mm0
    EMMS
  }
#else
  __asm
  {
    mov eax, pc
    mov edx, c1
    shl edx, 1            ; edx = c1*2
    add edx, c2
    add edx, c3           ; edx = c1*2 + c2 + c3
    shr edx, 2            ; unsigned divide by 4
    mov [eax], edx
  }
#endif
#else
  // Portable fallback: unsigned arithmetic, so /4 is a logical shift.
  *((int*)pc) = (c1*2+c2+c3)/4;
#endif
}
// Blends two packed pixels with weights 7:1, channel by channel, and stores
// the 32-bit result through pc.
//   pc - destination; a 32-bit value is stored through it
//   c1 - pixel weighted by 7
//   c2 - pixel weighted by 1
//
// Only the MMX variant is kept as MSVC inline assembly. The former scalar
// assembly evaluated the unmasked (c1*7+c2)/8 on the whole word, which lets
// carries leak between channels and disagrees with the MMX variant, so every
// non-MMX build now uses the masked C expression.
void Interp3(unsigned char * pc, unsigned int c1, unsigned int c2)
{
#if defined(_MSC_VER) && defined(MMX)
  __asm
  {
    mov eax, pc
    movd mm1, c1
    movd mm2, c2
    punpcklbw mm1, reg_blank   ; widen bytes to 16-bit words
    punpcklbw mm2, reg_blank
    pmullw mm1, const7         ; per lane: c1*7
    paddw mm1, mm2             ; per lane: c1*7 + c2
    psrlw mm1, 3               ; per lane: divide by 8
    packuswb mm1, reg_blank    ; narrow words back to bytes
    movd [eax], mm1
    EMMS
  }
#else
  // The 0x00FF00 channel and the 0xFF00FF channels are weighted separately;
  // the masks applied before the final >> 3 drop the bits that would
  // otherwise spill into the neighbouring channel. (The unmasked
  // (c1*7+c2)/8 store that used to precede this one was dead: it was
  // overwritten immediately.)
  *((int*)pc) = ((((c1 & 0x00FF00)*7 + (c2 & 0x00FF00) ) & 0x0007F800) +
                 (((c1 & 0xFF00FF)*7 + (c2 & 0xFF00FF) ) & 0x07F807F8)) >> 3;
#endif
}
// Blends three packed pixels with weights 2:7:7 (sum 16), channel by channel,
// and stores the 32-bit result through pc.
//   pc     - destination; a 32-bit value is stored through it
//   c1     - pixel weighted by 2
//   c2, c3 - pixels weighted by 7 each
void Interp4(unsigned char * pc, unsigned int c1, unsigned int c2, unsigned int c3)
{
#ifdef _MSC_VER
#ifdef MMX
  __asm
  {
    mov eax, pc
    movd mm1, c1
    movd mm2, c2
    movd mm3, c3
    punpcklbw mm1, reg_blank   ; widen bytes to 16-bit words
    punpcklbw mm2, reg_blank
    punpcklbw mm3, reg_blank
    psllw mm1, 1               ; per lane: c1*2
    paddw mm2, mm3             ; per lane: c2 + c3
    pmullw mm2, const7         ; per lane: (c2 + c3)*7
    paddw mm1, mm2
    psrlw mm1, 4               ; per lane: divide by 16
    packuswb mm1, reg_blank    ; narrow words back to bytes
    movd [eax], mm1
    EMMS
  }
#else
  __asm
  {
    ; eax = weighted 0x00FF00 channel, masked to 0x0FF000
    mov eax, [c1]
    and eax, 0FF00h
    shl eax, 1
    mov ecx, [c2]
    and ecx, 0FF00h
    mov edx, [c3]
    and edx, 0FF00h
    add ecx, edx
    imul ecx, ecx,7
    add eax, ecx
    and eax, 0FF000h
    ; ebx = weighted 0xFF00FF channels, masked to 0x0FF00FF0
    mov ebx, [c1]
    and ebx, 0FF00FFh
    shl ebx, 1
    mov ecx, [c2]
    and ecx, 0FF00FFh
    mov edx, [c3]
    and edx, 0FF00FFh
    add ecx, edx
    imul ecx, ecx,7
    add ebx, ecx
    and ebx, 0FF00FF0h
    ; combine both halves and divide by 16
    add eax, ebx
    shr eax, 4
    mov ebx, pc
    mov [ebx], eax
  }
#endif
#else
  // Same masked formula as the scalar assembly above. (The unmasked
  // (c1*2+(c2+c3)*7)/16 store that used to precede this one was dead: it
  // was overwritten immediately.)
  *((int*)pc) = ((((c1 & 0x00FF00)*2 + ((c2 & 0x00FF00) + (c3 & 0x00FF00))*7 ) & 0x000FF000) +
                 (((c1 & 0xFF00FF)*2 + ((c2 & 0xFF00FF) + (c3 & 0xFF00FF))*7 ) & 0x0FF00FF0)) >> 4;
#endif
}
// Averages two packed pixels (weights 1:1) and stores the 32-bit result
// through pc: result = (c1 + c2) / 2, evaluated on the whole 32-bit word
// (modulo 2^32, no per-channel masking).
//   pc     - destination; a 32-bit value is stored through it
//   c1, c2 - pixels to average
void Interp5(unsigned char * pc, unsigned int c1, unsigned int c2)
{
#ifdef _MSC_VER
#ifdef MMX
  __asm
  {
    mov eax, pc
    movd mm0, c1
    movd mm1, c2
    paddd mm0, mm1        ; mm0 = c1 + c2
    ; Logical shift: the sum is unsigned. An arithmetic shift (psrad) would
    ; copy bit 31 into the result and disagree with the two paths below.
    psrld mm0, 1
    movd [eax], mm0
    EMMS
  }
#else
  __asm
  {
    mov eax, pc
    mov edx, c1
    add edx, c2           ; edx = c1 + c2
    shr edx, 1            ; unsigned divide by 2
    mov [eax], edx
  }
#endif
#else
  // Portable fallback: unsigned arithmetic, so /2 is a logical shift.
  *((int*)pc) = (c1+c2)/2;
#endif
}
// Reports whether two colours are "different enough": both are converted with
// RGBtoYUV, and the result is true when the magnitude of the difference in
// any of the three packed components exceeds that component's threshold.
//   c1, c2  - colours to compare
//   returns - false when the converted values are equal or every component
//             difference is within its threshold, true otherwise
//
// The MMX variant is MSVC inline assembly, so it is selected only when both
// _MSC_VER and MMX are defined; every other build uses the C expression.
bool Diff(unsigned int c1, unsigned int c2)
{
  unsigned int
    YUV1 = RGBtoYUV(c1),
    YUV2 = RGBtoYUV(c2);

  if (YUV1 == YUV2) return false; // Save some processing power

#if defined(_MSC_VER) && defined(MMX)
  unsigned int retval;

  __asm
  {
    ; Copy source colors in first reg
    movd mm0, YUV1
    movd mm1, YUV2

    ; Each difference is turned into its magnitude with the two's-complement
    ; identity |d| = (d ^ s) - s, where s = d >> 31 (arithmetic). Clearing
    ; bit 31 alone would turn every negative difference into a huge value.

    mov eax, 0x00FF0000
    movd mm6, eax          ;mm6 = Ymask = 0x00FF0000

    ; Calculate color Y difference
    movq mm2, mm0
    movq mm3, mm1
    pand mm2, mm6
    pand mm3, mm6
    psubd mm2, mm3         ;mm2 = dY
    movq mm3, mm2
    psrad mm3, 31          ;mm3 = sign mask of dY
    pxor mm2, mm3
    psubd mm2, mm3         ;mm2 = |dY|

    mov eax, 0x0000FF00
    movd mm6, eax          ;mm6 = Umask = 0x0000FF00

    ; Calculate color U difference
    movq mm3, mm0
    movq mm4, mm1
    pand mm3, mm6
    pand mm4, mm6
    psubd mm3, mm4         ;mm3 = dU
    movq mm4, mm3
    psrad mm4, 31          ;mm4 = sign mask of dU
    pxor mm3, mm4
    psubd mm3, mm4         ;mm3 = |dU|

    mov eax, 0x000000FF
    movd mm6, eax          ;mm6 = Vmask = 0x000000FF

    ; Calculate color V difference
    movq mm4, mm0
    movq mm5, mm1
    pand mm4, mm6
    pand mm5, mm6
    psubd mm4, mm5         ;mm4 = dV
    movq mm5, mm4
    psrad mm5, 31          ;mm5 = sign mask of dV
    pxor mm4, mm5
    psubd mm4, mm5         ;mm4 = |dV|

    mov eax, 0x00300000
    movd mm5, eax          ;mm5 = trY = 0x00300000
    mov eax, 0x00000700
    movd mm6, eax          ;mm6 = trU = 0x00000700
    mov eax, 0x00000006
    movd mm7, eax          ;mm7 = trV = 0x00000006

    ; Compare the results against the thresholds loaded just above
    pcmpgtd mm2, mm5
    pcmpgtd mm3, mm6
    pcmpgtd mm4, mm7
    por mm2, mm3
    por mm2, mm4

    movd retval, mm2       ; non-zero when any compare succeeded
    EMMS
  }
  return (retval != 0);
#else
  return
    ( abs32((YUV1 & Ymask) - (YUV2 & Ymask)) > trY ) ||
    ( abs32((YUV1 & Umask) - (YUV2 & Umask)) > trU ) ||
    ( abs32((YUV1 & Vmask) - (YUV2 & Vmask)) > trV );
#endif
}
// Converts a packed colour (bits 0-7 = r, 8-15 = g, 16-23 = b) into a packed
// approximation of YUV: Y in bits 16-23, u in bits 8-15, v in bits 0-7.
//   Y = (r + g + b) >> 2
//   u = 128 + ((r - b) >> 2)
//   v = 128 + ((-r + 2*g - b) >> 3)
// Shifts are used instead of exact coefficients for speed.
//   c       - packed colour; bits above 23 are ignored
//   returns - (Y << 16) + (u << 8) + v
//
// The MMX variant is MSVC inline assembly, so it is selected only when both
// _MSC_VER and MMX are defined. It now shifts G and B down to bit 0 before
// combining them and uses arithmetic shifts for the signed differences, so it
// evaluates the same three formulas as the C path.
unsigned int RGBtoYUV(unsigned int c)
{ // Division through 3 slows down the emulation about 10% !!!
#if defined(_MSC_VER) && defined(MMX)
  unsigned int retval;

  __asm
  {
    movd mm0, c
    movq mm1, mm0
    movq mm2, mm0          ;mm0=mm1=mm2=c

    mov eax, 0x000000FF
    movd mm5, eax          ;mm5 = REDMASK = 0x000000FF
    mov eax, 0x0000FF00
    movd mm6, eax          ;mm6 = GREENMASK = 0x0000FF00
    mov eax, 0x00FF0000
    movd mm7, eax          ;mm7 = BLUEMASK = 0x00FF0000

    pand mm0, mm5
    pand mm1, mm6
    pand mm2, mm7
    psrld mm1, 8
    psrld mm2, 16          ;mm0=R mm1=G mm2=B, each in bits 0-7

    movq mm3, mm0
    paddd mm3, mm1
    paddd mm3, mm2
    psrld mm3, 2           ;mm3=Y (dropping the low bits first keeps them out of U)
    pslld mm3, 16          ;mm3=Y<<16

    mov eax, 128
    movd mm7, eax          ;mm7 = 128

    movq mm4, mm0
    psubd mm4, mm2
    psrad mm4, 2           ;signed (R-B)>>2
    paddd mm4, mm7         ;mm4=U
    pslld mm4, 8           ;mm4=U<<8

    movq mm5, mm1
    pslld mm5, 1
    psubd mm5, mm0
    psubd mm5, mm2
    psrad mm5, 3           ;signed (-R+2G-B)>>3
    paddd mm5, mm7         ;mm5=V

    paddd mm5, mm4
    paddd mm5, mm3

    movd retval, mm5
    EMMS
  }
  return retval;
#else
  unsigned char r, g, b, Y, u, v;
  r = (c & 0x000000FF);
  g = (c & 0x0000FF00) >> 8;
  b = (c & 0x00FF0000) >> 16;
  Y = (r + g + b) >> 2;
  u = 128 + ((r - b) >> 2);
  v = 128 + ((-r + 2*g -b)>>3);
  return (Y<<16) + (u<<8) + v;

  // Extremely High Quality Code
  //unsigned char r, g, b;
  //r = c & 0xFF;
  //g = (c >> 8) & 0xFF;
  //b = (c >> 16) & 0xFF;
  //unsigned char y, u, v;
  //y = (0.256788 * r + 0.504129 * g + 0.097906 * b) + 16;
  //u = (-0.148223 * r - 0.290993 * g + 0.439216 * b) + 128;
  //v = (0.439216 * r - 0.367788 * g - 0.071427 * b) + 128;
  //return (y << 16) + (u << 8) + v;
#endif
}

View File

@ -87,4 +87,4 @@ void Interp3(unsigned char * pc, unsigned int c1, unsigned int c2);
void Interp4(unsigned char * pc, unsigned int c1, unsigned int c2, unsigned int c3);
void Interp5(unsigned char * pc, unsigned int c1, unsigned int c2);
bool Diff(unsigned int c1, unsigned int c2);
unsigned int RGBtoYUV(unsigned int c);
unsigned int RGBtoYUV(unsigned int c);

View File

@ -31,6 +31,12 @@
#ifndef __INTERP_H
#define __INTERP_H
#define __STDC_CONSTANT_MACROS
#include <stdint.h>
typedef uint16_t interp_uint16;
typedef uint32_t interp_uint32;
/***************************************************************************/
/* Basic types */

View File

@ -41,7 +41,7 @@
* This effect is derived from the hq3x effect made by Maxim Stepin
*/
void lq3x_16_def(interp_uint16* restrict dst0, interp_uint16* restrict dst1, interp_uint16* restrict dst2, const interp_uint16* restrict src0, const interp_uint16* restrict src1, const interp_uint16* restrict src2, unsigned count)
void lq3x_16_def(interp_uint16* dst0, interp_uint16* dst1, interp_uint16* dst2, const interp_uint16* src0, const interp_uint16* src1, const interp_uint16* src2, unsigned count)
{
unsigned i;
@ -124,7 +124,7 @@ void lq3x_16_def(interp_uint16* restrict dst0, interp_uint16* restrict dst1, int
}
}
void lq3x_32_def(interp_uint32* restrict dst0, interp_uint32* restrict dst1, interp_uint32* restrict dst2, const interp_uint32* restrict src0, const interp_uint32* restrict src1, const interp_uint32* restrict src2, unsigned count)
void lq3x_32_def(interp_uint32* dst0, interp_uint32* dst1, interp_uint32* dst2, const interp_uint32* src0, const interp_uint32* src1, const interp_uint32* src2, unsigned count)
{
unsigned i;

View File

@ -42,7 +42,7 @@
* This effect is derived from the hq4x effect made by Maxim Stepin
*/
void lq4x_16_def(interp_uint16* restrict dst0, interp_uint16* restrict dst1, interp_uint16* restrict dst2, interp_uint16* restrict dst3, const interp_uint16* restrict src0, const interp_uint16* restrict src1, const interp_uint16* restrict src2, unsigned count)
void lq4x_16_def(interp_uint16* dst0, interp_uint16* dst1, interp_uint16* dst2, interp_uint16* dst3, const interp_uint16* src0, const interp_uint16* src1, const interp_uint16* src2, unsigned count)
{
unsigned i;
@ -126,7 +126,7 @@ void lq4x_16_def(interp_uint16* restrict dst0, interp_uint16* restrict dst1, int
}
}
void lq4x_32_def(interp_uint32* restrict dst0, interp_uint32* restrict dst1, interp_uint32* restrict dst2, interp_uint32* restrict dst3, const interp_uint32* restrict src0, const interp_uint32* restrict src1, const interp_uint32* restrict src2, unsigned count)
void lq4x_32_def(interp_uint32* dst0, interp_uint32* dst1, interp_uint32* dst2, interp_uint32* dst3, const interp_uint32* src0, const interp_uint32* src1, const interp_uint32* src2, unsigned count)
{
unsigned i;