1011 lines
29 KiB
C++
1011 lines
29 KiB
C++
/*
|
|
* This file is part of the Advance project.
|
|
*
|
|
* Copyright (C) 1999-2002 Andrea Mazzoleni
|
|
*
|
|
* This program is free software; you can redistribute it and/or modify
|
|
* it under the terms of the GNU General Public License as published by
|
|
* the Free Software Foundation; either version 2 of the License, or
|
|
* (at your option) any later version.
|
|
*
|
|
* This program is distributed in the hope that it will be useful,
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
* GNU General Public License for more details.
|
|
*
|
|
* You should have received a copy of the GNU General Public License
|
|
* along with this program; if not, write to the Free Software
|
|
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
|
*
|
|
* In addition, as a special exception, Andrea Mazzoleni
|
|
* gives permission to link the code of this program with
|
|
* the MAME library (or with modified versions of MAME that use the
|
|
* same license as MAME), and distribute linked combinations including
|
|
* the two. You must obey the GNU General Public License in all
|
|
* respects for all of the code used other than MAME. If you modify
|
|
* this file, you may extend this exception to your version of the
|
|
* file, but you are not obligated to do so. If you do not wish to
|
|
* do so, delete this exception statement from your version.
|
|
*/
|
|
|
|
/*
|
|
* Alternatively at the previous license terms, you are allowed to use this
|
|
* code in your program with these conditions:
|
|
* - the program is not used in commercial activities.
|
|
* - the whole source code of the program is released with the binary.
|
|
*/
|
|
|
|
#include "System.h"
|
|
|
|
#ifdef MMX
|
|
extern "C" bool cpu_mmx;
|
|
#endif
|
|
|
|
static void internal_scale2x_16_def(u16 *dst, const u16* src0, const u16* src1, const u16* src2, unsigned count) {
|
|
/* first pixel */
|
|
dst[0] = src1[0];
|
|
if (src1[1] == src0[0] && src2[0] != src0[0])
|
|
dst[1] = src0[0];
|
|
else
|
|
dst[1] = src1[0];
|
|
++src0;
|
|
++src1;
|
|
++src2;
|
|
dst += 2;
|
|
|
|
/* central pixels */
|
|
count -= 2;
|
|
while (count) {
|
|
if (src0[0] != src2[0] && src1[-1] != src1[1]) {
|
|
dst[0] = src1[-1] == src0[0] ? src0[0] : src1[0];
|
|
dst[1] = src1[1] == src0[0] ? src0[0] : src1[0];
|
|
} else {
|
|
dst[0] = src1[0];
|
|
dst[1] = src1[0];
|
|
}
|
|
|
|
++src0;
|
|
++src1;
|
|
++src2;
|
|
dst += 2;
|
|
--count;
|
|
}
|
|
|
|
/* last pixel */
|
|
if (src1[-1] == src0[0] && src2[0] != src0[0])
|
|
dst[0] = src0[0];
|
|
else
|
|
dst[0] = src1[0];
|
|
dst[1] = src1[0];
|
|
}
|
|
|
|
static void internal_scale2x_32_def(u32* dst,
|
|
const u32* src0,
|
|
const u32* src1,
|
|
const u32* src2,
|
|
unsigned count)
|
|
{
|
|
/* first pixel */
|
|
dst[0] = src1[0];
|
|
if (src1[1] == src0[0] && src2[0] != src0[0])
|
|
dst[1] = src0[0];
|
|
else
|
|
dst[1] = src1[0];
|
|
++src0;
|
|
++src1;
|
|
++src2;
|
|
dst += 2;
|
|
|
|
/* central pixels */
|
|
count -= 2;
|
|
while (count) {
|
|
if (src0[0] != src2[0] && src1[-1] != src1[1]) {
|
|
dst[0] = src1[-1] == src0[0] ? src0[0] : src1[0];
|
|
dst[1] = src1[1] == src0[0] ? src0[0] : src1[0];
|
|
} else {
|
|
dst[0] = src1[0];
|
|
dst[1] = src1[0];
|
|
}
|
|
|
|
++src0;
|
|
++src1;
|
|
++src2;
|
|
dst += 2;
|
|
--count;
|
|
}
|
|
|
|
/* last pixel */
|
|
if (src1[-1] == src0[0] && src2[0] != src0[0])
|
|
dst[0] = src0[0];
|
|
else
|
|
dst[0] = src1[0];
|
|
dst[1] = src1[0];
|
|
}
|
|
|
|
#ifdef MMX
|
|
static void internal_scale2x_16_mmx_single(u16* dst, const u16* src0, const u16* src1, const u16* src2, unsigned count) {
|
|
/* always do the first and last run */
|
|
count -= 2*4;
|
|
|
|
#ifdef __GNUC__
|
|
__asm__ __volatile__(
|
|
/* first run */
|
|
/* set the current, current_pre, current_next registers */
|
|
"movq 0(%1), %%mm0\n"
|
|
"movq 0(%1),%%mm7\n"
|
|
"movq 8(%1),%%mm1\n"
|
|
"psllq $48,%%mm0\n"
|
|
"psllq $48,%%mm1\n"
|
|
"psrlq $48, %%mm0\n"
|
|
"movq %%mm7,%%mm2\n"
|
|
"movq %%mm7,%%mm3\n"
|
|
"psllq $16,%%mm2\n"
|
|
"psrlq $16,%%mm3\n"
|
|
"por %%mm2,%%mm0\n"
|
|
"por %%mm3,%%mm1\n"
|
|
|
|
/* current_upper */
|
|
"movq (%0),%%mm6\n"
|
|
|
|
/* compute the upper-left pixel for dst on %%mm2 */
|
|
/* compute the upper-right pixel for dst on %%mm4 */
|
|
"movq %%mm0,%%mm2\n"
|
|
"movq %%mm1,%%mm4\n"
|
|
"movq %%mm0,%%mm3\n"
|
|
"movq %%mm1,%%mm5\n"
|
|
"pcmpeqw %%mm6,%%mm2\n"
|
|
"pcmpeqw %%mm6,%%mm4\n"
|
|
"pcmpeqw (%2),%%mm3\n"
|
|
"pcmpeqw (%2),%%mm5\n"
|
|
"pandn %%mm2,%%mm3\n"
|
|
"pandn %%mm4,%%mm5\n"
|
|
"movq %%mm0,%%mm2\n"
|
|
"movq %%mm1,%%mm4\n"
|
|
"pcmpeqw %%mm1,%%mm2\n"
|
|
"pcmpeqw %%mm0,%%mm4\n"
|
|
"pandn %%mm3,%%mm2\n"
|
|
"pandn %%mm5,%%mm4\n"
|
|
"movq %%mm2,%%mm3\n"
|
|
"movq %%mm4,%%mm5\n"
|
|
"pand %%mm6,%%mm2\n"
|
|
"pand %%mm6,%%mm4\n"
|
|
"pandn %%mm7,%%mm3\n"
|
|
"pandn %%mm7,%%mm5\n"
|
|
"por %%mm3,%%mm2\n"
|
|
"por %%mm5,%%mm4\n"
|
|
|
|
/* set *dst */
|
|
"movq %%mm2,%%mm3\n"
|
|
"punpcklwd %%mm4,%%mm2\n"
|
|
"punpckhwd %%mm4,%%mm3\n"
|
|
"movq %%mm2,(%3)\n"
|
|
"movq %%mm3,8(%3)\n"
|
|
|
|
/* next */
|
|
"addl $8,%0\n"
|
|
"addl $8,%1\n"
|
|
"addl $8,%2\n"
|
|
"addl $16,%3\n"
|
|
|
|
/* central runs */
|
|
"shrl $2,%4\n"
|
|
"jz 1f\n"
|
|
|
|
"0:\n"
|
|
|
|
/* set the current, current_pre, current_next registers */
|
|
"movq -8(%1),%%mm0\n"
|
|
"movq (%1),%%mm7\n"
|
|
"movq 8(%1),%%mm1\n"
|
|
"psrlq $48,%%mm0\n"
|
|
"psllq $48,%%mm1\n"
|
|
"movq %%mm7,%%mm2\n"
|
|
"movq %%mm7,%%mm3\n"
|
|
"psllq $16,%%mm2\n"
|
|
"psrlq $16,%%mm3\n"
|
|
"por %%mm2,%%mm0\n"
|
|
"por %%mm3,%%mm1\n"
|
|
|
|
/* current_upper */
|
|
"movq (%0),%%mm6\n"
|
|
|
|
/* compute the upper-left pixel for dst on %%mm2 */
|
|
/* compute the upper-right pixel for dst on %%mm4 */
|
|
"movq %%mm0,%%mm2\n"
|
|
"movq %%mm1,%%mm4\n"
|
|
"movq %%mm0,%%mm3\n"
|
|
"movq %%mm1,%%mm5\n"
|
|
"pcmpeqw %%mm6,%%mm2\n"
|
|
"pcmpeqw %%mm6,%%mm4\n"
|
|
"pcmpeqw (%2),%%mm3\n"
|
|
"pcmpeqw (%2),%%mm5\n"
|
|
"pandn %%mm2,%%mm3\n"
|
|
"pandn %%mm4,%%mm5\n"
|
|
"movq %%mm0,%%mm2\n"
|
|
"movq %%mm1,%%mm4\n"
|
|
"pcmpeqw %%mm1,%%mm2\n"
|
|
"pcmpeqw %%mm0,%%mm4\n"
|
|
"pandn %%mm3,%%mm2\n"
|
|
"pandn %%mm5,%%mm4\n"
|
|
"movq %%mm2,%%mm3\n"
|
|
"movq %%mm4,%%mm5\n"
|
|
"pand %%mm6,%%mm2\n"
|
|
"pand %%mm6,%%mm4\n"
|
|
"pandn %%mm7,%%mm3\n"
|
|
"pandn %%mm7,%%mm5\n"
|
|
"por %%mm3,%%mm2\n"
|
|
"por %%mm5,%%mm4\n"
|
|
|
|
/* set *dst */
|
|
"movq %%mm2,%%mm3\n"
|
|
"punpcklwd %%mm4,%%mm2\n"
|
|
"punpckhwd %%mm4,%%mm3\n"
|
|
"movq %%mm2,(%3)\n"
|
|
"movq %%mm3,8(%3)\n"
|
|
|
|
/* next */
|
|
"addl $8,%0\n"
|
|
"addl $8,%1\n"
|
|
"addl $8,%2\n"
|
|
"addl $16,%3\n"
|
|
|
|
"decl %4\n"
|
|
"jnz 0b\n"
|
|
"1:\n"
|
|
|
|
/* final run */
|
|
/* set the current, current_pre, current_next registers */
|
|
"movq (%1),%%mm1\n"
|
|
"movq (%1),%%mm7\n"
|
|
"movq -8(%1),%%mm0\n"
|
|
"psrlq $48,%%mm1\n"
|
|
"psrlq $48,%%mm0\n"
|
|
"psllq $48,%%mm1\n"
|
|
"movq %%mm7,%%mm2\n"
|
|
"movq %%mm7,%%mm3\n"
|
|
"psllq $16,%%mm2\n"
|
|
"psrlq $16,%%mm3\n"
|
|
"por %%mm2,%%mm0\n"
|
|
"por %%mm3,%%mm1\n"
|
|
|
|
/* current_upper */
|
|
"movq (%0),%%mm6\n"
|
|
|
|
/* compute the upper-left pixel for dst on %%mm2 */
|
|
/* compute the upper-right pixel for dst on %%mm4 */
|
|
"movq %%mm0,%%mm2\n"
|
|
"movq %%mm1,%%mm4\n"
|
|
"movq %%mm0,%%mm3\n"
|
|
"movq %%mm1,%%mm5\n"
|
|
"pcmpeqw %%mm6,%%mm2\n"
|
|
"pcmpeqw %%mm6,%%mm4\n"
|
|
"pcmpeqw (%2),%%mm3\n"
|
|
"pcmpeqw (%2),%%mm5\n"
|
|
"pandn %%mm2,%%mm3\n"
|
|
"pandn %%mm4,%%mm5\n"
|
|
"movq %%mm0,%%mm2\n"
|
|
"movq %%mm1,%%mm4\n"
|
|
"pcmpeqw %%mm1,%%mm2\n"
|
|
"pcmpeqw %%mm0,%%mm4\n"
|
|
"pandn %%mm3,%%mm2\n"
|
|
"pandn %%mm5,%%mm4\n"
|
|
"movq %%mm2,%%mm3\n"
|
|
"movq %%mm4,%%mm5\n"
|
|
"pand %%mm6,%%mm2\n"
|
|
"pand %%mm6,%%mm4\n"
|
|
"pandn %%mm7,%%mm3\n"
|
|
"pandn %%mm7,%%mm5\n"
|
|
"por %%mm3,%%mm2\n"
|
|
"por %%mm5,%%mm4\n"
|
|
|
|
/* set *dst */
|
|
"movq %%mm2,%%mm3\n"
|
|
"punpcklwd %%mm4,%%mm2\n"
|
|
"punpckhwd %%mm4,%%mm3\n"
|
|
"movq %%mm2,(%3)\n"
|
|
"movq %%mm3,8(%3)\n"
|
|
"emms\n"
|
|
|
|
: "+r" (src0), "+r" (src1), "+r" (src2), "+r" (dst), "+r" (count)
|
|
:
|
|
: "cc"
|
|
);
|
|
#else
|
|
__asm {
|
|
mov eax, src0;
|
|
mov ebx, src1;
|
|
mov ecx, src2;
|
|
mov edx, dst;
|
|
mov esi, count;
|
|
|
|
/* first run */
|
|
/* set the current, current_pre, current_next registers */
|
|
movq mm0, qword ptr [ebx];
|
|
movq mm7, qword ptr [ebx];
|
|
movq mm1, qword ptr [ebx + 8];
|
|
psllq mm0, 48;
|
|
psllq mm1, 48;
|
|
psrlq mm0, 48;
|
|
movq mm2, mm7;
|
|
movq mm3, mm7;
|
|
psllq mm2, 16;
|
|
psrlq mm3, 16;
|
|
por mm0, mm2;
|
|
por mm1, mm3;
|
|
|
|
/* current_upper */
|
|
movq mm6, qword ptr [eax];
|
|
|
|
/* compute the upper-left pixel for dst on %%mm2 */
|
|
/* compute the upper-right pixel for dst on %%mm4 */
|
|
movq mm2, mm0;
|
|
movq mm4, mm1;
|
|
movq mm3, mm0;
|
|
movq mm5, mm1;
|
|
pcmpeqw mm2, mm6;
|
|
pcmpeqw mm4, mm6;
|
|
pcmpeqw mm3, qword ptr [ecx];
|
|
pcmpeqw mm5, qword ptr [ecx];
|
|
pandn mm3,mm2;
|
|
pandn mm5,mm4;
|
|
movq mm2,mm0;
|
|
movq mm4,mm1;
|
|
pcmpeqw mm2,mm1;
|
|
pcmpeqw mm4,mm0;
|
|
pandn mm2,mm3;
|
|
pandn mm4,mm5;
|
|
movq mm3,mm2;
|
|
movq mm5,mm4;
|
|
pand mm2,mm6;
|
|
pand mm4,mm6;
|
|
pandn mm3,mm7;
|
|
pandn mm5,mm7;
|
|
por mm2,mm3;
|
|
por mm4,mm5;
|
|
|
|
/* set *dst0 */
|
|
movq mm3,mm2;
|
|
punpcklwd mm2,mm4;
|
|
punpckhwd mm3,mm4;
|
|
movq qword ptr [edx], mm2;
|
|
movq qword ptr [edx + 8], mm3;
|
|
|
|
/* next */
|
|
add eax, 8;
|
|
add ebx, 8;
|
|
add ecx, 8;
|
|
add edx, 16;
|
|
|
|
/* central runs */
|
|
shr esi, 2;
|
|
jz label1;
|
|
align 4;
|
|
label0:
|
|
|
|
/* set the current, current_pre, current_next registers */
|
|
movq mm0, qword ptr [ebx-8];
|
|
movq mm7, qword ptr [ebx];
|
|
movq mm1, qword ptr [ebx+8];
|
|
psrlq mm0,48;
|
|
psllq mm1,48;
|
|
movq mm2,mm7;
|
|
movq mm3,mm7;
|
|
psllq mm2,16;
|
|
psrlq mm3,16;
|
|
por mm0,mm2;
|
|
por mm1,mm3;
|
|
|
|
/* current_upper */
|
|
movq mm6, qword ptr [eax];
|
|
|
|
/* compute the upper-left pixel for dst on %%mm2 */
|
|
/* compute the upper-right pixel for dst on %%mm4 */
|
|
movq mm2,mm0;
|
|
movq mm4,mm1;
|
|
movq mm3,mm0;
|
|
movq mm5,mm1;
|
|
pcmpeqw mm2,mm6;
|
|
pcmpeqw mm4,mm6;
|
|
pcmpeqw mm3, qword ptr [ecx];
|
|
pcmpeqw mm5, qword ptr [ecx];
|
|
pandn mm3,mm2;
|
|
pandn mm5,mm4;
|
|
movq mm2,mm0;
|
|
movq mm4,mm1;
|
|
pcmpeqw mm2,mm1;
|
|
pcmpeqw mm4,mm0;
|
|
pandn mm2,mm3;
|
|
pandn mm4,mm5;
|
|
movq mm3,mm2;
|
|
movq mm5,mm4;
|
|
pand mm2,mm6;
|
|
pand mm4,mm6;
|
|
pandn mm3,mm7;
|
|
pandn mm5,mm7;
|
|
por mm2,mm3;
|
|
por mm4,mm5;
|
|
|
|
/* set *dst */
|
|
movq mm3,mm2;
|
|
punpcklwd mm2,mm4;
|
|
punpckhwd mm3,mm4;
|
|
movq qword ptr [edx], mm2;
|
|
movq qword ptr [edx+8], mm3;
|
|
|
|
/* next */
|
|
add eax,8;
|
|
add ebx,8;
|
|
add ecx,8;
|
|
add edx,16;
|
|
|
|
dec esi;
|
|
jnz label0;
|
|
label1:
|
|
|
|
/* final run */
|
|
/* set the current, current_pre, current_next registers */
|
|
movq mm1, qword ptr [ebx];
|
|
movq mm7, qword ptr [ebx];
|
|
movq mm0, qword ptr [ebx-8];
|
|
psrlq mm1,48;
|
|
psrlq mm0,48;
|
|
psllq mm1,48;
|
|
movq mm2,mm7;
|
|
movq mm3,mm7;
|
|
psllq mm2,16;
|
|
psrlq mm3,16;
|
|
por mm0,mm2;
|
|
por mm1,mm3;
|
|
|
|
/* current_upper */
|
|
movq mm6, qword ptr [eax];
|
|
|
|
/* compute the upper-left pixel for dst on %%mm2 */
|
|
/* compute the upper-right pixel for dst on %%mm4 */
|
|
movq mm2,mm0;
|
|
movq mm4,mm1;
|
|
movq mm3,mm0;
|
|
movq mm5,mm1;
|
|
pcmpeqw mm2,mm6;
|
|
pcmpeqw mm4,mm6;
|
|
pcmpeqw mm3, qword ptr [ecx];
|
|
pcmpeqw mm5, qword ptr [ecx];
|
|
pandn mm3,mm2;
|
|
pandn mm5,mm4;
|
|
movq mm2,mm0;
|
|
movq mm4,mm1;
|
|
pcmpeqw mm2,mm1;
|
|
pcmpeqw mm4,mm0;
|
|
pandn mm2,mm3;
|
|
pandn mm4,mm5;
|
|
movq mm3,mm2;
|
|
movq mm5,mm4;
|
|
pand mm2,mm6;
|
|
pand mm4,mm6;
|
|
pandn mm3,mm7;
|
|
pandn mm5,mm7;
|
|
por mm2,mm3;
|
|
por mm4,mm5;
|
|
|
|
/* set *dst */
|
|
movq mm3,mm2;
|
|
punpcklwd mm2,mm4;
|
|
punpckhwd mm3,mm4;
|
|
movq qword ptr [edx], mm2;
|
|
movq qword ptr [edx+8], mm3;
|
|
|
|
mov src0, eax;
|
|
mov src1, ebx;
|
|
mov src2, ecx;
|
|
mov dst, edx;
|
|
mov count, esi;
|
|
|
|
emms;
|
|
}
|
|
#endif
|
|
}
|
|
|
|
static void internal_scale2x_32_mmx_single(u32* dst, const u32* src0, const u32* src1, const u32* src2, unsigned count) {
|
|
/* always do the first and last run */
|
|
count -= 2*2;
|
|
|
|
#ifdef __GNUC__
|
|
__asm__ __volatile__(
|
|
/* first run */
|
|
/* set the current, current_pre, current_next registers */
|
|
"movq 0(%1),%%mm0\n"
|
|
"movq 0(%1),%%mm7\n"
|
|
"movq 8(%1),%%mm1\n"
|
|
"psllq $32,%%mm0\n"
|
|
"psllq $32,%%mm1\n"
|
|
"psrlq $32,%%mm0\n"
|
|
"movq %%mm7,%%mm2\n"
|
|
"movq %%mm7,%%mm3\n"
|
|
"psllq $32,%%mm2\n"
|
|
"psrlq $32,%%mm3\n"
|
|
"por %%mm2,%%mm0\n"
|
|
"por %%mm3,%%mm1\n"
|
|
|
|
/* current_upper */
|
|
"movq (%0),%%mm6\n"
|
|
|
|
/* compute the upper-left pixel for dst on %%mm2 */
|
|
/* compute the upper-right pixel for dst on %%mm4 */
|
|
"movq %%mm0,%%mm2\n"
|
|
"movq %%mm1,%%mm4\n"
|
|
"movq %%mm0,%%mm3\n"
|
|
"movq %%mm1,%%mm5\n"
|
|
"pcmpeqd %%mm6,%%mm2\n"
|
|
"pcmpeqd %%mm6,%%mm4\n"
|
|
"pcmpeqd (%2),%%mm3\n"
|
|
"pcmpeqd (%2),%%mm5\n"
|
|
"pandn %%mm2,%%mm3\n"
|
|
"pandn %%mm4,%%mm5\n"
|
|
"movq %%mm0,%%mm2\n"
|
|
"movq %%mm1,%%mm4\n"
|
|
"pcmpeqd %%mm1,%%mm2\n"
|
|
"pcmpeqd %%mm0,%%mm4\n"
|
|
"pandn %%mm3,%%mm2\n"
|
|
"pandn %%mm5,%%mm4\n"
|
|
"movq %%mm2,%%mm3\n"
|
|
"movq %%mm4,%%mm5\n"
|
|
"pand %%mm6,%%mm2\n"
|
|
"pand %%mm6,%%mm4\n"
|
|
"pandn %%mm7,%%mm3\n"
|
|
"pandn %%mm7,%%mm5\n"
|
|
"por %%mm3,%%mm2\n"
|
|
"por %%mm5,%%mm4\n"
|
|
|
|
/* set *dst */
|
|
"movq %%mm2,%%mm3\n"
|
|
"punpckldq %%mm4,%%mm2\n"
|
|
"punpckhdq %%mm4,%%mm3\n"
|
|
"movq %%mm2,(%3)\n"
|
|
"movq %%mm3, 8(%3)\n"
|
|
|
|
/* next */
|
|
"addl $8,%0\n"
|
|
"addl $8,%1\n"
|
|
"addl $8,%2\n"
|
|
"addl $16,%3\n"
|
|
|
|
/* central runs */
|
|
"shrl $1,%4\n"
|
|
"jz 1f\n"
|
|
|
|
"0:\n"
|
|
|
|
/* set the current, current_pre, current_next registers */
|
|
"movq -8(%1),%%mm0\n"
|
|
"movq (%1),%%mm7\n"
|
|
"movq 8(%1),%%mm1\n"
|
|
"psrlq $32,%%mm0\n"
|
|
"psllq $32,%%mm1\n"
|
|
"movq %%mm7,%%mm2\n"
|
|
"movq %%mm7,%%mm3\n"
|
|
"psllq $32,%%mm2\n"
|
|
"psrlq $32,%%mm3\n"
|
|
"por %%mm2,%%mm0\n"
|
|
"por %%mm3,%%mm1\n"
|
|
|
|
/* current_upper */
|
|
"movq (%0),%%mm6\n"
|
|
|
|
/* compute the upper-left pixel for dst on %%mm2 */
|
|
/* compute the upper-right pixel for dst on %%mm4 */
|
|
"movq %%mm0,%%mm2\n"
|
|
"movq %%mm1,%%mm4\n"
|
|
"movq %%mm0,%%mm3\n"
|
|
"movq %%mm1,%%mm5\n"
|
|
"pcmpeqd %%mm6,%%mm2\n"
|
|
"pcmpeqd %%mm6,%%mm4\n"
|
|
"pcmpeqd (%2),%%mm3\n"
|
|
"pcmpeqd (%2),%%mm5\n"
|
|
"pandn %%mm2,%%mm3\n"
|
|
"pandn %%mm4,%%mm5\n"
|
|
"movq %%mm0,%%mm2\n"
|
|
"movq %%mm1,%%mm4\n"
|
|
"pcmpeqd %%mm1,%%mm2\n"
|
|
"pcmpeqd %%mm0,%%mm4\n"
|
|
"pandn %%mm3,%%mm2\n"
|
|
"pandn %%mm5,%%mm4\n"
|
|
"movq %%mm2,%%mm3\n"
|
|
"movq %%mm4,%%mm5\n"
|
|
"pand %%mm6,%%mm2\n"
|
|
"pand %%mm6,%%mm4\n"
|
|
"pandn %%mm7,%%mm3\n"
|
|
"pandn %%mm7,%%mm5\n"
|
|
"por %%mm3,%%mm2\n"
|
|
"por %%mm5,%%mm4\n"
|
|
|
|
/* set *dst */
|
|
"movq %%mm2,%%mm3\n"
|
|
"punpckldq %%mm4,%%mm2\n"
|
|
"punpckhdq %%mm4,%%mm3\n"
|
|
"movq %%mm2,(%3)\n"
|
|
"movq %%mm3,8(%3)\n"
|
|
|
|
/* next */
|
|
"addl $8,%0\n"
|
|
"addl $8,%1\n"
|
|
"addl $8,%2\n"
|
|
"addl $16,%3\n"
|
|
|
|
"decl %4\n"
|
|
"jnz 0b\n"
|
|
"1:\n"
|
|
|
|
/* final run */
|
|
/* set the current, current_pre, current_next registers */
|
|
"movq (%1),%%mm1\n"
|
|
"movq (%1),%%mm7\n"
|
|
"movq -8(%1), %%mm0\n"
|
|
"psrlq $32,%%mm1\n"
|
|
"psrlq $32,%%mm0\n"
|
|
"psllq $32,%%mm1\n"
|
|
"movq %%mm7,%%mm2\n"
|
|
"movq %%mm7,%%mm3\n"
|
|
"psllq $32,%%mm2\n"
|
|
"psrlq $32,%%mm3\n"
|
|
"por %%mm2,%%mm0\n"
|
|
"por %%mm3,%%mm1\n"
|
|
|
|
/* current_upper */
|
|
"movq (%0),%%mm6\n"
|
|
|
|
/* compute the upper-left pixel for dst on %%mm2 */
|
|
/* compute the upper-right pixel for dst on %%mm4 */
|
|
"movq %%mm0,%%mm2\n"
|
|
"movq %%mm1,%%mm4\n"
|
|
"movq %%mm0,%%mm3\n"
|
|
"movq %%mm1,%%mm5\n"
|
|
"pcmpeqd %%mm6,%%mm2\n"
|
|
"pcmpeqd %%mm6,%%mm4\n"
|
|
"pcmpeqd (%2),%%mm3\n"
|
|
"pcmpeqd (%2),%%mm5\n"
|
|
"pandn %%mm2,%%mm3\n"
|
|
"pandn %%mm4,%%mm5\n"
|
|
"movq %%mm0,%%mm2\n"
|
|
"movq %%mm1,%%mm4\n"
|
|
"pcmpeqd %%mm1,%%mm2\n"
|
|
"pcmpeqd %%mm0,%%mm4\n"
|
|
"pandn %%mm3,%%mm2\n"
|
|
"pandn %%mm5,%%mm4\n"
|
|
"movq %%mm2,%%mm3\n"
|
|
"movq %%mm4,%%mm5\n"
|
|
"pand %%mm6,%%mm2\n"
|
|
"pand %%mm6,%%mm4\n"
|
|
"pandn %%mm7,%%mm3\n"
|
|
"pandn %%mm7,%%mm5\n"
|
|
"por %%mm3,%%mm2\n"
|
|
"por %%mm5,%%mm4\n"
|
|
|
|
/* set *dst */
|
|
"movq %%mm2,%%mm3\n"
|
|
"punpckldq %%mm4,%%mm2\n"
|
|
"punpckhdq %%mm4,%%mm3\n"
|
|
"movq %%mm2,(%3)\n"
|
|
"movq %%mm3,8(%3)\n"
|
|
"emms\n"
|
|
|
|
: "+r" (src0), "+r" (src1), "+r" (src2), "+r" (dst), "+r" (count)
|
|
:
|
|
: "cc"
|
|
);
|
|
#else
|
|
__asm {
|
|
mov eax, src0;
|
|
mov ebx, src1;
|
|
mov ecx, src2;
|
|
mov edx, dst;
|
|
mov esi, count;
|
|
|
|
/* first run */
|
|
/* set the current, current_pre, current_next registers */
|
|
movq mm0,qword ptr [ebx];
|
|
movq mm7,qword ptr [ebx];
|
|
movq mm1,qword ptr [ebx + 8];
|
|
psllq mm0,32;
|
|
psllq mm1,32;
|
|
psrlq mm0,32;
|
|
movq mm2,mm7;
|
|
movq mm3,mm7;
|
|
psllq mm2,32;
|
|
psrlq mm3,32;
|
|
por mm0,mm2;
|
|
por mm1,mm3;
|
|
|
|
/* current_upper */
|
|
movq mm6,qword ptr [eax];
|
|
|
|
/* compute the upper-left pixel for dst on %%mm2 */
|
|
/* compute the upper-right pixel for dst on %%mm4 */
|
|
movq mm2,mm0;
|
|
movq mm4,mm1;
|
|
movq mm3,mm0;
|
|
movq mm5,mm1;
|
|
pcmpeqd mm2,mm6;
|
|
pcmpeqd mm4,mm6;
|
|
pcmpeqd mm3,qword ptr [ecx];
|
|
pcmpeqd mm5,qword ptr [ecx];
|
|
pandn mm3,mm2;
|
|
pandn mm5,mm4;
|
|
movq mm2,mm0;
|
|
movq mm4,mm1;
|
|
pcmpeqd mm2,mm1;
|
|
pcmpeqd mm4,mm0;
|
|
pandn mm2,mm3;
|
|
pandn mm4,mm5;
|
|
movq mm3,mm2;
|
|
movq mm5,mm4;
|
|
pand mm2,mm6;
|
|
pand mm4,mm6;
|
|
pandn mm3,mm7;
|
|
pandn mm5,mm7;
|
|
por mm2,mm3;
|
|
por mm4,mm5;
|
|
|
|
/* set *dst */
|
|
movq mm3,mm2;
|
|
punpckldq mm2,mm4;
|
|
punpckhdq mm3,mm4;
|
|
movq qword ptr [edx],mm2;
|
|
movq qword ptr [edx+8],mm3;
|
|
|
|
/* next */
|
|
add eax,8;
|
|
add ebx,8;
|
|
add ecx,8;
|
|
add edx,16;
|
|
|
|
/* central runs */
|
|
shr esi,1;
|
|
jz label1;
|
|
label0:
|
|
|
|
/* set the current, current_pre, current_next registers */
|
|
movq mm0,qword ptr [ebx-8];
|
|
movq mm7,qword ptr [ebx];
|
|
movq mm1,qword ptr [ebx+8];
|
|
psrlq mm0,32;
|
|
psllq mm1,32;
|
|
movq mm2,mm7;
|
|
movq mm3,mm7;
|
|
psllq mm2,32;
|
|
psrlq mm3,32;
|
|
por mm0,mm2;
|
|
por mm1,mm3;
|
|
|
|
/* current_upper */
|
|
movq mm6,qword ptr[eax];
|
|
|
|
/* compute the upper-left pixel for dst on %%mm2 */
|
|
/* compute the upper-right pixel for dst on %%mm4 */
|
|
movq mm2,mm0;
|
|
movq mm4,mm1;
|
|
movq mm3,mm0;
|
|
movq mm5,mm1;
|
|
pcmpeqd mm2,mm6;
|
|
pcmpeqd mm4,mm6;
|
|
pcmpeqd mm3,qword ptr[ecx];
|
|
pcmpeqd mm5,qword ptr[ecx];
|
|
pandn mm3,mm2;
|
|
pandn mm5,mm4;
|
|
movq mm2,mm0;
|
|
movq mm4,mm1;
|
|
pcmpeqd mm2,mm1;
|
|
pcmpeqd mm4,mm0;
|
|
pandn mm2,mm3;
|
|
pandn mm4,mm5;
|
|
movq mm3,mm2;
|
|
movq mm5,mm4;
|
|
pand mm2,mm6;
|
|
pand mm4,mm6;
|
|
pandn mm3,mm7;
|
|
pandn mm5,mm7;
|
|
por mm2,mm3;
|
|
por mm4,mm5;
|
|
|
|
/* set *dst */
|
|
movq mm3,mm2;
|
|
punpckldq mm2,mm4;
|
|
punpckhdq mm3,mm4;
|
|
movq qword ptr [edx],mm2;
|
|
movq qword ptr [edx+8],mm3;
|
|
|
|
/* next */
|
|
add eax,8;
|
|
add ebx,8;
|
|
add ecx,8;
|
|
add edx,16;
|
|
|
|
dec esi;
|
|
jnz label0;
|
|
label1:
|
|
|
|
/* final run */
|
|
/* set the current, current_pre, current_next registers */
|
|
movq mm1,qword ptr [ebx];
|
|
movq mm7,qword ptr [ebx];
|
|
movq mm0,qword ptr [ebx-8];
|
|
psrlq mm1,32;
|
|
psrlq mm0,32;
|
|
psllq mm1,32;
|
|
movq mm2,mm7;
|
|
movq mm3,mm7;
|
|
psllq mm2,32;
|
|
psrlq mm3,32;
|
|
por mm0,mm2;
|
|
por mm1,mm3;
|
|
|
|
/* current_upper */
|
|
movq mm6,qword ptr [eax];
|
|
|
|
/* compute the upper-left pixel for dst on %%mm2 */
|
|
/* compute the upper-right pixel for dst on %%mm4 */
|
|
movq mm2,mm0;
|
|
movq mm4,mm1;
|
|
movq mm3,mm0;
|
|
movq mm5,mm1;
|
|
pcmpeqd mm2,mm6;
|
|
pcmpeqd mm4,mm6;
|
|
pcmpeqd mm3,qword ptr [ecx];
|
|
pcmpeqd mm5,qword ptr [ecx];
|
|
pandn mm3,mm2;
|
|
pandn mm5,mm4;
|
|
movq mm2,mm0;
|
|
movq mm4,mm1;
|
|
pcmpeqd mm2,mm1;
|
|
pcmpeqd mm4,mm0;
|
|
pandn mm2,mm3;
|
|
pandn mm4,mm5;
|
|
movq mm3,mm2;
|
|
movq mm5,mm4;
|
|
pand mm2,mm6;
|
|
pand mm4,mm6;
|
|
pandn mm3,mm7;
|
|
pandn mm5,mm7;
|
|
por mm2,mm3;
|
|
por mm4,mm5;
|
|
|
|
/* set *dst */
|
|
movq mm3,mm2;
|
|
punpckldq mm2,mm4;
|
|
punpckhdq mm3,mm4;
|
|
movq qword ptr [edx],mm2;
|
|
movq qword ptr [edx+8],mm3;
|
|
|
|
mov src0, eax;
|
|
mov src1, ebx;
|
|
mov src2, ecx;
|
|
mov dst, edx;
|
|
mov count, esi;
|
|
|
|
emms;
|
|
}
|
|
#endif
|
|
}
|
|
|
|
static void internal_scale2x_16_mmx(u16* dst0, u16* dst1, const u16* src0, const u16* src1, const u16* src2, unsigned count) {
|
|
// assert( count >= 2*4 );
|
|
internal_scale2x_16_mmx_single(dst0, src0, src1, src2, count);
|
|
internal_scale2x_16_mmx_single(dst1, src2, src1, src0, count);
|
|
}
|
|
|
|
static void internal_scale2x_32_mmx(u32* dst0, u32* dst1, const u32* src0, const u32* src1, const u32* src2, unsigned count) {
|
|
// assert( count >= 2*2 );
|
|
internal_scale2x_32_mmx_single(dst0, src0, src1, src2, count);
|
|
internal_scale2x_32_mmx_single(dst1, src2, src1, src0, count);
|
|
}
|
|
#endif
|
|
|
|
void AdMame2x(u8 *srcPtr, u32 srcPitch, u8 * /* deltaPtr */,
|
|
u8 *dstPtr, u32 dstPitch, int width, int height)
|
|
{
|
|
u16 *dst0 = (u16 *)dstPtr;
|
|
u16 *dst1 = dst0 + (dstPitch >> 1);
|
|
|
|
u16 *src0 = (u16 *)srcPtr;
|
|
u16 *src1 = src0 + (srcPitch >> 1);
|
|
u16 *src2 = src1 + (srcPitch >> 1);
|
|
#ifdef MMX
|
|
if(cpu_mmx) {
|
|
internal_scale2x_16_mmx(dst0, dst1, src0, src0, src1, width);
|
|
|
|
int count = height;
|
|
|
|
count -= 2;
|
|
while(count) {
|
|
dst0 += dstPitch;
|
|
dst1 += dstPitch;
|
|
internal_scale2x_16_mmx(dst0, dst1, src0, src1, src2, width);
|
|
src0 = src1;
|
|
src1 = src2;
|
|
src2 += srcPitch >> 1;
|
|
--count;
|
|
}
|
|
dst0 += dstPitch;
|
|
dst1 += dstPitch;
|
|
internal_scale2x_16_mmx(dst0, dst1, src0, src1, src1, width);
|
|
} else {
|
|
#endif
|
|
internal_scale2x_16_def(dst0, src0, src0, src1, width);
|
|
internal_scale2x_16_def(dst1, src1, src0, src0, width);
|
|
|
|
int count = height;
|
|
|
|
count -= 2;
|
|
while(count) {
|
|
dst0 += dstPitch;
|
|
dst1 += dstPitch;
|
|
internal_scale2x_16_def(dst0, src0, src1, src2, width);
|
|
internal_scale2x_16_def(dst1, src2, src1, src0, width);
|
|
src0 = src1;
|
|
src1 = src2;
|
|
src2 += srcPitch >> 1;
|
|
--count;
|
|
}
|
|
dst0 += dstPitch;
|
|
dst1 += dstPitch;
|
|
internal_scale2x_16_def(dst0, src0, src1, src1, width);
|
|
internal_scale2x_16_def(dst1, src1, src1, src0, width);
|
|
#ifdef MMX
|
|
}
|
|
#endif
|
|
}
|
|
|
|
void AdMame2x32(u8 *srcPtr, u32 srcPitch, u8 * /* deltaPtr */,
|
|
u8 *dstPtr, u32 dstPitch, int width, int height)
|
|
{
|
|
u32 *dst0 = (u32 *)dstPtr;
|
|
u32 *dst1 = dst0 + (dstPitch >> 2);
|
|
|
|
u32 *src0 = (u32 *)srcPtr;
|
|
u32 *src1 = src0 + (srcPitch >> 2);
|
|
u32 *src2 = src1 + (srcPitch >> 2);
|
|
#ifdef MMX
|
|
if(cpu_mmx) {
|
|
internal_scale2x_32_mmx(dst0, dst1, src0, src0, src1, width);
|
|
|
|
int count = height;
|
|
|
|
count -= 2;
|
|
while(count) {
|
|
dst0 += dstPitch >> 1;
|
|
dst1 += dstPitch >> 1;
|
|
internal_scale2x_32_mmx(dst0, dst1, src0, src1, src2, width);
|
|
src0 = src1;
|
|
src1 = src2;
|
|
src2 += srcPitch >> 2;
|
|
--count;
|
|
}
|
|
dst0 += dstPitch >> 1;
|
|
dst1 += dstPitch >> 1;
|
|
internal_scale2x_32_mmx(dst0, dst1, src0, src1, src1, width);
|
|
} else {
|
|
#endif
|
|
internal_scale2x_32_def(dst0, src0, src0, src1, width);
|
|
internal_scale2x_32_def(dst1, src1, src0, src0, width);
|
|
|
|
int count = height;
|
|
|
|
count -= 2;
|
|
while(count) {
|
|
dst0 += dstPitch >> 1;
|
|
dst1 += dstPitch >> 1;
|
|
internal_scale2x_32_def(dst0, src0, src1, src2, width);
|
|
internal_scale2x_32_def(dst1, src2, src1, src0, width);
|
|
src0 = src1;
|
|
src1 = src2;
|
|
src2 += srcPitch >> 2;
|
|
--count;
|
|
}
|
|
dst0 += dstPitch >> 1;
|
|
dst1 += dstPitch >> 1;
|
|
internal_scale2x_32_def(dst0, src0, src1, src1, width);
|
|
internal_scale2x_32_def(dst1, src1, src1, src0, width);
|
|
#ifdef MMX
|
|
}
|
|
#endif
|
|
}
|