diff --git a/build.sh b/build.sh
index ca1c11b69d..ae2ca97abe 100644
--- a/build.sh
+++ b/build.sh
@@ -9,13 +9,13 @@
 #export PCSX2OPTIONS="--enable-sse3 --enable-sse4 --prefix `pwd`"
 
 #Optimized, but a devbuild
-export PCSX2OPTIONS="--enable-sse3 --enable-sse4 --enable-devbuild --prefix `pwd`"
+#export PCSX2OPTIONS="--enable-sse3 --enable-sse4 --enable-devbuild --prefix `pwd`"
 
 #Debug / Devbuild version
 #export PCSX2OPTIONS="--enable-debug --enable-devbuild --enable-sse3 --prefix `pwd`"
 
-#Optimized, but a devbuild - with memcpy_fast_ enabled. - BROKEN!
-#export PCSX2OPTIONS="--enable-sse3 --enable-sse4 --enable-devbuild --enable-memcpyfast --prefix `pwd`"
+#Optimized, but a devbuild - with memcpy_fast_ enabled. - EXPERIMENTAL
+export PCSX2OPTIONS="--enable-sse3 --enable-sse4 --enable-devbuild --enable-memcpyfast --prefix `pwd`"
 
 #ZeroGS Normal mode
 export ZEROGSOPTIONS="--enable-sse2"
diff --git a/pcsx2/IPU/yuv2rgb.cpp b/pcsx2/IPU/yuv2rgb.cpp
index 48896cbeef..312ff8b845 100644
--- a/pcsx2/IPU/yuv2rgb.cpp
+++ b/pcsx2/IPU/yuv2rgb.cpp
@@ -58,7 +58,7 @@ enum
 	BCb_COEFF = 0x40
 };
 
-static PCSX2_ALIGNED16(const SSE2_Tables sse2_tables) =
+static volatile PCSX2_ALIGNED16(const SSE2_Tables sse2_tables) =
 {
 	{0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000,0x8000}, // c_bias
 	{16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16}, // y_bias
@@ -223,8 +223,8 @@ ihatemsvc:
 
 		// Use ecx and edx as base pointers, to allow for Mod/RM form on memOps.
 		// This saves 2-3 bytes per instruction where these are used. :)
-		"mov ecx, offset yuv2rgb_temp\n"
-		"mov edx, offset sse2_tables+64\n"
+		"mov ecx, offset %c[yuv2rgb_temp]\n"
+		"mov edx, offset %c[sse2_tables]+64\n"
 
 		".align 16\n"
 		"tworows:\n"
@@ -240,15 +240,15 @@ ihatemsvc:
 		// unfortunately I don't think this will matter despite being
 		// technically potentially a little faster, but this is
 		// equivalent to an add or sub
-		"pxor xmm2, xmmword ptr [edx-0x40]\n" // xmm2 <-- 8 x (Cb - 128) << 8
-		"pxor xmm0, xmmword ptr [edx-0x40]\n" // xmm0 <-- 8 x (Cr - 128) << 8
+		"pxor xmm2, xmmword ptr [edx+%c[C_BIAS]]\n" // xmm2 <-- 8 x (Cb - 128) << 8
+		"pxor xmm0, xmmword ptr [edx+%c[C_BIAS]]\n" // xmm0 <-- 8 x (Cr - 128) << 8
 		"movaps xmm1, xmm0\n"
 		"movaps xmm3, xmm2\n"
 
-		"pmulhw xmm1, xmmword ptr [edx+0x10]\n"
-		"pmulhw xmm3, xmmword ptr [edx+0x20]\n"
-		"pmulhw xmm0, xmmword ptr [edx+0x30]\n"
-		"pmulhw xmm2, xmmword ptr [edx+0x40]\n"
+		"pmulhw xmm1, xmmword ptr [edx+%c[GCr_COEFF]]\n"
+		"pmulhw xmm3, xmmword ptr [edx+%c[GCb_COEFF]]\n"
+		"pmulhw xmm0, xmmword ptr [edx+%c[RCr_COEFF]]\n"
+		"pmulhw xmm2, xmmword ptr [edx+%c[BCb_COEFF]]\n"
 		"paddsw xmm1, xmm3\n"
 		// store for the next line; looking at the code above
 		// compared to the code below, I have to wonder whether
@@ -270,13 +270,13 @@ ihatemsvc:
 		"movaps xmm5, xmm2\n"
 		"movaps xmm6, xmmword ptr [mb8+edi]\n"
-		"psubusb xmm6, xmmword ptr [edx-0x30]\n"
+		"psubusb xmm6, xmmword ptr [edx+%c[Y_BIAS]]\n"
 		"movaps xmm7, xmm6\n"
 
 		"psllw xmm6, 8\n" // xmm6 <- Y << 8 for pixels 0,2,4,6,8,10,12,14
-		"pand xmm7, xmmword ptr [edx+Y_MASK]\n" // xmm7 <- Y << 8 for pixels 1,3,5,7,9,11,13,15
+		"pand xmm7, xmmword ptr [edx+%c[Y_MASK]]\n" // xmm7 <- Y << 8 for pixels 1,3,5,7,9,11,13,15
 
-		"pmulhuw xmm6, xmmword ptr [edx+0x00]\n"
-		"pmulhuw xmm7, xmmword ptr [edx+0x00]\n"
+		"pmulhuw xmm6, xmmword ptr [edx+%c[Y_COEFF]]\n"
+		"pmulhuw xmm7, xmmword ptr [edx+%c[Y_COEFF]]\n"
 
 		"paddsw xmm0, xmm6\n"
 		"paddsw xmm3, xmm7\n"
@@ -286,7 +286,7 @@ ihatemsvc:
 		"paddsw xmm5, xmm7\n"
 
 		// round
-		"movaps xmm6, xmmword ptr [edx-0x10]\n"
+		"movaps xmm6, xmmword ptr [edx+%c[ROUND_1BIT]]\n"
 		"paddw xmm0, xmm6\n"
 		"paddw xmm1, xmm6\n"
 		"paddw xmm2, xmm6\n"
@@ -342,6 +342,12 @@ ihatemsvc:
 		"cmp esi, 64\n"
 		"jne tworows\n"
 		".att_syntax\n"
+		:
+		:[C_BIAS]"i"(C_BIAS), [Y_BIAS]"i"(Y_BIAS), [Y_MASK]"i"(Y_MASK),
+		 [ROUND_1BIT]"i"(ROUND_1BIT), [Y_COEFF]"i"(Y_COEFF), [GCr_COEFF]"i"(GCr_COEFF),
+		 [GCb_COEFF]"i"(GCb_COEFF), [RCr_COEFF]"i"(RCr_COEFF), [BCb_COEFF]"i"(BCb_COEFF),
+		 [yuv2rgb_temp]"i"(yuv2rgb_temp), [sse2_tables]"i"(&sse2_tables)
+		:
 	);
 #else
 #error Unsupported compiler
diff --git a/pcsx2/x86/fast_routines.S b/pcsx2/x86/fast_routines.S
index 0cdfcbc0fc..c38669d08b 100644
--- a/pcsx2/x86/fast_routines.S
+++ b/pcsx2/x86/fast_routines.S
@@ -381,7 +381,7 @@ $memcpy_align_done: // destination is dword aligned
 	shr eax, 6 // get 64-byte block count
 	jz $memcpy_ic_2 // finish the last few bytes
 
-	mov edx, offset _mmx_backup ; will probably need this to save/restore mmx
+	mov edx, offset _mmx_backup // will probably need this to save/restore mmx
 	cmp eax, IN_CACHE_COPY/64 // too big 4 cache? use uncached copy
 	jae $memcpy_uc_test
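
Note on the %c[...] operands introduced above (not part of the patch itself): GCC's extended asm lets the enum offsets (C_BIAS, Y_COEFF, ...) be passed in as "i" (immediate) constraints, and the %c operand modifier prints the bare constant with no '$' or '%' punctuation, so it can be spliced into an address expression instead of a hand-maintained hex displacement like [edx+0x10]. A minimal standalone sketch of that mechanism follows; the names (ELEM, table, read_entry) are illustrative and it uses plain AT&T syntax rather than the patch's .intel_syntax block.

// Illustrative sketch only -- not PCSX2 code. Builds with GCC/Clang on x86/x86-64.
#include <cstdio>

enum { ELEM = 3 };                              // stand-in for Y_COEFF, C_BIAS, ...

static const int table[8] = {10, 11, 12, 13, 14, 15, 16, 17};

static int read_entry()
{
	int out;
	// [idx]"i"(...) passes a compile-time constant; %c[idx] expands to the bare
	// number 12 (no '$' prefix), so it can be used directly as a displacement,
	// just as the patch splices C_BIAS/Y_COEFF/... into [edx+...].
	__asm__ ("movl %c[idx](%[base]), %[out]"
	         : [out] "=r" (out)
	         : [base] "r" (table), [idx] "i" (ELEM * 4)
	         : "memory");                       // asm reads memory the compiler can't see
	return out;                                 // table[3] == 13
}

int main()
{
	std::printf("%d\n", read_entry());
	return 0;
}

The same "i" constraint also carries symbol addresses (yuv2rgb_temp, &sse2_tables) into the asm in the patch; that relies on the 32-bit, non-PIC build PCSX2 targets here, which is why the sketch above sticks to the constant-offset case.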