[zzogl]: asm work. The asm was totally broken, at least in release builds.

* Use the volatile keyword so gcc does not optimize the asm away...
* Use named operands (%[name]) in the asm code instead of positional %n references
* Fix the constraints on s_clut16mask. They are inputs, not outputs... (a combined sketch of these three fixes follows below)
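A minimal sketch of the three fixes together, assuming hypothetical and_mask/mask names and illustrative mask values (plain AT&T syntax, so it does not depend on the .intel_syntax switching used in the real code):

#include <stdint.h>

// static + aligned(16): internal linkage and movdqa-safe alignment
static const uint32_t mask[4] __attribute__((aligned(16))) =
	{ 0xffff0000, 0xffff0000, 0xffff0000, 0xffff0000 };

void and_mask(uint32_t* dst)   // dst must be 16-byte aligned
{
	// __volatile__ stops gcc from deleting an asm that has no outputs;
	// operands are referenced by name instead of %0/%1; the mask is
	// passed as an *input* memory operand ("m"), not an output ("=m").
	__asm__ __volatile__(
		"movdqa %[mask], %%xmm0\n\t"
		"pand   (%[dst]), %%xmm0\n\t"
		"movdqa %%xmm0, (%[dst])\n\t"
		:
		: [dst] "r" (dst), [mask] "m" (*mask)
		: "xmm0", "memory");
}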

Arcum, can you look at these 2 things? Thanks.
-> The code is still broken in one place: s_clut16mask & s_clut16mask2 are null in the code generated by gcc! To fix it (I do not know why), we can declare them as static, but I'm not sure about the impact and I cannot test Windows...
-> s_clut16mask is declared as a 256-bit number instead of 128!
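For reference, the 128-bit fix combined with the static workaround could look like the sketch below; the mask values are assumed for illustration, not taken from this commit:

// 4 x 32-bit elements = 128 bits, matching what movdqa actually loads;
// static gives internal linkage so the symbols survive in release builds
static const __attribute__((aligned(16))) uint32_t s_clut16mask[4] =
	{ 0xffff0000, 0xffff0000, 0xffff0000, 0xffff0000 };
static const __attribute__((aligned(16))) uint32_t s_clut16mask2[4] =
	{ 0x0000ffff, 0x0000ffff, 0x0000ffff, 0x0000ffff };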


git-svn-id: http://pcsx2.googlecode.com/svn/trunk@3494 96395faa-99c1-11dd-bbfe-3dabce05a288
gregory.hainaut 2010-07-15 16:32:59 +00:00
parent 202f09bf43
commit 4a23585a55
1 changed file with 48 additions and 48 deletions

@@ -433,11 +433,11 @@ WriteUnaligned:
 End:
 }
 #else
-__asm__(".intel_syntax noprefix\n"
-"movdqa xmm0, xmmword ptr [ecx]\n"
-"movdqa xmm1, xmmword ptr [ecx+16]\n"
-"movdqa xmm2, xmmword ptr [ecx+32]\n"
-"movdqa xmm3, xmmword ptr [ecx+48]\n"
+__asm__ __volatile__(".intel_syntax noprefix\n"
+"movdqa xmm0, xmmword ptr [%[vm]]\n"
+"movdqa xmm1, xmmword ptr [%[vm]+16]\n"
+"movdqa xmm2, xmmword ptr [%[vm]+32]\n"
+"movdqa xmm3, xmmword ptr [%[vm]+48]\n"
 // rearrange
 "pshuflw xmm0, xmm0, 0x88\n"
@@ -457,14 +457,14 @@ End:
 "pxor xmm6, xmm6\n"
-"test edx, 15\n"
+"test %[clut], 15\n"
 "jnz WriteUnaligned\n"
-"movdqa xmm7, [s_clut16mask]\n" // saves upper 16 bits
+"movdqa xmm7, s_clut16mask\n" // saves upper 16 bits
 // have to save interlaced with the old data
-"movdqa xmm4, [edx]\n"
-"movdqa xmm5, [edx+32]\n"
+"movdqa xmm4, [%[clut]]\n"
+"movdqa xmm5, [%[clut]+32]\n"
 "movhlps xmm1, xmm0\n"
 "movlhps xmm0, xmm2\n"// lower 8 colors
@@ -483,29 +483,29 @@ End:
 "punpckhwd xmm2, xmm6\n"
 "punpckhwd xmm3, xmm6\n"
-"movdqa [edx], xmm0\n"
-"movdqa [edx+32], xmm1\n"
+"movdqa [%[clut]], xmm0\n"
+"movdqa [%[clut]+32], xmm1\n"
 "movdqa xmm5, xmm7\n"
-"pand xmm7, [edx+16]\n"
-"pand xmm5, [edx+48]\n"
+"pand xmm7, [%[clut]+16]\n"
+"pand xmm5, [%[clut]+48]\n"
 "por xmm2, xmm7\n"
 "por xmm3, xmm5\n"
-"movdqa [edx+16], xmm2\n"
-"movdqa [edx+48], xmm3\n"
+"movdqa [%[clut]+16], xmm2\n"
+"movdqa [%[clut]+48], xmm3\n"
 "jmp WriteCLUT_T16_I4_CSM1_End\n"
 "WriteUnaligned:\n"
-// %edx is offset by 2
-"sub edx, 2\n"
+// %[clut] is offset by 2
+"sub %[clut], 2\n"
-"movdqa xmm7, [[s_clut16mask2]]\n" // saves lower 16 bits
+"movdqa xmm7, s_clut16mask2\n" // saves lower 16 bits
 // have to save interlaced with the old data
-"movdqa xmm4, [edx]\n"
-"movdqa xmm5, [edx+32]\n"
+"movdqa xmm4, [%[clut]]\n"
+"movdqa xmm5, [%[clut]+32]\n"
 "movhlps xmm1, xmm0\n"
 "movlhps xmm0, xmm2\n" // lower 8 colors
@@ -528,24 +528,24 @@ End:
 "pslld xmm2, 16\n"
 "pslld xmm3, 16\n"
-"movdqa [edx], xmm0\n"
-"movdqa [edx+32], xmm1\n"
+"movdqa [%[clut]], xmm0\n"
+"movdqa [%[clut]+32], xmm1\n"
 "movdqa xmm5, xmm7\n"
-"pand xmm7, [edx+16]\n"
-"pand xmm5, [edx+48]\n"
+"pand xmm7, [%[clut]+16]\n"
+"pand xmm5, [%[clut]+48]\n"
 "por xmm2, xmm7\n"
 "por xmm3, xmm5\n"
-"movdqa [edx+16], xmm2\n"
-"movdqa [edx+48], xmm3\n"
+"movdqa [%[clut]+16], xmm2\n"
+"movdqa [%[clut]+48], xmm3\n"
 "WriteCLUT_T16_I4_CSM1_End:\n"
 "\n"
 ".att_syntax\n"
-: [s_clut16mask] "=m" (s_clut16mask), [s_clut16mask2] "=m" (s_clut16mask2)
-: "c" (vm), "d" (clut)
-: "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+:
+: [vm] "r" (vm), [clut] "r" (clut), [s_clut16mask] "m" (*s_clut16mask), [s_clut16mask2] "m" (*s_clut16mask2)
+: "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "memory"
 );
 #endif // _MSC_VER
 }
@@ -718,15 +718,15 @@ Z16Loop:
 }
 #else // _MSC_VER
-__asm__(".intel_syntax\n"
+__asm__ __volatile__(".intel_syntax\n"
 "pxor %%xmm7, %%xmm7\n"
 "Z16Loop:\n"
 // unpack 64 bytes at a time
-"movdqa %%xmm0, [%0]\n"
-"movdqa %%xmm2, [%0+16]\n"
-"movdqa %%xmm4, [%0+32]\n"
-"movdqa %%xmm6, [%0+48]\n"
+"movdqa %%xmm0, [%[src]]\n"
+"movdqa %%xmm2, [%[src]+16]\n"
+"movdqa %%xmm4, [%[src]+32]\n"
+"movdqa %%xmm6, [%[src]+48]\n"
 "movdqa %%xmm1, %%xmm0\n"
 "movdqa %%xmm3, %%xmm2\n"
@@ -738,35 +738,35 @@ Z16Loop:
 "punpckhwd %%xmm3, %%xmm7\n"
 // start saving
-"movdqa [%1], %%xmm0\n"
-"movdqa [%1+16], %%xmm1\n"
+"movdqa [%[dst]], %%xmm0\n"
+"movdqa [%[dst]+16], %%xmm1\n"
 "punpcklwd %%xmm4, %%xmm7\n"
 "punpckhwd %%xmm5, %%xmm7\n"
-"movdqa [%1+32], %%xmm2\n"
-"movdqa [%1+48], %%xmm3\n"
+"movdqa [%[dst]+32], %%xmm2\n"
+"movdqa [%[dst]+48], %%xmm3\n"
 "movdqa %%xmm0, %%xmm6\n"
 "punpcklwd %%xmm6, %%xmm7\n"
-"movdqa [%1+64], %%xmm4\n"
-"movdqa [%1+80], %%xmm5\n"
+"movdqa [%[dst]+64], %%xmm4\n"
+"movdqa [%[dst]+80], %%xmm5\n"
 "punpckhwd %%xmm0, %%xmm7\n"
-"movdqa [%1+96], %%xmm6\n"
-"movdqa [%1+112], %%xmm0\n"
+"movdqa [%[dst]+96], %%xmm6\n"
+"movdqa [%[dst]+112], %%xmm0\n"
-"add %0, 64\n"
-"add %1, 128\n"
-"sub %2, 1\n"
+"add %[src], 64\n"
+"add %[dst], 128\n"
+"sub %[iters], 1\n"
 "jne Z16Loop\n"
 ".att_syntax\n"
-: "=r"(src), "=r"(dst), "=r"(iters)
-: "0"(src), "1"(dst), "2"(iters)
-: "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+: "=&r"(src), "=&r"(dst), "=&r"(iters)
+: [src] "0"(src), [dst] "1"(dst), [iters] "2"(iters)
+: "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "memory"
 );
 #endif // _MSC_VER
 }