mirror of https://github.com/PCSX2/pcsx2.git
GregMiscellaneous: zzogl-pg:
* port ASM GAS to intrinsic. git-svn-id: http://pcsx2.googlecode.com/svn/branches/GregMiscellaneous@3820 96395faa-99c1-11dd-bbfe-3dabce05a288
This commit is contained in:
parent
709a79f513
commit
6e1b11c2b5
|
@ -26,6 +26,8 @@
|
||||||
#include "zerogs.h"
|
#include "zerogs.h"
|
||||||
#include "targets.h"
|
#include "targets.h"
|
||||||
#include "ZZoglShaders.h"
|
#include "ZZoglShaders.h"
|
||||||
|
#include <emmintrin.h>
|
||||||
|
#include <xmmintrin.h>
|
||||||
|
|
||||||
#define RHA
|
#define RHA
|
||||||
//#define RW
|
//#define RW
|
||||||
|
@ -3042,6 +3044,7 @@ __forceinline void update_4pixels_sse2(u32* src, Tdst* basepage, u32 i_msk, u32
|
||||||
Tdst* dst_tmp;
|
Tdst* dst_tmp;
|
||||||
__aligned16 u32 src_tmp[4];
|
__aligned16 u32 src_tmp[4];
|
||||||
u32* base_ptr;
|
u32* base_ptr;
|
||||||
|
__m128i pixels;
|
||||||
|
|
||||||
// Very slow: memory -> register -> memory -> xmm ...
|
// Very slow: memory -> register -> memory -> xmm ...
|
||||||
// Intel SSE4.1 supports an instruction that loads memory into part of an xmm register
|
// Intel SSE4.1 supports an instruction that loads memory into part of an xmm register
|
||||||
|
@ -3051,132 +3054,58 @@ __forceinline void update_4pixels_sse2(u32* src, Tdst* basepage, u32 i_msk, u32
|
||||||
src_tmp[2] = src[RW((j<<6)+INDEX+2)];
|
src_tmp[2] = src[RW((j<<6)+INDEX+2)];
|
||||||
src_tmp[3] = src[RW((j<<6)+INDEX+3)];
|
src_tmp[3] = src[RW((j<<6)+INDEX+3)];
|
||||||
#endif
|
#endif
|
||||||
// NOTE: maybe look at g++ instrinsic. (maybe it could be compatible with the window compiler)
|
|
||||||
if (AA.x == 2) {
|
if (AA.x == 2) {
|
||||||
// Note: pixels (32bits) are stored like that:
|
// Note: pixels (32bits) are stored like that:
|
||||||
// p0 p0 p0 p0 p1 p1 p1 p1
|
// p0 p0 p0 p0 p1 p1 p1 p1
|
||||||
// p2 p2 p2 p2 p3 p3 p3 p3
|
// p2 p2 p2 p2 p3 p3 p3 p3
|
||||||
base_ptr = &src[(((j<<6)+INDEX)<<2)];
|
base_ptr = &src[(((j<<6)+INDEX)<<2)];
|
||||||
#ifdef __LINUX__
|
__m128i pixel_low = _mm_cvtsi32_si128(*(u64*)(base_ptr+3));
|
||||||
__asm__ __volatile
|
__m128i pixel_high = _mm_cvtsi32_si128(*(u64*)(base_ptr+11));
|
||||||
(
|
pixels = _mm_unpacklo_epi64(pixel_low, pixel_high);
|
||||||
".intel_syntax noprefix\n"
|
|
||||||
|
|
||||||
"movq xmm0, [%[base_ptr]+12]\n" // read p0 p1
|
|
||||||
"movq xmm1, [%[base_ptr]+44]\n" // read p2 p3
|
|
||||||
"punpcklqdq xmm0, xmm1\n"
|
|
||||||
|
|
||||||
".att_syntax\n"
|
|
||||||
:
|
|
||||||
: [base_ptr]"r"(base_ptr)
|
|
||||||
: "xmm0", "xmm1"
|
|
||||||
);
|
|
||||||
#endif
|
|
||||||
} else if(AA.x ==1) {
|
} else if(AA.x ==1) {
|
||||||
|
// Note: pixels (32bits) are stored like that:
|
||||||
|
// p0 p0 p1 p1 p2 p2 p3 p3
|
||||||
base_ptr = &src[(((j<<6)+INDEX)<<1)];
|
base_ptr = &src[(((j<<6)+INDEX)<<1)];
|
||||||
#ifdef __LINUX__
|
__m128i pixel_low = _mm_cvtsi32_si128(*(u64*)(base_ptr+1));
|
||||||
__asm__ __volatile
|
__m128i pixel_high = _mm_cvtsi32_si128(*(u64*)(base_ptr+5));
|
||||||
(
|
pixels = _mm_unpacklo_epi64(pixel_low, pixel_high);
|
||||||
".intel_syntax noprefix\n"
|
|
||||||
|
|
||||||
"movq xmm0, [%[base_ptr]+4]\n" // read p0 p1
|
|
||||||
"movq xmm1, [%[base_ptr]+20]\n" // read p2 p3
|
|
||||||
"punpcklqdq xmm0, xmm1\n"
|
|
||||||
|
|
||||||
".att_syntax\n"
|
|
||||||
:
|
|
||||||
: [base_ptr]"r"(base_ptr)
|
|
||||||
: "xmm0", "xmm1"
|
|
||||||
);
|
|
||||||
#endif
|
|
||||||
} else {
|
} else {
|
||||||
base_ptr = &src[((j<<6)+INDEX)];
|
base_ptr = &src[((j<<6)+INDEX)];
|
||||||
#ifdef __LINUX__
|
pixels = _mm_loadu_si128((__m128i*)base_ptr);
|
||||||
__asm__ __volatile
|
|
||||||
(
|
|
||||||
".intel_syntax noprefix\n"
|
|
||||||
|
|
||||||
"movdqu xmm0, [%[base_ptr]]\n"
|
|
||||||
|
|
||||||
".att_syntax\n"
|
|
||||||
:
|
|
||||||
: [base_ptr]"r"(base_ptr)
|
|
||||||
: "xmm0"
|
|
||||||
);
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (do_conversion) {
|
if (do_conversion) {
|
||||||
// transform pixel from ARGB:8888 to ARGB:1555
|
// transform pixel from ARGB:8888 to ARGB:1555
|
||||||
// It also does the fbm pixel mask
|
// It also does the fbm pixel mask
|
||||||
#ifdef __LINUX__
|
// Filter component of each pixel
|
||||||
__asm__ __volatile__
|
__m128i pixel_A = _mm_and_si128(pixels, _mm_load_si128((__m128i*)pixel_Amask));
|
||||||
(
|
__m128i pixel_R = _mm_and_si128(pixels, _mm_load_si128((__m128i*)pixel_Rmask));
|
||||||
".intel_syntax noprefix\n"
|
__m128i pixel_G = _mm_and_si128(pixels, _mm_load_si128((__m128i*)pixel_Gmask));
|
||||||
|
__m128i pixel_B = _mm_and_si128(pixels, _mm_load_si128((__m128i*)pixel_Bmask));
|
||||||
|
|
||||||
// The loading of the register is done above
|
// shift the value
|
||||||
// "movdqa xmm0, [%[src_tmp]]\n" // load 4 pixel
|
pixel_A = _mm_srli_si128(pixel_A, 15);
|
||||||
"movdqa xmm1, xmm0\n"
|
pixel_R = _mm_srli_si128(pixel_R, 9);
|
||||||
"movdqa xmm2, xmm0\n"
|
pixel_G = _mm_srli_si128(pixel_G, 6);
|
||||||
"movdqa xmm3, xmm0\n"
|
pixel_B = _mm_srli_si128(pixel_B, 3);
|
||||||
|
|
||||||
// keep 1 color and shift it
|
// rebuild a complete pixel
|
||||||
"pand xmm0, %[pixel_Amask]\n"
|
pixels = _mm_or_si128(pixel_A, pixel_B);
|
||||||
"psrld xmm0, 15\n"
|
pixels = _mm_or_si128(pixels, pixel_G);
|
||||||
|
pixels = _mm_or_si128(pixels, pixel_R);
|
||||||
|
|
||||||
"pand xmm1, %[pixel_Rmask]\n"
|
// apply fbm mask
|
||||||
"psrld xmm1, 9\n"
|
pixels = _mm_and_si128(pixels, _mm_load_si128((__m128i*)mask) );
|
||||||
|
|
||||||
"pand xmm2, %[pixel_Gmask]\n"
|
_mm_store_si128((__m128i*)src_tmp, pixels);
|
||||||
"psrld xmm2, 6\n"
|
|
||||||
|
|
||||||
"pand xmm3, %[pixel_Bmask]\n"
|
|
||||||
"psrld xmm3, 3\n"
|
|
||||||
|
|
||||||
// Rebuild a full 16bits pixel
|
|
||||||
"por xmm0, xmm1\n"
|
|
||||||
"por xmm0, xmm2\n"
|
|
||||||
"por xmm0, xmm3\n"
|
|
||||||
|
|
||||||
// Apply the fbm mask
|
|
||||||
"movdqa xmm1,[%[mask]]\n"
|
|
||||||
"pand xmm0, xmm1\n"
|
|
||||||
|
|
||||||
// save the result
|
|
||||||
"movdqa [%[src_tmp]], xmm0\n" // load 4 pixel
|
|
||||||
|
|
||||||
".att_syntax\n"
|
|
||||||
:
|
|
||||||
: [src_tmp]"r"(src_tmp), [mask]"r"(mask), // note: "m" need a standard type pointer (not a typedef)
|
|
||||||
[pixel_Amask]"m"(*pixel_Amask), [pixel_Rmask]"m"(*pixel_Rmask),
|
|
||||||
[pixel_Bmask]"m"(*pixel_Bmask), [pixel_Gmask]"m"(*pixel_Gmask)
|
|
||||||
: "xmm0", "xmm1", "xmm2", "xmm3", "memory"
|
|
||||||
);
|
|
||||||
#endif
|
|
||||||
} else {
|
} else {
|
||||||
#ifdef __LINUX__
|
|
||||||
// Just apply the fbm mask
|
// Just apply the fbm mask
|
||||||
// The real optimization is to reduce the register usage for dst_tmp update
|
// The real optimization is to reduce the register usage for dst_tmp update
|
||||||
// Because x86 does not have enough registers, gcc does multiple loads/stores of values
|
// Because x86 does not have enough registers, gcc does multiple loads/stores of values
|
||||||
// on the stack
|
// on the stack
|
||||||
__asm__ __volatile__
|
pixels = _mm_and_si128(pixels, _mm_loadu_si128((__m128i*)mask));
|
||||||
(
|
_mm_store_si128((__m128i*)src_tmp, pixels);
|
||||||
".intel_syntax noprefix\n"
|
|
||||||
|
|
||||||
// Note pixel are already load above
|
|
||||||
// Apply the fbm mask
|
|
||||||
"movdqa xmm1,[%[mask]]\n"
|
|
||||||
"pand xmm0, xmm1\n"
|
|
||||||
|
|
||||||
// save the result
|
|
||||||
"movdqa [%[src_tmp]], xmm0\n" // load 4 pixel
|
|
||||||
|
|
||||||
".att_syntax\n"
|
|
||||||
:
|
|
||||||
: [src_tmp]"r"(src_tmp), [mask]"r"(mask)
|
|
||||||
: "xmm0", "xmm1", "memory"
|
|
||||||
);
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Group 4 pixels to allow future SSE optimization of the convfn function
|
// Group 4 pixels to allow future SSE optimization of the convfn function
|
||||||
|
@ -3224,7 +3153,7 @@ void Resolve_32b(const void* psrc, int fbp, int fbw, int fbh, u32 fbm)
|
||||||
|
|
||||||
int maxfbh = (MEMORY_END-fbp*256) / (sizeof(Tdst) * fbw);
|
int maxfbh = (MEMORY_END-fbp*256) / (sizeof(Tdst) * fbw);
|
||||||
if( maxfbh > fbh ) maxfbh = fbh;
|
if( maxfbh > fbh ) maxfbh = fbh;
|
||||||
ZZLog::Error_Log("*** Resolve 32 to 32 bits: %dx%d", maxfbh, fbw);
|
ZZLog::Error_Log("*** Resolve 32 to 32 bits: %dx%d. Frame Mask %x", maxfbh, fbw, imask);
|
||||||
|
|
||||||
// Start the src array at the end to reduce testing in loop
|
// Start the src array at the end to reduce testing in loop
|
||||||
u32 raw_size = RH(Pitch(fbw))/sizeof(u32);
|
u32 raw_size = RH(Pitch(fbw))/sizeof(u32);
|
||||||
|
|
Loading…
Reference in New Issue