From 57cf72039269d8b88dda0b6073dd1d01b8478191 Mon Sep 17 00:00:00 2001 From: zilmar Date: Wed, 21 Oct 2015 07:49:29 +1100 Subject: [PATCH] [Glide64] Sync texture code --- Source/Glide64/TexLoad16b.h | 272 ++-- Source/Glide64/TexLoad4b.h | 2434 ++++++++--------------------------- Source/Glide64/TexLoad8b.h | 963 +++++--------- 3 files changed, 1001 insertions(+), 2668 deletions(-) diff --git a/Source/Glide64/TexLoad16b.h b/Source/Glide64/TexLoad16b.h index 53acdea91..00a2f334b 100644 --- a/Source/Glide64/TexLoad16b.h +++ b/Source/Glide64/TexLoad16b.h @@ -37,165 +37,131 @@ // //**************************************************************** -extern "C" void __declspec(naked) asmLoad16bRGBA (wxUIntPtr src, wxUIntPtr dst, int wid_64, int height, int line, int ext) +static inline void load16bRGBA(uint8_t *src, uint8_t *dst, int wid_64, int height, int line, int ext) { - _asm { - align 4 - push ebp - mov ebp,esp - push ebx - push esi - push edi + uint32_t *v6; + uint32_t *v7; + int v8; + int v9; + uint32_t v10; + uint32_t v11; + uint32_t *v12; + uint32_t *v13; + int v14; + uint32_t v15; + uint32_t v16; + int v17; + int v18; - mov esi,[src] - mov edi,[dst] - mov ecx,[height] -y_loop: - push ecx - mov ecx,[wid_64] -x_loop: - mov eax,[esi] // read both pixels - mov ebx,[esi+4] // read both pixels - bswap eax - bswap ebx - - ror ax,1 - ror bx,1 - ror eax,16 - ror ebx,16 - ror ax,1 - ror bx,1 - - mov [edi],eax - mov [edi+4],ebx - add esi,8 - add edi,8 - - dec ecx - jnz x_loop - - pop ecx - dec ecx - jz end_y_loop - push ecx - - mov eax,esi - add eax,[line] - mov esi,[src] - sub eax, esi - and eax, 0xFFF - add esi, eax - add edi,[ext] - - mov ecx,[wid_64] -x_loop_2: - mov eax,[esi+4] // read both pixels - mov ebx,[esi] // read both pixels - bswap eax - bswap ebx - - ror ax,1 - ror bx,1 - ror eax,16 - ror ebx,16 - ror ax,1 - ror bx,1 - - mov [edi],eax - mov [edi+4],ebx - add esi,8 - add edi,8 - - dec ecx - jnz x_loop_2 - - mov eax,esi - add eax,[line] - mov esi,[src] - sub eax, esi - and eax, 0xFFF - add esi, eax - add edi,[ext] - - pop ecx - dec ecx - jnz y_loop - -end_y_loop: - pop edi - pop esi - pop ebx - mov esp, ebp - pop ebp - ret - } + v6 = (uint32_t *)src; + v7 = (uint32_t *)dst; + v8 = height; + do + { + v17 = v8; + v9 = wid_64; + do + { + v10 = bswap32(*v6); + v11 = bswap32(v6[1]); + ALOWORD(v10) = __ROR__((uint16_t)v10, 1); + ALOWORD(v11) = __ROR__((uint16_t)v11, 1); + v10 = __ROR__(v10, 16); + v11 = __ROR__(v11, 16); + ALOWORD(v10) = __ROR__((uint16_t)v10, 1); + ALOWORD(v11) = __ROR__((uint16_t)v11, 1); + *v7 = v10; + v7[1] = v11; + v6 += 2; + v7 += 2; + --v9; + } + while ( v9 ); + if ( v17 == 1 ) + break; + v18 = v17 - 1; + v12 = (uint32_t *)&src[(line + (uintptr_t)v6 - (uintptr_t)src) & 0xFFF]; + v13 = (uint32_t *)((char *)v7 + ext); + v14 = wid_64; + do + { + v15 = bswap32(v12[1]); + v16 = bswap32(*v12); + ALOWORD(v15) = __ROR__((uint16_t)v15, 1); + ALOWORD(v16) = __ROR__((uint16_t)v16, 1); + v15 = __ROR__(v15, 16); + v16 = __ROR__(v16, 16); + ALOWORD(v15) = __ROR__((uint16_t)v15, 1); + ALOWORD(v16) = __ROR__((uint16_t)v16, 1); + *v13 = v15; + v13[1] = v16; + v12 += 2; + v13 += 2; + --v14; + } + while ( v14 ); + v6 = (uint32_t *)&src[(line + (uintptr_t)v12 - (uintptr_t)src) & 0xFFF]; + v7 = (uint32_t *)((char *)v13 + ext); + v8 = v18 - 1; + } + while ( v18 != 1 ); } -extern "C" void __declspec(naked) asmLoad16bIA (wxUIntPtr src, wxUIntPtr dst, int wid_64, int height, int line, int ext) +static inline void load16bIA(uint8_t *src, uint8_t *dst, int wid_64, int height, int line, int ext) { - _asm { - ALIGN 4 + uint32_t *v6; + uint32_t *v7; + int v8; + int v9; + uint32_t v10; + uint32_t *v11; + uint32_t *v12; + int v13; + uint32_t v14; + int v15; + int v16; - push ebp - mov ebp, esp - push ebx - push esi - push edi - - mov esi,[src] - mov edi,[dst] - mov ecx,[height] -y_loop: - push ecx - mov ecx,[wid_64] -x_loop: - mov eax,[esi] // read both pixels - mov ebx,[esi+4] // read both pixels - mov [edi],eax - mov [edi+4],ebx - add esi,8 - add edi,8 - - dec ecx - jnz x_loop - - pop ecx - dec ecx - jz end_y_loop - push ecx - - add esi,[line] - add edi,[ext] - - mov ecx,[wid_64] -x_loop_2: - mov eax,[esi+4] // read both pixels - mov ebx,[esi] // read both pixels - mov [edi],eax - mov [edi+4],ebx - add esi,8 - add edi,8 - - dec ecx - jnz x_loop_2 - - add esi,[line] - add edi,[ext] - - pop ecx - dec ecx - jnz y_loop - -end_y_loop: - pop edi - pop esi - pop ebx - mov esp, ebp - pop ebp - ret - } + v6 = (uint32_t *)src; + v7 = (uint32_t *)dst; + v8 = height; + do + { + v15 = v8; + v9 = wid_64; + do + { + v10 = v6[1]; + *v7 = *v6; + v7[1] = v10; + v6 += 2; + v7 += 2; + --v9; + } + while ( v9 ); + if ( v15 == 1 ) + break; + v16 = v15 - 1; + v11 = (uint32_t *)((char *)v6 + line); + v12 = (uint32_t *)((char *)v7 + ext); + v13 = wid_64; + do + { + v14 = *v11; + *v12 = v11[1]; + v12[1] = v14; + v11 += 2; + v12 += 2; + --v13; + } + while ( v13 ); + v6 = (uint32_t *)((char *)v11 + line); + v7 = (uint32_t *)((char *)v12 + ext); + v8 = v16 - 1; + } + while ( v16 != 1 ); } + //**************************************************************** // Size: 2, Format: 0 // @@ -206,7 +172,7 @@ wxUint32 Load16bRGBA (wxUIntPtr dst, wxUIntPtr src, int wid_64, int height, int if (height < 1) height = 1; int ext = (real_width - (wid_64 << 2)) << 1; - asmLoad16bRGBA(src, dst, wid_64, height, line, ext); + load16bRGBA((uint8_t *)src, (uint8_t *)dst, wid_64, height, line, ext); return (1 << 16) | GR_TEXFMT_ARGB_1555; } @@ -221,7 +187,7 @@ wxUint32 Load16bIA (wxUIntPtr dst, wxUIntPtr src, int wid_64, int height, int li if (height < 1) height = 1; int ext = (real_width - (wid_64 << 2)) << 1; - asmLoad16bIA(src, dst, wid_64, height, line, ext); + load16bIA((uint8_t *)src, (uint8_t *)dst, wid_64, height, line, ext); return (1 << 16) | GR_TEXFMT_ALPHA_INTENSITY_88; } diff --git a/Source/Glide64/TexLoad4b.h b/Source/Glide64/TexLoad4b.h index 3ed5438a1..9b70874b4 100644 --- a/Source/Glide64/TexLoad4b.h +++ b/Source/Glide64/TexLoad4b.h @@ -37,1897 +37,554 @@ // //**************************************************************** -extern "C" void __declspec(naked) asmLoad4bCI (wxUIntPtr src, wxUIntPtr dst, int wid_64, int height, int line, int ext, wxUIntPtr pal) +static inline void load4bCI(uint8_t *src, uint8_t *dst, int wid_64, int height, uint16_t line, int ext, uint16_t *pal) { - _asm { - push ebp - mov ebp, esp - push ebx - push esi - push edi + uint8_t *v7; + uint8_t *v8; + int v9; + int v10; + int v11; + uint32_t v12; + uint8_t *v13; + uint32_t v14; + uint32_t *v15; + uint32_t v16; + uint8_t *v17; + uint32_t *v18; + int v19; + int v20; + uint32_t v21; + uint32_t v22; + uint32_t *v23; + uint32_t v24; + int v25; + int v26; - mov ebx,[pal] - mov esi,[src] - mov edi,[dst] - mov ecx,[height] -y_loop: - push ecx - mov ecx,[wid_64] -x_loop: - push ecx - - mov eax,[esi] // read all 8 pixels - bswap eax - add esi,4 - mov edx,eax - - // 1st dword output { - shr eax,23 - and eax,0x1E - mov cx,[ebx+eax] - ror cx,1 - shl ecx,16 - - mov eax,edx - shr eax,27 - and eax,0x1E - mov cx,[ebx+eax] - ror cx,1 - - mov [edi],ecx - add edi,4 - // } - - // 2nd dword output { - mov eax,edx - shr eax,15 - and eax,0x1E - mov cx,[ebx+eax] - ror cx,1 - shl ecx,16 - - mov eax,edx - shr eax,19 - and eax,0x1E - mov cx,[ebx+eax] - ror cx,1 - - mov [edi],ecx - add edi,4 - // } - - // 3rd dword output { - mov eax,edx - shr eax,7 - and eax,0x1E - mov cx,[ebx+eax] - ror cx,1 - shl ecx,16 - - mov eax,edx - shr eax,11 - and eax,0x1E - mov cx,[ebx+eax] - ror cx,1 - - mov [edi],ecx - add edi,4 - // } - - // 4th dword output { - mov eax,edx - shl eax,1 - and eax,0x1E - mov cx,[ebx+eax] - ror cx,1 - shl ecx,16 - - shr edx,3 - and edx,0x1E - mov cx,[ebx+edx] - ror cx,1 - - mov [edi],ecx - add edi,4 - // } - - // * copy - mov eax,[esi] // read all 8 pixels - bswap eax - add esi,4 - mov edx,eax - - // 1st dword output { - shr eax,23 - and eax,0x1E - mov cx,[ebx+eax] - ror cx,1 - shl ecx,16 - - mov eax,edx - shr eax,27 - and eax,0x1E - mov cx,[ebx+eax] - ror cx,1 - - mov [edi],ecx - add edi,4 - // } - - // 2nd dword output { - mov eax,edx - shr eax,15 - and eax,0x1E - mov cx,[ebx+eax] - ror cx,1 - shl ecx,16 - - mov eax,edx - shr eax,19 - and eax,0x1E - mov cx,[ebx+eax] - ror cx,1 - - mov [edi],ecx - add edi,4 - // } - - // 3rd dword output { - mov eax,edx - shr eax,7 - and eax,0x1E - mov cx,[ebx+eax] - ror cx,1 - shl ecx,16 - - mov eax,edx - shr eax,11 - and eax,0x1E - mov cx,[ebx+eax] - ror cx,1 - - mov [edi],ecx - add edi,4 - // } - - // 4th dword output { - mov eax,edx - shl eax,1 - and eax,0x1E - mov cx,[ebx+eax] - ror cx,1 - shl ecx,16 - - shr edx,3 - and edx,0x1E - mov cx,[ebx+edx] - ror cx,1 - - mov [edi],ecx - add edi,4 - // } - // * - - pop ecx - - dec ecx - jnz x_loop - - pop ecx - dec ecx - jz near end_y_loop - push ecx - - mov eax,esi - add eax,[line] - mov esi,[src] - sub eax,esi - and eax,0x7FF - add esi,eax - add edi,[ext] - - mov ecx,[wid_64] - x_loop_2: - push ecx - - mov eax,[esi+4] // read all 8 pixels - bswap eax - mov edx,eax - - // 1st dword output { - shr eax,23 - and eax,0x1E - mov cx,[ebx+eax] - ror cx,1 - shl ecx,16 - - mov eax,edx - shr eax,27 - and eax,0x1E - mov cx,[ebx+eax] - ror cx,1 - - mov [edi],ecx - add edi,4 - // } - - // 2nd dword output { - mov eax,edx - shr eax,15 - and eax,0x1E - mov cx,[ebx+eax] - ror cx,1 - shl ecx,16 - - mov eax,edx - shr eax,19 - and eax,0x1E - mov cx,[ebx+eax] - ror cx,1 - - mov [edi],ecx - add edi,4 - // } - - // 3rd dword output { - mov eax,edx - shr eax,7 - and eax,0x1E - mov cx,[ebx+eax] - ror cx,1 - shl ecx,16 - - mov eax,edx - shr eax,11 - and eax,0x1E - mov cx,[ebx+eax] - ror cx,1 - - mov [edi],ecx - add edi,4 - // } - - // 4th dword output { - mov eax,edx - shl eax,1 - and eax,0x1E - mov cx,[ebx+eax] - ror cx,1 - shl ecx,16 - - shr edx,3 - and edx,0x1E - mov cx,[ebx+edx] - ror cx,1 - - mov [edi],ecx - add edi,4 - // } - - // * copy - mov eax,[esi] // read all 8 pixels - bswap eax - mov edx,esi - add edx,8 - mov esi,[src] - sub edx,esi - and edx,0x7FF - add esi,edx - mov edx,eax - - // 1st dword output { - shr eax,23 - and eax,0x1E - mov cx,[ebx+eax] - ror cx,1 - shl ecx,16 - - mov eax,edx - shr eax,27 - and eax,0x1E - mov cx,[ebx+eax] - ror cx,1 - - mov [edi],ecx - add edi,4 - // } - - // 2nd dword output { - mov eax,edx - shr eax,15 - and eax,0x1E - mov cx,[ebx+eax] - ror cx,1 - shl ecx,16 - - mov eax,edx - shr eax,19 - and eax,0x1E - mov cx,[ebx+eax] - ror cx,1 - - mov [edi],ecx - add edi,4 - // } - - // 3rd dword output { - mov eax,edx - shr eax,7 - and eax,0x1E - mov cx,[ebx+eax] - ror cx,1 - shl ecx,16 - - mov eax,edx - shr eax,11 - and eax,0x1E - mov cx,[ebx+eax] - ror cx,1 - - mov [edi],ecx - add edi,4 - // } - - // 4th dword output { - mov eax,edx - shl eax,1 - and eax,0x1E - mov cx,[ebx+eax] - ror cx,1 - shl ecx,16 - - shr edx,3 - and edx,0x1E - mov cx,[ebx+edx] - ror cx,1 - - mov [edi],ecx - add edi,4 - // } - // * - - pop ecx - - dec ecx - jnz x_loop_2 - - mov eax,esi - add eax,[line] - mov esi,[src] - sub eax,esi - and eax,0x7FF - add esi,eax - add edi,[ext] - - pop ecx - dec ecx - jnz y_loop - -end_y_loop: - pop edi - pop esi - pop ebx - mov esp, ebp - pop ebp - ret - } + v7 = src; + v8 = dst; + v9 = height; + do + { + v25 = v9; + v10 = wid_64; + do + { + v11 = v10; + v12 = bswap32(*(uint32_t *)v7); + v13 = v7 + 4; + ALOWORD(v10) = __ROR__(*(uint16_t *)((char *)pal + ((v12 >> 23) & 0x1E)), 1); + v14 = v10 << 16; + ALOWORD(v14) = __ROR__(*(uint16_t *)((char *)pal + ((v12 >> 27) & 0x1E)), 1); + *(uint32_t *)v8 = v14; + v15 = (uint32_t *)(v8 + 4); + ALOWORD(v14) = __ROR__(*(uint16_t *)((char *)pal + ((v12 >> 15) & 0x1E)), 1); + v14 <<= 16; + ALOWORD(v14) = __ROR__(*(uint16_t *)((char *)pal + ((v12 >> 19) & 0x1E)), 1); + *v15 = v14; + ++v15; + ALOWORD(v14) = __ROR__(*(uint16_t *)((char *)pal + ((v12 >> 7) & 0x1E)), 1); + v14 <<= 16; + ALOWORD(v14) = __ROR__(*(uint16_t *)((char *)pal + ((v12 >> 11) & 0x1E)), 1); + *v15 = v14; + ++v15; + ALOWORD(v14) = __ROR__(*(uint16_t *)((char *)pal + (2 * (uint8_t)v12 & 0x1E)), 1); + v14 <<= 16; + ALOWORD(v14) = __ROR__(*(uint16_t *)((char *)pal + ((v12 >> 3) & 0x1E)), 1); + *v15 = v14; + ++v15; + v16 = bswap32(*(uint32_t *)v13); + v7 = v13 + 4; + ALOWORD(v14) = __ROR__(*(uint16_t *)((char *)pal + ((v16 >> 23) & 0x1E)), 1); + v14 <<= 16; + ALOWORD(v14) = __ROR__(*(uint16_t *)((char *)pal + ((v16 >> 27) & 0x1E)), 1); + *v15 = v14; + ++v15; + ALOWORD(v14) = __ROR__(*(uint16_t *)((char *)pal + ((v16 >> 15) & 0x1E)), 1); + v14 <<= 16; + ALOWORD(v14) = __ROR__(*(uint16_t *)((char *)pal + ((v16 >> 19) & 0x1E)), 1); + *v15 = v14; + ++v15; + ALOWORD(v14) = __ROR__(*(uint16_t *)((char *)pal + ((v16 >> 7) & 0x1E)), 1); + v14 <<= 16; + ALOWORD(v14) = __ROR__(*(uint16_t *)((char *)pal + ((v16 >> 11) & 0x1E)), 1); + *v15 = v14; + ++v15; + ALOWORD(v14) = __ROR__(*(uint16_t *)((char *)pal + (2 * (uint8_t)v16 & 0x1E)), 1); + v14 <<= 16; + ALOWORD(v14) = __ROR__(*(uint16_t *)((char *)pal + ((v16 >> 3) & 0x1E)), 1); + *v15 = v14; + v8 = (uint8_t *)(v15 + 1); + v10 = v11 - 1; + } + while ( v11 != 1 ); + if ( v25 == 1 ) + break; + v26 = v25 - 1; + v17 = &src[(line + (uintptr_t)v7 - (uintptr_t)src) & 0x7FF]; + v18 = (uint32_t *)&v8[ext]; + v19 = wid_64; + do + { + v20 = v19; + v21 = bswap32(*((uint32_t *)v17 + 1)); + ALOWORD(v19) = __ROR__(*(uint16_t *)((char *)pal + ((v21 >> 23) & 0x1E)), 1); + v22 = v19 << 16; + ALOWORD(v22) = __ROR__(*(uint16_t *)((char *)pal + ((v21 >> 27) & 0x1E)), 1); + *v18 = v22; + v23 = v18 + 1; + ALOWORD(v22) = __ROR__(*(uint16_t *)((char *)pal + ((v21 >> 15) & 0x1E)), 1); + v22 <<= 16; + ALOWORD(v22) = __ROR__(*(uint16_t *)((char *)pal + ((v21 >> 19) & 0x1E)), 1); + *v23 = v22; + ++v23; + ALOWORD(v22) = __ROR__(*(uint16_t *)((char *)pal + ((v21 >> 7) & 0x1E)), 1); + v22 <<= 16; + ALOWORD(v22) = __ROR__(*(uint16_t *)((char *)pal + ((v21 >> 11) & 0x1E)), 1); + *v23 = v22; + ++v23; + ALOWORD(v22) = __ROR__(*(uint16_t *)((char *)pal + (2 * (uint8_t)v21 & 0x1E)), 1); + v22 <<= 16; + ALOWORD(v22) = __ROR__(*(uint16_t *)((char *)pal + ((v21 >> 3) & 0x1E)), 1); + *v23 = v22; + ++v23; + v24 = bswap32(*(uint32_t *)v17); + v17 = &src[((uintptr_t)v17 + 8 - (uintptr_t)src) & 0x7FF]; + ALOWORD(v22) = __ROR__(*(uint16_t *)((char *)pal + ((v24 >> 23) & 0x1E)), 1); + v22 <<= 16; + ALOWORD(v22) = __ROR__(*(uint16_t *)((char *)pal + ((v24 >> 27) & 0x1E)), 1); + *v23 = v22; + ++v23; + ALOWORD(v22) = __ROR__(*(uint16_t *)((char *)pal + ((v24 >> 15) & 0x1E)), 1); + v22 <<= 16; + ALOWORD(v22) = __ROR__(*(uint16_t *)((char *)pal + ((v24 >> 19) & 0x1E)), 1); + *v23 = v22; + ++v23; + ALOWORD(v22) = __ROR__(*(uint16_t *)((char *)pal + ((v24 >> 7) & 0x1E)), 1); + v22 <<= 16; + ALOWORD(v22) = __ROR__(*(uint16_t *)((char *)pal + ((v24 >> 11) & 0x1E)), 1); + *v23 = v22; + ++v23; + ALOWORD(v22) = __ROR__(*(uint16_t *)((char *)pal + (2 * (uint8_t)v24 & 0x1E)), 1); + v22 <<= 16; + ALOWORD(v22) = __ROR__(*(uint16_t *)((char *)pal + ((v24 >> 3) & 0x1E)), 1); + *v23 = v22; + v18 = v23 + 1; + v19 = v20 - 1; + } + while ( v20 != 1 ); + v7 = &src[(line + (uintptr_t)v17 - (uintptr_t)src) & 0x7FF]; + v8 = (uint8_t *)((char *)v18 + ext); + v9 = v26 - 1; + } + while ( v26 != 1 ); } -extern "C" void __declspec(naked) asmLoad4bIAPal (wxUIntPtr src, wxUIntPtr dst, int wid_64, int height, int line, int ext, wxUIntPtr pal) +static inline void load4bIAPal(uint8_t *src, uint8_t *dst, int wid_64, int height, int line, int ext, uint16_t *pal) { - _asm { - push ebp - mov ebp, esp - push ebx - push esi - push edi + uint8_t *v7; + uint32_t *v8; + int v9; + int v10; + int v11; + uint32_t v12; + uint32_t *v13; + uint32_t v14; + uint32_t *v15; + uint32_t v16; + uint8_t *v17; + uint32_t *v18; + int v19; + int v20; + uint32_t v21; + uint32_t v22; + uint32_t *v23; + uint32_t v24; + int v25; + int v26; - mov ebx,[pal] - mov esi,[src] - mov edi,[dst] - mov ecx,[height] -y_loop: - push ecx - mov ecx,[wid_64] -x_loop: - push ecx - - mov eax,[esi] // read all 8 pixels - bswap eax - add esi,4 - mov edx,eax - - // 1st dword output { - shr eax,23 - and eax,0x1E - mov cx,[ebx+eax] - ror cx,8 - shl ecx,16 - - mov eax,edx - shr eax,27 - and eax,0x1E - mov cx,[ebx+eax] - ror cx,8 - - mov [edi],ecx - add edi,4 - // } - - // 2nd dword output { - mov eax,edx - shr eax,15 - and eax,0x1E - mov cx,[ebx+eax] - ror cx,8 - shl ecx,16 - - mov eax,edx - shr eax,19 - and eax,0x1E - mov cx,[ebx+eax] - ror cx,8 - - mov [edi],ecx - add edi,4 - // } - - // 3rd dword output { - mov eax,edx - shr eax,7 - and eax,0x1E - mov cx,[ebx+eax] - ror cx,8 - shl ecx,16 - - mov eax,edx - shr eax,11 - and eax,0x1E - mov cx,[ebx+eax] - ror cx,8 - - mov [edi],ecx - add edi,4 - // } - - // 4th dword output { - mov eax,edx - shl eax,1 - and eax,0x1E - mov cx,[ebx+eax] - ror cx,8 - shl ecx,16 - - shr edx,3 - and edx,0x1E - mov cx,[ebx+edx] - ror cx,8 - - mov [edi],ecx - add edi,4 - // } - - // * copy - mov eax,[esi] // read all 8 pixels - bswap eax - add esi,4 - mov edx,eax - - // 1st dword output { - shr eax,23 - and eax,0x1E - mov cx,[ebx+eax] - ror cx,8 - shl ecx,16 - - mov eax,edx - shr eax,27 - and eax,0x1E - mov cx,[ebx+eax] - ror cx,8 - - mov [edi],ecx - add edi,4 - // } - - // 2nd dword output { - mov eax,edx - shr eax,15 - and eax,0x1E - mov cx,[ebx+eax] - ror cx,8 - shl ecx,16 - - mov eax,edx - shr eax,19 - and eax,0x1E - mov cx,[ebx+eax] - ror cx,8 - - mov [edi],ecx - add edi,4 - // } - - // 3rd dword output { - mov eax,edx - shr eax,7 - and eax,0x1E - mov cx,[ebx+eax] - ror cx,8 - shl ecx,16 - - mov eax,edx - shr eax,11 - and eax,0x1E - mov cx,[ebx+eax] - ror cx,8 - - mov [edi],ecx - add edi,4 - // } - - // 4th dword output { - mov eax,edx - shl eax,1 - and eax,0x1E - mov cx,[ebx+eax] - ror cx,8 - shl ecx,16 - - shr edx,3 - and edx,0x1E - mov cx,[ebx+edx] - ror cx,8 - - mov [edi],ecx - add edi,4 - // } - // * - - pop ecx - - dec ecx - jnz x_loop - - pop ecx - dec ecx - jz near end_y_loop - push ecx - - mov eax,esi - add eax,[line] - mov esi,[src] - sub eax,esi - and eax,0x7FF - add esi,eax - add edi,[ext] - - mov ecx,[wid_64] -x_loop_2: - push ecx - - mov eax,[esi+4] // read all 8 pixels - bswap eax - mov edx,eax - - // 1st dword output { - shr eax,23 - and eax,0x1E - mov cx,[ebx+eax] - ror cx,8 - shl ecx,16 - - mov eax,edx - shr eax,27 - and eax,0x1E - mov cx,[ebx+eax] - ror cx,8 - - mov [edi],ecx - add edi,4 - // } - - // 2nd dword output { - mov eax,edx - shr eax,15 - and eax,0x1E - mov cx,[ebx+eax] - ror cx,8 - shl ecx,16 - - mov eax,edx - shr eax,19 - and eax,0x1E - mov cx,[ebx+eax] - ror cx,8 - - mov [edi],ecx - add edi,4 - // } - - // 3rd dword output { - mov eax,edx - shr eax,7 - and eax,0x1E - mov cx,[ebx+eax] - ror cx,8 - shl ecx,16 - - mov eax,edx - shr eax,11 - and eax,0x1E - mov cx,[ebx+eax] - ror cx,8 - - mov [edi],ecx - add edi,4 - // } - - // 4th dword output { - mov eax,edx - shl eax,1 - and eax,0x1E - mov cx,[ebx+eax] - ror cx,8 - shl ecx,16 - - shr edx,3 - and edx,0x1E - mov cx,[ebx+edx] - ror cx,8 - - mov [edi],ecx - add edi,4 - // } - - // * copy - mov eax,[esi] // read all 8 pixels - bswap eax - mov edx,esi - add edx,8 - mov esi,[src] - sub edx,esi - and edx,0x7FF - add esi,edx - mov edx,eax - - // 1st dword output { - shr eax,23 - and eax,0x1E - mov cx,[ebx+eax] - ror cx,8 - shl ecx,16 - - mov eax,edx - shr eax,27 - and eax,0x1E - mov cx,[ebx+eax] - ror cx,8 - - mov [edi],ecx - add edi,4 - // } - - // 2nd dword output { - mov eax,edx - shr eax,15 - and eax,0x1E - mov cx,[ebx+eax] - ror cx,8 - shl ecx,16 - - mov eax,edx - shr eax,19 - and eax,0x1E - mov cx,[ebx+eax] - ror cx,8 - - mov [edi],ecx - add edi,4 - // } - - // 3rd dword output { - mov eax,edx - shr eax,7 - and eax,0x1E - mov cx,[ebx+eax] - ror cx,8 - shl ecx,16 - - mov eax,edx - shr eax,11 - and eax,0x1E - mov cx,[ebx+eax] - ror cx,8 - - mov [edi],ecx - add edi,4 - // } - - // 4th dword output { - mov eax,edx - shl eax,1 - and eax,0x1E - mov cx,[ebx+eax] - ror cx,8 - shl ecx,16 - - shr edx,3 - and edx,0x1E - mov cx,[ebx+edx] - ror cx,8 - - mov [edi],ecx - add edi,4 - // } - // * - - pop ecx - - dec ecx - jnz x_loop_2 - - mov eax,esi - add eax,[line] - mov esi,[src] - sub eax,esi - and eax,0x7FF - add esi,eax - add edi,[ext] - - pop ecx - dec ecx - jnz y_loop - -end_y_loop: - pop edi - pop esi - pop ebx - mov esp, ebp - pop ebp - ret - } + v7 = src; + v8 = (uint32_t *)dst; + v9 = height; + do + { + v25 = v9; + v10 = wid_64; + do + { + v11 = v10; + v12 = bswap32(*(uint32_t *)v7); + v13 = (uint32_t *)(v7 + 4); + ALOWORD(v10) = __ROR__(*(uint16_t *)((char *)pal + ((v12 >> 23) & 0x1E)), 8); + v14 = v10 << 16; + ALOWORD(v14) = __ROR__(*(uint16_t *)((char *)pal + ((v12 >> 27) & 0x1E)), 8); + *v8 = v14; + v15 = v8 + 1; + ALOWORD(v14) = __ROR__(*(uint16_t *)((char *)pal + ((v12 >> 15) & 0x1E)), 8); + v14 <<= 16; + ALOWORD(v14) = __ROR__(*(uint16_t *)((char *)pal + ((v12 >> 19) & 0x1E)), 8); + *v15 = v14; + ++v15; + ALOWORD(v14) = __ROR__(*(uint16_t *)((char *)pal + ((v12 >> 7) & 0x1E)), 8); + v14 <<= 16; + ALOWORD(v14) = __ROR__(*(uint16_t *)((char *)pal + ((v12 >> 11) & 0x1E)), 8); + *v15 = v14; + ++v15; + ALOWORD(v14) = __ROR__(*(uint16_t *)((char *)pal + (2 * (uint8_t)v12 & 0x1E)), 8); + v14 <<= 16; + ALOWORD(v14) = __ROR__(*(uint16_t *)((char *)pal + ((v12 >> 3) & 0x1E)), 8); + *v15 = v14; + ++v15; + v16 = bswap32(*v13); + v7 = (uint8_t *)(v13 + 1); + ALOWORD(v14) = __ROR__(*(uint16_t *)((char *)pal + ((v16 >> 23) & 0x1E)), 8); + v14 <<= 16; + ALOWORD(v14) = __ROR__(*(uint16_t *)((char *)pal + ((v16 >> 27) & 0x1E)), 8); + *v15 = v14; + ++v15; + ALOWORD(v14) = __ROR__(*(uint16_t *)((char *)pal + ((v16 >> 15) & 0x1E)), 8); + v14 <<= 16; + ALOWORD(v14) = __ROR__(*(uint16_t *)((char *)pal + ((v16 >> 19) & 0x1E)), 8); + *v15 = v14; + ++v15; + ALOWORD(v14) = __ROR__(*(uint16_t *)((char *)pal + ((v16 >> 7) & 0x1E)), 8); + v14 <<= 16; + ALOWORD(v14) = __ROR__(*(uint16_t *)((char *)pal + ((v16 >> 11) & 0x1E)), 8); + *v15 = v14; + ++v15; + ALOWORD(v14) = __ROR__(*(uint16_t *)((char *)pal + (2 * (uint8_t)v16 & 0x1E)), 8); + v14 <<= 16; + ALOWORD(v14) = __ROR__(*(uint16_t *)((char *)pal + ((v16 >> 3) & 0x1E)), 8); + *v15 = v14; + v8 = v15 + 1; + v10 = v11 - 1; + } + while ( v11 != 1 ); + if ( v25 == 1 ) + break; + v26 = v25 - 1; + v17 = &src[(line + (uintptr_t)v7 - (uintptr_t)src) & 0x7FF]; + v18 = (uint32_t *)((char *)v8 + ext); + v19 = wid_64; + do + { + v20 = v19; + v21 = bswap32(*((uint32_t *)v17 + 1)); + ALOWORD(v19) = __ROR__(*(uint16_t *)((char *)pal + ((v21 >> 23) & 0x1E)), 8); + v22 = v19 << 16; + ALOWORD(v22) = __ROR__(*(uint16_t *)((char *)pal + ((v21 >> 27) & 0x1E)), 8); + *v18 = v22; + v23 = v18 + 1; + ALOWORD(v22) = __ROR__(*(uint16_t *)((char *)pal + ((v21 >> 15) & 0x1E)), 8); + v22 <<= 16; + ALOWORD(v22) = __ROR__(*(uint16_t *)((char *)pal + ((v21 >> 19) & 0x1E)), 8); + *v23 = v22; + ++v23; + ALOWORD(v22) = __ROR__(*(uint16_t *)((char *)pal + ((v21 >> 7) & 0x1E)), 8); + v22 <<= 16; + ALOWORD(v22) = __ROR__(*(uint16_t *)((char *)pal + ((v21 >> 11) & 0x1E)), 8); + *v23 = v22; + ++v23; + ALOWORD(v22) = __ROR__(*(uint16_t *)((char *)pal + (2 * (uint8_t)v21 & 0x1E)), 8); + v22 <<= 16; + ALOWORD(v22) = __ROR__(*(uint16_t *)((char *)pal + ((v21 >> 3) & 0x1E)), 8); + *v23 = v22; + ++v23; + v24 = bswap32(*(uint32_t *)v17); + v17 = &src[((uintptr_t)v17 + 8 - (uintptr_t)src) & 0x7FF]; + ALOWORD(v22) = __ROR__(*(uint16_t *)((char *)pal + ((v24 >> 23) & 0x1E)), 8); + v22 <<= 16; + ALOWORD(v22) = __ROR__(*(uint16_t *)((char *)pal + ((v24 >> 27) & 0x1E)), 8); + *v23 = v22; + ++v23; + ALOWORD(v22) = __ROR__(*(uint16_t *)((char *)pal + ((v24 >> 15) & 0x1E)), 8); + v22 <<= 16; + ALOWORD(v22) = __ROR__(*(uint16_t *)((char *)pal + ((v24 >> 19) & 0x1E)), 8); + *v23 = v22; + ++v23; + ALOWORD(v22) = __ROR__(*(uint16_t *)((char *)pal + ((v24 >> 7) & 0x1E)), 8); + v22 <<= 16; + ALOWORD(v22) = __ROR__(*(uint16_t *)((char *)pal + ((v24 >> 11) & 0x1E)), 8); + *v23 = v22; + ++v23; + ALOWORD(v22) = __ROR__(*(uint16_t *)((char *)pal + (2 * (uint8_t)v24 & 0x1E)), 8); + v22 <<= 16; + ALOWORD(v22) = __ROR__(*(uint16_t *)((char *)pal + ((v24 >> 3) & 0x1E)), 8); + *v23 = v22; + v18 = v23 + 1; + v19 = v20 - 1; + } + while ( v20 != 1 ); + v7 = &src[(line + (uintptr_t)v17 - (uintptr_t)src) & 0x7FF]; + v8 = (uint32_t *)((char *)v18 + ext); + v9 = v26 - 1; + } + while ( v26 != 1 ); } -extern "C" void __declspec(naked) asmLoad4bIA (wxUIntPtr src, wxUIntPtr dst, int wid_64, int height, int line, int ext) +static inline void load4bIA(uint8_t *src, uint8_t *dst, int wid_64, int height, int line, int ext) { - _asm { - push ebp - mov ebp, esp - push ebx - push esi - push edi - - mov esi,[src] - mov edi,[dst] - mov ecx,[height] -y_loop: - push ecx - mov ecx,[wid_64] -x_loop: - push ecx - - mov eax,[esi] // read all 8 pixels - bswap eax - add esi,4 - mov edx,eax - - // 1st dword { - xor ecx,ecx - - // pixel #1 - // IIIAxxxxxxxxxxxxxxxxxxxxxxxxxxxx - // xxxxxxxxxxxxxxxxxxxxxxxxAAAAIIII - mov eax,edx - shr eax,24 //Alpha - and eax,0x00000010 - or ecx,eax - shl eax,1 - or ecx,eax - shl eax,1 - or ecx,eax - shl eax,1 - or ecx,eax - mov eax,edx - shr eax,28 // Intensity - and eax,0x0000000E - or ecx,eax - shr eax,3 - or ecx,eax - - // pixel #2 - // xxxxIIIAxxxxxxxxxxxxxxxxxxxxxxxx - // xxxxxxxxxxxxxxxxAAAAIIIIxxxxxxxx - mov eax,edx - shr eax,12 //Alpha - and eax,0x00001000 - or ecx,eax - shl eax,1 - or ecx,eax - shl eax,1 - or ecx,eax - shl eax,1 - or ecx,eax - mov eax,edx - shr eax,16 // Intensity - and eax,0x00000E00 - or ecx,eax - shr eax,3 - and eax,0x00000100 - or ecx,eax - - // pixel #3 - // xxxxxxxxIIIAxxxxxxxxxxxxxxxxxxxx - // xxxxxxxxAAAAIIIIxxxxxxxxxxxxxxxx - //Alpha - mov eax,edx - and eax,0x00100000 - or ecx,eax - shl eax,1 - or ecx,eax - shl eax,1 - or ecx,eax - shl eax,1 - or ecx,eax - mov eax,edx - shr eax,4 // Intensity - and eax,0x000E0000 - or ecx,eax - shr eax,3 - and eax,0x00010000 - or ecx,eax - - // pixel #4 - // xxxxxxxxxxxxIIIAxxxxxxxxxxxxxxxx - // AAAAIIIIxxxxxxxxxxxxxxxxxxxxxxxx - mov eax,edx - shl eax,12 //Alpha - and eax,0x10000000 - or ecx,eax - shl eax,1 - or ecx,eax - shl eax,1 - or ecx,eax - shl eax,1 - or ecx,eax - mov eax,edx - shl eax,8 // Intensity - and eax,0x0E000000 - or ecx,eax - shr eax,3 - and eax,0x01000000 - or ecx,eax - - - mov [edi],ecx - add edi,4 - // } - -// 2nd dword { - xor ecx,ecx - - // pixel #5 - // xxxxxxxxxxxxxxxxIIIAxxxxxxxxxxxx - // xxxxxxxxxxxxxxxxxxxxxxxxAAAAIIII - mov eax,edx - shr eax,8 //Alpha - and eax,0x00000010 - or ecx,eax - shl eax,1 - or ecx,eax - shl eax,1 - or ecx,eax - shl eax,1 - or ecx,eax - mov eax,edx - shr eax,12 // Intensity - and eax,0x0000000E - or ecx,eax - shr eax,3 - or ecx,eax - - // pixel #6 - // xxxxxxxxxxxxxxxxxxxxIIIAxxxxxxxx - // xxxxxxxxxxxxxxxxAAAAIIIIxxxxxxxx - //Alpha - mov eax,edx - shl eax,4 - and eax,0x00001000 - or ecx,eax - shl eax,1 - or ecx,eax - shl eax,1 - or ecx,eax - shl eax,1 - or ecx,eax - mov eax,edx // Intensity - and eax,0x00000E00 - or ecx,eax - shr eax,3 - and eax,0x00000100 - or ecx,eax - - // pixel #7 - // xxxxxxxxxxxxxxxxxxxxxxxxIIIAxxxx - // xxxxxxxxAAAAIIIIxxxxxxxxxxxxxxxx - //Alpha - mov eax,edx - shl eax,16 - and eax,0x00100000 - or ecx,eax - shl eax,1 - or ecx,eax - shl eax,1 - or ecx,eax - shl eax,1 - or ecx,eax - mov eax,edx - shl eax,12 // Intensity - and eax,0x000E0000 - or ecx,eax - shr eax,3 - and eax,0x00010000 - or ecx,eax - - // pixel #8 - // xxxxxxxxxxxxxxxxxxxxxxxxxxxxIIIA - // AAAAIIIIxxxxxxxxxxxxxxxxxxxxxxxx - mov eax,edx - shl eax,28 //Alpha - and eax,0x10000000 - or ecx,eax - shl eax,1 - or ecx,eax - shl eax,1 - or ecx,eax - shl eax,1 - or ecx,eax - mov eax,edx - shl eax,24 // Intensity - and eax,0x0E000000 - or ecx,eax - shr eax,3 - and eax,0x01000000 - or ecx,eax - - mov [edi],ecx - add edi,4 - // } - - // * copy - mov eax,[esi] // read all 8 pixels - bswap eax - add esi,4 - mov edx,eax - - // 1st dword { - xor ecx,ecx - - // pixel #1 - // IIIAxxxxxxxxxxxxxxxxxxxxxxxxxxxx - // xxxxxxxxxxxxxxxxxxxxxxxxAAAAIIII - mov eax,edx - shr eax,24 //Alpha - and eax,0x00000010 - or ecx,eax - shl eax,1 - or ecx,eax - shl eax,1 - or ecx,eax - shl eax,1 - or ecx,eax - mov eax,edx - shr eax,28 // Intensity - and eax,0x0000000E - or ecx,eax - shr eax,3 - or ecx,eax - - // pixel #2 - // xxxxIIIAxxxxxxxxxxxxxxxxxxxxxxxx - // xxxxxxxxxxxxxxxxAAAAIIIIxxxxxxxx - mov eax,edx - shr eax,12 //Alpha - and eax,0x00001000 - or ecx,eax - shl eax,1 - or ecx,eax - shl eax,1 - or ecx,eax - shl eax,1 - or ecx,eax - mov eax,edx - shr eax,16 // Intensity - and eax,0x00000E00 - or ecx,eax - shr eax,3 - and eax,0x00000100 - or ecx,eax - - // pixel #3 - // xxxxxxxxIIIAxxxxxxxxxxxxxxxxxxxx - // xxxxxxxxAAAAIIIIxxxxxxxxxxxxxxxx - //Alpha - mov eax,edx - and eax,0x00100000 - or ecx,eax - shl eax,1 - or ecx,eax - shl eax,1 - or ecx,eax - shl eax,1 - or ecx,eax - mov eax,edx - shr eax,4 // Intensity - and eax,0x000E0000 - or ecx,eax - shr eax,3 - and eax,0x00010000 - or ecx,eax - - // pixel #4 - // xxxxxxxxxxxxIIIAxxxxxxxxxxxxxxxx - // AAAAIIIIxxxxxxxxxxxxxxxxxxxxxxxx - mov eax,edx - shl eax,12 //Alpha - and eax,0x10000000 - or ecx,eax - shl eax,1 - or ecx,eax - shl eax,1 - or ecx,eax - shl eax,1 - or ecx,eax - mov eax,edx - shl eax,8 // Intensity - and eax,0x0E000000 - or ecx,eax - shr eax,3 - and eax,0x01000000 - or ecx,eax - - - mov [edi],ecx - add edi,4 - // } - -// 2nd dword { - xor ecx,ecx - - // pixel #5 - // xxxxxxxxxxxxxxxxIIIAxxxxxxxxxxxx - // xxxxxxxxxxxxxxxxxxxxxxxxAAAAIIII - mov eax,edx - shr eax,8 //Alpha - and eax,0x00000010 - or ecx,eax - shl eax,1 - or ecx,eax - shl eax,1 - or ecx,eax - shl eax,1 - or ecx,eax - mov eax,edx - shr eax,12 // Intensity - and eax,0x0000000E - or ecx,eax - shr eax,3 - or ecx,eax - - // pixel #6 - // xxxxxxxxxxxxxxxxxxxxIIIAxxxxxxxx - // xxxxxxxxxxxxxxxxAAAAIIIIxxxxxxxx - //Alpha - mov eax,edx - shl eax,4 - and eax,0x00001000 - or ecx,eax - shl eax,1 - or ecx,eax - shl eax,1 - or ecx,eax - shl eax,1 - or ecx,eax - mov eax,edx // Intensity - and eax,0x00000E00 - or ecx,eax - shr eax,3 - and eax,0x00000100 - or ecx,eax - - // pixel #7 - // xxxxxxxxxxxxxxxxxxxxxxxxIIIAxxxx - // xxxxxxxxAAAAIIIIxxxxxxxxxxxxxxxx - //Alpha - mov eax,edx - shl eax,16 - and eax,0x00100000 - or ecx,eax - shl eax,1 - or ecx,eax - shl eax,1 - or ecx,eax - shl eax,1 - or ecx,eax - mov eax,edx - shl eax,12 // Intensity - and eax,0x000E0000 - or ecx,eax - shr eax,3 - and eax,0x00010000 - or ecx,eax - - // pixel #8 - // xxxxxxxxxxxxxxxxxxxxxxxxxxxxIIIA - // AAAAIIIIxxxxxxxxxxxxxxxxxxxxxxxx - mov eax,edx - shl eax,28 //Alpha - and eax,0x10000000 - or ecx,eax - shl eax,1 - or ecx,eax - shl eax,1 - or ecx,eax - shl eax,1 - or ecx,eax - mov eax,edx - shl eax,24 // Intensity - and eax,0x0E000000 - or ecx,eax - shr eax,3 - and eax,0x01000000 - or ecx,eax - - mov [edi],ecx - add edi,4 - // } - - // * - - pop ecx - dec ecx - jnz x_loop - - pop ecx - dec ecx - jz near end_y_loop - push ecx - - add esi,[line] - add edi,[ext] - - mov ecx,[wid_64] -x_loop_2: - push ecx - - mov eax,[esi+4] // read all 8 pixels - bswap eax - mov edx,eax - - // 1st dword { - xor ecx,ecx - - // pixel #1 - // IIIAxxxxxxxxxxxxxxxxxxxxxxxxxxxx - // xxxxxxxxxxxxxxxxxxxxxxxxAAAAIIII - mov eax,edx - shr eax,24 //Alpha - and eax,0x00000010 - or ecx,eax - shl eax,1 - or ecx,eax - shl eax,1 - or ecx,eax - shl eax,1 - or ecx,eax - mov eax,edx - shr eax,28 // Intensity - and eax,0x0000000E - or ecx,eax - shr eax,3 - or ecx,eax - - // pixel #2 - // xxxxIIIAxxxxxxxxxxxxxxxxxxxxxxxx - // xxxxxxxxxxxxxxxxAAAAIIIIxxxxxxxx - mov eax,edx - shr eax,12 //Alpha - and eax,0x00001000 - or ecx,eax - shl eax,1 - or ecx,eax - shl eax,1 - or ecx,eax - shl eax,1 - or ecx,eax - mov eax,edx - shr eax,16 // Intensity - and eax,0x00000E00 - or ecx,eax - shr eax,3 - and eax,0x00000100 - or ecx,eax - - // pixel #3 - // xxxxxxxxIIIAxxxxxxxxxxxxxxxxxxxx - // xxxxxxxxAAAAIIIIxxxxxxxxxxxxxxxx - //Alpha - mov eax,edx - and eax,0x00100000 - or ecx,eax - shl eax,1 - or ecx,eax - shl eax,1 - or ecx,eax - shl eax,1 - or ecx,eax - mov eax,edx - shr eax,4 // Intensity - and eax,0x000E0000 - or ecx,eax - shr eax,3 - and eax,0x00010000 - or ecx,eax - - // pixel #4 - // xxxxxxxxxxxxIIIAxxxxxxxxxxxxxxxx - // AAAAIIIIxxxxxxxxxxxxxxxxxxxxxxxx - mov eax,edx - shl eax,12 //Alpha - and eax,0x10000000 - or ecx,eax - shl eax,1 - or ecx,eax - shl eax,1 - or ecx,eax - shl eax,1 - or ecx,eax - mov eax,edx - shl eax,8 // Intensity - and eax,0x0E000000 - or ecx,eax - shr eax,3 - and eax,0x01000000 - or ecx,eax - - - mov [edi],ecx - add edi,4 - // } - -// 2nd dword { - xor ecx,ecx - - // pixel #5 - // xxxxxxxxxxxxxxxxIIIAxxxxxxxxxxxx - // xxxxxxxxxxxxxxxxxxxxxxxxAAAAIIII - mov eax,edx - shr eax,8 //Alpha - and eax,0x00000010 - or ecx,eax - shl eax,1 - or ecx,eax - shl eax,1 - or ecx,eax - shl eax,1 - or ecx,eax - mov eax,edx - shr eax,12 // Intensity - and eax,0x0000000E - or ecx,eax - shr eax,3 - or ecx,eax - - // pixel #6 - // xxxxxxxxxxxxxxxxxxxxIIIAxxxxxxxx - // xxxxxxxxxxxxxxxxAAAAIIIIxxxxxxxx - //Alpha - mov eax,edx - shl eax,4 - and eax,0x00001000 - or ecx,eax - shl eax,1 - or ecx,eax - shl eax,1 - or ecx,eax - shl eax,1 - or ecx,eax - mov eax,edx // Intensity - and eax,0x00000E00 - or ecx,eax - shr eax,3 - and eax,0x00000100 - or ecx,eax - - // pixel #7 - // xxxxxxxxxxxxxxxxxxxxxxxxIIIAxxxx - // xxxxxxxxAAAAIIIIxxxxxxxxxxxxxxxx - //Alpha - mov eax,edx - shl eax,16 - and eax,0x00100000 - or ecx,eax - shl eax,1 - or ecx,eax - shl eax,1 - or ecx,eax - shl eax,1 - or ecx,eax - mov eax,edx - shl eax,12 // Intensity - and eax,0x000E0000 - or ecx,eax - shr eax,3 - and eax,0x00010000 - or ecx,eax - - // pixel #8 - // xxxxxxxxxxxxxxxxxxxxxxxxxxxxIIIA - // AAAAIIIIxxxxxxxxxxxxxxxxxxxxxxxx - mov eax,edx - shl eax,28 //Alpha - and eax,0x10000000 - or ecx,eax - shl eax,1 - or ecx,eax - shl eax,1 - or ecx,eax - shl eax,1 - or ecx,eax - mov eax,edx - shl eax,24 // Intensity - and eax,0x0E000000 - or ecx,eax - shr eax,3 - and eax,0x01000000 - or ecx,eax - - mov [edi],ecx - add edi,4 - // } - - // * copy - mov eax,[esi] // read all 8 pixels - bswap eax - add esi,8 - mov edx,eax - -// 1st dword { - xor ecx,ecx - - // pixel #1 - // IIIAxxxxxxxxxxxxxxxxxxxxxxxxxxxx - // xxxxxxxxxxxxxxxxxxxxxxxxAAAAIIII - mov eax,edx - shr eax,24 //Alpha - and eax,0x00000010 - or ecx,eax - shl eax,1 - or ecx,eax - shl eax,1 - or ecx,eax - shl eax,1 - or ecx,eax - mov eax,edx - shr eax,28 // Intensity - and eax,0x0000000E - or ecx,eax - shr eax,3 - or ecx,eax - - // pixel #2 - // xxxxIIIAxxxxxxxxxxxxxxxxxxxxxxxx - // xxxxxxxxxxxxxxxxAAAAIIIIxxxxxxxx - mov eax,edx - shr eax,12 //Alpha - and eax,0x00001000 - or ecx,eax - shl eax,1 - or ecx,eax - shl eax,1 - or ecx,eax - shl eax,1 - or ecx,eax - mov eax,edx - shr eax,16 // Intensity - and eax,0x00000E00 - or ecx,eax - shr eax,3 - and eax,0x00000100 - or ecx,eax - - // pixel #3 - // xxxxxxxxIIIAxxxxxxxxxxxxxxxxxxxx - // xxxxxxxxAAAAIIIIxxxxxxxxxxxxxxxx - //Alpha - mov eax,edx - and eax,0x00100000 - or ecx,eax - shl eax,1 - or ecx,eax - shl eax,1 - or ecx,eax - shl eax,1 - or ecx,eax - mov eax,edx - shr eax,4 // Intensity - and eax,0x000E0000 - or ecx,eax - shr eax,3 - and eax,0x00010000 - or ecx,eax - - // pixel #4 - // xxxxxxxxxxxxIIIAxxxxxxxxxxxxxxxx - // AAAAIIIIxxxxxxxxxxxxxxxxxxxxxxxx - mov eax,edx - shl eax,12 //Alpha - and eax,0x10000000 - or ecx,eax - shl eax,1 - or ecx,eax - shl eax,1 - or ecx,eax - shl eax,1 - or ecx,eax - mov eax,edx - shl eax,8 // Intensity - and eax,0x0E000000 - or ecx,eax - shr eax,3 - and eax,0x01000000 - or ecx,eax - - - mov [edi],ecx - add edi,4 - // } - -// 2nd dword { - xor ecx,ecx - - // pixel #5 - // xxxxxxxxxxxxxxxxIIIAxxxxxxxxxxxx - // xxxxxxxxxxxxxxxxxxxxxxxxAAAAIIII - mov eax,edx - shr eax,8 //Alpha - and eax,0x00000010 - or ecx,eax - shl eax,1 - or ecx,eax - shl eax,1 - or ecx,eax - shl eax,1 - or ecx,eax - mov eax,edx - shr eax,12 // Intensity - and eax,0x0000000E - or ecx,eax - shr eax,3 - or ecx,eax - - // pixel #6 - // xxxxxxxxxxxxxxxxxxxxIIIAxxxxxxxx - // xxxxxxxxxxxxxxxxAAAAIIIIxxxxxxxx - //Alpha - mov eax,edx - shl eax,4 - and eax,0x00001000 - or ecx,eax - shl eax,1 - or ecx,eax - shl eax,1 - or ecx,eax - shl eax,1 - or ecx,eax - mov eax,edx // Intensity - and eax,0x00000E00 - or ecx,eax - shr eax,3 - and eax,0x00000100 - or ecx,eax - - // pixel #7 - // xxxxxxxxxxxxxxxxxxxxxxxxIIIAxxxx - // xxxxxxxxAAAAIIIIxxxxxxxxxxxxxxxx - //Alpha - mov eax,edx - shl eax,16 - and eax,0x00100000 - or ecx,eax - shl eax,1 - or ecx,eax - shl eax,1 - or ecx,eax - shl eax,1 - or ecx,eax - mov eax,edx - shl eax,12 // Intensity - and eax,0x000E0000 - or ecx,eax - shr eax,3 - and eax,0x00010000 - or ecx,eax - - // pixel #8 - // xxxxxxxxxxxxxxxxxxxxxxxxxxxxIIIA - // AAAAIIIIxxxxxxxxxxxxxxxxxxxxxxxx - mov eax,edx - shl eax,28 //Alpha - and eax,0x10000000 - or ecx,eax - shl eax,1 - or ecx,eax - shl eax,1 - or ecx,eax - shl eax,1 - or ecx,eax - mov eax,edx - shl eax,24 // Intensity - and eax,0x0E000000 - or ecx,eax - shr eax,3 - and eax,0x01000000 - or ecx,eax - - mov [edi],ecx - add edi,4 - // } - // * - - pop ecx - dec ecx - jnz x_loop_2 - - add esi,[line] - add edi,[ext] - - pop ecx - dec ecx - jnz y_loop - -end_y_loop: - pop edi - pop esi - pop ebx - mov esp, ebp - pop ebp - ret - } + uint32_t *v6; + uint32_t *v7; + int v8; + int v9; + int v10; + uint32_t v11; + uint32_t *v12; + uint32_t v13; + uint32_t v14; + uint32_t v15; + uint32_t *v16; + uint32_t v17; + uint32_t v18; + uint32_t v19; + uint32_t v20; + uint32_t v21; + uint32_t v22; + uint32_t v23; + uint32_t v24; + uint32_t v25; + uint32_t v26; + uint32_t v27; + uint32_t v28; + uint32_t v29; + uint32_t v30; + uint32_t v31; + uint32_t v32; + uint32_t *v33; + uint32_t *v34; + int v35; + int v36; + uint32_t v37; + uint32_t v38; + uint32_t v39; + uint32_t *v40; + uint32_t v41; + uint32_t v42; + uint32_t v43; + uint32_t v44; + uint32_t v45; + uint32_t v46; + uint32_t v47; + uint32_t v48; + uint32_t v49; + uint32_t v50; + uint32_t v51; + uint32_t v52; + uint32_t v53; + uint32_t v54; + uint32_t v55; + uint32_t v56; + int v57; + int v58; + + v6 = (uint32_t *)src; + v7 = (uint32_t *)dst; + v8 = height; + do + { + v57 = v8; + v9 = wid_64; + do + { + v10 = v9; + v11 = bswap32(*v6); + v12 = v6 + 1; + v13 = v11; + v14 = (8 * (v11 & 0x100000)) | (4 * (v11 & 0x100000)) | (2 * (v11 & 0x100000)) | (v11 & 0x100000) | ((((v11 >> 16) & 0xE00) >> 3) & 0x100) | ((v11 >> 16) & 0xE00) | (8 * ((v11 >> 12) & 0x1000)) | (4 * ((v11 >> 12) & 0x1000)) | (2 * ((v11 >> 12) & 0x1000)) | ((v11 >> 12) & 0x1000) | ((((v11 >> 28) & 0xE) >> 3)) | ((v11 >> 28) & 0xE) | (8 * ((v11 >> 24) & 0x10)) | (4 * ((v11 >> 24) & 0x10)) | (2 * ((v11 >> 24) & 0x10)) | ((v11 >> 24) & 0x10); + v11 >>= 4; + v11 &= 0xE0000u; + v15 = v11 | v14; + v11 >>= 3; + *v7 = ((((v13 << 8) & 0xE000000) >> 3) & 0x1000000) | ((v13 << 8) & 0xE000000) | (8 * ((v13 << 12) & 0x10000000)) | (4 * ((v13 << 12) & 0x10000000)) | (2 * ((v13 << 12) & 0x10000000)) | ((v13 << 12) & 0x10000000) | (v11 & 0x10000) | v15; + v16 = v7 + 1; + v17 = 16 * (uint16_t)v13 & 0x1000; + v18 = (((v13 & 0xE00) >> 3) & 0x100) | (v13 & 0xE00) | (8 * v17) | (4 * v17) | (2 * v17) | (v17) | ((((v13 >> 12) & 0xE) >> 3)) | ((v13 >> 12) & 0xE) | (8 * ((v13 >> 8) & 0x10)) | (4 * ((v13 >> 8) & 0x10)) | (2 * ((v13 >> 8) & 0x10)) | ((v13 >> 8) & 0x10); + v19 = v13 << 16; + v20 = (8 * (v19 & 0x100000)) | (4 * (v19 & 0x100000)) | (2 * (v19 & 0x100000)) | (v19 & 0x100000) | v18; + v21 = v13 << 12; + v21 &= 0xE0000u; + v22 = v21 | v20; + v21 >>= 3; + *v16 = ((((v13 << 24) & 0xE000000) >> 3) & 0x1000000) | ((v13 << 24) & 0xE000000) | (8 * ((v13 << 28) & 0x10000000)) | (4 * ((v13 << 28) & 0x10000000)) | (2 * ((v13 << 28) & 0x10000000)) | ((v13 << 28) & 0x10000000) | (v21 & 0x10000) | v22; + ++v16; + v23 = bswap32(*v12); + v6 = v12 + 1; + v24 = v23; + v25 = (8 * (v23 & 0x100000)) | (4 * (v23 & 0x100000)) | (2 * (v23 & 0x100000)) | (v23 & 0x100000) | ((((v23 >> 16) & 0xE00) >> 3) & 0x100) | ((v23 >> 16) & 0xE00) | (8 * ((v23 >> 12) & 0x1000)) | (4 * ((v23 >> 12) & 0x1000)) | (2 * ((v23 >> 12) & 0x1000)) | ((v23 >> 12) & 0x1000) | (((v23 >> 28) & 0xE) >> 3) | ((v23 >> 28) & 0xE) | (8 * ((v23 >> 24) & 0x10)) | (4 * ((v23 >> 24) & 0x10)) | (2 * ((v23 >> 24) & 0x10)) | ((v23 >> 24) & 0x10); + v23 >>= 4; + v23 &= 0xE0000u; + v26 = v23 | v25; + v23 >>= 3; + *v16 = ((((v24 << 8) & 0xE000000) >> 3) & 0x1000000) | ((v24 << 8) & 0xE000000) | (8 * ((v24 << 12) & 0x10000000)) | (4 * ((v24 << 12) & 0x10000000)) | (2 * ((v24 << 12) & 0x10000000)) | ((v24 << 12) & 0x10000000) | (v23 & 0x10000) | (v26); + ++v16; + v27 = 16 * (uint16_t)v24 & 0x1000; + v28 = (((v24 & 0xE00) >> 3) & 0x100) | (v24 & 0xE00) | (8 * v27) | (4 * v27) | (2 * v27) | (v27) | ((((v24 >> 12) & 0xE) >> 3)) | ((v24 >> 12) & 0xE) | (8 * ((v24 >> 8) & 0x10)) | (4 * ((v24 >> 8) & 0x10)) | (2 * ((v24 >> 8) & 0x10)) | ((v24 >> 8) & 0x10); + v29 = v24 << 16; + v30 = (8 * (v29 & 0x100000)) | (4 * (v29 & 0x100000)) | (2 * (v29 & 0x100000)) | (v29 & 0x100000) | v28; + v31 = v24 << 12; + v31 &= 0xE0000u; + v32 = v31 | v30; + v31 >>= 3; + *v16 = ((((v24 << 24) & 0xE000000) >> 3) & 0x1000000) | ((v24 << 24) & 0xE000000) | (8 * ((v24 << 28) & 0x10000000)) | (4 * ((v24 << 28) & 0x10000000)) | (2 * ((v24 << 28) & 0x10000000)) | ((v24 << 28) & 0x10000000) | (v31 & 0x10000) | v32; + v7 = v16 + 1; + v9 = v10 - 1; + } + while ( v10 != 1 ); + if ( v57 == 1 ) + break; + v58 = v57 - 1; + v33 = (uint32_t *)((char *)v6 + line); + v34 = (uint32_t *)((char *)v7 + ext); + v35 = wid_64; + do + { + v36 = v35; + v37 = bswap32(v33[1]); + v38 = v37 >> 4; + v38 &= 0xE0000u; + v39 = v38 | (8 * (v37 & 0x100000)) | (4 * (v37 & 0x100000)) | (2 * (v37 & 0x100000)) | (v37 & 0x100000) | ((((v37 >> 16) & 0xE00) >> 3) & 0x100) | ((v37 >> 16) & 0xE00) | (8 * ((v37 >> 12) & 0x1000)) | (4 * ((v37 >> 12) & 0x1000)) | (2 * ((v37 >> 12) & 0x1000)) | ((v37 >> 12) & 0x1000) | (((v37 >> 28) & 0xE) >> 3) | ((v37 >> 28) & 0xE) | (8 * ((v37 >> 24) & 0x10)) | (4 * ((v37 >> 24) & 0x10)) | (2 * ((v37 >> 24) & 0x10)) | ((v37 >> 24) & 0x10); + v38 >>= 3; + *v34 = ((((v37 << 8) & 0xE000000) >> 3) & 0x1000000) | ((v37 << 8) & 0xE000000) | (8 * ((v37 << 12) & 0x10000000)) | (4 * ((v37 << 12) & 0x10000000)) | (2 * ((v37 << 12) & 0x10000000)) | ((v37 << 12) & 0x10000000) | (v38 & 0x10000) | v39; + v40 = v34 + 1; + v41 = 16 * (uint16_t)v37 & 0x1000; + v42 = (((v37 & 0xE00) >> 3) & 0x100) | (v37 & 0xE00) | (8 * v41) | (4 * v41) | (2 * v41) | v41 | (((v37 >> 12) & 0xE) >> 3) | ((v37 >> 12) & 0xE) | (8 * ((v37 >> 8) & 0x10)) | (4 * ((v37 >> 8) & 0x10)) | (2 * ((v37 >> 8) & 0x10)) | ((v37 >> 8) & 0x10); + v43 = v37 << 16; + v44 = (8 * (v43 & 0x100000)) | (4 * (v43 & 0x100000)) | (2 * (v43 & 0x100000)) | (v43 & 0x100000) | v42; + v45 = v37 << 12; + v45 &= 0xE0000u; + v46 = v45 | v44; + v45 >>= 3; + *v40 = ((((v37 << 24) & 0xE000000) >> 3) & 0x1000000) | ((v37 << 24) & 0xE000000) | (8 * ((v37 << 28) & 0x10000000)) | (4 * ((v37 << 28) & 0x10000000)) | (2 * ((v37 << 28) & 0x10000000)) | ((v37 << 28) & 0x10000000) | (v45 & 0x10000) | v46; + ++v40; + v47 = bswap32(*v33); + v33 += 2; + v48 = v47; + v49 = (8 * (v47 & 0x100000)) | (4 * (v47 & 0x100000)) | (2 * (v47 & 0x100000)) | (v47 & 0x100000) | ((((v47 >> 16) & 0xE00) >> 3) & 0x100) | ((v47 >> 16) & 0xE00) | (8 * ((v47 >> 12) & 0x1000)) | (4 * ((v47 >> 12) & 0x1000)) | (2 * ((v47 >> 12) & 0x1000)) | ((v47 >> 12) & 0x1000) | (((v47 >> 28) & 0xE) >> 3) | ((v47 >> 28) & 0xE) | (8 * ((v47 >> 24) & 0x10)) | (4 * ((v47 >> 24) & 0x10)) | (2 * ((v47 >> 24) & 0x10)) | ((v47 >> 24) & 0x10); + v47 >>= 4; + v47 &= 0xE0000u; + v50 = v47 | v49; + v47 >>= 3; + *v40 = ((((v48 << 8) & 0xE000000) >> 3) & 0x1000000) | ((v48 << 8) & 0xE000000) | (8 * ((v48 << 12) & 0x10000000)) | (4 * ((v48 << 12) & 0x10000000)) | (2 * ((v48 << 12) & 0x10000000)) | ((v48 << 12) & 0x10000000) | (v47 & 0x10000) | v50; + ++v40; + v51 = 16 * (uint16_t)v48 & 0x1000; + v52 = (((v48 & 0xE00) >> 3) & 0x100) | (v48 & 0xE00) | (8 * v51) | (4 * v51) | (2 * v51) | v51 | (((v48 >> 12) & 0xE) >> 3) | ((v48 >> 12) & 0xE) | (8 * ((v48 >> 8) & 0x10)) | (4 * ((v48 >> 8) & 0x10)) | (2 * ((v48 >> 8) & 0x10)) | ((v48 >> 8) & 0x10); + v53 = v48 << 16; + v54 = (8 * (v53 & 0x100000)) | (4 * (v53 & 0x100000)) | (2 * (v53 & 0x100000)) | (v53 & 0x100000) | v52; + v55 = v48 << 12; + v55 &= 0xE0000u; + v56 = v55 | v54; + v55 >>= 3; + *v40 = ((((v48 << 24) & 0xE000000) >> 3) & 0x1000000) | ((v48 << 24) & 0xE000000) | (8 * ((v48 << 28) & 0x10000000)) | (4 * ((v48 << 28) & 0x10000000)) | (2 * ((v48 << 28) & 0x10000000)) | ((v48 << 28) & 0x10000000) | (v55 & 0x10000) | v56; + v34 = v40 + 1; + v35 = v36 - 1; + } + while ( v36 != 1 ); + v6 = (uint32_t *)((char *)v33 + line); + v7 = (uint32_t *)((char *)v34 + ext); + v8 = v58 - 1; + } + while ( v58 != 1 ); } -extern "C" void __declspec(naked) asmLoad4bI (wxUIntPtr src, int dst, wxUIntPtr wid_64, int height, int line, int ext) +static inline void load4bI(uint8_t *src, uint8_t *dst, int wid_64, int height, int line, int ext) { - _asm { - push ebp - mov ebp, esp - push ebx - push esi - push edi + uint32_t *v6; + uint32_t *v7; + int v8; + int v9; + int v10; + uint32_t v11; + uint32_t *v12; + uint32_t v13; + uint32_t v14; + uint32_t *v15; + uint32_t v16; + unsigned int v17; + unsigned int v18; + uint32_t v19; + uint32_t v20; + uint32_t *v21; + uint32_t *v22; + int v23; + int v24; + uint32_t v25; + uint32_t v26; + uint32_t *v27; + uint32_t v28; + uint32_t v29; + uint32_t v30; + uint32_t v31; + uint32_t v32; + int v33; + int v34; - mov esi,[src] - mov edi,[dst] - mov ecx,[height] -y_loop: - push ecx - mov ecx,[wid_64] -x_loop: - push ecx - - mov eax,[esi] // read all 8 pixels - bswap eax - add esi,4 - mov edx,eax - - // 1st dword { - xor ecx,ecx - shr eax,28 // 0xF0000000 -> 0x0000000F - or ecx,eax - shl eax,4 - or ecx,eax - - mov eax,edx // 0x0F000000 -> 0x00000F00 - shr eax,16 - and eax,0x00000F00 - or ecx,eax - shl eax,4 - or ecx,eax - - mov eax,edx - shr eax,4 // 0x00F00000 -> 0x000F0000 - and eax,0x000F0000 - or ecx,eax - shl eax,4 - or ecx,eax - - mov eax,edx - shl eax,8 // 0x000F0000 -> 0x0F000000 - and eax,0x0F000000 - or ecx,eax - shl eax,4 - or ecx,eax - - mov [edi],ecx - add edi,4 - // } - - // 2nd dword { - xor ecx,ecx - mov eax,edx - shr eax,12 // 0x0000F000 -> 0x0000000F - and eax,0x0000000F - or ecx,eax - shl eax,4 - or ecx,eax - - mov eax,edx // 0x00000F00 -> 0x00000F00 - and eax,0x00000F00 - or ecx,eax - shl eax,4 - or ecx,eax - - mov eax,edx - shl eax,12 // 0x000000F0 -> 0x000F0000 - and eax,0x000F0000 - or ecx,eax - shl eax,4 - or ecx,eax - - shl edx,24 // 0x0000000F -> 0x0F000000 - and edx,0x0F000000 - or ecx,edx - shl edx,4 - or ecx,edx - - mov [edi],ecx - add edi,4 - // } - - // * copy - mov eax,[esi] // read all 8 pixels - bswap eax - add esi,4 - mov edx,eax - - // 1st dword { - xor ecx,ecx - shr eax,28 // 0xF0000000 -> 0x0000000F - or ecx,eax - shl eax,4 - or ecx,eax - - mov eax,edx // 0x0F000000 -> 0x00000F00 - shr eax,16 - and eax,0x00000F00 - or ecx,eax - shl eax,4 - or ecx,eax - - mov eax,edx - shr eax,4 // 0x00F00000 -> 0x000F0000 - and eax,0x000F0000 - or ecx,eax - shl eax,4 - or ecx,eax - - mov eax,edx - shl eax,8 // 0x000F0000 -> 0x0F000000 - and eax,0x0F000000 - or ecx,eax - shl eax,4 - or ecx,eax - - mov [edi],ecx - add edi,4 - // } - - // 2nd dword { - xor ecx,ecx - mov eax,edx - shr eax,12 // 0x0000F000 -> 0x0000000F - and eax,0x0000000F - or ecx,eax - shl eax,4 - or ecx,eax - - mov eax,edx // 0x00000F00 -> 0x00000F00 - and eax,0x00000F00 - or ecx,eax - shl eax,4 - or ecx,eax - - mov eax,edx - shl eax,12 // 0x000000F0 -> 0x000F0000 - and eax,0x000F0000 - or ecx,eax - shl eax,4 - or ecx,eax - - shl edx,24 // 0x0000000F -> 0x0F000000 - and edx,0x0F000000 - or ecx,edx - shl edx,4 - or ecx,edx - - mov [edi],ecx - add edi,4 - // } - // * - - pop ecx - dec ecx - jnz x_loop - - pop ecx - dec ecx - jz near end_y_loop - push ecx - - add esi,[line] - add edi,[ext] - - mov ecx,[wid_64] -x_loop_2: - push ecx - - mov eax,[esi+4] // read all 8 pixels - bswap eax - mov edx,eax - - // 1st dword { - xor ecx,ecx - shr eax,28 // 0xF0000000 -> 0x0000000F - or ecx,eax - shl eax,4 - or ecx,eax - - mov eax,edx // 0x0F000000 -> 0x00000F00 - shr eax,16 - and eax,0x00000F00 - or ecx,eax - shl eax,4 - or ecx,eax - - mov eax,edx - shr eax,4 // 0x00F00000 -> 0x000F0000 - and eax,0x000F0000 - or ecx,eax - shl eax,4 - or ecx,eax - - mov eax,edx - shl eax,8 // 0x000F0000 -> 0x0F000000 - and eax,0x0F000000 - or ecx,eax - shl eax,4 - or ecx,eax - - mov [edi],ecx - add edi,4 - // } - - // 2nd dword { - xor ecx,ecx - mov eax,edx - shr eax,12 // 0x0000F000 -> 0x0000000F - and eax,0x0000000F - or ecx,eax - shl eax,4 - or ecx,eax - - mov eax,edx // 0x00000F00 -> 0x00000F00 - and eax,0x00000F00 - or ecx,eax - shl eax,4 - or ecx,eax - - mov eax,edx - shl eax,12 // 0x000000F0 -> 0x000F0000 - and eax,0x000F0000 - or ecx,eax - shl eax,4 - or ecx,eax - - shl edx,24 // 0x0000000F -> 0x0F000000 - and edx,0x0F000000 - or ecx,edx - shl edx,4 - or ecx,edx - - mov [edi],ecx - add edi,4 - // } - - // * copy - mov eax,[esi] // read all 8 pixels - bswap eax - add esi,8 - mov edx,eax - - // 1st dword { - xor ecx,ecx - shr eax,28 // 0xF0000000 -> 0x0000000F - or ecx,eax - shl eax,4 - or ecx,eax - - mov eax,edx // 0x0F000000 -> 0x00000F00 - shr eax,16 - and eax,0x00000F00 - or ecx,eax - shl eax,4 - or ecx,eax - - mov eax,edx - shr eax,4 // 0x00F00000 -> 0x000F0000 - and eax,0x000F0000 - or ecx,eax - shl eax,4 - or ecx,eax - - mov eax,edx - shl eax,8 // 0x000F0000 -> 0x0F000000 - and eax,0x0F000000 - or ecx,eax - shl eax,4 - or ecx,eax - - mov [edi],ecx - add edi,4 - // } - - // 2nd dword { - xor ecx,ecx - mov eax,edx - shr eax,12 // 0x0000F000 -> 0x0000000F - and eax,0x0000000F - or ecx,eax - shl eax,4 - or ecx,eax - - mov eax,edx // 0x00000F00 -> 0x00000F00 - and eax,0x00000F00 - or ecx,eax - shl eax,4 - or ecx,eax - - mov eax,edx - shl eax,12 // 0x000000F0 -> 0x000F0000 - and eax,0x000F0000 - or ecx,eax - shl eax,4 - or ecx,eax - - shl edx,24 // 0x0000000F -> 0x0F000000 - and edx,0x0F000000 - or ecx,edx - shl edx,4 - or ecx,edx - - mov [edi],ecx - add edi,4 - // } - // * - - pop ecx - dec ecx - jnz x_loop_2 - - add esi,[line] - add edi,[ext] - - pop ecx - dec ecx - jnz y_loop - -end_y_loop: - pop edi - pop esi - pop ebx - mov esp, ebp - pop ebp - ret - } + v6 = (uint32_t *)src; + v7 = (uint32_t *)dst; + v8 = height; + do + { + v33 = v8; + v9 = wid_64; + do + { + v10 = v9; + v11 = bswap32(*v6); + v12 = v6 + 1; + v13 = v11; + v14 = (16 * ((v11 >> 16) & 0xF00)) | ((v11 >> 16) & 0xF00) | (16 * (v11 >> 28)) | (v11 >> 28); + v11 >>= 4; + *v7 = (16 * ((v13 << 8) & 0xF000000)) | ((v13 << 8) & 0xF000000) | (16 * (v11 & 0xF0000)) | (v11 & 0xF0000) | v14; + v15 = v7 + 1; + v16 = v13 << 12; + *v15 = (16 * ((v13 << 24) & 0xF000000)) | ((v13 << 24) & 0xF000000) | (16 * (v16 & 0xF0000)) | (v16 & 0xF0000) | (16 * (v13 & 0xF00)) | (v13 & 0xF00) | (16 * ((uint16_t)v13 >> 12)) | ((uint16_t)v13 >> 12); + ++v15; + v17 = bswap32(*v12); + v6 = v12 + 1; + v18 = v17; + v19 = (16 * ((v17 >> 16) & 0xF00)) | ((v17 >> 16) & 0xF00) | (16 * (v17 >> 28)) | (v17 >> 28); + v17 >>= 4; + *v15 = (16 * ((v18 << 8) & 0xF000000)) | ((v18 << 8) & 0xF000000) | (16 * (v17 & 0xF0000)) | (v17 & 0xF0000) | v19; + ++v15; + v20 = v18 << 12; + *v15 = (16 * ((v18 << 24) & 0xF000000)) | ((v18 << 24) & 0xF000000) | (16 * (v20 & 0xF0000)) | (v20 & 0xF0000) | (16 * (v18 & 0xF00)) | (v18 & 0xF00) | (16 * ((uint16_t)v18 >> 12)) | ((uint16_t)v18 >> 12); + v7 = v15 + 1; + v9 = v10 - 1; + } + while ( v10 != 1 ); + if ( v33 == 1 ) + break; + v34 = v33 - 1; + v21 = (uint32_t *)((char *)v6 + line); + v22 = (uint32_t *)((char *)v7 + ext); + v23 = wid_64; + do + { + v24 = v23; + v25 = bswap32(v21[1]); + v26 = v25 >> 4; + *v22 = (16 * ((v25 << 8) & 0xF000000)) | ((v25 << 8) & 0xF000000) | (16 * (v26 & 0xF0000)) | (v26 & 0xF0000) | (16 * ((v25 >> 16) & 0xF00)) | ((v25 >> 16) & 0xF00) | (16 * (v25 >> 28)) | (v25 >> 28); + v27 = v22 + 1; + v28 = v25 << 12; + *v27 = (16 * ((v25 << 24) & 0xF000000)) | ((v25 << 24) & 0xF000000) | (16 * (v28 & 0xF0000)) | (v28 & 0xF0000) | (16 * (v25 & 0xF00)) | (v25 & 0xF00) | (16 * ((uint16_t)v25 >> 12)) | ((uint16_t)v25 >> 12); + ++v27; + v29 = bswap32(*v21); + v21 += 2; + v30 = v29; + v31 = (16 * ((v29 >> 16) & 0xF00)) | ((v29 >> 16) & 0xF00) | (16 * (v29 >> 28)) | (v29 >> 28); + v29 >>= 4; + *v27 = (16 * ((v30 << 8) & 0xF000000)) | ((v30 << 8) & 0xF000000) | (16 * (v29 & 0xF0000)) | (v29 & 0xF0000) | v31; + ++v27; + v32 = v30 << 12; + *v27 = (16 * ((v30 << 24) & 0xF000000)) | ((v30 << 24) & 0xF000000) | (16 * (v32 & 0xF0000)) | (v32 & 0xF0000) | (16 * (v30 & 0xF00)) | (v30 & 0xF00) | (16 * ((uint16_t)v30 >> 12)) | ((uint16_t)v30 >> 12); + v22 = v27 + 1; + v23 = v24 - 1; + } + while ( v24 != 1 ); + v6 = (uint32_t *)((char *)v21 + line); + v7 = (uint32_t *)((char *)v22 + ext); + v8 = v34 - 1; + } + while ( v34 != 1 ); } //**************************************************************** @@ -1937,24 +594,26 @@ wxUint32 Load4bCI (wxUIntPtr dst, wxUIntPtr src, int wid_64, int height, int lin { if (wid_64 < 1) wid_64 = 1; if (height < 1) height = 1; - int ext = (real_width - (wid_64 << 4)) << 1; + int ext = (real_width - (wid_64 << 4)); - if (rdp.tlut_mode == 0) + if (rdp.tlut_mode == 0) { - //in tlut DISABLE mode load CI texture as plain intensity texture instead of palette dereference. + //in tlut DISABLE mode load CI texture as plain intensity texture instead of palette dereference. //Thanks to angrylion for the advice - asmLoad4bI (src, dst, wid_64, height, line, ext); + load4bI ((uint8_t *)src, (uint8_t *)dst, wid_64, height, line, ext); return /*(0 << 16) | */GR_TEXFMT_ALPHA_INTENSITY_44; } wxUIntPtr pal = wxPtrToUInt(rdp.pal_8 + (rdp.tiles[tile].palette << 4)); - if (rdp.tlut_mode == 2) + if (rdp.tlut_mode == 2) { - asmLoad4bCI (src, dst, wid_64, height, line, ext, pal); + ext <<= 1; + load4bCI ((uint8_t *)src, (uint8_t *)dst, wid_64, height, line, ext, (uint16_t *)pal); return (1 << 16) | GR_TEXFMT_ARGB_1555; } - asmLoad4bIAPal (src, dst, wid_64, height, line, ext, pal); + ext <<= 1; + load4bIAPal ((uint8_t *)src, (uint8_t *)dst, wid_64, height, line, ext, (uint16_t *)pal); return (1 << 16) | GR_TEXFMT_ALPHA_INTENSITY_88; } @@ -1971,7 +630,7 @@ wxUint32 Load4bIA (wxUIntPtr dst, wxUIntPtr src, int wid_64, int height, int lin if (wid_64 < 1) wid_64 = 1; if (height < 1) height = 1; int ext = (real_width - (wid_64 << 4)); - asmLoad4bIA (src, dst, wid_64, height, line, ext); + load4bIA ((uint8_t *)src, (uint8_t *)dst, wid_64, height, line, ext); return /*(0 << 16) | */GR_TEXFMT_ALPHA_INTENSITY_44; } @@ -1986,7 +645,8 @@ wxUint32 Load4bI (wxUIntPtr dst, wxUIntPtr src, int wid_64, int height, int line if (wid_64 < 1) wid_64 = 1; if (height < 1) height = 1; int ext = (real_width - (wid_64 << 4)); - asmLoad4bI (src, dst, wid_64, height, line, ext); + load4bI ((uint8_t *)src, (uint8_t *)dst, wid_64, height, line, ext); + return /*(0 << 16) | */GR_TEXFMT_ALPHA_INTENSITY_44; } diff --git a/Source/Glide64/TexLoad8b.h b/Source/Glide64/TexLoad8b.h index 388e55e7c..64e01eb40 100644 --- a/Source/Glide64/TexLoad8b.h +++ b/Source/Glide64/TexLoad8b.h @@ -37,629 +37,336 @@ // //**************************************************************** -extern "C" void __declspec(naked) asmLoad8bCI (wxUIntPtr src, wxUIntPtr dst, int wid_64, int height, int line, int ext, wxUIntPtr pal) +static inline void load8bCI(uint8_t *src, uint8_t *dst, int wid_64, int height, int line, int ext, uint16_t *pal) { - _asm { - push ebp - mov ebp, esp - push ebx - push esi - push edi + uint8_t *v7; + uint32_t *v8; + int v9; + int v10; + int v11; + uint32_t v12; + uint32_t *v13; + uint32_t v14; + uint32_t *v15; + uint32_t v16; + uint32_t *v17; + uint32_t *v18; + int v19; + int v20; + uint32_t v21; + uint32_t v22; + uint32_t *v23; + uint32_t v24; + int v25; + int v26; - mov ebx,[pal] - mov esi,[src] - mov edi,[dst] - mov ecx,[height] -y_loop: - push ecx - mov ecx,[wid_64] -x_loop: - push ecx - - mov eax,[esi] // read all 4 pixels - bswap eax - add esi,4 - mov edx,eax - - // 1st dword output { - shr eax,15 - and eax,0x1FE - mov cx,[ebx+eax] - ror cx,1 - shl ecx,16 - - mov eax,edx - shr eax,23 - and eax,0x1FE - mov cx,[ebx+eax] - ror cx,1 - - mov [edi],ecx - add edi,4 - // } - - // 2nd dword output { - mov eax,edx - shl eax,1 - and eax,0x1FE - mov cx,[ebx+eax] - ror cx,1 - shl ecx,16 - - shr edx,7 - and edx,0x1FE - mov cx,[ebx+edx] - ror cx,1 - - mov [edi],ecx - add edi,4 - // } - - // * copy - mov eax,[esi] // read all 4 pixels - bswap eax - add esi,4 - mov edx,eax - - // 1st dword output { - shr eax,15 - and eax,0x1FE - mov cx,[ebx+eax] - ror cx,1 - shl ecx,16 - - mov eax,edx - shr eax,23 - and eax,0x1FE - mov cx,[ebx+eax] - ror cx,1 - - mov [edi],ecx - add edi,4 - // } - - // 2nd dword output { - mov eax,edx - shl eax,1 - and eax,0x1FE - mov cx,[ebx+eax] - ror cx,1 - shl ecx,16 - - shr edx,7 - and edx,0x1FE - mov cx,[ebx+edx] - ror cx,1 - - mov [edi],ecx - add edi,4 - // } - // * - - pop ecx - - dec ecx - jnz x_loop - - pop ecx - dec ecx - jz near end_y_loop - push ecx - - mov eax,esi - add eax,[line] - mov esi,[src] - sub eax,esi - and eax,0x7FF - add esi,eax - add edi,[ext] - - mov ecx,[wid_64] -x_loop_2: - push ecx - - mov eax,[esi+4] // read all 4 pixels - bswap eax - mov edx,eax - - // 1st dword output { - shr eax,15 - and eax,0x1FE - mov cx,[ebx+eax] - ror cx,1 - shl ecx,16 - - mov eax,edx - shr eax,23 - and eax,0x1FE - mov cx,[ebx+eax] - ror cx,1 - - mov [edi],ecx - add edi,4 - // } - - // 2nd dword output { - mov eax,edx - shl eax,1 - and eax,0x1FE - mov cx,[ebx+eax] - ror cx,1 - shl ecx,16 - - shr edx,7 - and edx,0x1FE - mov cx,[ebx+edx] - ror cx,1 - - mov [edi],ecx - add edi,4 - // } - - // * copy - mov eax,[esi] // read all 4 pixels - bswap eax - mov edx,esi - add edx,8 - mov esi,[src] - sub edx,esi - and edx,0x7FF - add esi,edx - mov edx,eax - - // 1st dword output { - shr eax,15 - and eax,0x1FE - mov cx,[ebx+eax] - ror cx,1 - shl ecx,16 - - mov eax,edx - shr eax,23 - and eax,0x1FE - mov cx,[ebx+eax] - ror cx,1 - - mov [edi],ecx - add edi,4 - // } - - // 2nd dword output { - mov eax,edx - shl eax,1 - and eax,0x1FE - mov cx,[ebx+eax] - ror cx,1 - shl ecx,16 - - shr edx,7 - and edx,0x1FE - mov cx,[ebx+edx] - ror cx,1 - - mov [edi],ecx - add edi,4 - // } - // * - - pop ecx - - dec ecx - jnz x_loop_2 - - mov eax,esi - add eax,[line] - mov esi,[src] - sub eax,esi - and eax,0x7FF - add esi,eax - add edi,[ext] - - pop ecx - dec ecx - jnz y_loop - -end_y_loop: - pop edi - pop esi - pop ebx - mov esp, ebp - pop ebp - ret - } + v7 = src; + v8 = (uint32_t *)dst; + v9 = height; + do + { + v25 = v9; + v10 = wid_64; + do + { + v11 = v10; + v12 = bswap32(*(uint32_t *)v7); + v13 = (uint32_t *)(v7 + 4); + ALOWORD(v10) = __ROR__(*(uint16_t *)((char *)pal + ((v12 >> 15) & 0x1FE)), 1); + v14 = v10 << 16; + ALOWORD(v14) = __ROR__(*(uint16_t *)((char *)pal + ((v12 >> 23) & 0x1FE)), 1); + *v8 = v14; + v15 = v8 + 1; + ALOWORD(v14) = __ROR__(*(uint16_t *)((char *)pal + (2 * (uint16_t)v12 & 0x1FE)), 1); + v14 <<= 16; + ALOWORD(v14) = __ROR__(*(uint16_t *)((char *)pal + ((v12 >> 7) & 0x1FE)), 1); + *v15 = v14; + ++v15; + v16 = bswap32(*v13); + v7 = (uint8_t *)(v13 + 1); + ALOWORD(v14) = __ROR__(*(uint16_t *)((char *)pal + ((v16 >> 15) & 0x1FE)), 1); + v14 <<= 16; + ALOWORD(v14) = __ROR__(*(uint16_t *)((char *)pal + ((v16 >> 23) & 0x1FE)), 1); + *v15 = v14; + ++v15; + ALOWORD(v14) = __ROR__(*(uint16_t *)((char *)pal + (2 * (uint16_t)v16 & 0x1FE)), 1); + v14 <<= 16; + ALOWORD(v14) = __ROR__(*(uint16_t *)((char *)pal + ((v16 >> 7) & 0x1FE)), 1); + *v15 = v14; + v8 = v15 + 1; + v10 = v11 - 1; + } + while ( v11 != 1 ); + if ( v25 == 1 ) + break; + v26 = v25 - 1; + v17 = (uint32_t *)&src[(line + (uintptr_t)v7 - (uintptr_t)src) & 0x7FF]; + v18 = (uint32_t *)((char *)v8 + ext); + v19 = wid_64; + do + { + v20 = v19; + v21 = bswap32(v17[1]); + ALOWORD(v19) = __ROR__(*(uint16_t *)((char *)pal + ((v21 >> 15) & 0x1FE)), 1); + v22 = v19 << 16; + ALOWORD(v22) = __ROR__(*(uint16_t *)((char *)pal + ((v21 >> 23) & 0x1FE)), 1); + *v18 = v22; + v23 = v18 + 1; + ALOWORD(v22) = __ROR__(*(uint16_t *)((char *)pal + (2 * (uint16_t)v21 & 0x1FE)), 1); + v22 <<= 16; + ALOWORD(v22) = __ROR__(*(uint16_t *)((char *)pal + ((v21 >> 7) & 0x1FE)), 1); + *v23 = v22; + ++v23; + v24 = bswap32(*v17); + v17 = (uint32_t *)&src[((uintptr_t)v17 + 8 - (uintptr_t)src) & 0x7FF]; + ALOWORD(v22) = __ROR__(*(uint16_t *)((char *)pal + ((v24 >> 15) & 0x1FE)), 1); + v22 <<= 16; + ALOWORD(v22) = __ROR__(*(uint16_t *)((char *)pal + ((v24 >> 23) & 0x1FE)), 1); + *v23 = v22; + ++v23; + ALOWORD(v22) = __ROR__(*(uint16_t *)((char *)pal + (2 * (uint16_t)v24 & 0x1FE)), 1); + v22 <<= 16; + ALOWORD(v22) = __ROR__(*(uint16_t *)((char *)pal + ((v24 >> 7) & 0x1FE)), 1); + *v23 = v22; + v18 = v23 + 1; + v19 = v20 - 1; + } + while ( v20 != 1 ); + v7 = &src[(line + (uintptr_t)v17 - (uintptr_t)src) & 0x7FF]; + v8 = (uint32_t *)((char *)v18 + ext); + v9 = v26 - 1; + } + while ( v26 != 1 ); } -extern "C" void __declspec(naked) asmLoad8bIA8 (wxUIntPtr src, wxUIntPtr dst, int wid_64, int height, int line, int ext, wxUIntPtr pal) +static inline void load8bIA8(uint8_t *src, uint8_t *dst, int wid_64, int height, int line, int ext, uint16_t *pal) { - _asm { - push ebp - mov ebp, esp - push ebx - push esi - push edi + uint32_t *v7; + uint32_t *v8; + int v9; + int v10; + int v11; + uint32_t v12; + uint32_t *v13; + uint32_t v14; + uint32_t *v15; + uint32_t v16; + uint32_t *v17; + uint32_t *v18; + int v19; + int v20; + uint32_t v21; + uint32_t v22; + uint32_t *v23; + uint32_t v24; + int v25; + int v26; - mov ebx,[pal] - mov esi,[src] - mov edi,[dst] - mov ecx,[height] -y_loop: - push ecx - mov ecx,[wid_64] -x_loop: - push ecx - - mov eax,[esi] // read all 4 pixels - bswap eax - add esi,4 - mov edx,eax - - // 1st dword output { - shr eax,15 - and eax,0x1FE - mov cx,[ebx+eax] - ror cx,8 - shl ecx,16 - - mov eax,edx - shr eax,23 - and eax,0x1FE - mov cx,[ebx+eax] - ror cx,8 - - mov [edi],ecx - add edi,4 - // } - - // 2nd dword output { - mov eax,edx - shl eax,1 - and eax,0x1FE - mov cx,[ebx+eax] - ror cx,8 - shl ecx,16 - - shr edx,7 - and edx,0x1FE - mov cx,[ebx+edx] - ror cx,8 - - mov [edi],ecx - add edi,4 - // } - - // * copy - mov eax,[esi] // read all 4 pixels - bswap eax - add esi,4 - mov edx,eax - - // 1st dword output { - shr eax,15 - and eax,0x1FE - mov cx,[ebx+eax] - ror cx,8 - shl ecx,16 - - mov eax,edx - shr eax,23 - and eax,0x1FE - mov cx,[ebx+eax] - ror cx,8 - - mov [edi],ecx - add edi,4 - // } - - // 2nd dword output { - mov eax,edx - shl eax,1 - and eax,0x1FE - mov cx,[ebx+eax] - ror cx,8 - shl ecx,16 - - shr edx,7 - and edx,0x1FE - mov cx,[ebx+edx] - ror cx,8 - - mov [edi],ecx - add edi,4 - // } - // * - - pop ecx - - dec ecx - jnz x_loop - - pop ecx - dec ecx - jz near end_y_loop - push ecx - - add esi,[line] - add edi,[ext] - - mov ecx,[wid_64] -x_loop_2: - push ecx - - mov eax,[esi+4] // read all 4 pixels - bswap eax - mov edx,eax - - // 1st dword output { - shr eax,15 - and eax,0x1FE - mov cx,[ebx+eax] - ror cx,8 - shl ecx,16 - - mov eax,edx - shr eax,23 - and eax,0x1FE - mov cx,[ebx+eax] - ror cx,8 - - mov [edi],ecx - add edi,4 - // } - - // 2nd dword output { - mov eax,edx - shl eax,1 - and eax,0x1FE - mov cx,[ebx+eax] - ror cx,8 - shl ecx,16 - - shr edx,7 - and edx,0x1FE - mov cx,[ebx+edx] - ror cx,8 - - mov [edi],ecx - add edi,4 - // } - - // * copy - mov eax,[esi] // read all 4 pixels - bswap eax - add esi,8 - mov edx,eax - - // 1st dword output { - shr eax,15 - and eax,0x1FE - mov cx,[ebx+eax] - ror cx,8 - shl ecx,16 - - mov eax,edx - shr eax,23 - and eax,0x1FE - mov cx,[ebx+eax] - ror cx,8 - - mov [edi],ecx - add edi,4 - // } - - // 2nd dword output { - mov eax,edx - shl eax,1 - and eax,0x1FE - mov cx,[ebx+eax] - ror cx,8 - shl ecx,16 - - shr edx,7 - and edx,0x1FE - mov cx,[ebx+edx] - ror cx,8 - - mov [edi],ecx - add edi,4 - // } - // * - - pop ecx - - dec ecx - jnz x_loop_2 - - add esi,[line] - add edi,[ext] - - pop ecx - dec ecx - jnz y_loop - -end_y_loop: - pop edi - pop esi - pop ebx - mov esp, ebp - pop ebp - ret - } + v7 = (uint32_t *)src; + v8 = (uint32_t *)dst; + v9 = height; + do + { + v25 = v9; + v10 = wid_64; + do + { + v11 = v10; + v12 = bswap32(*v7); + v13 = v7 + 1; + ALOWORD(v10) = __ROR__(*(uint16_t *)((char *)pal + ((v12 >> 15) & 0x1FE)), 8); + v14 = v10 << 16; + ALOWORD(v14) = __ROR__(*(uint16_t *)((char *)pal + ((v12 >> 23) & 0x1FE)), 8); + *v8 = v14; + v15 = v8 + 1; + ALOWORD(v14) = __ROR__(*(uint16_t *)((char *)pal + (2 * (uint16_t)v12 & 0x1FE)), 8); + v14 <<= 16; + ALOWORD(v14) = __ROR__(*(uint16_t *)((char *)pal + ((v12 >> 7) & 0x1FE)), 8); + *v15 = v14; + ++v15; + v16 = bswap32(*v13); + v7 = v13 + 1; + ALOWORD(v14) = __ROR__(*(uint16_t *)((char *)pal + ((v16 >> 15) & 0x1FE)), 8); + v14 <<= 16; + ALOWORD(v14) = __ROR__(*(uint16_t *)((char *)pal + ((v16 >> 23) & 0x1FE)), 8); + *v15 = v14; + ++v15; + ALOWORD(v14) = __ROR__(*(uint16_t *)((char *)pal + (2 * (uint16_t)v16 & 0x1FE)), 8); + v14 <<= 16; + ALOWORD(v14) = __ROR__(*(uint16_t *)((char *)pal + ((v16 >> 7) & 0x1FE)), 8); + *v15 = v14; + v8 = v15 + 1; + v10 = v11 - 1; + } + while ( v11 != 1 ); + if ( v25 == 1 ) + break; + v26 = v25 - 1; + v17 = (uint32_t *)((char *)v7 + line); + v18 = (uint32_t *)((char *)v8 + ext); + v19 = wid_64; + do + { + v20 = v19; + v21 = bswap32(v17[1]); + ALOWORD(v19) = __ROR__(*(uint16_t *)((char *)pal + ((v21 >> 15) & 0x1FE)), 8); + v22 = v19 << 16; + ALOWORD(v22) = __ROR__(*(uint16_t *)((char *)pal + ((v21 >> 23) & 0x1FE)), 8); + *v18 = v22; + v23 = v18 + 1; + ALOWORD(v22) = __ROR__(*(uint16_t *)((char *)pal + (2 * (uint16_t)v21 & 0x1FE)), 8); + v22 <<= 16; + ALOWORD(v22) = __ROR__(*(uint16_t *)((char *)pal + ((v21 >> 7) & 0x1FE)), 8); + *v23 = v22; + ++v23; + v24 = bswap32(*v17); + v17 += 2; + ALOWORD(v22) = __ROR__(*(uint16_t *)((char *)pal + ((v24 >> 15) & 0x1FE)), 8); + v22 <<= 16; + ALOWORD(v22) = __ROR__(*(uint16_t *)((char *)pal + ((v24 >> 23) & 0x1FE)), 8); + *v23 = v22; + ++v23; + ALOWORD(v22) = __ROR__(*(uint16_t *)((char *)pal + (2 * (uint16_t)v24 & 0x1FE)), 8); + v22 <<= 16; + ALOWORD(v22) = __ROR__(*(uint16_t *)((char *)pal + ((v24 >> 7) & 0x1FE)), 8); + *v23 = v22; + v18 = v23 + 1; + v19 = v20 - 1; + } + while ( v20 != 1 ); + v7 = (uint32_t *)((char *)v17 + line); + v8 = (uint32_t *)((char *)v18 + ext); + v9 = v26 - 1; + } + while ( v26 != 1 ); } -extern "C" void __declspec(naked) asmLoad8bIA4 (wxUIntPtr src, wxUIntPtr dst, int wid_64, int height, int line, int ext) +static inline void load8bIA4(uint8_t *src, uint8_t *dst, int wid_64, int height, int line, int ext) { - _asm { - push ebp - mov ebp, esp - push ebx - push esi - push edi + uint32_t *v6; + uint32_t *v7; + int v8; + int v9; + uint32_t v10; + uint32_t v11; + uint32_t *v12; + uint32_t *v13; + uint32_t v14; + uint32_t v15; + uint32_t *v16; + uint32_t *v17; + int v18; + uint32_t *v19; + uint32_t v20; + int v21; + int v22; - mov esi,[src] - mov edi,[dst] - mov ecx,[height] -y_loop: - push ecx - mov ecx,[wid_64] -x_loop: - mov eax,[esi] // read all 4 pixels - mov edx,eax - - shr eax,4 //all alpha - shl edx,4 - and eax,0x0F0F0F0F - and edx,0xF0F0F0F0 - add esi,4 - or eax,edx - - mov [edi],eax // save dword - add edi,4 - - mov eax,[esi] // read all 4 pixels - mov edx,eax - - shr eax,4 //all alpha - shl edx,4 - and eax,0x0F0F0F0F - and edx,0xF0F0F0F0 - add esi,4 - or eax,edx - - mov [edi],eax // save dword - add edi,4 - // * - - dec ecx - jnz x_loop - - pop ecx - dec ecx - jz end_y_loop - push ecx - - add esi,[line] - add edi,[ext] - - mov ecx,[wid_64] -x_loop_2: - mov eax,[esi+4] // read both pixels - mov edx,eax - - shr eax,4 //all alpha - shl edx,4 - and eax,0x0F0F0F0F - and edx,0xF0F0F0F0 - or eax,edx - - mov [edi],eax //save dword - add edi,4 - - mov eax,[esi] // read both pixels - add esi,8 - mov edx,eax - - shr eax,4 //all alpha - shl edx,4 - and eax,0x0F0F0F0F - and edx,0xF0F0F0F0 - or eax,edx - - mov [edi],eax //save dword - add edi,4 - // * - - dec ecx - jnz x_loop_2 - - add esi,[line] - add edi,[ext] - - pop ecx - dec ecx - jnz y_loop - -end_y_loop: - pop edi - pop esi - pop ebx - mov esp, ebp - pop ebp - ret - } + v6 = (uint32_t *)src; + v7 = (uint32_t *)dst; + v8 = height; + do + { + v21 = v8; + v9 = wid_64; + do + { + v10 = *v6; + v11 = (*v6 >> 4) & 0xF0F0F0F; + v12 = v6 + 1; + *v7 = (16 * v10 & 0xF0F0F0F0) | v11; + v13 = v7 + 1; + v14 = (*v12 >> 4) & 0xF0F0F0F; + v15 = 16 * *v12 & 0xF0F0F0F0; + v6 = v12 + 1; + *v13 = v15 | v14; + v7 = v13 + 1; + --v9; + } + while ( v9 ); + if ( v21 == 1 ) + break; + v22 = v21 - 1; + v16 = (uint32_t *)((char *)v6 + line); + v17 = (uint32_t *)((char *)v7 + ext); + v18 = wid_64; + do + { + *v17 = (16 * v16[1] & 0xF0F0F0F0) | ((v16[1] >> 4) & 0xF0F0F0F); + v19 = v17 + 1; + v20 = *v16; + v16 += 2; + *v19 = (16 * v20 & 0xF0F0F0F0) | ((v20 >> 4) & 0xF0F0F0F); + v17 = v19 + 1; + --v18; + } + while ( v18 ); + v6 = (uint32_t *)((char *)v16 + line); + v7 = (uint32_t *)((char *)v17 + ext); + v8 = v22 - 1; + } + while ( v22 != 1 ); } -extern "C" void __declspec(naked) asmLoad8bI (wxUIntPtr src, int dst, wxUIntPtr wid_64, int height, int line, int ext) +static inline void load8bI(uint8_t *src, uint8_t *dst, int wid_64, int height, int line, int ext) { - _asm { - push ebp - mov ebp, esp - push ebx - push esi - push edi + uint32_t *v6; + uint32_t *v7; + int v8; + int v9; + uint32_t v10; + uint32_t *v11; + uint32_t *v12; + uint32_t v13; + uint32_t *v14; + uint32_t *v15; + int v16; + uint32_t *v17; + uint32_t v18; + int v19; + int v20; - mov esi,[src] - mov edi,[dst] - mov ecx,[height] -y_loop: - push ecx - mov ecx,[wid_64] -x_loop: - mov eax,[esi] // read all 4 pixels - add esi,4 - - mov [edi],eax // save dword - add edi,4 - - mov eax,[esi] // read all 4 pixels - add esi,4 - - mov [edi],eax // save dword - add edi,4 - // * - - dec ecx - jnz x_loop - - pop ecx - dec ecx - jz end_y_loop - push ecx - - add esi,[line] - add edi,[ext] - - mov ecx,[wid_64] -x_loop_2: - mov eax,[esi+4] // read both pixels - - mov [edi],eax //save dword - add edi,4 - - mov eax,[esi] // read both pixels - add esi,8 - - mov [edi],eax //save dword - add edi,4 - // * - - dec ecx - jnz x_loop_2 - - add esi,[line] - add edi,[ext] - - pop ecx - dec ecx - jnz y_loop - -end_y_loop: - pop edi - pop esi - pop ebx - mov esp, ebp - pop ebp - ret - } + v6 = (uint32_t *)src; + v7 = (uint32_t *)dst; + v8 = height; + do + { + v19 = v8; + v9 = wid_64; + do + { + v10 = *v6; + v11 = v6 + 1; + *v7 = v10; + v12 = v7 + 1; + v13 = *v11; + v6 = v11 + 1; + *v12 = v13; + v7 = v12 + 1; + --v9; + } + while ( v9 ); + if ( v19 == 1 ) + break; + v20 = v19 - 1; + v14 = (uint32_t *)((char *)v6 + line); + v15 = (uint32_t *)((char *)v7 + ext); + v16 = wid_64; + do + { + *v15 = v14[1]; + v17 = v15 + 1; + v18 = *v14; + v14 += 2; + *v17 = v18; + v15 = v17 + 1; + --v16; + } + while ( v16 ); + v6 = (uint32_t *)((char *)v14 + line); + v7 = (uint32_t *)((char *)v15 + ext); + v8 = v20 - 1; + } + while ( v20 != 1 ); } //**************************************************************** @@ -671,21 +378,21 @@ wxUint32 Load8bCI (wxUIntPtr dst, wxUIntPtr src, int wid_64, int height, int lin if (wid_64 < 1) wid_64 = 1; if (height < 1) height = 1; int ext = (real_width - (wid_64 << 3)); - wxUIntPtr pal = wxPtrToUInt(rdp.pal_8); + unsigned short * pal = rdp.pal_8; switch (rdp.tlut_mode) { case 0: //palette is not used - //in tlut DISABLE mode load CI texture as plain intensity texture instead of palette dereference. + //in tlut DISABLE mode load CI texture as plain intensity texture instead of palette dereference. //Thanks to angrylion for the advice - asmLoad8bI (src, dst, wid_64, height, line, ext); + load8bI ((uint8_t *)src, (uint8_t *)dst, wid_64, height, line, ext); return /*(0 << 16) | */GR_TEXFMT_ALPHA_8; case 2: //color palette ext <<= 1; - asmLoad8bCI (src, dst, wid_64, height, line, ext, pal); + load8bCI ((uint8_t *)src, (uint8_t *)dst, wid_64, height, line, ext, pal); return (1 << 16) | GR_TEXFMT_ARGB_1555; default: //IA palette ext <<= 1; - asmLoad8bIA8 (src, dst, wid_64, height, line, ext, pal); + load8bIA8 ((uint8_t *)src, (uint8_t *)dst, wid_64, height, line, ext, pal); return (1 << 16) | GR_TEXFMT_ALPHA_INTENSITY_88; } } @@ -694,30 +401,30 @@ wxUint32 Load8bCI (wxUIntPtr dst, wxUIntPtr src, int wid_64, int height, int lin // Size: 1, Format: 3 // -wxUint32 Load8bIA (wxUIntPtr dst, wxUIntPtr src, int wid_64, int height, int line, int real_width, int tile) -{ +wxUint32 Load8bIA (wxUIntPtr dst, wxUIntPtr src, int wid_64, int height, int line, int real_width, int tile) +{ if (rdp.tlut_mode != 0) return Load8bCI (dst, src, wid_64, height, line, real_width, tile); - if (wid_64 < 1) wid_64 = 1; - if (height < 1) height = 1; - int ext = (real_width - (wid_64 << 3)); - asmLoad8bIA4 (src, dst, wid_64, height, line, ext); - return /*(0 << 16) | */GR_TEXFMT_ALPHA_INTENSITY_44; + if (wid_64 < 1) wid_64 = 1; + if (height < 1) height = 1; + int ext = (real_width - (wid_64 << 3)); + load8bIA4 ((uint8_t *)src, (uint8_t *)dst, wid_64, height, line, ext); + return /*(0 << 16) | */GR_TEXFMT_ALPHA_INTENSITY_44; } //**************************************************************** // Size: 1, Format: 4 // -wxUint32 Load8bI (wxUIntPtr dst, wxUIntPtr src, int wid_64, int height, int line, int real_width, int tile) -{ +wxUint32 Load8bI (wxUIntPtr dst, wxUIntPtr src, int wid_64, int height, int line, int real_width, int tile) +{ if (rdp.tlut_mode != 0) return Load8bCI (dst, src, wid_64, height, line, real_width, tile); - if (wid_64 < 1) wid_64 = 1; - if (height < 1) height = 1; - int ext = (real_width - (wid_64 << 3)); - asmLoad8bI (src, dst, wid_64, height, line, ext); - return /*(0 << 16) | */GR_TEXFMT_ALPHA_8; + if (wid_64 < 1) wid_64 = 1; + if (height < 1) height = 1; + int ext = (real_width - (wid_64 << 3)); + load8bI ((uint8_t *)src, (uint8_t *)dst, wid_64, height, line, ext); + return /*(0 << 16) | */GR_TEXFMT_ALPHA_8; }