From 57cf72039269d8b88dda0b6073dd1d01b8478191 Mon Sep 17 00:00:00 2001
From: zilmar <zilmar@pj64-emu.com>
Date: Wed, 21 Oct 2015 07:49:29 +1100
Subject: [PATCH] [Glide64] Sync texture code

---
 Source/Glide64/TexLoad16b.h |  272 ++--
 Source/Glide64/TexLoad4b.h  | 2434 ++++++++---------------------------
 Source/Glide64/TexLoad8b.h  |  963 +++++---------
 3 files changed, 1001 insertions(+), 2668 deletions(-)

diff --git a/Source/Glide64/TexLoad16b.h b/Source/Glide64/TexLoad16b.h
index 53acdea91..00a2f334b 100644
--- a/Source/Glide64/TexLoad16b.h
+++ b/Source/Glide64/TexLoad16b.h
@@ -37,165 +37,131 @@
 //
 //****************************************************************
 
-extern "C" void __declspec(naked) asmLoad16bRGBA (wxUIntPtr src, wxUIntPtr dst, int wid_64, int height, int line, int ext)
+static inline void load16bRGBA(uint8_t *src, uint8_t *dst, int wid_64, int height, int line, int ext) // C replacement for asmLoad16bRGBA: per row, converts wid_64 groups of two 32-bit words from src into dst
 {
-	_asm {
-		align 4
-		push        ebp  
-		mov         ebp,esp 
-        push ebx
-        push esi
-        push edi
+  uint32_t *v6;  // source cursor, first row of each row pair
+  uint32_t *v7;  // destination cursor, first row of each row pair
+  int v8;        // rows still to process, reloaded into v17 at the top of each pass
+  int v9;        // groups left in the first row
+  uint32_t v10;  // first word of the current group (first row)
+  uint32_t v11;  // second word of the current group (first row)
+  uint32_t *v12; // source cursor, second row of the pair
+  uint32_t *v13; // destination cursor, second row of the pair
+  int v14;       // groups left in the second row
+  uint32_t v15;  // word read from v12[1] (second row)
+  uint32_t v16;  // word read from v12[0] (second row)
+  int v17;       // row count on entry to the current pass
+  int v18;       // row count after the first row of the pass
 
-        mov esi,[src]
-        mov edi,[dst]
-        mov ecx,[height]
-y_loop:
-        push ecx
-        mov ecx,[wid_64]
-x_loop:
-        mov eax,[esi]   // read both pixels
-        mov ebx,[esi+4] // read both pixels
-        bswap eax
-        bswap ebx
-
-        ror ax,1
-        ror bx,1
-        ror eax,16
-        ror ebx,16
-        ror ax,1
-        ror bx,1
-
-        mov  [edi],eax
-        mov  [edi+4],ebx
-        add esi,8
-        add edi,8
-
-        dec ecx
-        jnz x_loop
-
-        pop ecx
-        dec ecx
-        jz end_y_loop
-        push ecx
-
-        mov eax,esi
-        add eax,[line]
-        mov esi,[src]
-        sub eax, esi
-        and eax, 0xFFF
-        add esi, eax
-        add edi,[ext]
-
-        mov ecx,[wid_64]
-x_loop_2:
-        mov eax,[esi+4] // read both pixels
-        mov ebx,[esi]   // read both pixels
-        bswap eax
-        bswap ebx
-
-        ror ax,1
-        ror bx,1
-        ror eax,16
-        ror ebx,16
-        ror ax,1
-        ror bx,1
-
-        mov [edi],eax
-        mov [edi+4],ebx
-        add esi,8
-        add edi,8
-
-        dec ecx
-        jnz x_loop_2
-
-        mov eax,esi
-        add eax,[line]
-        mov esi,[src]
-        sub eax, esi
-        and eax, 0xFFF
-        add esi, eax
-        add edi,[ext]
-
-        pop ecx
-        dec ecx
-        jnz y_loop
-
-end_y_loop:
-        pop edi
-        pop esi
-        pop ebx
-		mov esp, ebp
-		pop ebp
-		ret
-	}
+  v6 = (uint32_t *)src;
+  v7 = (uint32_t *)dst;
+  v8 = height;
+  do // each pass converts up to two rows; every loop tests after its body, so height and wid_64 must be >= 1
+  {
+    v17 = v8;
+    v9 = wid_64;
+    do // first row of the pair: the two words of a group are stored in source order
+    {
+      v10 = bswap32(*v6); // reverse the byte order of each word (asm: bswap)
+      v11 = bswap32(v6[1]);
+      ALOWORD(v10) = __ROR__((uint16_t)v10, 1); // asm: ror ax,1 -- presumably ALOWORD addresses the low 16 bits; macro is defined elsewhere, confirm
+      ALOWORD(v11) = __ROR__((uint16_t)v11, 1);
+      v10 = __ROR__(v10, 16); // asm: ror eax,16 -- exchange the two 16-bit halves
+      v11 = __ROR__(v11, 16);
+      ALOWORD(v10) = __ROR__((uint16_t)v10, 1); // same one-bit rotate, now applied to the other half
+      ALOWORD(v11) = __ROR__((uint16_t)v11, 1);
+      *v7 = v10;
+      v7[1] = v11;
+      v6 += 2; // 8 bytes consumed
+      v7 += 2; // 8 bytes produced
+      --v9;
+    }
+    while ( v9 );
+    if ( v17 == 1 ) // the row just written was the last one
+      break;
+    v18 = v17 - 1;
+    v12 = (uint32_t *)&src[(line + (uintptr_t)v6 - (uintptr_t)src) & 0xFFF]; // step by line bytes; the byte offset from src is wrapped to 0..0xFFF
+    v13 = (uint32_t *)((char *)v7 + ext); // step the destination by ext bytes
+    v14 = wid_64;
+    do // second row of the pair: same conversion, but the two words of each group are swapped
+    {
+      v15 = bswap32(v12[1]); // word at +4 is read first and stored first
+      v16 = bswap32(*v12);
+      ALOWORD(v15) = __ROR__((uint16_t)v15, 1);
+      ALOWORD(v16) = __ROR__((uint16_t)v16, 1);
+      v15 = __ROR__(v15, 16);
+      v16 = __ROR__(v16, 16);
+      ALOWORD(v15) = __ROR__((uint16_t)v15, 1);
+      ALOWORD(v16) = __ROR__((uint16_t)v16, 1);
+      *v13 = v15;
+      v13[1] = v16;
+      v12 += 2;
+      v13 += 2;
+      --v14;
+    }
+    while ( v14 );
+    v6 = (uint32_t *)&src[(line + (uintptr_t)v12 - (uintptr_t)src) & 0xFFF]; // same wrapped step back onto the next first row
+    v7 = (uint32_t *)((char *)v13 + ext);
+    v8 = v18 - 1;
+  }
+  while ( v18 != 1 ); // v18 == 1 means the second row just written was the last
 }
 
-extern "C" void  __declspec(naked) asmLoad16bIA (wxUIntPtr src, wxUIntPtr dst, int wid_64, int height, int line, int ext)
+static inline void load16bIA(uint8_t *src, uint8_t *dst, int wid_64, int height, int line, int ext) // C replacement for asmLoad16bIA: per row, copies wid_64 groups of two 32-bit words from src to dst without modifying them
 {
-	_asm {
-		ALIGN 4
+  uint32_t *v6;  // source cursor, first row of each row pair
+  uint32_t *v7;  // destination cursor, first row of each row pair
+  int v8;        // rows still to process, reloaded into v15 at the top of each pass
+  int v9;        // groups left in the first row
+  uint32_t v10;  // second word of the current group (first row)
+  uint32_t *v11; // source cursor, second row of the pair
+  uint32_t *v12; // destination cursor, second row of the pair
+  int v13;       // groups left in the second row
+  uint32_t v14;  // first word of the current group (second row)
+  int v15;       // row count on entry to the current pass
+  int v16;       // row count after the first row of the pass
 
-		push ebp
-		mov ebp, esp
-		push ebx
-        push esi
-        push edi
-
-        mov esi,[src]
-        mov edi,[dst]
-        mov ecx,[height]
-y_loop:
-        push ecx
-        mov ecx,[wid_64]
-x_loop:
-        mov eax,[esi]   // read both pixels
-        mov ebx,[esi+4] // read both pixels
-        mov [edi],eax
-        mov [edi+4],ebx
-        add esi,8
-        add edi,8
-
-        dec ecx
-        jnz x_loop
-
-        pop ecx
-        dec ecx
-        jz end_y_loop
-        push ecx
-
-        add esi,[line]
-        add edi,[ext]
-
-        mov ecx,[wid_64]
-x_loop_2:
-        mov eax,[esi+4] // read both pixels
-        mov ebx,[esi]   // read both pixels
-        mov [edi],eax
-        mov [edi+4],ebx
-        add esi,8
-        add edi,8
-
-        dec ecx
-        jnz x_loop_2
-
-        add esi,[line]
-        add edi,[ext]
-
-        pop ecx
-        dec ecx
-        jnz y_loop
-
-end_y_loop:
-        pop edi
-        pop esi
-        pop ebx
-		mov esp, ebp
-		pop ebp
-		ret
-	}
+  v6 = (uint32_t *)src;
+  v7 = (uint32_t *)dst;
+  v8 = height;
+  do // each pass copies up to two rows; every loop tests after its body, so height and wid_64 must be >= 1
+  {
+    v15 = v8;
+    v9 = wid_64;
+    do // first row of the pair: straight copy, words kept in source order
+    {
+      v10 = v6[1];
+      *v7 = *v6;
+      v7[1] = v10;
+      v6 += 2; // 8 bytes consumed
+      v7 += 2; // 8 bytes produced
+      --v9;
+    }
+    while ( v9 );
+    if ( v15 == 1 ) // the row just written was the last one
+      break;
+    v16 = v15 - 1;
+    v11 = (uint32_t *)((char *)v6 + line); // step the source by line bytes; no wrap mask is applied in this loader
+    v12 = (uint32_t *)((char *)v7 + ext); // step the destination by ext bytes
+    v13 = wid_64;
+    do // second row of the pair: the two words of each group are swapped while copying
+    {
+      v14 = *v11;
+      *v12 = v11[1]; // word at +4 is stored first
+      v12[1] = v14;
+      v11 += 2;
+      v12 += 2;
+      --v13;
+    }
+    while ( v13 );
+    v6 = (uint32_t *)((char *)v11 + line);
+    v7 = (uint32_t *)((char *)v12 + ext);
+    v8 = v16 - 1;
+  }
+  while ( v16 != 1 ); // v16 == 1 means the second row just written was the last
 }
 
+
 //****************************************************************
 // Size: 2, Format: 0
 //
@@ -206,7 +172,7 @@ wxUint32 Load16bRGBA (wxUIntPtr dst, wxUIntPtr src, int wid_64, int height, int
   if (height < 1) height = 1;
   int ext = (real_width - (wid_64 << 2)) << 1;
 
-  asmLoad16bRGBA(src, dst, wid_64, height, line, ext);
+  load16bRGBA((uint8_t *)src, (uint8_t *)dst, wid_64, height, line, ext);
 
   return (1 << 16) | GR_TEXFMT_ARGB_1555;
 }
@@ -221,7 +187,7 @@ wxUint32 Load16bIA (wxUIntPtr dst, wxUIntPtr src, int wid_64, int height, int li
   if (height < 1) height = 1;
   int ext = (real_width - (wid_64 << 2)) << 1;
 
-  asmLoad16bIA(src, dst, wid_64, height, line, ext);
+  load16bIA((uint8_t *)src, (uint8_t *)dst, wid_64, height, line, ext);
 
   return (1 << 16) | GR_TEXFMT_ALPHA_INTENSITY_88;
 }
diff --git a/Source/Glide64/TexLoad4b.h b/Source/Glide64/TexLoad4b.h
index 3ed5438a1..9b70874b4 100644
--- a/Source/Glide64/TexLoad4b.h
+++ b/Source/Glide64/TexLoad4b.h
@@ -37,1897 +37,554 @@
 //
 //****************************************************************
 
-extern "C" void __declspec(naked) asmLoad4bCI (wxUIntPtr src, wxUIntPtr dst, int wid_64, int height, int line, int ext, wxUIntPtr pal)
+static inline void load4bCI(uint8_t *src, uint8_t *dst, int wid_64, int height, uint16_t line, int ext, uint16_t *pal) // C replacement for asmLoad4bCI: expands each 4-bit index in src to the 16-bit pal entry rotated right by 1; line is uint16_t here (int in the asm) but is only used under the 0x7FF mask
 {
-	_asm {
-		push ebp
-		mov ebp, esp
-        push ebx
-        push esi
-        push edi
+  uint8_t *v7;   // source cursor, first row of each row pair
+  uint8_t *v8;   // destination cursor, first row of each row pair
+  int v9;        // rows still to process, reloaded into v25 at the top of each pass
+  int v10;       // groups left in the first row; its low word doubles as scratch, see v11
+  int v11;       // saved copy of v10 taken before the scratch write
+  uint32_t v12;  // first byte-swapped source word of the group (eight 4-bit indices)
+  uint8_t *v13;  // source position of the second word of the group
+  uint32_t v14;  // output dword being assembled (two 16-bit entries)
+  uint32_t *v15; // destination cursor inside the current group
+  uint32_t v16;  // second byte-swapped source word of the group
+  uint8_t *v17;  // source cursor, second row of the pair
+  uint32_t *v18; // destination cursor, second row of the pair
+  int v19;       // groups left in the second row; low word doubles as scratch, see v20
+  int v20;       // saved copy of v19 taken before the scratch write
+  uint32_t v21;  // word read from offset +4 (second row)
+  uint32_t v22;  // output dword being assembled (second row)
+  uint32_t *v23; // destination cursor inside the current group (second row)
+  uint32_t v24;  // word read from offset +0 (second row)
+  int v25;       // row count on entry to the current pass
+  int v26;       // row count after the first row of the pass
 
-        mov ebx,[pal]
-        mov esi,[src]
-        mov edi,[dst]
-        mov ecx,[height]
-y_loop:
-        push ecx
-        mov ecx,[wid_64]
-x_loop:
-        push ecx
-
-        mov eax,[esi]           // read all 8 pixels
-        bswap eax
-        add esi,4
-        mov edx,eax
-
-        // 1st dword output {
-        shr eax,23
-        and eax,0x1E
-        mov cx,[ebx+eax]
-        ror cx,1
-        shl ecx,16
-
-        mov eax,edx
-        shr eax,27
-        and eax,0x1E
-        mov cx,[ebx+eax]
-        ror cx,1
-
-        mov [edi],ecx
-        add edi,4
-        // }
-
-        // 2nd dword output {
-        mov eax,edx
-        shr eax,15
-        and eax,0x1E
-        mov cx,[ebx+eax]
-        ror cx,1
-        shl ecx,16
-
-        mov eax,edx
-        shr eax,19
-        and eax,0x1E
-        mov cx,[ebx+eax]
-        ror cx,1
-
-        mov [edi],ecx
-        add edi,4
-        // }
-
-        // 3rd dword output {
-        mov eax,edx
-        shr eax,7
-        and eax,0x1E
-        mov cx,[ebx+eax]
-        ror cx,1
-        shl ecx,16
-
-        mov eax,edx
-        shr eax,11
-        and eax,0x1E
-        mov cx,[ebx+eax]
-        ror cx,1
-
-        mov [edi],ecx
-        add edi,4
-        // }
-
-        // 4th dword output {
-        mov eax,edx
-        shl eax,1
-        and eax,0x1E
-        mov cx,[ebx+eax]
-        ror cx,1
-        shl ecx,16
-
-        shr edx,3
-        and edx,0x1E
-        mov cx,[ebx+edx]
-        ror cx,1
-
-        mov [edi],ecx
-        add edi,4
-        // }
-
-        // * copy
-        mov eax,[esi]           // read all 8 pixels
-        bswap eax
-        add esi,4
-        mov edx,eax
-
-        // 1st dword output {
-        shr eax,23
-        and eax,0x1E
-        mov cx,[ebx+eax]
-        ror cx,1
-        shl ecx,16
-
-        mov eax,edx
-        shr eax,27
-        and eax,0x1E
-        mov cx,[ebx+eax]
-        ror cx,1
-
-        mov [edi],ecx
-        add edi,4
-        // }
-
-        // 2nd dword output {
-        mov eax,edx
-        shr eax,15
-        and eax,0x1E
-        mov cx,[ebx+eax]
-        ror cx,1
-        shl ecx,16
-
-        mov eax,edx
-        shr eax,19
-        and eax,0x1E
-        mov cx,[ebx+eax]
-        ror cx,1
-
-        mov [edi],ecx
-        add edi,4
-        // }
-
-        // 3rd dword output {
-        mov eax,edx
-        shr eax,7
-        and eax,0x1E
-        mov cx,[ebx+eax]
-        ror cx,1
-        shl ecx,16
-
-        mov eax,edx
-        shr eax,11
-        and eax,0x1E
-        mov cx,[ebx+eax]
-        ror cx,1
-
-        mov [edi],ecx
-        add edi,4
-        // }
-
-        // 4th dword output {
-        mov eax,edx
-        shl eax,1
-        and eax,0x1E
-        mov cx,[ebx+eax]
-        ror cx,1
-        shl ecx,16
-
-        shr edx,3
-        and edx,0x1E
-        mov cx,[ebx+edx]
-        ror cx,1
-
-        mov [edi],ecx
-        add edi,4
-        // }
-        // *
-
-        pop ecx
-
-        dec ecx
-        jnz x_loop
-
-        pop ecx
-        dec ecx
-        jz near end_y_loop
-        push ecx
-
-        mov eax,esi
-        add eax,[line]
-        mov esi,[src]
-        sub eax,esi
-        and eax,0x7FF
-        add esi,eax
-        add edi,[ext]
-
-        mov ecx,[wid_64]
- x_loop_2:
-        push ecx
-
-        mov eax,[esi+4]         // read all 8 pixels
-        bswap eax
-        mov edx,eax
-
-        // 1st dword output {
-        shr eax,23
-        and eax,0x1E
-        mov cx,[ebx+eax]
-        ror cx,1
-        shl ecx,16
-
-        mov eax,edx
-        shr eax,27
-        and eax,0x1E
-        mov cx,[ebx+eax]
-        ror cx,1
-
-        mov [edi],ecx
-        add edi,4
-        // }
-
-        // 2nd dword output {
-        mov eax,edx
-        shr eax,15
-        and eax,0x1E
-        mov cx,[ebx+eax]
-        ror cx,1
-        shl ecx,16
-
-        mov eax,edx
-        shr eax,19
-        and eax,0x1E
-        mov cx,[ebx+eax]
-        ror cx,1
-
-        mov [edi],ecx
-        add edi,4
-        // }
-
-        // 3rd dword output {
-        mov eax,edx
-        shr eax,7
-        and eax,0x1E
-        mov cx,[ebx+eax]
-        ror cx,1
-        shl ecx,16
-
-        mov eax,edx
-        shr eax,11
-        and eax,0x1E
-        mov cx,[ebx+eax]
-        ror cx,1
-
-        mov [edi],ecx
-        add edi,4
-        // }
-
-        // 4th dword output {
-        mov eax,edx
-        shl eax,1
-        and eax,0x1E
-        mov cx,[ebx+eax]
-        ror cx,1
-        shl ecx,16
-
-        shr edx,3
-        and edx,0x1E
-        mov cx,[ebx+edx]
-        ror cx,1
-
-        mov [edi],ecx
-        add edi,4
-        // }
-
-        // * copy
-        mov eax,[esi]           // read all 8 pixels
-        bswap eax
-        mov edx,esi
-        add edx,8
-        mov esi,[src]
-        sub edx,esi
-        and edx,0x7FF
-        add esi,edx
-        mov edx,eax
-
-        // 1st dword output {
-        shr eax,23
-        and eax,0x1E
-        mov cx,[ebx+eax]
-        ror cx,1
-        shl ecx,16
-
-        mov eax,edx
-        shr eax,27
-        and eax,0x1E
-        mov cx,[ebx+eax]
-        ror cx,1
-
-        mov [edi],ecx
-        add edi,4
-        // }
-
-        // 2nd dword output {
-        mov eax,edx
-        shr eax,15
-        and eax,0x1E
-        mov cx,[ebx+eax]
-        ror cx,1
-        shl ecx,16
-
-        mov eax,edx
-        shr eax,19
-        and eax,0x1E
-        mov cx,[ebx+eax]
-        ror cx,1
-
-        mov [edi],ecx
-        add edi,4
-        // }
-
-        // 3rd dword output {
-        mov eax,edx
-        shr eax,7
-        and eax,0x1E
-        mov cx,[ebx+eax]
-        ror cx,1
-        shl ecx,16
-
-        mov eax,edx
-        shr eax,11
-        and eax,0x1E
-        mov cx,[ebx+eax]
-        ror cx,1
-
-        mov [edi],ecx
-        add edi,4
-        // }
-
-        // 4th dword output {
-        mov eax,edx
-        shl eax,1
-        and eax,0x1E
-        mov cx,[ebx+eax]
-        ror cx,1
-        shl ecx,16
-
-        shr edx,3
-        and edx,0x1E
-        mov cx,[ebx+edx]
-        ror cx,1
-
-        mov [edi],ecx
-        add edi,4
-        // }
-        // *
-
-        pop ecx
-
-        dec ecx
-        jnz x_loop_2
-
-        mov eax,esi
-        add eax,[line]
-        mov esi,[src]
-        sub eax,esi
-        and eax,0x7FF
-        add esi,eax
-        add edi,[ext]
-
-        pop ecx
-        dec ecx
-        jnz y_loop
-
-end_y_loop:
-        pop edi
-        pop esi
-        pop ebx
-		mov esp, ebp
-		pop ebp
-		ret
-	}
+  v7 = src;
+  v8 = dst;
+  v9 = height;
+  do // each pass expands up to two rows; every loop tests after its body, so height and wid_64 must be >= 1
+  {
+    v25 = v9;
+    v10 = wid_64;
+    do // first row of the pair: 8 source bytes (16 indices) become 32 destination bytes per group
+    {
+      v11 = v10; // keep the real count: the low word of v10 is overwritten as scratch on the next lines (asm used cx)
+      v12 = bswap32(*(uint32_t *)v7); // byte-swap so the first source byte lands in bits 31..24
+      v13 = v7 + 4;
+      ALOWORD(v10) = __ROR__(*(uint16_t *)((char *)pal + ((v12 >> 23) & 0x1E)), 1); // bits 27..24 of v12, doubled into a byte offset into pal; presumably ALOWORD addresses the low 16 bits -- macro defined elsewhere, confirm
+      v14 = v10 << 16; // that entry moves to the high half of the output dword
+      ALOWORD(v14) = __ROR__(*(uint16_t *)((char *)pal + ((v12 >> 27) & 0x1E)), 1); // bits 31..28 of v12 select the low-half entry
+      *(uint32_t *)v8 = v14;
+      v15 = (uint32_t *)(v8 + 4);
+      ALOWORD(v14) = __ROR__(*(uint16_t *)((char *)pal + ((v12 >> 15) & 0x1E)), 1); // bits 19..16
+      v14 <<= 16;
+      ALOWORD(v14) = __ROR__(*(uint16_t *)((char *)pal + ((v12 >> 19) & 0x1E)), 1); // bits 23..20
+      *v15 = v14;
+      ++v15;
+      ALOWORD(v14) = __ROR__(*(uint16_t *)((char *)pal + ((v12 >> 7) & 0x1E)), 1); // bits 11..8
+      v14 <<= 16;
+      ALOWORD(v14) = __ROR__(*(uint16_t *)((char *)pal + ((v12 >> 11) & 0x1E)), 1); // bits 15..12
+      *v15 = v14;
+      ++v15;
+      ALOWORD(v14) = __ROR__(*(uint16_t *)((char *)pal + (2 * (uint8_t)v12 & 0x1E)), 1); // bits 3..0 (asm: shl eax,1 / and eax,0x1E)
+      v14 <<= 16;
+      ALOWORD(v14) = __ROR__(*(uint16_t *)((char *)pal + ((v12 >> 3) & 0x1E)), 1); // bits 7..4
+      *v15 = v14;
+      ++v15;
+      v16 = bswap32(*(uint32_t *)v13); // second word of the group, expanded the same way
+      v7 = v13 + 4;
+      ALOWORD(v14) = __ROR__(*(uint16_t *)((char *)pal + ((v16 >> 23) & 0x1E)), 1);
+      v14 <<= 16;
+      ALOWORD(v14) = __ROR__(*(uint16_t *)((char *)pal + ((v16 >> 27) & 0x1E)), 1);
+      *v15 = v14;
+      ++v15;
+      ALOWORD(v14) = __ROR__(*(uint16_t *)((char *)pal + ((v16 >> 15) & 0x1E)), 1);
+      v14 <<= 16;
+      ALOWORD(v14) = __ROR__(*(uint16_t *)((char *)pal + ((v16 >> 19) & 0x1E)), 1);
+      *v15 = v14;
+      ++v15;
+      ALOWORD(v14) = __ROR__(*(uint16_t *)((char *)pal + ((v16 >> 7) & 0x1E)), 1);
+      v14 <<= 16;
+      ALOWORD(v14) = __ROR__(*(uint16_t *)((char *)pal + ((v16 >> 11) & 0x1E)), 1);
+      *v15 = v14;
+      ++v15;
+      ALOWORD(v14) = __ROR__(*(uint16_t *)((char *)pal + (2 * (uint8_t)v16 & 0x1E)), 1);
+      v14 <<= 16;
+      ALOWORD(v14) = __ROR__(*(uint16_t *)((char *)pal + ((v16 >> 3) & 0x1E)), 1);
+      *v15 = v14;
+      v8 = (uint8_t *)(v15 + 1);
+      v10 = v11 - 1; // restore the count from the saved copy and decrement it
+    }
+    while ( v11 != 1 );
+    if ( v25 == 1 ) // the row just written was the last one
+      break;
+    v26 = v25 - 1;
+    v17 = &src[(line + (uintptr_t)v7 - (uintptr_t)src) & 0x7FF]; // step by line bytes; the byte offset from src is wrapped to 0..0x7FF
+    v18 = (uint32_t *)&v8[ext]; // step the destination by ext bytes
+    v19 = wid_64;
+    do // second row of the pair: same expansion, but the word at +4 is processed before the word at +0
+    {
+      v20 = v19; // keep the real count: the low word of v19 is overwritten as scratch below
+      v21 = bswap32(*((uint32_t *)v17 + 1)); // word at byte offset +4 first
+      ALOWORD(v19) = __ROR__(*(uint16_t *)((char *)pal + ((v21 >> 23) & 0x1E)), 1);
+      v22 = v19 << 16;
+      ALOWORD(v22) = __ROR__(*(uint16_t *)((char *)pal + ((v21 >> 27) & 0x1E)), 1);
+      *v18 = v22;
+      v23 = v18 + 1;
+      ALOWORD(v22) = __ROR__(*(uint16_t *)((char *)pal + ((v21 >> 15) & 0x1E)), 1);
+      v22 <<= 16;
+      ALOWORD(v22) = __ROR__(*(uint16_t *)((char *)pal + ((v21 >> 19) & 0x1E)), 1);
+      *v23 = v22;
+      ++v23;
+      ALOWORD(v22) = __ROR__(*(uint16_t *)((char *)pal + ((v21 >> 7) & 0x1E)), 1);
+      v22 <<= 16;
+      ALOWORD(v22) = __ROR__(*(uint16_t *)((char *)pal + ((v21 >> 11) & 0x1E)), 1);
+      *v23 = v22;
+      ++v23;
+      ALOWORD(v22) = __ROR__(*(uint16_t *)((char *)pal + (2 * (uint8_t)v21 & 0x1E)), 1);
+      v22 <<= 16;
+      ALOWORD(v22) = __ROR__(*(uint16_t *)((char *)pal + ((v21 >> 3) & 0x1E)), 1);
+      *v23 = v22;
+      ++v23;
+      v24 = bswap32(*(uint32_t *)v17); // then the word at byte offset +0
+      v17 = &src[((uintptr_t)v17 + 8 - (uintptr_t)src) & 0x7FF]; // advance 8 bytes, wrapping the offset from src to 0..0x7FF on every group
+      ALOWORD(v22) = __ROR__(*(uint16_t *)((char *)pal + ((v24 >> 23) & 0x1E)), 1);
+      v22 <<= 16;
+      ALOWORD(v22) = __ROR__(*(uint16_t *)((char *)pal + ((v24 >> 27) & 0x1E)), 1);
+      *v23 = v22;
+      ++v23;
+      ALOWORD(v22) = __ROR__(*(uint16_t *)((char *)pal + ((v24 >> 15) & 0x1E)), 1);
+      v22 <<= 16;
+      ALOWORD(v22) = __ROR__(*(uint16_t *)((char *)pal + ((v24 >> 19) & 0x1E)), 1);
+      *v23 = v22;
+      ++v23;
+      ALOWORD(v22) = __ROR__(*(uint16_t *)((char *)pal + ((v24 >> 7) & 0x1E)), 1);
+      v22 <<= 16;
+      ALOWORD(v22) = __ROR__(*(uint16_t *)((char *)pal + ((v24 >> 11) & 0x1E)), 1);
+      *v23 = v22;
+      ++v23;
+      ALOWORD(v22) = __ROR__(*(uint16_t *)((char *)pal + (2 * (uint8_t)v24 & 0x1E)), 1);
+      v22 <<= 16;
+      ALOWORD(v22) = __ROR__(*(uint16_t *)((char *)pal + ((v24 >> 3) & 0x1E)), 1);
+      *v23 = v22;
+      v18 = v23 + 1;
+      v19 = v20 - 1; // restore the count from the saved copy and decrement it
+    }
+    while ( v20 != 1 );
+    v7 = &src[(line + (uintptr_t)v17 - (uintptr_t)src) & 0x7FF]; // same wrapped step back onto the next first row
+    v8 = (uint8_t *)((char *)v18 + ext);
+    v9 = v26 - 1;
+  }
+  while ( v26 != 1 ); // v26 == 1 means the second row just written was the last
 }
 
-extern "C" void  __declspec(naked) asmLoad4bIAPal (wxUIntPtr src, wxUIntPtr dst, int wid_64, int height, int line, int ext, wxUIntPtr pal)
+static inline void load4bIAPal(uint8_t *src, uint8_t *dst, int wid_64, int height, int line, int ext, uint16_t *pal) // C replacement for asmLoad4bIAPal: expands each 4-bit index in src to the 16-bit pal entry rotated by 8 (asm: ror cx,8)
 {
-	_asm {
-		push ebp
-		mov ebp, esp
-		push ebx
-        push esi
-        push edi
+  uint8_t *v7;   // source cursor, first row of each row pair
+  uint32_t *v8;  // destination cursor, first row of each row pair
+  int v9;        // rows still to process, reloaded into v25 at the top of each pass
+  int v10;       // groups left in the first row; its low word doubles as scratch, see v11
+  int v11;       // saved copy of v10 taken before the scratch write
+  uint32_t v12;  // first byte-swapped source word of the group (eight 4-bit indices)
+  uint32_t *v13; // source position of the second word of the group
+  uint32_t v14;  // output dword being assembled (two 16-bit entries)
+  uint32_t *v15; // destination cursor inside the current group
+  uint32_t v16;  // second byte-swapped source word of the group
+  uint8_t *v17;  // source cursor, second row of the pair
+  uint32_t *v18; // destination cursor, second row of the pair
+  int v19;       // groups left in the second row; low word doubles as scratch, see v20
+  int v20;       // saved copy of v19 taken before the scratch write
+  uint32_t v21;  // word read from offset +4 (second row)
+  uint32_t v22;  // output dword being assembled (second row)
+  uint32_t *v23; // destination cursor inside the current group (second row)
+  uint32_t v24;  // word read from offset +0 (second row)
+  int v25;       // row count on entry to the current pass
+  int v26;       // row count after the first row of the pass
 
-        mov ebx,[pal]
-        mov esi,[src]
-        mov edi,[dst]
-        mov ecx,[height]
-y_loop:
-        push ecx
-        mov ecx,[wid_64]
-x_loop:
-        push ecx
-
-        mov eax,[esi]           // read all 8 pixels
-        bswap eax
-        add esi,4
-        mov edx,eax
-
-        // 1st dword output {
-        shr eax,23
-        and eax,0x1E
-        mov cx,[ebx+eax]
-        ror cx,8
-        shl ecx,16
-
-        mov eax,edx
-        shr eax,27
-        and eax,0x1E
-        mov cx,[ebx+eax]
-        ror cx,8
-
-        mov [edi],ecx
-        add edi,4
-        // }
-
-        // 2nd dword output {
-        mov eax,edx
-        shr eax,15
-        and eax,0x1E
-        mov cx,[ebx+eax]
-        ror cx,8
-        shl ecx,16
-
-        mov eax,edx
-        shr eax,19
-        and eax,0x1E
-        mov cx,[ebx+eax]
-        ror cx,8
-
-        mov [edi],ecx
-        add edi,4
-        // }
-
-        // 3rd dword output {
-        mov eax,edx
-        shr eax,7
-        and eax,0x1E
-        mov cx,[ebx+eax]
-        ror cx,8
-        shl ecx,16
-
-        mov eax,edx
-        shr eax,11
-        and eax,0x1E
-        mov cx,[ebx+eax]
-        ror cx,8
-
-        mov [edi],ecx
-        add edi,4
-        // }
-
-        // 4th dword output {
-        mov eax,edx
-        shl eax,1
-        and eax,0x1E
-        mov cx,[ebx+eax]
-        ror cx,8
-        shl ecx,16
-
-        shr edx,3
-        and edx,0x1E
-        mov cx,[ebx+edx]
-        ror cx,8
-
-        mov [edi],ecx
-        add edi,4
-        // }
-
-        // * copy
-        mov eax,[esi]           // read all 8 pixels
-        bswap eax
-        add esi,4
-        mov edx,eax
-
-        // 1st dword output {
-        shr eax,23
-        and eax,0x1E
-        mov cx,[ebx+eax]
-        ror cx,8
-        shl ecx,16
-
-        mov eax,edx
-        shr eax,27
-        and eax,0x1E
-        mov cx,[ebx+eax]
-        ror cx,8
-
-        mov [edi],ecx
-        add edi,4
-        // }
-
-        // 2nd dword output {
-        mov eax,edx
-        shr eax,15
-        and eax,0x1E
-        mov cx,[ebx+eax]
-        ror cx,8
-        shl ecx,16
-
-        mov eax,edx
-        shr eax,19
-        and eax,0x1E
-        mov cx,[ebx+eax]
-        ror cx,8
-
-        mov [edi],ecx
-        add edi,4
-        // }
-
-        // 3rd dword output {
-        mov eax,edx
-        shr eax,7
-        and eax,0x1E
-        mov cx,[ebx+eax]
-        ror cx,8
-        shl ecx,16
-
-        mov eax,edx
-        shr eax,11
-        and eax,0x1E
-        mov cx,[ebx+eax]
-        ror cx,8
-
-        mov [edi],ecx
-        add edi,4
-        // }
-
-        // 4th dword output {
-        mov eax,edx
-        shl eax,1
-        and eax,0x1E
-        mov cx,[ebx+eax]
-        ror cx,8
-        shl ecx,16
-
-        shr edx,3
-        and edx,0x1E
-        mov cx,[ebx+edx]
-        ror cx,8
-
-        mov [edi],ecx
-        add edi,4
-        // }
-        // *
-
-        pop ecx
-
-        dec ecx
-        jnz x_loop
-
-        pop ecx
-        dec ecx
-        jz near end_y_loop
-        push ecx
-
-        mov eax,esi
-        add eax,[line]
-        mov esi,[src]
-        sub eax,esi
-        and eax,0x7FF
-        add esi,eax
-        add edi,[ext]
-
-        mov ecx,[wid_64]
-x_loop_2:
-        push ecx
-
-        mov eax,[esi+4]         // read all 8 pixels
-        bswap eax
-        mov edx,eax
-
-        // 1st dword output {
-        shr eax,23
-        and eax,0x1E
-        mov cx,[ebx+eax]
-        ror cx,8
-        shl ecx,16
-
-        mov eax,edx
-        shr eax,27
-        and eax,0x1E
-        mov cx,[ebx+eax]
-        ror cx,8
-
-        mov [edi],ecx
-        add edi,4
-        // }
-
-        // 2nd dword output {
-        mov eax,edx
-        shr eax,15
-        and eax,0x1E
-        mov cx,[ebx+eax]
-        ror cx,8
-        shl ecx,16
-
-        mov eax,edx
-        shr eax,19
-        and eax,0x1E
-        mov cx,[ebx+eax]
-        ror cx,8
-
-        mov [edi],ecx
-        add edi,4
-        // }
-
-        // 3rd dword output {
-        mov eax,edx
-        shr eax,7
-        and eax,0x1E
-        mov cx,[ebx+eax]
-        ror cx,8
-        shl ecx,16
-
-        mov eax,edx
-        shr eax,11
-        and eax,0x1E
-        mov cx,[ebx+eax]
-        ror cx,8
-
-        mov [edi],ecx
-        add edi,4
-        // }
-
-        // 4th dword output {
-        mov eax,edx
-        shl eax,1
-        and eax,0x1E
-        mov cx,[ebx+eax]
-        ror cx,8
-        shl ecx,16
-
-        shr edx,3
-        and edx,0x1E
-        mov cx,[ebx+edx]
-        ror cx,8
-
-        mov [edi],ecx
-        add edi,4
-        // }
-
-        // * copy
-        mov eax,[esi]           // read all 8 pixels
-        bswap eax
-        mov edx,esi
-        add edx,8
-        mov esi,[src]
-        sub edx,esi
-        and edx,0x7FF
-        add esi,edx
-        mov edx,eax
-
-        // 1st dword output {
-        shr eax,23
-        and eax,0x1E
-        mov cx,[ebx+eax]
-        ror cx,8
-        shl ecx,16
-
-        mov eax,edx
-        shr eax,27
-        and eax,0x1E
-        mov cx,[ebx+eax]
-        ror cx,8
-
-        mov [edi],ecx
-        add edi,4
-        // }
-
-        // 2nd dword output {
-        mov eax,edx
-        shr eax,15
-        and eax,0x1E
-        mov cx,[ebx+eax]
-        ror cx,8
-        shl ecx,16
-
-        mov eax,edx
-        shr eax,19
-        and eax,0x1E
-        mov cx,[ebx+eax]
-        ror cx,8
-
-        mov [edi],ecx
-        add edi,4
-        // }
-
-        // 3rd dword output {
-        mov eax,edx
-        shr eax,7
-        and eax,0x1E
-        mov cx,[ebx+eax]
-        ror cx,8
-        shl ecx,16
-
-        mov eax,edx
-        shr eax,11
-        and eax,0x1E
-        mov cx,[ebx+eax]
-        ror cx,8
-
-        mov [edi],ecx
-        add edi,4
-        // }
-
-        // 4th dword output {
-        mov eax,edx
-        shl eax,1
-        and eax,0x1E
-        mov cx,[ebx+eax]
-        ror cx,8
-        shl ecx,16
-
-        shr edx,3
-        and edx,0x1E
-        mov cx,[ebx+edx]
-        ror cx,8
-
-        mov [edi],ecx
-        add edi,4
-        // }
-        // *
-
-        pop ecx
-
-        dec ecx
-        jnz x_loop_2
-
-        mov eax,esi
-        add eax,[line]
-        mov esi,[src]
-        sub eax,esi
-        and eax,0x7FF
-        add esi,eax
-        add edi,[ext]
-
-        pop ecx
-        dec ecx
-        jnz y_loop
-
-end_y_loop:
-        pop edi
-        pop esi
-        pop ebx
-		mov esp, ebp
-		pop ebp
-		ret
-	}
+  v7 = src;
+  v8 = (uint32_t *)dst;
+  v9 = height;
+  do // each pass expands up to two rows; every loop tests after its body, so height and wid_64 must be >= 1
+  {
+    v25 = v9;
+    v10 = wid_64;
+    do // first row of the pair: 8 source bytes (16 indices) become 32 destination bytes per group
+    {
+      v11 = v10; // keep the real count: the low word of v10 is overwritten as scratch on the next lines (asm used cx)
+      v12 = bswap32(*(uint32_t *)v7); // byte-swap so the first source byte lands in bits 31..24
+      v13 = (uint32_t *)(v7 + 4);
+      ALOWORD(v10) = __ROR__(*(uint16_t *)((char *)pal + ((v12 >> 23) & 0x1E)), 8); // bits 27..24 of v12, doubled into a byte offset into pal; presumably ALOWORD addresses the low 16 bits -- macro defined elsewhere, confirm
+      v14 = v10 << 16; // that entry moves to the high half of the output dword
+      ALOWORD(v14) = __ROR__(*(uint16_t *)((char *)pal + ((v12 >> 27) & 0x1E)), 8); // bits 31..28 of v12 select the low-half entry
+      *v8 = v14;
+      v15 = v8 + 1;
+      ALOWORD(v14) = __ROR__(*(uint16_t *)((char *)pal + ((v12 >> 15) & 0x1E)), 8); // bits 19..16
+      v14 <<= 16;
+      ALOWORD(v14) = __ROR__(*(uint16_t *)((char *)pal + ((v12 >> 19) & 0x1E)), 8); // bits 23..20
+      *v15 = v14;
+      ++v15;
+      ALOWORD(v14) = __ROR__(*(uint16_t *)((char *)pal + ((v12 >> 7) & 0x1E)), 8); // bits 11..8
+      v14 <<= 16;
+      ALOWORD(v14) = __ROR__(*(uint16_t *)((char *)pal + ((v12 >> 11) & 0x1E)), 8); // bits 15..12
+      *v15 = v14;
+      ++v15;
+      ALOWORD(v14) = __ROR__(*(uint16_t *)((char *)pal + (2 * (uint8_t)v12 & 0x1E)), 8); // bits 3..0 (asm: shl eax,1 / and eax,0x1E)
+      v14 <<= 16;
+      ALOWORD(v14) = __ROR__(*(uint16_t *)((char *)pal + ((v12 >> 3) & 0x1E)), 8); // bits 7..4
+      *v15 = v14;
+      ++v15;
+      v16 = bswap32(*v13); // second word of the group, expanded the same way
+      v7 = (uint8_t *)(v13 + 1);
+      ALOWORD(v14) = __ROR__(*(uint16_t *)((char *)pal + ((v16 >> 23) & 0x1E)), 8);
+      v14 <<= 16;
+      ALOWORD(v14) = __ROR__(*(uint16_t *)((char *)pal + ((v16 >> 27) & 0x1E)), 8);
+      *v15 = v14;
+      ++v15;
+      ALOWORD(v14) = __ROR__(*(uint16_t *)((char *)pal + ((v16 >> 15) & 0x1E)), 8);
+      v14 <<= 16;
+      ALOWORD(v14) = __ROR__(*(uint16_t *)((char *)pal + ((v16 >> 19) & 0x1E)), 8);
+      *v15 = v14;
+      ++v15;
+      ALOWORD(v14) = __ROR__(*(uint16_t *)((char *)pal + ((v16 >> 7) & 0x1E)), 8);
+      v14 <<= 16;
+      ALOWORD(v14) = __ROR__(*(uint16_t *)((char *)pal + ((v16 >> 11) & 0x1E)), 8);
+      *v15 = v14;
+      ++v15;
+      ALOWORD(v14) = __ROR__(*(uint16_t *)((char *)pal + (2 * (uint8_t)v16 & 0x1E)), 8);
+      v14 <<= 16;
+      ALOWORD(v14) = __ROR__(*(uint16_t *)((char *)pal + ((v16 >> 3) & 0x1E)), 8);
+      *v15 = v14;
+      v8 = v15 + 1;
+      v10 = v11 - 1; // restore the count from the saved copy and decrement it
+    }
+    while ( v11 != 1 );
+    if ( v25 == 1 ) // the row just written was the last one
+      break;
+    v26 = v25 - 1;
+    v17 = &src[(line + (uintptr_t)v7 - (uintptr_t)src) & 0x7FF]; // step by line bytes; the byte offset from src is wrapped to 0..0x7FF
+    v18 = (uint32_t *)((char *)v8 + ext); // step the destination by ext bytes
+    v19 = wid_64;
+    do // second row of the pair: same expansion, but the word at +4 is processed before the word at +0
+    {
+      v20 = v19; // keep the real count: the low word of v19 is overwritten as scratch below
+      v21 = bswap32(*((uint32_t *)v17 + 1)); // word at byte offset +4 first
+      ALOWORD(v19) = __ROR__(*(uint16_t *)((char *)pal + ((v21 >> 23) & 0x1E)), 8);
+      v22 = v19 << 16;
+      ALOWORD(v22) = __ROR__(*(uint16_t *)((char *)pal + ((v21 >> 27) & 0x1E)), 8);
+      *v18 = v22;
+      v23 = v18 + 1;
+      ALOWORD(v22) = __ROR__(*(uint16_t *)((char *)pal + ((v21 >> 15) & 0x1E)), 8);
+      v22 <<= 16;
+      ALOWORD(v22) = __ROR__(*(uint16_t *)((char *)pal + ((v21 >> 19) & 0x1E)), 8);
+      *v23 = v22;
+      ++v23;
+      ALOWORD(v22) = __ROR__(*(uint16_t *)((char *)pal + ((v21 >> 7) & 0x1E)), 8);
+      v22 <<= 16;
+      ALOWORD(v22) = __ROR__(*(uint16_t *)((char *)pal + ((v21 >> 11) & 0x1E)), 8);
+      *v23 = v22;
+      ++v23;
+      ALOWORD(v22) = __ROR__(*(uint16_t *)((char *)pal + (2 * (uint8_t)v21 & 0x1E)), 8);
+      v22 <<= 16;
+      ALOWORD(v22) = __ROR__(*(uint16_t *)((char *)pal + ((v21 >> 3) & 0x1E)), 8);
+      *v23 = v22;
+      ++v23;
+      v24 = bswap32(*(uint32_t *)v17); // then the word at byte offset +0
+      v17 = &src[((uintptr_t)v17 + 8 - (uintptr_t)src) & 0x7FF]; // advance 8 bytes, wrapping the offset from src to 0..0x7FF on every group
+      ALOWORD(v22) = __ROR__(*(uint16_t *)((char *)pal + ((v24 >> 23) & 0x1E)), 8);
+      v22 <<= 16;
+      ALOWORD(v22) = __ROR__(*(uint16_t *)((char *)pal + ((v24 >> 27) & 0x1E)), 8);
+      *v23 = v22;
+      ++v23;
+      ALOWORD(v22) = __ROR__(*(uint16_t *)((char *)pal + ((v24 >> 15) & 0x1E)), 8);
+      v22 <<= 16;
+      ALOWORD(v22) = __ROR__(*(uint16_t *)((char *)pal + ((v24 >> 19) & 0x1E)), 8);
+      *v23 = v22;
+      ++v23;
+      ALOWORD(v22) = __ROR__(*(uint16_t *)((char *)pal + ((v24 >> 7) & 0x1E)), 8);
+      v22 <<= 16;
+      ALOWORD(v22) = __ROR__(*(uint16_t *)((char *)pal + ((v24 >> 11) & 0x1E)), 8);
+      *v23 = v22;
+      ++v23;
+      ALOWORD(v22) = __ROR__(*(uint16_t *)((char *)pal + (2 * (uint8_t)v24 & 0x1E)), 8);
+      v22 <<= 16;
+      ALOWORD(v22) = __ROR__(*(uint16_t *)((char *)pal + ((v24 >> 3) & 0x1E)), 8);
+      *v23 = v22;
+      v18 = v23 + 1;
+      v19 = v20 - 1; // restore the count from the saved copy and decrement it
+    }
+    while ( v20 != 1 );
+    v7 = &src[(line + (uintptr_t)v17 - (uintptr_t)src) & 0x7FF]; // same wrapped step back onto the next first row
+    v8 = (uint32_t *)((char *)v18 + ext);
+    v9 = v26 - 1;
+  }
+  while ( v26 != 1 ); // v26 == 1 means the second row just written was the last
 }
 
-extern "C" void  __declspec(naked) asmLoad4bIA (wxUIntPtr src, wxUIntPtr dst, int wid_64, int height, int line, int ext)
+static inline void load4bIA(uint8_t *src, uint8_t *dst, int wid_64, int height, int line, int ext)
 {
-	_asm {
-		push ebp
-		mov ebp, esp
-		push ebx
-        push esi
-        push edi
-
-        mov esi,[src]
-        mov edi,[dst]
-        mov ecx,[height]
-y_loop:
-        push ecx
-        mov ecx,[wid_64]
-x_loop:
-        push ecx
-
-        mov eax,[esi]           // read all 8 pixels
-        bswap eax
-        add esi,4
-        mov edx,eax
-
-        // 1st dword {
-        xor ecx,ecx
-
-        // pixel #1
-        //       IIIAxxxxxxxxxxxxxxxxxxxxxxxxxxxx
-        //       xxxxxxxxxxxxxxxxxxxxxxxxAAAAIIII
-        mov eax,edx
-        shr eax,24 //Alpha
-        and eax,0x00000010
-        or ecx,eax
-        shl eax,1
-        or ecx,eax
-        shl eax,1
-        or ecx,eax
-        shl eax,1
-        or ecx,eax
-        mov eax,edx
-        shr eax,28 // Intensity
-        and eax,0x0000000E
-        or ecx,eax
-        shr eax,3
-        or ecx,eax
-
-        // pixel #2
-        //       xxxxIIIAxxxxxxxxxxxxxxxxxxxxxxxx
-        //       xxxxxxxxxxxxxxxxAAAAIIIIxxxxxxxx
-        mov eax,edx
-        shr eax,12 //Alpha
-        and eax,0x00001000
-        or ecx,eax
-        shl eax,1
-        or ecx,eax
-        shl eax,1
-        or ecx,eax
-        shl eax,1
-        or ecx,eax
-        mov eax,edx
-        shr eax,16 // Intensity
-        and eax,0x00000E00
-        or ecx,eax
-        shr eax,3
-        and eax,0x00000100
-        or ecx,eax
-
-        // pixel #3
-        //       xxxxxxxxIIIAxxxxxxxxxxxxxxxxxxxx
-        //       xxxxxxxxAAAAIIIIxxxxxxxxxxxxxxxx
-        //Alpha
-        mov eax,edx
-        and eax,0x00100000
-        or ecx,eax
-        shl eax,1
-        or ecx,eax
-        shl eax,1
-        or ecx,eax
-        shl eax,1
-        or ecx,eax
-        mov eax,edx
-        shr eax,4 // Intensity
-        and eax,0x000E0000
-        or ecx,eax
-        shr eax,3
-        and eax,0x00010000
-        or ecx,eax
-
-        // pixel #4
-        //       xxxxxxxxxxxxIIIAxxxxxxxxxxxxxxxx
-        //       AAAAIIIIxxxxxxxxxxxxxxxxxxxxxxxx
-        mov eax,edx
-        shl eax,12 //Alpha
-        and eax,0x10000000
-        or ecx,eax
-        shl eax,1
-        or ecx,eax
-        shl eax,1
-        or ecx,eax
-        shl eax,1
-        or ecx,eax
-        mov eax,edx
-        shl eax,8 // Intensity
-        and eax,0x0E000000
-        or ecx,eax
-        shr eax,3
-        and eax,0x01000000
-        or ecx,eax
-
-
-        mov [edi],ecx
-        add edi,4
-        // }
-
-// 2nd dword {
-        xor ecx,ecx
-
-        // pixel #5
-        //       xxxxxxxxxxxxxxxxIIIAxxxxxxxxxxxx
-        //       xxxxxxxxxxxxxxxxxxxxxxxxAAAAIIII
-        mov eax,edx
-        shr eax,8 //Alpha
-        and eax,0x00000010
-        or ecx,eax
-        shl eax,1
-        or ecx,eax
-        shl eax,1
-        or ecx,eax
-        shl eax,1
-        or ecx,eax
-        mov eax,edx
-        shr eax,12 // Intensity
-        and eax,0x0000000E
-        or ecx,eax
-        shr eax,3
-        or ecx,eax
-
-        // pixel #6
-        //       xxxxxxxxxxxxxxxxxxxxIIIAxxxxxxxx
-        //       xxxxxxxxxxxxxxxxAAAAIIIIxxxxxxxx
-        //Alpha
-        mov eax,edx
-        shl eax,4
-        and eax,0x00001000
-        or ecx,eax
-        shl eax,1
-        or ecx,eax
-        shl eax,1
-        or ecx,eax
-        shl eax,1
-        or ecx,eax
-        mov eax,edx     // Intensity
-        and eax,0x00000E00
-        or ecx,eax
-        shr eax,3
-        and eax,0x00000100
-        or ecx,eax
-
-        // pixel #7
-        //       xxxxxxxxxxxxxxxxxxxxxxxxIIIAxxxx
-        //       xxxxxxxxAAAAIIIIxxxxxxxxxxxxxxxx
-        //Alpha
-        mov eax,edx
-        shl eax,16
-        and eax,0x00100000
-        or ecx,eax
-        shl eax,1
-        or ecx,eax
-        shl eax,1
-        or ecx,eax
-        shl eax,1
-        or ecx,eax
-        mov eax,edx
-        shl eax,12 // Intensity
-        and eax,0x000E0000
-        or ecx,eax
-        shr eax,3
-        and eax,0x00010000
-        or ecx,eax
-
-        // pixel #8
-        //       xxxxxxxxxxxxxxxxxxxxxxxxxxxxIIIA
-        //       AAAAIIIIxxxxxxxxxxxxxxxxxxxxxxxx
-        mov eax,edx
-        shl eax,28 //Alpha
-        and eax,0x10000000
-        or ecx,eax
-        shl eax,1
-        or ecx,eax
-        shl eax,1
-        or ecx,eax
-        shl eax,1
-        or ecx,eax
-        mov eax,edx
-        shl eax,24 // Intensity
-        and eax,0x0E000000
-        or ecx,eax
-        shr eax,3
-        and eax,0x01000000
-        or ecx,eax
-
-        mov [edi],ecx
-        add edi,4
-        // }
-
-        // * copy
-        mov eax,[esi]           // read all 8 pixels
-        bswap eax
-        add esi,4
-        mov edx,eax
-
-        // 1st dword {
-        xor ecx,ecx
-
-        // pixel #1
-        //       IIIAxxxxxxxxxxxxxxxxxxxxxxxxxxxx
-        //       xxxxxxxxxxxxxxxxxxxxxxxxAAAAIIII
-        mov eax,edx
-        shr eax,24 //Alpha
-        and eax,0x00000010
-        or ecx,eax
-        shl eax,1
-        or ecx,eax
-        shl eax,1
-        or ecx,eax
-        shl eax,1
-        or ecx,eax
-        mov eax,edx
-        shr eax,28 // Intensity
-        and eax,0x0000000E
-        or ecx,eax
-        shr eax,3
-        or ecx,eax
-
-        // pixel #2
-        //       xxxxIIIAxxxxxxxxxxxxxxxxxxxxxxxx
-        //       xxxxxxxxxxxxxxxxAAAAIIIIxxxxxxxx
-        mov eax,edx
-        shr eax,12 //Alpha
-        and eax,0x00001000
-        or ecx,eax
-        shl eax,1
-        or ecx,eax
-        shl eax,1
-        or ecx,eax
-        shl eax,1
-        or ecx,eax
-        mov eax,edx
-        shr eax,16 // Intensity
-        and eax,0x00000E00
-        or ecx,eax
-        shr eax,3
-        and eax,0x00000100
-        or ecx,eax
-
-        // pixel #3
-        //       xxxxxxxxIIIAxxxxxxxxxxxxxxxxxxxx
-        //       xxxxxxxxAAAAIIIIxxxxxxxxxxxxxxxx
-        //Alpha
-        mov eax,edx
-        and eax,0x00100000
-        or ecx,eax
-        shl eax,1
-        or ecx,eax
-        shl eax,1
-        or ecx,eax
-        shl eax,1
-        or ecx,eax
-        mov eax,edx
-        shr eax,4 // Intensity
-        and eax,0x000E0000
-        or ecx,eax
-        shr eax,3
-        and eax,0x00010000
-        or ecx,eax
-
-        // pixel #4
-        //       xxxxxxxxxxxxIIIAxxxxxxxxxxxxxxxx
-        //       AAAAIIIIxxxxxxxxxxxxxxxxxxxxxxxx
-        mov eax,edx
-        shl eax,12 //Alpha
-        and eax,0x10000000
-        or ecx,eax
-        shl eax,1
-        or ecx,eax
-        shl eax,1
-        or ecx,eax
-        shl eax,1
-        or ecx,eax
-        mov eax,edx
-        shl eax,8 // Intensity
-        and eax,0x0E000000
-        or ecx,eax
-        shr eax,3
-        and eax,0x01000000
-        or ecx,eax
-
-
-        mov [edi],ecx
-        add edi,4
-        // }
-
-// 2nd dword {
-        xor ecx,ecx
-
-        // pixel #5
-        //       xxxxxxxxxxxxxxxxIIIAxxxxxxxxxxxx
-        //       xxxxxxxxxxxxxxxxxxxxxxxxAAAAIIII
-        mov eax,edx
-        shr eax,8 //Alpha
-        and eax,0x00000010
-        or ecx,eax
-        shl eax,1
-        or ecx,eax
-        shl eax,1
-        or ecx,eax
-        shl eax,1
-        or ecx,eax
-        mov eax,edx
-        shr eax,12 // Intensity
-        and eax,0x0000000E
-        or ecx,eax
-        shr eax,3
-        or ecx,eax
-
-        // pixel #6
-        //       xxxxxxxxxxxxxxxxxxxxIIIAxxxxxxxx
-        //       xxxxxxxxxxxxxxxxAAAAIIIIxxxxxxxx
-        //Alpha
-        mov eax,edx
-        shl eax,4
-        and eax,0x00001000
-        or ecx,eax
-        shl eax,1
-        or ecx,eax
-        shl eax,1
-        or ecx,eax
-        shl eax,1
-        or ecx,eax
-        mov eax,edx     // Intensity
-        and eax,0x00000E00
-        or ecx,eax
-        shr eax,3
-        and eax,0x00000100
-        or ecx,eax
-
-        // pixel #7
-        //       xxxxxxxxxxxxxxxxxxxxxxxxIIIAxxxx
-        //       xxxxxxxxAAAAIIIIxxxxxxxxxxxxxxxx
-        //Alpha
-        mov eax,edx
-        shl eax,16
-        and eax,0x00100000
-        or ecx,eax
-        shl eax,1
-        or ecx,eax
-        shl eax,1
-        or ecx,eax
-        shl eax,1
-        or ecx,eax
-        mov eax,edx
-        shl eax,12 // Intensity
-        and eax,0x000E0000
-        or ecx,eax
-        shr eax,3
-        and eax,0x00010000
-        or ecx,eax
-
-        // pixel #8
-        //       xxxxxxxxxxxxxxxxxxxxxxxxxxxxIIIA
-        //       AAAAIIIIxxxxxxxxxxxxxxxxxxxxxxxx
-        mov eax,edx
-        shl eax,28 //Alpha
-        and eax,0x10000000
-        or ecx,eax
-        shl eax,1
-        or ecx,eax
-        shl eax,1
-        or ecx,eax
-        shl eax,1
-        or ecx,eax
-        mov eax,edx
-        shl eax,24 // Intensity
-        and eax,0x0E000000
-        or ecx,eax
-        shr eax,3
-        and eax,0x01000000
-        or ecx,eax
-
-        mov [edi],ecx
-        add edi,4
-        // }
-
-        // *
-
-        pop ecx
-        dec ecx
-        jnz x_loop
-
-        pop ecx
-        dec ecx
-        jz near end_y_loop
-        push ecx
-
-        add esi,[line]
-        add edi,[ext]
-
-        mov ecx,[wid_64]
-x_loop_2:
-        push ecx
-
-        mov eax,[esi+4]         // read all 8 pixels
-        bswap eax
-        mov edx,eax
-
-        // 1st dword {
-        xor ecx,ecx
-
-        // pixel #1
-        //       IIIAxxxxxxxxxxxxxxxxxxxxxxxxxxxx
-        //       xxxxxxxxxxxxxxxxxxxxxxxxAAAAIIII
-        mov eax,edx
-        shr eax,24 //Alpha
-        and eax,0x00000010
-        or ecx,eax
-        shl eax,1
-        or ecx,eax
-        shl eax,1
-        or ecx,eax
-        shl eax,1
-        or ecx,eax
-        mov eax,edx
-        shr eax,28 // Intensity
-        and eax,0x0000000E
-        or ecx,eax
-        shr eax,3
-        or ecx,eax
-
-        // pixel #2
-        //       xxxxIIIAxxxxxxxxxxxxxxxxxxxxxxxx
-        //       xxxxxxxxxxxxxxxxAAAAIIIIxxxxxxxx
-        mov eax,edx
-        shr eax,12 //Alpha
-        and eax,0x00001000
-        or ecx,eax
-        shl eax,1
-        or ecx,eax
-        shl eax,1
-        or ecx,eax
-        shl eax,1
-        or ecx,eax
-        mov eax,edx
-        shr eax,16 // Intensity
-        and eax,0x00000E00
-        or ecx,eax
-        shr eax,3
-        and eax,0x00000100
-        or ecx,eax
-
-        // pixel #3
-        //       xxxxxxxxIIIAxxxxxxxxxxxxxxxxxxxx
-        //       xxxxxxxxAAAAIIIIxxxxxxxxxxxxxxxx
-        //Alpha
-        mov eax,edx
-        and eax,0x00100000
-        or ecx,eax
-        shl eax,1
-        or ecx,eax
-        shl eax,1
-        or ecx,eax
-        shl eax,1
-        or ecx,eax
-        mov eax,edx
-        shr eax,4 // Intensity
-        and eax,0x000E0000
-        or ecx,eax
-        shr eax,3
-        and eax,0x00010000
-        or ecx,eax
-
-        // pixel #4
-        //       xxxxxxxxxxxxIIIAxxxxxxxxxxxxxxxx
-        //       AAAAIIIIxxxxxxxxxxxxxxxxxxxxxxxx
-        mov eax,edx
-        shl eax,12 //Alpha
-        and eax,0x10000000
-        or ecx,eax
-        shl eax,1
-        or ecx,eax
-        shl eax,1
-        or ecx,eax
-        shl eax,1
-        or ecx,eax
-        mov eax,edx
-        shl eax,8 // Intensity
-        and eax,0x0E000000
-        or ecx,eax
-        shr eax,3
-        and eax,0x01000000
-        or ecx,eax
-
-
-        mov [edi],ecx
-        add edi,4
-        // }
-
-// 2nd dword {
-        xor ecx,ecx
-
-        // pixel #5
-        //       xxxxxxxxxxxxxxxxIIIAxxxxxxxxxxxx
-        //       xxxxxxxxxxxxxxxxxxxxxxxxAAAAIIII
-        mov eax,edx
-        shr eax,8 //Alpha
-        and eax,0x00000010
-        or ecx,eax
-        shl eax,1
-        or ecx,eax
-        shl eax,1
-        or ecx,eax
-        shl eax,1
-        or ecx,eax
-        mov eax,edx
-        shr eax,12 // Intensity
-        and eax,0x0000000E
-        or ecx,eax
-        shr eax,3
-        or ecx,eax
-
-        // pixel #6
-        //       xxxxxxxxxxxxxxxxxxxxIIIAxxxxxxxx
-        //       xxxxxxxxxxxxxxxxAAAAIIIIxxxxxxxx
-        //Alpha
-        mov eax,edx
-        shl eax,4
-        and eax,0x00001000
-        or ecx,eax
-        shl eax,1
-        or ecx,eax
-        shl eax,1
-        or ecx,eax
-        shl eax,1
-        or ecx,eax
-        mov eax,edx     // Intensity
-        and eax,0x00000E00
-        or ecx,eax
-        shr eax,3
-        and eax,0x00000100
-        or ecx,eax
-
-        // pixel #7
-        //       xxxxxxxxxxxxxxxxxxxxxxxxIIIAxxxx
-        //       xxxxxxxxAAAAIIIIxxxxxxxxxxxxxxxx
-        //Alpha
-        mov eax,edx
-        shl eax,16
-        and eax,0x00100000
-        or ecx,eax
-        shl eax,1
-        or ecx,eax
-        shl eax,1
-        or ecx,eax
-        shl eax,1
-        or ecx,eax
-        mov eax,edx
-        shl eax,12 // Intensity
-        and eax,0x000E0000
-        or ecx,eax
-        shr eax,3
-        and eax,0x00010000
-        or ecx,eax
-
-        // pixel #8
-        //       xxxxxxxxxxxxxxxxxxxxxxxxxxxxIIIA
-        //       AAAAIIIIxxxxxxxxxxxxxxxxxxxxxxxx
-        mov eax,edx
-        shl eax,28 //Alpha
-        and eax,0x10000000
-        or ecx,eax
-        shl eax,1
-        or ecx,eax
-        shl eax,1
-        or ecx,eax
-        shl eax,1
-        or ecx,eax
-        mov eax,edx
-        shl eax,24 // Intensity
-        and eax,0x0E000000
-        or ecx,eax
-        shr eax,3
-        and eax,0x01000000
-        or ecx,eax
-
-        mov [edi],ecx
-        add edi,4
-        // }
-
-        // * copy
-        mov eax,[esi]           // read all 8 pixels
-        bswap eax
-        add esi,8
-        mov edx,eax
-
-// 1st dword {
-        xor ecx,ecx
-
-        // pixel #1
-        //       IIIAxxxxxxxxxxxxxxxxxxxxxxxxxxxx
-        //       xxxxxxxxxxxxxxxxxxxxxxxxAAAAIIII
-        mov eax,edx
-        shr eax,24 //Alpha
-        and eax,0x00000010
-        or ecx,eax
-        shl eax,1
-        or ecx,eax
-        shl eax,1
-        or ecx,eax
-        shl eax,1
-        or ecx,eax
-        mov eax,edx
-        shr eax,28 // Intensity
-        and eax,0x0000000E
-        or ecx,eax
-        shr eax,3
-        or ecx,eax
-
-        // pixel #2
-        //       xxxxIIIAxxxxxxxxxxxxxxxxxxxxxxxx
-        //       xxxxxxxxxxxxxxxxAAAAIIIIxxxxxxxx
-        mov eax,edx
-        shr eax,12 //Alpha
-        and eax,0x00001000
-        or ecx,eax
-        shl eax,1
-        or ecx,eax
-        shl eax,1
-        or ecx,eax
-        shl eax,1
-        or ecx,eax
-        mov eax,edx
-        shr eax,16 // Intensity
-        and eax,0x00000E00
-        or ecx,eax
-        shr eax,3
-        and eax,0x00000100
-        or ecx,eax
-
-        // pixel #3
-        //       xxxxxxxxIIIAxxxxxxxxxxxxxxxxxxxx
-        //       xxxxxxxxAAAAIIIIxxxxxxxxxxxxxxxx
-        //Alpha
-        mov eax,edx
-        and eax,0x00100000
-        or ecx,eax
-        shl eax,1
-        or ecx,eax
-        shl eax,1
-        or ecx,eax
-        shl eax,1
-        or ecx,eax
-        mov eax,edx
-        shr eax,4 // Intensity
-        and eax,0x000E0000
-        or ecx,eax
-        shr eax,3
-        and eax,0x00010000
-        or ecx,eax
-
-        // pixel #4
-        //       xxxxxxxxxxxxIIIAxxxxxxxxxxxxxxxx
-        //       AAAAIIIIxxxxxxxxxxxxxxxxxxxxxxxx
-        mov eax,edx
-        shl eax,12 //Alpha
-        and eax,0x10000000
-        or ecx,eax
-        shl eax,1
-        or ecx,eax
-        shl eax,1
-        or ecx,eax
-        shl eax,1
-        or ecx,eax
-        mov eax,edx
-        shl eax,8 // Intensity
-        and eax,0x0E000000
-        or ecx,eax
-        shr eax,3
-        and eax,0x01000000
-        or ecx,eax
-
-
-        mov [edi],ecx
-        add edi,4
-        // }
-
-// 2nd dword {
-        xor ecx,ecx
-
-        // pixel #5
-        //       xxxxxxxxxxxxxxxxIIIAxxxxxxxxxxxx
-        //       xxxxxxxxxxxxxxxxxxxxxxxxAAAAIIII
-        mov eax,edx
-        shr eax,8 //Alpha
-        and eax,0x00000010
-        or ecx,eax
-        shl eax,1
-        or ecx,eax
-        shl eax,1
-        or ecx,eax
-        shl eax,1
-        or ecx,eax
-        mov eax,edx
-        shr eax,12 // Intensity
-        and eax,0x0000000E
-        or ecx,eax
-        shr eax,3
-        or ecx,eax
-
-        // pixel #6
-        //       xxxxxxxxxxxxxxxxxxxxIIIAxxxxxxxx
-        //       xxxxxxxxxxxxxxxxAAAAIIIIxxxxxxxx
-        //Alpha
-        mov eax,edx
-        shl eax,4
-        and eax,0x00001000
-        or ecx,eax
-        shl eax,1
-        or ecx,eax
-        shl eax,1
-        or ecx,eax
-        shl eax,1
-        or ecx,eax
-        mov eax,edx     // Intensity
-        and eax,0x00000E00
-        or ecx,eax
-        shr eax,3
-        and eax,0x00000100
-        or ecx,eax
-
-        // pixel #7
-        //       xxxxxxxxxxxxxxxxxxxxxxxxIIIAxxxx
-        //       xxxxxxxxAAAAIIIIxxxxxxxxxxxxxxxx
-        //Alpha
-        mov eax,edx
-        shl eax,16
-        and eax,0x00100000
-        or ecx,eax
-        shl eax,1
-        or ecx,eax
-        shl eax,1
-        or ecx,eax
-        shl eax,1
-        or ecx,eax
-        mov eax,edx
-        shl eax,12 // Intensity
-        and eax,0x000E0000
-        or ecx,eax
-        shr eax,3
-        and eax,0x00010000
-        or ecx,eax
-
-        // pixel #8
-        //       xxxxxxxxxxxxxxxxxxxxxxxxxxxxIIIA
-        //       AAAAIIIIxxxxxxxxxxxxxxxxxxxxxxxx
-        mov eax,edx
-        shl eax,28 //Alpha
-        and eax,0x10000000
-        or ecx,eax
-        shl eax,1
-        or ecx,eax
-        shl eax,1
-        or ecx,eax
-        shl eax,1
-        or ecx,eax
-        mov eax,edx
-        shl eax,24 // Intensity
-        and eax,0x0E000000
-        or ecx,eax
-        shr eax,3
-        and eax,0x01000000
-        or ecx,eax
-
-        mov [edi],ecx
-        add edi,4
-        // }
-        // *
-
-        pop ecx
-        dec ecx
-        jnz x_loop_2
-
-        add esi,[line]
-        add edi,[ext]
-
-        pop ecx
-        dec ecx
-        jnz y_loop
-
-end_y_loop:
-        pop edi
-        pop esi
-        pop ebx
-		mov esp, ebp
-		pop ebp
-		ret
-	}
+  uint32_t *v6; // load4bIA body: expands each 4-bit IIIA texel into one AAAAIIII byte (layout per the asm this replaces); v6 = source read cursor
+  uint32_t *v7; // destination write cursor
+  int v8; // rows still to process, sampled at the top of each outer pass
+  int v9; // column counter for the first-row loop
+  int v10;
+  uint32_t v11;
+  uint32_t *v12;
+  uint32_t v13;
+  uint32_t v14;
+  uint32_t v15;
+  uint32_t *v16;
+  uint32_t v17;
+  uint32_t v18;
+  uint32_t v19;
+  uint32_t v20;
+  uint32_t v21;
+  uint32_t v22;
+  uint32_t v23;
+  uint32_t v24;
+  uint32_t v25;
+  uint32_t v26;
+  uint32_t v27;
+  uint32_t v28;
+  uint32_t v29;
+  uint32_t v30;
+  uint32_t v31;
+  uint32_t v32;
+  uint32_t *v33; // source cursor for the second row of a pair
+  uint32_t *v34; // destination cursor for the second row of a pair
+  int v35; // column counter for the second-row loop
+  int v36;
+  uint32_t v37;
+  uint32_t v38;
+  uint32_t v39;
+  uint32_t *v40;
+  uint32_t v41;
+  uint32_t v42;
+  uint32_t v43;
+  uint32_t v44;
+  uint32_t v45;
+  uint32_t v46;
+  uint32_t v47;
+  uint32_t v48;
+  uint32_t v49;
+  uint32_t v50;
+  uint32_t v51;
+  uint32_t v52;
+  uint32_t v53;
+  uint32_t v54;
+  uint32_t v55;
+  uint32_t v56;
+  int v57; // rows left on entry to the current pass
+  int v58; // rows left after the first row of the current pass
+
+  v6 = (uint32_t *)src;
+  v7 = (uint32_t *)dst;
+  v8 = height; // NOTE(review): height <= 0 is not guarded; the != 1 exit tests below would never fire - confirm callers
+  do // each pass handles up to two rows: one read in word order, one with the word pair swapped
+  {
+    v57 = v8;
+    v9 = wid_64; // NOTE(review): wid_64 <= 0 is not guarded either - confirm callers
+    do // first row: each iteration reads 2 source words (16 texels) and writes 4 destination words
+    {
+      v10 = v9;
+      v11 = bswap32(*v6); // bswap32 is defined elsewhere; the asm this replaces used `bswap eax` here, so texel 1 sits in bits 31..28
+      v12 = v6 + 1;
+      v13 = v11; // untouched copy of the swapped word; v11 is shifted/masked in place below
+      v14 = (8 * (v11 & 0x100000)) | (4 * (v11 & 0x100000)) | (2 * (v11 & 0x100000)) | (v11 & 0x100000) | ((((v11 >> 16) & 0xE00) >> 3) & 0x100) | ((v11 >> 16) & 0xE00) | (8 * ((v11 >> 12) & 0x1000)) | (4 * ((v11 >> 12) & 0x1000)) | (2 * ((v11 >> 12) & 0x1000)) | ((v11 >> 12) & 0x1000) | ((((v11 >> 28) & 0xE) >> 3)) | ((v11 >> 28) & 0xE) | (8 * ((v11 >> 24) & 0x10)) | (4 * ((v11 >> 24) & 0x10)) | (2 * ((v11 >> 24) & 0x10)) | ((v11 >> 24) & 0x10); // texels 1 and 2 complete (A bit replicated x4, III kept, top I bit copied to the byte's bit 0) plus texel 3's replicated A bit
+      v11 >>= 4;
+      v11 &= 0xE0000u; // texel 3's three intensity bits, now at bits 19..17
+      v15 = v11 | v14;
+      v11 >>= 3; // brings texel 3's top intensity bit down to bit 16 for the mask in the store below
+      *v7 = ((((v13 << 8) & 0xE000000) >> 3) & 0x1000000) | ((v13 << 8) & 0xE000000) | (8 * ((v13 << 12) & 0x10000000)) | (4 * ((v13 << 12) & 0x10000000)) | (2 * ((v13 << 12) & 0x10000000)) | ((v13 << 12) & 0x10000000) | (v11 & 0x10000) | v15; // adds texel 4 in bits 31..24 and stores output word 1 (texels 1-4, texel 1 in the low byte)
+      v16 = v7 + 1;
+      v17 = 16 * (uint16_t)v13 & 0x1000; // texel 6's A bit (source bit 8) moved to bit 12
+      v18 = (((v13 & 0xE00) >> 3) & 0x100) | (v13 & 0xE00) | (8 * v17) | (4 * v17) | (2 * v17) | (v17) | ((((v13 >> 12) & 0xE) >> 3)) | ((v13 >> 12) & 0xE) | (8 * ((v13 >> 8) & 0x10)) | (4 * ((v13 >> 8) & 0x10)) | (2 * ((v13 >> 8) & 0x10)) | ((v13 >> 8) & 0x10); // texels 5 and 6 complete
+      v19 = v13 << 16;
+      v20 = (8 * (v19 & 0x100000)) | (4 * (v19 & 0x100000)) | (2 * (v19 & 0x100000)) | (v19 & 0x100000) | v18; // adds texel 7's replicated A bit
+      v21 = v13 << 12;
+      v21 &= 0xE0000u; // texel 7's three intensity bits
+      v22 = v21 | v20;
+      v21 >>= 3;
+      *v16 = ((((v13 << 24) & 0xE000000) >> 3) & 0x1000000) | ((v13 << 24) & 0xE000000) | (8 * ((v13 << 28) & 0x10000000)) | (4 * ((v13 << 28) & 0x10000000)) | (2 * ((v13 << 28) & 0x10000000)) | ((v13 << 28) & 0x10000000) | (v21 & 0x10000) | v22; // adds texel 8 and stores output word 2 (texels 5-8)
+      ++v16;
+      v23 = bswap32(*v12); // second source word of this iteration, converted exactly like the first
+      v6 = v12 + 1;
+      v24 = v23;
+      v25 = (8 * (v23 & 0x100000)) | (4 * (v23 & 0x100000)) | (2 * (v23 & 0x100000)) | (v23 & 0x100000) | ((((v23 >> 16) & 0xE00) >> 3) & 0x100) | ((v23 >> 16) & 0xE00) | (8 * ((v23 >> 12) & 0x1000)) | (4 * ((v23 >> 12) & 0x1000)) | (2 * ((v23 >> 12) & 0x1000)) | ((v23 >> 12) & 0x1000) | (((v23 >> 28) & 0xE) >> 3) | ((v23 >> 28) & 0xE) | (8 * ((v23 >> 24) & 0x10)) | (4 * ((v23 >> 24) & 0x10)) | (2 * ((v23 >> 24) & 0x10)) | ((v23 >> 24) & 0x10); // texels 1 and 2 plus texel 3's A bits
+      v23 >>= 4;
+      v23 &= 0xE0000u;
+      v26 = v23 | v25;
+      v23 >>= 3;
+      *v16 = ((((v24 << 8) & 0xE000000) >> 3) & 0x1000000) | ((v24 << 8) & 0xE000000) | (8 * ((v24 << 12) & 0x10000000)) | (4 * ((v24 << 12) & 0x10000000)) | (2 * ((v24 << 12) & 0x10000000)) | ((v24 << 12) & 0x10000000) | (v23 & 0x10000) | (v26); // output word 3
+      ++v16;
+      v27 = 16 * (uint16_t)v24 & 0x1000;
+      v28 = (((v24 & 0xE00) >> 3) & 0x100) | (v24 & 0xE00) | (8 * v27) | (4 * v27) | (2 * v27) | (v27) | ((((v24 >> 12) & 0xE) >> 3)) | ((v24 >> 12) & 0xE) | (8 * ((v24 >> 8) & 0x10)) | (4 * ((v24 >> 8) & 0x10)) | (2 * ((v24 >> 8) & 0x10)) | ((v24 >> 8) & 0x10); // texels 5 and 6
+      v29 = v24 << 16;
+      v30 = (8 * (v29 & 0x100000)) | (4 * (v29 & 0x100000)) | (2 * (v29 & 0x100000)) | (v29 & 0x100000) | v28;
+      v31 = v24 << 12;
+      v31 &= 0xE0000u;
+      v32 = v31 | v30;
+      v31 >>= 3;
+      *v16 = ((((v24 << 24) & 0xE000000) >> 3) & 0x1000000) | ((v24 << 24) & 0xE000000) | (8 * ((v24 << 28) & 0x10000000)) | (4 * ((v24 << 28) & 0x10000000)) | (2 * ((v24 << 28) & 0x10000000)) | ((v24 << 28) & 0x10000000) | (v31 & 0x10000) | v32; // output word 4
+      v7 = v16 + 1;
+      v9 = v10 - 1;
+    }
+    while ( v10 != 1 ); // tests the pre-decrement count, so the loop exits after the iteration that started at 1
+    if ( v57 == 1 ) // that was the last row: no second row in this pass
+      break;
+    v58 = v57 - 1;
+    v33 = (uint32_t *)((char *)v6 + line); // skip `line` bytes of source between rows
+    v34 = (uint32_t *)((char *)v7 + ext); // skip `ext` bytes of destination between rows
+    v35 = wid_64;
+    do // second row: same conversion, but word [1] of each 8-byte group is consumed before word [0]
+    {
+      v36 = v35;
+      v37 = bswap32(v33[1]); // swapped read order - presumably undoing a word swap on alternate source rows; confirm against the caller's data layout
+      v38 = v37 >> 4;
+      v38 &= 0xE0000u;
+      v39 = v38 | (8 * (v37 & 0x100000)) | (4 * (v37 & 0x100000)) | (2 * (v37 & 0x100000)) | (v37 & 0x100000) | ((((v37 >> 16) & 0xE00) >> 3) & 0x100) | ((v37 >> 16) & 0xE00) | (8 * ((v37 >> 12) & 0x1000)) | (4 * ((v37 >> 12) & 0x1000)) | (2 * ((v37 >> 12) & 0x1000)) | ((v37 >> 12) & 0x1000) | (((v37 >> 28) & 0xE) >> 3) | ((v37 >> 28) & 0xE) | (8 * ((v37 >> 24) & 0x10)) | (4 * ((v37 >> 24) & 0x10)) | (2 * ((v37 >> 24) & 0x10)) | ((v37 >> 24) & 0x10); // texels 1 and 2 plus texel 3's A bits and III bits
+      v38 >>= 3;
+      *v34 = ((((v37 << 8) & 0xE000000) >> 3) & 0x1000000) | ((v37 << 8) & 0xE000000) | (8 * ((v37 << 12) & 0x10000000)) | (4 * ((v37 << 12) & 0x10000000)) | (2 * ((v37 << 12) & 0x10000000)) | ((v37 << 12) & 0x10000000) | (v38 & 0x10000) | v39; // output word 1
+      v40 = v34 + 1;
+      v41 = 16 * (uint16_t)v37 & 0x1000;
+      v42 = (((v37 & 0xE00) >> 3) & 0x100) | (v37 & 0xE00) | (8 * v41) | (4 * v41) | (2 * v41) | v41 | (((v37 >> 12) & 0xE) >> 3) | ((v37 >> 12) & 0xE) | (8 * ((v37 >> 8) & 0x10)) | (4 * ((v37 >> 8) & 0x10)) | (2 * ((v37 >> 8) & 0x10)) | ((v37 >> 8) & 0x10); // texels 5 and 6
+      v43 = v37 << 16;
+      v44 = (8 * (v43 & 0x100000)) | (4 * (v43 & 0x100000)) | (2 * (v43 & 0x100000)) | (v43 & 0x100000) | v42;
+      v45 = v37 << 12;
+      v45 &= 0xE0000u;
+      v46 = v45 | v44;
+      v45 >>= 3;
+      *v40 = ((((v37 << 24) & 0xE000000) >> 3) & 0x1000000) | ((v37 << 24) & 0xE000000) | (8 * ((v37 << 28) & 0x10000000)) | (4 * ((v37 << 28) & 0x10000000)) | (2 * ((v37 << 28) & 0x10000000)) | ((v37 << 28) & 0x10000000) | (v45 & 0x10000) | v46; // output word 2
+      ++v40;
+      v47 = bswap32(*v33); // word [0] of the group, consumed second
+      v33 += 2; // advance past the whole 8-byte group
+      v48 = v47;
+      v49 = (8 * (v47 & 0x100000)) | (4 * (v47 & 0x100000)) | (2 * (v47 & 0x100000)) | (v47 & 0x100000) | ((((v47 >> 16) & 0xE00) >> 3) & 0x100) | ((v47 >> 16) & 0xE00) | (8 * ((v47 >> 12) & 0x1000)) | (4 * ((v47 >> 12) & 0x1000)) | (2 * ((v47 >> 12) & 0x1000)) | ((v47 >> 12) & 0x1000) | (((v47 >> 28) & 0xE) >> 3) | ((v47 >> 28) & 0xE) | (8 * ((v47 >> 24) & 0x10)) | (4 * ((v47 >> 24) & 0x10)) | (2 * ((v47 >> 24) & 0x10)) | ((v47 >> 24) & 0x10); // texels 1 and 2 plus texel 3's A bits
+      v47 >>= 4;
+      v47 &= 0xE0000u;
+      v50 = v47 | v49;
+      v47 >>= 3;
+      *v40 = ((((v48 << 8) & 0xE000000) >> 3) & 0x1000000) | ((v48 << 8) & 0xE000000) | (8 * ((v48 << 12) & 0x10000000)) | (4 * ((v48 << 12) & 0x10000000)) | (2 * ((v48 << 12) & 0x10000000)) | ((v48 << 12) & 0x10000000) | (v47 & 0x10000) | v50; // output word 3
+      ++v40;
+      v51 = 16 * (uint16_t)v48 & 0x1000;
+      v52 = (((v48 & 0xE00) >> 3) & 0x100) | (v48 & 0xE00) | (8 * v51) | (4 * v51) | (2 * v51) | v51 | (((v48 >> 12) & 0xE) >> 3) | ((v48 >> 12) & 0xE) | (8 * ((v48 >> 8) & 0x10)) | (4 * ((v48 >> 8) & 0x10)) | (2 * ((v48 >> 8) & 0x10)) | ((v48 >> 8) & 0x10); // texels 5 and 6
+      v53 = v48 << 16;
+      v54 = (8 * (v53 & 0x100000)) | (4 * (v53 & 0x100000)) | (2 * (v53 & 0x100000)) | (v53 & 0x100000) | v52;
+      v55 = v48 << 12;
+      v55 &= 0xE0000u;
+      v56 = v55 | v54;
+      v55 >>= 3;
+      *v40 = ((((v48 << 24) & 0xE000000) >> 3) & 0x1000000) | ((v48 << 24) & 0xE000000) | (8 * ((v48 << 28) & 0x10000000)) | (4 * ((v48 << 28) & 0x10000000)) | (2 * ((v48 << 28) & 0x10000000)) | ((v48 << 28) & 0x10000000) | (v55 & 0x10000) | v56; // output word 4
+      v34 = v40 + 1;
+      v35 = v36 - 1;
+    }
+    while ( v36 != 1 );
+    v6 = (uint32_t *)((char *)v33 + line); // step both cursors to the next row pair
+    v7 = (uint32_t *)((char *)v34 + ext);
+    v8 = v58 - 1;
+  }
+  while ( v58 != 1 ); // stop once the second row of this pass was the last row
 }
 
-extern "C" void  __declspec(naked) asmLoad4bI (wxUIntPtr src, int dst, wxUIntPtr wid_64, int height, int line, int ext)
+static inline void load4bI(uint8_t *src, uint8_t *dst, int wid_64, int height, int line, int ext)
 {
-	_asm {
-		push ebp
-		mov ebp, esp
-		push ebx
-        push esi
-        push edi
+  uint32_t *v6;
+  uint32_t *v7;
+  int v8;
+  int v9;
+  int v10;
+  uint32_t v11;
+  uint32_t *v12;
+  uint32_t v13;
+  uint32_t v14;
+  uint32_t *v15;
+  uint32_t v16;
+  unsigned int v17;
+  unsigned int v18;
+  uint32_t v19;
+  uint32_t v20;
+  uint32_t *v21;
+  uint32_t *v22;
+  int v23;
+  int v24;
+  uint32_t v25;
+  uint32_t v26;
+  uint32_t *v27;
+  uint32_t v28;
+  uint32_t v29;
+  uint32_t v30;
+  uint32_t v31;
+  uint32_t v32;
+  int v33;
+  int v34;
 
-        mov esi,[src]
-        mov edi,[dst]
-        mov ecx,[height]
-y_loop:
-        push ecx
-        mov ecx,[wid_64]
-x_loop:
-        push ecx
-
-        mov eax,[esi]           // read all 8 pixels
-        bswap eax
-        add esi,4
-        mov edx,eax
-
-        // 1st dword {
-        xor ecx,ecx
-        shr eax,28              // 0xF0000000 -> 0x0000000F
-        or ecx,eax
-        shl eax,4
-        or ecx,eax
-
-        mov eax,edx             // 0x0F000000 -> 0x00000F00
-        shr eax,16
-        and eax,0x00000F00
-        or ecx,eax
-        shl eax,4
-        or ecx,eax
-
-        mov eax,edx
-        shr eax,4               // 0x00F00000 -> 0x000F0000
-        and eax,0x000F0000
-        or ecx,eax
-        shl eax,4
-        or ecx,eax
-
-        mov eax,edx
-        shl eax,8               // 0x000F0000 -> 0x0F000000
-        and eax,0x0F000000
-        or ecx,eax
-        shl eax,4
-        or ecx,eax
-
-        mov [edi],ecx
-        add edi,4
-        // }
-
-        // 2nd dword {
-        xor ecx,ecx
-        mov eax,edx
-        shr eax,12              // 0x0000F000 -> 0x0000000F
-        and eax,0x0000000F
-        or ecx,eax
-        shl eax,4
-        or ecx,eax
-
-        mov eax,edx             // 0x00000F00 -> 0x00000F00
-        and eax,0x00000F00
-        or ecx,eax
-        shl eax,4
-        or ecx,eax
-
-        mov eax,edx
-        shl eax,12              // 0x000000F0 -> 0x000F0000
-        and eax,0x000F0000
-        or ecx,eax
-        shl eax,4
-        or ecx,eax
-
-        shl edx,24              // 0x0000000F -> 0x0F000000
-        and edx,0x0F000000
-        or ecx,edx
-        shl edx,4
-        or ecx,edx
-
-        mov [edi],ecx
-        add edi,4
-        // }
-
-        // * copy
-        mov eax,[esi]           // read all 8 pixels
-        bswap eax
-        add esi,4
-        mov edx,eax
-
-        // 1st dword {
-        xor ecx,ecx
-        shr eax,28              // 0xF0000000 -> 0x0000000F
-        or ecx,eax
-        shl eax,4
-        or ecx,eax
-
-        mov eax,edx             // 0x0F000000 -> 0x00000F00
-        shr eax,16
-        and eax,0x00000F00
-        or ecx,eax
-        shl eax,4
-        or ecx,eax
-
-        mov eax,edx
-        shr eax,4               // 0x00F00000 -> 0x000F0000
-        and eax,0x000F0000
-        or ecx,eax
-        shl eax,4
-        or ecx,eax
-
-        mov eax,edx
-        shl eax,8               // 0x000F0000 -> 0x0F000000
-        and eax,0x0F000000
-        or ecx,eax
-        shl eax,4
-        or ecx,eax
-
-        mov [edi],ecx
-        add edi,4
-        // }
-
-        // 2nd dword {
-        xor ecx,ecx
-        mov eax,edx
-        shr eax,12              // 0x0000F000 -> 0x0000000F
-        and eax,0x0000000F
-        or ecx,eax
-        shl eax,4
-        or ecx,eax
-
-        mov eax,edx             // 0x00000F00 -> 0x00000F00
-        and eax,0x00000F00
-        or ecx,eax
-        shl eax,4
-        or ecx,eax
-
-        mov eax,edx
-        shl eax,12              // 0x000000F0 -> 0x000F0000
-        and eax,0x000F0000
-        or ecx,eax
-        shl eax,4
-        or ecx,eax
-
-        shl edx,24              // 0x0000000F -> 0x0F000000
-        and edx,0x0F000000
-        or ecx,edx
-        shl edx,4
-        or ecx,edx
-
-        mov [edi],ecx
-        add edi,4
-        // }
-        // *
-
-        pop ecx
-        dec ecx
-        jnz x_loop
-
-        pop ecx
-        dec ecx
-        jz near end_y_loop
-        push ecx
-
-        add esi,[line]
-        add edi,[ext]
-
-        mov ecx,[wid_64]
-x_loop_2:
-        push ecx
-
-        mov eax,[esi+4]         // read all 8 pixels
-        bswap eax
-        mov edx,eax
-
-        // 1st dword {
-        xor ecx,ecx
-        shr eax,28              // 0xF0000000 -> 0x0000000F
-        or ecx,eax
-        shl eax,4
-        or ecx,eax
-
-        mov eax,edx             // 0x0F000000 -> 0x00000F00
-        shr eax,16
-        and eax,0x00000F00
-        or ecx,eax
-        shl eax,4
-        or ecx,eax
-
-        mov eax,edx
-        shr eax,4               // 0x00F00000 -> 0x000F0000
-        and eax,0x000F0000
-        or ecx,eax
-        shl eax,4
-        or ecx,eax
-
-        mov eax,edx
-        shl eax,8               // 0x000F0000 -> 0x0F000000
-        and eax,0x0F000000
-        or ecx,eax
-        shl eax,4
-        or ecx,eax
-
-        mov [edi],ecx
-        add edi,4
-        // }
-
-        // 2nd dword {
-        xor ecx,ecx
-        mov eax,edx
-        shr eax,12              // 0x0000F000 -> 0x0000000F
-        and eax,0x0000000F
-        or ecx,eax
-        shl eax,4
-        or ecx,eax
-
-        mov eax,edx             // 0x00000F00 -> 0x00000F00
-        and eax,0x00000F00
-        or ecx,eax
-        shl eax,4
-        or ecx,eax
-
-        mov eax,edx
-        shl eax,12              // 0x000000F0 -> 0x000F0000
-        and eax,0x000F0000
-        or ecx,eax
-        shl eax,4
-        or ecx,eax
-
-        shl edx,24              // 0x0000000F -> 0x0F000000
-        and edx,0x0F000000
-        or ecx,edx
-        shl edx,4
-        or ecx,edx
-
-        mov [edi],ecx
-        add edi,4
-        // }
-
-        // * copy
-        mov eax,[esi]           // read all 8 pixels
-        bswap eax
-        add esi,8
-        mov edx,eax
-
-        // 1st dword {
-        xor ecx,ecx
-        shr eax,28              // 0xF0000000 -> 0x0000000F
-        or ecx,eax
-        shl eax,4
-        or ecx,eax
-
-        mov eax,edx             // 0x0F000000 -> 0x00000F00
-        shr eax,16
-        and eax,0x00000F00
-        or ecx,eax
-        shl eax,4
-        or ecx,eax
-
-        mov eax,edx
-        shr eax,4               // 0x00F00000 -> 0x000F0000
-        and eax,0x000F0000
-        or ecx,eax
-        shl eax,4
-        or ecx,eax
-
-        mov eax,edx
-        shl eax,8               // 0x000F0000 -> 0x0F000000
-        and eax,0x0F000000
-        or ecx,eax
-        shl eax,4
-        or ecx,eax
-
-        mov [edi],ecx
-        add edi,4
-        // }
-
-        // 2nd dword {
-        xor ecx,ecx
-        mov eax,edx
-        shr eax,12              // 0x0000F000 -> 0x0000000F
-        and eax,0x0000000F
-        or ecx,eax
-        shl eax,4
-        or ecx,eax
-
-        mov eax,edx             // 0x00000F00 -> 0x00000F00
-        and eax,0x00000F00
-        or ecx,eax
-        shl eax,4
-        or ecx,eax
-
-        mov eax,edx
-        shl eax,12              // 0x000000F0 -> 0x000F0000
-        and eax,0x000F0000
-        or ecx,eax
-        shl eax,4
-        or ecx,eax
-
-        shl edx,24              // 0x0000000F -> 0x0F000000
-        and edx,0x0F000000
-        or ecx,edx
-        shl edx,4
-        or ecx,edx
-
-        mov [edi],ecx
-        add edi,4
-        // }
-        // *
-
-        pop ecx
-        dec ecx
-        jnz x_loop_2
-
-        add esi,[line]
-        add edi,[ext]
-
-        pop ecx
-        dec ecx
-        jnz y_loop
-
-end_y_loop:
-        pop edi
-        pop esi
-        pop ebx
-		mov esp, ebp
-		pop ebp
-		ret
-	}
+  v6 = (uint32_t *)src;
+  v7 = (uint32_t *)dst;
+  v8 = height;
+  do
+  {
+    v33 = v8;
+    v9 = wid_64;
+    do
+    {
+      v10 = v9;
+      v11 = bswap32(*v6);
+      v12 = v6 + 1;
+      v13 = v11;
+      v14 = (16 * ((v11 >> 16) & 0xF00)) | ((v11 >> 16) & 0xF00) | (16 * (v11 >> 28)) | (v11 >> 28);
+      v11 >>= 4;
+      *v7 = (16 * ((v13 << 8) & 0xF000000)) | ((v13 << 8) & 0xF000000) | (16 * (v11 & 0xF0000)) | (v11 & 0xF0000) | v14;
+      v15 = v7 + 1;
+      v16 = v13 << 12;
+      *v15 = (16 * ((v13 << 24) & 0xF000000)) | ((v13 << 24) & 0xF000000) | (16 * (v16 & 0xF0000)) | (v16 & 0xF0000) | (16 * (v13 & 0xF00)) | (v13 & 0xF00) | (16 * ((uint16_t)v13 >> 12)) | ((uint16_t)v13 >> 12);
+      ++v15;
+      v17 = bswap32(*v12);
+      v6 = v12 + 1;
+      v18 = v17;
+      v19 = (16 * ((v17 >> 16) & 0xF00)) | ((v17 >> 16) & 0xF00) | (16 * (v17 >> 28)) | (v17 >> 28);
+      v17 >>= 4;
+      *v15 = (16 * ((v18 << 8) & 0xF000000)) | ((v18 << 8) & 0xF000000) | (16 * (v17 & 0xF0000)) | (v17 & 0xF0000) | v19;
+      ++v15;
+      v20 = v18 << 12;
+      *v15 = (16 * ((v18 << 24) & 0xF000000)) | ((v18 << 24) & 0xF000000) | (16 * (v20 & 0xF0000)) | (v20 & 0xF0000) | (16 * (v18 & 0xF00)) | (v18 & 0xF00) | (16 * ((uint16_t)v18 >> 12)) | ((uint16_t)v18 >> 12);
+      v7 = v15 + 1;
+      v9 = v10 - 1;
+    }
+    while ( v10 != 1 );
+    if ( v33 == 1 )
+      break;
+    v34 = v33 - 1;
+    v21 = (uint32_t *)((char *)v6 + line);
+    v22 = (uint32_t *)((char *)v7 + ext);
+    v23 = wid_64;
+    do
+    {
+      v24 = v23;
+      v25 = bswap32(v21[1]);
+      v26 = v25 >> 4;
+      *v22 = (16 * ((v25 << 8) & 0xF000000)) | ((v25 << 8) & 0xF000000) | (16 * (v26 & 0xF0000)) | (v26 & 0xF0000) | (16 * ((v25 >> 16) & 0xF00)) | ((v25 >> 16) & 0xF00) | (16 * (v25 >> 28)) | (v25 >> 28);
+      v27 = v22 + 1;
+      v28 = v25 << 12;
+      *v27 = (16 * ((v25 << 24) & 0xF000000)) | ((v25 << 24) & 0xF000000) | (16 * (v28 & 0xF0000)) | (v28 & 0xF0000) | (16 * (v25 & 0xF00)) | (v25 & 0xF00) | (16 * ((uint16_t)v25 >> 12)) | ((uint16_t)v25 >> 12);
+      ++v27;
+      v29 = bswap32(*v21);
+      v21 += 2;
+      v30 = v29;
+      v31 = (16 * ((v29 >> 16) & 0xF00)) | ((v29 >> 16) & 0xF00) | (16 * (v29 >> 28)) | (v29 >> 28);
+      v29 >>= 4;
+      *v27 = (16 * ((v30 << 8) & 0xF000000)) | ((v30 << 8) & 0xF000000) | (16 * (v29 & 0xF0000)) | (v29 & 0xF0000) | v31;
+      ++v27;
+      v32 = v30 << 12;
+      *v27 = (16 * ((v30 << 24) & 0xF000000)) | ((v30 << 24) & 0xF000000) | (16 * (v32 & 0xF0000)) | (v32 & 0xF0000) | (16 * (v30 & 0xF00)) | (v30 & 0xF00) | (16 * ((uint16_t)v30 >> 12)) | ((uint16_t)v30 >> 12);
+      v22 = v27 + 1;
+      v23 = v24 - 1;
+    }
+    while ( v24 != 1 );
+    v6 = (uint32_t *)((char *)v21 + line);
+    v7 = (uint32_t *)((char *)v22 + ext);
+    v8 = v34 - 1;
+  }
+  while ( v34 != 1 );
 }
 
 //****************************************************************
@@ -1937,24 +594,26 @@ wxUint32 Load4bCI (wxUIntPtr dst, wxUIntPtr src, int wid_64, int height, int lin
 {
   if (wid_64 < 1) wid_64 = 1;
   if (height < 1) height = 1;
-  int ext = (real_width - (wid_64 << 4)) << 1;
+  int ext = (real_width - (wid_64 << 4));
 
-  if (rdp.tlut_mode == 0) 
+  if (rdp.tlut_mode == 0)
   {
-    //in tlut DISABLE mode load CI texture as plain intensity texture instead of palette dereference. 
+    //in tlut DISABLE mode load CI texture as plain intensity texture instead of palette dereference.
     //Thanks to angrylion for the advice
-    asmLoad4bI (src, dst, wid_64, height, line, ext);	
+    load4bI ((uint8_t *)src, (uint8_t *)dst, wid_64, height, line, ext);
     return /*(0 << 16) | */GR_TEXFMT_ALPHA_INTENSITY_44;
   }
 
   wxUIntPtr pal = wxPtrToUInt(rdp.pal_8 + (rdp.tiles[tile].palette << 4));
-  if (rdp.tlut_mode == 2) 
+  if (rdp.tlut_mode == 2)
   {
-    asmLoad4bCI (src, dst, wid_64, height, line, ext, pal);
+    ext <<= 1;
+    load4bCI ((uint8_t *)src, (uint8_t *)dst, wid_64, height, line, ext, (uint16_t *)pal);
     return (1 << 16) | GR_TEXFMT_ARGB_1555;
   }
 
-  asmLoad4bIAPal (src, dst, wid_64, height, line, ext, pal);
+  ext <<= 1;
+  load4bIAPal ((uint8_t *)src, (uint8_t *)dst, wid_64, height, line, ext, (uint16_t *)pal);
   return (1 << 16) | GR_TEXFMT_ALPHA_INTENSITY_88;
 }
 
@@ -1971,7 +630,7 @@ wxUint32 Load4bIA (wxUIntPtr dst, wxUIntPtr src, int wid_64, int height, int lin
   if (wid_64 < 1) wid_64 = 1;
   if (height < 1) height = 1;
   int ext = (real_width - (wid_64 << 4));
-  asmLoad4bIA (src, dst, wid_64, height, line, ext);	
+  load4bIA ((uint8_t *)src, (uint8_t *)dst, wid_64, height, line, ext);
   return /*(0 << 16) | */GR_TEXFMT_ALPHA_INTENSITY_44;
 }
 
@@ -1986,7 +645,8 @@ wxUint32 Load4bI (wxUIntPtr dst, wxUIntPtr src, int wid_64, int height, int line
   if (wid_64 < 1) wid_64 = 1;
   if (height < 1) height = 1;
   int ext = (real_width - (wid_64 << 4));
-  asmLoad4bI (src, dst, wid_64, height, line, ext);
+  load4bI ((uint8_t *)src, (uint8_t *)dst, wid_64, height, line, ext);
+  
   return /*(0 << 16) | */GR_TEXFMT_ALPHA_INTENSITY_44;
 }
 
diff --git a/Source/Glide64/TexLoad8b.h b/Source/Glide64/TexLoad8b.h
index 388e55e7c..64e01eb40 100644
--- a/Source/Glide64/TexLoad8b.h
+++ b/Source/Glide64/TexLoad8b.h
@@ -37,629 +37,336 @@
 //
 //****************************************************************
 
-extern "C" void  __declspec(naked) asmLoad8bCI (wxUIntPtr src, wxUIntPtr dst, int wid_64, int height, int line, int ext, wxUIntPtr pal)
+static inline void load8bCI(uint8_t *src, uint8_t *dst, int wid_64, int height, int line, int ext, uint16_t *pal) // C replacement for asmLoad8bCI: every source byte selects a 16-bit entry of pal, which is stored rotated right by 1 bit
 {
-	_asm {
-		push ebp
-		mov ebp, esp
-		push ebx
-        push esi
-        push edi
+  uint8_t *v7; // source read position for the in-order row
+  uint32_t *v8; // destination write position
+  int v9; // rows still to process at the top of the outer loop
+  int v10; // inner loop counter; its low 16 bits double as scratch, like cx in the removed asm
+  int v11; // saved copy of the inner counter
+  uint32_t v12; // first byte-swapped source dword (4 indices)
+  uint32_t *v13;
+  uint32_t v14; // output dword being assembled from two palette entries
+  uint32_t *v15;
+  uint32_t v16; // second byte-swapped source dword
+  uint32_t *v17; // source read position for the dword-swapped row
+  uint32_t *v18; // destination write position for the dword-swapped row
+  int v19;
+  int v20;
+  uint32_t v21;
+  uint32_t v22;
+  uint32_t *v23;
+  uint32_t v24;
+  int v25; // row count saved across the inner loop
+  int v26; // row count after the in-order row has been consumed
 
-        mov ebx,[pal]
-        mov esi,[src]
-        mov edi,[dst]
-        mov ecx,[height]
-y_loop:
-        push ecx
-        mov ecx,[wid_64]
-x_loop:
-        push ecx
-
-        mov eax,[esi]           // read all 4 pixels
-        bswap eax
-        add esi,4
-        mov edx,eax
-
-        // 1st dword output {
-        shr eax,15
-        and eax,0x1FE
-        mov cx,[ebx+eax]
-        ror cx,1
-        shl ecx,16
-
-        mov eax,edx
-        shr eax,23
-        and eax,0x1FE
-        mov cx,[ebx+eax]
-        ror cx,1
-
-        mov [edi],ecx
-        add edi,4
-        // }
-
-        // 2nd dword output {
-        mov eax,edx
-        shl eax,1
-        and eax,0x1FE
-        mov cx,[ebx+eax]
-        ror cx,1
-        shl ecx,16
-
-        shr edx,7
-        and edx,0x1FE
-        mov cx,[ebx+edx]
-        ror cx,1
-
-        mov [edi],ecx
-        add edi,4
-        // }
-
-        // * copy
-        mov eax,[esi]           // read all 4 pixels
-        bswap eax
-        add esi,4
-        mov edx,eax
-
-        // 1st dword output {
-        shr eax,15
-        and eax,0x1FE
-        mov cx,[ebx+eax]
-        ror cx,1
-        shl ecx,16
-
-        mov eax,edx
-        shr eax,23
-        and eax,0x1FE
-        mov cx,[ebx+eax]
-        ror cx,1
-
-        mov [edi],ecx
-        add edi,4
-        // }
-
-        // 2nd dword output {
-        mov eax,edx
-        shl eax,1
-        and eax,0x1FE
-        mov cx,[ebx+eax]
-        ror cx,1
-        shl ecx,16
-
-        shr edx,7
-        and edx,0x1FE
-        mov cx,[ebx+edx]
-        ror cx,1
-
-        mov [edi],ecx
-        add edi,4
-        // }
-        // *
-
-        pop ecx
-
-        dec ecx
-        jnz x_loop
-
-        pop ecx
-        dec ecx
-        jz near end_y_loop
-        push ecx
-
-        mov eax,esi
-        add eax,[line]
-        mov esi,[src]
-        sub eax,esi
-        and eax,0x7FF
-        add esi,eax
-        add edi,[ext]
-
-        mov ecx,[wid_64]
-x_loop_2:
-        push ecx
-
-        mov eax,[esi+4]         // read all 4 pixels
-        bswap eax
-        mov edx,eax
-
-        // 1st dword output {
-        shr eax,15
-        and eax,0x1FE
-        mov cx,[ebx+eax]
-        ror cx,1
-        shl ecx,16
-
-        mov eax,edx
-        shr eax,23
-        and eax,0x1FE
-        mov cx,[ebx+eax]
-        ror cx,1
-
-        mov [edi],ecx
-        add edi,4
-        // }
-
-        // 2nd dword output {
-        mov eax,edx
-        shl eax,1
-        and eax,0x1FE
-        mov cx,[ebx+eax]
-        ror cx,1
-        shl ecx,16
-
-        shr edx,7
-        and edx,0x1FE
-        mov cx,[ebx+edx]
-        ror cx,1
-
-        mov [edi],ecx
-        add edi,4
-        // }
-
-        // * copy
-        mov eax,[esi]           // read all 4 pixels
-        bswap eax
-        mov edx,esi
-        add edx,8
-        mov esi,[src]
-        sub edx,esi
-        and edx,0x7FF
-        add esi,edx
-        mov edx,eax
-
-        // 1st dword output {
-        shr eax,15
-        and eax,0x1FE
-        mov cx,[ebx+eax]
-        ror cx,1
-        shl ecx,16
-
-        mov eax,edx
-        shr eax,23
-        and eax,0x1FE
-        mov cx,[ebx+eax]
-        ror cx,1
-
-        mov [edi],ecx
-        add edi,4
-        // }
-
-        // 2nd dword output {
-        mov eax,edx
-        shl eax,1
-        and eax,0x1FE
-        mov cx,[ebx+eax]
-        ror cx,1
-        shl ecx,16
-
-        shr edx,7
-        and edx,0x1FE
-        mov cx,[ebx+edx]
-        ror cx,1
-
-        mov [edi],ecx
-        add edi,4
-        // }
-        // *
-
-        pop ecx
-
-        dec ecx
-        jnz x_loop_2
-
-        mov eax,esi
-        add eax,[line]
-        mov esi,[src]
-        sub eax,esi
-        and eax,0x7FF
-        add esi,eax
-        add edi,[ext]
-
-        pop ecx
-        dec ecx
-        jnz y_loop
-
-end_y_loop:
-        pop edi
-        pop esi
-        pop ebx
-		mov esp, ebp
-		pop ebp
-		ret
-	}
+  v7 = src;
+  v8 = (uint32_t *)dst;
+  v9 = height; // needs height >= 1 and wid_64 >= 1: both loops are do/while; Load8bCI clamps them before calling
+  do // one pass = one row read in order, then (if any rows remain) one row with the dwords of each 8-byte group swapped
+  {
+    v25 = v9;
+    v10 = wid_64;
+    do // wid_64 iterations, each reading 8 source bytes and writing 4 dwords (8 palette entries)
+    {
+      v11 = v10; // preserve the counter; v10's low half is overwritten below
+      v12 = bswap32(*(uint32_t *)v7); // after the swap the first byte in memory sits in bits 31..24
+      v13 = (uint32_t *)(v7 + 4);
+      ALOWORD(v10) = __ROR__(*(uint16_t *)((char *)pal + ((v12 >> 15) & 0x1FE)), 1); // byte offset 2*index for the index in bits 23..16; mirrors removed asm "mov cx,[ebx+eax]" / "ror cx,1"
+      v14 = v10 << 16; // mirrors "shl ecx,16": the entry just fetched moves to the upper half
+      ALOWORD(v14) = __ROR__(*(uint16_t *)((char *)pal + ((v12 >> 23) & 0x1FE)), 1); // index in bits 31..24 fills the lower half
+      *v8 = v14;
+      v15 = v8 + 1;
+      ALOWORD(v14) = __ROR__(*(uint16_t *)((char *)pal + (2 * (uint16_t)v12 & 0x1FE)), 1); // index in bits 7..0
+      v14 <<= 16;
+      ALOWORD(v14) = __ROR__(*(uint16_t *)((char *)pal + ((v12 >> 7) & 0x1FE)), 1); // index in bits 15..8
+      *v15 = v14;
+      ++v15;
+      v16 = bswap32(*v13); // second dword of the 8-byte group, same lookup pattern as above
+      v7 = (uint8_t *)(v13 + 1);
+      ALOWORD(v14) = __ROR__(*(uint16_t *)((char *)pal + ((v16 >> 15) & 0x1FE)), 1);
+      v14 <<= 16;
+      ALOWORD(v14) = __ROR__(*(uint16_t *)((char *)pal + ((v16 >> 23) & 0x1FE)), 1);
+      *v15 = v14;
+      ++v15;
+      ALOWORD(v14) = __ROR__(*(uint16_t *)((char *)pal + (2 * (uint16_t)v16 & 0x1FE)), 1);
+      v14 <<= 16;
+      ALOWORD(v14) = __ROR__(*(uint16_t *)((char *)pal + ((v16 >> 7) & 0x1FE)), 1);
+      *v15 = v14;
+      v8 = v15 + 1;
+      v10 = v11 - 1;
+    }
+    while ( v11 != 1 ); // equivalent to the asm "dec ecx / jnz x_loop"
+    if ( v25 == 1 ) // that was the last row
+      break;
+    v26 = v25 - 1;
+    v17 = (uint32_t *)&src[(line + (uintptr_t)v7 - (uintptr_t)src) & 0x7FF]; // advance by `line` bytes; the offset from src is masked to 0..0x7FF, so reads wrap inside a 2048-byte window
+    v18 = (uint32_t *)((char *)v8 + ext); // skip `ext` bytes in the destination
+    v19 = wid_64;
+    do // second row: the two dwords of each 8-byte group are read in swapped order (presumably N64 TMEM odd-line interleave - TODO confirm)
+    {
+      v20 = v19;
+      v21 = bswap32(v17[1]); // second dword first
+      ALOWORD(v19) = __ROR__(*(uint16_t *)((char *)pal + ((v21 >> 15) & 0x1FE)), 1);
+      v22 = v19 << 16;
+      ALOWORD(v22) = __ROR__(*(uint16_t *)((char *)pal + ((v21 >> 23) & 0x1FE)), 1);
+      *v18 = v22;
+      v23 = v18 + 1;
+      ALOWORD(v22) = __ROR__(*(uint16_t *)((char *)pal + (2 * (uint16_t)v21 & 0x1FE)), 1);
+      v22 <<= 16;
+      ALOWORD(v22) = __ROR__(*(uint16_t *)((char *)pal + ((v21 >> 7) & 0x1FE)), 1);
+      *v23 = v22;
+      ++v23;
+      v24 = bswap32(*v17); // then the first dword
+      v17 = (uint32_t *)&src[((uintptr_t)v17 + 8 - (uintptr_t)src) & 0x7FF]; // step 8 bytes with the same 0x7FF wrap
+      ALOWORD(v22) = __ROR__(*(uint16_t *)((char *)pal + ((v24 >> 15) & 0x1FE)), 1);
+      v22 <<= 16;
+      ALOWORD(v22) = __ROR__(*(uint16_t *)((char *)pal + ((v24 >> 23) & 0x1FE)), 1);
+      *v23 = v22;
+      ++v23;
+      ALOWORD(v22) = __ROR__(*(uint16_t *)((char *)pal + (2 * (uint16_t)v24 & 0x1FE)), 1);
+      v22 <<= 16;
+      ALOWORD(v22) = __ROR__(*(uint16_t *)((char *)pal + ((v24 >> 7) & 0x1FE)), 1);
+      *v23 = v22;
+      v18 = v23 + 1;
+      v19 = v20 - 1;
+    }
+    while ( v20 != 1 );
+    v7 = &src[(line + (uintptr_t)v17 - (uintptr_t)src) & 0x7FF]; // move to the next in-order row, again wrapped with 0x7FF
+    v8 = (uint32_t *)((char *)v18 + ext);
+    v9 = v26 - 1;
+  }
+  while ( v26 != 1 ); // stop once the row just finished was the last one
 }
 
-extern "C" void  __declspec(naked) asmLoad8bIA8 (wxUIntPtr src, wxUIntPtr dst, int wid_64, int height, int line, int ext, wxUIntPtr pal)
+static inline void load8bIA8(uint8_t *src, uint8_t *dst, int wid_64, int height, int line, int ext, uint16_t *pal) // C replacement for asmLoad8bIA8: every source byte selects a 16-bit entry of pal, which is stored rotated by 8 bits (its two bytes swapped)
 {
-	_asm {
-		push ebp
-		mov ebp, esp
-        push ebx
-        push esi
-        push edi
+  uint32_t *v7; // source read position for the in-order row
+  uint32_t *v8; // destination write position
+  int v9; // rows still to process at the top of the outer loop
+  int v10; // inner loop counter; its low 16 bits double as scratch, like cx in the removed asm
+  int v11; // saved copy of the inner counter
+  uint32_t v12;
+  uint32_t *v13;
+  uint32_t v14; // output dword being assembled from two palette entries
+  uint32_t *v15;
+  uint32_t v16;
+  uint32_t *v17; // source read position for the dword-swapped row
+  uint32_t *v18; // destination write position for the dword-swapped row
+  int v19;
+  int v20;
+  uint32_t v21;
+  uint32_t v22;
+  uint32_t *v23;
+  uint32_t v24;
+  int v25; // row count saved across the inner loop
+  int v26; // row count after the in-order row has been consumed
 
-        mov ebx,[pal]
-        mov esi,[src]
-        mov edi,[dst]
-        mov ecx,[height]
-y_loop:
-        push ecx
-        mov ecx,[wid_64]
-x_loop:
-        push ecx
-
-        mov eax,[esi]           // read all 4 pixels
-        bswap eax
-        add esi,4
-        mov edx,eax
-
-        // 1st dword output {
-        shr eax,15
-        and eax,0x1FE
-        mov cx,[ebx+eax]
-        ror cx,8
-        shl ecx,16
-
-        mov eax,edx
-        shr eax,23
-        and eax,0x1FE
-        mov cx,[ebx+eax]
-        ror cx,8
-
-        mov [edi],ecx
-        add edi,4
-        // }
-
-        // 2nd dword output {
-        mov eax,edx
-        shl eax,1
-        and eax,0x1FE
-        mov cx,[ebx+eax]
-        ror cx,8
-        shl ecx,16
-
-        shr edx,7
-        and edx,0x1FE
-        mov cx,[ebx+edx]
-        ror cx,8
-
-        mov [edi],ecx
-        add edi,4
-        // }
-
-        // * copy
-        mov eax,[esi]           // read all 4 pixels
-        bswap eax
-        add esi,4
-        mov edx,eax
-
-        // 1st dword output {
-        shr eax,15
-        and eax,0x1FE
-        mov cx,[ebx+eax]
-        ror cx,8
-        shl ecx,16
-
-        mov eax,edx
-        shr eax,23
-        and eax,0x1FE
-        mov cx,[ebx+eax]
-        ror cx,8
-
-        mov [edi],ecx
-        add edi,4
-        // }
-
-        // 2nd dword output {
-        mov eax,edx
-        shl eax,1
-        and eax,0x1FE
-        mov cx,[ebx+eax]
-        ror cx,8
-        shl ecx,16
-
-        shr edx,7
-        and edx,0x1FE
-        mov cx,[ebx+edx]
-        ror cx,8
-
-        mov [edi],ecx
-        add edi,4
-        // }
-        // *
-
-        pop ecx
-
-        dec ecx
-        jnz x_loop
-
-        pop ecx
-        dec ecx
-        jz near end_y_loop
-        push ecx
-
-        add esi,[line]
-        add edi,[ext]
-
-        mov ecx,[wid_64]
-x_loop_2:
-        push ecx
-
-        mov eax,[esi+4]         // read all 4 pixels
-        bswap eax
-        mov edx,eax
-
-        // 1st dword output {
-        shr eax,15
-        and eax,0x1FE
-        mov cx,[ebx+eax]
-        ror cx,8
-        shl ecx,16
-
-        mov eax,edx
-        shr eax,23
-        and eax,0x1FE
-        mov cx,[ebx+eax]
-        ror cx,8
-
-        mov [edi],ecx
-        add edi,4
-        // }
-
-        // 2nd dword output {
-        mov eax,edx
-        shl eax,1
-        and eax,0x1FE
-        mov cx,[ebx+eax]
-        ror cx,8
-        shl ecx,16
-
-        shr edx,7
-        and edx,0x1FE
-        mov cx,[ebx+edx]
-        ror cx,8
-
-        mov [edi],ecx
-        add edi,4
-        // }
-
-        // * copy
-        mov eax,[esi]           // read all 4 pixels
-        bswap eax
-        add esi,8
-        mov edx,eax
-
-        // 1st dword output {
-        shr eax,15
-        and eax,0x1FE
-        mov cx,[ebx+eax]
-        ror cx,8
-        shl ecx,16
-
-        mov eax,edx
-        shr eax,23
-        and eax,0x1FE
-        mov cx,[ebx+eax]
-        ror cx,8
-
-        mov [edi],ecx
-        add edi,4
-        // }
-
-        // 2nd dword output {
-        mov eax,edx
-        shl eax,1
-        and eax,0x1FE
-        mov cx,[ebx+eax]
-        ror cx,8
-        shl ecx,16
-
-        shr edx,7
-        and edx,0x1FE
-        mov cx,[ebx+edx]
-        ror cx,8
-
-        mov [edi],ecx
-        add edi,4
-        // }
-        // *
-
-        pop ecx
-
-        dec ecx
-        jnz x_loop_2
-
-        add esi,[line]
-        add edi,[ext]
-
-        pop ecx
-        dec ecx
-        jnz y_loop
-
-end_y_loop:
-        pop edi
-        pop esi
-        pop ebx
-		mov esp, ebp
-		pop ebp
-		ret
-	}
+  v7 = (uint32_t *)src;
+  v8 = (uint32_t *)dst;
+  v9 = height; // needs height >= 1 and wid_64 >= 1: both loops are do/while; Load8bCI clamps them before calling
+  do // one pass = one row read in order, then (if any rows remain) one row with the dwords of each 8-byte group swapped
+  {
+    v25 = v9;
+    v10 = wid_64;
+    do // wid_64 iterations, each reading 8 source bytes and writing 4 dwords (8 palette entries)
+    {
+      v11 = v10; // preserve the counter; v10's low half is overwritten below
+      v12 = bswap32(*v7); // after the swap the first byte in memory sits in bits 31..24
+      v13 = v7 + 1;
+      ALOWORD(v10) = __ROR__(*(uint16_t *)((char *)pal + ((v12 >> 15) & 0x1FE)), 8); // byte offset 2*index for the index in bits 23..16; mirrors removed asm "mov cx,[ebx+eax]" / "ror cx,8"
+      v14 = v10 << 16; // mirrors "shl ecx,16": the entry just fetched moves to the upper half
+      ALOWORD(v14) = __ROR__(*(uint16_t *)((char *)pal + ((v12 >> 23) & 0x1FE)), 8); // index in bits 31..24 fills the lower half
+      *v8 = v14;
+      v15 = v8 + 1;
+      ALOWORD(v14) = __ROR__(*(uint16_t *)((char *)pal + (2 * (uint16_t)v12 & 0x1FE)), 8); // index in bits 7..0
+      v14 <<= 16;
+      ALOWORD(v14) = __ROR__(*(uint16_t *)((char *)pal + ((v12 >> 7) & 0x1FE)), 8); // index in bits 15..8
+      *v15 = v14;
+      ++v15;
+      v16 = bswap32(*v13); // second dword of the 8-byte group, same lookup pattern as above
+      v7 = v13 + 1;
+      ALOWORD(v14) = __ROR__(*(uint16_t *)((char *)pal + ((v16 >> 15) & 0x1FE)), 8);
+      v14 <<= 16;
+      ALOWORD(v14) = __ROR__(*(uint16_t *)((char *)pal + ((v16 >> 23) & 0x1FE)), 8);
+      *v15 = v14;
+      ++v15;
+      ALOWORD(v14) = __ROR__(*(uint16_t *)((char *)pal + (2 * (uint16_t)v16 & 0x1FE)), 8);
+      v14 <<= 16;
+      ALOWORD(v14) = __ROR__(*(uint16_t *)((char *)pal + ((v16 >> 7) & 0x1FE)), 8);
+      *v15 = v14;
+      v8 = v15 + 1;
+      v10 = v11 - 1;
+    }
+    while ( v11 != 1 ); // equivalent to the asm "dec ecx / jnz x_loop"
+    if ( v25 == 1 ) // that was the last row
+      break;
+    v26 = v25 - 1;
+    v17 = (uint32_t *)((char *)v7 + line); // plain byte advance; unlike load8bCI there is no 0x7FF wrap here
+    v18 = (uint32_t *)((char *)v8 + ext); // skip `ext` bytes in the destination
+    v19 = wid_64;
+    do // second row: the two dwords of each 8-byte group are read in swapped order (presumably N64 TMEM odd-line interleave - TODO confirm)
+    {
+      v20 = v19;
+      v21 = bswap32(v17[1]); // second dword first
+      ALOWORD(v19) = __ROR__(*(uint16_t *)((char *)pal + ((v21 >> 15) & 0x1FE)), 8);
+      v22 = v19 << 16;
+      ALOWORD(v22) = __ROR__(*(uint16_t *)((char *)pal + ((v21 >> 23) & 0x1FE)), 8);
+      *v18 = v22;
+      v23 = v18 + 1;
+      ALOWORD(v22) = __ROR__(*(uint16_t *)((char *)pal + (2 * (uint16_t)v21 & 0x1FE)), 8);
+      v22 <<= 16;
+      ALOWORD(v22) = __ROR__(*(uint16_t *)((char *)pal + ((v21 >> 7) & 0x1FE)), 8);
+      *v23 = v22;
+      ++v23;
+      v24 = bswap32(*v17); // then the first dword
+      v17 += 2; // 8 bytes consumed
+      ALOWORD(v22) = __ROR__(*(uint16_t *)((char *)pal + ((v24 >> 15) & 0x1FE)), 8);
+      v22 <<= 16;
+      ALOWORD(v22) = __ROR__(*(uint16_t *)((char *)pal + ((v24 >> 23) & 0x1FE)), 8);
+      *v23 = v22;
+      ++v23;
+      ALOWORD(v22) = __ROR__(*(uint16_t *)((char *)pal + (2 * (uint16_t)v24 & 0x1FE)), 8);
+      v22 <<= 16;
+      ALOWORD(v22) = __ROR__(*(uint16_t *)((char *)pal + ((v24 >> 7) & 0x1FE)), 8);
+      *v23 = v22;
+      v18 = v23 + 1;
+      v19 = v20 - 1;
+    }
+    while ( v20 != 1 );
+    v7 = (uint32_t *)((char *)v17 + line);
+    v8 = (uint32_t *)((char *)v18 + ext);
+    v9 = v26 - 1;
+  }
+  while ( v26 != 1 ); // stop once the row just finished was the last one
 }
 
-extern "C" void  __declspec(naked) asmLoad8bIA4 (wxUIntPtr src, wxUIntPtr dst, int wid_64, int height, int line, int ext)
+static inline void load8bIA4(uint8_t *src, uint8_t *dst, int wid_64, int height, int line, int ext) // C replacement for asmLoad8bIA4: copies the texture while swapping the high and low nibble of every byte
 {
-	_asm {
-		push ebp
-		mov ebp, esp
-        push ebx
-        push esi
-        push edi
+  uint32_t *v6; // source read position for the in-order row
+  uint32_t *v7; // destination write position
+  int v8; // rows still to process at the top of the outer loop
+  int v9; // inner loop counter for the in-order row
+  uint32_t v10;
+  uint32_t v11;
+  uint32_t *v12;
+  uint32_t *v13;
+  uint32_t v14;
+  uint32_t v15;
+  uint32_t *v16; // source read position for the dword-swapped row
+  uint32_t *v17; // destination write position for the dword-swapped row
+  int v18; // inner loop counter for the dword-swapped row
+  uint32_t *v19;
+  uint32_t v20;
+  int v21; // row count saved across the inner loop
+  int v22; // row count after the in-order row has been consumed
 
-        mov esi,[src]
-        mov edi,[dst]
-        mov ecx,[height]
-y_loop:
-        push ecx
-        mov ecx,[wid_64]
-x_loop:
-        mov eax,[esi] // read all 4 pixels
-        mov edx,eax
-
-        shr eax,4     //all alpha
-        shl edx,4
-        and eax,0x0F0F0F0F
-        and edx,0xF0F0F0F0
-        add esi,4
-        or eax,edx
-
-        mov [edi],eax // save dword
-        add edi,4
-
-        mov eax,[esi] // read all 4 pixels
-        mov edx,eax
-
-        shr eax,4     //all alpha
-        shl edx,4
-        and eax,0x0F0F0F0F
-        and edx,0xF0F0F0F0
-        add esi,4
-        or eax,edx
-
-        mov [edi],eax // save dword
-        add edi,4
-        // *
-
-        dec ecx
-        jnz x_loop
-
-        pop ecx
-        dec ecx
-        jz end_y_loop
-        push ecx
-
-        add esi,[line]
-        add edi,[ext]
-
-        mov ecx,[wid_64]
-x_loop_2:
-        mov eax,[esi+4] // read both pixels
-        mov edx,eax
-
-        shr eax,4       //all alpha
-        shl edx,4
-        and eax,0x0F0F0F0F
-        and edx,0xF0F0F0F0
-        or eax,edx
-
-        mov [edi],eax //save dword
-        add edi,4
-
-        mov eax,[esi] // read both pixels
-        add esi,8
-        mov edx,eax
-
-        shr eax,4     //all alpha
-        shl edx,4
-        and eax,0x0F0F0F0F
-        and edx,0xF0F0F0F0
-        or eax,edx
-
-        mov [edi],eax //save dword
-        add edi,4
-        // *
-
-        dec ecx
-        jnz x_loop_2
-
-        add esi,[line]
-        add edi,[ext]
-
-        pop ecx
-        dec ecx
-        jnz y_loop
-
-end_y_loop:
-        pop edi
-        pop esi
-        pop ebx
-		mov esp, ebp
-		pop ebp
-		ret
-	}
+  v6 = (uint32_t *)src;
+  v7 = (uint32_t *)dst;
+  v8 = height; // needs height >= 1 and wid_64 >= 1: both loops are do/while; Load8bIA clamps them before calling
+  do // one pass = one row read in order, then (if any rows remain) one row with the dwords of each 8-byte group swapped
+  {
+    v21 = v8;
+    v9 = wid_64;
+    do // wid_64 iterations, each converting 8 bytes (two dwords); no byte swap is applied in this loader
+    {
+      v10 = *v6;
+      v11 = (*v6 >> 4) & 0xF0F0F0F; // high nibble of each byte moved down
+      v12 = v6 + 1;
+      *v7 = (16 * v10 & 0xF0F0F0F0) | v11; // low nibble of each byte moved up, then merged: nibbles swapped per byte
+      v13 = v7 + 1;
+      v14 = (*v12 >> 4) & 0xF0F0F0F;
+      v15 = 16 * *v12 & 0xF0F0F0F0;
+      v6 = v12 + 1;
+      *v13 = v15 | v14;
+      v7 = v13 + 1;
+      --v9;
+    }
+    while ( v9 );
+    if ( v21 == 1 ) // that was the last row
+      break;
+    v22 = v21 - 1;
+    v16 = (uint32_t *)((char *)v6 + line); // skip `line` bytes in the source
+    v17 = (uint32_t *)((char *)v7 + ext); // skip `ext` bytes in the destination
+    v18 = wid_64;
+    do // second row: the two dwords of each 8-byte group are read in swapped order (presumably N64 TMEM odd-line interleave - TODO confirm)
+    {
+      *v17 = (16 * v16[1] & 0xF0F0F0F0) | ((v16[1] >> 4) & 0xF0F0F0F); // second dword first
+      v19 = v17 + 1;
+      v20 = *v16; // then the first dword
+      v16 += 2;
+      *v19 = (16 * v20 & 0xF0F0F0F0) | ((v20 >> 4) & 0xF0F0F0F);
+      v17 = v19 + 1;
+      --v18;
+    }
+    while ( v18 );
+    v6 = (uint32_t *)((char *)v16 + line);
+    v7 = (uint32_t *)((char *)v17 + ext);
+    v8 = v22 - 1;
+  }
+  while ( v22 != 1 ); // stop once the row just finished was the last one
 }
 
-extern "C" void  __declspec(naked) asmLoad8bI (wxUIntPtr src, int dst, wxUIntPtr wid_64, int height, int line, int ext)
+static inline void load8bI(uint8_t *src, uint8_t *dst, int wid_64, int height, int line, int ext) // C replacement for asmLoad8bI: copies the texture data unmodified, 8 bytes per inner iteration
 {
-	_asm {
-		push ebp
-		mov ebp, esp
-        push ebx
-        push esi
-        push edi
+  uint32_t *v6; // source read position for the in-order row
+  uint32_t *v7; // destination write position
+  int v8; // rows still to process at the top of the outer loop
+  int v9; // inner loop counter for the in-order row
+  uint32_t v10;
+  uint32_t *v11;
+  uint32_t *v12;
+  uint32_t v13;
+  uint32_t *v14; // source read position for the dword-swapped row
+  uint32_t *v15; // destination write position for the dword-swapped row
+  int v16; // inner loop counter for the dword-swapped row
+  uint32_t *v17;
+  uint32_t v18;
+  int v19; // row count saved across the inner loop
+  int v20; // row count after the in-order row has been consumed
 
-        mov esi,[src]
-        mov edi,[dst]
-        mov ecx,[height]
-y_loop:
-        push ecx
-        mov ecx,[wid_64]
-x_loop:
-        mov eax,[esi] // read all 4 pixels
-        add esi,4
-
-        mov [edi],eax // save dword
-        add edi,4
-
-        mov eax,[esi] // read all 4 pixels
-        add esi,4
-
-        mov [edi],eax // save dword
-        add edi,4
-        // *
-
-        dec ecx
-        jnz x_loop
-
-        pop ecx
-        dec ecx
-        jz end_y_loop
-        push ecx
-
-        add esi,[line]
-        add edi,[ext]
-
-        mov ecx,[wid_64]
-x_loop_2:
-        mov eax,[esi+4] // read both pixels
-
-        mov [edi],eax //save dword
-        add edi,4
-
-        mov eax,[esi] // read both pixels
-        add esi,8
-
-        mov [edi],eax //save dword
-        add edi,4
-        // *
-
-        dec ecx
-        jnz x_loop_2
-
-        add esi,[line]
-        add edi,[ext]
-
-        pop ecx
-        dec ecx
-        jnz y_loop
-
-end_y_loop:
-        pop edi
-        pop esi
-        pop ebx
-		mov esp, ebp
-		pop ebp
-		ret
-	}
+  v6 = (uint32_t *)src;
+  v7 = (uint32_t *)dst;
+  v8 = height; // needs height >= 1 and wid_64 >= 1: both loops are do/while; Load8bCI and Load8bI clamp them before calling
+  do // one pass = one row copied in order, then (if any rows remain) one row with the dwords of each 8-byte group swapped
+  {
+    v19 = v8;
+    v9 = wid_64;
+    do // wid_64 iterations, each copying two dwords unchanged
+    {
+      v10 = *v6;
+      v11 = v6 + 1;
+      *v7 = v10;
+      v12 = v7 + 1;
+      v13 = *v11;
+      v6 = v11 + 1;
+      *v12 = v13;
+      v7 = v12 + 1;
+      --v9;
+    }
+    while ( v9 );
+    if ( v19 == 1 ) // that was the last row
+      break;
+    v20 = v19 - 1;
+    v14 = (uint32_t *)((char *)v6 + line); // skip `line` bytes in the source
+    v15 = (uint32_t *)((char *)v7 + ext); // skip `ext` bytes in the destination
+    v16 = wid_64;
+    do // second row: the two dwords of each 8-byte group are copied in swapped order (presumably N64 TMEM odd-line interleave - TODO confirm)
+    {
+      *v15 = v14[1]; // second dword first
+      v17 = v15 + 1;
+      v18 = *v14; // then the first dword
+      v14 += 2;
+      *v17 = v18;
+      v15 = v17 + 1;
+      --v16;
+    }
+    while ( v16 );
+    v6 = (uint32_t *)((char *)v14 + line);
+    v7 = (uint32_t *)((char *)v15 + ext);
+    v8 = v20 - 1;
+  }
+  while ( v20 != 1 ); // stop once the row just finished was the last one
 }
 
 //****************************************************************
@@ -671,21 +378,21 @@ wxUint32 Load8bCI (wxUIntPtr dst, wxUIntPtr src, int wid_64, int height, int lin
   if (wid_64 < 1) wid_64 = 1;
   if (height < 1) height = 1;
   int ext = (real_width - (wid_64 << 3));
-  wxUIntPtr pal = wxPtrToUInt(rdp.pal_8);
+  unsigned short * pal = rdp.pal_8;
 
   switch (rdp.tlut_mode) {
     case 0: //palette is not used
-      //in tlut DISABLE mode load CI texture as plain intensity texture instead of palette dereference. 
+      //in tlut DISABLE mode load CI texture as plain intensity texture instead of palette dereference.
       //Thanks to angrylion for the advice
-      asmLoad8bI (src, dst, wid_64, height, line, ext);	
+      load8bI ((uint8_t *)src, (uint8_t *)dst, wid_64, height, line, ext);
       return /*(0 << 16) | */GR_TEXFMT_ALPHA_8;
     case 2: //color palette
       ext <<= 1;
-      asmLoad8bCI (src, dst, wid_64, height, line, ext, pal);
+      load8bCI ((uint8_t *)src, (uint8_t *)dst, wid_64, height, line, ext, pal);
       return (1 << 16) | GR_TEXFMT_ARGB_1555;
     default: //IA palette
       ext <<= 1;
-      asmLoad8bIA8 (src, dst, wid_64, height, line, ext, pal);
+      load8bIA8 ((uint8_t *)src, (uint8_t *)dst, wid_64, height, line, ext, pal);
       return (1 << 16) | GR_TEXFMT_ALPHA_INTENSITY_88;
   }
 }
@@ -694,30 +401,30 @@ wxUint32 Load8bCI (wxUIntPtr dst, wxUIntPtr src, int wid_64, int height, int lin
 // Size: 1, Format: 3
 //
 
-wxUint32 Load8bIA (wxUIntPtr dst, wxUIntPtr src, int wid_64, int height, int line, int real_width, int tile)  
-{ 
+wxUint32 Load8bIA (wxUIntPtr dst, wxUIntPtr src, int wid_64, int height, int line, int real_width, int tile) // loads an 8-bit IA texture via load8bIA4; falls back to Load8bCI when rdp.tlut_mode is non-zero
+{
   if (rdp.tlut_mode != 0)
     return Load8bCI (dst, src, wid_64, height, line, real_width, tile);
 
-  if (wid_64 < 1) wid_64 = 1;  
-  if (height < 1) height = 1;  
-  int ext = (real_width - (wid_64 << 3));  
-  asmLoad8bIA4 (src, dst, wid_64, height, line, ext);
-  return /*(0 << 16) | */GR_TEXFMT_ALPHA_INTENSITY_44;  
+  if (wid_64 < 1) wid_64 = 1; // load8bIA4 uses do/while loops, so it needs at least one 8-byte group per row
+  if (height < 1) height = 1; // and at least one row
+  int ext = (real_width - (wid_64 << 3)); // load8bIA4 writes wid_64*8 bytes per row and then adds ext bytes to the destination pointer
+  load8bIA4 ((uint8_t *)src, (uint8_t *)dst, wid_64, height, line, ext); // note the argument order: src first, although this wrapper takes dst first
+  return /*(0 << 16) | */GR_TEXFMT_ALPHA_INTENSITY_44;
 } 
 
 //****************************************************************
 // Size: 1, Format: 4
 //
 
-wxUint32 Load8bI (wxUIntPtr dst, wxUIntPtr src, int wid_64, int height, int line, int real_width, int tile)  
-{ 
+wxUint32 Load8bI (wxUIntPtr dst, wxUIntPtr src, int wid_64, int height, int line, int real_width, int tile) // loads an 8-bit I texture via load8bI; falls back to Load8bCI when rdp.tlut_mode is non-zero
+{
   if (rdp.tlut_mode != 0)
     return Load8bCI (dst, src, wid_64, height, line, real_width, tile);
 
-  if (wid_64 < 1) wid_64 = 1;  
-  if (height < 1) height = 1;  
-  int ext = (real_width - (wid_64 << 3));  
-  asmLoad8bI (src, dst, wid_64, height, line, ext);	
-  return /*(0 << 16) | */GR_TEXFMT_ALPHA_8;  
+  if (wid_64 < 1) wid_64 = 1; // load8bI uses do/while loops, so it needs at least one 8-byte group per row
+  if (height < 1) height = 1; // and at least one row
+  int ext = (real_width - (wid_64 << 3)); // load8bI writes wid_64*8 bytes per row and then adds ext bytes to the destination pointer
+  load8bI ((uint8_t *)src, (uint8_t *)dst, wid_64, height, line, ext); // note the argument order: src first, although this wrapper takes dst first
+  return /*(0 << 16) | */GR_TEXFMT_ALPHA_8;
 }