[Glide64] Move the code out of texture.asm.cpp

2015-10-13 15:54:52 +11:00 · 2015-10-13 15:54:52 +11:00 · 48554d2ad0
parent d7a19c265f
commit 48554d2ad0
10 changed files with 3538 additions and 3886 deletions
--- a/Source/Glide64/Glide64.vcproj
+++ b/Source/Glide64/Glide64.vcproj
@ -270,10 +270,6 @@
 				RelativePath="TexModCI.h"
 				>
 			</File>
-			<File
-				RelativePath=".\Texture.asm.cpp"
-				>
-			</File>
 		</Filter>
 		<Filter
 			Name="Config"
--- a/Source/Glide64/MiClWr32b.h
+++ b/Source/Glide64/MiClWr32b.h
@ -41,9 +41,156 @@
 //
 //****************************************************************

-extern "C" void asmMirror32bS (int tex, int start, int width, int height, int mask, int line, int full, int count);
-extern "C" void asmWrap32bS (int tex, int start, int height, int mask, int line, int full, int count);
-extern "C" void asmClamp32bS (int tex, int constant, int height,int line, int full, int count);
+extern "C" void  __declspec(naked) asmMirror32bS (int tex, int start, int width, int height, int mask, int line, int full, int count)
+{
+	_asm {
+		align 4
+		push ebp
+		mov ebp, esp
+        push ebx
+        push esi
+        push edi
+
+        mov edi,[start]
+        mov ecx,[height]
+loop_y:
+
+        xor edx,edx
+loop_x:
+        mov esi,[tex]
+        mov ebx,[width]
+        add ebx,edx
+        and ebx,[width]
+        jnz is_mirrored
+
+        mov eax,edx
+        shl eax,2
+        and eax,[mask]
+        add esi,eax
+        mov eax,[esi]
+        mov [edi],eax
+        add edi,4
+        jmp end_mirror_check
+is_mirrored:
+        add esi,[mask]
+        mov eax,edx
+        shl eax,2
+        and eax,[mask]
+        sub esi,eax
+        mov eax,[esi]
+        mov [edi],eax
+        add edi,4
+end_mirror_check:
+
+        inc edx
+        cmp edx,[count]
+        jne loop_x
+
+        add edi,[line]
+        mov eax,[tex]
+        add eax,[full]
+        mov [tex],eax
+
+        dec ecx
+        jnz loop_y
+
+        pop edi
+        pop esi
+        pop ebx
+		mov esp, ebp
+		pop ebp
+		ret
+	}
+}
+
+extern "C" void  __declspec(naked) asmWrap32bS (int tex, int start, int height, int mask, int line, int full, int count)
+{
+	_asm {
+		align 4
+		push ebp
+		mov ebp, esp
+        push ebx
+        push esi
+        push edi
+
+        mov edi,[start]
+        mov ecx,[height]
+loop_y:
+
+        xor edx,edx
+loop_x:
+
+        mov esi,[tex]
+        mov eax,edx
+        and eax,[mask]
+        shl eax,2
+        add esi,eax
+        mov eax,[esi]
+        mov [edi],eax
+        add edi,4
+
+        inc edx
+        cmp edx,[count]
+        jne loop_x
+
+        add edi,[line]
+        mov eax,[tex]
+        add eax,[full]
+        mov [tex],eax
+
+        dec ecx
+        jnz loop_y
+
+        pop edi
+        pop esi
+        pop ebx
+		mov esp, ebp
+		pop ebp
+		ret
+	}
+}
+
+extern "C" void  __declspec(naked) asmClamp32bS (int tex, int constant, int height,int line, int full, int count)
+{
+	_asm {
+		align 4
+		push ebp
+		mov ebp, esp
+        push ebx
+        push esi
+        push edi
+
+        mov esi,[constant]
+        mov edi,[tex]
+
+        mov ecx,[height]
+y_loop:
+
+        mov eax,[esi]
+
+        mov edx,[count]
+x_loop:
+
+        mov [edi],eax           // don't unroll or make dword, it may go into next line (doesn't have to be multiple of two)
+        add edi,4
+
+        dec edx
+        jnz x_loop
+
+        add esi,[full]
+        add edi,[line]
+
+        dec ecx
+        jnz y_loop
+
+        pop edi
+        pop esi
+        pop ebx
+		mov esp, ebp
+		pop ebp
+		ret
+	}
+}

 //****************************************************************
 // 32-bit Horizontal Mirror
--- a/Source/Glide64/MiClWr8b.h
+++ b/Source/Glide64/MiClWr8b.h
@ -40,9 +40,155 @@
 //****************************************************************
 // 8-bit Horizontal Mirror

-extern "C" void asmMirror8bS (int tex, int start, int width, int height, int mask, int line, int full, int count);
-extern "C" void asmWrap8bS (int tex, int start, int height, int mask, int line, int full, int count);
-extern "C" void asmClamp8bS (int tex, int constant, int height,int line, int full, int count);
+extern "C" void  __declspec(naked) asmMirror8bS (int tex, int start, int width, int height, int mask, int line, int full, int count)
+{
+	_asm{
+		ALIGN 4
+
+		push ebp
+		mov ebp, esp
+		push ebx
+        push esi
+        push edi
+
+        mov edi,[start]
+        mov ecx,[height]
+loop_y:
+
+        xor edx,edx
+loop_x:
+        mov esi,[tex]
+        mov ebx,[width]
+        add ebx,edx
+        and ebx,[width]
+        jnz is_mirrored
+
+        mov eax,edx
+        and eax,[mask]
+        add esi,eax
+        mov al,[esi]
+        mov [edi],al
+        inc edi
+        jmp end_mirror_check
+is_mirrored:
+        add esi,[mask]
+        mov eax,edx
+        and eax,[mask]
+        sub esi,eax
+        mov al,[esi]
+        mov [edi],al
+        inc edi
+end_mirror_check:
+
+        inc edx
+        cmp edx,[count]
+        jne loop_x
+
+        add edi,[line]
+        mov eax,[tex]
+        add eax,[full]
+        mov [tex],eax
+
+        dec ecx
+        jnz loop_y
+
+        pop edi
+        pop esi
+        pop ebx
+		mov esp, ebp
+		pop ebp
+		ret
+	}
+}
+
+extern "C" void  __declspec(naked) asmWrap8bS (int tex, int start, int height, int mask, int line, int full, int count)
+{
+	_asm {
+		align 4
+		push ebp
+		mov ebp, esp
+        push ebx
+        push esi
+        push edi
+
+        mov edi,[start]
+        mov ecx,[height]
+loop_y:
+
+        xor edx,edx
+loop_x:
+
+        mov esi,[tex]
+        mov eax,edx
+        and eax,[mask]
+        shl eax,2
+        add esi,eax
+        mov eax,[esi]
+        mov [edi],eax
+        add edi,4
+
+        inc edx
+        cmp edx,[count]
+        jne loop_x
+
+        add edi,[line]
+        mov eax,[tex]
+        add eax,[full]
+        mov [tex],eax
+
+        dec ecx
+        jnz loop_y
+
+        pop edi
+        pop esi
+        pop ebx
+		mov esp, ebp
+		pop ebp
+		ret
+	}
+}
+
+extern "C" void  __declspec(naked) asmClamp8bS (int tex, int constant, int height,int line, int full, int count)
+{
+	_asm {
+		align 4
+		push ebp
+		mov ebp, esp
+        push ebx
+        push esi
+        push edi
+
+        mov esi,[constant]
+        mov edi,[tex]
+
+        mov ecx,[height]
+y_loop:
+
+        mov al,[esi]
+
+        mov edx,[count]
+x_loop:
+
+        mov [edi],al            // don't unroll or make dword, it may go into next line (doesn't have to be multiple of two)
+        inc edi
+
+        dec edx
+        jnz x_loop
+
+        add esi,[full]
+        add edi,[line]
+
+        dec ecx
+        jnz y_loop
+
+        pop edi
+        pop esi
+        pop ebx
+		mov esp, ebp
+		pop ebp
+		ret
+	}
+}

 void Mirror8bS (wxUint32 tex, wxUint32 mask, wxUint32 max_width, wxUint32 real_width, wxUint32 height)
 {
--- a/Source/Glide64/TexCache.cpp
+++ b/Source/Glide64/TexCache.cpp
@ -150,7 +150,55 @@ void ClearCache ()
 //****************************************************************
 // GetTexInfo - gets information for either t0 or t1, checks if in cache & fills tex_found

-extern "C" int asmTextureCRC(int addr, int width, int height, int line);
+extern "C" __declspec(naked) int asmTextureCRC(int addr, int width, int height, int line)
+{
+	_asm {
+		align 4
+		push ebp
+		mov ebp, esp
+
+        push ebx
+        push edi
+
+        xor eax,eax                             // eax is final result
+        mov ebx,[line]
+        mov ecx,[height]                // ecx is height counter
+        mov edi,[addr]                  // edi is ptr to texture memory
+crc_loop_y:
+        push ecx
+
+        mov ecx,[width]
+crc_loop_x:
+
+        add eax,[edi]           // MUST be 64-bit aligned, so manually unroll
+        add eax,[edi+4]
+        mov edx,ecx
+        mul edx
+        add eax,edx
+        add edi,8
+
+        dec ecx
+        jnz crc_loop_x
+
+        pop ecx
+
+        mov edx,ecx
+        mul edx
+        add eax,edx
+
+        add edi,ebx
+
+        dec ecx
+        jnz crc_loop_y
+
+        pop edi
+        pop ebx
+		mov esp, ebp
+		pop ebp
+		ret
+	}
+}
+
 void GetTexInfo (int id, int tile)
 {
  FRDP (" | |-+ GetTexInfo (id: %d, tile: %d)\n", id, tile);
--- a/Source/Glide64/TexConv.h
+++ b/Source/Glide64/TexConv.h
@ -37,10 +37,270 @@
 //
 //****************************************************************

-extern "C" void asmTexConv_ARGB1555_ARGB4444(wxUIntPtr src, wxUIntPtr dst, int size);
-extern "C" void asmTexConv_AI88_ARGB4444(wxUIntPtr src, wxUIntPtr dst, int size);
-extern "C" void asmTexConv_AI44_ARGB4444(wxUIntPtr src, wxUIntPtr dst, int size);
-extern "C" void asmTexConv_A8_ARGB4444(wxUIntPtr src, wxUIntPtr dst, int size);
+extern "C" void  __declspec(naked) asmTexConv_ARGB1555_ARGB4444(wxUIntPtr src, wxUIntPtr dst, int isize)
+{
+	_asm {
+		align 4
+		push ebp
+		mov ebp, esp
+        push ebx
+        push esi
+        push edi
+
+        mov esi,[src]
+        mov edi,[dst]
+        mov ecx,[isize]
+
+tc1_loop:
+        mov eax,[esi]
+        add esi,4
+
+        // arrr rrgg gggb bbbb
+        // aaaa rrrr gggg bbbb
+        mov edx,eax
+        and eax,0x80008000
+        mov ebx,eax                             // ebx = 0xa000000000000000
+        shr eax,1
+        or ebx,eax                              // ebx = 0xaa00000000000000
+        shr eax,1
+        or ebx,eax                              // ebx = 0xaaa0000000000000
+        shr eax,1
+        or ebx,eax                              // ebx = 0xaaaa000000000000
+
+        mov eax,edx
+        and eax,0x78007800              // eax = 0x0rrrr00000000000
+        shr eax,3                               // eax = 0x0000rrrr00000000
+        or ebx,eax                              // ebx = 0xaaaarrrr00000000
+
+        mov eax,edx
+        and eax,0x03c003c0              // eax = 0x000000gggg000000
+        shr eax,2                               // eax = 0x00000000gggg0000
+        or ebx,eax                              // ebx = 0xaaaarrrrgggg0000
+
+        and edx,0x001e001e              // edx = 0x00000000000bbbb0
+        shr edx,1                               // edx = 0x000000000000bbbb
+        or ebx,edx                              // ebx = 0xaaaarrrrggggbbbb
+
+        mov [edi],ebx
+        add edi,4
+
+        dec ecx
+        jnz tc1_loop
+
+        pop edi
+        pop esi
+        pop ebx
+		mov esp, ebp
+		pop ebp
+		ret
+	}
+}
+
+extern "C" void  __declspec(naked) asmTexConv_AI88_ARGB4444(wxUIntPtr src, wxUIntPtr dst, int isize)
+{
+	_asm {
+		align 4
+		push ebp
+		mov ebp, esp
+        push ebx
+        push esi
+        push edi
+
+        mov esi,[src]
+        mov edi,[dst]
+        mov ecx,[isize]
+
+tc1_loop:
+        mov eax,[esi]
+        add esi,4
+
+        // aaaa aaaa iiii iiii
+        // aaaa rrrr gggg bbbb
+        mov edx,eax
+        and eax,0xF000F000              // eax = 0xaaaa000000000000
+        mov ebx,eax                             // ebx = 0xaaaa000000000000
+
+        and edx,0x00F000F0              // edx = 0x00000000iiii0000
+        shl edx,4                               // edx = 0x0000iiii00000000
+        or ebx,edx                              // ebx = 0xaaaaiiii00000000
+        shr edx,4                               // edx = 0x00000000iiii0000
+        or ebx,edx                              // ebx = 0xaaaaiiiiiiii0000
+        shr edx,4                               // edx = 0x000000000000iiii
+        or ebx,edx                              // ebx = 0xaaaaiiiiiiiiiiii
+
+        mov [edi],ebx
+        add edi,4
+
+        dec ecx
+        jnz tc1_loop
+
+        pop edi
+        pop esi
+        pop ebx
+		mov esp, ebp
+		pop ebp
+		ret
+	}
+}
+
+extern "C" void  __declspec(naked) asmTexConv_AI44_ARGB4444(wxUIntPtr src, wxUIntPtr dst, int isize)
+{
+	_asm {
+		align 4
+		push ebp
+		mov ebp, esp
+        push ebx
+        push esi
+        push edi
+
+        mov esi,[src]
+        mov edi,[dst]
+        mov ecx,[isize]
+
+tc1_loop:
+        mov eax,[esi]
+        add esi,4
+
+        // aaaa3 iiii3 aaaa2 iiii2 aaaa1 iiii1 aaaa0 iiii0
+        // aaaa1 rrrr1 gggg1 bbbb1 aaaa0 rrrr0 gggg0 bbbb0
+        // aaaa3 rrrr3 gggg3 bbbb3 aaaa2 rrrr2 gggg2 bbbb2
+        mov edx,eax                             // eax = aaaa3 iiii3 aaaa2 iiii2 aaaa1 iiii1 aaaa0 iiii0
+        shl eax,16                              // eax = aaaa1 iiii1 aaaa0 iiii0 0000  0000  0000  0000
+        and eax,0xFF000000              // eax = aaaa1 iiii1 0000  0000  0000  0000  0000  0000
+        mov ebx,eax                             // ebx = aaaa1 iiii1 0000  0000  0000  0000  0000  0000
+        and eax,0x0F000000              // eax = 0000  iiii1 0000  0000  0000  0000  0000  0000
+        shr eax,4                               // eax = 0000  0000  iiii1 0000  0000  0000  0000  0000
+        or ebx,eax                              // ebx = aaaa1 iiii1 iiii1 0000  0000  0000  0000  0000
+        shr eax,4                               // eax = 0000  0000  0000  iiii1 0000  0000  0000  0000
+        or ebx,eax                              // ebx = aaaa1 iiii1 iiii1 iiii1 0000  0000  0000  0000
+
+        mov eax,edx                             // eax = aaaa3 iiii3 aaaa2 iiii2 aaaa1 iiii1 aaaa0 iiii0
+        shl eax,8                               // eax = aaaa2 iiii2 aaaa1 iiii1 aaaa0 iiii0 0000  0000
+        and eax,0x0000FF00              // eax = 0000  0000  0000  0000  aaaa0 iiii0 0000  0000
+        or ebx,eax                              // ebx = aaaa1 iiii1 iiii1 iiii1 aaaa0 iiii0 0000  0000
+        and eax,0x00000F00              // eax = 0000  0000  0000  0000  0000  iiii0 0000  0000
+        shr eax,4                               // eax = 0000  0000  0000  0000  0000  0000  iiii0 0000
+        or ebx,eax                              // ebx = aaaa1 iiii1 iiii1 iiii1 aaaa0 iiii0 iiii0 0000
+        shr eax,4                               // eax = 0000  0000  0000  0000  0000  0000  0000  iiii0
+        or ebx,eax                              // ebx = aaaa1 iiii1 iiii1 iiii1 aaaa0 iiii0 iiii0 iiii0
+
+        mov [edi],ebx
+        add edi,4
+
+        mov eax,edx                             // eax = aaaa3 iiii3 aaaa2 iiii2 aaaa1 iiii1 aaaa0 iiii0
+        and eax,0xFF000000              // eax = aaaa3 iiii3 0000  0000  0000  0000  0000  0000
+        mov ebx,eax                             // ebx = aaaa3 iiii3 0000  0000  0000  0000  0000  0000
+        and eax,0x0F000000              // eax = 0000  iiii3 0000  0000  0000  0000  0000  0000
+        shr eax,4                               // eax = 0000  0000  iiii3 0000  0000  0000  0000  0000
+        or ebx,eax                              // ebx = aaaa3 iiii3 iiii3 0000  0000  0000  0000  0000
+        shr eax,4                               // eax = 0000  0000  0000  iiii3 0000  0000  0000  0000
+        or ebx,eax                              // ebx = aaaa3 iiii3 iiii3 iiii3 0000  0000  0000  0000
+
+                                                        // edx = aaaa3 iiii3 aaaa2 iiii2 aaaa1 iiii1 aaaa0 iiii0
+        shr edx,8                               // edx = 0000  0000  aaaa3 aaaa3 aaaa2 iiii2 aaaa1 iiii1
+        and edx,0x0000FF00              // edx = 0000  0000  0000  0000  aaaa2 iiii2 0000  0000
+        or ebx,edx                              // ebx = aaaa3 iiii3 iiii3 iiii3 aaaa2 iiii2 0000  0000
+        and edx,0x00000F00              // edx = 0000  0000  0000  0000  0000  iiii2 0000  0000
+        shr edx,4                               // edx = 0000  0000  0000  0000  0000  0000  iiii2 0000
+        or ebx,edx                              // ebx = aaaa3 iiii3 iiii3 iiii3 aaaa2 iiii2 iiii2 0000
+        shr edx,4                               // edx = 0000  0000  0000  0000  0000  0000  0000  iiii2
+        or ebx,edx                              // ebx = aaaa3 iiii3 iiii3 iiii3 aaaa2 iiii2 iiii2 iiii2
+
+        mov [edi],ebx
+        add edi,4
+
+        dec ecx
+        jnz tc1_loop
+
+        pop edi
+        pop esi
+        pop ebx
+		mov esp, ebp
+		pop ebp
+		ret
+	}
+}
+
+extern "C" void  __declspec(naked) asmTexConv_A8_ARGB4444(wxUIntPtr src, wxUIntPtr dst, int isize)
+{
+	_asm {
+		align 4
+		push ebp
+		mov ebp, esp
+        push ebx
+        push esi
+        push edi
+
+        mov esi,[src]
+        mov edi,[dst]
+        mov ecx,[isize]
+
+tc1_loop:
+        mov eax,[esi]
+        add esi,4
+
+        // aaaa3 aaaa3 aaaa2 aaaa2 aaaa1 aaaa1 aaaa0 aaaa0
+        // aaaa1 rrrr1 gggg1 bbbb1 aaaa0 rrrr0 gggg0 bbbb0
+        // aaaa3 rrrr3 gggg3 bbbb3 aaaa2 rrrr2 gggg2 bbbb2
+        mov edx,eax
+        and eax,0x0000F000              // eax = 00 00 00 00 a1 00 00 00
+        shl eax,16                              // eax = a1 00 00 00 00 00 00 00
+        mov ebx,eax                             // ebx = a1 00 00 00 00 00 00 00
+        shr eax,4
+        or ebx,eax                              // ebx = a1 a1 00 00 00 00 00 00
+        shr eax,4
+        or ebx,eax                              // ebx = a1 a1 a1 00 00 00 00 00
+        shr eax,4
+        or ebx,eax                              // ebx = a1 a1 a1 a1 00 00 00 00
+
+        mov eax,edx
+        and eax,0x000000F0              // eax = 00 00 00 00 00 00 a0 00
+        shl eax,8                               // eax = 00 00 00 00 a0 00 00 00
+        or ebx,eax
+        shr eax,4
+        or ebx,eax
+        shr eax,4
+        or ebx,eax
+        shr eax,4
+        or ebx,eax                              // ebx = a1 a1 a1 a1 a0 a0 a0 a0
+
+        mov [edi],ebx
+        add edi,4
+
+        mov eax,edx                             // eax = a3 a3 a2 a2 a1 a1 a0 a0
+        and eax,0xF0000000              // eax = a3 00 00 00 00 00 00 00
+        mov ebx,eax                             // ebx = a3 00 00 00 00 00 00 00
+        shr eax,4
+        or ebx,eax                              // ebx = a3 a3 00 00 00 00 00 00
+        shr eax,4
+        or ebx,eax                              // ebx = a3 a3 a3 00 00 00 00 00
+        shr eax,4
+        or ebx,eax                              // ebx = a3 a3 a3 a3 00 00 00 00
+
+        and edx,0x00F00000              // eax = 00 00 a2 00 00 00 00 00
+        shr edx,8                               // eax = 00 00 00 00 a2 00 00 00
+        or ebx,edx
+        shr edx,4
+        or ebx,edx
+        shr edx,4
+        or ebx,edx
+        shr edx,4
+        or ebx,edx                              // ebx = a3 a3 a3 a3 a2 a2 a2 a2
+
+        mov [edi],ebx
+        add edi,4
+
+        dec ecx
+        jnz tc1_loop
+
+        pop edi
+        pop esi
+        pop ebx
+		mov esp, ebp
+		pop ebp
+		ret
+	}
+}

 void TexConv_ARGB1555_ARGB4444 (wxUIntPtr src, wxUIntPtr dst, int width, int height)
 {
--- a/Source/Glide64/TexLoad16b.h
+++ b/Source/Glide64/TexLoad16b.h
@ -37,9 +37,164 @@
 //
 //****************************************************************

-extern "C" void asmLoad16bRGBA (wxUIntPtr src, wxUIntPtr dst, int wid_64, int height, int line, int ext);
-extern "C" void asmLoad16bIA (wxUIntPtr src, wxUIntPtr dst, int wid_64, int height, int line, int ext);
+extern "C" void __declspec(naked) asmLoad16bRGBA (wxUIntPtr src, wxUIntPtr dst, int wid_64, int height, int line, int ext)
+{
+	_asm {
+		align 4
+		push        ebp  
+		mov         ebp,esp 
+        push ebx
+        push esi
+        push edi

+        mov esi,[src]
+        mov edi,[dst]
+        mov ecx,[height]
+y_loop:
+        push ecx
+        mov ecx,[wid_64]
+x_loop:
+        mov eax,[esi]   // read both pixels
+        mov ebx,[esi+4] // read both pixels
+        bswap eax
+        bswap ebx
+
+        ror ax,1
+        ror bx,1
+        ror eax,16
+        ror ebx,16
+        ror ax,1
+        ror bx,1
+
+        mov  [edi],eax
+        mov  [edi+4],ebx
+        add esi,8
+        add edi,8
+
+        dec ecx
+        jnz x_loop
+
+        pop ecx
+        dec ecx
+        jz end_y_loop
+        push ecx
+
+        mov eax,esi
+        add eax,[line]
+        mov esi,[src]
+        sub eax, esi
+        and eax, 0xFFF
+        add esi, eax
+        add edi,[ext]
+
+        mov ecx,[wid_64]
+x_loop_2:
+        mov eax,[esi+4] // read both pixels
+        mov ebx,[esi]   // read both pixels
+        bswap eax
+        bswap ebx
+
+        ror ax,1
+        ror bx,1
+        ror eax,16
+        ror ebx,16
+        ror ax,1
+        ror bx,1
+
+        mov [edi],eax
+        mov [edi+4],ebx
+        add esi,8
+        add edi,8
+
+        dec ecx
+        jnz x_loop_2
+
+        mov eax,esi
+        add eax,[line]
+        mov esi,[src]
+        sub eax, esi
+        and eax, 0xFFF
+        add esi, eax
+        add edi,[ext]
+
+        pop ecx
+        dec ecx
+        jnz y_loop
+
+end_y_loop:
+        pop edi
+        pop esi
+        pop ebx
+		mov esp, ebp
+		pop ebp
+		ret
+	}
+}
+
+extern "C" void  __declspec(naked) asmLoad16bIA (wxUIntPtr src, wxUIntPtr dst, int wid_64, int height, int line, int ext)
+{
+	_asm {
+		ALIGN 4
+
+		push ebp
+		mov ebp, esp
+		push ebx
+        push esi
+        push edi
+
+        mov esi,[src]
+        mov edi,[dst]
+        mov ecx,[height]
+y_loop:
+        push ecx
+        mov ecx,[wid_64]
+x_loop:
+        mov eax,[esi]   // read both pixels
+        mov ebx,[esi+4] // read both pixels
+        mov [edi],eax
+        mov [edi+4],ebx
+        add esi,8
+        add edi,8
+
+        dec ecx
+        jnz x_loop
+
+        pop ecx
+        dec ecx
+        jz end_y_loop
+        push ecx
+
+        add esi,[line]
+        add edi,[ext]
+
+        mov ecx,[wid_64]
+x_loop_2:
+        mov eax,[esi+4] // read both pixels
+        mov ebx,[esi]   // read both pixels
+        mov [edi],eax
+        mov [edi+4],ebx
+        add esi,8
+        add edi,8
+
+        dec ecx
+        jnz x_loop_2
+
+        add esi,[line]
+        add edi,[ext]
+
+        pop ecx
+        dec ecx
+        jnz y_loop
+
+end_y_loop:
+        pop edi
+        pop esi
+        pop ebx
+		mov esp, ebp
+		pop ebp
+		ret
+	}
+}

 //****************************************************************
 // Size: 2, Format: 0
--- a/Source/Glide64/TexLoad4b.h
+++ b/Source/Glide64/TexLoad4b.h
--- a/Source/Glide64/TexLoad8b.h
+++ b/Source/Glide64/TexLoad8b.h
@ -37,10 +37,630 @@
 //
 //****************************************************************

-extern "C" void asmLoad8bCI (wxUIntPtr src, wxUIntPtr dst, int wid_64, int height, int line, int ext, wxUIntPtr pal);
-extern "C" void asmLoad8bIA8 (wxUIntPtr src, wxUIntPtr dst, int wid_64, int height, int line, int ext, wxUIntPtr pal);
-extern "C" void asmLoad8bIA4 (wxUIntPtr src, wxUIntPtr dst, int wid_64, int height, int line, int ext);
-extern "C" void asmLoad8bI (wxUIntPtr src, int dst, wxUIntPtr wid_64, int height, int line, int ext);
+extern "C" void  __declspec(naked) asmLoad8bCI (wxUIntPtr src, wxUIntPtr dst, int wid_64, int height, int line, int ext, wxUIntPtr pal)
+{
+	_asm {
+		push ebp
+		mov ebp, esp
+		push ebx
+        push esi
+        push edi
+
+        mov ebx,[pal]
+        mov esi,[src]
+        mov edi,[dst]
+        mov ecx,[height]
+y_loop:
+        push ecx
+        mov ecx,[wid_64]
+x_loop:
+        push ecx
+
+        mov eax,[esi]           // read all 4 pixels
+        bswap eax
+        add esi,4
+        mov edx,eax
+
+        // 1st dword output {
+        shr eax,15
+        and eax,0x1FE
+        mov cx,[ebx+eax]
+        ror cx,1
+        shl ecx,16
+
+        mov eax,edx
+        shr eax,23
+        and eax,0x1FE
+        mov cx,[ebx+eax]
+        ror cx,1
+
+        mov [edi],ecx
+        add edi,4
+        // }
+
+        // 2nd dword output {
+        mov eax,edx
+        shl eax,1
+        and eax,0x1FE
+        mov cx,[ebx+eax]
+        ror cx,1
+        shl ecx,16
+
+        shr edx,7
+        and edx,0x1FE
+        mov cx,[ebx+edx]
+        ror cx,1
+
+        mov [edi],ecx
+        add edi,4
+        // }
+
+        // * copy
+        mov eax,[esi]           // read all 4 pixels
+        bswap eax
+        add esi,4
+        mov edx,eax
+
+        // 1st dword output {
+        shr eax,15
+        and eax,0x1FE
+        mov cx,[ebx+eax]
+        ror cx,1
+        shl ecx,16
+
+        mov eax,edx
+        shr eax,23
+        and eax,0x1FE
+        mov cx,[ebx+eax]
+        ror cx,1
+
+        mov [edi],ecx
+        add edi,4
+        // }
+
+        // 2nd dword output {
+        mov eax,edx
+        shl eax,1
+        and eax,0x1FE
+        mov cx,[ebx+eax]
+        ror cx,1
+        shl ecx,16
+
+        shr edx,7
+        and edx,0x1FE
+        mov cx,[ebx+edx]
+        ror cx,1
+
+        mov [edi],ecx
+        add edi,4
+        // }
+        // *
+
+        pop ecx
+
+        dec ecx
+        jnz x_loop
+
+        pop ecx
+        dec ecx
+        jz near end_y_loop
+        push ecx
+
+        mov eax,esi
+        add eax,[line]
+        mov esi,[src]
+        sub eax,esi
+        and eax,0x7FF
+        add esi,eax
+        add edi,[ext]
+
+        mov ecx,[wid_64]
+x_loop_2:
+        push ecx
+
+        mov eax,[esi+4]         // read all 4 pixels
+        bswap eax
+        mov edx,eax
+
+        // 1st dword output {
+        shr eax,15
+        and eax,0x1FE
+        mov cx,[ebx+eax]
+        ror cx,1
+        shl ecx,16
+
+        mov eax,edx
+        shr eax,23
+        and eax,0x1FE
+        mov cx,[ebx+eax]
+        ror cx,1
+
+        mov [edi],ecx
+        add edi,4
+        // }
+
+        // 2nd dword output {
+        mov eax,edx
+        shl eax,1
+        and eax,0x1FE
+        mov cx,[ebx+eax]
+        ror cx,1
+        shl ecx,16
+
+        shr edx,7
+        and edx,0x1FE
+        mov cx,[ebx+edx]
+        ror cx,1
+
+        mov [edi],ecx
+        add edi,4
+        // }
+
+        // * copy
+        mov eax,[esi]           // read all 4 pixels
+        bswap eax
+        mov edx,esi
+        add edx,8
+        mov esi,[src]
+        sub edx,esi
+        and edx,0x7FF
+        add esi,edx
+        mov edx,eax
+
+        // 1st dword output {
+        shr eax,15
+        and eax,0x1FE
+        mov cx,[ebx+eax]
+        ror cx,1
+        shl ecx,16
+
+        mov eax,edx
+        shr eax,23
+        and eax,0x1FE
+        mov cx,[ebx+eax]
+        ror cx,1
+
+        mov [edi],ecx
+        add edi,4
+        // }
+
+        // 2nd dword output {
+        mov eax,edx
+        shl eax,1
+        and eax,0x1FE
+        mov cx,[ebx+eax]
+        ror cx,1
+        shl ecx,16
+
+        shr edx,7
+        and edx,0x1FE
+        mov cx,[ebx+edx]
+        ror cx,1
+
+        mov [edi],ecx
+        add edi,4
+        // }
+        // *
+
+        pop ecx
+
+        dec ecx
+        jnz x_loop_2
+
+        mov eax,esi
+        add eax,[line]
+        mov esi,[src]
+        sub eax,esi
+        and eax,0x7FF
+        add esi,eax
+        add edi,[ext]
+
+        pop ecx
+        dec ecx
+        jnz y_loop
+
+end_y_loop:
+        pop edi
+        pop esi
+        pop ebx
+		mov esp, ebp
+		pop ebp
+		ret
+	}
+}
+
+extern "C" void  __declspec(naked) asmLoad8bIA8 (wxUIntPtr src, wxUIntPtr dst, int wid_64, int height, int line, int ext, wxUIntPtr pal)
+{
+	_asm {
+		push ebp
+		mov ebp, esp
+        push ebx
+        push esi
+        push edi
+
+        mov ebx,[pal]
+        mov esi,[src]
+        mov edi,[dst]
+        mov ecx,[height]
+y_loop:
+        push ecx
+        mov ecx,[wid_64]
+x_loop:
+        push ecx
+
+        mov eax,[esi]           // read all 4 pixels
+        bswap eax
+        add esi,4
+        mov edx,eax
+
+        // 1st dword output {
+        shr eax,15
+        and eax,0x1FE
+        mov cx,[ebx+eax]
+        ror cx,8
+        shl ecx,16
+
+        mov eax,edx
+        shr eax,23
+        and eax,0x1FE
+        mov cx,[ebx+eax]
+        ror cx,8
+
+        mov [edi],ecx
+        add edi,4
+        // }
+
+        // 2nd dword output {
+        mov eax,edx
+        shl eax,1
+        and eax,0x1FE
+        mov cx,[ebx+eax]
+        ror cx,8
+        shl ecx,16
+
+        shr edx,7
+        and edx,0x1FE
+        mov cx,[ebx+edx]
+        ror cx,8
+
+        mov [edi],ecx
+        add edi,4
+        // }
+
+        // * copy
+        mov eax,[esi]           // read all 4 pixels
+        bswap eax
+        add esi,4
+        mov edx,eax
+
+        // 1st dword output {
+        shr eax,15
+        and eax,0x1FE
+        mov cx,[ebx+eax]
+        ror cx,8
+        shl ecx,16
+
+        mov eax,edx
+        shr eax,23
+        and eax,0x1FE
+        mov cx,[ebx+eax]
+        ror cx,8
+
+        mov [edi],ecx
+        add edi,4
+        // }
+
+        // 2nd dword output {
+        mov eax,edx
+        shl eax,1
+        and eax,0x1FE
+        mov cx,[ebx+eax]
+        ror cx,8
+        shl ecx,16
+
+        shr edx,7
+        and edx,0x1FE
+        mov cx,[ebx+edx]
+        ror cx,8
+
+        mov [edi],ecx
+        add edi,4
+        // }
+        // *
+
+        pop ecx
+
+        dec ecx
+        jnz x_loop
+
+        pop ecx
+        dec ecx
+        jz near end_y_loop
+        push ecx
+
+        add esi,[line]
+        add edi,[ext]
+
+        mov ecx,[wid_64]
+x_loop_2:
+        push ecx
+
+        mov eax,[esi+4]         // read all 4 pixels
+        bswap eax
+        mov edx,eax
+
+        // 1st dword output {
+        shr eax,15
+        and eax,0x1FE
+        mov cx,[ebx+eax]
+        ror cx,8
+        shl ecx,16
+
+        mov eax,edx
+        shr eax,23
+        and eax,0x1FE
+        mov cx,[ebx+eax]
+        ror cx,8
+
+        mov [edi],ecx
+        add edi,4
+        // }
+
+        // 2nd dword output {
+        mov eax,edx
+        shl eax,1
+        and eax,0x1FE
+        mov cx,[ebx+eax]
+        ror cx,8
+        shl ecx,16
+
+        shr edx,7
+        and edx,0x1FE
+        mov cx,[ebx+edx]
+        ror cx,8
+
+        mov [edi],ecx
+        add edi,4
+        // }
+
+        // * copy
+        mov eax,[esi]           // read all 4 pixels
+        bswap eax
+        add esi,8
+        mov edx,eax
+
+        // 1st dword output {
+        shr eax,15
+        and eax,0x1FE
+        mov cx,[ebx+eax]
+        ror cx,8
+        shl ecx,16
+
+        mov eax,edx
+        shr eax,23
+        and eax,0x1FE
+        mov cx,[ebx+eax]
+        ror cx,8
+
+        mov [edi],ecx
+        add edi,4
+        // }
+
+        // 2nd dword output {
+        mov eax,edx
+        shl eax,1
+        and eax,0x1FE
+        mov cx,[ebx+eax]
+        ror cx,8
+        shl ecx,16
+
+        shr edx,7
+        and edx,0x1FE
+        mov cx,[ebx+edx]
+        ror cx,8
+
+        mov [edi],ecx
+        add edi,4
+        // }
+        // *
+
+        pop ecx
+
+        dec ecx
+        jnz x_loop_2
+
+        add esi,[line]
+        add edi,[ext]
+
+        pop ecx
+        dec ecx
+        jnz y_loop
+
+end_y_loop:
+        pop edi
+        pop esi
+        pop ebx
+		mov esp, ebp
+		pop ebp
+		ret
+	}
+}
+
+extern "C" void  __declspec(naked) asmLoad8bIA4 (wxUIntPtr src, wxUIntPtr dst, int wid_64, int height, int line, int ext)
+{
+	_asm {
+		push ebp
+		mov ebp, esp
+        push ebx
+        push esi
+        push edi
+
+        mov esi,[src]
+        mov edi,[dst]
+        mov ecx,[height]
+y_loop:
+        push ecx
+        mov ecx,[wid_64]
+x_loop:
+        mov eax,[esi] // read all 4 pixels
+        mov edx,eax
+
+        shr eax,4     //all alpha
+        shl edx,4
+        and eax,0x0F0F0F0F
+        and edx,0xF0F0F0F0
+        add esi,4
+        or eax,edx
+
+        mov [edi],eax // save dword
+        add edi,4
+
+        mov eax,[esi] // read all 4 pixels
+        mov edx,eax
+
+        shr eax,4     //all alpha
+        shl edx,4
+        and eax,0x0F0F0F0F
+        and edx,0xF0F0F0F0
+        add esi,4
+        or eax,edx
+
+        mov [edi],eax // save dword
+        add edi,4
+        // *
+
+        dec ecx
+        jnz x_loop
+
+        pop ecx
+        dec ecx
+        jz end_y_loop
+        push ecx
+
+        add esi,[line]
+        add edi,[ext]
+
+        mov ecx,[wid_64]
+x_loop_2:
+        mov eax,[esi+4] // read both pixels
+        mov edx,eax
+
+        shr eax,4       //all alpha
+        shl edx,4
+        and eax,0x0F0F0F0F
+        and edx,0xF0F0F0F0
+        or eax,edx
+
+        mov [edi],eax //save dword
+        add edi,4
+
+        mov eax,[esi] // read both pixels
+        add esi,8
+        mov edx,eax
+
+        shr eax,4     //all alpha
+        shl edx,4
+        and eax,0x0F0F0F0F
+        and edx,0xF0F0F0F0
+        or eax,edx
+
+        mov [edi],eax //save dword
+        add edi,4
+        // *
+
+        dec ecx
+        jnz x_loop_2
+
+        add esi,[line]
+        add edi,[ext]
+
+        pop ecx
+        dec ecx
+        jnz y_loop
+
+end_y_loop:
+        pop edi
+        pop esi
+        pop ebx
+		mov esp, ebp
+		pop ebp
+		ret
+	}
+}
+
+extern "C" void  __declspec(naked) asmLoad8bI (wxUIntPtr src, int dst, wxUIntPtr wid_64, int height, int line, int ext)
+{
+	_asm {
+		push ebp
+		mov ebp, esp
+        push ebx
+        push esi
+        push edi
+
+        mov esi,[src]
+        mov edi,[dst]
+        mov ecx,[height]
+y_loop:
+        push ecx
+        mov ecx,[wid_64]
+x_loop:
+        mov eax,[esi] // read all 4 pixels
+        add esi,4
+
+        mov [edi],eax // save dword
+        add edi,4
+
+        mov eax,[esi] // read all 4 pixels
+        add esi,4
+
+        mov [edi],eax // save dword
+        add edi,4
+        // *
+
+        dec ecx
+        jnz x_loop
+
+        pop ecx
+        dec ecx
+        jz end_y_loop
+        push ecx
+
+        add esi,[line]
+        add edi,[ext]
+
+        mov ecx,[wid_64]
+x_loop_2:
+        mov eax,[esi+4] // read both pixels
+
+        mov [edi],eax //save dword
+        add edi,4
+
+        mov eax,[esi] // read both pixels
+        add esi,8
+
+        mov [edi],eax //save dword
+        add edi,4
+        // *
+
+        dec ecx
+        jnz x_loop_2
+
+        add esi,[line]
+        add edi,[ext]
+
+        pop ecx
+        dec ecx
+        jnz y_loop
+
+end_y_loop:
+        pop edi
+        pop esi
+        pop ebx
+		mov esp, ebp
+		pop ebp
+		ret
+	}
+}

 //****************************************************************
 // Size: 1, Format: 2
--- a/Source/Glide64/Texture.asm.cpp
+++ b/Source/Glide64/Texture.asm.cpp
--- a/Source/Glide64/rdp.cpp
+++ b/Source/Glide64/rdp.cpp
@ -48,8 +48,38 @@
 #include "FBtoScreen.h"
 #include "CRC.h"

-extern "C" void SwapBlock32 ();
-extern "C" void SwapBlock64 ();
+extern "C" __declspec(naked) void SwapBlock32 ( void )
+{
+//****************************************************************
+// SwapBlock - swaps every other 32-bit word at addr
+//
+// ecx = num_words -> 0
+// edi = addr -> end of dest
+//****************************************************************
+	_asm {
+		align 4
+		push ebp
+		mov ebp, esp
+        push eax
+        push ebx
+        or ecx,ecx
+        jz swapblock32_end
+swapblock32_loop:
+        mov eax,[edi]
+        mov ebx,[edi+4]
+        mov [edi],ebx
+        mov [edi+4],eax
+        add edi,8
+        dec ecx
+        jnz swapblock32_loop
+swapblock32_end:
+        pop ebx
+        pop eax
+		mov esp, ebp
+		pop ebp
+		ret
+	}
+}

 const int NumOfFormats = 3;
 SCREEN_SHOT_FORMAT ScreenShotFormats[NumOfFormats] = { {wxT("BMP"), wxT("bmp"), wxBITMAP_TYPE_BMP}, {wxT("PNG"), wxT("png"), wxBITMAP_TYPE_PNG}, {wxT("JPEG"), wxT("jpeg"), wxBITMAP_TYPE_JPEG} };
@ -1824,7 +1854,168 @@ void setTBufTex(wxUint16 t_mem, wxUint32 cnt)
  }
 }

-extern "C" void asmLoadBlock(int src, int dst, int off, int dxt, int cnt, int swp);
+void __declspec(naked) CopyBlock ( void )
+{
+	_asm {
+		align 4
+		push ebp
+		mov ebp, esp
+        push eax
+        push ebx
+        push esi
+        push edx
+
+        or ecx,ecx
+        jz near copyblock_end
+
+        push ecx
+
+        // first, set the source address and check if not on a dword boundary
+        push esi
+        push edx
+        mov ebx,edx
+        and edx,0FFFFFFFCh
+        add esi,edx
+
+        and ebx,3                               // ebx = # we DON'T need to copy
+        jz copyblock_copy
+
+        mov edx,4                               // ecx = # we DO need to copy
+        sub edx,ebx
+
+        // load the first word, accounting for swapping
+
+        mov eax,[esi]
+        add esi,4
+copyblock_precopy_skip:
+        rol eax,8
+        dec ebx
+        jnz copyblock_precopy_skip
+
+copyblock_precopy_copy:
+        rol eax,8
+        mov [edi],al
+        inc edi
+        dec edx
+        jnz copyblock_precopy_copy
+
+        mov eax,[esi]
+        add esi,4
+        bswap eax
+        mov [edi],eax
+        add edi,4
+
+        dec ecx         // 1 less word to copy
+        jz copyblock_postcopy
+
+copyblock_copy:
+        mov eax,[esi]
+        bswap eax
+        mov [edi],eax
+
+        mov eax,[esi+4]
+        bswap eax
+        mov [edi+4],eax
+
+        add esi,8
+        add edi,8
+
+        dec ecx
+        jnz copyblock_copy
+
+copyblock_postcopy:
+        pop edx
+        pop esi
+        pop ecx
+
+        // check again if on dword boundary
+        mov ebx,edx     // ebx = # we DO need to copy
+
+        and ebx,3
+        jz copyblock_end
+
+        shl ecx,3       // ecx = num_words * 8
+        add edx,ecx
+        and edx,0FFFFFFFCh
+        add esi,edx
+
+        mov eax,[esi]
+
+copyblock_postcopy_copy:
+        rol eax,8
+        mov [edi],al
+        inc edi
+        dec ebx
+        jnz copyblock_postcopy_copy
+
+copyblock_end:
+        pop edx
+        pop esi
+        pop ebx
+        pop eax
+		mov esp, ebp
+		pop ebp
+		ret
+	}
+}
+
+extern "C" __declspec(naked) void asmLoadBlock(int src, int dst, int off, int dxt, int cnt, wxUIntPtr swp)
+{
+	_asm {
+		align 4
+		push ebp
+		mov ebp, esp
+
+        push ebx
+        push esi
+        push edi
+
+        // copy the data
+        mov esi,[src]
+        mov edi,[dst]
+        mov ecx,[cnt]
+        mov edx,[off]
+        call CopyBlock
+
+        // now swap it
+        mov eax,[cnt]   // eax = count remaining
+        xor edx,edx         // edx = dxt counter
+        mov edi,[dst]
+        mov ebx,[dxt]
+
+        xor ecx,ecx     // ecx = how much to copy
+dxt_test:
+        add edi,8
+        dec eax
+        jz end_dxt_test
+        add edx,ebx
+        jns dxt_test
+
+dxt_s_test:
+        inc ecx
+        dec eax
+        jz end_dxt_test
+        add edx,ebx
+        js dxt_s_test
+
+        // swap this data (ecx set, dst set)
+        call [swp] // (ecx reset to 0 after)
+
+        jmp dxt_test  // and repeat
+
+end_dxt_test:
+        // swap any remaining data
+        call [swp]
+
+        pop edi
+        pop esi
+        pop ebx
+		mov esp, ebp
+		pop ebp
+		ret
+	}
+}
+
 void LoadBlock32b(wxUint32 tile, wxUint32 ul_s, wxUint32 ul_t, wxUint32 lr_s, wxUint32 dxt);
 static void rdp_loadblock()
 {
@ -1916,7 +2107,65 @@ static void rdp_loadblock()
    setTBufTex(rdp.tiles[tile].t_mem, cnt);
 }

-extern "C" void asmLoadTile(int src, int dst, int width, int height, int line, int off, int end);
+extern "C" __declspec(naked) void asmLoadTile(int src, int dst, int width, int height, int line, int off, int end)
+{
+	_asm {
+		align 4
+		push ebp
+		mov ebp, esp
+
+        push ebx
+        push esi
+        push edi
+
+        // set initial values
+        mov edi,[dst]
+        mov ecx,[width]
+        mov esi,[src]
+        mov edx,[off]
+        xor ebx,ebx         // swap this line?
+        mov eax,[height]
+
+loadtile_loop:
+        cmp [end],edi   // end of tmem: error
+        jc loadtile_end
+
+        // copy this line
+        push edi
+        push ecx
+        call CopyBlock
+        pop ecx
+
+        // swap it?
+        xor ebx,1
+        jnz loadtile_no_swap
+
+        // (ecx set, restore edi)
+        pop edi
+        push ecx
+        call SwapBlock32
+        pop ecx
+        jmp loadtile_swap_end
+loadtile_no_swap:
+        add sp,4  // forget edi, we are already at the next position
+loadtile_swap_end:
+
+        add edx,[line]
+
+        dec eax
+        jnz loadtile_loop
+
+loadtile_end:
+
+        pop edi
+        pop esi
+        pop ebx
+		mov esp, ebp
+		pop ebp
+		ret
+	}
+}
+
 void LoadTile32b (wxUint32 tile, wxUint32 ul_s, wxUint32 ul_t, wxUint32 width, wxUint32 height);
 static void rdp_loadtile()
 {