project64/Source/Glide64/Texture.asm.cpp

/*
* Glide64 - Glide video plugin for Nintendo 64 emulators.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*/

/****************************************************************

 Glide64 - Glide Plugin for Nintendo 64 emulators
 Project started on December 29th, 2001

 Authors:
 Dave2001, original author, founded the project in 2001, left it in 2002
 Gugaman, joined the project in 2002, left it in 2002
 Sergey 'Gonetz' Lipski, joined the project in 2002, main author since fall of 2002
 Hiroshi 'KoolSmoky' Morii, joined the project in 2007

****************************************************************

 To modify Glide64:
 * Write your name and (optional)email, commented by your work, so I know who did it, and so that you can find which parts you modified when it comes time to send it to me.
 * Do NOT send me the whole project or file that you modified.  Take out your modified code sections, and tell me where to put them.  If people sent the whole thing, I would have many different versions, but no idea how to combine them all.

****************************************************************
*/

#include "Gfx #1.3.h"

/****************************************************************

               ******** Textures load ********

****************************************************************/


/*****************************************************************
4b textures load
*****************************************************************/


/****************************************************************
 Size: 0, Format: 2
 2009 ported to NASM - Sergey (Gonetz) Lipski
 *****************************************************************/
extern "C" void __declspec(naked) asmLoad4bCI (wxUIntPtr src, wxUIntPtr dst, int wid_64, int height, int line, int ext, wxUIntPtr pal)
{
	_asm {
		push ebp
		mov ebp, esp
        push ebx
        push esi
        push edi

        mov ebx,[pal]
        mov esi,[src]
        mov edi,[dst]
        mov ecx,[height]
y_loop:
        push ecx
        mov ecx,[wid_64]
x_loop:
        push ecx

        mov eax,[esi]           // read all 8 pixels
        bswap eax
        add esi,4
        mov edx,eax

        // 1st dword output {
        shr eax,23
        and eax,0x1E
        mov cx,[ebx+eax]
        ror cx,1
        shl ecx,16

        mov eax,edx
        shr eax,27
        and eax,0x1E
        mov cx,[ebx+eax]
        ror cx,1

        mov [edi],ecx
        add edi,4
        // }

        // 2nd dword output {
        mov eax,edx
        shr eax,15
        and eax,0x1E
        mov cx,[ebx+eax]
        ror cx,1
        shl ecx,16

        mov eax,edx
        shr eax,19
        and eax,0x1E
        mov cx,[ebx+eax]
        ror cx,1

        mov [edi],ecx
        add edi,4
        // }

        // 3rd dword output {
        mov eax,edx
        shr eax,7
        and eax,0x1E
        mov cx,[ebx+eax]
        ror cx,1
        shl ecx,16

        mov eax,edx
        shr eax,11
        and eax,0x1E
        mov cx,[ebx+eax]
        ror cx,1

        mov [edi],ecx
        add edi,4
        // }

        // 4th dword output {
        mov eax,edx
        shl eax,1
        and eax,0x1E
        mov cx,[ebx+eax]
        ror cx,1
        shl ecx,16

        shr edx,3
        and edx,0x1E
        mov cx,[ebx+edx]
        ror cx,1

        mov [edi],ecx
        add edi,4
        // }

        // * copy
        mov eax,[esi]           // read all 8 pixels
        bswap eax
        add esi,4
        mov edx,eax

        // 1st dword output {
        shr eax,23
        and eax,0x1E
        mov cx,[ebx+eax]
        ror cx,1
        shl ecx,16

        mov eax,edx
        shr eax,27
        and eax,0x1E
        mov cx,[ebx+eax]
        ror cx,1

        mov [edi],ecx
        add edi,4
        // }

        // 2nd dword output {
        mov eax,edx
        shr eax,15
        and eax,0x1E
        mov cx,[ebx+eax]
        ror cx,1
        shl ecx,16

        mov eax,edx
        shr eax,19
        and eax,0x1E
        mov cx,[ebx+eax]
        ror cx,1

        mov [edi],ecx
        add edi,4
        // }

        // 3rd dword output {
        mov eax,edx
        shr eax,7
        and eax,0x1E
        mov cx,[ebx+eax]
        ror cx,1
        shl ecx,16

        mov eax,edx
        shr eax,11
        and eax,0x1E
        mov cx,[ebx+eax]
        ror cx,1

        mov [edi],ecx
        add edi,4
        // }

        // 4th dword output {
        mov eax,edx
        shl eax,1
        and eax,0x1E
        mov cx,[ebx+eax]
        ror cx,1
        shl ecx,16

        shr edx,3
        and edx,0x1E
        mov cx,[ebx+edx]
        ror cx,1

        mov [edi],ecx
        add edi,4
        // }
        // *

        pop ecx

        dec ecx
        jnz x_loop

        pop ecx
        dec ecx
        jz near end_y_loop
        push ecx

        mov eax,esi
        add eax,[line]
        mov esi,[src]
        sub eax,esi
        and eax,0x7FF
        add esi,eax
        add edi,[ext]

        mov ecx,[wid_64]
 x_loop_2:
        push ecx

        mov eax,[esi+4]         // read all 8 pixels
        bswap eax
        mov edx,eax

        // 1st dword output {
        shr eax,23
        and eax,0x1E
        mov cx,[ebx+eax]
        ror cx,1
        shl ecx,16

        mov eax,edx
        shr eax,27
        and eax,0x1E
        mov cx,[ebx+eax]
        ror cx,1

        mov [edi],ecx
        add edi,4
        // }

        // 2nd dword output {
        mov eax,edx
        shr eax,15
        and eax,0x1E
        mov cx,[ebx+eax]
        ror cx,1
        shl ecx,16

        mov eax,edx
        shr eax,19
        and eax,0x1E
        mov cx,[ebx+eax]
        ror cx,1

        mov [edi],ecx
        add edi,4
        // }

        // 3rd dword output {
        mov eax,edx
        shr eax,7
        and eax,0x1E
        mov cx,[ebx+eax]
        ror cx,1
        shl ecx,16

        mov eax,edx
        shr eax,11
        and eax,0x1E
        mov cx,[ebx+eax]
        ror cx,1

        mov [edi],ecx
        add edi,4
        // }

        // 4th dword output {
        mov eax,edx
        shl eax,1
        and eax,0x1E
        mov cx,[ebx+eax]
        ror cx,1
        shl ecx,16

        shr edx,3
        and edx,0x1E
        mov cx,[ebx+edx]
        ror cx,1

        mov [edi],ecx
        add edi,4
        // }

        // * copy
        mov eax,[esi]           // read all 8 pixels
        bswap eax
        mov edx,esi
        add edx,8
        mov esi,[src]
        sub edx,esi
        and edx,0x7FF
        add esi,edx
        mov edx,eax

        // 1st dword output {
        shr eax,23
        and eax,0x1E
        mov cx,[ebx+eax]
        ror cx,1
        shl ecx,16

        mov eax,edx
        shr eax,27
        and eax,0x1E
        mov cx,[ebx+eax]
        ror cx,1

        mov [edi],ecx
        add edi,4
        // }

        // 2nd dword output {
        mov eax,edx
        shr eax,15
        and eax,0x1E
        mov cx,[ebx+eax]
        ror cx,1
        shl ecx,16

        mov eax,edx
        shr eax,19
        and eax,0x1E
        mov cx,[ebx+eax]
        ror cx,1

        mov [edi],ecx
        add edi,4
        // }

        // 3rd dword output {
        mov eax,edx
        shr eax,7
        and eax,0x1E
        mov cx,[ebx+eax]
        ror cx,1
        shl ecx,16

        mov eax,edx
        shr eax,11
        and eax,0x1E
        mov cx,[ebx+eax]
        ror cx,1

        mov [edi],ecx
        add edi,4
        // }

        // 4th dword output {
        mov eax,edx
        shl eax,1
        and eax,0x1E
        mov cx,[ebx+eax]
        ror cx,1
        shl ecx,16

        shr edx,3
        and edx,0x1E
        mov cx,[ebx+edx]
        ror cx,1

        mov [edi],ecx
        add edi,4
        // }
        // *

        pop ecx

        dec ecx
        jnz x_loop_2

        mov eax,esi
        add eax,[line]
        mov esi,[src]
        sub eax,esi
        and eax,0x7FF
        add esi,eax
        add edi,[ext]

        pop ecx
        dec ecx
        jnz y_loop

end_y_loop:
        pop edi
        pop esi
        pop ebx
		mov esp, ebp
		pop ebp
		ret
	}
}

extern "C" void  __declspec(naked) asmLoad4bIAPal (wxUIntPtr src, wxUIntPtr dst, int wid_64, int height, int line, int ext, wxUIntPtr pal)
{
	_asm {
		push ebp
		mov ebp, esp
		push ebx
        push esi
        push edi

        mov ebx,[pal]
        mov esi,[src]
        mov edi,[dst]
        mov ecx,[height]
y_loop:
        push ecx
        mov ecx,[wid_64]
x_loop:
        push ecx

        mov eax,[esi]           // read all 8 pixels
        bswap eax
        add esi,4
        mov edx,eax

        // 1st dword output {
        shr eax,23
        and eax,0x1E
        mov cx,[ebx+eax]
        ror cx,8
        shl ecx,16

        mov eax,edx
        shr eax,27
        and eax,0x1E
        mov cx,[ebx+eax]
        ror cx,8

        mov [edi],ecx
        add edi,4
        // }

        // 2nd dword output {
        mov eax,edx
        shr eax,15
        and eax,0x1E
        mov cx,[ebx+eax]
        ror cx,8
        shl ecx,16

        mov eax,edx
        shr eax,19
        and eax,0x1E
        mov cx,[ebx+eax]
        ror cx,8

        mov [edi],ecx
        add edi,4
        // }

        // 3rd dword output {
        mov eax,edx
        shr eax,7
        and eax,0x1E
        mov cx,[ebx+eax]
        ror cx,8
        shl ecx,16

        mov eax,edx
        shr eax,11
        and eax,0x1E
        mov cx,[ebx+eax]
        ror cx,8

        mov [edi],ecx
        add edi,4
        // }

        // 4th dword output {
        mov eax,edx
        shl eax,1
        and eax,0x1E
        mov cx,[ebx+eax]
        ror cx,8
        shl ecx,16

        shr edx,3
        and edx,0x1E
        mov cx,[ebx+edx]
        ror cx,8

        mov [edi],ecx
        add edi,4
        // }

        // * copy
        mov eax,[esi]           // read all 8 pixels
        bswap eax
        add esi,4
        mov edx,eax

        // 1st dword output {
        shr eax,23
        and eax,0x1E
        mov cx,[ebx+eax]
        ror cx,8
        shl ecx,16

        mov eax,edx
        shr eax,27
        and eax,0x1E
        mov cx,[ebx+eax]
        ror cx,8

        mov [edi],ecx
        add edi,4
        // }

        // 2nd dword output {
        mov eax,edx
        shr eax,15
        and eax,0x1E
        mov cx,[ebx+eax]
        ror cx,8
        shl ecx,16

        mov eax,edx
        shr eax,19
        and eax,0x1E
        mov cx,[ebx+eax]
        ror cx,8

        mov [edi],ecx
        add edi,4
        // }

        // 3rd dword output {
        mov eax,edx
        shr eax,7
        and eax,0x1E
        mov cx,[ebx+eax]
        ror cx,8
        shl ecx,16

        mov eax,edx
        shr eax,11
        and eax,0x1E
        mov cx,[ebx+eax]
        ror cx,8

        mov [edi],ecx
        add edi,4
        // }

        // 4th dword output {
        mov eax,edx
        shl eax,1
        and eax,0x1E
        mov cx,[ebx+eax]
        ror cx,8
        shl ecx,16

        shr edx,3
        and edx,0x1E
        mov cx,[ebx+edx]
        ror cx,8

        mov [edi],ecx
        add edi,4
        // }
        // *

        pop ecx

        dec ecx
        jnz x_loop

        pop ecx
        dec ecx
        jz near end_y_loop
        push ecx

        mov eax,esi
        add eax,[line]
        mov esi,[src]
        sub eax,esi
        and eax,0x7FF
        add esi,eax
        add edi,[ext]

        mov ecx,[wid_64]
x_loop_2:
        push ecx

        mov eax,[esi+4]         // read all 8 pixels
        bswap eax
        mov edx,eax

        // 1st dword output {
        shr eax,23
        and eax,0x1E
        mov cx,[ebx+eax]
        ror cx,8
        shl ecx,16

        mov eax,edx
        shr eax,27
        and eax,0x1E
        mov cx,[ebx+eax]
        ror cx,8

        mov [edi],ecx
        add edi,4
        // }

        // 2nd dword output {
        mov eax,edx
        shr eax,15
        and eax,0x1E
        mov cx,[ebx+eax]
        ror cx,8
        shl ecx,16

        mov eax,edx
        shr eax,19
        and eax,0x1E
        mov cx,[ebx+eax]
        ror cx,8

        mov [edi],ecx
        add edi,4
        // }

        // 3rd dword output {
        mov eax,edx
        shr eax,7
        and eax,0x1E
        mov cx,[ebx+eax]
        ror cx,8
        shl ecx,16

        mov eax,edx
        shr eax,11
        and eax,0x1E
        mov cx,[ebx+eax]
        ror cx,8

        mov [edi],ecx
        add edi,4
        // }

        // 4th dword output {
        mov eax,edx
        shl eax,1
        and eax,0x1E
        mov cx,[ebx+eax]
        ror cx,8
        shl ecx,16

        shr edx,3
        and edx,0x1E
        mov cx,[ebx+edx]
        ror cx,8

        mov [edi],ecx
        add edi,4
        // }

        // * copy
        mov eax,[esi]           // read all 8 pixels
        bswap eax
        mov edx,esi
        add edx,8
        mov esi,[src]
        sub edx,esi
        and edx,0x7FF
        add esi,edx
        mov edx,eax

        // 1st dword output {
        shr eax,23
        and eax,0x1E
        mov cx,[ebx+eax]
        ror cx,8
        shl ecx,16

        mov eax,edx
        shr eax,27
        and eax,0x1E
        mov cx,[ebx+eax]
        ror cx,8

        mov [edi],ecx
        add edi,4
        // }

        // 2nd dword output {
        mov eax,edx
        shr eax,15
        and eax,0x1E
        mov cx,[ebx+eax]
        ror cx,8
        shl ecx,16

        mov eax,edx
        shr eax,19
        and eax,0x1E
        mov cx,[ebx+eax]
        ror cx,8

        mov [edi],ecx
        add edi,4
        // }

        // 3rd dword output {
        mov eax,edx
        shr eax,7
        and eax,0x1E
        mov cx,[ebx+eax]
        ror cx,8
        shl ecx,16

        mov eax,edx
        shr eax,11
        and eax,0x1E
        mov cx,[ebx+eax]
        ror cx,8

        mov [edi],ecx
        add edi,4
        // }

        // 4th dword output {
        mov eax,edx
        shl eax,1
        and eax,0x1E
        mov cx,[ebx+eax]
        ror cx,8
        shl ecx,16

        shr edx,3
        and edx,0x1E
        mov cx,[ebx+edx]
        ror cx,8

        mov [edi],ecx
        add edi,4
        // }
        // *

        pop ecx

        dec ecx
        jnz x_loop_2

        mov eax,esi
        add eax,[line]
        mov esi,[src]
        sub eax,esi
        and eax,0x7FF
        add esi,eax
        add edi,[ext]

        pop ecx
        dec ecx
        jnz y_loop

end_y_loop:
        pop edi
        pop esi
        pop ebx
		mov esp, ebp
		pop ebp
		ret
	}
}

/*****************************************************************
 Size: 0, Format: 3

 ** BY GUGAMAN **
 2009 ported to NASM - Sergey (Gonetz) Lipski
*****************************************************************/
extern "C" void  __declspec(naked) asmLoad4bIA (wxUIntPtr src, wxUIntPtr dst, int wid_64, int height, int line, int ext)
{
	_asm {
		push ebp
		mov ebp, esp
		push ebx
        push esi
        push edi

        mov esi,[src]
        mov edi,[dst]
        mov ecx,[height]
y_loop:
        push ecx
        mov ecx,[wid_64]
x_loop:
        push ecx

        mov eax,[esi]           // read all 8 pixels
        bswap eax
        add esi,4
        mov edx,eax

        // 1st dword {
        xor ecx,ecx

        // pixel #1
        //       IIIAxxxxxxxxxxxxxxxxxxxxxxxxxxxx
        //       xxxxxxxxxxxxxxxxxxxxxxxxAAAAIIII
        mov eax,edx
        shr eax,24 //Alpha
        and eax,0x00000010
        or ecx,eax
        shl eax,1
        or ecx,eax
        shl eax,1
        or ecx,eax
        shl eax,1
        or ecx,eax
        mov eax,edx
        shr eax,28 // Intensity
        and eax,0x0000000E
        or ecx,eax
        shr eax,3
        or ecx,eax

        // pixel #2
        //       xxxxIIIAxxxxxxxxxxxxxxxxxxxxxxxx
        //       xxxxxxxxxxxxxxxxAAAAIIIIxxxxxxxx
        mov eax,edx
        shr eax,12 //Alpha
        and eax,0x00001000
        or ecx,eax
        shl eax,1
        or ecx,eax
        shl eax,1
        or ecx,eax
        shl eax,1
        or ecx,eax
        mov eax,edx
        shr eax,16 // Intensity
        and eax,0x00000E00
        or ecx,eax
        shr eax,3
        and eax,0x00000100
        or ecx,eax

        // pixel #3
        //       xxxxxxxxIIIAxxxxxxxxxxxxxxxxxxxx
        //       xxxxxxxxAAAAIIIIxxxxxxxxxxxxxxxx
        //Alpha
        mov eax,edx
        and eax,0x00100000
        or ecx,eax
        shl eax,1
        or ecx,eax
        shl eax,1
        or ecx,eax
        shl eax,1
        or ecx,eax
        mov eax,edx
        shr eax,4 // Intensity
        and eax,0x000E0000
        or ecx,eax
        shr eax,3
        and eax,0x00010000
        or ecx,eax

        // pixel #4
        //       xxxxxxxxxxxxIIIAxxxxxxxxxxxxxxxx
        //       AAAAIIIIxxxxxxxxxxxxxxxxxxxxxxxx
        mov eax,edx
        shl eax,12 //Alpha
        and eax,0x10000000
        or ecx,eax
        shl eax,1
        or ecx,eax
        shl eax,1
        or ecx,eax
        shl eax,1
        or ecx,eax
        mov eax,edx
        shl eax,8 // Intensity
        and eax,0x0E000000
        or ecx,eax
        shr eax,3
        and eax,0x01000000
        or ecx,eax


        mov [edi],ecx
        add edi,4
        // }

// 2nd dword {
        xor ecx,ecx

        // pixel #5
        //       xxxxxxxxxxxxxxxxIIIAxxxxxxxxxxxx
        //       xxxxxxxxxxxxxxxxxxxxxxxxAAAAIIII
        mov eax,edx
        shr eax,8 //Alpha
        and eax,0x00000010
        or ecx,eax
        shl eax,1
        or ecx,eax
        shl eax,1
        or ecx,eax
        shl eax,1
        or ecx,eax
        mov eax,edx
        shr eax,12 // Intensity
        and eax,0x0000000E
        or ecx,eax
        shr eax,3
        or ecx,eax

        // pixel #6
        //       xxxxxxxxxxxxxxxxxxxxIIIAxxxxxxxx
        //       xxxxxxxxxxxxxxxxAAAAIIIIxxxxxxxx
        //Alpha
        mov eax,edx
        shl eax,4
        and eax,0x00001000
        or ecx,eax
        shl eax,1
        or ecx,eax
        shl eax,1
        or ecx,eax
        shl eax,1
        or ecx,eax
        mov eax,edx     // Intensity
        and eax,0x00000E00
        or ecx,eax
        shr eax,3
        and eax,0x00000100
        or ecx,eax

        // pixel #7
        //       xxxxxxxxxxxxxxxxxxxxxxxxIIIAxxxx
        //       xxxxxxxxAAAAIIIIxxxxxxxxxxxxxxxx
        //Alpha
        mov eax,edx
        shl eax,16
        and eax,0x00100000
        or ecx,eax
        shl eax,1
        or ecx,eax
        shl eax,1
        or ecx,eax
        shl eax,1
        or ecx,eax
        mov eax,edx
        shl eax,12 // Intensity
        and eax,0x000E0000
        or ecx,eax
        shr eax,3
        and eax,0x00010000
        or ecx,eax

        // pixel #8
        //       xxxxxxxxxxxxxxxxxxxxxxxxxxxxIIIA
        //       AAAAIIIIxxxxxxxxxxxxxxxxxxxxxxxx
        mov eax,edx
        shl eax,28 //Alpha
        and eax,0x10000000
        or ecx,eax
        shl eax,1
        or ecx,eax
        shl eax,1
        or ecx,eax
        shl eax,1
        or ecx,eax
        mov eax,edx
        shl eax,24 // Intensity
        and eax,0x0E000000
        or ecx,eax
        shr eax,3
        and eax,0x01000000
        or ecx,eax

        mov [edi],ecx
        add edi,4
        // }

        // * copy
        mov eax,[esi]           // read all 8 pixels
        bswap eax
        add esi,4
        mov edx,eax

        // 1st dword {
        xor ecx,ecx

        // pixel #1
        //       IIIAxxxxxxxxxxxxxxxxxxxxxxxxxxxx
        //       xxxxxxxxxxxxxxxxxxxxxxxxAAAAIIII
        mov eax,edx
        shr eax,24 //Alpha
        and eax,0x00000010
        or ecx,eax
        shl eax,1
        or ecx,eax
        shl eax,1
        or ecx,eax
        shl eax,1
        or ecx,eax
        mov eax,edx
        shr eax,28 // Intensity
        and eax,0x0000000E
        or ecx,eax
        shr eax,3
        or ecx,eax

        // pixel #2
        //       xxxxIIIAxxxxxxxxxxxxxxxxxxxxxxxx
        //       xxxxxxxxxxxxxxxxAAAAIIIIxxxxxxxx
        mov eax,edx
        shr eax,12 //Alpha
        and eax,0x00001000
        or ecx,eax
        shl eax,1
        or ecx,eax
        shl eax,1
        or ecx,eax
        shl eax,1
        or ecx,eax
        mov eax,edx
        shr eax,16 // Intensity
        and eax,0x00000E00
        or ecx,eax
        shr eax,3
        and eax,0x00000100
        or ecx,eax

        // pixel #3
        //       xxxxxxxxIIIAxxxxxxxxxxxxxxxxxxxx
        //       xxxxxxxxAAAAIIIIxxxxxxxxxxxxxxxx
        //Alpha
        mov eax,edx
        and eax,0x00100000
        or ecx,eax
        shl eax,1
        or ecx,eax
        shl eax,1
        or ecx,eax
        shl eax,1
        or ecx,eax
        mov eax,edx
        shr eax,4 // Intensity
        and eax,0x000E0000
        or ecx,eax
        shr eax,3
        and eax,0x00010000
        or ecx,eax

        // pixel #4
        //       xxxxxxxxxxxxIIIAxxxxxxxxxxxxxxxx
        //       AAAAIIIIxxxxxxxxxxxxxxxxxxxxxxxx
        mov eax,edx
        shl eax,12 //Alpha
        and eax,0x10000000
        or ecx,eax
        shl eax,1
        or ecx,eax
        shl eax,1
        or ecx,eax
        shl eax,1
        or ecx,eax
        mov eax,edx
        shl eax,8 // Intensity
        and eax,0x0E000000
        or ecx,eax
        shr eax,3
        and eax,0x01000000
        or ecx,eax


        mov [edi],ecx
        add edi,4
        // }

// 2nd dword {
        xor ecx,ecx

        // pixel #5
        //       xxxxxxxxxxxxxxxxIIIAxxxxxxxxxxxx
        //       xxxxxxxxxxxxxxxxxxxxxxxxAAAAIIII
        mov eax,edx
        shr eax,8 //Alpha
        and eax,0x00000010
        or ecx,eax
        shl eax,1
        or ecx,eax
        shl eax,1
        or ecx,eax
        shl eax,1
        or ecx,eax
        mov eax,edx
        shr eax,12 // Intensity
        and eax,0x0000000E
        or ecx,eax
        shr eax,3
        or ecx,eax

        // pixel #6
        //       xxxxxxxxxxxxxxxxxxxxIIIAxxxxxxxx
        //       xxxxxxxxxxxxxxxxAAAAIIIIxxxxxxxx
        //Alpha
        mov eax,edx
        shl eax,4
        and eax,0x00001000
        or ecx,eax
        shl eax,1
        or ecx,eax
        shl eax,1
        or ecx,eax
        shl eax,1
        or ecx,eax
        mov eax,edx     // Intensity
        and eax,0x00000E00
        or ecx,eax
        shr eax,3
        and eax,0x00000100
        or ecx,eax

        // pixel #7
        //       xxxxxxxxxxxxxxxxxxxxxxxxIIIAxxxx
        //       xxxxxxxxAAAAIIIIxxxxxxxxxxxxxxxx
        //Alpha
        mov eax,edx
        shl eax,16
        and eax,0x00100000
        or ecx,eax
        shl eax,1
        or ecx,eax
        shl eax,1
        or ecx,eax
        shl eax,1
        or ecx,eax
        mov eax,edx
        shl eax,12 // Intensity
        and eax,0x000E0000
        or ecx,eax
        shr eax,3
        and eax,0x00010000
        or ecx,eax

        // pixel #8
        //       xxxxxxxxxxxxxxxxxxxxxxxxxxxxIIIA
        //       AAAAIIIIxxxxxxxxxxxxxxxxxxxxxxxx
        mov eax,edx
        shl eax,28 //Alpha
        and eax,0x10000000
        or ecx,eax
        shl eax,1
        or ecx,eax
        shl eax,1
        or ecx,eax
        shl eax,1
        or ecx,eax
        mov eax,edx
        shl eax,24 // Intensity
        and eax,0x0E000000
        or ecx,eax
        shr eax,3
        and eax,0x01000000
        or ecx,eax

        mov [edi],ecx
        add edi,4
        // }

        // *

        pop ecx
        dec ecx
        jnz x_loop

        pop ecx
        dec ecx
        jz near end_y_loop
        push ecx

        add esi,[line]
        add edi,[ext]

        mov ecx,[wid_64]
x_loop_2:
        push ecx

        mov eax,[esi+4]         // read all 8 pixels
        bswap eax
        mov edx,eax

        // 1st dword {
        xor ecx,ecx

        // pixel #1
        //       IIIAxxxxxxxxxxxxxxxxxxxxxxxxxxxx
        //       xxxxxxxxxxxxxxxxxxxxxxxxAAAAIIII
        mov eax,edx
        shr eax,24 //Alpha
        and eax,0x00000010
        or ecx,eax
        shl eax,1
        or ecx,eax
        shl eax,1
        or ecx,eax
        shl eax,1
        or ecx,eax
        mov eax,edx
        shr eax,28 // Intensity
        and eax,0x0000000E
        or ecx,eax
        shr eax,3
        or ecx,eax

        // pixel #2
        //       xxxxIIIAxxxxxxxxxxxxxxxxxxxxxxxx
        //       xxxxxxxxxxxxxxxxAAAAIIIIxxxxxxxx
        mov eax,edx
        shr eax,12 //Alpha
        and eax,0x00001000
        or ecx,eax
        shl eax,1
        or ecx,eax
        shl eax,1
        or ecx,eax
        shl eax,1
        or ecx,eax
        mov eax,edx
        shr eax,16 // Intensity
        and eax,0x00000E00
        or ecx,eax
        shr eax,3
        and eax,0x00000100
        or ecx,eax

        // pixel #3
        //       xxxxxxxxIIIAxxxxxxxxxxxxxxxxxxxx
        //       xxxxxxxxAAAAIIIIxxxxxxxxxxxxxxxx
        //Alpha
        mov eax,edx
        and eax,0x00100000
        or ecx,eax
        shl eax,1
        or ecx,eax
        shl eax,1
        or ecx,eax
        shl eax,1
        or ecx,eax
        mov eax,edx
        shr eax,4 // Intensity
        and eax,0x000E0000
        or ecx,eax
        shr eax,3
        and eax,0x00010000
        or ecx,eax

        // pixel #4
        //       xxxxxxxxxxxxIIIAxxxxxxxxxxxxxxxx
        //       AAAAIIIIxxxxxxxxxxxxxxxxxxxxxxxx
        mov eax,edx
        shl eax,12 //Alpha
        and eax,0x10000000
        or ecx,eax
        shl eax,1
        or ecx,eax
        shl eax,1
        or ecx,eax
        shl eax,1
        or ecx,eax
        mov eax,edx
        shl eax,8 // Intensity
        and eax,0x0E000000
        or ecx,eax
        shr eax,3
        and eax,0x01000000
        or ecx,eax


        mov [edi],ecx
        add edi,4
        // }

// 2nd dword {
        xor ecx,ecx

        // pixel #5
        //       xxxxxxxxxxxxxxxxIIIAxxxxxxxxxxxx
        //       xxxxxxxxxxxxxxxxxxxxxxxxAAAAIIII
        mov eax,edx
        shr eax,8 //Alpha
        and eax,0x00000010
        or ecx,eax
        shl eax,1
        or ecx,eax
        shl eax,1
        or ecx,eax
        shl eax,1
        or ecx,eax
        mov eax,edx
        shr eax,12 // Intensity
        and eax,0x0000000E
        or ecx,eax
        shr eax,3
        or ecx,eax

        // pixel #6
        //       xxxxxxxxxxxxxxxxxxxxIIIAxxxxxxxx
        //       xxxxxxxxxxxxxxxxAAAAIIIIxxxxxxxx
        //Alpha
        mov eax,edx
        shl eax,4
        and eax,0x00001000
        or ecx,eax
        shl eax,1
        or ecx,eax
        shl eax,1
        or ecx,eax
        shl eax,1
        or ecx,eax
        mov eax,edx     // Intensity
        and eax,0x00000E00
        or ecx,eax
        shr eax,3
        and eax,0x00000100
        or ecx,eax

        // pixel #7
        //       xxxxxxxxxxxxxxxxxxxxxxxxIIIAxxxx
        //       xxxxxxxxAAAAIIIIxxxxxxxxxxxxxxxx
        //Alpha
        mov eax,edx
        shl eax,16
        and eax,0x00100000
        or ecx,eax
        shl eax,1
        or ecx,eax
        shl eax,1
        or ecx,eax
        shl eax,1
        or ecx,eax
        mov eax,edx
        shl eax,12 // Intensity
        and eax,0x000E0000
        or ecx,eax
        shr eax,3
        and eax,0x00010000
        or ecx,eax

        // pixel #8
        //       xxxxxxxxxxxxxxxxxxxxxxxxxxxxIIIA
        //       AAAAIIIIxxxxxxxxxxxxxxxxxxxxxxxx
        mov eax,edx
        shl eax,28 //Alpha
        and eax,0x10000000
        or ecx,eax
        shl eax,1
        or ecx,eax
        shl eax,1
        or ecx,eax
        shl eax,1
        or ecx,eax
        mov eax,edx
        shl eax,24 // Intensity
        and eax,0x0E000000
        or ecx,eax
        shr eax,3
        and eax,0x01000000
        or ecx,eax

        mov [edi],ecx
        add edi,4
        // }

        // * copy
        mov eax,[esi]           // read all 8 pixels
        bswap eax
        add esi,8
        mov edx,eax

// 1st dword {
        xor ecx,ecx

        // pixel #1
        //       IIIAxxxxxxxxxxxxxxxxxxxxxxxxxxxx
        //       xxxxxxxxxxxxxxxxxxxxxxxxAAAAIIII
        mov eax,edx
        shr eax,24 //Alpha
        and eax,0x00000010
        or ecx,eax
        shl eax,1
        or ecx,eax
        shl eax,1
        or ecx,eax
        shl eax,1
        or ecx,eax
        mov eax,edx
        shr eax,28 // Intensity
        and eax,0x0000000E
        or ecx,eax
        shr eax,3
        or ecx,eax

        // pixel #2
        //       xxxxIIIAxxxxxxxxxxxxxxxxxxxxxxxx
        //       xxxxxxxxxxxxxxxxAAAAIIIIxxxxxxxx
        mov eax,edx
        shr eax,12 //Alpha
        and eax,0x00001000
        or ecx,eax
        shl eax,1
        or ecx,eax
        shl eax,1
        or ecx,eax
        shl eax,1
        or ecx,eax
        mov eax,edx
        shr eax,16 // Intensity
        and eax,0x00000E00
        or ecx,eax
        shr eax,3
        and eax,0x00000100
        or ecx,eax

        // pixel #3
        //       xxxxxxxxIIIAxxxxxxxxxxxxxxxxxxxx
        //       xxxxxxxxAAAAIIIIxxxxxxxxxxxxxxxx
        //Alpha
        mov eax,edx
        and eax,0x00100000
        or ecx,eax
        shl eax,1
        or ecx,eax
        shl eax,1
        or ecx,eax
        shl eax,1
        or ecx,eax
        mov eax,edx
        shr eax,4 // Intensity
        and eax,0x000E0000
        or ecx,eax
        shr eax,3
        and eax,0x00010000
        or ecx,eax

        // pixel #4
        //       xxxxxxxxxxxxIIIAxxxxxxxxxxxxxxxx
        //       AAAAIIIIxxxxxxxxxxxxxxxxxxxxxxxx
        mov eax,edx
        shl eax,12 //Alpha
        and eax,0x10000000
        or ecx,eax
        shl eax,1
        or ecx,eax
        shl eax,1
        or ecx,eax
        shl eax,1
        or ecx,eax
        mov eax,edx
        shl eax,8 // Intensity
        and eax,0x0E000000
        or ecx,eax
        shr eax,3
        and eax,0x01000000
        or ecx,eax


        mov [edi],ecx
        add edi,4
        // }

// 2nd dword {
        xor ecx,ecx

        // pixel #5
        //       xxxxxxxxxxxxxxxxIIIAxxxxxxxxxxxx
        //       xxxxxxxxxxxxxxxxxxxxxxxxAAAAIIII
        mov eax,edx
        shr eax,8 //Alpha
        and eax,0x00000010
        or ecx,eax
        shl eax,1
        or ecx,eax
        shl eax,1
        or ecx,eax
        shl eax,1
        or ecx,eax
        mov eax,edx
        shr eax,12 // Intensity
        and eax,0x0000000E
        or ecx,eax
        shr eax,3
        or ecx,eax

        // pixel #6
        //       xxxxxxxxxxxxxxxxxxxxIIIAxxxxxxxx
        //       xxxxxxxxxxxxxxxxAAAAIIIIxxxxxxxx
        //Alpha
        mov eax,edx
        shl eax,4
        and eax,0x00001000
        or ecx,eax
        shl eax,1
        or ecx,eax
        shl eax,1
        or ecx,eax
        shl eax,1
        or ecx,eax
        mov eax,edx     // Intensity
        and eax,0x00000E00
        or ecx,eax
        shr eax,3
        and eax,0x00000100
        or ecx,eax

        // pixel #7
        //       xxxxxxxxxxxxxxxxxxxxxxxxIIIAxxxx
        //       xxxxxxxxAAAAIIIIxxxxxxxxxxxxxxxx
        //Alpha
        mov eax,edx
        shl eax,16
        and eax,0x00100000
        or ecx,eax
        shl eax,1
        or ecx,eax
        shl eax,1
        or ecx,eax
        shl eax,1
        or ecx,eax
        mov eax,edx
        shl eax,12 // Intensity
        and eax,0x000E0000
        or ecx,eax
        shr eax,3
        and eax,0x00010000
        or ecx,eax

        // pixel #8
        //       xxxxxxxxxxxxxxxxxxxxxxxxxxxxIIIA
        //       AAAAIIIIxxxxxxxxxxxxxxxxxxxxxxxx
        mov eax,edx
        shl eax,28 //Alpha
        and eax,0x10000000
        or ecx,eax
        shl eax,1
        or ecx,eax
        shl eax,1
        or ecx,eax
        shl eax,1
        or ecx,eax
        mov eax,edx
        shl eax,24 // Intensity
        and eax,0x0E000000
        or ecx,eax
        shr eax,3
        and eax,0x01000000
        or ecx,eax

        mov [edi],ecx
        add edi,4
        // }
        // *

        pop ecx
        dec ecx
        jnz x_loop_2

        add esi,[line]
        add edi,[ext]

        pop ecx
        dec ecx
        jnz y_loop

end_y_loop:
        pop edi
        pop esi
        pop ebx
		mov esp, ebp
		pop ebp
		ret
	}
}

//****************************************************************
// Size: 0, Format: 4
// 2009 ported to NASM - Sergey (Gonetz) Lipski

// asmLoad4bI - expand packed 4-bit texels into 8-bit texels.
//
// Every source dword is byte-swapped (bswap) and each of its eight nibbles n
// is emitted as the byte (n << 4) | n, highest nibble first, so 4 source
// bytes produce 8 destination bytes.
//
//   src    - address of the packed 4-bit source data
//   dst    - address of the 8-bit output (declared int, used as a pointer)
//   wid_64 - number of 8-byte source groups per row (two dwords per pass)
//   height - number of rows to convert
//   line   - byte count added to the source pointer after each row
//   ext    - byte count added to the destination pointer after each row
//
// Rows are processed in pairs: x_loop converts the first row of a pair,
// x_loop_2 converts the second row and reads the two dwords of every 8-byte
// group in swapped order ([esi+4] first, then [esi]).
// wid_64 and height are not checked for zero; the dec/jnz counters would wrap.
extern "C" void  __declspec(naked) asmLoad4bI (wxUIntPtr src, int dst, wxUIntPtr wid_64, int height, int line, int ext)
{
	_asm {
		push ebp
		mov ebp, esp
		push ebx
        push esi
        push edi

        mov esi,[src]           // esi = source read pointer
        mov edi,[dst]           // edi = destination write pointer
        mov ecx,[height]        // ecx = rows remaining
y_loop:
        push ecx                // keep the row counter on the stack
        mov ecx,[wid_64]        // ecx = 8-byte groups remaining in this row
x_loop:
        push ecx                // ecx is reused below as the output accumulator

        mov eax,[esi]           // read all 8 pixels
        bswap eax
        add esi,4
        mov edx,eax             // edx keeps the swapped source dword

        // 1st dword {
        xor ecx,ecx
        shr eax,28              // 0xF0000000 -> 0x0000000F
        or ecx,eax
        shl eax,4               // duplicate the nibble into the upper half of the byte
        or ecx,eax

        mov eax,edx             // 0x0F000000 -> 0x00000F00
        shr eax,16
        and eax,0x00000F00
        or ecx,eax
        shl eax,4
        or ecx,eax

        mov eax,edx
        shr eax,4               // 0x00F00000 -> 0x000F0000
        and eax,0x000F0000
        or ecx,eax
        shl eax,4
        or ecx,eax

        mov eax,edx
        shl eax,8               // 0x000F0000 -> 0x0F000000
        and eax,0x0F000000
        or ecx,eax
        shl eax,4
        or ecx,eax

        mov [edi],ecx           // store texels built from the upper four nibbles
        add edi,4
        // }

        // 2nd dword {
        xor ecx,ecx
        mov eax,edx
        shr eax,12              // 0x0000F000 -> 0x0000000F
        and eax,0x0000000F
        or ecx,eax
        shl eax,4
        or ecx,eax

        mov eax,edx             // 0x00000F00 -> 0x00000F00
        and eax,0x00000F00
        or ecx,eax
        shl eax,4
        or ecx,eax

        mov eax,edx
        shl eax,12              // 0x000000F0 -> 0x000F0000
        and eax,0x000F0000
        or ecx,eax
        shl eax,4
        or ecx,eax

        shl edx,24              // 0x0000000F -> 0x0F000000
        and edx,0x0F000000
        or ecx,edx
        shl edx,4
        or ecx,edx

        mov [edi],ecx           // store texels built from the lower four nibbles
        add edi,4
        // }

        // * copy
        // Same conversion for the second source dword of the 8-byte group.
        mov eax,[esi]           // read all 8 pixels
        bswap eax
        add esi,4
        mov edx,eax

        // 1st dword {
        xor ecx,ecx
        shr eax,28              // 0xF0000000 -> 0x0000000F
        or ecx,eax
        shl eax,4
        or ecx,eax

        mov eax,edx             // 0x0F000000 -> 0x00000F00
        shr eax,16
        and eax,0x00000F00
        or ecx,eax
        shl eax,4
        or ecx,eax

        mov eax,edx
        shr eax,4               // 0x00F00000 -> 0x000F0000
        and eax,0x000F0000
        or ecx,eax
        shl eax,4
        or ecx,eax

        mov eax,edx
        shl eax,8               // 0x000F0000 -> 0x0F000000
        and eax,0x0F000000
        or ecx,eax
        shl eax,4
        or ecx,eax

        mov [edi],ecx
        add edi,4
        // }

        // 2nd dword {
        xor ecx,ecx
        mov eax,edx
        shr eax,12              // 0x0000F000 -> 0x0000000F
        and eax,0x0000000F
        or ecx,eax
        shl eax,4
        or ecx,eax

        mov eax,edx             // 0x00000F00 -> 0x00000F00
        and eax,0x00000F00
        or ecx,eax
        shl eax,4
        or ecx,eax

        mov eax,edx
        shl eax,12              // 0x000000F0 -> 0x000F0000
        and eax,0x000F0000
        or ecx,eax
        shl eax,4
        or ecx,eax

        shl edx,24              // 0x0000000F -> 0x0F000000
        and edx,0x0F000000
        or ecx,edx
        shl edx,4
        or ecx,edx

        mov [edi],ecx
        add edi,4
        // }
        // *

        pop ecx                 // restore the group counter
        dec ecx
        jnz x_loop

        pop ecx                 // restore the row counter
        dec ecx
        jz near end_y_loop      // no second row in this pair: finished
        push ecx

        add esi,[line]          // advance both pointers to the next row
        add edi,[ext]

        mov ecx,[wid_64]
x_loop_2:
        push ecx

        // Second row of the pair: the dword at +4 is converted first.
        mov eax,[esi+4]         // read all 8 pixels
        bswap eax
        mov edx,eax

        // 1st dword {
        xor ecx,ecx
        shr eax,28              // 0xF0000000 -> 0x0000000F
        or ecx,eax
        shl eax,4
        or ecx,eax

        mov eax,edx             // 0x0F000000 -> 0x00000F00
        shr eax,16
        and eax,0x00000F00
        or ecx,eax
        shl eax,4
        or ecx,eax

        mov eax,edx
        shr eax,4               // 0x00F00000 -> 0x000F0000
        and eax,0x000F0000
        or ecx,eax
        shl eax,4
        or ecx,eax

        mov eax,edx
        shl eax,8               // 0x000F0000 -> 0x0F000000
        and eax,0x0F000000
        or ecx,eax
        shl eax,4
        or ecx,eax

        mov [edi],ecx
        add edi,4
        // }

        // 2nd dword {
        xor ecx,ecx
        mov eax,edx
        shr eax,12              // 0x0000F000 -> 0x0000000F
        and eax,0x0000000F
        or ecx,eax
        shl eax,4
        or ecx,eax

        mov eax,edx             // 0x00000F00 -> 0x00000F00
        and eax,0x00000F00
        or ecx,eax
        shl eax,4
        or ecx,eax

        mov eax,edx
        shl eax,12              // 0x000000F0 -> 0x000F0000
        and eax,0x000F0000
        or ecx,eax
        shl eax,4
        or ecx,eax

        shl edx,24              // 0x0000000F -> 0x0F000000
        and edx,0x0F000000
        or ecx,edx
        shl edx,4
        or ecx,edx

        mov [edi],ecx
        add edi,4
        // }

        // * copy
        // Then the dword at +0; esi moves past the whole 8-byte group.
        mov eax,[esi]           // read all 8 pixels
        bswap eax
        add esi,8
        mov edx,eax

        // 1st dword {
        xor ecx,ecx
        shr eax,28              // 0xF0000000 -> 0x0000000F
        or ecx,eax
        shl eax,4
        or ecx,eax

        mov eax,edx             // 0x0F000000 -> 0x00000F00
        shr eax,16
        and eax,0x00000F00
        or ecx,eax
        shl eax,4
        or ecx,eax

        mov eax,edx
        shr eax,4               // 0x00F00000 -> 0x000F0000
        and eax,0x000F0000
        or ecx,eax
        shl eax,4
        or ecx,eax

        mov eax,edx
        shl eax,8               // 0x000F0000 -> 0x0F000000
        and eax,0x0F000000
        or ecx,eax
        shl eax,4
        or ecx,eax

        mov [edi],ecx
        add edi,4
        // }

        // 2nd dword {
        xor ecx,ecx
        mov eax,edx
        shr eax,12              // 0x0000F000 -> 0x0000000F
        and eax,0x0000000F
        or ecx,eax
        shl eax,4
        or ecx,eax

        mov eax,edx             // 0x00000F00 -> 0x00000F00
        and eax,0x00000F00
        or ecx,eax
        shl eax,4
        or ecx,eax

        mov eax,edx
        shl eax,12              // 0x000000F0 -> 0x000F0000
        and eax,0x000F0000
        or ecx,eax
        shl eax,4
        or ecx,eax

        shl edx,24              // 0x0000000F -> 0x0F000000
        and edx,0x0F000000
        or ecx,edx
        shl edx,4
        or ecx,edx

        mov [edi],ecx
        add edi,4
        // }
        // *

        pop ecx                 // restore the group counter
        dec ecx
        jnz x_loop_2

        add esi,[line]          // advance both pointers to the next row pair
        add edi,[ext]

        pop ecx                 // restore the row counter
        dec ecx
        jnz y_loop

end_y_loop:
        // Stack is balanced here on both paths; restore callee-saved registers.
        pop edi
        pop esi
        pop ebx
		mov esp, ebp
		pop ebp
		ret
	}
}
//****************************************************************
//8b textures load
//****************************************************************

//****************************************************************
// Size: 1, Format: 2
//
// 2008.03.29 cleaned up - H.Morii
// 2009 ported to NASM - Sergey (Gonetz) Lipski

// asmLoad8bCI - convert 8-bit palette-indexed texels into 16-bit texels.
//
// Each source byte i selects the 16-bit entry at pal + i*2. The entry is
// rotated right by one bit (ror cx,1) and written to the output, so every
// source dword (4 texels) produces two output dwords.
// NOTE(review): the 1-bit rotate presumably converts RGBA5551 palette
// entries to ARGB1555 - confirm against the palette format.
//
//   src    - address of the 8-bit index data
//   dst    - address of the 16-bit output
//   wid_64 - number of 8-byte source groups per row
//   height - number of rows to convert
//   line   - byte count added to the source offset after each row
//   ext    - byte count added to the destination pointer after each row
//   pal    - address of the table of 16-bit palette entries
//
// Rows are processed in pairs; in the second row of a pair (x_loop_2) the two
// dwords of every 8-byte group are read in swapped order. When moving between
// rows, and after each group of the second row, the source offset relative to
// src is wrapped with "& 0x7FF", i.e. kept inside a 2 KB window.
// wid_64 and height are not checked for zero; the dec/jnz counters would wrap.
extern "C" void  __declspec(naked) asmLoad8bCI (wxUIntPtr src, wxUIntPtr dst, int wid_64, int height, int line, int ext, wxUIntPtr pal)
{
	_asm {
		push ebp
		mov ebp, esp
		push ebx
        push esi
        push edi

        mov ebx,[pal]           // ebx = palette base
        mov esi,[src]           // esi = source read pointer
        mov edi,[dst]           // edi = destination write pointer
        mov ecx,[height]        // ecx = rows remaining
y_loop:
        push ecx                // keep the row counter on the stack
        mov ecx,[wid_64]
x_loop:
        push ecx                // ecx is reused below to assemble output dwords

        mov eax,[esi]           // read all 4 pixels
        bswap eax               // first byte in memory is now in bits 31..24
        add esi,4
        mov edx,eax

        // 1st dword output {
        shr eax,15              // bits 23..16 (2nd texel) * 2 = palette byte offset
        and eax,0x1FE
        mov cx,[ebx+eax]
        ror cx,1
        shl ecx,16              // 2nd texel goes to the high half of the output

        mov eax,edx
        shr eax,23              // bits 31..24 (1st texel) * 2
        and eax,0x1FE
        mov cx,[ebx+eax]
        ror cx,1                // 1st texel occupies the low half

        mov [edi],ecx
        add edi,4
        // }

        // 2nd dword output {
        mov eax,edx
        shl eax,1               // bits 7..0 (4th texel) * 2
        and eax,0x1FE
        mov cx,[ebx+eax]
        ror cx,1
        shl ecx,16

        shr edx,7               // bits 15..8 (3rd texel) * 2
        and edx,0x1FE
        mov cx,[ebx+edx]
        ror cx,1

        mov [edi],ecx
        add edi,4
        // }

        // * copy
        // Same lookup for the second source dword of the 8-byte group.
        mov eax,[esi]           // read all 4 pixels
        bswap eax
        add esi,4
        mov edx,eax

        // 1st dword output {
        shr eax,15
        and eax,0x1FE
        mov cx,[ebx+eax]
        ror cx,1
        shl ecx,16

        mov eax,edx
        shr eax,23
        and eax,0x1FE
        mov cx,[ebx+eax]
        ror cx,1

        mov [edi],ecx
        add edi,4
        // }

        // 2nd dword output {
        mov eax,edx
        shl eax,1
        and eax,0x1FE
        mov cx,[ebx+eax]
        ror cx,1
        shl ecx,16

        shr edx,7
        and edx,0x1FE
        mov cx,[ebx+edx]
        ror cx,1

        mov [edi],ecx
        add edi,4
        // }
        // *

        pop ecx                 // restore the group counter

        dec ecx
        jnz x_loop

        pop ecx                 // restore the row counter
        dec ecx
        jz near end_y_loop      // no second row in this pair: finished
        push ecx

        // esi = src + ((esi + line - src) & 0x7FF): next row, wrapped at 2 KB
        mov eax,esi
        add eax,[line]
        mov esi,[src]
        sub eax,esi
        and eax,0x7FF
        add esi,eax
        add edi,[ext]

        mov ecx,[wid_64]
x_loop_2:
        push ecx

        // Second row of the pair: the dword at +4 is converted first.
        mov eax,[esi+4]         // read all 4 pixels
        bswap eax
        mov edx,eax

        // 1st dword output {
        shr eax,15
        and eax,0x1FE
        mov cx,[ebx+eax]
        ror cx,1
        shl ecx,16

        mov eax,edx
        shr eax,23
        and eax,0x1FE
        mov cx,[ebx+eax]
        ror cx,1

        mov [edi],ecx
        add edi,4
        // }

        // 2nd dword output {
        mov eax,edx
        shl eax,1
        and eax,0x1FE
        mov cx,[ebx+eax]
        ror cx,1
        shl ecx,16

        shr edx,7
        and edx,0x1FE
        mov cx,[ebx+edx]
        ror cx,1

        mov [edi],ecx
        add edi,4
        // }

        // * copy
        mov eax,[esi]           // read all 4 pixels
        bswap eax
        // esi = src + ((esi + 8 - src) & 0x7FF): next group, wrapped at 2 KB
        mov edx,esi
        add edx,8
        mov esi,[src]
        sub edx,esi
        and edx,0x7FF
        add esi,edx
        mov edx,eax

        // 1st dword output {
        shr eax,15
        and eax,0x1FE
        mov cx,[ebx+eax]
        ror cx,1
        shl ecx,16

        mov eax,edx
        shr eax,23
        and eax,0x1FE
        mov cx,[ebx+eax]
        ror cx,1

        mov [edi],ecx
        add edi,4
        // }

        // 2nd dword output {
        mov eax,edx
        shl eax,1
        and eax,0x1FE
        mov cx,[ebx+eax]
        ror cx,1
        shl ecx,16

        shr edx,7
        and edx,0x1FE
        mov cx,[ebx+edx]
        ror cx,1

        mov [edi],ecx
        add edi,4
        // }
        // *

        pop ecx                 // restore the group counter

        dec ecx
        jnz x_loop_2

        // esi = src + ((esi + line - src) & 0x7FF): next row pair, wrapped at 2 KB
        mov eax,esi
        add eax,[line]
        mov esi,[src]
        sub eax,esi
        and eax,0x7FF
        add esi,eax
        add edi,[ext]

        pop ecx                 // restore the row counter
        dec ecx
        jnz y_loop

end_y_loop:
        // Stack is balanced here on both paths; restore callee-saved registers.
        pop edi
        pop esi
        pop ebx
		mov esp, ebp
		pop ebp
		ret
	}
}

// asmLoad8bIA8 - convert 8-bit palette-indexed texels into 16-bit texels,
// swapping the two bytes of every palette entry.
//
// Each source byte i selects the 16-bit entry at pal + i*2. The entry is
// rotated by 8 bits (ror cx,8), which exchanges its two bytes, and written to
// the output, so every source dword (4 texels) produces two output dwords.
//
//   src    - address of the 8-bit index data
//   dst    - address of the 16-bit output
//   wid_64 - number of 8-byte source groups per row
//   height - number of rows to convert
//   line   - byte count added to the source pointer after each row
//   ext    - byte count added to the destination pointer after each row
//   pal    - address of the table of 16-bit palette entries
//
// Rows are processed in pairs; in the second row of a pair (x_loop_2) the two
// dwords of every 8-byte group are read in swapped order. Unlike asmLoad8bCI,
// the source pointer is advanced without any wrap-around.
// wid_64 and height are not checked for zero; the dec/jnz counters would wrap.
extern "C" void  __declspec(naked) asmLoad8bIA8 (wxUIntPtr src, wxUIntPtr dst, int wid_64, int height, int line, int ext, wxUIntPtr pal)
{
	_asm {
		push ebp
		mov ebp, esp
        push ebx
        push esi
        push edi

        mov ebx,[pal]           // ebx = palette base
        mov esi,[src]           // esi = source read pointer
        mov edi,[dst]           // edi = destination write pointer
        mov ecx,[height]        // ecx = rows remaining
y_loop:
        push ecx                // keep the row counter on the stack
        mov ecx,[wid_64]
x_loop:
        push ecx                // ecx is reused below to assemble output dwords

        mov eax,[esi]           // read all 4 pixels
        bswap eax               // first byte in memory is now in bits 31..24
        add esi,4
        mov edx,eax

        // 1st dword output {
        shr eax,15              // bits 23..16 (2nd texel) * 2 = palette byte offset
        and eax,0x1FE
        mov cx,[ebx+eax]
        ror cx,8                // swap the two bytes of the entry
        shl ecx,16              // 2nd texel goes to the high half of the output

        mov eax,edx
        shr eax,23              // bits 31..24 (1st texel) * 2
        and eax,0x1FE
        mov cx,[ebx+eax]
        ror cx,8                // 1st texel occupies the low half

        mov [edi],ecx
        add edi,4
        // }

        // 2nd dword output {
        mov eax,edx
        shl eax,1               // bits 7..0 (4th texel) * 2
        and eax,0x1FE
        mov cx,[ebx+eax]
        ror cx,8
        shl ecx,16

        shr edx,7               // bits 15..8 (3rd texel) * 2
        and edx,0x1FE
        mov cx,[ebx+edx]
        ror cx,8

        mov [edi],ecx
        add edi,4
        // }

        // * copy
        // Same lookup for the second source dword of the 8-byte group.
        mov eax,[esi]           // read all 4 pixels
        bswap eax
        add esi,4
        mov edx,eax

        // 1st dword output {
        shr eax,15
        and eax,0x1FE
        mov cx,[ebx+eax]
        ror cx,8
        shl ecx,16

        mov eax,edx
        shr eax,23
        and eax,0x1FE
        mov cx,[ebx+eax]
        ror cx,8

        mov [edi],ecx
        add edi,4
        // }

        // 2nd dword output {
        mov eax,edx
        shl eax,1
        and eax,0x1FE
        mov cx,[ebx+eax]
        ror cx,8
        shl ecx,16

        shr edx,7
        and edx,0x1FE
        mov cx,[ebx+edx]
        ror cx,8

        mov [edi],ecx
        add edi,4
        // }
        // *

        pop ecx                 // restore the group counter

        dec ecx
        jnz x_loop

        pop ecx                 // restore the row counter
        dec ecx
        jz near end_y_loop      // no second row in this pair: finished
        push ecx

        add esi,[line]          // advance both pointers to the next row
        add edi,[ext]

        mov ecx,[wid_64]
x_loop_2:
        push ecx

        // Second row of the pair: the dword at +4 is converted first.
        mov eax,[esi+4]         // read all 4 pixels
        bswap eax
        mov edx,eax

        // 1st dword output {
        shr eax,15
        and eax,0x1FE
        mov cx,[ebx+eax]
        ror cx,8
        shl ecx,16

        mov eax,edx
        shr eax,23
        and eax,0x1FE
        mov cx,[ebx+eax]
        ror cx,8

        mov [edi],ecx
        add edi,4
        // }

        // 2nd dword output {
        mov eax,edx
        shl eax,1
        and eax,0x1FE
        mov cx,[ebx+eax]
        ror cx,8
        shl ecx,16

        shr edx,7
        and edx,0x1FE
        mov cx,[ebx+edx]
        ror cx,8

        mov [edi],ecx
        add edi,4
        // }

        // * copy
        // Then the dword at +0; esi moves past the whole 8-byte group.
        mov eax,[esi]           // read all 4 pixels
        bswap eax
        add esi,8
        mov edx,eax

        // 1st dword output {
        shr eax,15
        and eax,0x1FE
        mov cx,[ebx+eax]
        ror cx,8
        shl ecx,16

        mov eax,edx
        shr eax,23
        and eax,0x1FE
        mov cx,[ebx+eax]
        ror cx,8

        mov [edi],ecx
        add edi,4
        // }

        // 2nd dword output {
        mov eax,edx
        shl eax,1
        and eax,0x1FE
        mov cx,[ebx+eax]
        ror cx,8
        shl ecx,16

        shr edx,7
        and edx,0x1FE
        mov cx,[ebx+edx]
        ror cx,8

        mov [edi],ecx
        add edi,4
        // }
        // *

        pop ecx                 // restore the group counter

        dec ecx
        jnz x_loop_2

        add esi,[line]          // advance both pointers to the next row pair
        add edi,[ext]

        pop ecx                 // restore the row counter
        dec ecx
        jnz y_loop

end_y_loop:
        // Stack is balanced here on both paths; restore callee-saved registers.
        pop edi
        pop esi
        pop ebx
		mov esp, ebp
		pop ebp
		ret
	}
}

//****************************************************************
// Size: 1, Format: 3
//
// ** by Gugaman **
//
// 2008.03.29 cleaned up - H.Morii
// 2009 ported to NASM - Sergey (Gonetz) Lipski

// asmLoad8bIA4 - copy 8-bit texels while exchanging the two nibbles of every byte.
//
//   src/dst - source and destination addresses
//   wid_64  - 8-byte groups per row
//   height  - rows to convert
//   line    - bytes added to the source pointer after each row
//   ext     - bytes added to the destination pointer after each row
//
// Rows come in pairs; the second row of a pair reads the two dwords of each
// 8-byte group in swapped order. Zero counts are not checked.
extern "C" void  __declspec(naked) asmLoad8bIA4 (wxUIntPtr src, wxUIntPtr dst, int wid_64, int height, int line, int ext)
{
	_asm {
		push ebp
		mov ebp, esp
		push ebx
		push esi
		push edi

		mov esi,[src]               // esi = read cursor
		mov edi,[dst]               // edi = write cursor
		mov ebx,[height]            // ebx = rows still to do

ia4_even_row:
		mov ecx,[wid_64]            // ecx = 8-byte groups left in this row
ia4_even_group:
		mov edx,[esi]               // four texels
		mov eax,edx
		and edx,0x0F0F0F0F          // low nibbles ...
		and eax,0xF0F0F0F0          // ... and high nibbles
		shl edx,4                   // low nibble moves up
		shr eax,4                   // high nibble moves down
		or eax,edx
		mov [edi],eax

		mov edx,[esi+4]             // next four texels
		mov eax,edx
		and edx,0x0F0F0F0F
		and eax,0xF0F0F0F0
		shl edx,4
		shr eax,4
		or eax,edx
		mov [edi+4],eax

		lea esi,[esi+8]
		lea edi,[edi+8]
		dec ecx
		jnz ia4_even_group

		dec ebx
		jz ia4_finished             // odd number of rows: nothing left

		add esi,[line]
		add edi,[ext]

		mov ecx,[wid_64]
ia4_odd_group:
		mov edx,[esi+4]             // second dword of the group goes out first
		mov eax,edx
		and edx,0x0F0F0F0F
		and eax,0xF0F0F0F0
		shl edx,4
		shr eax,4
		or eax,edx
		mov [edi],eax

		mov edx,[esi]               // then the first dword
		mov eax,edx
		and edx,0x0F0F0F0F
		and eax,0xF0F0F0F0
		shl edx,4
		shr eax,4
		or eax,edx
		mov [edi+4],eax

		lea esi,[esi+8]
		lea edi,[edi+8]
		dec ecx
		jnz ia4_odd_group

		add esi,[line]
		add edi,[ext]

		dec ebx
		jnz ia4_even_row

ia4_finished:
		pop edi
		pop esi
		pop ebx
		mov esp, ebp
		pop ebp
		ret
	}
}

//****************************************************************
// Size: 1, Format: 4
//
// ** by Gugaman **
// 2009 ported to NASM - Sergey (Gonetz) Lipski

// asmLoad8bI - copy 8-bit texels unchanged, eight bytes per inner pass.
//
//   src    - source address
//   dst    - destination address (declared int, used as a pointer)
//   wid_64 - 8-byte groups per row
//   height - rows to copy
//   line   - bytes added to the source pointer after each row
//   ext    - bytes added to the destination pointer after each row
//
// Rows come in pairs; the second row of a pair copies the two dwords of each
// 8-byte group in swapped order. Zero counts are not checked.
extern "C" void  __declspec(naked) asmLoad8bI (wxUIntPtr src, int dst, wxUIntPtr wid_64, int height, int line, int ext)
{
	_asm {
		push ebp
		mov ebp, esp
		push ebx
		push esi
		push edi

		mov esi,[src]               // esi = read cursor
		mov edi,[dst]               // edi = write cursor
		mov ebx,[height]            // ebx = rows still to do

i8_even_row:
		mov ecx,[wid_64]            // ecx = 8-byte groups left in this row
i8_even_group:
		mov eax,[esi]               // first four texels, copied as-is
		mov [edi],eax
		mov edx,[esi+4]             // next four texels
		mov [edi+4],edx
		add esi,8
		add edi,8
		dec ecx
		jnz i8_even_group

		dec ebx
		jz i8_finished              // odd number of rows: nothing left

		add esi,[line]
		add edi,[ext]

		mov ecx,[wid_64]
i8_odd_group:
		mov eax,[esi+4]             // second dword of the group goes out first
		mov [edi],eax
		mov edx,[esi]               // then the first dword
		mov [edi+4],edx
		add esi,8
		add edi,8
		dec ecx
		jnz i8_odd_group

		add esi,[line]
		add edi,[ext]

		dec ebx
		jnz i8_even_row

i8_finished:
		pop edi
		pop esi
		pop ebx
		mov esp, ebp
		pop ebp
		ret
	}
}


//****************************************************************
//16b textures load
//****************************************************************

//****************************************************************
// Size: 2, Format: 0
//
// 2008.03.29 cleaned up - H.Morii
// 2009 ported to NASM - Sergey (Gonetz) Lipski

// asmLoad16bRGBA - convert big-endian 16-bit texels, rotating each right by one bit.
//
// Every source dword holds two texels. After bswap, each 16-bit half is
// rotated right by 1 and the halves are exchanged, so the texel that came
// first in memory lands in the low half of the output dword.
//
//   src/dst - source and destination addresses
//   wid_64  - 8-byte groups (four texels) per row
//   height  - rows to convert
//   line    - bytes added to the source offset after each row
//   ext     - bytes added to the destination pointer after each row
//
// Rows come in pairs; the second row of a pair reads the two dwords of each
// group in swapped order. Between rows the source offset relative to src is
// wrapped with "& 0xFFF". Zero counts are not checked.
extern "C" void __declspec(naked) asmLoad16bRGBA (wxUIntPtr src, wxUIntPtr dst, int wid_64, int height, int line, int ext)
{
	_asm {
		align 4
		push ebp
		mov ebp,esp
		push ebx
		push esi
		push edi

		mov esi,[src]               // esi = read cursor
		mov edi,[dst]               // edi = write cursor
		mov ebx,[height]            // ebx = rows still to do

rgba_even_row:
		mov ecx,[wid_64]            // ecx = 8-byte groups left in this row
rgba_even_group:
		mov eax,[esi]               // texels 0 and 1
		mov edx,[esi+4]             // texels 2 and 3
		bswap eax
		bswap edx

		ror ax,1                    // second texel of the pair
		rol eax,16                  // exchange halves
		ror ax,1                    // first texel of the pair
		ror dx,1
		rol edx,16
		ror dx,1

		mov [edi],eax
		mov [edi+4],edx
		add esi,8
		add edi,8

		dec ecx
		jnz rgba_even_group

		dec ebx
		jz rgba_finished            // odd number of rows: nothing left

		add esi,[line]              // esi = src + ((esi + line - src) & 0xFFF)
		sub esi,[src]
		and esi,0xFFF
		add esi,[src]
		add edi,[ext]

		mov ecx,[wid_64]
rgba_odd_group:
		mov eax,[esi+4]             // second dword of the group goes out first
		mov edx,[esi]               // then the first dword
		bswap eax
		bswap edx

		ror ax,1
		rol eax,16
		ror ax,1
		ror dx,1
		rol edx,16
		ror dx,1

		mov [edi],eax
		mov [edi+4],edx
		add esi,8
		add edi,8

		dec ecx
		jnz rgba_odd_group

		add esi,[line]              // same wrapped row advance as above
		sub esi,[src]
		and esi,0xFFF
		add esi,[src]
		add edi,[ext]

		dec ebx
		jnz rgba_even_row

rgba_finished:
		pop edi
		pop esi
		pop ebx
		mov esp, ebp
		pop ebp
		ret
	}
}


//****************************************************************
// Size: 2, Format: 3
//
// ** by Gugaman/Dave2001 **
//
// 2008.03.29 cleaned up - H.Morii
// 2009 ported to NASM - Sergey (Gonetz) Lipski

// asmLoad16bIA - copy 16-bit texels unchanged, eight bytes per inner pass.
//
//   src/dst - source and destination addresses
//   wid_64  - 8-byte groups per row
//   height  - rows to copy
//   line    - bytes added to the source pointer after each row
//   ext     - bytes added to the destination pointer after each row
//
// Rows come in pairs; the second row of a pair copies the two dwords of each
// 8-byte group in swapped order. Zero counts are not checked.
extern "C" void  __declspec(naked) asmLoad16bIA (wxUIntPtr src, wxUIntPtr dst, int wid_64, int height, int line, int ext)
{
	_asm {
		ALIGN 4

		push ebp
		mov ebp, esp
		push ebx
		push esi
		push edi

		mov esi,[src]               // esi = read cursor
		mov edi,[dst]               // edi = write cursor
		mov ebx,[height]            // ebx = rows still to do

ia16_even_row:
		mov ecx,[wid_64]            // ecx = 8-byte groups left in this row
ia16_even_group:
		mov eax,[esi]               // load the whole group first ...
		mov edx,[esi+4]
		mov [edi],eax               // ... then store it in the same order
		mov [edi+4],edx
		lea esi,[esi+8]
		lea edi,[edi+8]

		dec ecx
		jnz ia16_even_group

		dec ebx
		jz ia16_finished            // odd number of rows: nothing left

		add esi,[line]
		add edi,[ext]

		mov ecx,[wid_64]
ia16_odd_group:
		mov eax,[esi+4]             // second dword of the group goes out first
		mov edx,[esi]
		mov [edi],eax
		mov [edi+4],edx
		lea esi,[esi+8]
		lea edi,[edi+8]

		dec ecx
		jnz ia16_odd_group

		add esi,[line]
		add edi,[ext]

		dec ebx
		jnz ia16_even_row

ia16_finished:
		pop edi
		pop esi
		pop ebx
		mov esp, ebp
		pop ebp
		ret
	}
}

//****************************************************************
//
//            ******** Textures mirror/clamp/wrap ********
//
//****************************************************************

//****************************************************************
//8b textures mirror/clamp/wrap
//****************************************************************

// asmMirror8bS - fill `count` bytes per row with a horizontally mirrored
// repetition of an 8-bit source row.
//
// For column x (0..count-1) the source byte is
//   tex[mask - (x & mask)]   when ((width + x) & width) != 0   (mirrored run)
//   tex[x & mask]            otherwise                          (forward run)
//
//   tex    - address of the source row (advanced by `full` after each row)
//   start  - address of the first byte to write
//   height - rows to process
//   line   - bytes added to the write pointer after each row
// Zero counts are not checked.
extern "C" void  __declspec(naked) asmMirror8bS (int tex, int start, int width, int height, int mask, int line, int full, int count)
{
	_asm{
		ALIGN 4

		push ebp
		mov ebp, esp
		push ebx
		push esi
		push edi

		mov edi,[start]             // edi = write cursor
		mov ecx,[height]            // ecx = rows still to do
m8_row:
		xor edx,edx                 // edx = column index x
m8_col:
		mov eax,edx
		and eax,[mask]              // eax = x & mask (forward offset)

		mov ebx,edx
		add ebx,[width]
		and ebx,[width]             // zero -> forward run
		jz m8_fetch

		neg eax
		add eax,[mask]              // mirrored offset: mask - (x & mask)
m8_fetch:
		mov esi,[tex]
		mov al,[esi+eax]
		mov [edi],al
		inc edi

		inc edx
		cmp edx,[count]
		jne m8_col

		add edi,[line]
		mov eax,[full]
		add [tex],eax               // step the source row

		dec ecx
		jnz m8_row

		pop edi
		pop esi
		pop ebx
		mov esp, ebp
		pop ebp
		ret
	}
}

// asmWrap8bS - fill `count` dwords per row by repeating the source row.
//
// For dword index x (0..count-1) the dword at tex + (x & mask) * 4 is copied
// to the write pointer, so both mask and count are in dword units.
//
//   tex    - address of the source row (advanced by `full` after each row)
//   start  - address of the first dword to write
//   height - rows to process
//   line   - bytes added to the write pointer after each row
// Zero counts are not checked.
extern "C" void  __declspec(naked) asmWrap8bS (int tex, int start, int height, int mask, int line, int full, int count)
{
	_asm {
		align 4
		push ebp
		mov ebp, esp
		push ebx
		push esi
		push edi

		mov edi,[start]             // edi = write cursor
		mov ecx,[height]            // ecx = rows still to do
w8_row:
		mov esi,[tex]               // esi = base of the current source row
		xor edx,edx                 // edx = dword index x
w8_col:
		mov eax,edx
		and eax,[mask]
		mov eax,[esi+eax*4]         // wrapped source dword
		mov [edi],eax
		add edi,4

		inc edx
		cmp edx,[count]
		jne w8_col

		add edi,[line]
		mov eax,[full]
		add [tex],eax               // step the source row

		dec ecx
		jnz w8_row

		pop edi
		pop esi
		pop ebx
		mov esp, ebp
		pop ebp
		ret
	}
}

// asmClamp8bS - extend each row by repeating one 8-bit texel.
//
// For every row the byte at `constant` is read once and written `count`
// times starting at `tex`.
//
//   tex      - address of the first byte to write
//   constant - address of the byte to repeat (advanced by `full` per row)
//   height   - rows to process
//   line     - bytes added to the write pointer after each row
// Zero counts are not checked.
extern "C" void  __declspec(naked) asmClamp8bS (int tex, int constant, int height,int line, int full, int count)
{
	_asm {
		align 4
		push ebp
		mov ebp, esp
		push ebx
		push esi
		push edi

		mov edi,[tex]               // edi = write cursor
		mov esi,[constant]          // esi = address of the texel to repeat
		mov ebx,[height]            // ebx = rows still to do
c8_row:
		mov dl,[esi]                // texel for this row
		mov ecx,[count]
c8_fill:
		// Byte-at-a-time on purpose: count need not be a multiple of two,
		// and a wider store could spill into the next line.
		mov [edi],dl
		inc edi
		dec ecx
		jnz c8_fill

		add edi,[line]
		add esi,[full]

		dec ebx
		jnz c8_row

		pop edi
		pop esi
		pop ebx
		mov esp, ebp
		pop ebp
		ret
	}
}

//****************************************************************
//16b textures mirror/clamp/wrap
//****************************************************************

// asmMirror16bS - fill `count` 16-bit texels per row with a horizontally
// mirrored repetition of the source row.
//
// For column x (0..count-1), with byte offset o = (x * 2) & mask, the source
// texel is read from
//   tex + mask - o   when ((width + x) & width) != 0   (mirrored run)
//   tex + o          otherwise                          (forward run)
//
//   tex    - address of the source row (advanced by `full` after each row)
//   start  - address of the first texel to write
//   height - rows to process
//   line   - bytes added to the write pointer after each row
// Zero counts are not checked.
extern "C" void  __declspec(naked) asmMirror16bS (int tex, int start, int width, int height, int mask, int line, int full, int count)
{
	_asm {
		align 4
		push ebp
		mov ebp, esp
		push ebx
		push esi
		push edi

		mov edi,[start]             // edi = write cursor
		mov ecx,[height]            // ecx = rows still to do
m16_row:
		xor edx,edx                 // edx = column index x
m16_col:
		lea eax,[edx+edx]           // byte offset of texel x
		and eax,[mask]              // eax = forward offset

		mov ebx,edx
		add ebx,[width]
		and ebx,[width]             // zero -> forward run
		jz m16_fetch

		neg eax
		add eax,[mask]              // mirrored offset: mask - forward offset
m16_fetch:
		mov esi,[tex]
		mov ax,[esi+eax]
		mov [edi],ax
		add edi,2

		inc edx
		cmp edx,[count]
		jne m16_col

		add edi,[line]
		mov eax,[full]
		add [tex],eax               // step the source row

		dec ecx
		jnz m16_row

		pop edi
		pop esi
		pop ebx
		mov esp, ebp
		pop ebp
		ret
	}
}

// asmWrap16bS - fill `count` dwords per row by repeating the source row.
//
// For dword index x (0..count-1) the dword at tex + (x & mask) * 4 is copied
// to the write pointer, so both mask and count are in dword units.
//
//   tex    - address of the source row (advanced by `full` after each row)
//   start  - address of the first dword to write
//   height - rows to process
//   line   - bytes added to the write pointer after each row
// Zero counts are not checked.
extern "C" void  __declspec(naked) asmWrap16bS (int tex, int start, int height, int mask, int line, int full, int count)
{
	_asm {
		align 4
		push ebp
		mov ebp, esp
		push ebx
		push esi
		push edi

		mov edi,[start]             // edi = write cursor
		mov ecx,[height]            // ecx = rows still to do
w16_row:
		mov esi,[tex]               // esi = base of the current source row
		xor edx,edx                 // edx = dword index x
w16_col:
		mov eax,edx
		and eax,[mask]
		mov eax,[esi+eax*4]         // wrapped source dword
		mov [edi],eax
		add edi,4

		inc edx
		cmp edx,[count]
		jne w16_col

		add edi,[line]
		mov eax,[full]
		add [tex],eax               // step the source row

		dec ecx
		jnz w16_row

		pop edi
		pop esi
		pop ebx
		mov esp, ebp
		pop ebp
		ret
	}
}

// asmClamp16bS - extend each row by repeating one 16-bit texel.
//
// For every row the word at `constant` is read once and written `count`
// times starting at `tex`.
//
//   tex      - address of the first texel to write
//   constant - address of the texel to repeat (advanced by `full` per row)
//   height   - rows to process
//   line     - bytes added to the write pointer after each row
// Zero counts are not checked.
extern "C" void  __declspec(naked) asmClamp16bS (int tex, int constant, int height,int line, int full, int count)
{
	_asm {
		align 4
		push ebp
		mov ebp, esp
		push ebx
		push esi
		push edi

		mov edi,[tex]               // edi = write cursor
		mov esi,[constant]          // esi = address of the texel to repeat
		mov ebx,[height]            // ebx = rows still to do
c16_row:
		mov dx,[esi]                // texel for this row
		mov ecx,[count]
c16_fill:
		// One texel per store on purpose: count need not be a multiple of
		// two, and a wider store could spill into the next line.
		mov [edi],dx
		add edi,2
		dec ecx
		jnz c16_fill

		add edi,[line]
		add esi,[full]

		dec ebx
		jnz c16_row

		pop edi
		pop esi
		pop ebx
		mov esp, ebp
		pop ebp
		ret
	}
}

//****************************************************************
//32b textures mirror/clamp/wrap
//****************************************************************

// asmMirror32bS - fill `count` 32-bit texels per row with a horizontally
// mirrored repetition of the source row.
//
// For column x (0..count-1), with byte offset o = (x * 4) & mask, the source
// texel is read from
//   tex + mask - o   when ((width + x) & width) != 0   (mirrored run)
//   tex + o          otherwise                          (forward run)
//
//   tex    - address of the source row (advanced by `full` after each row)
//   start  - address of the first texel to write
//   height - rows to process
//   line   - bytes added to the write pointer after each row
// Zero counts are not checked.
extern "C" void  __declspec(naked) asmMirror32bS (int tex, int start, int width, int height, int mask, int line, int full, int count)
{
	_asm {
		align 4
		push ebp
		mov ebp, esp
		push ebx
		push esi
		push edi

		mov edi,[start]             // edi = write cursor
		mov ecx,[height]            // ecx = rows still to do
m32_row:
		xor edx,edx                 // edx = column index x
m32_col:
		lea eax,[edx*4]             // byte offset of texel x
		and eax,[mask]              // eax = forward offset

		mov ebx,edx
		add ebx,[width]
		and ebx,[width]             // zero -> forward run
		jz m32_fetch

		neg eax
		add eax,[mask]              // mirrored offset: mask - forward offset
m32_fetch:
		mov esi,[tex]
		mov eax,[esi+eax]
		mov [edi],eax
		add edi,4

		inc edx
		cmp edx,[count]
		jne m32_col

		add edi,[line]
		mov eax,[full]
		add [tex],eax               // step the source row

		dec ecx
		jnz m32_row

		pop edi
		pop esi
		pop ebx
		mov esp, ebp
		pop ebp
		ret
	}
}

// asmWrap32bS - fill `count` 32-bit texels per row by repeating the source row.
//
// For texel index x (0..count-1) the dword at tex + (x & mask) * 4 is copied
// to the write pointer.
//
//   tex    - address of the source row (advanced by `full` after each row)
//   start  - address of the first texel to write
//   height - rows to process
//   line   - bytes added to the write pointer after each row
// Zero counts are not checked.
extern "C" void  __declspec(naked) asmWrap32bS (int tex, int start, int height, int mask, int line, int full, int count)
{
	_asm {
		align 4
		push ebp
		mov ebp, esp
		push ebx
		push esi
		push edi

		mov edi,[start]             // edi = write cursor
		mov ecx,[height]            // ecx = rows still to do
w32_row:
		mov esi,[tex]               // esi = base of the current source row
		xor edx,edx                 // edx = texel index x
w32_col:
		mov eax,edx
		and eax,[mask]
		mov eax,[esi+eax*4]         // wrapped source texel
		mov [edi],eax
		add edi,4

		inc edx
		cmp edx,[count]
		jne w32_col

		add edi,[line]
		mov eax,[full]
		add [tex],eax               // step the source row

		dec ecx
		jnz w32_row

		pop edi
		pop esi
		pop ebx
		mov esp, ebp
		pop ebp
		ret
	}
}

// asmClamp32bS - extend each row by repeating one 32-bit texel.
//
// For every row the dword at `constant` is read once and written `count`
// times starting at `tex`.
//
//   tex      - address of the first texel to write
//   constant - address of the texel to repeat (advanced by `full` per row)
//   height   - rows to process
//   line     - bytes added to the write pointer after each row
// Zero counts are not checked.
extern "C" void  __declspec(naked) asmClamp32bS (int tex, int constant, int height,int line, int full, int count)
{
	_asm {
		align 4
		push ebp
		mov ebp, esp
		push ebx
		push esi
		push edi

		mov edi,[tex]               // edi = write cursor
		mov esi,[constant]          // esi = address of the texel to repeat
		mov ebx,[height]            // ebx = rows still to do
c32_row:
		mov edx,[esi]               // texel for this row
		mov ecx,[count]
c32_fill:
		// Exactly one texel per store; do not unroll, count may be odd and
		// an extra store could spill into the next line.
		mov [edi],edx
		add edi,4
		dec ecx
		jnz c32_fill

		add edi,[line]
		add esi,[full]

		dec ebx
		jnz c32_row

		pop edi
		pop esi
		pop ebx
		mov esp, ebp
		pop ebp
		ret
	}
}

//****************************************************************
//
//             ******** Textures conversion ********
//
//****************************************************************

// asmTexConv_ARGB1555_ARGB4444 - converts ARGB1555 texels to ARGB4444.
//
// src   = address of the source texels
// dst   = address of the destination texels
// isize = number of dwords to convert (two 16-bit texels per dword);
//         the loop is test-at-bottom, so it is expected to be non-zero
//
// Per 16-bit texel:  arrr rrgg gggb bbbb  ->  aaaa rrrr gggg bbbb
// The top four bits of each colour channel are kept and the single alpha bit
// is replicated into all four alpha bits.
extern "C" void  __declspec(naked) asmTexConv_ARGB1555_ARGB4444(wxUIntPtr src, wxUIntPtr dst, int isize)
{
	_asm {
		align 4
		push ebp
		mov ebp, esp
		push ebx
		push esi
		push edi

		mov esi,[src]
		mov edi,[dst]
		mov ecx,[isize]

conv1555_next:
		mov edx,[esi]                   // edx = two source texels
		add esi,4

		mov ebx,edx
		and ebx,0x78007800              // top 4 red bits (bits 14..11)
		shr ebx,3                       // -> bits 11..8

		mov eax,edx
		and eax,0x03c003c0              // top 4 green bits (bits 9..6)
		shr eax,2                       // -> bits 7..4
		or ebx,eax

		mov eax,edx
		and eax,0x001e001e              // top 4 blue bits (bits 4..1)
		shr eax,1                       // -> bits 3..0
		or ebx,eax

		and edx,0x80008000              // isolate the alpha bit (bit 15)
		or ebx,edx                      // alpha -> bit 15
		shr edx,1
		or ebx,edx                      // alpha -> bit 14
		shr edx,1
		or ebx,edx                      // alpha -> bit 13
		shr edx,1
		or ebx,edx                      // alpha -> bit 12

		mov [edi],ebx
		add edi,4

		dec ecx
		jnz conv1555_next

		pop edi
		pop esi
		pop ebx
		mov esp, ebp
		pop ebp
		ret
	}
}

// asmTexConv_AI88_ARGB4444 - converts AI88 texels (8-bit alpha, 8-bit intensity)
// to ARGB4444.
//
// src   = address of the source texels
// dst   = address of the destination texels
// isize = number of dwords to convert (two 16-bit texels per dword);
//         the loop is test-at-bottom, so it is expected to be non-zero
//
// Per 16-bit texel:  aaaa aaaa iiii iiii  ->  aaaa rrrr gggg bbbb
// The top nibble of alpha is kept; the top nibble of intensity is copied into
// the red, green and blue nibbles.
extern "C" void  __declspec(naked) asmTexConv_AI88_ARGB4444(wxUIntPtr src, wxUIntPtr dst, int isize)
{
	_asm {
		align 4
		push ebp
		mov ebp, esp
		push ebx
		push esi
		push edi

		mov esi,[src]
		mov edi,[dst]
		mov ecx,[isize]

convAI88_next:
		mov eax,[esi]                   // eax = two source texels
		add esi,4

		mov ebx,eax
		and ebx,0xF000F000              // ebx = aaaa 0000 0000 0000
		and eax,0x00F000F0              // eax = 0000 0000 iiii 0000
		or ebx,eax                      // ebx = aaaa 0000 iiii 0000  (green)

		mov edx,eax
		shl edx,4                       // edx = 0000 iiii 0000 0000
		or ebx,edx                      // ebx = aaaa iiii iiii 0000  (red)

		shr eax,4                       // eax = 0000 0000 0000 iiii
		or ebx,eax                      // ebx = aaaa iiii iiii iiii  (blue)

		mov [edi],ebx
		add edi,4

		dec ecx
		jnz convAI88_next

		pop edi
		pop esi
		pop ebx
		mov esp, ebp
		pop ebp
		ret
	}
}

// asmTexConv_AI44_ARGB4444 - converts AI44 texels (4-bit alpha, 4-bit intensity)
// to ARGB4444.
//
// src   = address of the source texels
// dst   = address of the destination texels
// isize = number of source dwords (four 8-bit texels each); every source dword
//         produces two destination dwords. The loop is test-at-bottom, so
//         isize is expected to be non-zero.
//
// Source dword:       a3 i3 a2 i2 a1 i1 a0 i0        (one nibble each)
// First output dword: a1 i1 i1 i1 a0 i0 i0 i0
// Second output dword:a3 i3 i3 i3 a2 i2 i2 i2
extern "C" void  __declspec(naked) asmTexConv_AI44_ARGB4444(wxUIntPtr src, wxUIntPtr dst, int isize)
{
	_asm {
		align 4
		push ebp
		mov ebp, esp
		push ebx
		push esi
		push edi

		mov esi,[src]
		mov edi,[dst]
		mov ecx,[isize]

convAI44_next:
		mov edx,[esi]                   // edx = a3 i3 a2 i2 a1 i1 a0 i0
		add esi,4

		// ---- texels 1 and 0 -> first output dword ----
		mov eax,edx
		and eax,0x0000FF00              // eax = 0  0  0  0  a1 i1 0  0
		shl eax,16                      // eax = a1 i1 0  0  0  0  0  0
		mov ebx,edx
		and ebx,0x000000FF              // ebx = 0  0  0  0  0  0  a0 i0
		shl ebx,8                       // ebx = 0  0  0  0  a0 i0 0  0
		or ebx,eax                      // ebx = a1 i1 0  0  a0 i0 0  0

		mov eax,ebx
		and eax,0x0F000F00              // eax = 0  i1 0  0  0  i0 0  0
		shr eax,4                       // eax = 0  0  i1 0  0  0  i0 0
		or ebx,eax                      // ebx = a1 i1 i1 0  a0 i0 i0 0
		shr eax,4                       // eax = 0  0  0  i1 0  0  0  i0
		or ebx,eax                      // ebx = a1 i1 i1 i1 a0 i0 i0 i0

		mov [edi],ebx

		// ---- texels 3 and 2 -> second output dword ----
		mov ebx,edx
		and ebx,0xFF000000              // ebx = a3 i3 0  0  0  0  0  0
		shr edx,8                       // edx = 0  0  a3 i3 a2 i2 a1 i1
		and edx,0x0000FF00              // edx = 0  0  0  0  a2 i2 0  0
		or ebx,edx                      // ebx = a3 i3 0  0  a2 i2 0  0

		mov eax,ebx
		and eax,0x0F000F00              // eax = 0  i3 0  0  0  i2 0  0
		shr eax,4                       // eax = 0  0  i3 0  0  0  i2 0
		or ebx,eax                      // ebx = a3 i3 i3 0  a2 i2 i2 0
		shr eax,4                       // eax = 0  0  0  i3 0  0  0  i2
		or ebx,eax                      // ebx = a3 i3 i3 i3 a2 i2 i2 i2

		mov [edi+4],ebx
		add edi,8

		dec ecx
		jnz convAI44_next

		pop edi
		pop esi
		pop ebx
		mov esp, ebp
		pop ebp
		ret
	}
}

// asmTexConv_A8_ARGB4444 - converts 8-bit texels to ARGB4444 by copying the top
// nibble of every source byte into all four nibbles of the output texel.
//
// src   = address of the source texels
// dst   = address of the destination texels
// isize = number of source dwords (four 8-bit texels each); every source dword
//         produces two destination dwords. The loop is test-at-bottom, so
//         isize is expected to be non-zero.
//
// Notation below: one symbol per nibble, aN = top nibble of source byte N,
// '.' = low nibble of a source byte (discarded).
// Source dword:        a3 .  a2 .  a1 .  a0 .
// First output dword:  a1 a1 a1 a1 a0 a0 a0 a0
// Second output dword: a3 a3 a3 a3 a2 a2 a2 a2
extern "C" void  __declspec(naked) asmTexConv_A8_ARGB4444(wxUIntPtr src, wxUIntPtr dst, int isize)
{
	_asm {
		align 4
		push ebp
		mov ebp, esp
		push ebx
		push esi
		push edi

		mov esi,[src]
		mov edi,[dst]
		mov ecx,[isize]

convA8_next:
		mov edx,[esi]                   // edx = a3 .  a2 .  a1 .  a0 .
		add esi,4

		// ---- texels 1 and 0 -> first output dword ----
		mov eax,edx
		and eax,0x0000F000              // eax = 0  0  0  0  a1 0  0  0
		shl eax,16                      // eax = a1 0  0  0  0  0  0  0
		mov ebx,edx
		and ebx,0x000000F0              // ebx = 0  0  0  0  0  0  a0 0
		shl ebx,8                       // ebx = 0  0  0  0  a0 0  0  0
		or ebx,eax                      // ebx = a1 0  0  0  a0 0  0  0

		mov eax,ebx
		shr eax,4                       // eax = 0  a1 0  0  0  a0 0  0
		or ebx,eax                      // ebx = a1 a1 0  0  a0 a0 0  0
		mov eax,ebx
		shr eax,8                       // eax = 0  0  a1 a1 0  0  a0 a0
		or ebx,eax                      // ebx = a1 a1 a1 a1 a0 a0 a0 a0

		mov [edi],ebx

		// ---- texels 3 and 2 -> second output dword ----
		mov ebx,edx
		and ebx,0xF0000000              // ebx = a3 0  0  0  0  0  0  0
		and edx,0x00F00000              // edx = 0  0  a2 0  0  0  0  0
		shr edx,8                       // edx = 0  0  0  0  a2 0  0  0
		or ebx,edx                      // ebx = a3 0  0  0  a2 0  0  0

		mov eax,ebx
		shr eax,4                       // eax = 0  a3 0  0  0  a2 0  0
		or ebx,eax                      // ebx = a3 a3 0  0  a2 a2 0  0
		mov eax,ebx
		shr eax,8                       // eax = 0  0  a3 a3 0  0  a2 a2
		or ebx,eax                      // ebx = a3 a3 a3 a3 a2 a2 a2 a2

		mov [edi+4],ebx
		add edi,8

		dec ecx
		jnz convA8_next

		pop edi
		pop esi
		pop ebx
		mov esp, ebp
		pop ebp
		ret
	}
}

//****************************************************************
//
//                ******** Tmem functions ********
//
//****************************************************************

//****************************************************************
// CopyBlock - copies a block from base_addr+offset to dest_addr, while unswapping the
//  data within.
//
// Register-based interface (not callable from C):
// edi = dest_addr -> end of dest (advanced by num_words*8 bytes)
// ecx = num_words (number of 8-byte units to copy; 0 = copy nothing)
// esi = base_addr (preserved)
// edx = offset (preserved)
//
// eax and ebx are preserved as well.  ecx is NOT reliably preserved: on return it
// holds num_words when offset is dword-aligned and num_words*8 otherwise, so
// callers should treat it as clobbered.
//
// "Unswapping": every source dword is stored with its four bytes reversed (bswap).
// When offset is not a multiple of 4 the copy starts in the middle of a source
// dword; the leading and trailing partial dwords are then moved byte by byte,
// highest byte of the dword first, so the total is still num_words*8 bytes.
//****************************************************************
void __declspec(naked) CopyBlock ( void )
{
	_asm {
		align 4
		push ebp
		mov ebp, esp
        // save everything that is restored at copyblock_end
        push eax
        push ebx
        push esi
        push edx

        // nothing to do for an empty block
        or ecx,ecx
        jz near copyblock_end

        // keep the original count/base/offset; they are popped again at copyblock_postcopy
        push ecx

        // first, set the source address and check if not on a dword boundary
        push esi
        push edx
        mov ebx,edx
        and edx,0FFFFFFFCh                      // edx = offset rounded down to a dword
        add esi,edx                             // esi = first source dword to read

        and ebx,3                               // ebx = # we DON'T need to copy
        jz copyblock_copy                       // aligned: go straight to the main loop

        mov edx,4                               // edx = # of bytes we DO need to copy
        sub edx,ebx

        // load the first word, accounting for swapping

        mov eax,[esi]
        add esi,4
copyblock_precopy_skip:
        // rotate the unwanted leading bytes out of the way
        rol eax,8
        dec ebx
        jnz copyblock_precopy_skip

copyblock_precopy_copy:
        // each rotate brings the next byte (highest first) into al
        rol eax,8
        mov [edi],al
        inc edi
        dec edx
        jnz copyblock_precopy_copy

        // second half of the first 8-byte unit: one full, byte-reversed dword
        mov eax,[esi]
        add esi,4
        bswap eax
        mov [edi],eax
        add edi,4

        dec ecx         // 1 less word to copy
        jz copyblock_postcopy

copyblock_copy:
        // main loop: 8 bytes (two byte-reversed dwords) per iteration
        mov eax,[esi]
        bswap eax
        mov [edi],eax

        mov eax,[esi+4]
        bswap eax
        mov [edi+4],eax

        add esi,8
        add edi,8

        dec ecx
        jnz copyblock_copy

copyblock_postcopy:
        // restore the original offset, base address and word count
        pop edx
        pop esi
        pop ecx

        // check again if on dword boundary
        mov ebx,edx     // ebx = # we DO need to copy

        and ebx,3
        jz copyblock_end

        // locate the source dword that holds the trailing bytes:
        // base + ((offset + num_words*8) rounded down to a dword)
        shl ecx,3       // ecx = num_words * 8
        add edx,ecx
        and edx,0FFFFFFFCh
        add esi,edx

        mov eax,[esi]

copyblock_postcopy_copy:
        // copy the first (offset & 3) bytes of that dword, highest byte first
        rol eax,8
        mov [edi],al
        inc edi
        dec ebx
        jnz copyblock_postcopy_copy

copyblock_end:
        // restore the caller's edx/esi/ebx/eax saved on entry
        pop edx
        pop esi
        pop ebx
        pop eax
		mov esp, ebp
		pop ebp
		ret
	}
}

extern "C" __declspec(naked) void SwapBlock32 ( void )
{
//****************************************************************
// SwapBlock32 - exchanges the two 32-bit halves of every 64-bit
//  word at addr
//
// Register-based interface (not callable from C):
// ecx = num_words -> 0          (0 on entry = do nothing)
// edi = addr -> end of the swapped area (addr + num_words*8)
// eax, ebx are used as scratch and restored before returning
//****************************************************************
	_asm {
		align 4
		push ebp
		mov ebp, esp
		push eax
		push ebx
		test ecx,ecx
		jz swap32_done
swap32_pair:
		mov ebx,[edi+4]                 // ebx = second dword of the pair
		mov eax,[edi]                   // eax = first dword of the pair
		mov [edi+4],eax
		mov [edi],ebx
		add edi,8
		dec ecx
		jnz swap32_pair
swap32_done:
		pop ebx
		pop eax
		mov esp, ebp
		pop ebp
		ret
	}
}

//****************************************************************
//
//               ******** Load block/tile ********
//
//****************************************************************

// asmLoadBlock - copies cnt 64-bit words from src+off to dst (via CopyBlock) and
// then calls swp on selected runs of the copied words.
//
// src = source base address
// dst = destination address
// off = byte offset added to src
// dxt = value added to a running counter once per 64-bit word; the words
//       reached while the counter's sign bit is set form the runs that are swapped
// cnt = number of 64-bit words to copy (0 = do nothing)
// swp = address of a register-based swap routine.  It is called with
//       edi = address of the first word of the run and ecx = number of words
//       in the run (may be 0), and must return with ecx = 0 and edi advanced
//       past the run, as SwapBlock32 does.
extern "C" __declspec(naked) void asmLoadBlock(int src, int dst, int off, int dxt, int cnt, wxUIntPtr swp)
{
	_asm {
		align 4
		push ebp
		mov ebp, esp

        push ebx
        push esi
        push edi

        // An empty block must not reach the scan below: its loop decrements the
        // remaining count before testing it, so starting from 0 it would wrap
        // around and walk far beyond the destination buffer.
        mov ecx,[cnt]
        test ecx,ecx
        jz loadblock_done

        // copy the data
        mov esi,[src]
        mov edi,[dst]
        mov edx,[off]
        call CopyBlock

        // now swap it
        mov eax,[cnt]   // eax = count remaining
        xor edx,edx         // edx = dxt counter
        mov edi,[dst]
        mov ebx,[dxt]

        xor ecx,ecx     // ecx = how much to copy
dxt_test:
        // skip words while the dxt counter is non-negative
        add edi,8
        dec eax
        jz end_dxt_test
        add edx,ebx
        jns dxt_test

dxt_s_test:
        // count words while the dxt counter is negative
        inc ecx
        dec eax
        jz end_dxt_test
        add edx,ebx
        js dxt_s_test

        // swap this data (ecx set, dst set)
        call [swp] // (ecx reset to 0 after)

        jmp dxt_test  // and repeat

end_dxt_test:
        // swap any remaining data
        call [swp]

loadblock_done:
        pop edi
        pop esi
        pop ebx
		mov esp, ebp
		pop ebp
		ret
	}
}

// asmLoadTile - copies a tile line by line with CopyBlock and runs SwapBlock32
// on every second line (the 2nd, 4th, ... line copied).
//
// src    = source base address
// dst    = destination address of the first line
// width  = number of 64-bit words per line
// height = number of lines (the loop is test-at-bottom, expected non-zero)
// line   = bytes added to the source offset after each line
// off    = byte offset of the first line relative to src
// end    = highest destination address at which a line may still start;
//          loading stops as soon as the destination pointer is above it
extern "C" __declspec(naked) void asmLoadTile(int src, int dst, int width, int height, int line, int off, int end)
{
	_asm {
		align 4
		push ebp
		mov ebp, esp

        push ebx
        push esi
        push edi

        // set initial values
        mov edi,[dst]
        mov ecx,[width]
        mov esi,[src]
        mov edx,[off]
        xor ebx,ebx         // swap this line?
        mov eax,[height]

loadtile_loop:
        cmp [end],edi   // end of tmem: error
        jc loadtile_end

        // copy this line
        push edi            // remember where the line starts, in case it gets swapped
        push ecx            // CopyBlock does not reliably preserve ecx
        call CopyBlock
        pop ecx

        // swap it?
        xor ebx,1           // toggles 0 <-> 1; zero result = this line is swapped
        jnz loadtile_no_swap

        // (ecx set, restore edi)
        pop edi
        push ecx            // SwapBlock32 counts ecx down to 0
        call SwapBlock32
        pop ecx
        jmp loadtile_swap_end
loadtile_no_swap:
        // forget edi, we are already at the next position.
        // This must adjust the full 32-bit esp: a 16-bit "add sp,4" leaves the
        // upper half of esp untouched and corrupts the stack pointer when the
        // low word wraps around.
        add esp,4
loadtile_swap_end:

        add edx,[line]

        dec eax
        jnz loadtile_loop

loadtile_end:

        pop edi
        pop esi
        pop ebx
		mov esp, ebp
		pop ebp
		ret
	}
}


//****************************************************************
//
//               ******** Texture CRC ********
//
//****************************************************************
// asmTextureCRC - computes a 32-bit checksum over a rectangular texture area.
//
// addr   = address of the first texel data
// width  = number of 64-bit units per row
// height = number of rows
// line   = bytes added to the read pointer after each row
// Returns the checksum in eax.
//
// For every 64-bit unit both dwords are added to the running value, which is
// then multiplied by the remaining-units counter (width..1); the high half of
// the 64-bit product is folded back into the low half.  The same multiply/fold
// step is applied once per row with the remaining-rows counter (height..1).
// Both loops are test-at-bottom: width and height are expected to be non-zero.
extern "C" __declspec(naked) int asmTextureCRC(int addr, int width, int height, int line)
{
	_asm {
		align 4
		push ebp
		mov ebp, esp

		push ebx
		push esi
		push edi

		xor eax,eax                     // eax = running checksum / result
		mov edi,[addr]                  // edi = read pointer
		mov ebx,[line]                  // ebx = row padding in bytes
		mov esi,[height]                // esi = rows remaining
crc_row:

		mov ecx,[width]                 // ecx = 64-bit units remaining in this row
crc_unit:

		add eax,[edi]                   // data is consumed 64 bits at a time
		add eax,[edi+4]
		mul ecx                         // edx:eax = checksum * units remaining
		add eax,edx                     // fold the high half back in
		add edi,8

		dec ecx
		jnz crc_unit

		mul esi                         // edx:eax = checksum * rows remaining
		add eax,edx

		add edi,ebx

		dec esi
		jnz crc_row

		pop edi
		pop esi
		pop ebx
		mov esp, ebp
		pop ebp
		ret
	}
}