[glide64] remove 3dmathSIMD.asm.cpp (from Merge Glide64 changes from the previous attempt)

e110f50489
This commit is contained in:
zilmar 2015-10-09 16:18:09 +11:00
parent 1203155d37
commit d0f45f17ab
3 changed files with 212 additions and 550 deletions

View File

@ -204,71 +204,229 @@ void MulMatricesC(float m1[4][4],float m2[4][4],float r[4][4])
// 2008.03.29 H.Morii - added SSE 3DNOW! 3x3 1x3 matrix multiplication // 2008.03.29 H.Morii - added SSE 3DNOW! 3x3 1x3 matrix multiplication
// and 3DNOW! 4x4 4x4 matrix multiplication // and 3DNOW! 4x4 4x4 matrix multiplication
// 2011-01-03 Balrog - removed because it is in NASM format and is not 64-bit compatible
// This will need fixing.
MULMATRIX MulMatrices = MulMatricesC; MULMATRIX MulMatrices = MulMatricesC;
TRANSFORMVECTOR TransformVector = TransformVectorC; TRANSFORMVECTOR TransformVector = TransformVectorC;
TRANSFORMVECTOR InverseTransformVector = InverseTransformVectorC; TRANSFORMVECTOR InverseTransformVector = InverseTransformVectorC;
DOTPRODUCT DotProduct = DotProductC; DOTPRODUCT DotProduct = DotProductC;
NORMALIZEVECTOR NormalizeVector = NormalizeVectorC; NORMALIZEVECTOR NormalizeVector = NormalizeVectorC;
extern "C" void TransformVector3DNOW(float *src, float *dst, float mat[4][4]); void MulMatricesSSE(float m1[4][4],float m2[4][4],float r[4][4])
extern "C" void InverseTransformVector3DNOW(float *src, float *dst, float mat[4][4]); {
extern "C" void MulMatricesSSE(float m1[4][4],float m2[4][4],float r[4][4]); #if defined(__GNUC__) && !defined(NO_ASM) && !defined(NOSSE)
extern "C" void MulMatrices3DNOW(float m1[4][4],float m2[4][4],float r[4][4]); /* [row][col]*/
extern "C" float DotProductSSE3(register float *v1, register float *v2); typedef float v4sf __attribute__ ((vector_size (16)));
extern "C" float DotProduct3DNOW(register float *v1, register float *v2); v4sf row0 = __builtin_ia32_loadups(m2[0]);
extern "C" void NormalizeVectorSSE(float *v); v4sf row1 = __builtin_ia32_loadups(m2[1]);
extern "C" void NormalizeVector3DNOW(float *v); v4sf row2 = __builtin_ia32_loadups(m2[2]);
v4sf row3 = __builtin_ia32_loadups(m2[3]);
for (int i = 0; i < 4; ++i)
{
v4sf leftrow = __builtin_ia32_loadups(m1[i]);
// Fill tmp with four copies of leftrow[0]
v4sf tmp = leftrow;
tmp = _mm_shuffle_ps (tmp, tmp, 0);
// Calculate the four first summands
v4sf destrow = tmp * row0;
// Fill tmp with four copies of leftrow[1]
tmp = leftrow;
tmp = _mm_shuffle_ps (tmp, tmp, 1 + (1 << 2) + (1 << 4) + (1 << 6));
destrow += tmp * row1;
// Fill tmp with four copies of leftrow[2]
tmp = leftrow;
tmp = _mm_shuffle_ps (tmp, tmp, 2 + (2 << 2) + (2 << 4) + (2 << 6));
destrow += tmp * row2;
// Fill tmp with four copies of leftrow[3]
tmp = leftrow;
tmp = _mm_shuffle_ps (tmp, tmp, 3 + (3 << 2) + (3 << 4) + (3 << 6));
destrow += tmp * row3;
__builtin_ia32_storeups(r[i], destrow);
}
#elif !defined(NO_ASM) && !defined(NOSSE)
__asm
{
mov eax, dword ptr [r]
mov ecx, dword ptr [m1]
mov edx, dword ptr [m2]
movaps xmm0,[edx]
movaps xmm1,[edx+16]
movaps xmm2,[edx+32]
movaps xmm3,[edx+48]
// r[0][0],r[0][1],r[0][2],r[0][3]
movaps xmm4,xmmword ptr[ecx]
movaps xmm5,xmm4
movaps xmm6,xmm4
movaps xmm7,xmm4
shufps xmm4,xmm4,00000000b
shufps xmm5,xmm5,01010101b
shufps xmm6,xmm6,10101010b
shufps xmm7,xmm7,11111111b
mulps xmm4,xmm0
mulps xmm5,xmm1
mulps xmm6,xmm2
mulps xmm7,xmm3
addps xmm4,xmm5
addps xmm4,xmm6
addps xmm4,xmm7
movaps xmmword ptr[eax],xmm4
// r[1][0],r[1][1],r[1][2],r[1][3]
movaps xmm4,xmmword ptr[ecx+16]
movaps xmm5,xmm4
movaps xmm6,xmm4
movaps xmm7,xmm4
shufps xmm4,xmm4,00000000b
shufps xmm5,xmm5,01010101b
shufps xmm6,xmm6,10101010b
shufps xmm7,xmm7,11111111b
mulps xmm4,xmm0
mulps xmm5,xmm1
mulps xmm6,xmm2
mulps xmm7,xmm3
addps xmm4,xmm5
addps xmm4,xmm6
addps xmm4,xmm7
movaps xmmword ptr[eax+16],xmm4
// r[2][0],r[2][1],r[2][2],r[2][3]
movaps xmm4,xmmword ptr[ecx+32]
movaps xmm5,xmm4
movaps xmm6,xmm4
movaps xmm7,xmm4
shufps xmm4,xmm4,00000000b
shufps xmm5,xmm5,01010101b
shufps xmm6,xmm6,10101010b
shufps xmm7,xmm7,11111111b
mulps xmm4,xmm0
mulps xmm5,xmm1
mulps xmm6,xmm2
mulps xmm7,xmm3
addps xmm4,xmm5
addps xmm4,xmm6
addps xmm4,xmm7
movaps xmmword ptr[eax+32],xmm4
// r[3][0],r[3][1],r[3][2],r[3][3]
movaps xmm4,xmmword ptr[ecx+48]
movaps xmm5,xmm4
movaps xmm6,xmm4
movaps xmm7,xmm4
shufps xmm4,xmm4,00000000b
shufps xmm5,xmm5,01010101b
shufps xmm6,xmm6,10101010b
shufps xmm7,xmm7,11111111b
mulps xmm4,xmm0
mulps xmm5,xmm1
mulps xmm6,xmm2
mulps xmm7,xmm3
addps xmm4,xmm5
addps xmm4,xmm6
addps xmm4,xmm7
movaps xmmword ptr[eax+48],xmm4
}
#endif // _WIN32
}
extern "C" void DetectSIMD(int function, int * iedx, int * iecx);
void math_init() void math_init()
{ {
#ifndef _DEBUG #ifndef _DEBUG
int iecx = 0, iedx = 0; int IsSSE = FALSE;
int edx, eax;
#if defined(__GNUC__) && !defined(NO_ASM) && !defined(NOSSE)
GLIDE64_TRY GLIDE64_TRY
{ {
DetectSIMD(0x0000001, &iedx, &iecx); #if defined(__x86_64__)
asm volatile(" cpuid; "
: "=a"(eax), "=d"(edx)
: "0"(1)
: "rbx", "rcx"
);
#else
asm volatile(" push %%ebx; "
" push %%ecx; "
" cpuid; "
" pop %%ecx; "
" pop %%ebx; "
: "=a"(eax), "=d"(edx)
: "0"(1)
:
);
#endif
} }
GLIDE64_CATCH GLIDE64_CATCH
{ return; }
// Check for SSE
if (edx & (1 << 25))
IsSSE = TRUE;
#elif !defined(NO_ASM) && !defined(NOSSE)
DWORD dwEdx;
__try
{
__asm
{
mov eax,1
cpuid
mov dwEdx,edx
}
}
__except(EXCEPTION_EXECUTE_HANDLER)
{ {
return; return;
} }
if (iedx & 0x2000000) //SSE
if (dwEdx & (1<<25))
{
if (dwEdx & (1<<24))
{
__try
{
__asm xorps xmm0, xmm0
IsSSE = TRUE;
}
__except(EXCEPTION_EXECUTE_HANDLER)
{
return;
}
}
}
#endif // _WIN32
if (IsSSE)
{ {
MulMatrices = MulMatricesSSE; MulMatrices = MulMatricesSSE;
//InverseTransformVector = InverseTransformVectorSSE;
//NormalizeVector = NormalizeVectorSSE; /* not ready yet */
LOG("SSE detected.\n");
}
if (iedx & 0x4000000) // SSE2
{
LOG("SSE2 detected.\n");
}
if (iecx & 0x1) // SSE3
{
//DotProduct = DotProductSSE3; /* not ready yet */
LOG("SSE3 detected.\n");
}
// the 3dnow version is faster than sse
iecx = 0;
iedx = 0;
GLIDE64_TRY
{
DetectSIMD(0x80000001, &iedx, &iecx);
}
GLIDE64_CATCH
{
return;
}
if (iedx & 0x80000000) //3DNow!
{
MulMatrices = MulMatrices3DNOW;
TransformVector = TransformVector3DNOW;
InverseTransformVector = InverseTransformVector3DNOW;
//DotProduct = DotProduct3DNOW; //not ready yet
NormalizeVector = NormalizeVector3DNOW; // not ready yet
LOG("3DNOW! detected.\n"); LOG("3DNOW! detected.\n");
} }
#endif //_DEBUG #endif //_DEBUG
} }

View File

@ -1,492 +0,0 @@
;/*
;* Glide64 - Glide video plugin for Nintendo 64 emulators.
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
;
;****************************************************************
;
; Glide64 - Glide Plugin for Nintendo 64 emulators
; Project started on December 29th, 2001
;
; Authors:
; Dave2001, original author, founded the project in 2001, left it in 2002
; Gugaman, joined the project in 2002, left it in 2002
; Sergey 'Gonetz' Lipski, joined the project in 2002, main author since fall of 2002
; Hiroshi 'KoolSmoky' Morii, joined the project in 2007
;
;****************************************************************
;
; To modify Glide64:
; * Write your name and (optional)email, commented by your work, so I know who did it, and so that you can find which parts you modified when it comes time to send it to me.
; * Do NOT send me the whole project or file that you modified. Take out your modified code sections, and tell me where to put them. If people sent the whole thing, I would have many different versions, but no idea how to combine them all.
;
;****************************************************************
%include "inc/c32.mac"
segment .text
*/
/*
 * DetectSIMD - execute CPUID for leaf `func` and report two of its result words.
 *
 *   func : CPUID leaf number, loaded into EAX before the instruction.
 *   iedx : receives the EDX result word.
 *   iecx : receives the ECX result word.
 *
 * ABI: 32-bit x86 cdecl, MSVC naked function with a hand-written EBP frame
 * (the parameter names below resolve to EBP-relative stack slots).
 *
 * CPUID overwrites EAX, EBX, ECX and EDX. EBX is callee-saved under cdecl, so
 * it is saved and restored here; the original routine let it be clobbered.
 * ECX is zeroed first so that leaves which take a sub-leaf index always see 0.
 */
extern "C" void __declspec(naked) DetectSIMD(int func, int * iedx, int * iecx)
{
    _asm {
        push    ebp
        mov     ebp,esp
        push    ebx                 ; callee-saved, overwritten by cpuid
        mov     eax,[func]
        xor     ecx,ecx             ; sub-leaf 0
        cpuid
        mov     eax,[iedx]
        mov     [eax],edx           ; *iedx = EDX feature word
        mov     eax,[iecx]
        mov     [eax],ecx           ; *iecx = ECX feature word
        pop     ebx                 ; must come before leave, which resets ESP
        leave
        ret
    }
}
/****************************************************************
;
; ******** SSE ********
;
;****************************************************************/
/*
 * MulMatricesSSE - r = m1 * m2 for 4x4 float matrices indexed [row][col].
 *
 * Every output row is formed as
 *     r[i] = m1[i][0]*m2[0] + m1[i][1]*m2[1] + m1[i][2]*m2[2] + m1[i][3]*m2[3]
 * where m2[k] is a whole row held in one XMM register. The partial products
 * are accumulated in that left-to-right order.
 *
 * ABI: 32-bit x86 cdecl, MSVC naked function with a hand-written EBP frame.
 * Register roles:
 *     eax = r, ecx = m1, edx = m2
 *     xmm0..xmm3 = rows 0..3 of m2 (loaded once, live for the whole routine)
 *     xmm4..xmm7 = per-row scratch (broadcast elements of m1[i] / products)
 * No callee-saved general-purpose register is touched.
 *
 * All memory accesses use movups: the float[4][4] parameters carry no 16-byte
 * alignment guarantee, and movaps raises a fault on a misaligned address.
 *
 * Aliasing: all of m2 is read before the first store, and row i of m1 is read
 * before row i of r is written, so r may be the same matrix as m1 or m2.
 */
extern "C" void __declspec(naked) MulMatricesSSE(float m1[4][4],float m2[4][4],float r[4][4])
{
    __asm
    {
        push    ebp
        mov     ebp,esp

        mov     eax,[r]
        mov     ecx,[m1]
        mov     edx,[m2]

        movups  xmm0,[edx]              ; m2 row 0
        movups  xmm1,[edx+16]           ; m2 row 1
        movups  xmm2,[edx+32]           ; m2 row 2
        movups  xmm3,[edx+48]           ; m2 row 3

        ; r[0][0],r[0][1],r[0][2],r[0][3]
        movups  xmm4,[ecx]
        movaps  xmm5,xmm4
        movaps  xmm6,xmm4
        movaps  xmm7,xmm4
        shufps  xmm4,xmm4,00000000b     ; broadcast m1[0][0]
        shufps  xmm5,xmm5,01010101b     ; broadcast m1[0][1]
        shufps  xmm6,xmm6,10101010b     ; broadcast m1[0][2]
        shufps  xmm7,xmm7,11111111b     ; broadcast m1[0][3]
        mulps   xmm4,xmm0
        mulps   xmm5,xmm1
        mulps   xmm6,xmm2
        mulps   xmm7,xmm3
        addps   xmm4,xmm5
        addps   xmm4,xmm6
        addps   xmm4,xmm7
        movups  [eax],xmm4

        ; r[1][0],r[1][1],r[1][2],r[1][3]
        movups  xmm4,[ecx+16]
        movaps  xmm5,xmm4
        movaps  xmm6,xmm4
        movaps  xmm7,xmm4
        shufps  xmm4,xmm4,00000000b
        shufps  xmm5,xmm5,01010101b
        shufps  xmm6,xmm6,10101010b
        shufps  xmm7,xmm7,11111111b
        mulps   xmm4,xmm0
        mulps   xmm5,xmm1
        mulps   xmm6,xmm2
        mulps   xmm7,xmm3
        addps   xmm4,xmm5
        addps   xmm4,xmm6
        addps   xmm4,xmm7
        movups  [eax+16],xmm4

        ; r[2][0],r[2][1],r[2][2],r[2][3]
        movups  xmm4,[ecx+32]
        movaps  xmm5,xmm4
        movaps  xmm6,xmm4
        movaps  xmm7,xmm4
        shufps  xmm4,xmm4,00000000b
        shufps  xmm5,xmm5,01010101b
        shufps  xmm6,xmm6,10101010b
        shufps  xmm7,xmm7,11111111b
        mulps   xmm4,xmm0
        mulps   xmm5,xmm1
        mulps   xmm6,xmm2
        mulps   xmm7,xmm3
        addps   xmm4,xmm5
        addps   xmm4,xmm6
        addps   xmm4,xmm7
        movups  [eax+32],xmm4

        ; r[3][0],r[3][1],r[3][2],r[3][3]
        movups  xmm4,[ecx+48]
        movaps  xmm5,xmm4
        movaps  xmm6,xmm4
        movaps  xmm7,xmm4
        shufps  xmm4,xmm4,00000000b
        shufps  xmm5,xmm5,01010101b
        shufps  xmm6,xmm6,10101010b
        shufps  xmm7,xmm7,11111111b
        mulps   xmm4,xmm0
        mulps   xmm5,xmm1
        mulps   xmm6,xmm2
        mulps   xmm7,xmm3
        addps   xmm4,xmm5
        addps   xmm4,xmm6
        addps   xmm4,xmm7
        movups  [eax+48],xmm4

        leave
        ret
    }
}
/*
 * NormalizeVectorSSE - scale v in place by an approximate 1/sqrt(x*x+y*y+z*z).
 *
 *   v : four consecutive floats, read and written as one 16-byte block.
 *
 * The squared length is the sum of all four squared lanes, so the result is
 * the 3-component normalisation only when v[3] is 0, as the lane comments
 * below assume. The fourth lane is scaled and written back like the others.
 *
 * The reciprocal square root comes from rsqrtps, which is an approximation
 * rather than a full-precision 1/sqrt.
 * NOTE(review): there is no guard for a zero-length input - confirm callers
 * never pass one, or compare with the C fallback.
 *
 * ABI: 32-bit x86 cdecl, MSVC naked function with a hand-written EBP frame.
 * Uses edx and xmm0..xmm2 only.
 *
 * The load and store use movups: a plain float pointer carries no 16-byte
 * alignment guarantee, and movaps raises a fault on a misaligned address.
 */
extern "C" void __declspec(naked) NormalizeVectorSSE (float *v)
{
    _asm
    {
        push    ebp
        mov     ebp,esp
        mov     edx, [v]
        movups  xmm0, [edx]         ; x y z 0
        movaps  xmm2, xmm0          ; keep the unsquared vector for the final scale
        mulps   xmm0, xmm0          ; x*x y*y z*z 0
        movaps  xmm1, xmm0          ; x*x y*y z*z 0
        shufps  xmm0, xmm1, 0x4e    ; z*z 0 x*x y*y
        addps   xmm0, xmm1          ; x*x+z*z y*y z*z+x*x y*y
        movaps  xmm1, xmm0          ; x*x+z*z y*y z*z+x*x y*y
        shufps  xmm1, xmm1, 0x11    ; y*y z*z+x*x y*y z*z+x*x
        addps   xmm0, xmm1          ; x*x+z*z+y*y in every lane
        rsqrtps xmm0, xmm0          ; approximate 1.0/sqrt(x*x+z*z+y*y)
        mulps   xmm2, xmm0          ; each lane of v times the reciprocal length
        movups  [edx], xmm2
        leave
        ret
    }
}
/*****************************************************************
;
; ******** SSE3 ********
;
;****************************************************************/
/*
 * DotProductSSE3 - return v1[0]*v2[0] + v1[1]*v2[1] + v1[2]*v2[2].
 *
 *   v1, v2 : vectors of at least three floats; exactly 12 bytes of each are
 *            read, with no alignment requirement.
 *
 * ABI: 32-bit x86 cdecl, MSVC naked function with a hand-written EBP frame.
 * A float result must be returned in x87 ST(0) under this convention, so the
 * sum is spilled to a stack slot and loaded with fld. The previous version
 * left the value in XMM0 and returned without setting ST(0).
 *
 * Two further changes from the previous version:
 *   - movaps / mulps-from-memory required 16-byte aligned pointers; the loads
 *     below do not.
 *   - only the first three components contribute to the sum, matching
 *     DotProduct3DNOW; the fourth lane is no longer read or added.
 *
 * Uses eax, edx, xmm0..xmm2 and one x87 stack slot for the return value.
 *
 * NOTE(review): unlike the other routines in this file this definition is not
 * marked extern C, while the declaration in 3dmath.cpp is - confirm which
 * linkage is intended before enabling it.
 */
float __declspec(naked) DotProductSSE3(register float *v1, register float *v2)
{
    _asm {
        push    ebp
        mov     ebp,esp
        sub     esp,4                       ; scratch dword at [ebp-4]
        mov     eax,[v1]
        mov     edx,[v2]
        xorps   xmm0,xmm0                   ; clear upper lanes before partial loads
        xorps   xmm1,xmm1
        movlps  xmm0,qword ptr [eax]        ; 0 0 v1[1] v1[0]
        movlps  xmm1,qword ptr [edx]        ; 0 0 v2[1] v2[0]
        movss   xmm2,dword ptr [eax+8]      ; v1[2]
        mulps   xmm0,xmm1                   ; 0 0 v1[1]*v2[1] v1[0]*v2[0]
        mulss   xmm2,dword ptr [edx+8]      ; v1[2]*v2[2]
        haddps  xmm0,xmm0                   ; lane 0 = v1[0]*v2[0] + v1[1]*v2[1]
        addss   xmm0,xmm2                   ; lane 0 = full 3-component sum
        movss   dword ptr [ebp-4],xmm0
        fld     dword ptr [ebp-4]           ; return value in ST(0)
        leave
        ret
    }
}
/****************************************************************
;
; ******** 3DNOW ********
;
;****************************************************************/
/*
 * TransformVector3DNOW - dst[j] = src[0]*mat[0][j] + src[1]*mat[1][j] + src[2]*mat[2][j]
 * for j = 0..2, using 3DNow! packed-float instructions.
 *
 *   src : three floats are read (one 8-byte load plus one 4-byte load).
 *   dst : three floats are written (one 8-byte store plus one 4-byte store).
 *   mat : the first 16 bytes of rows 0..2 are read; row 3 is not touched.
 *
 * ABI: 32-bit x86 cdecl, MSVC naked function with a hand-written EBP frame.
 * Uses eax (dst), ecx (src), edx (mat) and mm0..mm5. femms is issued on entry
 * and again before returning.
 *
 * Lane comments list the high dword first, then the low dword.
 */
extern "C" void __declspec(naked) TransformVector3DNOW(float *src, float *dst, float mat[4][4])
{
_asm {
push ebp
mov ebp,esp
femms
mov ecx,[src]
mov eax,[dst]
mov edx,[mat]
movq mm0,[ecx] ; src[1] src[0]
movd mm2,[ecx+8] ; 0 src[2]
movq mm1,mm0 ; src[1] src[0]
punpckldq mm0,mm0 ; src[0] src[0]
punpckhdq mm1,mm1 ; src[1] src[1]
punpckldq mm2,mm2 ; src[2] src[2]
movq mm3,mm0 ; src[0] src[0]
movq mm4,mm1 ; src[1] src[1]
movq mm5,mm2 ; src[2] src[2]
pfmul mm0,[edx] ; src[0]*mat[0][1] src[0]*mat[0][0]
pfmul mm3,[edx+8] ; src[0]*mat[0][3] src[0]*mat[0][2] (high half never stored)
pfmul mm1,[edx+16] ; src[1]*mat[1][1] src[1]*mat[1][0]
pfmul mm4,[edx+24] ; src[1]*mat[1][3] src[1]*mat[1][2] (high half never stored)
pfmul mm2,[edx+32] ; src[2]*mat[2][1] src[2]*mat[2][0]
pfmul mm5,[edx+40] ; src[2]*mat[2][3] src[2]*mat[2][2] (high half never stored)
pfadd mm0,mm1 ; src[0]*mat[0][1]+src[1]*mat[1][1] src[0]*mat[0][0]+src[1]*mat[1][0]
pfadd mm3,mm4 ; (unused) src[0]*mat[0][2]+src[1]*mat[1][2]
pfadd mm0,mm2 ; src[0]*mat[0][1]+src[1]*mat[1][1]+src[2]*mat[2][1] src[0]*mat[0][0]+src[1]*mat[1][0]+src[2]*mat[2][0]
pfadd mm3,mm5 ; (unused) src[0]*mat[0][2]+src[1]*mat[1][2]+src[2]*mat[2][2]
movq [eax],mm0 ; mat[0][1]*src[0]+mat[1][1]*src[1]+mat[2][1]*src[2] mat[0][0]*src[0]+mat[1][0]*src[1]+mat[2][0]*src[2]
movd [eax+8],mm3 ; mat[0][2]*src[0]+mat[1][2]*src[1]+mat[2][2]*src[2] (low dword only)
femms
leave
ret
}
}
/*
 * InverseTransformVector3DNOW - dst[i] = mat[i][0]*src[0] + mat[i][1]*src[1] + mat[i][2]*src[2]
 * for i = 0..2, i.e. src is dotted with each of the first three rows of mat.
 *
 *   src : three floats are read (one 8-byte load plus one 4-byte load).
 *   dst : three floats are written (one 8-byte store plus one 4-byte store).
 *   mat : the first 16 bytes of rows 0..2 are read; row 3 is not touched.
 *
 * ABI: 32-bit x86 cdecl, MSVC naked function with a hand-written EBP frame.
 * Uses eax (dst), ecx (src), edx (mat) and mm0..mm7. femms is issued on entry
 * and again before returning.
 *
 * Lane comments list the high dword first, then the low dword. pfacc puts the
 * sum of the two destination lanes in the low dword and the sum of the two
 * source lanes in the high dword.
 */
extern "C" void __declspec(naked) InverseTransformVector3DNOW(float *src, float *dst, float mat[4][4])
{
_asm {
push ebp
mov ebp,esp
femms
mov ecx,[src]
mov eax,[dst]
mov edx,[mat]
movq mm0,[ecx] ; src[1] src[0]
movd mm4,[ecx+8] ; 0 src[2]
movq mm1,mm0 ; src[1] src[0]
pfmul mm0,[edx] ; src[1]*mat[0][1] src[0]*mat[0][0]
movq mm5,mm4 ; 0 src[2]
pfmul mm4,[edx+8] ; 0*mat[0][3] src[2]*mat[0][2]
movq mm2,mm1 ; src[1] src[0]
pfmul mm1,[edx+16] ; src[1]*mat[1][1] src[0]*mat[1][0]
movq mm6,mm5 ; 0 src[2]
pfmul mm5,[edx+24] ; 0*mat[1][3] src[2]*mat[1][2]
movq mm3,mm2 ; src[1] src[0]
pfmul mm2,[edx+32] ; src[1]*mat[2][1] src[0]*mat[2][0]
movq mm7,mm6 ; 0 src[2] (mm7 is not read again in this routine)
pfmul mm6,[edx+40] ; 0*mat[2][3] src[2]*mat[2][2]
pfacc mm0,mm4 ; src[2]*mat[0][2] src[1]*mat[0][1]+src[0]*mat[0][0]
pfacc mm1,mm5 ; src[2]*mat[1][2] src[1]*mat[1][1]+src[0]*mat[1][0]
pfacc mm2,mm6 ; src[2]*mat[2][2] src[1]*mat[2][1]+src[0]*mat[2][0]
pfacc mm0,mm1 ; src[2]*mat[1][2]+src[1]*mat[1][1]+src[0]*mat[1][0] src[2]*mat[0][2]+src[1]*mat[0][1]+src[0]*mat[0][0]
pfacc mm2,mm3 ; src[1]+src[0] (never stored) src[2]*mat[2][2]+src[1]*mat[2][1]+src[0]*mat[2][0]
movq [eax],mm0 ; mat[1][0]*src[0]+mat[1][1]*src[1]+mat[1][2]*src[2] mat[0][0]*src[0]+mat[0][1]*src[1]+mat[0][2]*src[2]
movd [eax+8],mm2 ; mat[2][0]*src[0]+mat[2][1]*src[1]+mat[2][2]*src[2] (low dword only)
femms
leave
ret
}
}
/*
 * MulMatrices3DNOW - r = m1 * m2 for 4x4 float matrices indexed [row][col],
 * using 3DNow! packed-float instructions (two floats per MMX register).
 *
 * The same instruction pattern is repeated once per output row. Within one
 * repetition:
 *     mm0..mm3 end up holding m1[i][0..3], each duplicated into both lanes
 *     mm4..mm7 are loaded with columns 0..1 of m2 rows 0..3
 *     mm7 accumulates r[i][0..1]
 *     mm3 accumulates r[i][2..3] (columns 2..3 of m2 are multiplied straight
 *         from memory)
 * Loads for row i+1 are interleaved with the final adds and stores of row i.
 *
 * ABI: 32-bit x86 cdecl, MSVC naked function with a hand-written EBP frame.
 * Uses eax (r), ecx (m1), edx (m2) and mm0..mm7. femms is issued on entry and
 * again before returning.
 *
 * Aliasing: row i+1 of m1 is loaded before row i of r is stored, so r may be
 * the same matrix as m1. m2 is re-read from memory after earlier rows of r
 * have already been stored, so r must not overlap m2.
 */
extern "C" void __declspec(naked) MulMatrices3DNOW(float m1[4][4],float m2[4][4],float r[4][4])
{
_asm {
push ebp
mov ebp,esp
femms
mov ecx,[m1]
mov eax,[r]
mov edx,[m2]
; ---- output row 0 ----
movq mm0,[ecx] ; m1[0][1] m1[0][0]
movq mm1,[ecx+8] ; m1[0][3] m1[0][2]
movq mm4,[edx] ; m2[0][1] m2[0][0]
punpckhdq mm2,mm0 ; m1[0][1] in the high lane; low lane is stale and is overwritten two steps below
movq mm5,[edx+16] ; m2[1][1] m2[1][0]
punpckhdq mm3,mm1 ; m1[0][3] in the high lane; low lane is stale and is overwritten below
movq mm6,[edx+32] ; m2[2][1] m2[2][0]
punpckldq mm0,mm0 ; m1[0][0] m1[0][0]
punpckldq mm1,mm1 ; m1[0][2] m1[0][2]
pfmul mm4,mm0 ; m1[0][0] * m2[0][0..1]
punpckhdq mm2,mm2 ; m1[0][1] m1[0][1]
pfmul mm0,[edx+8] ; m1[0][0] * m2[0][2..3]
movq mm7,[edx+48] ; m2[3][1] m2[3][0]
pfmul mm5,mm2 ; m1[0][1] * m2[1][0..1]
punpckhdq mm3,mm3 ; m1[0][3] m1[0][3]
pfmul mm2,[edx+24] ; m1[0][1] * m2[1][2..3]
pfmul mm6,mm1 ; m1[0][2] * m2[2][0..1]
pfadd mm5,mm4
pfmul mm1,[edx+40] ; m1[0][2] * m2[2][2..3]
pfadd mm2,mm0
pfmul mm7,mm3 ; m1[0][3] * m2[3][0..1]
pfadd mm6,mm5
pfmul mm3,[edx+56] ; m1[0][3] * m2[3][2..3]
pfadd mm2,mm1
pfadd mm7,mm6 ; r[0][1] r[0][0]
movq mm0,[ecx+16] ; start of row 1: m1[1][1] m1[1][0]
pfadd mm3,mm2 ; r[0][3] r[0][2]
movq mm1,[ecx+24] ; m1[1][3] m1[1][2]
movq [eax],mm7 ; store r[0][0..1]
movq mm4,[edx] ; m2 is re-read here, after r has been written
movq [eax+8],mm3 ; store r[0][2..3]
; ---- output row 1 (same pattern) ----
punpckhdq mm2,mm0
movq mm5,[edx+16]
punpckhdq mm3,mm1
movq mm6,[edx+32]
punpckldq mm0,mm0
punpckldq mm1,mm1
pfmul mm4,mm0
punpckhdq mm2,mm2
pfmul mm0,[edx+8]
movq mm7,[edx+48]
pfmul mm5,mm2
punpckhdq mm3,mm3
pfmul mm2,[edx+24]
pfmul mm6,mm1
pfadd mm5,mm4
pfmul mm1,[edx+40]
pfadd mm2,mm0
pfmul mm7,mm3
pfadd mm6,mm5
pfmul mm3,[edx+56]
pfadd mm2,mm1
pfadd mm7,mm6 ; r[1][1] r[1][0]
movq mm0,[ecx+32] ; start of row 2: m1[2][1] m1[2][0]
pfadd mm3,mm2 ; r[1][3] r[1][2]
movq mm1,[ecx+40] ; m1[2][3] m1[2][2]
movq [eax+16],mm7 ; store r[1][0..1]
movq mm4,[edx]
movq [eax+24],mm3 ; store r[1][2..3]
; ---- output row 2 (same pattern) ----
punpckhdq mm2,mm0
movq mm5,[edx+16]
punpckhdq mm3,mm1
movq mm6,[edx+32]
punpckldq mm0,mm0
punpckldq mm1,mm1
pfmul mm4,mm0
punpckhdq mm2,mm2
pfmul mm0,[edx+8]
movq mm7,[edx+48]
pfmul mm5,mm2
punpckhdq mm3,mm3
pfmul mm2,[edx+24]
pfmul mm6,mm1
pfadd mm5,mm4
pfmul mm1,[edx+40]
pfadd mm2,mm0
pfmul mm7,mm3
pfadd mm6,mm5
pfmul mm3,[edx+56]
pfadd mm2,mm1
pfadd mm7,mm6 ; r[2][1] r[2][0]
movq mm0,[ecx+48] ; start of row 3: m1[3][1] m1[3][0]
pfadd mm3,mm2 ; r[2][3] r[2][2]
movq mm1,[ecx+56] ; m1[3][3] m1[3][2]
movq [eax+32],mm7 ; store r[2][0..1]
movq mm4,[edx]
movq [eax+40],mm3 ; store r[2][2..3]
; ---- output row 3 (same pattern, no following row to prefetch) ----
punpckhdq mm2,mm0
movq mm5,[edx+16]
punpckhdq mm3,mm1
movq mm6,[edx+32]
punpckldq mm0,mm0
punpckldq mm1,mm1
pfmul mm4,mm0
punpckhdq mm2,mm2
pfmul mm0,[edx+8]
movq mm7,[edx+48]
pfmul mm5,mm2
punpckhdq mm3,mm3
pfmul mm2,[edx+24]
pfmul mm6,mm1
pfadd mm5,mm4
pfmul mm1,[edx+40]
pfadd mm2,mm0
pfmul mm7,mm3
pfadd mm6,mm5
pfmul mm3,[edx+56]
pfadd mm2,mm1
pfadd mm7,mm6 ; r[3][1] r[3][0]
pfadd mm3,mm2 ; r[3][3] r[3][2]
movq [eax+48],mm7 ; store r[3][0..1]
movq [eax+56],mm3 ; store r[3][2..3]
femms
leave
ret
}
}
/*
 * DotProduct3DNOW - return v1[0]*v2[0] + v1[1]*v2[1] + v1[2]*v2[2].
 *
 *   v1, v2 : vectors of at least three floats; exactly 12 bytes of each are
 *            read (one 8-byte load plus one 4-byte load).
 *
 * ABI: 32-bit x86 cdecl, MSVC naked function with a hand-written EBP frame.
 * A float result must be returned in x87 ST(0) under this convention. The
 * previous version moved the bit pattern into EAX and returned with the x87
 * stack emptied by femms, so the caller never received the sum. The result is
 * now spilled to a stack slot, femms is issued, and the slot is loaded with
 * fld. The fld has to come after femms because femms marks every x87
 * register empty.
 *
 * The third component is loaded with movd instead of movq: the upper lane was
 * never part of the returned value, and this avoids reading a fourth float.
 *
 * Uses eax, edx, mm0..mm3 and one x87 stack slot for the return value.
 */
extern "C" float __declspec(naked) DotProduct3DNOW(register float *v1, register float *v2)
{
    _asm {
        push    ebp
        mov     ebp,esp
        sub     esp,4                   ; scratch dword at [ebp-4]
        femms
        mov     edx,[v1]
        mov     eax,[v2]
        movq    mm0,[edx]               ; v1[1] v1[0]
        movq    mm3,[eax]               ; v2[1] v2[0]
        pfmul   mm0,mm3                 ; v1[1]*v2[1] v1[0]*v2[0]
        movd    mm2,[edx+8]             ; 0 v1[2]
        movd    mm1,[eax+8]             ; 0 v2[2]
        pfacc   mm0,mm0                 ; both lanes = v1[0]*v2[0] + v1[1]*v2[1]
        pfmul   mm1,mm2                 ; 0 v1[2]*v2[2]
        pfadd   mm0,mm1                 ; low lane = full 3-component sum
        movd    dword ptr [ebp-4],mm0
        femms                           ; leave MMX state before touching x87
        fld     dword ptr [ebp-4]       ; return value in ST(0)
        leave
        ret
    }
}
/*
 * NormalizeVector3DNOW - scale v in place by an approximate
 * 1/sqrt(v[0]*v[0] + v[1]*v[1] + v[2]*v[2]) using 3DNow! instructions.
 *
 *   v : four consecutive floats are read and four are written back (two
 *       8-byte loads, two 8-byte stores). v[3] does not contribute to the
 *       length that is used, but it is multiplied by the same scale factor
 *       and stored.
 *
 * Only the low-precision pfrsqrt estimate is applied; the refinement sequence
 * (pfrsqit1 / pfrcpit2) is present but commented out.
 * NOTE(review): there is no guard for a zero-length input - confirm callers
 * never pass one, or compare with the C fallback.
 *
 * ABI: 32-bit x86 cdecl, MSVC naked function with a hand-written EBP frame.
 * Uses edx and mm0..mm3. femms is issued on entry and again before returning.
 *
 * Lane comments list the high dword first, then the low dword.
 */
extern "C" void __declspec(naked) NormalizeVector3DNOW(float *v)
{
_asm {
push ebp
mov ebp,esp
femms
mov edx,[v]
movq mm0,[edx] ; v[1] v[0]
movq mm3,[edx+8] ; v[3] v[2]
movq mm1,mm0 ; keep v[1] v[0] for the final scale
movq mm2,mm3 ; keep v[3] v[2] for the final scale
pfmul mm0,mm0 ; v[1]*v[1] v[0]*v[0]
pfmul mm3,mm3 ; v[3]*v[3] v[2]*v[2]
pfacc mm0,mm0 ; both lanes = v[0]*v[0] + v[1]*v[1]
pfadd mm0,mm3 ; low lane = v[0]*v[0] + v[1]*v[1] + v[2]*v[2]
;movq mm4,mm0 ; prepare for 24bit precision
;punpckldq mm4,mm4 ; prepare for 24bit precision
pfrsqrt mm0,mm0 ; 15bit precision 1/sqrtf(v), taken from the low lane
;movq mm3,mm0
;pfmul mm0,mm0
;pfrsqit1 mm0,mm4
;pfrcpit2 mm0,mm3 ; 24bit precision 1/sqrtf(v)
pfmul mm1,mm0 ; scaled v[1] v[0]
pfmul mm2,mm0 ; scaled v[3] v[2]
movq [edx],mm1
movq [edx+8],mm2
femms
leave
ret
}
}

View File

@ -356,10 +356,6 @@
RelativePath="3dmath.h" RelativePath="3dmath.h"
> >
</File> </File>
<File
RelativePath=".\3dmathSIMD.asm.cpp"
>
</File>
<File <File
RelativePath="Combine.cpp" RelativePath="Combine.cpp"
> >