project64/Source/Glide64/3dmath.cpp

/*
* Glide64 - Glide video plugin for Nintendo 64 emulators.
* Copyright (c) 2002  Dave2001
* Copyright (c) 2003-2009  Sergey 'Gonetz' Lipski
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*/

//****************************************************************
//
// Glide64 - Glide Plugin for Nintendo 64 emulators
// Project started on December 29th, 2001
//
// Authors:
// Dave2001, original author, founded the project in 2001, left it in 2002
// Gugaman, joined the project in 2002, left it in 2002
// Sergey 'Gonetz' Lipski, joined the project in 2002, main author since fall of 2002
// Hiroshi 'KoolSmoky' Morii, joined the project in 2007
//
//****************************************************************
//
// To modify Glide64:
// * Write your name and (optional)email, commented by your work, so I know who did it, and so that you can find which parts you modified when it comes time to send it to me.
// * Do NOT send me the whole project or file that you modified.  Take out your modified code sections, and tell me where to put them.  If people sent the whole thing, I would have many different versions, but no idea how to combine them all.
//
//****************************************************************

#include "Gfx_1.3.h"
extern "C" {
#ifndef NOSSE
#include <xmmintrin.h>
#endif
}

#include <math.h>
#include "3dmath.h"

void calc_light (VERTEX *v)
{
  float light_intensity = 0.0f;
  register float color[3] = {rdp.light[rdp.num_lights].r, rdp.light[rdp.num_lights].g, rdp.light[rdp.num_lights].b};
  for (uint32_t l=0; l<rdp.num_lights; l++)
  {
    light_intensity = DotProduct (rdp.light_vector[l], v->vec);
    
    if (light_intensity > 0.0f) 
    {
      color[0] += rdp.light[l].r * light_intensity;
      color[1] += rdp.light[l].g * light_intensity;
      color[2] += rdp.light[l].b * light_intensity;
    }
  }
  
  if (color[0] > 1.0f) color[0] = 1.0f;
  if (color[1] > 1.0f) color[1] = 1.0f;
  if (color[2] > 1.0f) color[2] = 1.0f;
  
  v->r = (uint8_t)(color[0]*255.0f);
  v->g = (uint8_t)(color[1]*255.0f);
  v->b = (uint8_t)(color[2]*255.0f);
}

//*
void calc_linear (VERTEX *v)
{
  if (settings.force_calc_sphere)
  {
    calc_sphere(v);
    return;
  }
  DECLAREALIGN16VAR(vec[3]);
  
  TransformVector (v->vec, vec, rdp.model);
  //    TransformVector (v->vec, vec, rdp.combined);
  NormalizeVector (vec);
  float x, y;
  if (!rdp.use_lookat)
  {
    x = vec[0];
    y = vec[1];
  }
  else
  {
    x = DotProduct (rdp.lookat[0], vec);
    y = DotProduct (rdp.lookat[1], vec);
  }
  
  if (x > 1.0f)
    x = 1.0f;
  else if (x < -1.0f)
    x = -1.0f;
  if (y > 1.0f)
    y = 1.0f;
  else if (y < -1.0f)
    y = -1.0f;
  
  if (rdp.cur_cache[0])
  {
    // scale >> 6 is size to map to
    v->ou = (acosf(x)/3.141592654f) * (rdp.tiles[rdp.cur_tile].org_s_scale >> 6);
    v->ov = (acosf(y)/3.141592654f) * (rdp.tiles[rdp.cur_tile].org_t_scale >> 6);
  }
  v->uv_scaled = 1;
#ifdef EXTREME_LOGGING
  FRDP ("calc linear u: %f, v: %f\n", v->ou, v->ov);
#endif
}

void calc_sphere (VERTEX *v)
{
//  LRDP("calc_sphere\n");
  DECLAREALIGN16VAR(vec[3]);
  int s_scale, t_scale;
  if (settings.hacks&hack_Chopper)
  {
    s_scale = minval(rdp.tiles[rdp.cur_tile].org_s_scale >> 6, rdp.tiles[rdp.cur_tile].lr_s);
    t_scale = minval(rdp.tiles[rdp.cur_tile].org_t_scale >> 6, rdp.tiles[rdp.cur_tile].lr_t);
  }
  else
  {
    s_scale = rdp.tiles[rdp.cur_tile].org_s_scale >> 6;
    t_scale = rdp.tiles[rdp.cur_tile].org_t_scale >> 6;
  }
  TransformVector (v->vec, vec, rdp.model);
  //    TransformVector (v->vec, vec, rdp.combined);
  NormalizeVector (vec);
  float x, y;
  if (!rdp.use_lookat)
  {
    x = vec[0];
    y = vec[1];
  }
  else
  {
    x = DotProduct (rdp.lookat[0], vec);
    y = DotProduct (rdp.lookat[1], vec);
  }
  v->ou = (x * 0.5f + 0.5f) * s_scale;
  v->ov = (y * 0.5f + 0.5f) * t_scale;
  v->uv_scaled = 1;
#ifdef EXTREME_LOGGING
  FRDP ("calc sphere u: %f, v: %f\n", v->ou, v->ov);
#endif
}

float DotProductC(register float *v1, register float *v2)
{
    register float result;
    result = v1[0]*v2[0] + v1[1]*v2[1] + v1[2]*v2[2];
    return(result);
}

void NormalizeVectorC(float *v)
{
    register float len;
    len = sqrtf(v[0]*v[0] + v[1]*v[1] + v[2]*v[2]);
    if (len > 0.0f)
    {
        v[0] /= len;
        v[1] /= len;
        v[2] /= len;
    }
}

void TransformVectorC(float *src, float *dst, float mat[4][4])
{
  dst[0] = mat[0][0]*src[0] + mat[1][0]*src[1] + mat[2][0]*src[2];
  dst[1] = mat[0][1]*src[0] + mat[1][1]*src[1] + mat[2][1]*src[2];
  dst[2] = mat[0][2]*src[0] + mat[1][2]*src[1] + mat[2][2]*src[2];
}

void InverseTransformVectorC (float *src, float *dst, float mat[4][4])
{
  dst[0] = mat[0][0]*src[0] + mat[0][1]*src[1] + mat[0][2]*src[2];
  dst[1] = mat[1][0]*src[0] + mat[1][1]*src[1] + mat[1][2]*src[2];
  dst[2] = mat[2][0]*src[0] + mat[2][1]*src[1] + mat[2][2]*src[2];
}

void MulMatricesC(float m1[4][4],float m2[4][4],float r[4][4])
{
    float row[4][4];
    register unsigned int i, j;

    for (i = 0; i < 4; i++)
        for (j = 0; j < 4; j++)
            row[i][j] = m2[i][j];
    for (i = 0; i < 4; i++)
    {
        // auto-vectorizable algorithm
        // vectorized loop style, such that compilers can
        // easily create optimized SSE instructions.
        float leftrow[4];
        float summand[4][4];

        for (j = 0; j < 4; j++)
            leftrow[j] = m1[i][j];

        for (j = 0; j < 4; j++)
            summand[0][j] = leftrow[0] * row[0][j];
        for (j = 0; j < 4; j++)
            summand[1][j] = leftrow[1] * row[1][j];
        for (j = 0; j < 4; j++)
            summand[2][j] = leftrow[2] * row[2][j];
        for (j = 0; j < 4; j++)
            summand[3][j] = leftrow[3] * row[3][j];

        for (j = 0; j < 4; j++)
            r[i][j] =
                summand[0][j]
              + summand[1][j]
              + summand[2][j]
              + summand[3][j]
        ;
    }
}

// 2008.03.29 H.Morii - added SSE 3DNOW! 3x3 1x3 matrix multiplication
//                      and 3DNOW! 4x4 4x4 matrix multiplication
// 2011-01-03 Balrog - removed because is in NASM format and not 64-bit compatible
// This will need fixing.
MULMATRIX MulMatrices = MulMatricesC;
TRANSFORMVECTOR TransformVector = TransformVectorC;
TRANSFORMVECTOR InverseTransformVector = InverseTransformVectorC;
DOTPRODUCT DotProduct = DotProductC;
NORMALIZEVECTOR NormalizeVector = NormalizeVectorC;

void MulMatricesSSE(float m1[4][4],float m2[4][4],float r[4][4])
{
#if defined(__GNUC__) && !defined(NO_ASM) && !defined(NOSSE)
   /* [row][col]*/
  typedef float v4sf __attribute__ ((vector_size (16)));
  v4sf row0 = _mm_loadu_ps(m2[0]);
  v4sf row1 = _mm_loadu_ps(m2[1]);
  v4sf row2 = _mm_loadu_ps(m2[2]);
  v4sf row3 = _mm_loadu_ps(m2[3]);

  for (int i = 0; i < 4; ++i)
  {
    v4sf leftrow = _mm_loadu_ps(m1[i]);

    // Fill tmp with four copies of leftrow[0]
    v4sf tmp = leftrow;
    tmp = _mm_shuffle_ps (tmp, tmp, 0);
    // Calculate the four first summands
    v4sf destrow = tmp * row0;

    // Fill tmp with four copies of leftrow[1]
    tmp = leftrow;
    tmp = _mm_shuffle_ps (tmp, tmp, 1 + (1 << 2) + (1 << 4) + (1 << 6));
    destrow += tmp * row1;

    // Fill tmp with four copies of leftrow[2]
    tmp = leftrow;
    tmp = _mm_shuffle_ps (tmp, tmp, 2 + (2 << 2) + (2 << 4) + (2 << 6));
    destrow += tmp * row2;

    // Fill tmp with four copies of leftrow[3]
    tmp = leftrow;
    tmp = _mm_shuffle_ps (tmp, tmp, 3 + (3 << 2) + (3 << 4) + (3 << 6));
    destrow += tmp * row3;

    __builtin_ia32_storeups(r[i], destrow);
  }
 #elif !defined(NO_ASM) && !defined(NOSSE)
  __asm
  {
    mov     eax, dword ptr [r]  
      mov     ecx, dword ptr [m1]
      mov     edx, dword ptr [m2]

      movaps  xmm0,[edx]
      movaps  xmm1,[edx+16]
      movaps  xmm2,[edx+32]
      movaps  xmm3,[edx+48]

// r[0][0],r[0][1],r[0][2],r[0][3]

      movaps  xmm4,xmmword ptr[ecx]
      movaps  xmm5,xmm4
      movaps  xmm6,xmm4
      movaps  xmm7,xmm4

      shufps  xmm4,xmm4,00000000b
      shufps  xmm5,xmm5,01010101b
      shufps  xmm6,xmm6,10101010b
      shufps  xmm7,xmm7,11111111b

      mulps   xmm4,xmm0
      mulps   xmm5,xmm1
      mulps   xmm6,xmm2
      mulps   xmm7,xmm3

      addps   xmm4,xmm5
      addps   xmm4,xmm6
      addps   xmm4,xmm7

      movaps  xmmword ptr[eax],xmm4

// r[1][0],r[1][1],r[1][2],r[1][3]

      movaps  xmm4,xmmword ptr[ecx+16]
      movaps  xmm5,xmm4
      movaps  xmm6,xmm4
      movaps  xmm7,xmm4

      shufps  xmm4,xmm4,00000000b
      shufps  xmm5,xmm5,01010101b
      shufps  xmm6,xmm6,10101010b
      shufps  xmm7,xmm7,11111111b

      mulps   xmm4,xmm0
      mulps   xmm5,xmm1
      mulps   xmm6,xmm2
      mulps   xmm7,xmm3

      addps   xmm4,xmm5
      addps   xmm4,xmm6
      addps   xmm4,xmm7

      movaps  xmmword ptr[eax+16],xmm4


// r[2][0],r[2][1],r[2][2],r[2][3]

      movaps  xmm4,xmmword ptr[ecx+32]
      movaps  xmm5,xmm4
      movaps  xmm6,xmm4
      movaps  xmm7,xmm4

      shufps  xmm4,xmm4,00000000b
      shufps  xmm5,xmm5,01010101b
      shufps  xmm6,xmm6,10101010b
      shufps  xmm7,xmm7,11111111b

      mulps   xmm4,xmm0
      mulps   xmm5,xmm1
      mulps   xmm6,xmm2
      mulps   xmm7,xmm3

      addps   xmm4,xmm5
      addps   xmm4,xmm6
      addps   xmm4,xmm7

      movaps  xmmword ptr[eax+32],xmm4

// r[3][0],r[3][1],r[3][2],r[3][3]

      movaps  xmm4,xmmword ptr[ecx+48]
      movaps  xmm5,xmm4
      movaps  xmm6,xmm4
      movaps  xmm7,xmm4

      shufps  xmm4,xmm4,00000000b
      shufps  xmm5,xmm5,01010101b
      shufps  xmm6,xmm6,10101010b
      shufps  xmm7,xmm7,11111111b

      mulps   xmm4,xmm0
      mulps   xmm5,xmm1
      mulps   xmm6,xmm2
      mulps   xmm7,xmm3

      addps   xmm4,xmm5
      addps   xmm4,xmm6
      addps   xmm4,xmm7

      movaps  xmmword ptr[eax+48],xmm4
    }
#endif // _WIN32
  }


  void math_init()
  {
#ifndef _DEBUG
    int IsSSE = FALSE;
#if defined(__GNUC__) && !defined(NO_ASM) && !defined(NOSSE)
    int edx, eax;
    GLIDE64_TRY
    {
  #if defined(__x86_64__)
      asm volatile(" cpuid;        "
        : "=a"(eax), "=d"(edx)
        : "0"(1)
        : "rbx", "rcx"
        );
  #else
      asm volatile(" push %%ebx;   "
        " push %%ecx;   "
        " cpuid;        "
        " pop %%ecx;    "
        " pop %%ebx;    "
        : "=a"(eax), "=d"(edx)
        : "0"(1)
        :
      );
  #endif
    }
    GLIDE64_CATCH
      { return; }
    // Check for SSE
    if (edx & (1 << 25))
      IsSSE = TRUE;
#elif !defined(NO_ASM) && !defined(NOSSE)
    DWORD dwEdx;
    __try
    {
      __asm 
      {
        mov  eax,1
          cpuid
          mov dwEdx,edx
        }  
      }
      __except(EXCEPTION_EXECUTE_HANDLER)
      {
        return;
      }

      if (dwEdx & (1<<25)) 
      {
        if (dwEdx & (1<<24))
        {      
          __try
          {
            __asm xorps xmm0, xmm0
              IsSSE = TRUE;
          }
          __except(EXCEPTION_EXECUTE_HANDLER)
          {
            return;
          }
        }
      }
#endif // _WIN32
      if (IsSSE)
      {
        MulMatrices = MulMatricesSSE;
        LOG("3DNOW! detected.\n");
      }

#endif //_DEBUG
    }
Add Glide64 plugin 2013-04-04 21:22:19 +00:00			`/*`
			`* Glide64 - Glide video plugin for Nintendo 64 emulators.`
			`* Copyright (c) 2002 Dave2001`
			`* Copyright (c) 2003-2009 Sergey 'Gonetz' Lipski`
			`*`
			`* This program is free software; you can redistribute it and/or modify`
			`* it under the terms of the GNU General Public License as published by`
			`* the Free Software Foundation; either version 2 of the License, or`
			`* any later version.`
			`*`
			`* This program is distributed in the hope that it will be useful,`
			`* but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the`
			`* GNU General Public License for more details.`
			`*`
			`* You should have received a copy of the GNU General Public License`
			`* along with this program; if not, write to the Free Software`
			`* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA`
			`*/`

			`//****************************************************************`
			`//`
			`// Glide64 - Glide Plugin for Nintendo 64 emulators`
			`// Project started on December 29th, 2001`
			`//`
			`// Authors:`
			`// Dave2001, original author, founded the project in 2001, left it in 2002`
			`// Gugaman, joined the project in 2002, left it in 2002`
			`// Sergey 'Gonetz' Lipski, joined the project in 2002, main author since fall of 2002`
			`// Hiroshi 'KoolSmoky' Morii, joined the project in 2007`
			`//`
			`//****************************************************************`
			`//`
			`// To modify Glide64:`
			`// * Write your name and (optional)email, commented by your work, so I know who did it, and so that you can find which parts you modified when it comes time to send it to me.`
			`// * Do NOT send me the whole project or file that you modified. Take out your modified code sections, and tell me where to put them. If people sent the whole thing, I would have many different versions, but no idea how to combine them all.`
			`//`
			`//****************************************************************`

[Glide64] Remove space from filename to avoid confusion of build tools https://github.com/mupen64plus/mupen64plus-video-glide64mk2/commit/a1b0c70492c5664ee5208f429bf79a4b296f9936 2015-10-09 00:07:09 +00:00			`#include "Gfx_1.3.h"`
[Glide64] Other cleanups like changing __WINDOWS__ to WIN32 2015-10-08 23:44:03 +00:00			`extern "C" {`
			`#ifndef NOSSE`
			`#include <xmmintrin.h>`
			`#endif`
			`}`

			`#include <math.h>`
Add Glide64 plugin 2013-04-04 21:22:19 +00:00			`#include "3dmath.h"`

			`void calc_light (VERTEX *v)`
			`{`
			`float light_intensity = 0.0f;`
			`register float color[3] = {rdp.light[rdp.num_lights].r, rdp.light[rdp.num_lights].g, rdp.light[rdp.num_lights].b};`
[Project64] Convert wxUint32 to uint32_t 2016-01-20 06:14:48 +00:00			`for (uint32_t l=0; l<rdp.num_lights; l++)`
Add Glide64 plugin 2013-04-04 21:22:19 +00:00			`{`
			`light_intensity = DotProduct (rdp.light_vector[l], v->vec);`

			`if (light_intensity > 0.0f)`
			`{`
			`color[0] += rdp.light[l].r * light_intensity;`
			`color[1] += rdp.light[l].g * light_intensity;`
			`color[2] += rdp.light[l].b * light_intensity;`
			`}`
			`}`

			`if (color[0] > 1.0f) color[0] = 1.0f;`
			`if (color[1] > 1.0f) color[1] = 1.0f;`
			`if (color[2] > 1.0f) color[2] = 1.0f;`

[Project64] Rename wxUint8 to uint8_t 2016-01-20 06:17:45 +00:00			`v->r = (uint8_t)(color[0]*255.0f);`
			`v->g = (uint8_t)(color[1]*255.0f);`
			`v->b = (uint8_t)(color[2]*255.0f);`
Add Glide64 plugin 2013-04-04 21:22:19 +00:00			`}`

			`//*`
			`void calc_linear (VERTEX *v)`
			`{`
			`if (settings.force_calc_sphere)`
			`{`
			`calc_sphere(v);`
			`return;`
			`}`
			`DECLAREALIGN16VAR(vec[3]);`

			`TransformVector (v->vec, vec, rdp.model);`
			`// TransformVector (v->vec, vec, rdp.combined);`
			`NormalizeVector (vec);`
			`float x, y;`
			`if (!rdp.use_lookat)`
			`{`
			`x = vec[0];`
			`y = vec[1];`
			`}`
			`else`
			`{`
			`x = DotProduct (rdp.lookat[0], vec);`
			`y = DotProduct (rdp.lookat[1], vec);`
			`}`

			`if (x > 1.0f)`
			`x = 1.0f;`
			`else if (x < -1.0f)`
			`x = -1.0f;`
			`if (y > 1.0f)`
			`y = 1.0f;`
			`else if (y < -1.0f)`
			`y = -1.0f;`

			`if (rdp.cur_cache[0])`
			`{`
			`// scale >> 6 is size to map to`
			`v->ou = (acosf(x)/3.141592654f) * (rdp.tiles[rdp.cur_tile].org_s_scale >> 6);`
			`v->ov = (acosf(y)/3.141592654f) * (rdp.tiles[rdp.cur_tile].org_t_scale >> 6);`
			`}`
			`v->uv_scaled = 1;`
			`#ifdef EXTREME_LOGGING`
			`FRDP ("calc linear u: %f, v: %f\n", v->ou, v->ov);`
			`#endif`
			`}`

			`void calc_sphere (VERTEX *v)`
			`{`
			`// LRDP("calc_sphere\n");`
			`DECLAREALIGN16VAR(vec[3]);`
			`int s_scale, t_scale;`
			`if (settings.hacks&hack_Chopper)`
			`{`
[Glide64] Rename max and min to be maxval and minval 2016-01-25 09:58:29 +00:00			`s_scale = minval(rdp.tiles[rdp.cur_tile].org_s_scale >> 6, rdp.tiles[rdp.cur_tile].lr_s);`
			`t_scale = minval(rdp.tiles[rdp.cur_tile].org_t_scale >> 6, rdp.tiles[rdp.cur_tile].lr_t);`
Add Glide64 plugin 2013-04-04 21:22:19 +00:00			`}`
			`else`
			`{`
			`s_scale = rdp.tiles[rdp.cur_tile].org_s_scale >> 6;`
			`t_scale = rdp.tiles[rdp.cur_tile].org_t_scale >> 6;`
			`}`
			`TransformVector (v->vec, vec, rdp.model);`
			`// TransformVector (v->vec, vec, rdp.combined);`
			`NormalizeVector (vec);`
			`float x, y;`
			`if (!rdp.use_lookat)`
			`{`
			`x = vec[0];`
			`y = vec[1];`
			`}`
			`else`
			`{`
			`x = DotProduct (rdp.lookat[0], vec);`
			`y = DotProduct (rdp.lookat[1], vec);`
			`}`
			`v->ou = (x * 0.5f + 0.5f) * s_scale;`
			`v->ov = (y * 0.5f + 0.5f) * t_scale;`
			`v->uv_scaled = 1;`
			`#ifdef EXTREME_LOGGING`
			`FRDP ("calc sphere u: %f, v: %f\n", v->ou, v->ov);`
			`#endif`
			`}`

			`float DotProductC(register float v1, register float v2)`
			`{`
			`register float result;`
			`result = v1[0]v2[0] + v1[1]v2[1] + v1[2]*v2[2];`
			`return(result);`
			`}`

			`void NormalizeVectorC(float *v)`
			`{`
			`register float len;`
			`len = sqrtf(v[0]v[0] + v[1]v[1] + v[2]*v[2]);`
			`if (len > 0.0f)`
			`{`
			`v[0] /= len;`
			`v[1] /= len;`
			`v[2] /= len;`
			`}`
			`}`

			`void TransformVectorC(float src, float dst, float mat[4][4])`
			`{`
			`dst[0] = mat[0][0]src[0] + mat[1][0]src[1] + mat[2][0]*src[2];`
			`dst[1] = mat[0][1]src[0] + mat[1][1]src[1] + mat[2][1]*src[2];`
			`dst[2] = mat[0][2]src[0] + mat[1][2]src[1] + mat[2][2]*src[2];`
			`}`

			`void InverseTransformVectorC (float src, float dst, float mat[4][4])`
			`{`
			`dst[0] = mat[0][0]src[0] + mat[0][1]src[1] + mat[0][2]*src[2];`
			`dst[1] = mat[1][0]src[0] + mat[1][1]src[1] + mat[1][2]*src[2];`
			`dst[2] = mat[2][0]src[0] + mat[2][1]src[1] + mat[2][2]*src[2];`
			`}`

			`void MulMatricesC(float m1[4][4],float m2[4][4],float r[4][4])`
			`{`
Rewrite MulMatricesC in a more auto-vectorizable way. https://github.com/mupen64plus/mupen64plus-video-glide64mk2/commit/c5998a531b837d13658b732eaee447af7535afd5 2015-10-10 12:23:26 +00:00			`float row[4][4];`
			`register unsigned int i, j;`

			`for (i = 0; i < 4; i++)`
			`for (j = 0; j < 4; j++)`
			`row[i][j] = m2[i][j];`
			`for (i = 0; i < 4; i++)`
Add Glide64 plugin 2013-04-04 21:22:19 +00:00			`{`
Rewrite MulMatricesC in a more auto-vectorizable way. https://github.com/mupen64plus/mupen64plus-video-glide64mk2/commit/c5998a531b837d13658b732eaee447af7535afd5 2015-10-10 12:23:26 +00:00			`// auto-vectorizable algorithm`
			`// vectorized loop style, such that compilers can`
			`// easily create optimized SSE instructions.`
			`float leftrow[4];`
			`float summand[4][4];`

			`for (j = 0; j < 4; j++)`
			`leftrow[j] = m1[i][j];`

			`for (j = 0; j < 4; j++)`
			`summand[0][j] = leftrow[0] * row[0][j];`
			`for (j = 0; j < 4; j++)`
			`summand[1][j] = leftrow[1] * row[1][j];`
			`for (j = 0; j < 4; j++)`
			`summand[2][j] = leftrow[2] * row[2][j];`
			`for (j = 0; j < 4; j++)`
			`summand[3][j] = leftrow[3] * row[3][j];`

			`for (j = 0; j < 4; j++)`
			`r[i][j] =`
			`summand[0][j]`
			`+ summand[1][j]`
			`+ summand[2][j]`
			`+ summand[3][j]`
			`;`
Add Glide64 plugin 2013-04-04 21:22:19 +00:00			`}`
			`}`

			`// 2008.03.29 H.Morii - added SSE 3DNOW! 3x3 1x3 matrix multiplication`
			`// and 3DNOW! 4x4 4x4 matrix multiplication`
[glide64] remove 3dmathSIMD.asm.cpp (from Merge Glide64 changes from the previous attempt) https://github.com/mupen64plus/mupen64plus-video-glide64mk2/commit/e110f5048929560afd6a505b76d868d7e6c1899f 2015-10-09 05:18:09 +00:00			`// 2011-01-03 Balrog - removed because is in NASM format and not 64-bit compatible`
			`// This will need fixing.`
Add Glide64 plugin 2013-04-04 21:22:19 +00:00			`MULMATRIX MulMatrices = MulMatricesC;`
			`TRANSFORMVECTOR TransformVector = TransformVectorC;`
			`TRANSFORMVECTOR InverseTransformVector = InverseTransformVectorC;`
			`DOTPRODUCT DotProduct = DotProductC;`
			`NORMALIZEVECTOR NormalizeVector = NormalizeVectorC;`

[glide64] remove 3dmathSIMD.asm.cpp (from Merge Glide64 changes from the previous attempt) https://github.com/mupen64plus/mupen64plus-video-glide64mk2/commit/e110f5048929560afd6a505b76d868d7e6c1899f 2015-10-09 05:18:09 +00:00			`void MulMatricesSSE(float m1[4][4],float m2[4][4],float r[4][4])`
Add Glide64 plugin 2013-04-04 21:22:19 +00:00			`{`
[glide64] remove 3dmathSIMD.asm.cpp (from Merge Glide64 changes from the previous attempt) https://github.com/mupen64plus/mupen64plus-video-glide64mk2/commit/e110f5048929560afd6a505b76d868d7e6c1899f 2015-10-09 05:18:09 +00:00			`#if defined(__GNUC__) && !defined(NO_ASM) && !defined(NOSSE)`
			`/* [row][col]*/`
			`typedef float v4sf __attribute__ ((vector_size (16)));`
Use generic _mm_loadu_ps instead of GCC specific __builtin_ia32_loadups https://github.com/mupen64plus/mupen64plus-video-glide64mk2/commit/be0a0272732a49ebc853dbc3d245642738e85ec1 2015-10-11 20:08:12 +00:00			`v4sf row0 = _mm_loadu_ps(m2[0]);`
			`v4sf row1 = _mm_loadu_ps(m2[1]);`
			`v4sf row2 = _mm_loadu_ps(m2[2]);`
			`v4sf row3 = _mm_loadu_ps(m2[3]);`
Add Glide64 plugin 2013-04-04 21:22:19 +00:00
[glide64] remove 3dmathSIMD.asm.cpp (from Merge Glide64 changes from the previous attempt) https://github.com/mupen64plus/mupen64plus-video-glide64mk2/commit/e110f5048929560afd6a505b76d868d7e6c1899f 2015-10-09 05:18:09 +00:00			`for (int i = 0; i < 4; ++i)`
Add Glide64 plugin 2013-04-04 21:22:19 +00:00			`{`
Use generic _mm_loadu_ps instead of GCC specific __builtin_ia32_loadups https://github.com/mupen64plus/mupen64plus-video-glide64mk2/commit/be0a0272732a49ebc853dbc3d245642738e85ec1 2015-10-11 20:08:12 +00:00			`v4sf leftrow = _mm_loadu_ps(m1[i]);`
[glide64] remove 3dmathSIMD.asm.cpp (from Merge Glide64 changes from the previous attempt) https://github.com/mupen64plus/mupen64plus-video-glide64mk2/commit/e110f5048929560afd6a505b76d868d7e6c1899f 2015-10-09 05:18:09 +00:00
			`// Fill tmp with four copies of leftrow[0]`
			`v4sf tmp = leftrow;`
			`tmp = _mm_shuffle_ps (tmp, tmp, 0);`
			`// Calculate the four first summands`
			`v4sf destrow = tmp * row0;`

			`// Fill tmp with four copies of leftrow[1]`
			`tmp = leftrow;`
			`tmp = _mm_shuffle_ps (tmp, tmp, 1 + (1 << 2) + (1 << 4) + (1 << 6));`
			`destrow += tmp * row1;`

			`// Fill tmp with four copies of leftrow[2]`
			`tmp = leftrow;`
			`tmp = _mm_shuffle_ps (tmp, tmp, 2 + (2 << 2) + (2 << 4) + (2 << 6));`
			`destrow += tmp * row2;`

			`// Fill tmp with four copies of leftrow[3]`
			`tmp = leftrow;`
			`tmp = _mm_shuffle_ps (tmp, tmp, 3 + (3 << 2) + (3 << 4) + (3 << 6));`
			`destrow += tmp * row3;`

			`__builtin_ia32_storeups(r[i], destrow);`
Add Glide64 plugin 2013-04-04 21:22:19 +00:00			`}`
[glide64] remove 3dmathSIMD.asm.cpp (from Merge Glide64 changes from the previous attempt) https://github.com/mupen64plus/mupen64plus-video-glide64mk2/commit/e110f5048929560afd6a505b76d868d7e6c1899f 2015-10-09 05:18:09 +00:00			`#elif !defined(NO_ASM) && !defined(NOSSE)`
			`__asm`
Add Glide64 plugin 2013-04-04 21:22:19 +00:00			`{`
[glide64] remove 3dmathSIMD.asm.cpp (from Merge Glide64 changes from the previous attempt) https://github.com/mupen64plus/mupen64plus-video-glide64mk2/commit/e110f5048929560afd6a505b76d868d7e6c1899f 2015-10-09 05:18:09 +00:00			`mov eax, dword ptr [r]`
			`mov ecx, dword ptr [m1]`
			`mov edx, dword ptr [m2]`

			`movaps xmm0,[edx]`
			`movaps xmm1,[edx+16]`
			`movaps xmm2,[edx+32]`
			`movaps xmm3,[edx+48]`

			`// r[0][0],r[0][1],r[0][2],r[0][3]`

			`movaps xmm4,xmmword ptr[ecx]`
			`movaps xmm5,xmm4`
			`movaps xmm6,xmm4`
			`movaps xmm7,xmm4`

			`shufps xmm4,xmm4,00000000b`
			`shufps xmm5,xmm5,01010101b`
			`shufps xmm6,xmm6,10101010b`
			`shufps xmm7,xmm7,11111111b`

			`mulps xmm4,xmm0`
			`mulps xmm5,xmm1`
			`mulps xmm6,xmm2`
			`mulps xmm7,xmm3`

			`addps xmm4,xmm5`
			`addps xmm4,xmm6`
			`addps xmm4,xmm7`

			`movaps xmmword ptr[eax],xmm4`

			`// r[1][0],r[1][1],r[1][2],r[1][3]`

			`movaps xmm4,xmmword ptr[ecx+16]`
			`movaps xmm5,xmm4`
			`movaps xmm6,xmm4`
			`movaps xmm7,xmm4`

			`shufps xmm4,xmm4,00000000b`
			`shufps xmm5,xmm5,01010101b`
			`shufps xmm6,xmm6,10101010b`
			`shufps xmm7,xmm7,11111111b`

			`mulps xmm4,xmm0`
			`mulps xmm5,xmm1`
			`mulps xmm6,xmm2`
			`mulps xmm7,xmm3`

			`addps xmm4,xmm5`
			`addps xmm4,xmm6`
			`addps xmm4,xmm7`

			`movaps xmmword ptr[eax+16],xmm4`


			`// r[2][0],r[2][1],r[2][2],r[2][3]`

			`movaps xmm4,xmmword ptr[ecx+32]`
			`movaps xmm5,xmm4`
			`movaps xmm6,xmm4`
			`movaps xmm7,xmm4`

			`shufps xmm4,xmm4,00000000b`
			`shufps xmm5,xmm5,01010101b`
			`shufps xmm6,xmm6,10101010b`
			`shufps xmm7,xmm7,11111111b`

			`mulps xmm4,xmm0`
			`mulps xmm5,xmm1`
			`mulps xmm6,xmm2`
			`mulps xmm7,xmm3`

			`addps xmm4,xmm5`
			`addps xmm4,xmm6`
			`addps xmm4,xmm7`

			`movaps xmmword ptr[eax+32],xmm4`

			`// r[3][0],r[3][1],r[3][2],r[3][3]`

			`movaps xmm4,xmmword ptr[ecx+48]`
			`movaps xmm5,xmm4`
			`movaps xmm6,xmm4`
			`movaps xmm7,xmm4`

			`shufps xmm4,xmm4,00000000b`
			`shufps xmm5,xmm5,01010101b`
			`shufps xmm6,xmm6,10101010b`
			`shufps xmm7,xmm7,11111111b`

			`mulps xmm4,xmm0`
			`mulps xmm5,xmm1`
			`mulps xmm6,xmm2`
			`mulps xmm7,xmm3`

			`addps xmm4,xmm5`
			`addps xmm4,xmm6`
			`addps xmm4,xmm7`

			`movaps xmmword ptr[eax+48],xmm4`
			`}`
			`#endif // _WIN32`
Add Glide64 plugin 2013-04-04 21:22:19 +00:00			`}`
[glide64] remove 3dmathSIMD.asm.cpp (from Merge Glide64 changes from the previous attempt) https://github.com/mupen64plus/mupen64plus-video-glide64mk2/commit/e110f5048929560afd6a505b76d868d7e6c1899f 2015-10-09 05:18:09 +00:00


			`void math_init()`
Add Glide64 plugin 2013-04-04 21:22:19 +00:00			`{`
[glide64] remove 3dmathSIMD.asm.cpp (from Merge Glide64 changes from the previous attempt) https://github.com/mupen64plus/mupen64plus-video-glide64mk2/commit/e110f5048929560afd6a505b76d868d7e6c1899f 2015-10-09 05:18:09 +00:00			`#ifndef _DEBUG`
			`int IsSSE = FALSE;`
			`#if defined(__GNUC__) && !defined(NO_ASM) && !defined(NOSSE)`
from fixed warnings and errors in vs2012 https://github.com/mupen64plus/mupen64plus-video-glide64mk2/commit/56f2b7ef85d447c3dfe7885e0a8ca47bb1a5b547 2015-10-11 20:17:01 +00:00			`int edx, eax;`
[glide64] remove 3dmathSIMD.asm.cpp (from Merge Glide64 changes from the previous attempt) https://github.com/mupen64plus/mupen64plus-video-glide64mk2/commit/e110f5048929560afd6a505b76d868d7e6c1899f 2015-10-09 05:18:09 +00:00			`GLIDE64_TRY`
			`{`
			`#if defined(__x86_64__)`
			`asm volatile(" cpuid; "`
			`: "=a"(eax), "=d"(edx)`
			`: "0"(1)`
			`: "rbx", "rcx"`
			`);`
			`#else`
			`asm volatile(" push %%ebx; "`
			`" push %%ecx; "`
			`" cpuid; "`
			`" pop %%ecx; "`
			`" pop %%ebx; "`
			`: "=a"(eax), "=d"(edx)`
			`: "0"(1)`
			`:`
			`);`
			`#endif`
			`}`
			`GLIDE64_CATCH`
			`{ return; }`
			`// Check for SSE`
			`if (edx & (1 << 25))`
			`IsSSE = TRUE;`
			`#elif !defined(NO_ASM) && !defined(NOSSE)`
			`DWORD dwEdx;`
			`__try`
			`{`
			`__asm`
			`{`
			`mov eax,1`
			`cpuid`
			`mov dwEdx,edx`
			`}`
			`}`
			`__except(EXCEPTION_EXECUTE_HANDLER)`
			`{`
			`return;`
			`}`

			`if (dwEdx & (1<<25))`
			`{`
			`if (dwEdx & (1<<24))`
			`{`
			`__try`
			`{`
			`__asm xorps xmm0, xmm0`
			`IsSSE = TRUE;`
			`}`
			`__except(EXCEPTION_EXECUTE_HANDLER)`
			`{`
			`return;`
			`}`
			`}`
			`}`
			`#endif // _WIN32`
			`if (IsSSE)`
			`{`
			`MulMatrices = MulMatricesSSE;`
			`LOG("3DNOW! detected.\n");`
			`}`

Add Glide64 plugin 2013-04-04 21:22:19 +00:00			`#endif //_DEBUG`
[glide64] remove 3dmathSIMD.asm.cpp (from Merge Glide64 changes from the previous attempt) https://github.com/mupen64plus/mupen64plus-video-glide64mk2/commit/e110f5048929560afd6a505b76d868d7e6c1899f 2015-10-09 05:18:09 +00:00			`}`