From 1dd25b2063d88147bdbe74b5e7f9887e00a642ef Mon Sep 17 00:00:00 2001
From: zilmar <zilmar@pj64-emu.com>
Date: Sat, 10 Oct 2015 23:23:26 +1100
Subject: [PATCH] Rewrite MulMatricesC in a more auto-vectorizable way.

https://github.com/mupen64plus/mupen64plus-video-glide64mk2/commit/c5998a531b837d13658b732eaee447af7535afd5
---
 Source/Glide64/3dmath.cpp | 40 +++++++++++++++++++++++++++++++--------
 1 file changed, 32 insertions(+), 8 deletions(-)

diff --git a/Source/Glide64/3dmath.cpp b/Source/Glide64/3dmath.cpp
index 675739d82..a91a48c08 100644
--- a/Source/Glide64/3dmath.cpp
+++ b/Source/Glide64/3dmath.cpp
@@ -190,16 +190,40 @@ void InverseTransformVectorC (float *src, float *dst, float mat[4][4])
 
 void MulMatricesC(float m1[4][4],float m2[4][4],float r[4][4])
 {
-  for (int i=0; i<4; i++)
-  {
-    for (int j=0; j<4; j++)
+    float row[4][4];
+    register unsigned int i, j;
+
+    for (i = 0; i < 4; i++)
+        for (j = 0; j < 4; j++)
+            row[i][j] = m2[i][j];
+    for (i = 0; i < 4; i++)
     {
-      r[i][j] = m1[i][0] * m2[0][j] +
-                m1[i][1] * m2[1][j] +
-                m1[i][2] * m2[2][j] +
-                m1[i][3] * m2[3][j];
+        // Row i of r = sum over k of m1[i][k] * (row k of the m2 copy).
+        // Each j-loop below is a simple 4-float elementwise operation, so
+        // compilers can easily vectorize it into optimized SSE instructions.
+        float leftrow[4];
+        float summand[4][4];
+
+        for (j = 0; j < 4; j++)
+            leftrow[j] = m1[i][j];
+
+        for (j = 0; j < 4; j++)
+            summand[0][j] = leftrow[0] * row[0][j];
+        for (j = 0; j < 4; j++)
+            summand[1][j] = leftrow[1] * row[1][j];
+        for (j = 0; j < 4; j++)
+            summand[2][j] = leftrow[2] * row[2][j];
+        for (j = 0; j < 4; j++)
+            summand[3][j] = leftrow[3] * row[3][j];
+
+        for (j = 0; j < 4; j++)
+            r[i][j] =
+                summand[0][j]
+              + summand[1][j]
+              + summand[2][j]
+              + summand[3][j]
+        ;
     }
-  }
 }
 
 // 2008.03.29 H.Morii - added SSE 3DNOW! 3x3 1x3 matrix multiplication