Rewrite UpdateClipMatrix to avoid 2 memcpy calls

Small optimization of UpdateClipMatrix. Of course, it is possible that at -O3 the compiler can already tell that those copies are superfluous and optimize them away anyway. I haven't done rigorous tests, but comparing the uncapped framerates of builds compiled with and without the change, the new version does seem to be generally a bit faster, depending on the scene. On the startup screen of Pokemon Black the new version seemed to be somewhere between a couple and a few percent faster than the old one, but about the same on the starter selection screen. MatrixMult4x4 was also rewritten for better reuse (it now takes the second operand as an explicit third argument instead of copying it internally), but unfortunately it is no longer consistent in style with the other matrix multiplication functions. Fun observation: because a double has 53 bits of mantissa and 64-12 = 52 < 53, it's possible to convert these integers to doubles, perform the calculation on them, and then convert back again without losing any precision. This may sound completely goofy, but compilers are better at vectorizing floating-point arithmetic, as you can see on e.g. Compiler Explorer, so such a thing could possibly improve performance in some scenarios. I tried it, and alas, even though the improved vectorization from using floating point helped, it just wasn't enough to make up for the extra copies and conversions. I even tried using BLAS, but that didn't do any better. That does raise an interesting thought, though: if compilers don't vectorize the integer instructions that much, could the matrix multiplications be sped up with explicit use of SIMD intrinsics, possibly via the vectorclass library? Another thing I haven't tried is getting rid of the memcpy in the multiplications. This would probably require heap-allocating the arrays so that pointers could be reassigned, and that extra heap overhead would probably more than negate the improvement from getting rid of the memcpy. If you add some intermediate arrays, maybe there's a way to shuffle stack-allocated arrays around and avoid heap allocation altogether.
Another thing I tried was multithreading, although only with the standard threading library. Since these matrices aren't that big, I suspected from the start that the threading overhead would negate or overwhelm any possible speedup. And indeed, that approach slowed the emulator to a near halt, even on a computer that should be fairly overspecced. But bear in mind that I'm a complete multithreading novice and was only using std::thread, so maybe a multithreading expert could come up with an approach that's at least on par with the single-threaded version.
2022-08-30 04:38:53 +02:00 · 2022-08-30 04:38:53 +02:00 · 22f4a16998
parent 5baf5fe77b
commit 22f4a16998
1 changed files with 30 additions and 25 deletions
--- a/src/GPU3D.cpp
+++ b/src/GPU3D.cpp
@ -661,31 +661,29 @@ void MatrixLoad4x3(s32* m, s32* s)
    m[12] = s[9]; m[13] = s[10]; m[14] = s[11]; m[15] = 0x1000;
 }

-void MatrixMult4x4(s32* m, s32* s)
+void MatrixMult4x4(s32* m, s32* s, s32* t)
 {
-    s32 tmp[16];
-    memcpy(tmp, m, 16*4);

    // m = s*m
-    m[0] = ((s64)s[0]*tmp[0] + (s64)s[1]*tmp[4] + (s64)s[2]*tmp[8] + (s64)s[3]*tmp[12]) >> 12;
-    m[1] = ((s64)s[0]*tmp[1] + (s64)s[1]*tmp[5] + (s64)s[2]*tmp[9] + (s64)s[3]*tmp[13]) >> 12;
-    m[2] = ((s64)s[0]*tmp[2] + (s64)s[1]*tmp[6] + (s64)s[2]*tmp[10] + (s64)s[3]*tmp[14]) >> 12;
-    m[3] = ((s64)s[0]*tmp[3] + (s64)s[1]*tmp[7] + (s64)s[2]*tmp[11] + (s64)s[3]*tmp[15]) >> 12;
+    m[0] = ((s64)s[0]*t[0] + (s64)s[1]*t[4] + (s64)s[2]*t[8] + (s64)s[3]*t[12]) >> 12;
+    m[1] = ((s64)s[0]*t[1] + (s64)s[1]*t[5] + (s64)s[2]*t[9] + (s64)s[3]*t[13]) >> 12;
+    m[2] = ((s64)s[0]*t[2] + (s64)s[1]*t[6] + (s64)s[2]*t[10] + (s64)s[3]*t[14]) >> 12;
+    m[3] = ((s64)s[0]*t[3] + (s64)s[1]*t[7] + (s64)s[2]*t[11] + (s64)s[3]*t[15]) >> 12;

-    m[4] = ((s64)s[4]*tmp[0] + (s64)s[5]*tmp[4] + (s64)s[6]*tmp[8] + (s64)s[7]*tmp[12]) >> 12;
-    m[5] = ((s64)s[4]*tmp[1] + (s64)s[5]*tmp[5] + (s64)s[6]*tmp[9] + (s64)s[7]*tmp[13]) >> 12;
-    m[6] = ((s64)s[4]*tmp[2] + (s64)s[5]*tmp[6] + (s64)s[6]*tmp[10] + (s64)s[7]*tmp[14]) >> 12;
-    m[7] = ((s64)s[4]*tmp[3] + (s64)s[5]*tmp[7] + (s64)s[6]*tmp[11] + (s64)s[7]*tmp[15]) >> 12;
+    m[4] = ((s64)s[4]*t[0] + (s64)s[5]*t[4] + (s64)s[6]*t[8] + (s64)s[7]*t[12]) >> 12;
+    m[5] = ((s64)s[4]*t[1] + (s64)s[5]*t[5] + (s64)s[6]*t[9] + (s64)s[7]*t[13]) >> 12;
+    m[6] = ((s64)s[4]*t[2] + (s64)s[5]*t[6] + (s64)s[6]*t[10] + (s64)s[7]*t[14]) >> 12;
+    m[7] = ((s64)s[4]*t[3] + (s64)s[5]*t[7] + (s64)s[6]*t[11] + (s64)s[7]*t[15]) >> 12;

-    m[8] = ((s64)s[8]*tmp[0] + (s64)s[9]*tmp[4] + (s64)s[10]*tmp[8] + (s64)s[11]*tmp[12]) >> 12;
-    m[9] = ((s64)s[8]*tmp[1] + (s64)s[9]*tmp[5] + (s64)s[10]*tmp[9] + (s64)s[11]*tmp[13]) >> 12;
-    m[10] = ((s64)s[8]*tmp[2] + (s64)s[9]*tmp[6] + (s64)s[10]*tmp[10] + (s64)s[11]*tmp[14]) >> 12;
-    m[11] = ((s64)s[8]*tmp[3] + (s64)s[9]*tmp[7] + (s64)s[10]*tmp[11] + (s64)s[11]*tmp[15]) >> 12;
+    m[8] = ((s64)s[8]*t[0] + (s64)s[9]*t[4] + (s64)s[10]*t[8] + (s64)s[11]*t[12]) >> 12;
+    m[9] = ((s64)s[8]*t[1] + (s64)s[9]*t[5] + (s64)s[10]*t[9] + (s64)s[11]*t[13]) >> 12;
+    m[10] = ((s64)s[8]*t[2] + (s64)s[9]*t[6] + (s64)s[10]*t[10] + (s64)s[11]*t[14]) >> 12;
+    m[11] = ((s64)s[8]*t[3] + (s64)s[9]*t[7] + (s64)s[10]*t[11] + (s64)s[11]*t[15]) >> 12;

-    m[12] = ((s64)s[12]*tmp[0] + (s64)s[13]*tmp[4] + (s64)s[14]*tmp[8] + (s64)s[15]*tmp[12]) >> 12;
-    m[13] = ((s64)s[12]*tmp[1] + (s64)s[13]*tmp[5] + (s64)s[14]*tmp[9] + (s64)s[15]*tmp[13]) >> 12;
-    m[14] = ((s64)s[12]*tmp[2] + (s64)s[13]*tmp[6] + (s64)s[14]*tmp[10] + (s64)s[15]*tmp[14]) >> 12;
-    m[15] = ((s64)s[12]*tmp[3] + (s64)s[13]*tmp[7] + (s64)s[14]*tmp[11] + (s64)s[15]*tmp[15]) >> 12;
+    m[12] = ((s64)s[12]*t[0] + (s64)s[13]*t[4] + (s64)s[14]*t[8] + (s64)s[15]*t[12]) >> 12;
+    m[13] = ((s64)s[12]*t[1] + (s64)s[13]*t[5] + (s64)s[14]*t[9] + (s64)s[15]*t[13]) >> 12;
+    m[14] = ((s64)s[12]*t[2] + (s64)s[13]*t[6] + (s64)s[14]*t[10] + (s64)s[15]*t[14]) >> 12;
+    m[15] = ((s64)s[12]*t[3] + (s64)s[13]*t[7] + (s64)s[14]*t[11] + (s64)s[15]*t[15]) >> 12;
 }

 void MatrixMult4x3(s32* m, s32* s)
@ -768,8 +766,7 @@ void UpdateClipMatrix()
    if (!ClipMatrixDirty) return;
    ClipMatrixDirty = false;

-    memcpy(ClipMatrix, ProjMatrix, 16*4);
-    MatrixMult4x4(ClipMatrix, PosMatrix);
+    MatrixMult4x4(ClipMatrix, PosMatrix, ProjMatrix);
 }


@ -2268,21 +2265,29 @@ void ExecuteCommand()
                case 0x18: // mult 4x4
                    if (MatrixMode == 0)
                    {
-                        MatrixMult4x4(ProjMatrix, (s32*)ExecParams);
+                        s32 tmp[16];
+                        memcpy(tmp, ProjMatrix, 16*4);
+                        MatrixMult4x4(ProjMatrix, (s32*)ExecParams, tmp);
                        ClipMatrixDirty = true;
                        AddCycles(35 - 16);
                    }
                    else if (MatrixMode == 3)
                    {
-                        MatrixMult4x4(TexMatrix, (s32*)ExecParams);
+                        s32 tmp[16];
+                        memcpy(tmp, TexMatrix, 16*4);
+                        MatrixMult4x4(TexMatrix, (s32*)ExecParams, tmp);
                        AddCycles(33 - 16);
                    }
                    else
                    {
-                        MatrixMult4x4(PosMatrix, (s32*)ExecParams);
+                        s32 tmp[16];
+                        memcpy(tmp, PosMatrix, 16*4);
+                        MatrixMult4x4(PosMatrix, (s32*)ExecParams, tmp);
                        if (MatrixMode == 2)
                        {
-                            MatrixMult4x4(VecMatrix, (s32*)ExecParams);
+                            s32 tmp[16];
+                            memcpy(tmp, VecMatrix, 16*4);
+                            MatrixMult4x4(VecMatrix, (s32*)ExecParams, tmp);
                            AddCycles(35 + 30 - 16);
                        }
                        else AddCycles(35 - 16);