rewrites UpdateClipMatrix to avoid 2 memcpies

Small optimization of UpdateClipMatrix. There is of course the
possibility that at O3 the compiler can tell those copies are
superfluous and optimize them away anyway. I haven't done rigorous
testing, but comparing the uncapped framerates of builds compiled with
and without the change, the new version does seem to be generally a bit
faster, depending on the scene. On the startup screen of Pokemon Black
the new version seemed to be a couple to a few percent faster than the
old one, but about the same on the starter selection screen.
MatrixMult4x4 was also rewritten for better reuse (it now takes a third
array argument), but unfortunately its signature no longer matches the
style of the other matrix multiplication functions.

Fun observation: because there are 53 bits of mantissa in a double and
64-12=52<53, it's possible to convert these integers to doubles, perform
the calculation on them and then convert back again without losing any
precision. This may sound completely goofy, but compilers are better at
vectorizing floating point arithmetic as you can see on e.g. compiler
explorer, so such a thing could possibly improve performance in some
scenarios. I tried it, and alas, even though the improved vectorization
from using floating point did help, it just wasn't enough to
make up for the extra copies and conversions. I even tried using BLAS,
but that didn't do any better. But that does raise an interesting
thought: if the compilers don't vectorize the integer instructions that
much, could the matrix multiplications be sped up with explicit use of
SIMD intrinsics? Possibly with the vectorclass library? Another thing I
haven't tried is getting rid of the memcpy in the
multiplications. This would probably require heap allocating the arrays
so that pointers could be reassigned, and that extra heap faffery would
probably more than negate the improvement from getting rid of the memcpy.
If you add some intermediate arrays maybe there's some way to shuffle
stack allocated arrays around to get around heap allocating.

Another thing I tried was multithreading, although I only tried the
standard threading library. To begin with, since these matrices aren't
that big I suspected the overhead from threading would negate or
overwhelm any possible speedup. And indeed, using that approach ground
the emulator to a near halt, even on a computer that should be fairly
overspecced. But bear in mind that I'm a total multithreading n00b and
was only using std::thread, so maybe a multithreading whiz could come up
with an approach that's at least on par with the single threaded version.
This commit is contained in:
Valtýr Kári Daníelsson 2022-08-30 04:38:53 +02:00
parent 5baf5fe77b
commit 22f4a16998
1 changed files with 30 additions and 25 deletions

View File

@ -661,31 +661,29 @@ void MatrixLoad4x3(s32* m, s32* s)
m[12] = s[9]; m[13] = s[10]; m[14] = s[11]; m[15] = 0x1000; m[12] = s[9]; m[13] = s[10]; m[14] = s[11]; m[15] = 0x1000;
} }
void MatrixMult4x4(s32* m, s32* s) void MatrixMult4x4(s32* m, s32* s, s32* t)
{ {
s32 tmp[16];
memcpy(tmp, m, 16*4);
// m = s*m // m = s*m
m[0] = ((s64)s[0]*tmp[0] + (s64)s[1]*tmp[4] + (s64)s[2]*tmp[8] + (s64)s[3]*tmp[12]) >> 12; m[0] = ((s64)s[0]*t[0] + (s64)s[1]*t[4] + (s64)s[2]*t[8] + (s64)s[3]*t[12]) >> 12;
m[1] = ((s64)s[0]*tmp[1] + (s64)s[1]*tmp[5] + (s64)s[2]*tmp[9] + (s64)s[3]*tmp[13]) >> 12; m[1] = ((s64)s[0]*t[1] + (s64)s[1]*t[5] + (s64)s[2]*t[9] + (s64)s[3]*t[13]) >> 12;
m[2] = ((s64)s[0]*tmp[2] + (s64)s[1]*tmp[6] + (s64)s[2]*tmp[10] + (s64)s[3]*tmp[14]) >> 12; m[2] = ((s64)s[0]*t[2] + (s64)s[1]*t[6] + (s64)s[2]*t[10] + (s64)s[3]*t[14]) >> 12;
m[3] = ((s64)s[0]*tmp[3] + (s64)s[1]*tmp[7] + (s64)s[2]*tmp[11] + (s64)s[3]*tmp[15]) >> 12; m[3] = ((s64)s[0]*t[3] + (s64)s[1]*t[7] + (s64)s[2]*t[11] + (s64)s[3]*t[15]) >> 12;
m[4] = ((s64)s[4]*tmp[0] + (s64)s[5]*tmp[4] + (s64)s[6]*tmp[8] + (s64)s[7]*tmp[12]) >> 12; m[4] = ((s64)s[4]*t[0] + (s64)s[5]*t[4] + (s64)s[6]*t[8] + (s64)s[7]*t[12]) >> 12;
m[5] = ((s64)s[4]*tmp[1] + (s64)s[5]*tmp[5] + (s64)s[6]*tmp[9] + (s64)s[7]*tmp[13]) >> 12; m[5] = ((s64)s[4]*t[1] + (s64)s[5]*t[5] + (s64)s[6]*t[9] + (s64)s[7]*t[13]) >> 12;
m[6] = ((s64)s[4]*tmp[2] + (s64)s[5]*tmp[6] + (s64)s[6]*tmp[10] + (s64)s[7]*tmp[14]) >> 12; m[6] = ((s64)s[4]*t[2] + (s64)s[5]*t[6] + (s64)s[6]*t[10] + (s64)s[7]*t[14]) >> 12;
m[7] = ((s64)s[4]*tmp[3] + (s64)s[5]*tmp[7] + (s64)s[6]*tmp[11] + (s64)s[7]*tmp[15]) >> 12; m[7] = ((s64)s[4]*t[3] + (s64)s[5]*t[7] + (s64)s[6]*t[11] + (s64)s[7]*t[15]) >> 12;
m[8] = ((s64)s[8]*tmp[0] + (s64)s[9]*tmp[4] + (s64)s[10]*tmp[8] + (s64)s[11]*tmp[12]) >> 12; m[8] = ((s64)s[8]*t[0] + (s64)s[9]*t[4] + (s64)s[10]*t[8] + (s64)s[11]*t[12]) >> 12;
m[9] = ((s64)s[8]*tmp[1] + (s64)s[9]*tmp[5] + (s64)s[10]*tmp[9] + (s64)s[11]*tmp[13]) >> 12; m[9] = ((s64)s[8]*t[1] + (s64)s[9]*t[5] + (s64)s[10]*t[9] + (s64)s[11]*t[13]) >> 12;
m[10] = ((s64)s[8]*tmp[2] + (s64)s[9]*tmp[6] + (s64)s[10]*tmp[10] + (s64)s[11]*tmp[14]) >> 12; m[10] = ((s64)s[8]*t[2] + (s64)s[9]*t[6] + (s64)s[10]*t[10] + (s64)s[11]*t[14]) >> 12;
m[11] = ((s64)s[8]*tmp[3] + (s64)s[9]*tmp[7] + (s64)s[10]*tmp[11] + (s64)s[11]*tmp[15]) >> 12; m[11] = ((s64)s[8]*t[3] + (s64)s[9]*t[7] + (s64)s[10]*t[11] + (s64)s[11]*t[15]) >> 12;
m[12] = ((s64)s[12]*tmp[0] + (s64)s[13]*tmp[4] + (s64)s[14]*tmp[8] + (s64)s[15]*tmp[12]) >> 12; m[12] = ((s64)s[12]*t[0] + (s64)s[13]*t[4] + (s64)s[14]*t[8] + (s64)s[15]*t[12]) >> 12;
m[13] = ((s64)s[12]*tmp[1] + (s64)s[13]*tmp[5] + (s64)s[14]*tmp[9] + (s64)s[15]*tmp[13]) >> 12; m[13] = ((s64)s[12]*t[1] + (s64)s[13]*t[5] + (s64)s[14]*t[9] + (s64)s[15]*t[13]) >> 12;
m[14] = ((s64)s[12]*tmp[2] + (s64)s[13]*tmp[6] + (s64)s[14]*tmp[10] + (s64)s[15]*tmp[14]) >> 12; m[14] = ((s64)s[12]*t[2] + (s64)s[13]*t[6] + (s64)s[14]*t[10] + (s64)s[15]*t[14]) >> 12;
m[15] = ((s64)s[12]*tmp[3] + (s64)s[13]*tmp[7] + (s64)s[14]*tmp[11] + (s64)s[15]*tmp[15]) >> 12; m[15] = ((s64)s[12]*t[3] + (s64)s[13]*t[7] + (s64)s[14]*t[11] + (s64)s[15]*t[15]) >> 12;
} }
void MatrixMult4x3(s32* m, s32* s) void MatrixMult4x3(s32* m, s32* s)
@ -768,8 +766,7 @@ void UpdateClipMatrix()
if (!ClipMatrixDirty) return; if (!ClipMatrixDirty) return;
ClipMatrixDirty = false; ClipMatrixDirty = false;
memcpy(ClipMatrix, ProjMatrix, 16*4); MatrixMult4x4(ClipMatrix, PosMatrix, ProjMatrix);
MatrixMult4x4(ClipMatrix, PosMatrix);
} }
@ -2268,21 +2265,29 @@ void ExecuteCommand()
case 0x18: // mult 4x4 case 0x18: // mult 4x4
if (MatrixMode == 0) if (MatrixMode == 0)
{ {
MatrixMult4x4(ProjMatrix, (s32*)ExecParams); s32 tmp[16];
memcpy(tmp, ProjMatrix, 16*4);
MatrixMult4x4(ProjMatrix, (s32*)ExecParams, tmp);
ClipMatrixDirty = true; ClipMatrixDirty = true;
AddCycles(35 - 16); AddCycles(35 - 16);
} }
else if (MatrixMode == 3) else if (MatrixMode == 3)
{ {
MatrixMult4x4(TexMatrix, (s32*)ExecParams); s32 tmp[16];
memcpy(tmp, TexMatrix, 16*4);
MatrixMult4x4(TexMatrix, (s32*)ExecParams, tmp);
AddCycles(33 - 16); AddCycles(33 - 16);
} }
else else
{ {
MatrixMult4x4(PosMatrix, (s32*)ExecParams); s32 tmp[16];
memcpy(tmp, PosMatrix, 16*4);
MatrixMult4x4(PosMatrix, (s32*)ExecParams, tmp);
if (MatrixMode == 2) if (MatrixMode == 2)
{ {
MatrixMult4x4(VecMatrix, (s32*)ExecParams); s32 tmp[16];
memcpy(tmp, VecMatrix, 16*4);
MatrixMult4x4(VecMatrix, (s32*)ExecParams, tmp);
AddCycles(35 + 30 - 16); AddCycles(35 + 30 - 16);
} }
else AddCycles(35 - 16); else AddCycles(35 - 16);