From 8e35ec06201e120088e08ef23dbd421e2ba5f643 Mon Sep 17 00:00:00 2001 From: "Dr. Chat" Date: Sat, 16 May 2015 13:31:42 -0500 Subject: [PATCH 1/3] Rewrite FMA mul sub/add to avoid register shuffling Also downgrade some AVX opcodes to SSE --- src/xenia/cpu/backend/x64/x64_sequences.cc | 106 +++++++++++---------- 1 file changed, 54 insertions(+), 52 deletions(-) diff --git a/src/xenia/cpu/backend/x64/x64_sequences.cc b/src/xenia/cpu/backend/x64/x64_sequences.cc index 48271dca6..51954bb9d 100644 --- a/src/xenia/cpu/backend/x64/x64_sequences.cc +++ b/src/xenia/cpu/backend/x64/x64_sequences.cc @@ -3742,27 +3742,30 @@ EMITTER_OPCODE_TABLE( // TODO(benvanik): use other forms (132/213/etc) to avoid register shuffling. // dest could be src2 or src3 - need to ensure it's not before overwriting dest // perhaps use other 132/213/etc +// Forms: +// - 132 -> $1 = $1 * $3 + $2 +// - 213 -> $1 = $2 * $1 + $3 +// - 231 -> $1 = $2 * $3 + $1 EMITTER(MUL_ADD_F32, MATCH(I, F32<>, F32<>, F32<>>)) { static void Emit(X64Emitter& e, const EmitArgType& i) { // FMA extension if (e.IsFeatureEnabled(kX64EmitFMA)) { if (i.dest == i.src1) { e.vfmadd213ss(i.dest, i.src2, i.src3); + } else if (i.dest == i.src2) { + e.vfmadd213ss(i.dest, i.src1, i.src3); + } else if (i.dest == i.src3) { + e.vfmadd231ss(i.dest, i.src1, i.src2); } else { - if (i.dest != i.src2 && i.dest != i.src3) { - e.vmovss(i.dest, i.src1); - e.vfmadd213ss(i.dest, i.src2, i.src3); - } else { - e.vmovss(e.xmm0, i.src1); - e.vfmadd213ss(e.xmm0, i.src2, i.src3); - e.vmovss(i.dest, e.xmm0); - } + // Dest not equal to anything + e.movss(i.dest, i.src1); + e.vfmadd213ss(i.dest, i.src2, i.src3); } } else { // If i.dest == i.src3, back up i.src3 so we don't overwrite it. 
Xmm src3 = i.src3; if (i.dest == i.src3) { - e.vmovss(e.xmm0, i.src3); + e.movss(e.xmm0, i.src3); src3 = e.xmm0; } @@ -3777,21 +3780,20 @@ EMITTER(MUL_ADD_F64, MATCH(I, F64<>, F64<>, F64<>>)) { if (e.IsFeatureEnabled(kX64EmitFMA)) { if (i.dest == i.src1) { e.vfmadd213sd(i.dest, i.src2, i.src3); + } else if (i.dest == i.src2) { + e.vfmadd213sd(i.dest, i.src1, i.src3); + } else if (i.dest == i.src3) { + e.vfmadd231sd(i.dest, i.src1, i.src2); } else { - if (i.dest != i.src2 && i.dest != i.src3) { - e.vmovsd(i.dest, i.src1); - e.vfmadd213sd(i.dest, i.src2, i.src3); - } else { - e.vmovsd(e.xmm0, i.src1); - e.vfmadd213sd(e.xmm0, i.src2, i.src3); - e.vmovsd(i.dest, e.xmm0); - } + // Dest not equal to anything + e.movsd(i.dest, i.src1); + e.vfmadd213sd(i.dest, i.src2, i.src3); } } else { // If i.dest == i.src3, back up i.src3 so we don't overwrite it. Xmm src3 = i.src3; if (i.dest == i.src3) { - e.vmovsd(e.xmm0, i.src3); + e.movsd(e.xmm0, i.src3); src3 = e.xmm0; } @@ -3806,15 +3808,14 @@ EMITTER(MUL_ADD_V128, MATCH(I, V128<>, V128<>, V128<>>)) if (e.IsFeatureEnabled(kX64EmitFMA)) { if (i.dest == i.src1) { e.vfmadd213ps(i.dest, i.src2, i.src3); + } else if (i.dest == i.src2) { + e.vfmadd213ps(i.dest, i.src1, i.src3); + } else if (i.dest == i.src3) { + e.vfmadd231ps(i.dest, i.src1, i.src2); } else { - if (i.dest != i.src2 && i.dest != i.src3) { - e.vmovdqa(i.dest, i.src1); - e.vfmadd213ps(i.dest, i.src2, i.src3); - } else { - e.vmovdqa(e.xmm0, i.src1); - e.vfmadd213ps(e.xmm0, i.src2, i.src3); - e.vmovdqa(i.dest, e.xmm0); - } + // Dest not equal to anything + e.vmovqda(i.dest, i.src1); + e.vfmadd213ps(i.dest, i.src2, i.src3); } } else { // If i.dest == i.src3, back up i.src3 so we don't overwrite it. @@ -3844,27 +3845,30 @@ EMITTER_OPCODE_TABLE( // TODO(benvanik): use other forms (132/213/etc) to avoid register shuffling. 
// dest could be src2 or src3 - need to ensure it's not before overwriting dest // perhaps use other 132/213/etc +// Forms: +// - 132 -> $1 = $1 * $3 - $2 +// - 213 -> $1 = $2 * $1 - $3 +// - 231 -> $1 = $2 * $3 - $1 EMITTER(MUL_SUB_F32, MATCH(I, F32<>, F32<>, F32<>>)) { static void Emit(X64Emitter& e, const EmitArgType& i) { // FMA extension if (e.IsFeatureEnabled(kX64EmitFMA)) { if (i.dest == i.src1) { e.vfmsub213ss(i.dest, i.src2, i.src3); + } else if (i.dest == i.src2) { + e.vfmsub213ss(i.dest, i.src1, i.src3); + } else if (i.dest == i.src3) { + e.vfmsub231ss(i.dest, i.src1, i.src2); } else { - if (i.dest != i.src2 && i.dest != i.src3) { - e.vmovss(i.dest, i.src1); - e.vfmsub213ss(i.dest, i.src2, i.src3); - } else { - e.vmovss(e.xmm0, i.src1); - e.vfmsub213ss(e.xmm0, i.src2, i.src3); - e.vmovss(i.dest, e.xmm0); - } + // Dest not equal to anything + e.movss(i.dest, i.src1); + e.vfmsub213ss(i.dest, i.src2, i.src3); } } else { // If i.dest == i.src3, back up i.src3 so we don't overwrite it. Xmm src3 = i.src3; if (i.dest == i.src3) { - e.vmovss(e.xmm0, i.src3); + e.movss(e.xmm0, i.src3); src3 = e.xmm0; } @@ -3879,21 +3883,20 @@ EMITTER(MUL_SUB_F64, MATCH(I, F64<>, F64<>, F64<>>)) { if (e.IsFeatureEnabled(kX64EmitFMA)) { if (i.dest == i.src1) { e.vfmsub213sd(i.dest, i.src2, i.src3); + } else if (i.dest == i.src2) { + e.vfmsub213sd(i.dest, i.src1, i.src3); + } else if (i.dest == i.src3) { + e.vfmsub231sd(i.dest, i.src1, i.src2); } else { - if (i.dest != i.src2 && i.dest != i.src3) { - e.vmovsd(i.dest, i.src1); - e.vfmsub213sd(i.dest, i.src2, i.src3); - } else { - e.vmovsd(e.xmm0, i.src1); - e.vfmsub213sd(e.xmm0, i.src2, i.src3); - e.vmovsd(i.dest, e.xmm0); - } + // Dest not equal to anything + e.movsd(i.dest, i.src1); + e.vfmsub213sd(i.dest, i.src2, i.src3); } } else { // If i.dest == i.src3, back up i.src3 so we don't overwrite it. 
Xmm src3 = i.src3; if (i.dest == i.src3) { - e.vmovsd(e.xmm0, i.src3); + e.movsd(e.xmm0, i.src3); src3 = e.xmm0; } @@ -3908,15 +3911,14 @@ EMITTER(MUL_SUB_V128, MATCH(I, V128<>, V128<>, V128<>>)) if (e.IsFeatureEnabled(kX64EmitFMA)) { if (i.dest == i.src1) { e.vfmsub213ps(i.dest, i.src2, i.src3); + } else if (i.dest == i.src2) { + e.vfmsub213ps(i.dest, i.src1, i.src3); + } else if (i.dest == i.src3) { + e.vfmsub231ps(i.dest, i.src1, i.src2); } else { - if (i.dest != i.src2 && i.dest != i.src3) { - e.vmovdqa(i.dest, i.src1); - e.vfmsub213ps(i.dest, i.src2, i.src3); - } else { - e.vmovdqa(e.xmm0, i.src1); - e.vfmsub213ps(e.xmm0, i.src2, i.src3); - e.vmovdqa(i.dest, e.xmm0); - } + // Dest not equal to anything + e.vmovdqa(i.dest, i.src1); + e.vfmsub213ps(i.dest, i.src2, i.src3); } } else { // If i.dest == i.src3, back up i.src3 so we don't overwrite it. From 65ff8624950a6531cc7e91d36de2d6dc434e3534 Mon Sep 17 00:00:00 2001 From: "Dr. Chat" Date: Sat, 16 May 2015 13:41:33 -0500 Subject: [PATCH 2/3] Fix vmovqda typo in MUL_ADD_V128 (should be vmovdqa) --- src/xenia/cpu/backend/x64/x64_sequences.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/xenia/cpu/backend/x64/x64_sequences.cc b/src/xenia/cpu/backend/x64/x64_sequences.cc index 51954bb9d..c2a9da6e9 100644 --- a/src/xenia/cpu/backend/x64/x64_sequences.cc +++ b/src/xenia/cpu/backend/x64/x64_sequences.cc @@ -3814,7 +3814,7 @@ EMITTER(MUL_ADD_V128, MATCH(I, V128<>, V128<>, V128<>>)) e.vfmadd231ps(i.dest, i.src1, i.src2); } else { // Dest not equal to anything - e.vmovqda(i.dest, i.src1); + e.vmovdqa(i.dest, i.src1); e.vfmadd213ps(i.dest, i.src2, i.src3); } } else { From 5dbec09818dbf01398ca63e9311b93595d468fb1 Mon Sep 17 00:00:00 2001 From: "Dr. 
Chat" Date: Sat, 16 May 2015 14:10:28 -0500 Subject: [PATCH 3/3] Change SSE opcodes back to AVX (mixing is bad) --- src/xenia/cpu/backend/x64/x64_sequences.cc | 28 +++++++++++----------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/src/xenia/cpu/backend/x64/x64_sequences.cc b/src/xenia/cpu/backend/x64/x64_sequences.cc index c2a9da6e9..fd3c09e98 100644 --- a/src/xenia/cpu/backend/x64/x64_sequences.cc +++ b/src/xenia/cpu/backend/x64/x64_sequences.cc @@ -3758,19 +3758,19 @@ EMITTER(MUL_ADD_F32, MATCH(I, F32<>, F32<>, F32<>>)) { e.vfmadd231ss(i.dest, i.src1, i.src2); } else { // Dest not equal to anything - e.movss(i.dest, i.src1); + e.vmovss(i.dest, i.src1); e.vfmadd213ss(i.dest, i.src2, i.src3); } } else { // If i.dest == i.src3, back up i.src3 so we don't overwrite it. Xmm src3 = i.src3; if (i.dest == i.src3) { - e.movss(e.xmm0, i.src3); + e.vmovss(e.xmm0, i.src3); src3 = e.xmm0; } e.vmulss(i.dest, i.src1, i.src2); // $0 = $1 * $2 - e.addss(i.dest, src3); // $0 = $0 + $1 + e.vaddss(i.dest, i.dest, src3); // $0 = $1 + $2 } } }; @@ -3786,19 +3786,19 @@ EMITTER(MUL_ADD_F64, MATCH(I, F64<>, F64<>, F64<>>)) { e.vfmadd231sd(i.dest, i.src1, i.src2); } else { // Dest not equal to anything - e.movsd(i.dest, i.src1); + e.vmovsd(i.dest, i.src1); e.vfmadd213sd(i.dest, i.src2, i.src3); } } else { // If i.dest == i.src3, back up i.src3 so we don't overwrite it. 
Xmm src3 = i.src3; if (i.dest == i.src3) { - e.movsd(e.xmm0, i.src3); + e.vmovsd(e.xmm0, i.src3); src3 = e.xmm0; } e.vmulsd(i.dest, i.src1, i.src2); // $0 = $1 * $2 - e.addsd(i.dest, src3); // $0 = $0 + $1 + e.vaddsd(i.dest, i.dest, src3); // $0 = $1 + $2 } } }; @@ -3826,7 +3826,7 @@ EMITTER(MUL_ADD_V128, MATCH(I, V128<>, V128<>, V128<>>)) } e.vmulps(i.dest, i.src1, i.src2); // $0 = $1 * $2 - e.addps(i.dest, src3); // $0 = $0 + $1 + e.vaddps(i.dest, i.dest, src3); // $0 = $1 + $2 } } }; @@ -3861,19 +3861,19 @@ EMITTER(MUL_SUB_F32, MATCH(I, F32<>, F32<>, F32<>>)) { e.vfmsub231ss(i.dest, i.src1, i.src2); } else { // Dest not equal to anything - e.movss(i.dest, i.src1); + e.vmovss(i.dest, i.src1); e.vfmsub213ss(i.dest, i.src2, i.src3); } } else { // If i.dest == i.src3, back up i.src3 so we don't overwrite it. Xmm src3 = i.src3; if (i.dest == i.src3) { - e.movss(e.xmm0, i.src3); + e.vmovss(e.xmm0, i.src3); src3 = e.xmm0; } e.vmulss(i.dest, i.src1, i.src2); // $0 = $1 * $2 - e.subss(i.dest, src3); // $0 = $0 - $1 + e.vsubss(i.dest, i.dest, src3); // $0 = $1 - $2 } } }; @@ -3889,19 +3889,19 @@ EMITTER(MUL_SUB_F64, MATCH(I, F64<>, F64<>, F64<>>)) { e.vfmsub231sd(i.dest, i.src1, i.src2); } else { // Dest not equal to anything - e.movsd(i.dest, i.src1); + e.vmovsd(i.dest, i.src1); e.vfmsub213sd(i.dest, i.src2, i.src3); } } else { // If i.dest == i.src3, back up i.src3 so we don't overwrite it. Xmm src3 = i.src3; if (i.dest == i.src3) { - e.movsd(e.xmm0, i.src3); + e.vmovsd(e.xmm0, i.src3); src3 = e.xmm0; } e.vmulsd(i.dest, i.src1, i.src2); // $0 = $1 * $2 - e.subsd(i.dest, src3); // $0 = $0 - $1 + e.vsubsd(i.dest, i.dest, src3); // $0 = $1 - $2 } } }; @@ -3929,7 +3929,7 @@ EMITTER(MUL_SUB_V128, MATCH(I, V128<>, V128<>, V128<>>)) } e.vmulps(i.dest, i.src1, i.src2); // $0 = $1 * $2 - e.subps(i.dest, i.src3); // $0 = $0 - $1 + e.vsubps(i.dest, i.dest, src3); // $0 = $1 - $2 } } };