From bd9a290b30d1ab1315d2df8049475df8c4eaad55 Mon Sep 17 00:00:00 2001
From: Wunkolo
Date: Sun, 23 Jan 2022 10:25:01 -0800
Subject: [PATCH] [x64] Add `GFNI`-based optimization for `VECTOR_SH{R,L}_V128(Int8)`

In the `Int8` case of `VECTOR_SH{R,L}_V128`, when all of the shift
amounts are the same, a single `gf2p8affineqb` instruction can be
emitted that performs the per-byte bit-shift using GF(2^8) affine
arithmetic. More info here:
https://wunkolo.github.io/post/2020/11/gf2p8affineqb-int8-shifting/

Also fixes the iteration type used when detecting whether all of the
SIMD lanes are the same value (it was iterating `u16` and not `u8`).
---
 src/xenia/cpu/backend/x64/x64_seq_vector.cc | 42 ++++++++++++++++++++-
 1 file changed, 40 insertions(+), 2 deletions(-)

diff --git a/src/xenia/cpu/backend/x64/x64_seq_vector.cc b/src/xenia/cpu/backend/x64/x64_seq_vector.cc
index c428018ba..72761aa6f 100644
--- a/src/xenia/cpu/backend/x64/x64_seq_vector.cc
+++ b/src/xenia/cpu/backend/x64/x64_seq_vector.cc
@@ -731,6 +731,25 @@ struct VECTOR_SHL_V128
   static void EmitInt8(X64Emitter& e, const EmitArgType& i) {
     // TODO(benvanik): native version (with shift magic).
     if (i.src2.is_constant) {
+      if (e.IsFeatureEnabled(kX64EmitGFNI)) {
+        const auto& shamt = i.src2.constant();
+        bool all_same = true;
+        for (size_t n = 0; n < 15; ++n) {
+          if (shamt.u8[n] != shamt.u8[n + 1]) {
+            all_same = false;
+            break;
+          }
+        }
+        if (all_same) {
+          // Every count is the same, so we can use gf2p8affineqb.
+          const uint8_t shift_amount = shamt.u8[0];
+          const uint64_t shift_matrix =
+              0x0102040810204080ull >> (shift_amount * 8);
+          e.vgf2p8affineqb(i.dest, i.src1,
+                           e.StashConstantXmm(0, vec128q(shift_matrix)), 0);
+          return;
+        }
+      }
       e.lea(e.GetNativeParam(1), e.StashConstantXmm(1, i.src2.constant()));
     } else {
       e.lea(e.GetNativeParam(1), e.StashXmm(1, i.src2));
@@ -920,6 +939,25 @@ struct VECTOR_SHR_V128
   static void EmitInt8(X64Emitter& e, const EmitArgType& i) {
     // TODO(benvanik): native version (with shift magic).
     if (i.src2.is_constant) {
+      if (e.IsFeatureEnabled(kX64EmitGFNI)) {
+        const auto& shamt = i.src2.constant();
+        bool all_same = true;
+        for (size_t n = 0; n < 15; ++n) {
+          if (shamt.u8[n] != shamt.u8[n + 1]) {
+            all_same = false;
+            break;
+          }
+        }
+        if (all_same) {
+          // Every count is the same, so we can use gf2p8affineqb.
+          const uint8_t shift_amount = shamt.u8[0];
+          const uint64_t shift_matrix = 0x0102040810204080ull
+                                        << (shift_amount * 8);
+          e.vgf2p8affineqb(i.dest, i.src1,
+                           e.StashConstantXmm(0, vec128q(shift_matrix)), 0);
+          return;
+        }
+      }
       e.lea(e.GetNativeParam(1), e.StashConstantXmm(1, i.src2.constant()));
     } else {
       e.lea(e.GetNativeParam(1), e.StashXmm(1, i.src2));
@@ -1087,8 +1125,8 @@ struct VECTOR_SHA_V128
       if (e.IsFeatureEnabled(kX64EmitGFNI)) {
         const auto& shamt = i.src2.constant();
         bool all_same = true;
-        for (size_t n = 0; n < 8 - n; ++n) {
-          if (shamt.u16[n] != shamt.u16[n + 1]) {
+        for (size_t n = 0; n < 15; ++n) {
+          if (shamt.u8[n] != shamt.u8[n + 1]) {
             all_same = false;
             break;
           }
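
For reviewers unfamiliar with the trick: the matrix constants above are the
identity bit-matrix of gf2p8affineqb (0x0102040810204080) shifted one byte
per shift count, as described in the linked post. Below is a minimal
standalone sketch of the left-shift case. It is not part of the patch: it
uses the C intrinsic _mm_gf2p8affine_epi64_epi8 rather than Xenia's emitter,
and assumes a GFNI-capable CPU plus compiler flags along the lines of
-mgfni -msse4.1.

// Standalone demonstration of the GF(2^8) affine byte-shift trick.
#include <immintrin.h>
#include <cstdint>
#include <cstdio>

int main() {
  const uint8_t shift_amount = 3;  // same shift count for all 16 lanes
  // Shifting the identity matrix right by 8 bits per count turns it into
  // a per-byte left-shift matrix (mirrors the SHL hunk above).
  const uint64_t shift_matrix = 0x0102040810204080ull >> (shift_amount * 8);
  const __m128i matrix = _mm_set1_epi64x((int64_t)shift_matrix);
  const __m128i input = _mm_set1_epi8(0x0F);  // sixteen 0x0F bytes
  // One instruction replaces the unpack/shift/pack dance: each byte is
  // treated as a GF(2) bit-vector and multiplied by the 8x8 bit-matrix.
  const __m128i result = _mm_gf2p8affine_epi64_epi8(input, matrix, 0);
  // 0x0F << 3 == 0x78 in every lane.
  std::printf("lane 0: 0x%02X\n",
              (unsigned)(uint8_t)_mm_extract_epi8(result, 0));
  return 0;
}

The logical right-shift variant differs only in the direction the identity
matrix is shifted (<< instead of >>), exactly as in the SHR hunk.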