Merge pull request #1166 from Triang3l/altivec_vpkd3d

[CPU] v(u)pkd3d128: Support UINT_2101010 and don't saturate D3DCOLOR
2018-05-31 07:15:35 -05:00 · 2018-05-31 07:15:35 -05:00 · 20bcd3f3c6
parent ec3ab0adbd d61aff4389
commit 20bcd3f3c6
3 changed files with 115 additions and 85 deletions
--- a/src/xenia/cpu/backend/x64/x64_emitter.cc
+++ b/src/xenia/cpu/backend/x64/x64_emitter.cc
@ -626,7 +626,6 @@ static const vec128_t xmm_consts[] = {
    vec128i(0x01000302u, 0x05040706u, 0x09080B0Au, 0x0D0C0F0Eu),
    /* XMMPermuteControl15    */ vec128b(15),
    /* XMMPermuteByteMask     */ vec128b(0x1F),
-    /* XMMPackD3DCOLORSat     */ vec128i(0x404000FFu),
    /* XMMPackD3DCOLOR        */
    vec128i(0xFFFFFFFFu, 0xFFFFFFFFu, 0xFFFFFFFFu, 0x0C000408u),
    /* XMMUnpackD3DCOLOR      */
@ -647,8 +646,19 @@ static const vec128_t xmm_consts[] = {
    vec128i(0xFFFFFFFFu, 0xFFFFFFFFu, 0x01000504u, 0x09080D0Cu),
    /* XMMUnpackSHORT_2       */
    vec128i(0xFFFF0F0Eu, 0xFFFF0D0Cu, 0xFFFFFFFFu, 0xFFFFFFFFu),
+    /* XMMUnpackSHORT_2_Min   */
+    vec128i(0x403F8001u, 0x403F8001u, 0x00000000u, 0x00000000u),
    /* XMMUnpackSHORT_4       */
    vec128i(0xFFFF0B0Au, 0xFFFF0908u, 0xFFFF0F0Eu, 0xFFFF0D0Cu),
+    /* XMMPackUINT_2101010_MinUnpacked */
+    vec128i(0x403FFE01u, 0x403FFE01u, 0x403FFE01u, 0x40400000u),
+    /* XMMPackUINT_2101010_MaxUnpacked */
+    vec128i(0x404001FFu, 0x404001FFu, 0x404001FFu, 0x40400003u),
+    /* XMMPackUINT_2101010_MaskUnpacked */
+    vec128i(0x3FFu, 0x3FFu, 0x3FFu, 0x3u),
+    /* XMMPackUINT_2101010_MaskPacked */
+    vec128i(0x3FFu, 0x3FFu << 10, 0x3FFu << 20, 0x3u << 30),
+    /* XMMPackUINT_2101010_Shift */ vec128i(0, 10, 20, 30),
    /* XMMOneOver255          */ vec128f(1.0f / 255.0f),
    /* XMMMaskEvenPI16        */
    vec128i(0x0000FFFFu, 0x0000FFFFu, 0x0000FFFFu, 0x0000FFFFu),
--- a/src/xenia/cpu/backend/x64/x64_emitter.h
+++ b/src/xenia/cpu/backend/x64/x64_emitter.h
@ -64,7 +64,6 @@ enum XmmConst {
  XMMByteOrderMask,
  XMMPermuteControl15,
  XMMPermuteByteMask,
-  XMMPackD3DCOLORSat,
  XMMPackD3DCOLOR,
  XMMUnpackD3DCOLOR,
  XMMPackFLOAT16_2,
@ -76,7 +75,13 @@ enum XmmConst {
  XMMPackSHORT_2,
  XMMPackSHORT_4,
  XMMUnpackSHORT_2,
+  XMMUnpackSHORT_2_Min,
  XMMUnpackSHORT_4,
+  XMMPackUINT_2101010_MinUnpacked,
+  XMMPackUINT_2101010_MaxUnpacked,
+  XMMPackUINT_2101010_MaskUnpacked,
+  XMMPackUINT_2101010_MaskPacked,
+  XMMPackUINT_2101010_Shift,
  XMMOneOver255,
  XMMMaskEvenPI16,
  XMMShiftMaskEvenPI16,
--- a/src/xenia/cpu/backend/x64/x64_sequences.cc
+++ b/src/xenia/cpu/backend/x64/x64_sequences.cc
@ -6964,20 +6964,24 @@ struct PACK : Sequence<PACK, I<OPCODE_PACK, V128Op, V128Op, V128Op>> {
  }
  static void EmitD3DCOLOR(X64Emitter& e, const EmitArgType& i) {
    assert_true(i.src2.value->IsConstantZero());
-    // Saturate to [3,3....] so that only values between 3...[00] and 3...[FF]
-    // are valid.
-    if (i.src1.is_constant) {
-      e.LoadConstantXmm(i.dest, i.src1.constant());
-      e.vminps(i.dest, i.dest, e.GetXmmConstPtr(XMMPackD3DCOLORSat));
-    } else {
-      e.vminps(i.dest, i.src1, e.GetXmmConstPtr(XMMPackD3DCOLORSat));
-    }
-    e.vmaxps(i.dest, i.dest, e.GetXmmConstPtr(XMM3333));
+    // No saturation done here.
+    // Unpacking D3DCOLOR gives (1.0f | bits), or from 3F800000 to 3F8000FF.
+    // However, you can pack 3.0f + (value / (float) (1 << 22)), which creates
+    // a number between 40400000 and 404000FF:
+    // https://github.com/ValveSoftware/source-sdk-2013/blob/master/sp/src/public/pixelwriter.h#L648
+    // With saturation, you will get 0 when re-packing after unpacking.
+    // The above code also has to perform clamping explicitly.
+
    // Extract bytes.
    // RGBA (XYZW) -> ARGB (WXYZ)
    // w = ((src1.uw & 0xFF) << 24) | ((src1.ux & 0xFF) << 16) |
    //     ((src1.uy & 0xFF) << 8) | (src1.uz & 0xFF)
-    e.vpshufb(i.dest, i.dest, e.GetXmmConstPtr(XMMPackD3DCOLOR));
+    if (i.src1.is_constant) {
+      e.LoadConstantXmm(i.dest, i.src1.constant());
+      e.vpshufb(i.dest, i.dest, e.GetXmmConstPtr(XMMPackD3DCOLOR));
+    } else {
+      e.vpshufb(i.dest, i.src1, e.GetXmmConstPtr(XMMPackD3DCOLOR));
+    }
  }
  static __m128i EmulateFLOAT16_2(void*, __m128 src1) {
    alignas(16) float a[4];
@ -7050,59 +7054,35 @@ struct PACK : Sequence<PACK, I<OPCODE_PACK, V128Op, V128Op, V128Op>> {
    // Pack.
    e.vpshufb(i.dest, i.dest, e.GetXmmConstPtr(XMMPackSHORT_4));
  }
-  static __m128i EmulatePackUINT_2101010(void*, __m128i src1) {
-    // https://www.opengl.org/registry/specs/ARB/vertex_type_2_10_10_10_rev.txt
-    union {
-      alignas(16) int32_t a_i[4];
-      alignas(16) uint32_t a_u[4];
-      alignas(16) float a_f[4];
-    };
-    alignas(16) uint32_t b[4];
-    alignas(16) uint32_t c[4];
-    _mm_store_si128(reinterpret_cast<__m128i*>(a_u), src1);
-    // XYZ are 10 bits, signed and saturated.
-    for (int i = 0; i < 3; ++i) {
-      static const int32_t kMinValueXYZ = 0x403FFE01;  // 3 - 1FF / (1 << 22)
-      static const int32_t kMaxValueXYZ = 0x404001FF;  // 3 + 1FF / (1 << 22)
-      uint32_t exponent = (a_u[i] >> 23) & 0xFF;
-      uint32_t fractional = a_u[i] & 0x007FFFFF;
-      if ((exponent == 0xFF) && fractional) {
-        b[i] = 0x200;
-      } else if (a_i[i] > kMaxValueXYZ) {
-        b[i] = 0x1FF;  // INT_MAX
-      } else if (a_i[i] < kMinValueXYZ) {
-        b[i] = 0x201;  // -INT_MAX
-      } else {
-        b[i] = a_u[i] & 0x3FF;
-      }
-    }
-    // W is 2 bits, unsigned and saturated.
-    static const int32_t kMinValueW = 0x40400000;  // 3
-    static const int32_t kMaxValueW = 0x40400003;  // 3 + 3 / (1 << 22)
-    uint32_t w_exponent = (a_u[3] >> 23) & 0xFF;
-    uint32_t w_fractional = a_u[3] & 0x007FFFFF;
-    if ((w_exponent == 0xFF) && w_fractional) {
-      b[3] = 0x0;
-    } else if (a_i[3] > kMaxValueW) {
-      b[3] = 0x3;
-    } else if (a_i[3] < kMinValueW) {
-      b[3] = 0x0;
-    } else {
-      b[3] = a_u[3] & 0x3;
-    }
-    // Combine in 2101010 WZYX.
-    c[0] = c[1] = c[2] = 0;
-    c[3] = ((b[3] & 0x3) << 30) | ((b[2] & 0x3FF) << 20) |
-           ((b[1] & 0x3FF) << 10) | ((b[0] & 0x3FF));
-    return _mm_load_si128(reinterpret_cast<__m128i*>(c));
-  }
  static void EmitUINT_2101010(X64Emitter& e, const EmitArgType& i) {
-    assert_true(i.src2.value->IsConstantZero());
-    // dest = [(b2(src1.w), b10(src1.z), b10(src1.y), b10(src1.x)), 0, 0, 0]
-    // TODO(benvanik): optimized version.
-    e.lea(e.r8, e.StashXmm(0, i.src1));
-    e.CallNativeSafe(reinterpret_cast<void*>(EmulatePackUINT_2101010));
-    e.vmovaps(i.dest, e.xmm0);
+    // https://www.opengl.org/registry/specs/ARB/vertex_type_2_10_10_10_rev.txt
+    // XYZ are 10 bits, signed and saturated.
+    // W is 2 bits, unsigned and saturated.
+
+    // Saturate.
+    e.vmaxps(i.dest, i.src1, e.GetXmmConstPtr(XMMPackUINT_2101010_MinUnpacked));
+    e.vminps(i.dest, i.dest, e.GetXmmConstPtr(XMMPackUINT_2101010_MaxUnpacked));
+    // Remove the unneeded bits of the floats.
+    e.vpand(i.dest, e.GetXmmConstPtr(XMMPackUINT_2101010_MaskUnpacked));
+    if (e.IsFeatureEnabled(kX64EmitAVX2)) {
+      // Shift the components up.
+      e.vpsllvd(i.dest, i.dest, e.GetXmmConstPtr(XMMPackUINT_2101010_Shift));
+    } else {
+      // Duplicate all the components into bits 10-19.
+      e.vpslld(e.xmm0, i.dest, 10);
+      e.vpor(i.dest, e.xmm0);
+      // Duplicate all the components into bits 20-39
+      // (so alpha will be in 30-31).
+      e.vpslld(e.xmm0, i.dest, 20);
+      e.vpor(i.dest, e.xmm0);
+      // Leave only the needed components.
+      e.vpand(i.dest, e.GetXmmConstPtr(XMMPackUINT_2101010_MaskPacked));
+    }
+    // Combine the components.
+    e.vshufps(e.xmm0, i.dest, i.dest, _MM_SHUFFLE(2, 3, 0, 1));
+    e.vorps(i.dest, e.xmm0);
+    e.vshufps(e.xmm0, i.dest, i.dest, _MM_SHUFFLE(1, 0, 3, 2));
+    e.vorps(i.dest, e.xmm0);
  }
  static __m128i EmulatePack8_IN_16_UN_UN_SAT(void*, __m128i src1,
                                              __m128i src2) {
@ -7329,20 +7309,23 @@ struct UNPACK : Sequence<UNPACK, I<OPCODE_UNPACK, V128Op, V128Op>> {
  }
  static void EmitD3DCOLOR(X64Emitter& e, const EmitArgType& i) {
    // ARGB (WXYZ) -> RGBA (XYZW)
-    // XMLoadColor
+    Xmm src;
    if (i.src1.is_constant) {
      if (i.src1.value->IsConstantZero()) {
        e.vmovaps(i.dest, e.GetXmmConstPtr(XMMOne));
        return;
-      } else {
-        assert_always();
      }
+      src = e.xmm0;
+      e.LoadConstantXmm(src, i.src1.constant());
+    } else {
+      src = i.src1;
    }
    // src = ZZYYXXWW
    // Unpack to 000000ZZ,000000YY,000000XX,000000WW
    e.vpshufb(i.dest, i.src1, e.GetXmmConstPtr(XMMUnpackD3DCOLOR));
    // Add 1.0f to each.
    e.vpor(i.dest, e.GetXmmConstPtr(XMMOne));
+    // To convert to 0 to 1, games multiply by 0x47008081 and add 0xC7008081.
  }
  static __m128 EmulateFLOAT16_2(void*, __m128i src1) {
    alignas(16) uint16_t a[8];
@ -7426,64 +7409,96 @@ struct UNPACK : Sequence<UNPACK, I<OPCODE_UNPACK, V128Op, V128Op>> {
    // (VD.x) = 3.0 + (VB.x>>16)*2^-22
    // (VD.y) = 3.0 + (VB.x)*2^-22
    // (VD.z) = 0.0
-    // (VD.w) = 1.0
-
-    // XMLoadShortN2 plus 3,3,0,3 (for some reason)
+    // (VD.w) = 1.0 (games splat W after unpacking to get vectors of 1.0f)
    // src is (xx,xx,xx,VALUE)
-    // (VALUE,VALUE,VALUE,VALUE)
    Xmm src;
    if (i.src1.is_constant) {
      if (i.src1.value->IsConstantZero()) {
        e.vmovdqa(i.dest, e.GetXmmConstPtr(XMM3301));
        return;
-      } else {
-        // TODO(benvanik): check other common constants/perform shuffle/or here.
-        src = e.xmm0;
-        e.LoadConstantXmm(src, i.src1.constant());
      }
+      // TODO(benvanik): check other common constants/perform shuffle/or here.
+      src = e.xmm0;
+      e.LoadConstantXmm(src, i.src1.constant());
    } else {
      src = i.src1;
    }
    // Shuffle bytes.
    e.vpshufb(i.dest, src, e.GetXmmConstPtr(XMMUnpackSHORT_2));
-    // Sign extend words.
+    // If negative, make smaller than 3 - sign extend before adding.
    e.vpslld(i.dest, 16);
    e.vpsrad(i.dest, 16);
    // Add 3,3,0,1.
    e.vpaddd(i.dest, e.GetXmmConstPtr(XMM3301));
+    // Clamp the absolute value to the maximum positive value.
+    e.vmaxps(i.dest, i.dest, e.GetXmmConstPtr(XMMUnpackSHORT_2_Min));
  }
  static void EmitSHORT_4(X64Emitter& e, const EmitArgType& i) {
    // (VD.x) = 3.0 + (VB.x>>16)*2^-22
    // (VD.y) = 3.0 + (VB.x)*2^-22
    // (VD.z) = 3.0 + (VB.y>>16)*2^-22
    // (VD.w) = 3.0 + (VB.y)*2^-22
-
-    // XMLoadShortN4 plus 3,3,3,3 (for some reason)
    // src is (xx,xx,VALUE,VALUE)
-    // (VALUE,VALUE,VALUE,VALUE)
    Xmm src;
    if (i.src1.is_constant) {
      if (i.src1.value->IsConstantZero()) {
        e.vmovdqa(i.dest, e.GetXmmConstPtr(XMM3333));
        return;
-      } else {
-        // TODO(benvanik): check other common constants/perform shuffle/or here.
-        src = e.xmm0;
-        e.LoadConstantXmm(src, i.src1.constant());
      }
+      // TODO(benvanik): check other common constants/perform shuffle/or here.
+      src = e.xmm0;
+      e.LoadConstantXmm(src, i.src1.constant());
    } else {
      src = i.src1;
    }
    // Shuffle bytes.
    e.vpshufb(i.dest, src, e.GetXmmConstPtr(XMMUnpackSHORT_4));
-    // Sign extend words.
+    // If negative, make smaller than 3 - sign extend before adding.
    e.vpslld(i.dest, 16);
    e.vpsrad(i.dest, 16);
    // Add 3,3,3,3.
    e.vpaddd(i.dest, e.GetXmmConstPtr(XMM3333));
+    // Clamp the absolute value to the maximum positive value.
+    e.vmaxps(i.dest, i.dest, e.GetXmmConstPtr(XMMPackSHORT_Min));
  }
  static void EmitUINT_2101010(X64Emitter& e, const EmitArgType& i) {
-    assert_always("not implemented");
+    Xmm src;
+    if (i.src1.is_constant) {
+      if (i.src1.value->IsConstantZero()) {
+        e.vmovdqa(i.dest, e.GetXmmConstPtr(XMM3333));
+        return;
+      }
+      src = e.xmm0;
+      e.LoadConstantXmm(src, i.src1.constant());
+    } else {
+      src = i.src1;
+    }
+    // Splat W.
+    e.vshufps(i.dest, src, src, _MM_SHUFFLE(3, 3, 3, 3));
+    // Keep only the needed components.
+    // Red in 0-9 now, green in 10-19, blue in 20-29, alpha in 30-31.
+    e.vpand(i.dest, e.GetXmmConstPtr(XMMPackUINT_2101010_MaskPacked));
+    if (e.IsFeatureEnabled(kX64EmitAVX2)) {
+      // Shift the components down.
+      e.vpsrlvd(i.dest, i.dest, e.GetXmmConstPtr(XMMPackUINT_2101010_Shift));
+    } else {
+      // Duplicate green in 0-9 and alpha in 20-21.
+      e.vpsrld(e.xmm0, i.dest, 10);
+      e.vpor(i.dest, e.xmm0);
+      // Duplicate blue in 0-9 and alpha in 0-1.
+      e.vpsrld(e.xmm0, i.dest, 20);
+      e.vpor(i.dest, e.xmm0);
+      // Remove higher duplicate components.
+      e.vpand(i.dest, e.GetXmmConstPtr(XMMPackUINT_2101010_MaskUnpacked));
+    }
+    // If negative, make smaller than 3 - sign extend XYZ before adding.
+    e.vpslld(i.dest, 22);
+    e.vpsrad(i.dest, 22);
+    // Add 3,3,3,3.
+    e.vpaddd(i.dest, e.GetXmmConstPtr(XMM3333));
+    // Clamp the absolute values of XYZ to the maximum positive value.
+    e.vmaxps(i.dest, i.dest, e.GetXmmConstPtr(XMMPackUINT_2101010_MinUnpacked));
+    // To convert XYZ to -1 to 1, games multiply by 0x46004020 & add 0xC6C06030.
  }
  static void Emit8_IN_16(X64Emitter& e, const EmitArgType& i, uint32_t flags) {
    assert_false(IsPackOutSaturate(flags));