[x64] vcfux single rounding for 0x80000000+

2020-12-11 21:20:13 +03:00 · 2020-12-11 21:20:13 +03:00 · 5c47a3a588
parent d0b849aad7
commit 5c47a3a588
4 changed files with 85 additions and 10 deletions
--- a/src/xenia/cpu/backend/x64/x64_emitter.cc
+++ b/src/xenia/cpu/backend/x64/x64_emitter.cc
@ -746,6 +746,8 @@ static const vec128_t xmm_consts[] = {
    /* XMMIntMaxPD            */ vec128d(INT_MAX),
    /* XMMPosIntMinPS         */ vec128f((float)0x80000000u),
    /* XMMQNaN                */ vec128i(0x7FC00000u),
+    /* XMMInt127              */ vec128i(0x7Fu),
+    /* XMM2To32               */ vec128f(0x1.0p32f),
 };

 // First location to try and place constants.
--- a/src/xenia/cpu/backend/x64/x64_emitter.h
+++ b/src/xenia/cpu/backend/x64/x64_emitter.h
@ -114,6 +114,8 @@ enum XmmConst {
  XMMIntMaxPD,
  XMMPosIntMinPS,
  XMMQNaN,
+  XMMInt127,
+  XMM2To32,
 };

 // Unfortunately due to the design of xbyak we have to pass this to the ctor.
--- a/src/xenia/cpu/backend/x64/x64_seq_vector.cc
+++ b/src/xenia/cpu/backend/x64/x64_seq_vector.cc
@ -33,19 +33,41 @@ struct VECTOR_CONVERT_I2F
  static void Emit(X64Emitter& e, const EmitArgType& i) {
    // flags = ARITHMETIC_UNSIGNED
    if (i.instr->flags & ARITHMETIC_UNSIGNED) {
-      // xmm0 = mask of positive values
-      e.vpcmpgtd(e.xmm0, i.src1, e.GetXmmConstPtr(XMMFFFF));
+      // Round manually to (1.stored mantissa bits * 2^31) or to 2^32 to the
+      // nearest even (the only rounding mode used on AltiVec) if the number is
+      // 0x80000000 or greater, instead of converting src & 0x7FFFFFFF and then
+      // adding 2147483648.0f, which results in double rounding that can give a
+      // result larger than needed - see OPCODE_VECTOR_CONVERT_I2F notes.

-      // scale any values >= (unsigned)INT_MIN back to [0, INT_MAX]
-      e.vpsubd(e.xmm1, i.src1, e.GetXmmConstPtr(XMMSignMaskI32));
-      e.vblendvps(e.xmm1, e.xmm1, i.src1, e.xmm0);
+      // [0x80000000, 0xFFFFFFFF] case:

-      // xmm1 = [0, INT_MAX]
-      e.vcvtdq2ps(i.dest, e.xmm1);
+      // Round to the nearest even, from (0x80000000 | 31 stored mantissa bits)
+      // to ((-1 << 23) | 23 stored mantissa bits), or to 0 if the result should
+      // be 4294967296.0f.
+      // xmm0 = src + 0b01111111 + ((src >> 8) & 1)
+      // (xmm1 = src + 127: the reg + mem add is issued early, but needed late)
+      e.vpaddd(e.xmm1, i.src1, e.GetXmmConstPtr(XMMInt127));
+      e.vpslld(e.xmm0, i.src1, 31 - 8);
+      e.vpsrld(e.xmm0, e.xmm0, 31);
+      e.vpaddd(e.xmm0, e.xmm0, e.xmm1);
+      // xmm0 = (0xFF800000 | 23 explicit mantissa bits), or 0 if overflowed
+      e.vpsrad(e.xmm0, e.xmm0, 8);
+      // Calculate the result for the [0x80000000, 0xFFFFFFFF] case - take the
+      // rounded mantissa, and add -1 or 0 to the exponent of 32, depending on
+      // whether the number should be (1.stored mantissa bits * 2^31) or 2^32.
+      // xmm0 = [0x80000000, 0xFFFFFFFF] case result
+      e.vpaddd(e.xmm0, e.xmm0, e.GetXmmConstPtr(XMM2To32));

-      // scale values back above [INT_MIN, UINT_MAX]
-      e.vpandn(e.xmm0, e.xmm0, e.GetXmmConstPtr(XMMPosIntMinPS));
-      e.vaddps(i.dest, i.dest, e.xmm0);
+      // [0x00000000, 0x7FFFFFFF] case
+      // (during vblendvps reg -> vpaddd reg -> vpaddd mem dependency):
+
+      // Convert from signed integer to float.
+      // xmm1 = [0x00000000, 0x7FFFFFFF] case result
+      e.vcvtdq2ps(e.xmm1, i.src1);
+
+      // Merge the two ways depending on whether the number is >= 0x80000000
+      // (has high bit set).
+      e.vblendvps(i.dest, e.xmm1, e.xmm0, i.src1);
    } else {
      e.vcvtdq2ps(i.dest, i.src1);
    }
--- a/src/xenia/cpu/hir/opcodes.h
+++ b/src/xenia/cpu/hir/opcodes.h
@ -143,6 +143,55 @@ enum Opcode {
  OPCODE_TRUNCATE,
  OPCODE_CONVERT,
  OPCODE_ROUND,
+  // Note that 2147483648.0 + (src & 0x7FFFFFFF) is not a correct way of
+  // performing the uint -> float conversion for large numbers on backends where
+  // only sint -> float is available.
+  //
+  // Take 0b11000000000000000000000101000001 as an example,
+  // or 1.1000000000000000000000101000001 * 2^31.
+  // This one has 31 mantissa bits (excluding the implicit 1.), and needs to be
+  // rounded to 23 bits - 8 mantissa bits need to be dropped:
+  // 10000000000000000000001_01000001
+  //
+  // Rounding to the nearest even (the only rounding mode that exists on
+  // AltiVec, and the likely rounding mode in the implementations) should be
+  // done downwards - 01000001 of 1_01000001 is in [00000000, 01111111].
+  // The correct mantissa in this case is:
+  // 1.10000000000000000000001 * 2^31.
+  //
+  // With a two-step conversion, rounding is done twice instead, which gives an
+  // incorrect result.
+  //
+  // First, converting the low 31 bits to float:
+  // The number is 0.1000000000000000000000101000001 * 2^31.
+  // Normalizing it, we get 1.000000000000000000000101000001 * 2^30 (30
+  // mantissa bits, excluding the implicit 1.).
+  // We need to round 30 bits to 23 - 7 bits need to be dropped:
+  // 00000000000000000000010_1000001
+  //
+  // Rounding to the nearest even is done upwards in this case - 1000001 of
+  // 0_1000001 is in [1000001, 1111111].
+  // The result of the sint -> float conversion is:
+  // 1.00000000000000000000011 * 2^30.
+  //
+  // Now 2147483648.0 (1 * 2^31) needs to be added. Aligning the exponents, we
+  // get:
+  //   0.|10000000000000000000001|1 * 2^31
+  // + 1.|00000000000000000000000|  * 2^31
+  // = 1.|10000000000000000000001|1 * 2^31
+  //
+  // At "infinite precision", the result has 24 significand bits, but only 23
+  // can be stored, thus rounding to the nearest even needs to be done. 1_1 is
+  // (odd + 0.5). 0.5 is ambiguous, thus tie-breaking to the nearest even -
+  // which is above in this case - is done. The result is:
+  // 1.10000000000000000000010 * 2^31.
+  //
+  // This is incorrect - larger than the correctly rounded result, which is:
+  // 1.10000000000000000000001 * 2^31.
+  //
+  // Test cases checked on real hardware via vcfux: 0xFFFDFF7E, 0xFFFCFF7D -
+  // should be 0x4F7FFDFF and 0x4F7FFCFF respectively, not 0x4F7FFE00 and
+  // 0x4F7FFD00.
  OPCODE_VECTOR_CONVERT_I2F,
  OPCODE_VECTOR_CONVERT_F2I,
  OPCODE_LOAD_VECTOR_SHL,