diff --git a/src/xenia/cpu/backend/x64/x64_emitter.cc b/src/xenia/cpu/backend/x64/x64_emitter.cc index 37d1cdc77..92f45d493 100644 --- a/src/xenia/cpu/backend/x64/x64_emitter.cc +++ b/src/xenia/cpu/backend/x64/x64_emitter.cc @@ -746,6 +746,8 @@ static const vec128_t xmm_consts[] = { /* XMMIntMaxPD */ vec128d(INT_MAX), /* XMMPosIntMinPS */ vec128f((float)0x80000000u), /* XMMQNaN */ vec128i(0x7FC00000u), + /* XMMInt127 */ vec128i(0x7Fu), + /* XMM2To32 */ vec128f(0x1.0p32f), }; // First location to try and place constants. diff --git a/src/xenia/cpu/backend/x64/x64_emitter.h b/src/xenia/cpu/backend/x64/x64_emitter.h index 4f661a331..4a31543b6 100644 --- a/src/xenia/cpu/backend/x64/x64_emitter.h +++ b/src/xenia/cpu/backend/x64/x64_emitter.h @@ -114,6 +114,8 @@ enum XmmConst { XMMIntMaxPD, XMMPosIntMinPS, XMMQNaN, + XMMInt127, + XMM2To32, }; // Unfortunately due to the design of xbyak we have to pass this to the ctor. diff --git a/src/xenia/cpu/backend/x64/x64_seq_vector.cc b/src/xenia/cpu/backend/x64/x64_seq_vector.cc index 5cfb4615c..4c7fb665a 100644 --- a/src/xenia/cpu/backend/x64/x64_seq_vector.cc +++ b/src/xenia/cpu/backend/x64/x64_seq_vector.cc @@ -33,19 +33,41 @@ struct VECTOR_CONVERT_I2F static void Emit(X64Emitter& e, const EmitArgType& i) { // flags = ARITHMETIC_UNSIGNED if (i.instr->flags & ARITHMETIC_UNSIGNED) { - // xmm0 = mask of positive values - e.vpcmpgtd(e.xmm0, i.src1, e.GetXmmConstPtr(XMMFFFF)); + // Round manually to (1.stored mantissa bits * 2^31) or to 2^32 to the + // nearest even (the only rounding mode used on AltiVec) if the number is + // 0x80000000 or greater, instead of converting src & 0x7FFFFFFF and then + // adding 2147483648.0f, which results in double rounding that can give a + // result larger than needed - see OPCODE_VECTOR_CONVERT_I2F notes. - // scale any values >= (unsigned)INT_MIN back to [0, INT_MAX] - e.vpsubd(e.xmm1, i.src1, e.GetXmmConstPtr(XMMSignMaskI32)); - e.vblendvps(e.xmm1, e.xmm1, i.src1, e.xmm0); + // [0x80000000, 0xFFFFFFFF] case: - // xmm1 = [0, INT_MAX] - e.vcvtdq2ps(i.dest, e.xmm1); + // Round to the nearest even, from (0x80000000 | 31 stored mantissa bits) + // to ((-1 << 23) | 23 stored mantissa bits), or to 0 if the result should + // be 4294967296.0f. + // xmm0 = src + 0b01111111 + ((src >> 8) & 1) + // (xmm1 also used to launch reg + mem early and to require it late) + e.vpaddd(e.xmm1, i.src1, e.GetXmmConstPtr(XMMInt127)); + e.vpslld(e.xmm0, i.src1, 31 - 8); + e.vpsrld(e.xmm0, e.xmm0, 31); + e.vpaddd(e.xmm0, e.xmm0, e.xmm1); + // xmm0 = (0xFF800000 | 23 explicit mantissa bits), or 0 if overflowed + e.vpsrad(e.xmm0, e.xmm0, 8); + // Calculate the result for the [0x80000000, 0xFFFFFFFF] case - take the + // rounded mantissa, and add -1 or 0 to the exponent of 32, depending on + // whether the number should be (1.stored mantissa bits * 2^31) or 2^32. + // xmm0 = [0x80000000, 0xFFFFFFFF] case result + e.vpaddd(e.xmm0, e.xmm0, e.GetXmmConstPtr(XMM2To32)); - // scale values back above [INT_MIN, UINT_MAX] - e.vpandn(e.xmm0, e.xmm0, e.GetXmmConstPtr(XMMPosIntMinPS)); - e.vaddps(i.dest, i.dest, e.xmm0); + // [0x00000000, 0x7FFFFFFF] case + // (during vblendvps reg -> vpaddd reg -> vpaddd mem dependency): + + // Convert from signed integer to float. + // xmm1 = [0x00000000, 0x7FFFFFFF] case result + e.vcvtdq2ps(e.xmm1, i.src1); + + // Merge the two ways depending on whether the number is >= 0x80000000 + // (has high bit set). + e.vblendvps(i.dest, e.xmm1, e.xmm0, i.src1); } else { e.vcvtdq2ps(i.dest, i.src1); } diff --git a/src/xenia/cpu/hir/opcodes.h b/src/xenia/cpu/hir/opcodes.h index 488e7e168..1649ec9dc 100644 --- a/src/xenia/cpu/hir/opcodes.h +++ b/src/xenia/cpu/hir/opcodes.h @@ -143,6 +143,55 @@ enum Opcode { OPCODE_TRUNCATE, OPCODE_CONVERT, OPCODE_ROUND, + // Note that 2147483648.0 + (src & 0x7FFFFFFF) is not a correct way of + // performing the uint -> float conversion for large numbers on backends where + // only sint -> float is available. + // + // Take 0b11000000000000000000000101000001 as an example, + // or 1.1000000000000000000000101000001 * 2^31. + // This one has 31 mantissa bits (excluding the implicit 1.), and needs to be + // rounded to 23 bits - 8 mantissa bits need to be dropped: + // 10000000000000000000001_01000001 + // + // Rounding to the nearest even (the only rounding mode that exists on + // AltiVec, and the likely rounding mode in the implementations) should be + // done downwards - 01000001 of 1_01000001 is in [00000000, 01111111]. + // The correct mantissa in this case is: + // 1.10000000000000000000001 * 2^31. + // + // With a two-step conversion, rounding is done twice instead, which gives an + // incorrect result. + // + // First, converting the low 31 bits to float: + // The number is 0.1000000000000000000000101000001 * 2^31. + // Normalizing it, we get 1.000000000000000000000101000001 (30 significand + // bits). + // We need to round 30 bits to 23 - 7 bits need to be dropped: + // 00000000000000000000010_1000001 + // + // Rounding to the nearest even is done upwards in this case - 1000001 of + // 0_1000001 is in [1000001, 1111111]. + // The result of the sint -> float conversion is: + // 1.00000000000000000000011 * 2^30. + // + // Now 2147483648.0 (1 * 2^31) needs to be added. Aligning the exponents, we + // get: + // 0.|10000000000000000000001|1 * 2^31 + // + 1.|00000000000000000000000| * 2^31 + // = 1.|10000000000000000000001|1 * 2^31 + // + // At "infinite precision", the result has 24 significand bits, but only 23 + // can be stored, thus rounding to the nearest even needs to be done. 1_1 is + // (odd + 0.5). 0.5 is ambiguous, thus tie-breaking to the nearest even - + // which is above in this case - is done. The result is: + // 1.10000000000000000000010 * 2^31. + // + // This is incorrect - larger than the correctly rounded result, which is: + // 1.10000000000000000000001 * 2^31. + // + // Test cases checked on real hardware via vcfux: 0xFFFDFF7E, 0xFFFCFF7D - + // should be 0x4F7FFDFF and 0x4F7FFCFF respectively, not 0x4F7FFE00 and + // 0x4F7FFD00. OPCODE_VECTOR_CONVERT_I2F, OPCODE_VECTOR_CONVERT_F2I, OPCODE_LOAD_VECTOR_SHL,