[x64] vcfux single rounding for 0x80000000+

This commit is contained in:
Triang3l 2020-12-11 21:20:13 +03:00
parent d0b849aad7
commit 5c47a3a588
4 changed files with 85 additions and 10 deletions

View File

@ -746,6 +746,8 @@ static const vec128_t xmm_consts[] = {
/* XMMIntMaxPD */ vec128d(INT_MAX),
/* XMMPosIntMinPS */ vec128f((float)0x80000000u),
/* XMMQNaN */ vec128i(0x7FC00000u),
/* XMMInt127 */ vec128i(0x7Fu),
/* XMM2To32 */ vec128f(0x1.0p32f),
};
// First location to try and place constants.

View File

@ -114,6 +114,8 @@ enum XmmConst {
XMMIntMaxPD,
XMMPosIntMinPS,
XMMQNaN,
XMMInt127,
XMM2To32,
};
// Unfortunately due to the design of xbyak we have to pass this to the ctor.

View File

@ -33,19 +33,41 @@ struct VECTOR_CONVERT_I2F
static void Emit(X64Emitter& e, const EmitArgType& i) {
// flags = ARITHMETIC_UNSIGNED
if (i.instr->flags & ARITHMETIC_UNSIGNED) {
// xmm0 = mask of positive values
e.vpcmpgtd(e.xmm0, i.src1, e.GetXmmConstPtr(XMMFFFF));
// If the number is 0x80000000 or greater, round it manually - to the nearest
// even (the only rounding mode used on AltiVec) - to either
// (1.stored mantissa bits * 2^31) or 2^32, instead of converting
// src & 0x7FFFFFFF and then adding 2147483648.0f, which results in double
// rounding that can give a result larger than needed - see
// OPCODE_VECTOR_CONVERT_I2F notes.
// scale any values >= (unsigned)INT_MIN back to [0, INT_MAX]
e.vpsubd(e.xmm1, i.src1, e.GetXmmConstPtr(XMMSignMaskI32));
e.vblendvps(e.xmm1, e.xmm1, i.src1, e.xmm0);
// [0x80000000, 0xFFFFFFFF] case:
// xmm1 = [0, INT_MAX]
e.vcvtdq2ps(i.dest, e.xmm1);
// Round to the nearest even, from (0x80000000 | 31 stored mantissa bits)
// to ((-1 << 23) | 23 stored mantissa bits), or to 0 if the result should
// be 4294967296.0f.
// xmm0 = src + 0b01111111 + ((src >> 8) & 1)
// (xmm1 holds the reg + mem addition so that it can be started early, while
// its result is only needed later)
e.vpaddd(e.xmm1, i.src1, e.GetXmmConstPtr(XMMInt127));
e.vpslld(e.xmm0, i.src1, 31 - 8);
e.vpsrld(e.xmm0, e.xmm0, 31);
e.vpaddd(e.xmm0, e.xmm0, e.xmm1);
// xmm0 = (0xFF800000 | 23 explicit mantissa bits), or 0 if overflowed
e.vpsrad(e.xmm0, e.xmm0, 8);
// Calculate the result for the [0x80000000, 0xFFFFFFFF] case - take the
// rounded mantissa, and add -1 or 0 to the exponent of 32, depending on
// whether the number should be (1.stored mantissa bits * 2^31) or 2^32.
// xmm0 = [0x80000000, 0xFFFFFFFF] case result
e.vpaddd(e.xmm0, e.xmm0, e.GetXmmConstPtr(XMM2To32));
// scale values back above [INT_MIN, UINT_MAX]
e.vpandn(e.xmm0, e.xmm0, e.GetXmmConstPtr(XMMPosIntMinPS));
e.vaddps(i.dest, i.dest, e.xmm0);
// [0x00000000, 0x7FFFFFFF] case
// (during vblendvps reg -> vpaddd reg -> vpaddd mem dependency):
// Convert from signed integer to float.
// xmm1 = [0x00000000, 0x7FFFFFFF] case result
e.vcvtdq2ps(e.xmm1, i.src1);
// Merge the two ways depending on whether the number is >= 0x80000000
// (has high bit set).
e.vblendvps(i.dest, e.xmm1, e.xmm0, i.src1);
} else {
e.vcvtdq2ps(i.dest, i.src1);
}

View File

@ -143,6 +143,55 @@ enum Opcode {
OPCODE_TRUNCATE,
OPCODE_CONVERT,
OPCODE_ROUND,
// Note that 2147483648.0 + (src & 0x7FFFFFFF) is not a correct way of
// performing the uint -> float conversion for large numbers on backends where
// only sint -> float is available.
//
// Take 0b11000000000000000000000101000001 as an example,
// or 1.1000000000000000000000101000001 * 2^31.
// This one has 31 mantissa bits (excluding the implicit 1.), and needs to be
// rounded to 23 bits - 8 mantissa bits need to be dropped:
// 10000000000000000000001_01000001
//
// Rounding to the nearest even (the only rounding mode that exists on
// AltiVec, and the likely rounding mode in the implementations) should be
// done downwards - 01000001 of 1_01000001 is in [00000000, 01111111].
// The correct mantissa in this case is:
// 1.10000000000000000000001 * 2^31.
//
// With a two-step conversion, rounding is done twice instead, which gives an
// incorrect result.
//
// First, converting the low 31 bits to float:
// The number is 0.1000000000000000000000101000001 * 2^31.
// Normalizing it, we get 1.000000000000000000000101000001 * 2^30 (30
// significand bits).
// We need to round 30 bits to 23 - 7 bits need to be dropped:
// 00000000000000000000010_1000001
//
// Rounding to the nearest even is done upwards in this case - 1000001 of
// 0_1000001 is in [1000001, 1111111].
// The result of the sint -> float conversion is:
// 1.00000000000000000000011 * 2^30.
//
// Now 2147483648.0 (1 * 2^31) needs to be added. Aligning the exponents, we
// get:
//   0.|10000000000000000000001|1 * 2^31
// + 1.|00000000000000000000000|  * 2^31
// = 1.|10000000000000000000001|1 * 2^31
//
// At "infinite precision", the result has 24 significand bits, but only 23
// can be stored, thus rounding to the nearest even needs to be done. 1_1 is
// (odd + 0.5) - an exact tie, which is broken by rounding to the nearest
// even, and that is the value above in this case. The result is:
// 1.10000000000000000000010 * 2^31.
//
// This is incorrect - larger than the correctly rounded result, which is:
// 1.10000000000000000000001 * 2^31.
//
// Test cases checked on real hardware via vcfux: 0xFFFDFF7E, 0xFFFCFF7D -
// should be 0x4F7FFDFF and 0x4F7FFCFF respectively, not 0x4F7FFE00 and
// 0x4F7FFD00.
OPCODE_VECTOR_CONVERT_I2F,
OPCODE_VECTOR_CONVERT_F2I,
OPCODE_LOAD_VECTOR_SHL,