[x64] vcfux single rounding for 0x80000000+
This commit is contained in:
parent
d0b849aad7
commit
5c47a3a588
|
@ -746,6 +746,8 @@ static const vec128_t xmm_consts[] = {
|
|||
/* XMMIntMaxPD */ vec128d(INT_MAX),
|
||||
/* XMMPosIntMinPS */ vec128f((float)0x80000000u),
|
||||
/* XMMQNaN */ vec128i(0x7FC00000u),
|
||||
/* XMMInt127 */ vec128i(0x7Fu),
|
||||
/* XMM2To32 */ vec128f(0x1.0p32f),
|
||||
};
|
||||
|
||||
// First location to try and place constants.
|
||||
|
|
|
@ -114,6 +114,8 @@ enum XmmConst {
|
|||
XMMIntMaxPD,
|
||||
XMMPosIntMinPS,
|
||||
XMMQNaN,
|
||||
XMMInt127,
|
||||
XMM2To32,
|
||||
};
|
||||
|
||||
// Unfortunately due to the design of xbyak we have to pass this to the ctor.
|
||||
|
|
|
@ -33,19 +33,41 @@ struct VECTOR_CONVERT_I2F
|
|||
static void Emit(X64Emitter& e, const EmitArgType& i) {
|
||||
// flags = ARITHMETIC_UNSIGNED
|
||||
if (i.instr->flags & ARITHMETIC_UNSIGNED) {
|
||||
// xmm0 = mask of positive values
|
||||
e.vpcmpgtd(e.xmm0, i.src1, e.GetXmmConstPtr(XMMFFFF));
|
||||
// Round manually to (1.stored mantissa bits * 2^31) or to 2^32 to the
|
||||
// nearest even (the only rounding mode used on AltiVec) if the number is
|
||||
// 0x80000000 or greater, instead of converting src & 0x7FFFFFFF and then
|
||||
// adding 2147483648.0f, which results in double rounding that can give a
|
||||
// result larger than needed - see OPCODE_VECTOR_CONVERT_I2F notes.
|
||||
|
||||
// scale any values >= (unsigned)INT_MIN back to [0, INT_MAX]
|
||||
e.vpsubd(e.xmm1, i.src1, e.GetXmmConstPtr(XMMSignMaskI32));
|
||||
e.vblendvps(e.xmm1, e.xmm1, i.src1, e.xmm0);
|
||||
// [0x80000000, 0xFFFFFFFF] case:
|
||||
|
||||
// xmm1 = [0, INT_MAX]
|
||||
e.vcvtdq2ps(i.dest, e.xmm1);
|
||||
// Round to the nearest even, from (0x80000000 | 31 stored mantissa bits)
|
||||
// to ((-1 << 23) | 23 stored mantissa bits), or to 0 if the result should
|
||||
// be 4294967296.0f.
|
||||
// xmm0 = src + 0b01111111 + ((src >> 8) & 1)
|
||||
// (xmm1 is also used so that the reg + mem add can be issued early while its
// result is only required late)
|
||||
e.vpaddd(e.xmm1, i.src1, e.GetXmmConstPtr(XMMInt127));
|
||||
e.vpslld(e.xmm0, i.src1, 31 - 8);
|
||||
e.vpsrld(e.xmm0, e.xmm0, 31);
|
||||
e.vpaddd(e.xmm0, e.xmm0, e.xmm1);
|
||||
// xmm0 = (0xFF800000 | 23 explicit mantissa bits), or 0 if overflowed
|
||||
e.vpsrad(e.xmm0, e.xmm0, 8);
|
||||
// Calculate the result for the [0x80000000, 0xFFFFFFFF] case - take the
|
||||
// rounded mantissa, and add -1 or 0 to the exponent of 32, depending on
|
||||
// whether the number should be (1.stored mantissa bits * 2^31) or 2^32.
|
||||
// xmm0 = [0x80000000, 0xFFFFFFFF] case result
|
||||
e.vpaddd(e.xmm0, e.xmm0, e.GetXmmConstPtr(XMM2To32));
|
||||
|
||||
// scale values back above [INT_MIN, UINT_MAX]
|
||||
e.vpandn(e.xmm0, e.xmm0, e.GetXmmConstPtr(XMMPosIntMinPS));
|
||||
e.vaddps(i.dest, i.dest, e.xmm0);
|
||||
// [0x00000000, 0x7FFFFFFF] case
|
||||
// (during vblendvps reg -> vpaddd reg -> vpaddd mem dependency):
|
||||
|
||||
// Convert from signed integer to float.
|
||||
// xmm1 = [0x00000000, 0x7FFFFFFF] case result
|
||||
e.vcvtdq2ps(e.xmm1, i.src1);
|
||||
|
||||
// Merge the two ways depending on whether the number is >= 0x80000000
|
||||
// (has high bit set).
|
||||
e.vblendvps(i.dest, e.xmm1, e.xmm0, i.src1);
|
||||
} else {
|
||||
e.vcvtdq2ps(i.dest, i.src1);
|
||||
}
|
||||
|
|
|
@ -143,6 +143,55 @@ enum Opcode {
|
|||
OPCODE_TRUNCATE,
|
||||
OPCODE_CONVERT,
|
||||
OPCODE_ROUND,
|
||||
// Note that 2147483648.0 + (src & 0x7FFFFFFF) is not a correct way of
|
||||
// performing the uint -> float conversion for large numbers on backends where
|
||||
// only sint -> float is available.
|
||||
//
|
||||
// Take 0b11000000000000000000000101000001 as an example,
|
||||
// or 1.1000000000000000000000101000001 * 2^31.
|
||||
// This one has 31 mantissa bits (excluding the implicit 1.), and needs to be
|
||||
// rounded to 23 bits - 8 mantissa bits need to be dropped:
|
||||
// 10000000000000000000001_01000001
|
||||
//
|
||||
// Rounding to the nearest even (the only rounding mode that exists on
|
||||
// AltiVec, and the likely rounding mode in the implementations) should be
|
||||
// done downwards - 01000001 of 1_01000001 is in [00000000, 01111111].
|
||||
// The correct mantissa in this case is:
|
||||
// 1.10000000000000000000001 * 2^31.
|
||||
//
|
||||
// With a two-step conversion, rounding is done twice instead, which gives an
|
||||
// incorrect result.
|
||||
//
|
||||
// First, converting the low 31 bits to float:
|
||||
// The number is 0.1000000000000000000000101000001 * 2^31.
|
||||
// Normalizing it, we get 1.000000000000000000000101000001 (30 significand
|
||||
// bits).
|
||||
// We need to round 30 bits to 23 - 7 bits need to be dropped:
|
||||
// 00000000000000000000010_1000001
|
||||
//
|
||||
// Rounding to the nearest even is done upwards in this case - 1000001 of
|
||||
// 0_1000001 is in [1000001, 1111111].
|
||||
// The result of the sint -> float conversion is:
|
||||
// 1.00000000000000000000011 * 2^30.
|
||||
//
|
||||
// Now 2147483648.0 (1 * 2^31) needs to be added. Aligning the exponents, we
|
||||
// get:
|
||||
// 0.|10000000000000000000001|1 * 2^31
|
||||
// + 1.|00000000000000000000000| * 2^31
|
||||
// = 1.|10000000000000000000001|1 * 2^31
|
||||
//
|
||||
// At "infinite precision", the result has 24 significand bits, but only 23
|
||||
// can be stored, thus rounding to the nearest even needs to be done. 1_1 is
|
||||
// (odd + 0.5). 0.5 is ambiguous, thus tie-breaking to the nearest even -
|
||||
// which is above in this case - is done. The result is:
|
||||
// 1.10000000000000000000010 * 2^31.
|
||||
//
|
||||
// This is incorrect - larger than the correctly rounded result, which is:
|
||||
// 1.10000000000000000000001 * 2^31.
|
||||
//
|
||||
// Test cases checked on real hardware via vcfux: 0xFFFDFF7E, 0xFFFCFF7D -
|
||||
// should be 0x4F7FFDFF and 0x4F7FFCFF respectively, not 0x4F7FFE00 and
|
||||
// 0x4F7FFD00.
|
||||
OPCODE_VECTOR_CONVERT_I2F,
|
||||
OPCODE_VECTOR_CONVERT_F2I,
|
||||
OPCODE_LOAD_VECTOR_SHL,
|
||||
|
|
Loading…
Reference in New Issue