From 26cc6abf61cc2bf68eaeb89b1497c12330e6b73c Mon Sep 17 00:00:00 2001
From: Christophe LYON
Date: Wed, 19 Oct 2011 16:14:05 +0000
Subject: [PATCH 1/8] rsqrte_f32: No need to copy sign bit.

Indeed, the result is known to be always positive.

Signed-off-by: Christophe Lyon
Signed-off-by: Peter Maydell
---
 target-arm/helper.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/target-arm/helper.c b/target-arm/helper.c
index e2428eb7b2..17ef98b687 100644
--- a/target-arm/helper.c
+++ b/target-arm/helper.c
@@ -3039,8 +3039,7 @@ float32 HELPER(rsqrte_f32)(float32 a, CPUState *env)
 
     val64 = float64_val(f64);
 
-    val = ((val64 >> 63) & 0x80000000)
-        | ((result_exp & 0xff) << 23)
+    val = ((result_exp & 0xff) << 23)
         | ((val64 >> 29) & 0x7fffff);
     return make_float32(val);
 }

From 41e9564df650ddedbdab27283ebcb0173adf024f Mon Sep 17 00:00:00 2001
From: Peter Maydell
Date: Wed, 19 Oct 2011 16:14:05 +0000
Subject: [PATCH 2/8] target-arm: v6 media multiply space: UNDEF on unassigned
 encodings

Clean up the decoding of the v6 media multiply space so that we UNDEF
on unassigned encodings rather than randomly interpreting them as some
instruction in this space.

Signed-off-by: Peter Maydell
---
 target-arm/translate.c | 24 ++++++++++++++++++++----
 1 file changed, 20 insertions(+), 4 deletions(-)

diff --git a/target-arm/translate.c b/target-arm/translate.c
index 75c0ad413a..e99fc183b4 100644
--- a/target-arm/translate.c
+++ b/target-arm/translate.c
@@ -7569,11 +7569,16 @@ static void disas_arm_insn(CPUState * env, DisasContext *s)
             }
             break;
         case 2: /* Multiplies (Type 3).  */
-            tmp = load_reg(s, rm);
-            tmp2 = load_reg(s, rs);
-            if (insn & (1 << 20)) {
+            switch ((insn >> 20) & 0x7) {
+            case 5:
+                if (((insn >> 6) ^ (insn >> 7)) & 1) {
+                    /* op2 not 00x or 11x : UNDEF */
+                    goto illegal_op;
+                }
                 /* Signed multiply most significant [accumulate].
                    (SMMUL, SMMLA, SMMLS) */
+                tmp = load_reg(s, rm);
+                tmp2 = load_reg(s, rs);
                 tmp64 = gen_muls_i64_i32(tmp, tmp2);
 
                 if (rd != 15) {
@@ -7592,7 +7597,15 @@ static void disas_arm_insn(CPUState * env, DisasContext *s)
                 tcg_gen_trunc_i64_i32(tmp, tmp64);
                 tcg_temp_free_i64(tmp64);
                 store_reg(s, rn, tmp);
-            } else {
+                break;
+            case 0:
+            case 4:
+                /* SMLAD, SMUAD, SMLSD, SMUSD, SMLALD, SMLSLD */
+                if (insn & (1 << 7)) {
+                    goto illegal_op;
+                }
+                tmp = load_reg(s, rm);
+                tmp2 = load_reg(s, rs);
                 if (insn & (1 << 5))
                     gen_swap_half(tmp2);
                 gen_smul_dual(tmp, tmp2);
@@ -7625,6 +7638,9 @@ static void disas_arm_insn(CPUState * env, DisasContext *s)
                     }
                     store_reg(s, rn, tmp);
                 }
+                break;
+            default:
+                goto illegal_op;
             }
             break;
         case 3:
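
For quick reference, the decode the patch above establishes for the v6 media
multiply space, keyed on insn bits [22:20] (the value switched on), is:

    bits [22:20]    instructions                  extra decode constraints
    000 (case 0)    SMLAD, SMUAD, SMLSD, SMUSD    insn bit 7 must be 0
    100 (case 4)    SMLALD, SMLSLD                insn bit 7 must be 0
    101 (case 5)    SMMUL, SMMLA, SMMLS           insn bits [7:6] must be 00 or 11
    all others      unassigned: UNDEF (illegal_op)

This table is read directly off the switch cases in the patch; the ARM ARM
remains the authoritative source for the encoding details.
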
From 477899908fd2e6ed72faf18ef5ab09f2ae746a8a Mon Sep 17 00:00:00 2001
From: Peter Maydell
Date: Wed, 19 Oct 2011 16:14:06 +0000
Subject: [PATCH 3/8] target-arm: Rename ARM_FEATURE_DIV to _THUMB_DIV

Rename the ARM_FEATURE_DIV feature bit to _THUMB_DIV, to make room for
a new feature switch enabling DIV in the ARM encoding. (Cores may
implement either (a) no divide insns (b) divide insns in Thumb
encodings only (c) divide insns in both ARM and Thumb encodings.)

Signed-off-by: Peter Maydell
---
 target-arm/cpu.h       | 2 +-
 target-arm/helper.c    | 4 ++--
 target-arm/translate.c | 3 ++-
 3 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/target-arm/cpu.h b/target-arm/cpu.h
index 6ab780d7ef..00e012e540 100644
--- a/target-arm/cpu.h
+++ b/target-arm/cpu.h
@@ -366,7 +366,7 @@ enum arm_features {
     ARM_FEATURE_VFP3,
     ARM_FEATURE_VFP_FP16,
     ARM_FEATURE_NEON,
-    ARM_FEATURE_DIV,
+    ARM_FEATURE_THUMB_DIV, /* divide supported in Thumb encoding */
     ARM_FEATURE_M, /* Microcontroller profile.  */
     ARM_FEATURE_OMAPCP, /* OMAP specific CP15 ops handling.  */
     ARM_FEATURE_THUMB2EE,
diff --git a/target-arm/helper.c b/target-arm/helper.c
index 17ef98b687..faf0283b2d 100644
--- a/target-arm/helper.c
+++ b/target-arm/helper.c
@@ -193,7 +193,7 @@ static void cpu_reset_model_id(CPUARMState *env, uint32_t id)
         set_feature(env, ARM_FEATURE_THUMB2);
         set_feature(env, ARM_FEATURE_V7);
         set_feature(env, ARM_FEATURE_M);
-        set_feature(env, ARM_FEATURE_DIV);
+        set_feature(env, ARM_FEATURE_THUMB_DIV);
         break;
     case ARM_CPUID_ANY: /* For userspace emulation.  */
         set_feature(env, ARM_FEATURE_V4T);
@@ -207,7 +207,7 @@ static void cpu_reset_model_id(CPUARMState *env, uint32_t id)
         set_feature(env, ARM_FEATURE_VFP_FP16);
         set_feature(env, ARM_FEATURE_NEON);
         set_feature(env, ARM_FEATURE_THUMB2EE);
-        set_feature(env, ARM_FEATURE_DIV);
+        set_feature(env, ARM_FEATURE_THUMB_DIV);
         set_feature(env, ARM_FEATURE_V7MP);
         break;
     case ARM_CPUID_TI915T:
diff --git a/target-arm/translate.c b/target-arm/translate.c
index e99fc183b4..deb0bcfba7 100644
--- a/target-arm/translate.c
+++ b/target-arm/translate.c
@@ -8513,8 +8513,9 @@ static int disas_thumb2_insn(CPUState *env, DisasContext *s, uint16_t insn_hw1)
             tmp2 = load_reg(s, rm);
             if ((op & 0x50) == 0x10) {
                 /* sdiv, udiv */
-                if (!arm_feature(env, ARM_FEATURE_DIV))
+                if (!arm_feature(env, ARM_FEATURE_THUMB_DIV)) {
                     goto illegal_op;
+                }
                 if (op & 0x20)
                     gen_helper_udiv(tmp, tmp, tmp2);
                 else

From b8b8ea05c4008343afcedf99b9c91fd750ea90e5 Mon Sep 17 00:00:00 2001
From: Peter Maydell
Date: Wed, 19 Oct 2011 16:14:06 +0000
Subject: [PATCH 4/8] target-arm: Add ARM UDIV/SDIV support

Add support for UDIV and SDIV in ARM mode. This is a new optional
feature for A profile cores (Thumb mode has had UDIV and SDIV for
M profile cores for some time).

Signed-off-by: Peter Maydell
---
 target-arm/cpu.h       |  1 +
 target-arm/helper.c    |  5 ++++-
 target-arm/translate.c | 19 +++++++++++++++++++
 3 files changed, 24 insertions(+), 1 deletion(-)

diff --git a/target-arm/cpu.h b/target-arm/cpu.h
index 00e012e540..af3904d2bb 100644
--- a/target-arm/cpu.h
+++ b/target-arm/cpu.h
@@ -375,6 +375,7 @@ enum arm_features {
     ARM_FEATURE_V5,
     ARM_FEATURE_STRONGARM,
     ARM_FEATURE_VAPA, /* cp15 VA to PA lookups */
+    ARM_FEATURE_ARM_DIV, /* divide supported in ARM encoding */
 };
 
 static inline int arm_feature(CPUARMState *env, int feature)
diff --git a/target-arm/helper.c b/target-arm/helper.c
index faf0283b2d..3a51fd73cf 100644
--- a/target-arm/helper.c
+++ b/target-arm/helper.c
@@ -207,7 +207,7 @@ static void cpu_reset_model_id(CPUARMState *env, uint32_t id)
         set_feature(env, ARM_FEATURE_VFP_FP16);
         set_feature(env, ARM_FEATURE_NEON);
         set_feature(env, ARM_FEATURE_THUMB2EE);
-        set_feature(env, ARM_FEATURE_THUMB_DIV);
+        set_feature(env, ARM_FEATURE_ARM_DIV);
         set_feature(env, ARM_FEATURE_V7MP);
         break;
     case ARM_CPUID_TI915T:
@@ -261,6 +261,9 @@ static void cpu_reset_model_id(CPUARMState *env, uint32_t id)
     if (arm_feature(env, ARM_FEATURE_V7)) {
         set_feature(env, ARM_FEATURE_VAPA);
     }
+    if (arm_feature(env, ARM_FEATURE_ARM_DIV)) {
+        set_feature(env, ARM_FEATURE_THUMB_DIV);
+    }
 }
 
 void cpu_reset(CPUARMState *env)
diff --git a/target-arm/translate.c b/target-arm/translate.c
index deb0bcfba7..812a9e7be4 100644
--- a/target-arm/translate.c
+++ b/target-arm/translate.c
@@ -7639,6 +7639,25 @@ static void disas_arm_insn(CPUState * env, DisasContext *s)
                     store_reg(s, rn, tmp);
                 }
                 break;
+            case 1:
+            case 3:
+                /* SDIV, UDIV */
+                if (!arm_feature(env, ARM_FEATURE_ARM_DIV)) {
+                    goto illegal_op;
+                }
+                if (((insn >> 5) & 7) || (rd != 15)) {
+                    goto illegal_op;
+                }
+                tmp = load_reg(s, rm);
+                tmp2 = load_reg(s, rs);
+                if (insn & (1 << 21)) {
+                    gen_helper_udiv(tmp, tmp, tmp2);
+                } else {
+                    gen_helper_sdiv(tmp, tmp, tmp2);
+                }
+                tcg_temp_free_i32(tmp2);
+                store_reg(s, rn, tmp);
+                break;
             default:
                 goto illegal_op;
             }
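
The new ARM-mode decode reuses gen_helper_udiv and gen_helper_sdiv, the same
helpers the Thumb decoder already calls (visible as context in patch 3). As a
reminder of the architectural semantics those helpers implement, here is a
minimal sketch in plain C; the function names and exact shape are illustrative
assumptions, so see target-arm/helper.c for the real definitions:

    #include <stdint.h>

    /* ARM UDIV/SDIV never trap: dividing by zero yields 0, and the one
     * signed case that overflows (INT32_MIN / -1) wraps to INT32_MIN.
     */
    static uint32_t do_udiv(uint32_t num, uint32_t den)
    {
        return den ? num / den : 0;
    }

    static int32_t do_sdiv(int32_t num, int32_t den)
    {
        if (den == 0) {
            return 0;
        }
        if (num == INT32_MIN && den == -1) {
            return INT32_MIN;
        }
        return num / den;
    }
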
From 369be8f618ac145d2421ea4bfff86ee774ad618c Mon Sep 17 00:00:00 2001
From: Peter Maydell
Date: Wed, 19 Oct 2011 16:14:06 +0000
Subject: [PATCH 5/8] softfloat: Implement fused multiply-add

Implement fused multiply-add as a softfloat primitive. This implements
"a+b*c" as a single step without any intermediate rounding; it is
specified in IEEE 754-2008 and implemented in a number of CPUs.

Signed-off-by: Peter Maydell
---
 fpu/softfloat-specialize.h | 178 ++++++++++++++++
 fpu/softfloat.c            | 427 +++++++++++++++++++++++++++++++++++++
 fpu/softfloat.h            |  14 ++
 3 files changed, 619 insertions(+)

diff --git a/fpu/softfloat-specialize.h b/fpu/softfloat-specialize.h
index c165205a49..c5e2dab9f6 100644
--- a/fpu/softfloat-specialize.h
+++ b/fpu/softfloat-specialize.h
@@ -419,6 +419,82 @@ static int pickNaN(flag aIsQNaN, flag aIsSNaN, flag bIsQNaN, flag bIsSNaN,
 }
 #endif
 
+/*----------------------------------------------------------------------------
+| Select which NaN to propagate for a three-input operation.
+| For the moment we assume that no CPU needs the 'larger significand'
+| information.
+| Return values : 0 : a; 1 : b; 2 : c; 3 : default-NaN
+*----------------------------------------------------------------------------*/
+#if defined(TARGET_ARM)
+static int pickNaNMulAdd(flag aIsQNaN, flag aIsSNaN, flag bIsQNaN, flag bIsSNaN,
+                         flag cIsQNaN, flag cIsSNaN, flag infzero STATUS_PARAM)
+{
+    /* For ARM, the (inf,zero,qnan) case sets InvalidOp and returns
+     * the default NaN
+     */
+    if (infzero && cIsQNaN) {
+        float_raise(float_flag_invalid STATUS_VAR);
+        return 3;
+    }
+
+    /* This looks different from the ARM ARM pseudocode, because the ARM ARM
+     * puts the operands to a fused mac operation (a*b)+c in the order c,a,b.
+     */
+    if (cIsSNaN) {
+        return 2;
+    } else if (aIsSNaN) {
+        return 0;
+    } else if (bIsSNaN) {
+        return 1;
+    } else if (cIsQNaN) {
+        return 2;
+    } else if (aIsQNaN) {
+        return 0;
+    } else {
+        return 1;
+    }
+}
+#elif defined(TARGET_PPC)
+static int pickNaNMulAdd(flag aIsQNaN, flag aIsSNaN, flag bIsQNaN, flag bIsSNaN,
+                         flag cIsQNaN, flag cIsSNaN, flag infzero STATUS_PARAM)
+{
+    /* For PPC, the (inf,zero,qnan) case sets InvalidOp, but we prefer
+     * to return an input NaN if we have one (ie c) rather than generating
+     * a default NaN
+     */
+    if (infzero) {
+        float_raise(float_flag_invalid STATUS_VAR);
+        return 2;
+    }
+
+    /* If fRA is a NaN return it; otherwise if fRB is a NaN return it;
+     * otherwise return fRC. Note that muladd on PPC is (fRA * fRC) + frB
+     */
+    if (aIsSNaN || aIsQNaN) {
+        return 0;
+    } else if (cIsSNaN || cIsQNaN) {
+        return 2;
+    } else {
+        return 1;
+    }
+}
+#else
+/* A default implementation: prefer a to b to c.
+ * This is unlikely to actually match any real implementation.
+ */
+static int pickNaNMulAdd(flag aIsQNaN, flag aIsSNaN, flag bIsQNaN, flag bIsSNaN,
+                         flag cIsQNaN, flag cIsSNaN, flag infzero STATUS_PARAM)
+{
+    if (aIsSNaN || aIsQNaN) {
+        return 0;
+    } else if (bIsSNaN || bIsQNaN) {
+        return 1;
+    } else {
+        return 2;
+    }
+}
+#endif
+
 /*----------------------------------------------------------------------------
 | Takes two single-precision floating-point values `a' and `b', one of which
 | is a NaN, and returns the appropriate NaN result. If either `a' or `b' is a
@@ -459,6 +535,57 @@ static float32 propagateFloat32NaN( float32 a, float32 b STATUS_PARAM)
     }
 }
 
+/*----------------------------------------------------------------------------
+| Takes three single-precision floating-point values `a', `b' and `c', one of
+| which is a NaN, and returns the appropriate NaN result. If any of `a',
+| `b' or `c' is a signaling NaN, the invalid exception is raised.
+| The input infzero indicates whether a*b was 0*inf or inf*0 (in which case
+| obviously c is a NaN, and whether to propagate c or some other NaN is
+| implementation defined).
+*----------------------------------------------------------------------------*/
+
+static float32 propagateFloat32MulAddNaN(float32 a, float32 b,
+                                         float32 c, flag infzero STATUS_PARAM)
+{
+    flag aIsQuietNaN, aIsSignalingNaN, bIsQuietNaN, bIsSignalingNaN,
+        cIsQuietNaN, cIsSignalingNaN;
+    int which;
+
+    aIsQuietNaN = float32_is_quiet_nan(a);
+    aIsSignalingNaN = float32_is_signaling_nan(a);
+    bIsQuietNaN = float32_is_quiet_nan(b);
+    bIsSignalingNaN = float32_is_signaling_nan(b);
+    cIsQuietNaN = float32_is_quiet_nan(c);
+    cIsSignalingNaN = float32_is_signaling_nan(c);
+
+    if (aIsSignalingNaN | bIsSignalingNaN | cIsSignalingNaN) {
+        float_raise(float_flag_invalid STATUS_VAR);
+    }
+
+    which = pickNaNMulAdd(aIsQuietNaN, aIsSignalingNaN,
+                          bIsQuietNaN, bIsSignalingNaN,
+                          cIsQuietNaN, cIsSignalingNaN, infzero STATUS_VAR);
+
+    if (STATUS(default_nan_mode)) {
+        /* Note that this check is after pickNaNMulAdd so that function
+         * has an opportunity to set the Invalid flag.
+         */
+        return float32_default_nan;
+    }
+
+    switch (which) {
+    case 0:
+        return float32_maybe_silence_nan(a);
+    case 1:
+        return float32_maybe_silence_nan(b);
+    case 2:
+        return float32_maybe_silence_nan(c);
+    case 3:
+    default:
+        return float32_default_nan;
+    }
+}
+
 /*----------------------------------------------------------------------------
 | Returns 1 if the double-precision floating-point value `a' is a quiet
 | NaN; otherwise returns 0.
@@ -595,6 +722,57 @@ static float64 propagateFloat64NaN( float64 a, float64 b STATUS_PARAM)
     }
 }
 
+/*----------------------------------------------------------------------------
+| Takes three double-precision floating-point values `a', `b' and `c', one of
+| which is a NaN, and returns the appropriate NaN result. If any of `a',
+| `b' or `c' is a signaling NaN, the invalid exception is raised.
+| The input infzero indicates whether a*b was 0*inf or inf*0 (in which case
+| obviously c is a NaN, and whether to propagate c or some other NaN is
+| implementation defined).
+*----------------------------------------------------------------------------*/
+
+static float64 propagateFloat64MulAddNaN(float64 a, float64 b,
+                                         float64 c, flag infzero STATUS_PARAM)
+{
+    flag aIsQuietNaN, aIsSignalingNaN, bIsQuietNaN, bIsSignalingNaN,
+        cIsQuietNaN, cIsSignalingNaN;
+    int which;
+
+    aIsQuietNaN = float64_is_quiet_nan(a);
+    aIsSignalingNaN = float64_is_signaling_nan(a);
+    bIsQuietNaN = float64_is_quiet_nan(b);
+    bIsSignalingNaN = float64_is_signaling_nan(b);
+    cIsQuietNaN = float64_is_quiet_nan(c);
+    cIsSignalingNaN = float64_is_signaling_nan(c);
+
+    if (aIsSignalingNaN | bIsSignalingNaN | cIsSignalingNaN) {
+        float_raise(float_flag_invalid STATUS_VAR);
+    }
+
+    which = pickNaNMulAdd(aIsQuietNaN, aIsSignalingNaN,
+                          bIsQuietNaN, bIsSignalingNaN,
+                          cIsQuietNaN, cIsSignalingNaN, infzero STATUS_VAR);
+
+    if (STATUS(default_nan_mode)) {
+        /* Note that this check is after pickNaNMulAdd so that function
+         * has an opportunity to set the Invalid flag.
+         */
+        return float64_default_nan;
+    }
+
+    switch (which) {
+    case 0:
+        return float64_maybe_silence_nan(a);
+    case 1:
+        return float64_maybe_silence_nan(b);
+    case 2:
+        return float64_maybe_silence_nan(c);
+    case 3:
+    default:
+        return float64_default_nan;
+    }
+}
+
 /*----------------------------------------------------------------------------
 | Returns 1 if the extended double-precision floating-point value `a' is a
 | quiet NaN; otherwise returns 0. This slightly differs from the same
diff --git a/fpu/softfloat.c b/fpu/softfloat.c
index 3aafa81d58..81a7d1ae09 100644
--- a/fpu/softfloat.c
+++ b/fpu/softfloat.c
@@ -2117,6 +2117,213 @@ float32 float32_rem( float32 a, float32 b STATUS_PARAM )
 }
 
 
+/*----------------------------------------------------------------------------
+| Returns the result of multiplying the single-precision floating-point values
+| `a' and `b' then adding 'c', with no intermediate rounding step after the
+| multiplication. The operation is performed according to the IEC/IEEE
+| Standard for Binary Floating-Point Arithmetic 754-2008.
+| The flags argument allows the caller to select negation of the
+| addend, the intermediate product, or the final result. (The difference
+| between this and having the caller do a separate negation is that negating
+| externally will flip the sign bit on NaNs.)
+*----------------------------------------------------------------------------*/
+
+float32 float32_muladd(float32 a, float32 b, float32 c, int flags STATUS_PARAM)
+{
+    flag aSign, bSign, cSign, zSign;
+    int aExp, bExp, cExp, pExp, zExp, expDiff;
+    uint32_t aSig, bSig, cSig;
+    flag pInf, pZero, pSign;
+    uint64_t pSig64, cSig64, zSig64;
+    uint32_t pSig;
+    int shiftcount;
+    flag signflip, infzero;
+
+    a = float32_squash_input_denormal(a STATUS_VAR);
+    b = float32_squash_input_denormal(b STATUS_VAR);
+    c = float32_squash_input_denormal(c STATUS_VAR);
+    aSig = extractFloat32Frac(a);
+    aExp = extractFloat32Exp(a);
+    aSign = extractFloat32Sign(a);
+    bSig = extractFloat32Frac(b);
+    bExp = extractFloat32Exp(b);
+    bSign = extractFloat32Sign(b);
+    cSig = extractFloat32Frac(c);
+    cExp = extractFloat32Exp(c);
+    cSign = extractFloat32Sign(c);
+
+    infzero = ((aExp == 0 && aSig == 0 && bExp == 0xff && bSig == 0) ||
+               (aExp == 0xff && aSig == 0 && bExp == 0 && bSig == 0));
+
+    /* It is implementation-defined whether the cases of (0,inf,qnan)
+     * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
+     * they return if they do), so we have to hand this information
+     * off to the target-specific pick-a-NaN routine.
+     */
+    if (((aExp == 0xff) && aSig) ||
+        ((bExp == 0xff) && bSig) ||
+        ((cExp == 0xff) && cSig)) {
+        return propagateFloat32MulAddNaN(a, b, c, infzero STATUS_VAR);
+    }
+
+    if (infzero) {
+        float_raise(float_flag_invalid STATUS_VAR);
+        return float32_default_nan;
+    }
+
+    if (flags & float_muladd_negate_c) {
+        cSign ^= 1;
+    }
+
+    signflip = (flags & float_muladd_negate_result) ? 1 : 0;
+
+    /* Work out the sign and type of the product */
+    pSign = aSign ^ bSign;
+    if (flags & float_muladd_negate_product) {
+        pSign ^= 1;
+    }
+    pInf = (aExp == 0xff) || (bExp == 0xff);
+    pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);
+
+    if (cExp == 0xff) {
+        if (pInf && (pSign ^ cSign)) {
+            /* addition of opposite-signed infinities => InvalidOperation */
+            float_raise(float_flag_invalid STATUS_VAR);
+            return float32_default_nan;
+        }
+        /* Otherwise generate an infinity of the same sign */
+        return packFloat32(cSign ^ signflip, 0xff, 0);
+    }
+
+    if (pInf) {
+        return packFloat32(pSign ^ signflip, 0xff, 0);
+    }
+
+    if (pZero) {
+        if (cExp == 0) {
+            if (cSig == 0) {
+                /* Adding two exact zeroes */
+                if (pSign == cSign) {
+                    zSign = pSign;
+                } else if (STATUS(float_rounding_mode) == float_round_down) {
+                    zSign = 1;
+                } else {
+                    zSign = 0;
+                }
+                return packFloat32(zSign ^ signflip, 0, 0);
+            }
+            /* Exact zero plus a denorm */
+            if (STATUS(flush_to_zero)) {
+                float_raise(float_flag_output_denormal STATUS_VAR);
+                return packFloat32(cSign ^ signflip, 0, 0);
+            }
+        }
+        /* Zero plus something non-zero : just return the something */
+        return c ^ (signflip << 31);
+    }
+
+    if (aExp == 0) {
+        normalizeFloat32Subnormal(aSig, &aExp, &aSig);
+    }
+    if (bExp == 0) {
+        normalizeFloat32Subnormal(bSig, &bExp, &bSig);
+    }
+
+    /* Calculate the actual result a * b + c */
+
+    /* Multiply first; this is easy. */
+    /* NB: we subtract 0x7e where float32_mul() subtracts 0x7f
+     * because we want the true exponent, not the "one-less-than"
+     * flavour that roundAndPackFloat32() takes.
+     */
+    pExp = aExp + bExp - 0x7e;
+    aSig = (aSig | 0x00800000) << 7;
+    bSig = (bSig | 0x00800000) << 8;
+    pSig64 = (uint64_t)aSig * bSig;
+    if ((int64_t)(pSig64 << 1) >= 0) {
+        pSig64 <<= 1;
+        pExp--;
+    }
+
+    zSign = pSign ^ signflip;
+
+    /* Now pSig64 is the significand of the multiply, with the explicit bit in
+     * position 62.
+     */
+    if (cExp == 0) {
+        if (!cSig) {
+            /* Throw out the special case of c being an exact zero now */
+            shift64RightJamming(pSig64, 32, &pSig64);
+            pSig = pSig64;
+            return roundAndPackFloat32(zSign, pExp - 1,
+                                       pSig STATUS_VAR);
+        }
+        normalizeFloat32Subnormal(cSig, &cExp, &cSig);
+    }
+
+    cSig64 = (uint64_t)cSig << (62 - 23);
+    cSig64 |= LIT64(0x4000000000000000);
+    expDiff = pExp - cExp;
+
+    if (pSign == cSign) {
+        /* Addition */
+        if (expDiff > 0) {
+            /* scale c to match p */
+            shift64RightJamming(cSig64, expDiff, &cSig64);
+            zExp = pExp;
+        } else if (expDiff < 0) {
+            /* scale p to match c */
+            shift64RightJamming(pSig64, -expDiff, &pSig64);
+            zExp = cExp;
+        } else {
+            /* no scaling needed */
+            zExp = cExp;
+        }
+        /* Add significands and make sure explicit bit ends up in posn 62 */
+        zSig64 = pSig64 + cSig64;
+        if ((int64_t)zSig64 < 0) {
+            shift64RightJamming(zSig64, 1, &zSig64);
+        } else {
+            zExp--;
+        }
+    } else {
+        /* Subtraction */
+        if (expDiff > 0) {
+            shift64RightJamming(cSig64, expDiff, &cSig64);
+            zSig64 = pSig64 - cSig64;
+            zExp = pExp;
+        } else if (expDiff < 0) {
+            shift64RightJamming(pSig64, -expDiff, &pSig64);
+            zSig64 = cSig64 - pSig64;
+            zExp = cExp;
+            zSign ^= 1;
+        } else {
+            zExp = pExp;
+            if (cSig64 < pSig64) {
+                zSig64 = pSig64 - cSig64;
+            } else if (pSig64 < cSig64) {
+                zSig64 = cSig64 - pSig64;
+                zSign ^= 1;
+            } else {
+                /* Exact zero */
+                zSign = signflip;
+                if (STATUS(float_rounding_mode) == float_round_down) {
+                    zSign ^= 1;
+                }
+                return packFloat32(zSign, 0, 0);
+            }
+        }
+        --zExp;
+        /* Normalize to put the explicit bit back into bit 62. */
+        shiftcount = countLeadingZeros64(zSig64) - 1;
+        zSig64 <<= shiftcount;
+        zExp -= shiftcount;
+    }
+    shift64RightJamming(zSig64, 32, &zSig64);
+    return roundAndPackFloat32(zSign, zExp, zSig64 STATUS_VAR);
+}
+
+
 /*----------------------------------------------------------------------------
 | Returns the square root of the single-precision floating-point value `a'.
 | The operation is performed according to the IEC/IEEE Standard for Binary
@@ -3464,6 +3671,226 @@ float64 float64_rem( float64 a, float64 b STATUS_PARAM )
 }
 
 
+/*----------------------------------------------------------------------------
+| Returns the result of multiplying the double-precision floating-point values
+| `a' and `b' then adding 'c', with no intermediate rounding step after the
+| multiplication. The operation is performed according to the IEC/IEEE
+| Standard for Binary Floating-Point Arithmetic 754-2008.
+| The flags argument allows the caller to select negation of the
+| addend, the intermediate product, or the final result. (The difference
+| between this and having the caller do a separate negation is that negating
+| externally will flip the sign bit on NaNs.)
+*----------------------------------------------------------------------------*/
+
+float64 float64_muladd(float64 a, float64 b, float64 c, int flags STATUS_PARAM)
+{
+    flag aSign, bSign, cSign, zSign;
+    int aExp, bExp, cExp, pExp, zExp, expDiff;
+    uint64_t aSig, bSig, cSig;
+    flag pInf, pZero, pSign;
+    uint64_t pSig0, pSig1, cSig0, cSig1, zSig0, zSig1;
+    int shiftcount;
+    flag signflip, infzero;
+
+    a = float64_squash_input_denormal(a STATUS_VAR);
+    b = float64_squash_input_denormal(b STATUS_VAR);
+    c = float64_squash_input_denormal(c STATUS_VAR);
+    aSig = extractFloat64Frac(a);
+    aExp = extractFloat64Exp(a);
+    aSign = extractFloat64Sign(a);
+    bSig = extractFloat64Frac(b);
+    bExp = extractFloat64Exp(b);
+    bSign = extractFloat64Sign(b);
+    cSig = extractFloat64Frac(c);
+    cExp = extractFloat64Exp(c);
+    cSign = extractFloat64Sign(c);
+
+    infzero = ((aExp == 0 && aSig == 0 && bExp == 0x7ff && bSig == 0) ||
+               (aExp == 0x7ff && aSig == 0 && bExp == 0 && bSig == 0));
+
+    /* It is implementation-defined whether the cases of (0,inf,qnan)
+     * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
+     * they return if they do), so we have to hand this information
+     * off to the target-specific pick-a-NaN routine.
+     */
+    if (((aExp == 0x7ff) && aSig) ||
+        ((bExp == 0x7ff) && bSig) ||
+        ((cExp == 0x7ff) && cSig)) {
+        return propagateFloat64MulAddNaN(a, b, c, infzero STATUS_VAR);
+    }
+
+    if (infzero) {
+        float_raise(float_flag_invalid STATUS_VAR);
+        return float64_default_nan;
+    }
+
+    if (flags & float_muladd_negate_c) {
+        cSign ^= 1;
+    }
+
+    signflip = (flags & float_muladd_negate_result) ? 1 : 0;
+
+    /* Work out the sign and type of the product */
+    pSign = aSign ^ bSign;
+    if (flags & float_muladd_negate_product) {
+        pSign ^= 1;
+    }
+    pInf = (aExp == 0x7ff) || (bExp == 0x7ff);
+    pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);
+
+    if (cExp == 0x7ff) {
+        if (pInf && (pSign ^ cSign)) {
+            /* addition of opposite-signed infinities => InvalidOperation */
+            float_raise(float_flag_invalid STATUS_VAR);
+            return float64_default_nan;
+        }
+        /* Otherwise generate an infinity of the same sign */
+        return packFloat64(cSign ^ signflip, 0x7ff, 0);
+    }
+
+    if (pInf) {
+        return packFloat64(pSign ^ signflip, 0x7ff, 0);
+    }
+
+    if (pZero) {
+        if (cExp == 0) {
+            if (cSig == 0) {
+                /* Adding two exact zeroes */
+                if (pSign == cSign) {
+                    zSign = pSign;
+                } else if (STATUS(float_rounding_mode) == float_round_down) {
+                    zSign = 1;
+                } else {
+                    zSign = 0;
+                }
+                return packFloat64(zSign ^ signflip, 0, 0);
+            }
+            /* Exact zero plus a denorm */
+            if (STATUS(flush_to_zero)) {
+                float_raise(float_flag_output_denormal STATUS_VAR);
+                return packFloat64(cSign ^ signflip, 0, 0);
+            }
+        }
+        /* Zero plus something non-zero : just return the something */
+        return c ^ ((uint64_t)signflip << 63);
+    }
+
+    if (aExp == 0) {
+        normalizeFloat64Subnormal(aSig, &aExp, &aSig);
+    }
+    if (bExp == 0) {
+        normalizeFloat64Subnormal(bSig, &bExp, &bSig);
+    }
+
+    /* Calculate the actual result a * b + c */
+
+    /* Multiply first; this is easy. */
+    /* NB: we subtract 0x3fe where float64_mul() subtracts 0x3ff
+     * because we want the true exponent, not the "one-less-than"
+     * flavour that roundAndPackFloat64() takes.
+     */
+    pExp = aExp + bExp - 0x3fe;
+    aSig = (aSig | LIT64(0x0010000000000000))<<10;
+    bSig = (bSig | LIT64(0x0010000000000000))<<11;
+    mul64To128(aSig, bSig, &pSig0, &pSig1);
+    if ((int64_t)(pSig0 << 1) >= 0) {
+        shortShift128Left(pSig0, pSig1, 1, &pSig0, &pSig1);
+        pExp--;
+    }
+
+    zSign = pSign ^ signflip;
+
+    /* Now [pSig0:pSig1] is the significand of the multiply, with the explicit
+     * bit in position 126.
+     */
+    if (cExp == 0) {
+        if (!cSig) {
+            /* Throw out the special case of c being an exact zero now */
+            shift128RightJamming(pSig0, pSig1, 64, &pSig0, &pSig1);
+            return roundAndPackFloat64(zSign, pExp - 1,
+                                       pSig1 STATUS_VAR);
+        }
+        normalizeFloat64Subnormal(cSig, &cExp, &cSig);
+    }
+
+    /* Shift cSig and add the explicit bit so [cSig0:cSig1] is the
+     * significand of the addend, with the explicit bit in position 126.
+     */
+    cSig0 = cSig << (126 - 64 - 52);
+    cSig1 = 0;
+    cSig0 |= LIT64(0x4000000000000000);
+    expDiff = pExp - cExp;
+
+    if (pSign == cSign) {
+        /* Addition */
+        if (expDiff > 0) {
+            /* scale c to match p */
+            shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);
+            zExp = pExp;
+        } else if (expDiff < 0) {
+            /* scale p to match c */
+            shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);
+            zExp = cExp;
+        } else {
+            /* no scaling needed */
+            zExp = cExp;
+        }
+        /* Add significands and make sure explicit bit ends up in posn 126 */
+        add128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
+        if ((int64_t)zSig0 < 0) {
+            shift128RightJamming(zSig0, zSig1, 1, &zSig0, &zSig1);
+        } else {
+            zExp--;
+        }
+        shift128RightJamming(zSig0, zSig1, 64, &zSig0, &zSig1);
+        return roundAndPackFloat64(zSign, zExp, zSig1 STATUS_VAR);
+    } else {
+        /* Subtraction */
+        if (expDiff > 0) {
+            shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);
+            sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
+            zExp = pExp;
+        } else if (expDiff < 0) {
+            shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);
+            sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);
+            zExp = cExp;
+            zSign ^= 1;
+        } else {
+            zExp = pExp;
+            if (lt128(cSig0, cSig1, pSig0, pSig1)) {
+                sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
+            } else if (lt128(pSig0, pSig1, cSig0, cSig1)) {
+                sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);
+                zSign ^= 1;
+            } else {
+                /* Exact zero */
+                zSign = signflip;
+                if (STATUS(float_rounding_mode) == float_round_down) {
+                    zSign ^= 1;
+                }
+                return packFloat64(zSign, 0, 0);
+            }
+        }
+        --zExp;
+        /* Do the equivalent of normalizeRoundAndPackFloat64() but
+         * starting with the significand in a pair of uint64_t.
+         */
+        if (zSig0) {
+            shiftcount = countLeadingZeros64(zSig0) - 1;
+            shortShift128Left(zSig0, zSig1, shiftcount, &zSig0, &zSig1);
+            if (zSig1) {
+                zSig0 |= 1;
+            }
+            zExp -= shiftcount;
+        } else {
+            shiftcount = countLeadingZeros64(zSig1) - 1;
+            zSig0 = zSig1 << shiftcount;
+            zExp -= (shiftcount + 64);
+        }
+        return roundAndPackFloat64(zSign, zExp, zSig0 STATUS_VAR);
+    }
+}
+
 /*----------------------------------------------------------------------------
 | Returns the square root of the double-precision floating-point value `a'.
 | The operation is performed according to the IEC/IEEE Standard for Binary
diff --git a/fpu/softfloat.h b/fpu/softfloat.h
index 618ddee569..07c2929613 100644
--- a/fpu/softfloat.h
+++ b/fpu/softfloat.h
@@ -211,6 +211,18 @@ void set_floatx80_rounding_precision(int val STATUS_PARAM);
 *----------------------------------------------------------------------------*/
 void float_raise( int8 flags STATUS_PARAM);
 
+/*----------------------------------------------------------------------------
+| Options to indicate which negations to perform in float*_muladd()
+| Using these differs from negating an input or output before calling
+| the muladd function in that this means that a NaN doesn't have its
+| sign bit inverted before it is propagated.
+*----------------------------------------------------------------------------*/
+enum {
+    float_muladd_negate_c = 1,
+    float_muladd_negate_product = 2,
+    float_muladd_negate_result = 4,
+};
+
 /*----------------------------------------------------------------------------
 | Software IEC/IEEE integer-to-floating-point conversion routines.
 *----------------------------------------------------------------------------*/
@@ -269,6 +281,7 @@ float32 float32_sub( float32, float32 STATUS_PARAM );
 float32 float32_mul( float32, float32 STATUS_PARAM );
 float32 float32_div( float32, float32 STATUS_PARAM );
 float32 float32_rem( float32, float32 STATUS_PARAM );
+float32 float32_muladd(float32, float32, float32, int STATUS_PARAM);
 float32 float32_sqrt( float32 STATUS_PARAM );
 float32 float32_exp2( float32 STATUS_PARAM );
 float32 float32_log2( float32 STATUS_PARAM );
@@ -375,6 +388,7 @@ float64 float64_sub( float64, float64 STATUS_PARAM );
 float64 float64_mul( float64, float64 STATUS_PARAM );
 float64 float64_div( float64, float64 STATUS_PARAM );
 float64 float64_rem( float64, float64 STATUS_PARAM );
+float64 float64_muladd(float64, float64, float64, int STATUS_PARAM);
 float64 float64_sqrt( float64 STATUS_PARAM );
 float64 float64_log2( float64 STATUS_PARAM );
 int float64_eq( float64, float64 STATUS_PARAM );
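
To see what the "no intermediate rounding" in the commit message buys, here is
a small standalone C demonstration using C99 fma(), which has the same
single-rounding semantics as the float*_muladd() primitives above. The operand
values are chosen purely for illustration; build with -ffp-contract=off so the
compiler does not itself fuse the first expression, and link with -lm:

    #include <math.h>
    #include <stdio.h>

    int main(void)
    {
        /* a*b is exactly 1 + 2^-29 + 2^-60. Rounding that product to a
         * double drops the 2^-60 term, so the separately rounded
         * multiply-then-add cancels to 0. The fused operation keeps the
         * term until the single final rounding.
         */
        double a = 1.0 + 0x1p-30;
        double b = 1.0 + 0x1p-30;
        double c = -(1.0 + 0x1p-29);

        printf("mul, then add: %a\n", a * b + c);    /* 0x0p+0 */
        printf("fused muladd:  %a\n", fma(a, b, c)); /* 0x1p-60 */
        return 0;
    }

The float_muladd_negate_* flags compose with the same single rounding: for
example, float32_muladd(a, b, c, float_muladd_negate_product, &status)
computes -(a * b) + c and, per the header comment above, does so without
perturbing the sign of any NaN that propagates.
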
From da97f52cb35ccf72a4a3d745926ee5e3c263ed07 Mon Sep 17 00:00:00 2001
From: Peter Maydell
Date: Wed, 19 Oct 2011 16:14:07 +0000
Subject: [PATCH 6/8] target-arm: Implement VFPv4 fused multiply-accumulate
 insns

Implement the fused multiply-accumulate instructions (VFMA, VFMS,
VFNMA, VFNMS) which are new in VFPv4.

Signed-off-by: Peter Maydell
---
 target-arm/cpu.h       |  1 +
 target-arm/helper.c    | 14 ++++++++
 target-arm/helper.h    |  3 ++
 target-arm/translate.c | 72 ++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 90 insertions(+)

diff --git a/target-arm/cpu.h b/target-arm/cpu.h
index af3904d2bb..c4d742f084 100644
--- a/target-arm/cpu.h
+++ b/target-arm/cpu.h
@@ -376,6 +376,7 @@ enum arm_features {
     ARM_FEATURE_STRONGARM,
     ARM_FEATURE_VAPA, /* cp15 VA to PA lookups */
     ARM_FEATURE_ARM_DIV, /* divide supported in ARM encoding */
+    ARM_FEATURE_VFP4, /* VFPv4 (implies that NEON is v2) */
 };
 
 static inline int arm_feature(CPUARMState *env, int feature)
diff --git a/target-arm/helper.c b/target-arm/helper.c
index 3a51fd73cf..eddb92307c 100644
--- a/target-arm/helper.c
+++ b/target-arm/helper.c
@@ -204,6 +204,7 @@ static void cpu_reset_model_id(CPUARMState *env, uint32_t id)
         set_feature(env, ARM_FEATURE_THUMB2);
         set_feature(env, ARM_FEATURE_VFP);
         set_feature(env, ARM_FEATURE_VFP3);
+        set_feature(env, ARM_FEATURE_VFP4);
         set_feature(env, ARM_FEATURE_VFP_FP16);
         set_feature(env, ARM_FEATURE_NEON);
         set_feature(env, ARM_FEATURE_THUMB2EE);
@@ -3084,6 +3085,19 @@ uint32_t HELPER(rsqrte_u32)(uint32_t a, CPUState *env)
     return 0x80000000 | ((float64_val(f64) >> 21) & 0x7fffffff);
 }
 
+/* VFPv4 fused multiply-accumulate */
+float32 VFP_HELPER(muladd, s)(float32 a, float32 b, float32 c, void *fpstp)
+{
+    float_status *fpst = fpstp;
+    return float32_muladd(a, b, c, 0, fpst);
+}
+
+float64 VFP_HELPER(muladd, d)(float64 a, float64 b, float64 c, void *fpstp)
+{
+    float_status *fpst = fpstp;
+    return float64_muladd(a, b, c, 0, fpst);
+}
+
 void HELPER(set_teecr)(CPUState *env, uint32_t val)
 {
     val &= 1;
diff --git a/target-arm/helper.h b/target-arm/helper.h
index 3ad1cb0881..16dd5fcc89 100644
--- a/target-arm/helper.h
+++ b/target-arm/helper.h
@@ -132,6 +132,9 @@ DEF_HELPER_2(vfp_fcvt_f32_to_f16, i32, f32, env)
 DEF_HELPER_2(neon_fcvt_f16_to_f32, f32, i32, env)
 DEF_HELPER_2(neon_fcvt_f32_to_f16, i32, f32, env)
 
+DEF_HELPER_4(vfp_muladdd, f64, f64, f64, f64, ptr)
+DEF_HELPER_4(vfp_muladds, f32, f32, f32, f32, ptr)
+
 DEF_HELPER_3(recps_f32, f32, f32, f32, env)
 DEF_HELPER_3(rsqrts_f32, f32, f32, f32, env)
 DEF_HELPER_2(recpe_f32, f32, f32, env)
diff --git a/target-arm/translate.c b/target-arm/translate.c
index 812a9e7be4..0f35b60946 100644
--- a/target-arm/translate.c
+++ b/target-arm/translate.c
@@ -3141,6 +3141,57 @@ static int disas_vfp_insn(CPUState * env, DisasContext *s, uint32_t insn)
             case 8: /* div: fn / fm */
                 gen_vfp_div(dp);
                 break;
+            case 10: /* VFNMA : fd = muladd(-fd,  fn, fm) */
+            case 11: /* VFNMS : fd = muladd(-fd, -fn, fm) */
+            case 12: /* VFMA  : fd = muladd( fd,  fn, fm) */
+            case 13: /* VFMS  : fd = muladd( fd, -fn, fm) */
+                /* These are fused multiply-add, and must be done as one
+                 * floating point operation with no rounding between the
+                 * multiplication and addition steps.
+                 * NB that doing the negations here as separate steps is
+                 * correct : an input NaN should come out with its sign bit
+                 * flipped if it is a negated-input.
+                 */
+                if (!arm_feature(env, ARM_FEATURE_VFP4)) {
+                    return 1;
+                }
+                if (dp) {
+                    TCGv_ptr fpst;
+                    TCGv_i64 frd;
+                    if (op & 1) {
+                        /* VFNMS, VFMS */
+                        gen_helper_vfp_negd(cpu_F0d, cpu_F0d);
+                    }
+                    frd = tcg_temp_new_i64();
+                    tcg_gen_ld_f64(frd, cpu_env, vfp_reg_offset(dp, rd));
+                    if (op & 2) {
+                        /* VFNMA, VFNMS */
+                        gen_helper_vfp_negd(frd, frd);
+                    }
+                    fpst = get_fpstatus_ptr(0);
+                    gen_helper_vfp_muladdd(cpu_F0d, cpu_F0d,
+                                           cpu_F1d, frd, fpst);
+                    tcg_temp_free_ptr(fpst);
+                    tcg_temp_free_i64(frd);
+                } else {
+                    TCGv_ptr fpst;
+                    TCGv_i32 frd;
+                    if (op & 1) {
+                        /* VFNMS, VFMS */
+                        gen_helper_vfp_negs(cpu_F0s, cpu_F0s);
+                    }
+                    frd = tcg_temp_new_i32();
+                    tcg_gen_ld_f32(frd, cpu_env, vfp_reg_offset(dp, rd));
+                    if (op & 2) {
+                        gen_helper_vfp_negs(frd, frd);
+                    }
+                    fpst = get_fpstatus_ptr(0);
+                    gen_helper_vfp_muladds(cpu_F0s, cpu_F0s,
+                                           cpu_F1s, frd, fpst);
+                    tcg_temp_free_ptr(fpst);
+                    tcg_temp_free_i32(frd);
+                }
+                break;
             case 14: /* fconst */
                 if (!arm_feature(env, ARM_FEATURE_VFP3))
                     return 1;
@@ -4417,6 +4468,7 @@ static void gen_neon_narrow_op(int op, int u, int size, TCGv dest, TCGv_i64 src)
 #define NEON_3R_VPMIN 21
 #define NEON_3R_VQDMULH_VQRDMULH 22
 #define NEON_3R_VPADD 23
+#define NEON_3R_VFM 25 /* VFMA, VFMS : float fused multiply-add */
 #define NEON_3R_FLOAT_ARITH 26 /* float VADD, VSUB, VPADD, VABD */
 #define NEON_3R_FLOAT_MULTIPLY 27 /* float VMLA, VMLS, VMUL */
 #define NEON_3R_FLOAT_CMP 28 /* float VCEQ, VCGE, VCGT */
@@ -4449,6 +4501,7 @@ static const uint8_t neon_3r_sizes[] = {
     [NEON_3R_VPMIN] = 0x7,
     [NEON_3R_VQDMULH_VQRDMULH] = 0x6,
     [NEON_3R_VPADD] = 0x7,
+    [NEON_3R_VFM] = 0x5, /* size bit 1 encodes op */
     [NEON_3R_FLOAT_ARITH] = 0x5, /* size bit 1 encodes op */
     [NEON_3R_FLOAT_MULTIPLY] = 0x5, /* size bit 1 encodes op */
     [NEON_3R_FLOAT_CMP] = 0x5, /* size bit 1 encodes op */
@@ -4726,6 +4779,11 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn)
             return 1;
         }
         break;
+    case NEON_3R_VFM:
+        if (!arm_feature(env, ARM_FEATURE_VFP4) || u) {
+            return 1;
+        }
+        break;
     default:
         break;
     }
@@ -5006,6 +5064,20 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn)
             else
                 gen_helper_rsqrts_f32(tmp, tmp, tmp2, cpu_env);
             break;
+        case NEON_3R_VFM:
+        {
+            /* VFMA, VFMS: fused multiply-add */
+            TCGv_ptr fpstatus = get_fpstatus_ptr(1);
+            TCGv_i32 tmp3 = neon_load_reg(rd, pass);
+            if (size) {
+                /* VFMS */
+                gen_helper_vfp_negs(tmp, tmp);
+            }
+            gen_helper_vfp_muladds(tmp, tmp, tmp2, tmp3, fpstatus);
+            tcg_temp_free_i32(tmp3);
+            tcg_temp_free_ptr(fpstatus);
+            break;
+        }
         default:
            abort();
        }
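
Note how this patch interacts with the float_muladd_negate_* flags from
patch 5: those flags deliberately leave the sign of a propagated NaN alone,
while the VFMS/VFNMA/VFNMS semantics require a negated input that happens to
be a NaN to come out with its sign bit flipped. That appears to be why the
translator negates fn and fd explicitly with gen_helper_vfp_negs/negd and then
calls the muladd helpers with flags == 0, rather than passing negation flags
down to float*_muladd().
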
From 1518025641c6c067518c871be7f72a524dc68711 Mon Sep 17 00:00:00 2001
From: Dmitry Koshelev
Date: Wed, 19 Oct 2011 16:14:07 +0000
Subject: [PATCH 7/8] target-arm/machine.c: Restore VFP registers correctly

Fix the restoring of VFP registers on vmload. The VFP3-only loop must
read back d16..d31 (the bank the corresponding save code writes out);
the old code re-read d0..d15, clobbering them and leaving d16..d31
unrestored.

Signed-off-by: Dmitry Koshelev
Reviewed-by: Juan Quintela
[peter.maydell: improved commit message a little]
Signed-off-by: Peter Maydell
---
 target-arm/machine.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/target-arm/machine.c b/target-arm/machine.c
index 7d4fc545a6..aaee9b9c11 100644
--- a/target-arm/machine.c
+++ b/target-arm/machine.c
@@ -189,7 +189,7 @@ int cpu_load(QEMUFile *f, void *opaque, int version_id)
     env->vfp.vec_stride = qemu_get_be32(f);
 
     if (arm_feature(env, ARM_FEATURE_VFP3)) {
-        for (i = 0; i < 16; i++) {
+        for (i = 16; i < 32; i++) {
             CPU_DoubleU u;
             u.l.upper = qemu_get_be32(f);
             u.l.lower = qemu_get_be32(f);

From 12b1de3a3f026a2e27993b60749122b02509533c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andreas=20F=C3=A4rber?=
Date: Mon, 10 Oct 2011 02:52:28 +0000
Subject: [PATCH 8/8] target-arm: Fix use of free() in cpu_arm_close()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

env is allocated in cpu_arm_init() with g_malloc0(), so free with
g_free().

Signed-off-by: Andreas Färber
Signed-off-by: Peter Maydell
---
 target-arm/helper.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/target-arm/helper.c b/target-arm/helper.c
index eddb92307c..97af4d0bba 100644
--- a/target-arm/helper.c
+++ b/target-arm/helper.c
@@ -475,7 +475,7 @@ static uint32_t cpu_arm_find_by_name(const char *name)
 
 void cpu_arm_close(CPUARMState *env)
 {
-    free(env);
+    g_free(env);
 }
 
 uint32_t cpsr_read(CPUARMState *env)