From 0d4bcac3cac461798d810e6df54768d9613ea794 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 19 Oct 2022 14:32:04 +0200 Subject: [PATCH 1/4] target/i386: decode-new: avoid out-of-bounds access to xmm_regs[-1] MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If the destination is a memory register, op->n is -1. Going through tcg_gen_gvec_dup_imm path is both useless (the value has been stored by the gen_* function already) and wrong because of the out-of-bounds access. Reviewed-by: Philippe Mathieu-Daudé Reviewed-by: Richard Henderson Signed-off-by: Paolo Bonzini --- target/i386/tcg/emit.c.inc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/target/i386/tcg/emit.c.inc b/target/i386/tcg/emit.c.inc index 27eca591a9..ebf299451d 100644 --- a/target/i386/tcg/emit.c.inc +++ b/target/i386/tcg/emit.c.inc @@ -296,7 +296,7 @@ static void gen_writeback(DisasContext *s, X86DecodedInsn *decode, int opn, TCGv case X86_OP_MMX: break; case X86_OP_SSE: - if ((s->prefix & PREFIX_VEX) && op->ot == MO_128) { + if (!op->has_ea && (s->prefix & PREFIX_VEX) && op->ot == MO_128) { tcg_gen_gvec_dup_imm(MO_64, offsetof(CPUX86State, xmm_regs[op->n].ZMM_X(1)), 16, 16, 0); From 314d3eff66f41f39191aaca2e5f6e3dc81480c1b Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 19 Oct 2022 14:01:36 +0200 Subject: [PATCH 2/4] target/i386: introduce function to set rounding mode from FPCW or MXCSR bits MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit VROUND, FSTCW and STMXCSR all have to perform the same conversion from x86 rounding modes to softfloat constants. Since the ISA is consistent on the meaning of the two-bit rounding modes, extract the common code into a wrapper for set_float_rounding_mode. Reviewed-by: Philippe Mathieu-Daudé Reviewed-by: Richard Henderson Signed-off-by: Paolo Bonzini --- target/i386/ops_sse.h | 60 +++--------------------------------- target/i386/tcg/fpu_helper.c | 60 +++++++++++++----------------------- 2 files changed, 25 insertions(+), 95 deletions(-) diff --git a/target/i386/ops_sse.h b/target/i386/ops_sse.h index d35fc15c65..0799712f6e 100644 --- a/target/i386/ops_sse.h +++ b/target/i386/ops_sse.h @@ -1684,20 +1684,7 @@ void glue(helper_roundps, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, prev_rounding_mode = env->sse_status.float_rounding_mode; if (!(mode & (1 << 2))) { - switch (mode & 3) { - case 0: - set_float_rounding_mode(float_round_nearest_even, &env->sse_status); - break; - case 1: - set_float_rounding_mode(float_round_down, &env->sse_status); - break; - case 2: - set_float_rounding_mode(float_round_up, &env->sse_status); - break; - case 3: - set_float_rounding_mode(float_round_to_zero, &env->sse_status); - break; - } + set_x86_rounding_mode(mode & 3, &env->sse_status); } for (i = 0; i < 2 << SHIFT; i++) { @@ -1721,20 +1708,7 @@ void glue(helper_roundpd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, prev_rounding_mode = env->sse_status.float_rounding_mode; if (!(mode & (1 << 2))) { - switch (mode & 3) { - case 0: - set_float_rounding_mode(float_round_nearest_even, &env->sse_status); - break; - case 1: - set_float_rounding_mode(float_round_down, &env->sse_status); - break; - case 2: - set_float_rounding_mode(float_round_up, &env->sse_status); - break; - case 3: - set_float_rounding_mode(float_round_to_zero, &env->sse_status); - break; - } + set_x86_rounding_mode(mode & 3, &env->sse_status); } for (i = 0; i < 1 << SHIFT; i++) { @@ -1759,20 +1733,7 @@ void glue(helper_roundss, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s, prev_rounding_mode = env->sse_status.float_rounding_mode; if (!(mode & (1 << 2))) { - switch (mode & 3) { - case 0: - set_float_rounding_mode(float_round_nearest_even, &env->sse_status); - break; - case 1: - set_float_rounding_mode(float_round_down, &env->sse_status); - break; - case 2: - set_float_rounding_mode(float_round_up, &env->sse_status); - break; - case 3: - set_float_rounding_mode(float_round_to_zero, &env->sse_status); - break; - } + set_x86_rounding_mode(mode & 3, &env->sse_status); } d->ZMM_S(0) = float32_round_to_int(s->ZMM_S(0), &env->sse_status); @@ -1797,20 +1758,7 @@ void glue(helper_roundsd, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s, prev_rounding_mode = env->sse_status.float_rounding_mode; if (!(mode & (1 << 2))) { - switch (mode & 3) { - case 0: - set_float_rounding_mode(float_round_nearest_even, &env->sse_status); - break; - case 1: - set_float_rounding_mode(float_round_down, &env->sse_status); - break; - case 2: - set_float_rounding_mode(float_round_up, &env->sse_status); - break; - case 3: - set_float_rounding_mode(float_round_to_zero, &env->sse_status); - break; - } + set_x86_rounding_mode(mode & 3, &env->sse_status); } d->ZMM_D(0) = float64_round_to_int(s->ZMM_D(0), &env->sse_status); diff --git a/target/i386/tcg/fpu_helper.c b/target/i386/tcg/fpu_helper.c index a6a90a1817..6f3741b635 100644 --- a/target/i386/tcg/fpu_helper.c +++ b/target/i386/tcg/fpu_helper.c @@ -32,7 +32,8 @@ #define ST(n) (env->fpregs[(env->fpstt + (n)) & 7].d) #define ST1 ST(1) -#define FPU_RC_MASK 0xc00 +#define FPU_RC_SHIFT 10 +#define FPU_RC_MASK (3 << FPU_RC_SHIFT) #define FPU_RC_NEAR 0x000 #define FPU_RC_DOWN 0x400 #define FPU_RC_UP 0x800 @@ -685,28 +686,26 @@ uint32_t helper_fnstcw(CPUX86State *env) return env->fpuc; } +static void set_x86_rounding_mode(unsigned mode, float_status *status) +{ + static FloatRoundMode x86_round_mode[4] = { + float_round_nearest_even, + float_round_down, + float_round_up, + float_round_to_zero + }; + assert(mode < ARRAY_SIZE(x86_round_mode)); + set_float_rounding_mode(x86_round_mode[mode], status); +} + void update_fp_status(CPUX86State *env) { - FloatRoundMode rnd_mode; + int rnd_mode; FloatX80RoundPrec rnd_prec; /* set rounding mode */ - switch (env->fpuc & FPU_RC_MASK) { - default: - case FPU_RC_NEAR: - rnd_mode = float_round_nearest_even; - break; - case FPU_RC_DOWN: - rnd_mode = float_round_down; - break; - case FPU_RC_UP: - rnd_mode = float_round_up; - break; - case FPU_RC_CHOP: - rnd_mode = float_round_to_zero; - break; - } - set_float_rounding_mode(rnd_mode, &env->fp_status); + rnd_mode = (env->fpuc & FPU_RC_MASK) >> FPU_RC_SHIFT; + set_x86_rounding_mode(rnd_mode, &env->fp_status); switch ((env->fpuc >> 8) & 3) { case 0: @@ -3038,11 +3037,8 @@ void helper_xsetbv(CPUX86State *env, uint32_t ecx, uint64_t mask) /* XXX: optimize by storing fptt and fptags in the static cpu state */ #define SSE_DAZ 0x0040 -#define SSE_RC_MASK 0x6000 -#define SSE_RC_NEAR 0x0000 -#define SSE_RC_DOWN 0x2000 -#define SSE_RC_UP 0x4000 -#define SSE_RC_CHOP 0x6000 +#define SSE_RC_SHIFT 13 +#define SSE_RC_MASK (3 << SSE_RC_SHIFT) #define SSE_FZ 0x8000 void update_mxcsr_status(CPUX86State *env) @@ -3051,22 +3047,8 @@ void update_mxcsr_status(CPUX86State *env) int rnd_type; /* set rounding mode */ - switch (mxcsr & SSE_RC_MASK) { - default: - case SSE_RC_NEAR: - rnd_type = float_round_nearest_even; - break; - case SSE_RC_DOWN: - rnd_type = float_round_down; - break; - case SSE_RC_UP: - rnd_type = float_round_up; - break; - case SSE_RC_CHOP: - rnd_type = float_round_to_zero; - break; - } - set_float_rounding_mode(rnd_type, &env->sse_status); + rnd_type = (mxcsr & SSE_RC_MASK) >> SSE_RC_SHIFT; + set_x86_rounding_mode(rnd_type, &env->sse_status); /* Set exception flags. */ set_float_exception_flags((mxcsr & FPUS_IE ? float_flag_invalid : 0) | From cf5ec6641ed456e2748b211b7bbf5103bfc93098 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 19 Oct 2022 13:22:06 +0200 Subject: [PATCH 3/4] target/i386: implement F16C instructions F16C only consists of two instructions, which are a bit peculiar nevertheless. First, they access only the low half of an YMM or XMM register for the packed-half operand; the exact size still depends on the VEX.L flag. This is similar to the existing avx_movx flag, but not exactly because avx_movx is hardcoded to affect operand 2. To this end I added a "ph" format name; it's possible to reuse this approach for the VPMOVSX and VPMOVZX instructions, though that would also require adding two more formats for the low-quarter and low-eighth of an operand. Second, VCVTPS2PH is somewhat weird because it *stores* the result of the instruction into memory rather than loading it. Reviewed-by: Richard Henderson Signed-off-by: Paolo Bonzini --- target/i386/cpu.c | 5 ++--- target/i386/cpu.h | 3 +++ target/i386/ops_sse.h | 29 +++++++++++++++++++++++++++++ target/i386/ops_sse_header.h | 6 ++++++ target/i386/tcg/decode-new.c.inc | 8 ++++++++ target/i386/tcg/decode-new.h | 2 ++ target/i386/tcg/emit.c.inc | 17 ++++++++++++++++- tests/tcg/i386/test-avx.c | 17 +++++++++++++++++ tests/tcg/i386/test-avx.py | 8 ++++++-- 9 files changed, 89 insertions(+), 6 deletions(-) diff --git a/target/i386/cpu.c b/target/i386/cpu.c index 0ebd610faa..6292b7e12f 100644 --- a/target/i386/cpu.c +++ b/target/i386/cpu.c @@ -625,13 +625,12 @@ void x86_cpu_vendor_words2str(char *dst, uint32_t vendor1, CPUID_EXT_SSE41 | CPUID_EXT_SSE42 | CPUID_EXT_POPCNT | \ CPUID_EXT_XSAVE | /* CPUID_EXT_OSXSAVE is dynamic */ \ CPUID_EXT_MOVBE | CPUID_EXT_AES | CPUID_EXT_HYPERVISOR | \ - CPUID_EXT_RDRAND | CPUID_EXT_AVX) + CPUID_EXT_RDRAND | CPUID_EXT_AVX | CPUID_EXT_F16C) /* missing: CPUID_EXT_DTES64, CPUID_EXT_DSCPL, CPUID_EXT_VMX, CPUID_EXT_SMX, CPUID_EXT_EST, CPUID_EXT_TM2, CPUID_EXT_CID, CPUID_EXT_FMA, CPUID_EXT_XTPR, CPUID_EXT_PDCM, CPUID_EXT_PCID, CPUID_EXT_DCA, - CPUID_EXT_X2APIC, CPUID_EXT_TSC_DEADLINE_TIMER, - CPUID_EXT_F16C */ + CPUID_EXT_X2APIC, CPUID_EXT_TSC_DEADLINE_TIMER */ #ifdef TARGET_X86_64 #define TCG_EXT2_X86_64_FEATURES (CPUID_EXT2_SYSCALL | CPUID_EXT2_LM) diff --git a/target/i386/cpu.h b/target/i386/cpu.h index dad2b2db8d..d4bc19577a 100644 --- a/target/i386/cpu.h +++ b/target/i386/cpu.h @@ -1258,6 +1258,7 @@ typedef union ZMMReg { uint16_t _w_ZMMReg[512 / 16]; uint32_t _l_ZMMReg[512 / 32]; uint64_t _q_ZMMReg[512 / 64]; + float16 _h_ZMMReg[512 / 16]; float32 _s_ZMMReg[512 / 32]; float64 _d_ZMMReg[512 / 64]; XMMReg _x_ZMMReg[512 / 128]; @@ -1282,6 +1283,7 @@ typedef struct BNDCSReg { #define ZMM_B(n) _b_ZMMReg[63 - (n)] #define ZMM_W(n) _w_ZMMReg[31 - (n)] #define ZMM_L(n) _l_ZMMReg[15 - (n)] +#define ZMM_H(n) _h_ZMMReg[31 - (n)] #define ZMM_S(n) _s_ZMMReg[15 - (n)] #define ZMM_Q(n) _q_ZMMReg[7 - (n)] #define ZMM_D(n) _d_ZMMReg[7 - (n)] @@ -1301,6 +1303,7 @@ typedef struct BNDCSReg { #define ZMM_B(n) _b_ZMMReg[n] #define ZMM_W(n) _w_ZMMReg[n] #define ZMM_L(n) _l_ZMMReg[n] +#define ZMM_H(n) _h_ZMMReg[n] #define ZMM_S(n) _s_ZMMReg[n] #define ZMM_Q(n) _q_ZMMReg[n] #define ZMM_D(n) _d_ZMMReg[n] diff --git a/target/i386/ops_sse.h b/target/i386/ops_sse.h index 0799712f6e..33c61896ee 100644 --- a/target/i386/ops_sse.h +++ b/target/i386/ops_sse.h @@ -586,6 +586,35 @@ void glue(helper_cvtpd2ps, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) } } +#if SHIFT >= 1 +void glue(helper_cvtph2ps, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) +{ + int i; + + for (i = 2 << SHIFT; --i >= 0; ) { + d->ZMM_S(i) = float16_to_float32(s->ZMM_H(i), true, &env->sse_status); + } +} + +void glue(helper_cvtps2ph, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, int mode) +{ + int i; + FloatRoundMode prev_rounding_mode = env->sse_status.float_rounding_mode; + if (!(mode & (1 << 2))) { + set_x86_rounding_mode(mode & 3, &env->sse_status); + } + + for (i = 0; i < 2 << SHIFT; i++) { + d->ZMM_H(i) = float32_to_float16(s->ZMM_S(i), true, &env->sse_status); + } + for (i >>= 2; i < 1 << SHIFT; i++) { + d->Q(i) = 0; + } + + env->sse_status.float_rounding_mode = prev_rounding_mode; +} +#endif + #if SHIFT == 1 void helper_cvtss2sd(CPUX86State *env, Reg *d, Reg *v, Reg *s) { diff --git a/target/i386/ops_sse_header.h b/target/i386/ops_sse_header.h index 2f1f811f9f..c4c41976c0 100644 --- a/target/i386/ops_sse_header.h +++ b/target/i386/ops_sse_header.h @@ -353,6 +353,12 @@ DEF_HELPER_4(glue(aeskeygenassist, SUFFIX), void, env, Reg, Reg, i32) DEF_HELPER_5(glue(pclmulqdq, SUFFIX), void, env, Reg, Reg, Reg, i32) #endif +/* F16C helpers */ +#if SHIFT >= 1 +DEF_HELPER_3(glue(cvtph2ps, SUFFIX), void, env, Reg, Reg) +DEF_HELPER_4(glue(cvtps2ph, SUFFIX), void, env, Reg, Reg, int) +#endif + /* AVX helpers */ #if SHIFT >= 1 DEF_HELPER_4(glue(vpermilpd, SUFFIX), void, env, Reg, Reg, Reg) diff --git a/target/i386/tcg/decode-new.c.inc b/target/i386/tcg/decode-new.c.inc index 8e1eb9db42..8baee9018a 100644 --- a/target/i386/tcg/decode-new.c.inc +++ b/target/i386/tcg/decode-new.c.inc @@ -336,6 +336,7 @@ static const X86OpEntry opcodes_0F38_00toEF[240] = { [0x07] = X86_OP_ENTRY3(PHSUBSW, V,x, H,x, W,x, vex4 cpuid(SSSE3) mmx avx2_256 p_00_66), [0x10] = X86_OP_ENTRY2(PBLENDVB, V,x, W,x, vex4 cpuid(SSE41) avx2_256 p_66), + [0x13] = X86_OP_ENTRY2(VCVTPH2PS, V,x, W,ph, vex11 cpuid(F16C) p_66), [0x14] = X86_OP_ENTRY2(BLENDVPS, V,x, W,x, vex4 cpuid(SSE41) p_66), [0x15] = X86_OP_ENTRY2(BLENDVPD, V,x, W,x, vex4 cpuid(SSE41) p_66), /* Listed incorrectly as type 4 */ @@ -525,6 +526,7 @@ static const X86OpEntry opcodes_0F3A[256] = { [0x15] = X86_OP_ENTRY3(PEXTRW, E,w, V,dq, I,b, vex5 cpuid(SSE41) zext0 p_66), [0x16] = X86_OP_ENTRY3(PEXTR, E,y, V,dq, I,b, vex5 cpuid(SSE41) p_66), [0x17] = X86_OP_ENTRY3(VEXTRACTPS, E,d, V,dq, I,b, vex5 cpuid(SSE41) p_66), + [0x1d] = X86_OP_ENTRY3(VCVTPS2PH, W,ph, V,x, I,b, vex11 cpuid(F16C) p_66), [0x20] = X86_OP_ENTRY4(PINSRB, V,dq, H,dq, E,b, vex5 cpuid(SSE41) zext2 p_66), [0x21] = X86_OP_GROUP0(VINSERTPS), @@ -1051,6 +1053,10 @@ static bool decode_op_size(DisasContext *s, X86OpEntry *e, X86OpSize size, MemOp *ot = s->vex_l ? MO_256 : MO_128; return true; + case X86_SIZE_ph: /* SSE/AVX packed half precision */ + *ot = s->vex_l ? MO_128 : MO_64; + return true; + case X86_SIZE_d64: /* Default to 64-bit in 64-bit mode */ *ot = CODE64(s) && s->dflag == MO_32 ? MO_64 : s->dflag; return true; @@ -1342,6 +1348,8 @@ static bool has_cpuid_feature(DisasContext *s, X86CPUIDFeature cpuid) switch (cpuid) { case X86_FEAT_None: return true; + case X86_FEAT_F16C: + return (s->cpuid_ext_features & CPUID_EXT_F16C); case X86_FEAT_MOVBE: return (s->cpuid_ext_features & CPUID_EXT_MOVBE); case X86_FEAT_PCLMULQDQ: diff --git a/target/i386/tcg/decode-new.h b/target/i386/tcg/decode-new.h index f159c26850..0ef54628ee 100644 --- a/target/i386/tcg/decode-new.h +++ b/target/i386/tcg/decode-new.h @@ -92,6 +92,7 @@ typedef enum X86OpSize { /* Custom */ X86_SIZE_d64, X86_SIZE_f64, + X86_SIZE_ph, /* SSE/AVX packed half precision */ } X86OpSize; typedef enum X86CPUIDFeature { @@ -103,6 +104,7 @@ typedef enum X86CPUIDFeature { X86_FEAT_AVX2, X86_FEAT_BMI1, X86_FEAT_BMI2, + X86_FEAT_F16C, X86_FEAT_MOVBE, X86_FEAT_PCLMULQDQ, X86_FEAT_SSE, diff --git a/target/i386/tcg/emit.c.inc b/target/i386/tcg/emit.c.inc index ebf299451d..9334f0939d 100644 --- a/target/i386/tcg/emit.c.inc +++ b/target/i386/tcg/emit.c.inc @@ -296,7 +296,7 @@ static void gen_writeback(DisasContext *s, X86DecodedInsn *decode, int opn, TCGv case X86_OP_MMX: break; case X86_OP_SSE: - if (!op->has_ea && (s->prefix & PREFIX_VEX) && op->ot == MO_128) { + if (!op->has_ea && (s->prefix & PREFIX_VEX) && op->ot <= MO_128) { tcg_gen_gvec_dup_imm(MO_64, offsetof(CPUX86State, xmm_regs[op->n].ZMM_X(1)), 16, 16, 0); @@ -852,6 +852,7 @@ UNARY_INT_SSE(VCVTTPD2DQ, cvttpd2dq) UNARY_INT_SSE(VCVTDQ2PS, cvtdq2ps) UNARY_INT_SSE(VCVTPS2DQ, cvtps2dq) UNARY_INT_SSE(VCVTTPS2DQ, cvttps2dq) +UNARY_INT_SSE(VCVTPH2PS, cvtph2ps) static inline void gen_unary_imm_sse(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode, @@ -1868,6 +1869,20 @@ static void gen_VCVTfp2fp(DisasContext *s, CPUX86State *env, X86DecodedInsn *dec gen_helper_cvtsd2ss, gen_helper_cvtss2sd); } +static void gen_VCVTPS2PH(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) +{ + gen_unary_imm_fp_sse(s, env, decode, + gen_helper_cvtps2ph_xmm, + gen_helper_cvtps2ph_ymm); + /* + * VCVTPS2PH is the only instruction that performs an operation on a + * register source and then *stores* into memory. + */ + if (decode->op[0].has_ea) { + gen_store_sse(s, decode, decode->op[0].offset); + } +} + static void gen_VCVTSI2Sx(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) { int vec_len = vector_len(s, decode); diff --git a/tests/tcg/i386/test-avx.c b/tests/tcg/i386/test-avx.c index 953e2906fe..c39c0e5bce 100644 --- a/tests/tcg/i386/test-avx.c +++ b/tests/tcg/i386/test-avx.c @@ -28,6 +28,7 @@ typedef struct { } TestDef; reg_state initI; +reg_state initF16; reg_state initF32; reg_state initF64; @@ -221,6 +222,7 @@ static void run_all(void) #define ARRAY_LEN(x) (sizeof(x) / sizeof(x[0])) +uint16_t val_f16[] = { 0x4000, 0xbc00, 0x44cd, 0x3a66, 0x4200, 0x7a1a, 0x4780, 0x4826 }; float val_f32[] = {2.0, -1.0, 4.8, 0.8, 3, -42.0, 5e6, 7.5, 8.3}; double val_f64[] = {2.0, -1.0, 4.8, 0.8, 3, -42.0, 5e6, 7.5}; v4di val_i64[] = { @@ -241,6 +243,12 @@ v4di indexd = {0x00000002000000efull, 0xfffffff500000010ull, v4di gather_mem[0x20]; +void init_f16reg(v4di *r) +{ + memset(r, 0, sizeof(*r)); + memcpy(r, val_f16, sizeof(val_f16)); +} + void init_f32reg(v4di *r) { static int n; @@ -315,6 +323,15 @@ int main(int argc, char *argv[]) printf("Int:\n"); dump_regs(&initI); + init_all(&initF16); + init_f16reg(&initF16.ymm[10]); + init_f16reg(&initF16.ymm[11]); + init_f16reg(&initF16.ymm[12]); + init_f16reg(&initF16.mem0[1]); + initF16.ff = 16; + printf("F16:\n"); + dump_regs(&initF16); + init_all(&initF32); init_f32reg(&initF32.ymm[10]); init_f32reg(&initF32.ymm[11]); diff --git a/tests/tcg/i386/test-avx.py b/tests/tcg/i386/test-avx.py index 02982329f1..ebb1d99c5e 100755 --- a/tests/tcg/i386/test-avx.py +++ b/tests/tcg/i386/test-avx.py @@ -9,6 +9,7 @@ from fnmatch import fnmatch archs = [ "SSE", "SSE2", "SSE3", "SSSE3", "SSE4_1", "SSE4_2", "AES", "AVX", "AVX2", "AES+AVX", "VAES+AVX", + "F16C", ] ignore = set(["FISTTP", @@ -19,6 +20,7 @@ imask = { 'vBLENDPS': 0x0f, 'CMP[PS][SD]': 0x07, 'VCMP[PS][SD]': 0x1f, + 'vCVTPS2PH': 0x7, 'vDPPD': 0x33, 'vDPPS': 0xff, 'vEXTRACTPS': 0x03, @@ -221,8 +223,10 @@ def ArgGenerator(arg, op): class InsnGenerator: def __init__(self, op, args): self.op = op - if op[-2:] in ["PS", "PD", "SS", "SD"]: - if op[-1] == 'S': + if op[-2:] in ["PH", "PS", "PD", "SS", "SD"]: + if op[-1] == 'H': + self.optype = 'F16' + elif op[-1] == 'S': self.optype = 'F32' else: self.optype = 'F64' From 2872b0f390c3fbd8f19f6b82da3dca15fa820118 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 19 Oct 2022 13:22:06 +0200 Subject: [PATCH 4/4] target/i386: implement FMA instructions The only issue with FMA instructions is that there are _a lot_ of them (30 opcodes, each of which comes in up to 4 versions depending on VEX.W and VEX.L; a total of 96 possibilities). However, they can be implement with only 6 helpers, two for scalar operations and four for packed operations. (Scalar versions do not do any merging; they only affect the bottom 32 or 64 bits of the output operand. Therefore, there is no separate XMM and YMM of the scalar helpers). First, we can reduce the number of helpers to one third by passing four operands (one output and three inputs); the reordering of which operands go to the multiply and which go to the add is done in emit.c. Second, the different instructions also dispatch to the same softfloat function, so the flags for float32_muladd and float64_muladd are passed in the helper as int arguments, with a little extra complication to handle FMADDSUB and FMSUBADD. Reviewed-by: Richard Henderson Signed-off-by: Paolo Bonzini --- target/i386/cpu.c | 5 ++-- target/i386/ops_sse.h | 27 +++++++++++++++++ target/i386/ops_sse_header.h | 11 +++++++ target/i386/tcg/decode-new.c.inc | 40 +++++++++++++++++++++++++ target/i386/tcg/decode-new.h | 1 + target/i386/tcg/emit.c.inc | 51 ++++++++++++++++++++++++++++++++ target/i386/tcg/translate.c | 1 + tests/tcg/i386/test-avx.py | 2 +- 8 files changed, 135 insertions(+), 3 deletions(-) diff --git a/target/i386/cpu.c b/target/i386/cpu.c index 6292b7e12f..22b681ca37 100644 --- a/target/i386/cpu.c +++ b/target/i386/cpu.c @@ -625,10 +625,11 @@ void x86_cpu_vendor_words2str(char *dst, uint32_t vendor1, CPUID_EXT_SSE41 | CPUID_EXT_SSE42 | CPUID_EXT_POPCNT | \ CPUID_EXT_XSAVE | /* CPUID_EXT_OSXSAVE is dynamic */ \ CPUID_EXT_MOVBE | CPUID_EXT_AES | CPUID_EXT_HYPERVISOR | \ - CPUID_EXT_RDRAND | CPUID_EXT_AVX | CPUID_EXT_F16C) + CPUID_EXT_RDRAND | CPUID_EXT_AVX | CPUID_EXT_F16C | \ + CPUID_EXT_FMA) /* missing: CPUID_EXT_DTES64, CPUID_EXT_DSCPL, CPUID_EXT_VMX, CPUID_EXT_SMX, - CPUID_EXT_EST, CPUID_EXT_TM2, CPUID_EXT_CID, CPUID_EXT_FMA, + CPUID_EXT_EST, CPUID_EXT_TM2, CPUID_EXT_CID, CPUID_EXT_XTPR, CPUID_EXT_PDCM, CPUID_EXT_PCID, CPUID_EXT_DCA, CPUID_EXT_X2APIC, CPUID_EXT_TSC_DEADLINE_TIMER */ diff --git a/target/i386/ops_sse.h b/target/i386/ops_sse.h index 33c61896ee..3cbc36a59d 100644 --- a/target/i386/ops_sse.h +++ b/target/i386/ops_sse.h @@ -2522,6 +2522,33 @@ void helper_vpermd_ymm(Reg *d, Reg *v, Reg *s) } #endif +/* FMA3 op helpers */ +#if SHIFT == 1 +#define SSE_HELPER_FMAS(name, elem, F) \ + void name(CPUX86State *env, Reg *d, Reg *a, Reg *b, Reg *c, int flags) \ + { \ + d->elem(0) = F(a->elem(0), b->elem(0), c->elem(0), flags, &env->sse_status); \ + } +#define SSE_HELPER_FMAP(name, elem, num, F) \ + void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *a, Reg *b, Reg *c, \ + int flags, int flip) \ + { \ + int i; \ + for (i = 0; i < num; i++) { \ + d->elem(i) = F(a->elem(i), b->elem(i), c->elem(i), flags, &env->sse_status); \ + flags ^= flip; \ + } \ + } + +SSE_HELPER_FMAS(helper_fma4ss, ZMM_S, float32_muladd) +SSE_HELPER_FMAS(helper_fma4sd, ZMM_D, float64_muladd) +#endif + +#if SHIFT >= 1 +SSE_HELPER_FMAP(helper_fma4ps, ZMM_S, 2 << SHIFT, float32_muladd) +SSE_HELPER_FMAP(helper_fma4pd, ZMM_D, 1 << SHIFT, float64_muladd) +#endif + #undef SSE_HELPER_S #undef LANE_WIDTH diff --git a/target/i386/ops_sse_header.h b/target/i386/ops_sse_header.h index c4c41976c0..8a7b2f4e2f 100644 --- a/target/i386/ops_sse_header.h +++ b/target/i386/ops_sse_header.h @@ -359,6 +359,17 @@ DEF_HELPER_3(glue(cvtph2ps, SUFFIX), void, env, Reg, Reg) DEF_HELPER_4(glue(cvtps2ph, SUFFIX), void, env, Reg, Reg, int) #endif +/* FMA3 helpers */ +#if SHIFT == 1 +DEF_HELPER_6(fma4ss, void, env, Reg, Reg, Reg, Reg, int) +DEF_HELPER_6(fma4sd, void, env, Reg, Reg, Reg, Reg, int) +#endif + +#if SHIFT >= 1 +DEF_HELPER_7(glue(fma4ps, SUFFIX), void, env, Reg, Reg, Reg, Reg, int, int) +DEF_HELPER_7(glue(fma4pd, SUFFIX), void, env, Reg, Reg, Reg, Reg, int, int) +#endif + /* AVX helpers */ #if SHIFT >= 1 DEF_HELPER_4(glue(vpermilpd, SUFFIX), void, env, Reg, Reg, Reg) diff --git a/target/i386/tcg/decode-new.c.inc b/target/i386/tcg/decode-new.c.inc index 8baee9018a..e4878b967f 100644 --- a/target/i386/tcg/decode-new.c.inc +++ b/target/i386/tcg/decode-new.c.inc @@ -376,6 +376,16 @@ static const X86OpEntry opcodes_0F38_00toEF[240] = { [0x92] = X86_OP_ENTRY3(VPGATHERD, V,x, H,x, M,d, vex12 cpuid(AVX2) p_66), /* vgatherdps/d */ [0x93] = X86_OP_ENTRY3(VPGATHERQ, V,x, H,x, M,q, vex12 cpuid(AVX2) p_66), /* vgatherqps/d */ + /* Should be exception type 2 but they do not have legacy SSE equivalents? */ + [0x96] = X86_OP_ENTRY3(VFMADDSUB132Px, V,x, H,x, W,x, vex6 cpuid(FMA) p_66), + [0x97] = X86_OP_ENTRY3(VFMSUBADD132Px, V,x, H,x, W,x, vex6 cpuid(FMA) p_66), + + [0xa6] = X86_OP_ENTRY3(VFMADDSUB213Px, V,x, H,x, W,x, vex6 cpuid(FMA) p_66), + [0xa7] = X86_OP_ENTRY3(VFMSUBADD213Px, V,x, H,x, W,x, vex6 cpuid(FMA) p_66), + + [0xb6] = X86_OP_ENTRY3(VFMADDSUB231Px, V,x, H,x, W,x, vex6 cpuid(FMA) p_66), + [0xb7] = X86_OP_ENTRY3(VFMSUBADD231Px, V,x, H,x, W,x, vex6 cpuid(FMA) p_66), + [0x08] = X86_OP_ENTRY3(PSIGNB, V,x, H,x, W,x, vex4 cpuid(SSSE3) mmx avx2_256 p_00_66), [0x09] = X86_OP_ENTRY3(PSIGNW, V,x, H,x, W,x, vex4 cpuid(SSSE3) mmx avx2_256 p_00_66), [0x0a] = X86_OP_ENTRY3(PSIGND, V,x, H,x, W,x, vex4 cpuid(SSSE3) mmx avx2_256 p_00_66), @@ -421,6 +431,34 @@ static const X86OpEntry opcodes_0F38_00toEF[240] = { [0x8c] = X86_OP_ENTRY3(VPMASKMOV, V,x, H,x, WM,x, vex6 cpuid(AVX2) p_66), [0x8e] = X86_OP_ENTRY3(VPMASKMOV_st, M,x, V,x, H,x, vex6 cpuid(AVX2) p_66), + /* Should be exception type 2 or 3 but they do not have legacy SSE equivalents? */ + [0x98] = X86_OP_ENTRY3(VFMADD132Px, V,x, H,x, W,x, vex6 cpuid(FMA) p_66), + [0x99] = X86_OP_ENTRY3(VFMADD132Sx, V,x, H,x, W,x, vex6 cpuid(FMA) p_66), + [0x9a] = X86_OP_ENTRY3(VFMSUB132Px, V,x, H,x, W,x, vex6 cpuid(FMA) p_66), + [0x9b] = X86_OP_ENTRY3(VFMSUB132Sx, V,x, H,x, W,x, vex6 cpuid(FMA) p_66), + [0x9c] = X86_OP_ENTRY3(VFNMADD132Px, V,x, H,x, W,x, vex6 cpuid(FMA) p_66), + [0x9d] = X86_OP_ENTRY3(VFNMADD132Sx, V,x, H,x, W,x, vex6 cpuid(FMA) p_66), + [0x9e] = X86_OP_ENTRY3(VFNMSUB132Px, V,x, H,x, W,x, vex6 cpuid(FMA) p_66), + [0x9f] = X86_OP_ENTRY3(VFNMSUB132Sx, V,x, H,x, W,x, vex6 cpuid(FMA) p_66), + + [0xa8] = X86_OP_ENTRY3(VFMADD213Px, V,x, H,x, W,x, vex6 cpuid(FMA) p_66), + [0xa9] = X86_OP_ENTRY3(VFMADD213Sx, V,x, H,x, W,x, vex6 cpuid(FMA) p_66), + [0xaa] = X86_OP_ENTRY3(VFMSUB213Px, V,x, H,x, W,x, vex6 cpuid(FMA) p_66), + [0xab] = X86_OP_ENTRY3(VFMSUB213Sx, V,x, H,x, W,x, vex6 cpuid(FMA) p_66), + [0xac] = X86_OP_ENTRY3(VFNMADD213Px, V,x, H,x, W,x, vex6 cpuid(FMA) p_66), + [0xad] = X86_OP_ENTRY3(VFNMADD213Sx, V,x, H,x, W,x, vex6 cpuid(FMA) p_66), + [0xae] = X86_OP_ENTRY3(VFNMSUB213Px, V,x, H,x, W,x, vex6 cpuid(FMA) p_66), + [0xaf] = X86_OP_ENTRY3(VFNMSUB213Sx, V,x, H,x, W,x, vex6 cpuid(FMA) p_66), + + [0xb8] = X86_OP_ENTRY3(VFMADD231Px, V,x, H,x, W,x, vex6 cpuid(FMA) p_66), + [0xb9] = X86_OP_ENTRY3(VFMADD231Sx, V,x, H,x, W,x, vex6 cpuid(FMA) p_66), + [0xba] = X86_OP_ENTRY3(VFMSUB231Px, V,x, H,x, W,x, vex6 cpuid(FMA) p_66), + [0xbb] = X86_OP_ENTRY3(VFMSUB231Sx, V,x, H,x, W,x, vex6 cpuid(FMA) p_66), + [0xbc] = X86_OP_ENTRY3(VFNMADD231Px, V,x, H,x, W,x, vex6 cpuid(FMA) p_66), + [0xbd] = X86_OP_ENTRY3(VFNMADD231Sx, V,x, H,x, W,x, vex6 cpuid(FMA) p_66), + [0xbe] = X86_OP_ENTRY3(VFNMSUB231Px, V,x, H,x, W,x, vex6 cpuid(FMA) p_66), + [0xbf] = X86_OP_ENTRY3(VFNMSUB231Sx, V,x, H,x, W,x, vex6 cpuid(FMA) p_66), + [0xdb] = X86_OP_ENTRY3(VAESIMC, V,dq, None,None, W,dq, vex4 cpuid(AES) p_66), [0xdc] = X86_OP_ENTRY3(VAESENC, V,x, H,x, W,x, vex4 cpuid(AES) p_66), [0xdd] = X86_OP_ENTRY3(VAESENCLAST, V,x, H,x, W,x, vex4 cpuid(AES) p_66), @@ -1350,6 +1388,8 @@ static bool has_cpuid_feature(DisasContext *s, X86CPUIDFeature cpuid) return true; case X86_FEAT_F16C: return (s->cpuid_ext_features & CPUID_EXT_F16C); + case X86_FEAT_FMA: + return (s->cpuid_ext_features & CPUID_EXT_FMA); case X86_FEAT_MOVBE: return (s->cpuid_ext_features & CPUID_EXT_MOVBE); case X86_FEAT_PCLMULQDQ: diff --git a/target/i386/tcg/decode-new.h b/target/i386/tcg/decode-new.h index 0ef54628ee..cb6b8bcf67 100644 --- a/target/i386/tcg/decode-new.h +++ b/target/i386/tcg/decode-new.h @@ -105,6 +105,7 @@ typedef enum X86CPUIDFeature { X86_FEAT_BMI1, X86_FEAT_BMI2, X86_FEAT_F16C, + X86_FEAT_FMA, X86_FEAT_MOVBE, X86_FEAT_PCLMULQDQ, X86_FEAT_SSE, diff --git a/target/i386/tcg/emit.c.inc b/target/i386/tcg/emit.c.inc index 9334f0939d..7037ff91c6 100644 --- a/target/i386/tcg/emit.c.inc +++ b/target/i386/tcg/emit.c.inc @@ -39,6 +39,11 @@ typedef void (*SSEFunc_0_eppt)(TCGv_ptr env, TCGv_ptr reg_a, TCGv_ptr reg_b, TCGv val); typedef void (*SSEFunc_0_epppti)(TCGv_ptr env, TCGv_ptr reg_a, TCGv_ptr reg_b, TCGv_ptr reg_c, TCGv a0, TCGv_i32 scale); +typedef void (*SSEFunc_0_eppppi)(TCGv_ptr env, TCGv_ptr reg_a, TCGv_ptr reg_b, + TCGv_ptr reg_c, TCGv_ptr reg_d, TCGv_i32 flags); +typedef void (*SSEFunc_0_eppppii)(TCGv_ptr env, TCGv_ptr reg_a, TCGv_ptr reg_b, + TCGv_ptr reg_c, TCGv_ptr reg_d, TCGv_i32 even, + TCGv_i32 odd); static inline TCGv_i32 tcg_constant8u_i32(uint8_t val) { @@ -491,6 +496,52 @@ FP_SSE(VMIN, min) FP_SSE(VDIV, div) FP_SSE(VMAX, max) +#define FMA_SSE_PACKED(uname, ptr0, ptr1, ptr2, even, odd) \ +static void gen_##uname##Px(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) \ +{ \ + SSEFunc_0_eppppii xmm = s->vex_w ? gen_helper_fma4pd_xmm : gen_helper_fma4ps_xmm; \ + SSEFunc_0_eppppii ymm = s->vex_w ? gen_helper_fma4pd_ymm : gen_helper_fma4ps_ymm; \ + SSEFunc_0_eppppii fn = s->vex_l ? ymm : xmm; \ + \ + fn(cpu_env, OP_PTR0, ptr0, ptr1, ptr2, \ + tcg_constant_i32(even), \ + tcg_constant_i32((even) ^ (odd))); \ +} + +#define FMA_SSE(uname, ptr0, ptr1, ptr2, flags) \ +FMA_SSE_PACKED(uname, ptr0, ptr1, ptr2, flags, flags) \ +static void gen_##uname##Sx(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) \ +{ \ + SSEFunc_0_eppppi fn = s->vex_w ? gen_helper_fma4sd : gen_helper_fma4ss; \ + \ + fn(cpu_env, OP_PTR0, ptr0, ptr1, ptr2, \ + tcg_constant_i32(flags)); \ +} \ + +FMA_SSE(VFMADD231, OP_PTR1, OP_PTR2, OP_PTR0, 0) +FMA_SSE(VFMADD213, OP_PTR1, OP_PTR0, OP_PTR2, 0) +FMA_SSE(VFMADD132, OP_PTR0, OP_PTR2, OP_PTR1, 0) + +FMA_SSE(VFNMADD231, OP_PTR1, OP_PTR2, OP_PTR0, float_muladd_negate_product) +FMA_SSE(VFNMADD213, OP_PTR1, OP_PTR0, OP_PTR2, float_muladd_negate_product) +FMA_SSE(VFNMADD132, OP_PTR0, OP_PTR2, OP_PTR1, float_muladd_negate_product) + +FMA_SSE(VFMSUB231, OP_PTR1, OP_PTR2, OP_PTR0, float_muladd_negate_c) +FMA_SSE(VFMSUB213, OP_PTR1, OP_PTR0, OP_PTR2, float_muladd_negate_c) +FMA_SSE(VFMSUB132, OP_PTR0, OP_PTR2, OP_PTR1, float_muladd_negate_c) + +FMA_SSE(VFNMSUB231, OP_PTR1, OP_PTR2, OP_PTR0, float_muladd_negate_c|float_muladd_negate_product) +FMA_SSE(VFNMSUB213, OP_PTR1, OP_PTR0, OP_PTR2, float_muladd_negate_c|float_muladd_negate_product) +FMA_SSE(VFNMSUB132, OP_PTR0, OP_PTR2, OP_PTR1, float_muladd_negate_c|float_muladd_negate_product) + +FMA_SSE_PACKED(VFMADDSUB231, OP_PTR1, OP_PTR2, OP_PTR0, float_muladd_negate_c, 0) +FMA_SSE_PACKED(VFMADDSUB213, OP_PTR1, OP_PTR0, OP_PTR2, float_muladd_negate_c, 0) +FMA_SSE_PACKED(VFMADDSUB132, OP_PTR0, OP_PTR2, OP_PTR1, float_muladd_negate_c, 0) + +FMA_SSE_PACKED(VFMSUBADD231, OP_PTR1, OP_PTR2, OP_PTR0, 0, float_muladd_negate_c) +FMA_SSE_PACKED(VFMSUBADD213, OP_PTR1, OP_PTR0, OP_PTR2, 0, float_muladd_negate_c) +FMA_SSE_PACKED(VFMSUBADD132, OP_PTR0, OP_PTR2, OP_PTR1, 0, float_muladd_negate_c) + #define FP_UNPACK_SSE(uname, lname) \ static void gen_##uname(DisasContext *s, CPUX86State *env, X86DecodedInsn *decode) \ { \ diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c index e19d5c1c64..85be2e58c2 100644 --- a/target/i386/tcg/translate.c +++ b/target/i386/tcg/translate.c @@ -26,6 +26,7 @@ #include "tcg/tcg-op-gvec.h" #include "exec/cpu_ldst.h" #include "exec/translator.h" +#include "fpu/softfloat.h" #include "exec/helper-proto.h" #include "exec/helper-gen.h" diff --git a/tests/tcg/i386/test-avx.py b/tests/tcg/i386/test-avx.py index ebb1d99c5e..d9ca00a49e 100755 --- a/tests/tcg/i386/test-avx.py +++ b/tests/tcg/i386/test-avx.py @@ -9,7 +9,7 @@ from fnmatch import fnmatch archs = [ "SSE", "SSE2", "SSE3", "SSSE3", "SSE4_1", "SSE4_2", "AES", "AVX", "AVX2", "AES+AVX", "VAES+AVX", - "F16C", + "F16C", "FMA", ] ignore = set(["FISTTP",