From 11ffaf8c73aae1a70f4640ada14a437a78d06efb Mon Sep 17 00:00:00 2001
From: Paolo Bonzini
Date: Thu, 9 May 2024 15:11:41 +0200
Subject: [PATCH] target/i386: convert LZCNT/TZCNT/BSF/BSR/POPCNT to new decoder

Reviewed-by: Richard Henderson
Signed-off-by: Paolo Bonzini
---
 target/i386/tcg/decode-new.c.inc | 52 +++++++++++++++++++-
 target/i386/tcg/decode-new.h     |  1 +
 target/i386/tcg/emit.c.inc       | 82 ++++++++++++++++++++++++++++++++
 target/i386/tcg/translate.c      | 74 ----------------------------
 4 files changed, 133 insertions(+), 76 deletions(-)

diff --git a/target/i386/tcg/decode-new.c.inc b/target/i386/tcg/decode-new.c.inc
index 598112d93d..e159280b51 100644
--- a/target/i386/tcg/decode-new.c.inc
+++ b/target/i386/tcg/decode-new.c.inc
@@ -450,6 +450,50 @@ static void decode_0F7F(DisasContext *s, CPUX86State *env, X86OpEntry *entry, ui
     *entry = *decode_by_prefix(s, opcodes_0F7F);
 }
 
+static void decode_0FB8(DisasContext *s, CPUX86State *env, X86OpEntry *entry, uint8_t *b)
+{
+    static const X86OpEntry popcnt =
+        X86_OP_ENTRYwr(POPCNT, G,v, E,v, cpuid(POPCNT) zextT0);
+
+    if (s->prefix & PREFIX_REPZ) {
+        *entry = popcnt;
+    } else {
+        memset(entry, 0, sizeof(*entry));
+    }
+}
+
+static void decode_0FBC(DisasContext *s, CPUX86State *env, X86OpEntry *entry, uint8_t *b)
+{
+    /* For BSF, pass 2op as the third operand so that we can use zextT0 */
+    static const X86OpEntry opcodes_0FBC[4] = {
+        X86_OP_ENTRY3(BSF,     G,v, E,v, 2op,v, zextT0),
+        X86_OP_ENTRY3(BSF,     G,v, E,v, 2op,v, zextT0), /* 0x66 */
+        X86_OP_ENTRYwr(TZCNT,  G,v, E,v,        zextT0), /* 0xf3 */
+        X86_OP_ENTRY3(BSF,     G,v, E,v, 2op,v, zextT0), /* 0xf2 */
+    };
+    if (!(s->cpuid_7_0_ebx_features & CPUID_7_0_EBX_BMI1)) {
+        *entry = opcodes_0FBC[0]; /* TZCNT needs BMI1, else F3 0F BC is BSF */
+    } else {
+        *entry = *decode_by_prefix(s, opcodes_0FBC);
+    }
+}
+
+static void decode_0FBD(DisasContext *s, CPUX86State *env, X86OpEntry *entry, uint8_t *b)
+{
+    /* For BSR, pass 2op as the third operand so that we can use zextT0 */
+    static const X86OpEntry opcodes_0FBD[4] = {
+        X86_OP_ENTRY3(BSR,     G,v, E,v, 2op,v, zextT0),
+        X86_OP_ENTRY3(BSR,     G,v, E,v, 2op,v, zextT0), /* 0x66 */
+        X86_OP_ENTRYwr(LZCNT,  G,v, E,v,        zextT0), /* 0xf3 */
+        X86_OP_ENTRY3(BSR,     G,v, E,v, 2op,v, zextT0), /* 0xf2 */
+    };
+    if (!(s->cpuid_ext3_features & CPUID_EXT3_ABM)) {
+        *entry = opcodes_0FBD[0]; /* LZCNT needs ABM, else F3 0F BD is BSR */
+    } else {
+        *entry = *decode_by_prefix(s, opcodes_0FBD);
+    }
+}
+
 static void decode_0FD6(DisasContext *s, CPUX86State *env, X86OpEntry *entry, uint8_t *b)
 {
     static const X86OpEntry movq[4] = {
@@ -1255,8 +1299,11 @@ static const X86OpEntry opcodes_0F[256] = {
      */
     [0xaf] = X86_OP_ENTRY3(IMUL3, G,v, E,v, 2op,v, sextT0),
 
+    [0xb8] = X86_OP_GROUP0(0FB8),
     /* decoded as modrm, which is visible as a difference between page fault and #UD */
     [0xb9] = X86_OP_ENTRYr(UD, nop,v), /* UD1 */
+    [0xbc] = X86_OP_GROUP0(0FBC),
+    [0xbd] = X86_OP_GROUP0(0FBD),
     [0xbe] = X86_OP_ENTRY3(MOV, G,v, E,b, None, None, sextT0), /* MOVSX */
     [0xbf] = X86_OP_ENTRY3(MOV, G,v, E,w, None, None, sextT0), /* MOVSX */
 
@@ -2158,6 +2205,8 @@ static bool has_cpuid_feature(DisasContext *s, X86CPUIDFeature cpuid)
         return (s->cpuid_ext_features & CPUID_EXT_MOVBE);
     case X86_FEAT_PCLMULQDQ:
         return (s->cpuid_ext_features & CPUID_EXT_PCLMULQDQ);
+    case X86_FEAT_POPCNT:
+        return (s->cpuid_ext_features & CPUID_EXT_POPCNT);
     case X86_FEAT_SSE:
         return (s->cpuid_features & CPUID_SSE);
     case X86_FEAT_SSE2:
@@ -2548,8 +2597,7 @@ static void disas_insn(DisasContext *s, CPUState *cpu)
         case 0xab: /* bts */
         case 0xb0 ... 0xb1: /* cmpxchg */
         case 0xb3: /* btr */
-        case 0xb8: /* integer ops */
-        case 0xba ... 0xbd: /* integer ops */
+        case 0xba ... 0xbb: /* grp8, btc */
         case 0xc0 ... 0xc1: /* xadd */
         case 0xc7: /* grp9 */
             disas_insn_old(s, cpu, b + 0x100);
diff --git a/target/i386/tcg/decode-new.h b/target/i386/tcg/decode-new.h
index e8d7d69954..f9bf9a6041 100644
--- a/target/i386/tcg/decode-new.h
+++ b/target/i386/tcg/decode-new.h
@@ -120,6 +120,7 @@ typedef enum X86CPUIDFeature {
     X86_FEAT_FXSR,
     X86_FEAT_MOVBE,
     X86_FEAT_PCLMULQDQ,
+    X86_FEAT_POPCNT,
     X86_FEAT_SHA_NI,
     X86_FEAT_SSE,
     X86_FEAT_SSE2,
diff --git a/target/i386/tcg/emit.c.inc b/target/i386/tcg/emit.c.inc
index 2b1f0f1b9b..99b08eb719 100644
--- a/target/i386/tcg/emit.c.inc
+++ b/target/i386/tcg/emit.c.inc
@@ -1333,6 +1333,47 @@ static void gen_BOUND(DisasContext *s, X86DecodedInsn *decode)
     }
 }
 
+/* Non-standard convention - on entry T0 is zero-extended input, T1 is the output. */
+static void gen_BSF(DisasContext *s, X86DecodedInsn *decode)
+{
+    MemOp ot = decode->op[0].ot;
+
+    /* Only the Z bit is defined and it is related to the input. */
+    decode->cc_dst = tcg_temp_new();
+    decode->cc_op = CC_OP_LOGICB + ot;
+    tcg_gen_mov_tl(decode->cc_dst, s->T0);
+
+    /*
+     * The manual says that the output is undefined when the
+     * input is zero, but real hardware leaves it unchanged, and
+     * real programs appear to depend on that. Accomplish this
+     * by passing the output as the value to return upon zero.
+     */
+    tcg_gen_ctz_tl(s->T0, s->T0, s->T1);
+}
+
+/* Non-standard convention - on entry T0 is zero-extended input, T1 is the output. */
+static void gen_BSR(DisasContext *s, X86DecodedInsn *decode)
+{
+    MemOp ot = decode->op[0].ot;
+
+    /* Only the Z bit is defined and it is related to the input. */
+    decode->cc_dst = tcg_temp_new();
+    decode->cc_op = CC_OP_LOGICB + ot;
+    tcg_gen_mov_tl(decode->cc_dst, s->T0);
+
+    /*
+     * The manual says that the output is undefined when the
+     * input is zero, but real hardware leaves it unchanged, and
+     * real programs appear to depend on that. Accomplish this
+     * by passing the output as the value to return upon zero.
+     * Plus, return the bit index of the first 1 bit.
+     */
+    tcg_gen_xori_tl(s->T1, s->T1, TARGET_LONG_BITS - 1);
+    tcg_gen_clz_tl(s->T0, s->T0, s->T1);
+    tcg_gen_xori_tl(s->T0, s->T0, TARGET_LONG_BITS - 1);
+}
+
 static void gen_BSWAP(DisasContext *s, X86DecodedInsn *decode)
 {
 #ifdef TARGET_X86_64
@@ -2134,6 +2175,24 @@ static void gen_LSS(DisasContext *s, X86DecodedInsn *decode)
     gen_lxx_seg(s, decode, R_SS);
 }
 
+static void gen_LZCNT(DisasContext *s, X86DecodedInsn *decode)
+{
+    MemOp ot = decode->op[0].ot;
+
+    /* C bit (cc_src) is defined related to the input. */
+    decode->cc_src = tcg_temp_new();
+    decode->cc_dst = s->T0;
+    decode->cc_op = CC_OP_BMILGB + ot;
+    tcg_gen_mov_tl(decode->cc_src, s->T0);
+
+    /*
+     * Reduce the target_ulong result by the number of zeros that
+     * we expect to find at the top.
+     */
+    tcg_gen_clzi_tl(s->T0, s->T0, TARGET_LONG_BITS);
+    tcg_gen_subi_tl(s->T0, s->T0, TARGET_LONG_BITS - (8 << ot));
+}
+
 static void gen_MFENCE(DisasContext *s, X86DecodedInsn *decode)
 {
     tcg_gen_mb(TCG_MO_ALL | TCG_BAR_SC);
@@ -2692,6 +2751,15 @@ static void gen_POPA(DisasContext *s, X86DecodedInsn *decode)
     gen_popa(s);
 }
 
+static void gen_POPCNT(DisasContext *s, X86DecodedInsn *decode)
+{
+    decode->cc_src = tcg_temp_new();
+    decode->cc_op = CC_OP_POPCNT;
+
+    tcg_gen_mov_tl(decode->cc_src, s->T0);
+    tcg_gen_ctpop_tl(s->T0, s->T0);
+}
+
 static void gen_POPF(DisasContext *s, X86DecodedInsn *decode)
 {
     MemOp ot;
@@ -3773,6 +3841,20 @@ static void gen_SYSRET(DisasContext *s, X86DecodedInsn *decode)
     s->base.is_jmp = DISAS_EOB_RECHECK_TF;
 }
 
+static void gen_TZCNT(DisasContext *s, X86DecodedInsn *decode)
+{
+    MemOp ot = decode->op[0].ot;
+
+    /* C bit (cc_src) is defined related to the input. */
+    decode->cc_src = tcg_temp_new();
+    decode->cc_dst = s->T0;
+    decode->cc_op = CC_OP_BMILGB + ot;
+    tcg_gen_mov_tl(decode->cc_src, s->T0);
+
+    /* A zero input returns the operand size. */
+    tcg_gen_ctzi_tl(s->T0, s->T0, 8 << ot);
+}
+
 static void gen_UD(DisasContext *s, X86DecodedInsn *decode)
 {
     gen_illegal_opcode(s);
diff --git a/target/i386/tcg/translate.c b/target/i386/tcg/translate.c
index 33058db4e3..68a11f8178 100644
--- a/target/i386/tcg/translate.c
+++ b/target/i386/tcg/translate.c
@@ -823,11 +823,6 @@ static void gen_movs(DisasContext *s, MemOp ot)
     gen_op_add_reg(s, s->aflag, R_EDI, dshift);
 }
 
-static void gen_op_update1_cc(DisasContext *s)
-{
-    tcg_gen_mov_tl(cpu_cc_dst, s->T0);
-}
-
 static void gen_op_update2_cc(DisasContext *s)
 {
     tcg_gen_mov_tl(cpu_cc_src, s->T1);
@@ -3311,56 +3306,6 @@ static void disas_insn_old(DisasContext *s, CPUState *cpu, int b)
             break;
         }
         break;
-    case 0x1bc: /* bsf / tzcnt */
-    case 0x1bd: /* bsr / lzcnt */
-        ot = dflag;
-        modrm = x86_ldub_code(env, s);
-        reg = ((modrm >> 3) & 7) | REX_R(s);
-        gen_ld_modrm(env, s, modrm, ot);
-        gen_extu(ot, s->T0);
-
-        /* Note that lzcnt and tzcnt are in different extensions. */
-        if ((prefixes & PREFIX_REPZ)
-            && (b & 1
-                ? s->cpuid_ext3_features & CPUID_EXT3_ABM
-                : s->cpuid_7_0_ebx_features & CPUID_7_0_EBX_BMI1)) {
-            int size = 8 << ot;
-            /* For lzcnt/tzcnt, C bit is defined related to the input. */
-            tcg_gen_mov_tl(cpu_cc_src, s->T0);
-            if (b & 1) {
-                /* For lzcnt, reduce the target_ulong result by the
-                   number of zeros that we expect to find at the top. */
-                tcg_gen_clzi_tl(s->T0, s->T0, TARGET_LONG_BITS);
-                tcg_gen_subi_tl(s->T0, s->T0, TARGET_LONG_BITS - size);
-            } else {
-                /* For tzcnt, a zero input must return the operand size. */
-                tcg_gen_ctzi_tl(s->T0, s->T0, size);
-            }
-            /* For lzcnt/tzcnt, Z bit is defined related to the result. */
-            gen_op_update1_cc(s);
-            set_cc_op(s, CC_OP_BMILGB + ot);
-        } else {
-            /* For bsr/bsf, only the Z bit is defined and it is related
-               to the input and not the result. */
-            tcg_gen_mov_tl(cpu_cc_dst, s->T0);
-            set_cc_op(s, CC_OP_LOGICB + ot);
-
-            /* ??? The manual says that the output is undefined when the
-               input is zero, but real hardware leaves it unchanged, and
-               real programs appear to depend on that. Accomplish this
-               by passing the output as the value to return upon zero. */
-            if (b & 1) {
-                /* For bsr, return the bit index of the first 1 bit,
-                   not the count of leading zeros. */
-                tcg_gen_xori_tl(s->T1, cpu_regs[reg], TARGET_LONG_BITS - 1);
-                tcg_gen_clz_tl(s->T0, s->T0, s->T1);
-                tcg_gen_xori_tl(s->T0, s->T0, TARGET_LONG_BITS - 1);
-            } else {
-                tcg_gen_ctz_tl(s->T0, s->T0, cpu_regs[reg]);
-            }
-        }
-        gen_op_mov_reg_v(s, ot, reg, s->T0);
-        break;
     case 0x100:
         modrm = x86_ldub_code(env, s);
         mod = (modrm >> 6) & 3;
@@ -3955,25 +3900,6 @@ static void disas_insn_old(DisasContext *s, CPUState *cpu, int b)
         }
         gen_nop_modrm(env, s, modrm);
         break;
-    case 0x1b8: /* SSE4.2 popcnt */
-        if ((prefixes & (PREFIX_REPZ | PREFIX_LOCK | PREFIX_REPNZ)) !=
-             PREFIX_REPZ)
-            goto illegal_op;
-        if (!(s->cpuid_ext_features & CPUID_EXT_POPCNT))
-            goto illegal_op;
-
-        modrm = x86_ldub_code(env, s);
-        reg = ((modrm >> 3) & 7) | REX_R(s);
-
-        ot = dflag;
-        gen_ld_modrm(env, s, modrm, ot);
-        gen_extu(ot, s->T0);
-        tcg_gen_mov_tl(cpu_cc_src, s->T0);
-        tcg_gen_ctpop_tl(s->T0, s->T0);
-        gen_op_mov_reg_v(s, ot, reg, s->T0);
-
-        set_cc_op(s, CC_OP_POPCNT);
-        break;
     default:
         g_assert_not_reached();
     }