mirror of https://github.com/xemu-project/xemu.git
tcg/i386: use movbe instruction in qemu_ldst routines
The movbe instruction has been added on some Intel Atom CPUs and on recent Intel Haswell CPUs. It allows loading/storing a value and at the same time byte-swapping it. This patch detects the availability of this instruction and, when available, uses it in the qemu load/store routines in place of load/store + bswap. Note that for 16-bit unsigned loads, movbe + movzw is basically the same as movzw + bswap, so the patch doesn't touch this case. Signed-off-by: Aurelien Jarno <aurelien@aurel32.net> [RTH: Reduced the number of conditionals using "movop".] Signed-off-by: Richard Henderson <rth@twiddle.net>
This commit is contained in:
parent
2a1137753f
commit
085bb5bb64
|
@ -99,18 +99,31 @@ static const int tcg_target_call_oarg_regs[] = {
|
||||||
# define TCG_REG_L1 TCG_REG_EDX
|
# define TCG_REG_L1 TCG_REG_EDX
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
/* The host compiler should supply <cpuid.h> to enable runtime features
|
||||||
|
detection, as we're not going to go so far as our own inline assembly.
|
||||||
|
If not available, default values will be assumed. */
|
||||||
|
#if defined(CONFIG_CPUID_H)
|
||||||
|
#include <cpuid.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
/* For 32-bit, we are going to attempt to determine at runtime whether cmov
|
/* For 32-bit, we are going to attempt to determine at runtime whether cmov
|
||||||
is available. However, the host compiler must supply <cpuid.h>, as we're
|
is available. */
|
||||||
not going to go so far as our own inline assembly. */
|
|
||||||
#if TCG_TARGET_REG_BITS == 64
|
#if TCG_TARGET_REG_BITS == 64
|
||||||
# define have_cmov 1
|
# define have_cmov 1
|
||||||
#elif defined(CONFIG_CPUID_H)
|
#elif defined(CONFIG_CPUID_H)
|
||||||
#include <cpuid.h>
|
|
||||||
static bool have_cmov;
|
static bool have_cmov;
|
||||||
#else
|
#else
|
||||||
# define have_cmov 0
|
# define have_cmov 0
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
/* If bit_MOVBE is defined in cpuid.h (added in GCC version 4.6), we are
|
||||||
|
going to attempt to determine at runtime whether movbe is available. */
|
||||||
|
#if defined(CONFIG_CPUID_H) && defined(bit_MOVBE)
|
||||||
|
static bool have_movbe;
|
||||||
|
#else
|
||||||
|
# define have_movbe 0
|
||||||
|
#endif
|
||||||
|
|
||||||
static uint8_t *tb_ret_addr;
|
static uint8_t *tb_ret_addr;
|
||||||
|
|
||||||
static void patch_reloc(uint8_t *code_ptr, int type,
|
static void patch_reloc(uint8_t *code_ptr, int type,
|
||||||
|
@ -280,6 +293,8 @@ static inline int tcg_target_const_match(tcg_target_long val,
|
||||||
#define OPC_MOVB_EvIz (0xc6)
|
#define OPC_MOVB_EvIz (0xc6)
|
||||||
#define OPC_MOVL_EvIz (0xc7)
|
#define OPC_MOVL_EvIz (0xc7)
|
||||||
#define OPC_MOVL_Iv (0xb8)
|
#define OPC_MOVL_Iv (0xb8)
|
||||||
|
#define OPC_MOVBE_GyMy (0xf0 | P_EXT38)
|
||||||
|
#define OPC_MOVBE_MyGy (0xf1 | P_EXT38)
|
||||||
#define OPC_MOVSBL (0xbe | P_EXT)
|
#define OPC_MOVSBL (0xbe | P_EXT)
|
||||||
#define OPC_MOVSWL (0xbf | P_EXT)
|
#define OPC_MOVSWL (0xbf | P_EXT)
|
||||||
#define OPC_MOVSLQ (0x63 | P_REXW)
|
#define OPC_MOVSLQ (0x63 | P_REXW)
|
||||||
|
@ -1344,7 +1359,14 @@ static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
|
||||||
TCGReg base, intptr_t ofs, int seg,
|
TCGReg base, intptr_t ofs, int seg,
|
||||||
TCGMemOp memop)
|
TCGMemOp memop)
|
||||||
{
|
{
|
||||||
const TCGMemOp bswap = memop & MO_BSWAP;
|
const TCGMemOp real_bswap = memop & MO_BSWAP;
|
||||||
|
TCGMemOp bswap = real_bswap;
|
||||||
|
int movop = OPC_MOVL_GvEv;
|
||||||
|
|
||||||
|
if (have_movbe && real_bswap) {
|
||||||
|
bswap = 0;
|
||||||
|
movop = OPC_MOVBE_GyMy;
|
||||||
|
}
|
||||||
|
|
||||||
switch (memop & MO_SSIZE) {
|
switch (memop & MO_SSIZE) {
|
||||||
case MO_UB:
|
case MO_UB:
|
||||||
|
@ -1355,14 +1377,19 @@ static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
|
||||||
break;
|
break;
|
||||||
case MO_UW:
|
case MO_UW:
|
||||||
tcg_out_modrm_offset(s, OPC_MOVZWL + seg, datalo, base, ofs);
|
tcg_out_modrm_offset(s, OPC_MOVZWL + seg, datalo, base, ofs);
|
||||||
if (bswap) {
|
if (real_bswap) {
|
||||||
tcg_out_rolw_8(s, datalo);
|
tcg_out_rolw_8(s, datalo);
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
case MO_SW:
|
case MO_SW:
|
||||||
if (bswap) {
|
if (real_bswap) {
|
||||||
|
if (have_movbe) {
|
||||||
|
tcg_out_modrm_offset(s, OPC_MOVBE_GyMy + P_DATA16 + seg,
|
||||||
|
datalo, base, ofs);
|
||||||
|
} else {
|
||||||
tcg_out_modrm_offset(s, OPC_MOVZWL + seg, datalo, base, ofs);
|
tcg_out_modrm_offset(s, OPC_MOVZWL + seg, datalo, base, ofs);
|
||||||
tcg_out_rolw_8(s, datalo);
|
tcg_out_rolw_8(s, datalo);
|
||||||
|
}
|
||||||
tcg_out_modrm(s, OPC_MOVSWL + P_REXW, datalo, datalo);
|
tcg_out_modrm(s, OPC_MOVSWL + P_REXW, datalo, datalo);
|
||||||
} else {
|
} else {
|
||||||
tcg_out_modrm_offset(s, OPC_MOVSWL + P_REXW + seg,
|
tcg_out_modrm_offset(s, OPC_MOVSWL + P_REXW + seg,
|
||||||
|
@ -1370,16 +1397,18 @@ static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
case MO_UL:
|
case MO_UL:
|
||||||
tcg_out_modrm_offset(s, OPC_MOVL_GvEv + seg, datalo, base, ofs);
|
tcg_out_modrm_offset(s, movop + seg, datalo, base, ofs);
|
||||||
if (bswap) {
|
if (bswap) {
|
||||||
tcg_out_bswap32(s, datalo);
|
tcg_out_bswap32(s, datalo);
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
#if TCG_TARGET_REG_BITS == 64
|
#if TCG_TARGET_REG_BITS == 64
|
||||||
case MO_SL:
|
case MO_SL:
|
||||||
|
if (real_bswap) {
|
||||||
|
tcg_out_modrm_offset(s, movop + seg, datalo, base, ofs);
|
||||||
if (bswap) {
|
if (bswap) {
|
||||||
tcg_out_modrm_offset(s, OPC_MOVL_GvEv + seg, datalo, base, ofs);
|
|
||||||
tcg_out_bswap32(s, datalo);
|
tcg_out_bswap32(s, datalo);
|
||||||
|
}
|
||||||
tcg_out_ext32s(s, datalo, datalo);
|
tcg_out_ext32s(s, datalo, datalo);
|
||||||
} else {
|
} else {
|
||||||
tcg_out_modrm_offset(s, OPC_MOVSLQ + seg, datalo, base, ofs);
|
tcg_out_modrm_offset(s, OPC_MOVSLQ + seg, datalo, base, ofs);
|
||||||
|
@ -1388,27 +1417,22 @@ static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
|
||||||
#endif
|
#endif
|
||||||
case MO_Q:
|
case MO_Q:
|
||||||
if (TCG_TARGET_REG_BITS == 64) {
|
if (TCG_TARGET_REG_BITS == 64) {
|
||||||
tcg_out_modrm_offset(s, OPC_MOVL_GvEv + P_REXW + seg,
|
tcg_out_modrm_offset(s, movop + P_REXW + seg, datalo, base, ofs);
|
||||||
datalo, base, ofs);
|
|
||||||
if (bswap) {
|
if (bswap) {
|
||||||
tcg_out_bswap64(s, datalo);
|
tcg_out_bswap64(s, datalo);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
if (bswap) {
|
if (real_bswap) {
|
||||||
int t = datalo;
|
int t = datalo;
|
||||||
datalo = datahi;
|
datalo = datahi;
|
||||||
datahi = t;
|
datahi = t;
|
||||||
}
|
}
|
||||||
if (base != datalo) {
|
if (base != datalo) {
|
||||||
tcg_out_modrm_offset(s, OPC_MOVL_GvEv + seg,
|
tcg_out_modrm_offset(s, movop + seg, datalo, base, ofs);
|
||||||
datalo, base, ofs);
|
tcg_out_modrm_offset(s, movop + seg, datahi, base, ofs + 4);
|
||||||
tcg_out_modrm_offset(s, OPC_MOVL_GvEv + seg,
|
|
||||||
datahi, base, ofs + 4);
|
|
||||||
} else {
|
} else {
|
||||||
tcg_out_modrm_offset(s, OPC_MOVL_GvEv + seg,
|
tcg_out_modrm_offset(s, movop + seg, datahi, base, ofs + 4);
|
||||||
datahi, base, ofs + 4);
|
tcg_out_modrm_offset(s, movop + seg, datalo, base, ofs);
|
||||||
tcg_out_modrm_offset(s, OPC_MOVL_GvEv + seg,
|
|
||||||
datalo, base, ofs);
|
|
||||||
}
|
}
|
||||||
if (bswap) {
|
if (bswap) {
|
||||||
tcg_out_bswap32(s, datalo);
|
tcg_out_bswap32(s, datalo);
|
||||||
|
@ -1484,13 +1508,19 @@ static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
|
||||||
TCGReg base, intptr_t ofs, int seg,
|
TCGReg base, intptr_t ofs, int seg,
|
||||||
TCGMemOp memop)
|
TCGMemOp memop)
|
||||||
{
|
{
|
||||||
const TCGMemOp bswap = memop & MO_BSWAP;
|
|
||||||
|
|
||||||
/* ??? Ideally we wouldn't need a scratch register. For user-only,
|
/* ??? Ideally we wouldn't need a scratch register. For user-only,
|
||||||
we could perform the bswap twice to restore the original value
|
we could perform the bswap twice to restore the original value
|
||||||
instead of moving to the scratch. But as it is, the L constraint
|
instead of moving to the scratch. But as it is, the L constraint
|
||||||
means that TCG_REG_L0 is definitely free here. */
|
means that TCG_REG_L0 is definitely free here. */
|
||||||
const TCGReg scratch = TCG_REG_L0;
|
const TCGReg scratch = TCG_REG_L0;
|
||||||
|
const TCGMemOp real_bswap = memop & MO_BSWAP;
|
||||||
|
TCGMemOp bswap = real_bswap;
|
||||||
|
int movop = OPC_MOVL_EvGv;
|
||||||
|
|
||||||
|
if (have_movbe && real_bswap) {
|
||||||
|
bswap = 0;
|
||||||
|
movop = OPC_MOVBE_MyGy;
|
||||||
|
}
|
||||||
|
|
||||||
switch (memop & MO_SIZE) {
|
switch (memop & MO_SIZE) {
|
||||||
case MO_8:
|
case MO_8:
|
||||||
|
@ -1509,8 +1539,7 @@ static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
|
||||||
tcg_out_rolw_8(s, scratch);
|
tcg_out_rolw_8(s, scratch);
|
||||||
datalo = scratch;
|
datalo = scratch;
|
||||||
}
|
}
|
||||||
tcg_out_modrm_offset(s, OPC_MOVL_EvGv + P_DATA16 + seg,
|
tcg_out_modrm_offset(s, movop + P_DATA16 + seg, datalo, base, ofs);
|
||||||
datalo, base, ofs);
|
|
||||||
break;
|
break;
|
||||||
case MO_32:
|
case MO_32:
|
||||||
if (bswap) {
|
if (bswap) {
|
||||||
|
@ -1518,7 +1547,7 @@ static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
|
||||||
tcg_out_bswap32(s, scratch);
|
tcg_out_bswap32(s, scratch);
|
||||||
datalo = scratch;
|
datalo = scratch;
|
||||||
}
|
}
|
||||||
tcg_out_modrm_offset(s, OPC_MOVL_EvGv + seg, datalo, base, ofs);
|
tcg_out_modrm_offset(s, movop + seg, datalo, base, ofs);
|
||||||
break;
|
break;
|
||||||
case MO_64:
|
case MO_64:
|
||||||
if (TCG_TARGET_REG_BITS == 64) {
|
if (TCG_TARGET_REG_BITS == 64) {
|
||||||
|
@ -1527,8 +1556,7 @@ static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
|
||||||
tcg_out_bswap64(s, scratch);
|
tcg_out_bswap64(s, scratch);
|
||||||
datalo = scratch;
|
datalo = scratch;
|
||||||
}
|
}
|
||||||
tcg_out_modrm_offset(s, OPC_MOVL_EvGv + P_REXW + seg,
|
tcg_out_modrm_offset(s, movop + P_REXW + seg, datalo, base, ofs);
|
||||||
datalo, base, ofs);
|
|
||||||
} else if (bswap) {
|
} else if (bswap) {
|
||||||
tcg_out_mov(s, TCG_TYPE_I32, scratch, datahi);
|
tcg_out_mov(s, TCG_TYPE_I32, scratch, datahi);
|
||||||
tcg_out_bswap32(s, scratch);
|
tcg_out_bswap32(s, scratch);
|
||||||
|
@ -1537,8 +1565,13 @@ static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
|
||||||
tcg_out_bswap32(s, scratch);
|
tcg_out_bswap32(s, scratch);
|
||||||
tcg_out_modrm_offset(s, OPC_MOVL_EvGv + seg, scratch, base, ofs+4);
|
tcg_out_modrm_offset(s, OPC_MOVL_EvGv + seg, scratch, base, ofs+4);
|
||||||
} else {
|
} else {
|
||||||
tcg_out_modrm_offset(s, OPC_MOVL_EvGv + seg, datalo, base, ofs);
|
if (real_bswap) {
|
||||||
tcg_out_modrm_offset(s, OPC_MOVL_EvGv + seg, datahi, base, ofs+4);
|
int t = datalo;
|
||||||
|
datalo = datahi;
|
||||||
|
datahi = t;
|
||||||
|
}
|
||||||
|
tcg_out_modrm_offset(s, movop + seg, datalo, base, ofs);
|
||||||
|
tcg_out_modrm_offset(s, movop + seg, datahi, base, ofs+4);
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
|
@ -2165,13 +2198,23 @@ static void tcg_target_qemu_prologue(TCGContext *s)
|
||||||
|
|
||||||
static void tcg_target_init(TCGContext *s)
|
static void tcg_target_init(TCGContext *s)
|
||||||
{
|
{
|
||||||
/* For 32-bit, 99% certainty that we're running on hardware that supports
|
#if !(defined(have_cmov) && defined(have_movbe))
|
||||||
cmov, but we still need to check. In case cmov is not available, we'll
|
|
||||||
use a small forward branch. */
|
|
||||||
#ifndef have_cmov
|
|
||||||
{
|
{
|
||||||
unsigned a, b, c, d;
|
unsigned a, b, c, d;
|
||||||
have_cmov = (__get_cpuid(1, &a, &b, &c, &d) && (d & bit_CMOV));
|
int ret = __get_cpuid(1, &a, &b, &c, &d);
|
||||||
|
|
||||||
|
# ifndef have_cmov
|
||||||
|
/* For 32-bit, 99% certainty that we're running on hardware that
|
||||||
|
supports cmov, but we still need to check. In case cmov is not
|
||||||
|
available, we'll use a small forward branch. */
|
||||||
|
have_cmov = ret && (d & bit_CMOV);
|
||||||
|
# endif
|
||||||
|
|
||||||
|
# ifndef have_movbe
|
||||||
|
/* MOVBE is only available on Intel Atom and Haswell CPUs, so we
|
||||||
|
need to probe for it. */
|
||||||
|
have_movbe = ret && (c & bit_MOVBE);
|
||||||
|
# endif
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue