tcg/i386: use movbe instruction in qemu_ldst routines

The movbe instruction has been added on some Intel Atom CPUs and on
recent Intel Haswell CPUs. It allows a value to be loaded/stored and
byte-swapped at the same time.

This patch detects the availability of this instruction and, when available,
uses it in the qemu load/store routines in place of load/store +
bswap. Note that for 16-bit unsigned loads, movbe + movzw is basically the
same as movzw + bswap, so the patch doesn't touch this case.

Signed-off-by: Aurelien Jarno <aurelien@aurel32.net>
[RTH: Reduced the number of conditionals using "movop".]
Signed-off-by: Richard Henderson <rth@twiddle.net>
This commit is contained in:
Aurelien Jarno 2013-11-06 19:51:21 +01:00 committed by Richard Henderson
parent 2a1137753f
commit 085bb5bb64
1 changed file with 80 additions and 37 deletions

View File

@ -99,18 +99,31 @@ static const int tcg_target_call_oarg_regs[] = {
# define TCG_REG_L1 TCG_REG_EDX # define TCG_REG_L1 TCG_REG_EDX
#endif #endif
/* The host compiler should supply <cpuid.h> to enable runtime features
detection, as we're not going to go so far as our own inline assembly.
If not available, default values will be assumed. */
#if defined(CONFIG_CPUID_H)
#include <cpuid.h>
#endif
/* For 32-bit, we are going to attempt to determine at runtime whether cmov /* For 32-bit, we are going to attempt to determine at runtime whether cmov
is available. However, the host compiler must supply <cpuid.h>, as we're is available. */
not going to go so far as our own inline assembly. */
#if TCG_TARGET_REG_BITS == 64 #if TCG_TARGET_REG_BITS == 64
# define have_cmov 1 # define have_cmov 1
#elif defined(CONFIG_CPUID_H) #elif defined(CONFIG_CPUID_H)
#include <cpuid.h>
static bool have_cmov; static bool have_cmov;
#else #else
# define have_cmov 0 # define have_cmov 0
#endif #endif
/* If bit_MOVBE is defined in cpuid.h (added in GCC version 4.6), we are
going to attempt to determine at runtime whether movbe is available. */
#if defined(CONFIG_CPUID_H) && defined(bit_MOVBE)
static bool have_movbe;
#else
# define have_movbe 0
#endif
static uint8_t *tb_ret_addr; static uint8_t *tb_ret_addr;
static void patch_reloc(uint8_t *code_ptr, int type, static void patch_reloc(uint8_t *code_ptr, int type,
@ -280,6 +293,8 @@ static inline int tcg_target_const_match(tcg_target_long val,
#define OPC_MOVB_EvIz (0xc6) #define OPC_MOVB_EvIz (0xc6)
#define OPC_MOVL_EvIz (0xc7) #define OPC_MOVL_EvIz (0xc7)
#define OPC_MOVL_Iv (0xb8) #define OPC_MOVL_Iv (0xb8)
#define OPC_MOVBE_GyMy (0xf0 | P_EXT38)
#define OPC_MOVBE_MyGy (0xf1 | P_EXT38)
#define OPC_MOVSBL (0xbe | P_EXT) #define OPC_MOVSBL (0xbe | P_EXT)
#define OPC_MOVSWL (0xbf | P_EXT) #define OPC_MOVSWL (0xbf | P_EXT)
#define OPC_MOVSLQ (0x63 | P_REXW) #define OPC_MOVSLQ (0x63 | P_REXW)
@ -1344,7 +1359,14 @@ static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
TCGReg base, intptr_t ofs, int seg, TCGReg base, intptr_t ofs, int seg,
TCGMemOp memop) TCGMemOp memop)
{ {
const TCGMemOp bswap = memop & MO_BSWAP; const TCGMemOp real_bswap = memop & MO_BSWAP;
TCGMemOp bswap = real_bswap;
int movop = OPC_MOVL_GvEv;
if (have_movbe && real_bswap) {
bswap = 0;
movop = OPC_MOVBE_GyMy;
}
switch (memop & MO_SSIZE) { switch (memop & MO_SSIZE) {
case MO_UB: case MO_UB:
@ -1355,14 +1377,19 @@ static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
break; break;
case MO_UW: case MO_UW:
tcg_out_modrm_offset(s, OPC_MOVZWL + seg, datalo, base, ofs); tcg_out_modrm_offset(s, OPC_MOVZWL + seg, datalo, base, ofs);
if (bswap) { if (real_bswap) {
tcg_out_rolw_8(s, datalo); tcg_out_rolw_8(s, datalo);
} }
break; break;
case MO_SW: case MO_SW:
if (bswap) { if (real_bswap) {
if (have_movbe) {
tcg_out_modrm_offset(s, OPC_MOVBE_GyMy + P_DATA16 + seg,
datalo, base, ofs);
} else {
tcg_out_modrm_offset(s, OPC_MOVZWL + seg, datalo, base, ofs); tcg_out_modrm_offset(s, OPC_MOVZWL + seg, datalo, base, ofs);
tcg_out_rolw_8(s, datalo); tcg_out_rolw_8(s, datalo);
}
tcg_out_modrm(s, OPC_MOVSWL + P_REXW, datalo, datalo); tcg_out_modrm(s, OPC_MOVSWL + P_REXW, datalo, datalo);
} else { } else {
tcg_out_modrm_offset(s, OPC_MOVSWL + P_REXW + seg, tcg_out_modrm_offset(s, OPC_MOVSWL + P_REXW + seg,
@ -1370,16 +1397,18 @@ static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
} }
break; break;
case MO_UL: case MO_UL:
tcg_out_modrm_offset(s, OPC_MOVL_GvEv + seg, datalo, base, ofs); tcg_out_modrm_offset(s, movop + seg, datalo, base, ofs);
if (bswap) { if (bswap) {
tcg_out_bswap32(s, datalo); tcg_out_bswap32(s, datalo);
} }
break; break;
#if TCG_TARGET_REG_BITS == 64 #if TCG_TARGET_REG_BITS == 64
case MO_SL: case MO_SL:
if (real_bswap) {
tcg_out_modrm_offset(s, movop + seg, datalo, base, ofs);
if (bswap) { if (bswap) {
tcg_out_modrm_offset(s, OPC_MOVL_GvEv + seg, datalo, base, ofs);
tcg_out_bswap32(s, datalo); tcg_out_bswap32(s, datalo);
}
tcg_out_ext32s(s, datalo, datalo); tcg_out_ext32s(s, datalo, datalo);
} else { } else {
tcg_out_modrm_offset(s, OPC_MOVSLQ + seg, datalo, base, ofs); tcg_out_modrm_offset(s, OPC_MOVSLQ + seg, datalo, base, ofs);
@ -1388,27 +1417,22 @@ static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
#endif #endif
case MO_Q: case MO_Q:
if (TCG_TARGET_REG_BITS == 64) { if (TCG_TARGET_REG_BITS == 64) {
tcg_out_modrm_offset(s, OPC_MOVL_GvEv + P_REXW + seg, tcg_out_modrm_offset(s, movop + P_REXW + seg, datalo, base, ofs);
datalo, base, ofs);
if (bswap) { if (bswap) {
tcg_out_bswap64(s, datalo); tcg_out_bswap64(s, datalo);
} }
} else { } else {
if (bswap) { if (real_bswap) {
int t = datalo; int t = datalo;
datalo = datahi; datalo = datahi;
datahi = t; datahi = t;
} }
if (base != datalo) { if (base != datalo) {
tcg_out_modrm_offset(s, OPC_MOVL_GvEv + seg, tcg_out_modrm_offset(s, movop + seg, datalo, base, ofs);
datalo, base, ofs); tcg_out_modrm_offset(s, movop + seg, datahi, base, ofs + 4);
tcg_out_modrm_offset(s, OPC_MOVL_GvEv + seg,
datahi, base, ofs + 4);
} else { } else {
tcg_out_modrm_offset(s, OPC_MOVL_GvEv + seg, tcg_out_modrm_offset(s, movop + seg, datahi, base, ofs + 4);
datahi, base, ofs + 4); tcg_out_modrm_offset(s, movop + seg, datalo, base, ofs);
tcg_out_modrm_offset(s, OPC_MOVL_GvEv + seg,
datalo, base, ofs);
} }
if (bswap) { if (bswap) {
tcg_out_bswap32(s, datalo); tcg_out_bswap32(s, datalo);
@ -1484,13 +1508,19 @@ static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
TCGReg base, intptr_t ofs, int seg, TCGReg base, intptr_t ofs, int seg,
TCGMemOp memop) TCGMemOp memop)
{ {
const TCGMemOp bswap = memop & MO_BSWAP;
/* ??? Ideally we wouldn't need a scratch register. For user-only, /* ??? Ideally we wouldn't need a scratch register. For user-only,
we could perform the bswap twice to restore the original value we could perform the bswap twice to restore the original value
instead of moving to the scratch. But as it is, the L constraint instead of moving to the scratch. But as it is, the L constraint
means that TCG_REG_L0 is definitely free here. */ means that TCG_REG_L0 is definitely free here. */
const TCGReg scratch = TCG_REG_L0; const TCGReg scratch = TCG_REG_L0;
const TCGMemOp real_bswap = memop & MO_BSWAP;
TCGMemOp bswap = real_bswap;
int movop = OPC_MOVL_EvGv;
if (have_movbe && real_bswap) {
bswap = 0;
movop = OPC_MOVBE_MyGy;
}
switch (memop & MO_SIZE) { switch (memop & MO_SIZE) {
case MO_8: case MO_8:
@ -1509,8 +1539,7 @@ static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
tcg_out_rolw_8(s, scratch); tcg_out_rolw_8(s, scratch);
datalo = scratch; datalo = scratch;
} }
tcg_out_modrm_offset(s, OPC_MOVL_EvGv + P_DATA16 + seg, tcg_out_modrm_offset(s, movop + P_DATA16 + seg, datalo, base, ofs);
datalo, base, ofs);
break; break;
case MO_32: case MO_32:
if (bswap) { if (bswap) {
@ -1518,7 +1547,7 @@ static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
tcg_out_bswap32(s, scratch); tcg_out_bswap32(s, scratch);
datalo = scratch; datalo = scratch;
} }
tcg_out_modrm_offset(s, OPC_MOVL_EvGv + seg, datalo, base, ofs); tcg_out_modrm_offset(s, movop + seg, datalo, base, ofs);
break; break;
case MO_64: case MO_64:
if (TCG_TARGET_REG_BITS == 64) { if (TCG_TARGET_REG_BITS == 64) {
@ -1527,8 +1556,7 @@ static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
tcg_out_bswap64(s, scratch); tcg_out_bswap64(s, scratch);
datalo = scratch; datalo = scratch;
} }
tcg_out_modrm_offset(s, OPC_MOVL_EvGv + P_REXW + seg, tcg_out_modrm_offset(s, movop + P_REXW + seg, datalo, base, ofs);
datalo, base, ofs);
} else if (bswap) { } else if (bswap) {
tcg_out_mov(s, TCG_TYPE_I32, scratch, datahi); tcg_out_mov(s, TCG_TYPE_I32, scratch, datahi);
tcg_out_bswap32(s, scratch); tcg_out_bswap32(s, scratch);
@ -1537,8 +1565,13 @@ static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
tcg_out_bswap32(s, scratch); tcg_out_bswap32(s, scratch);
tcg_out_modrm_offset(s, OPC_MOVL_EvGv + seg, scratch, base, ofs+4); tcg_out_modrm_offset(s, OPC_MOVL_EvGv + seg, scratch, base, ofs+4);
} else { } else {
tcg_out_modrm_offset(s, OPC_MOVL_EvGv + seg, datalo, base, ofs); if (real_bswap) {
tcg_out_modrm_offset(s, OPC_MOVL_EvGv + seg, datahi, base, ofs+4); int t = datalo;
datalo = datahi;
datahi = t;
}
tcg_out_modrm_offset(s, movop + seg, datalo, base, ofs);
tcg_out_modrm_offset(s, movop + seg, datahi, base, ofs+4);
} }
break; break;
default: default:
@ -2165,13 +2198,23 @@ static void tcg_target_qemu_prologue(TCGContext *s)
static void tcg_target_init(TCGContext *s) static void tcg_target_init(TCGContext *s)
{ {
/* For 32-bit, 99% certainty that we're running on hardware that supports #if !(defined(have_cmov) && defined(have_movbe))
cmov, but we still need to check. In case cmov is not available, we'll
use a small forward branch. */
#ifndef have_cmov
{ {
unsigned a, b, c, d; unsigned a, b, c, d;
have_cmov = (__get_cpuid(1, &a, &b, &c, &d) && (d & bit_CMOV)); int ret = __get_cpuid(1, &a, &b, &c, &d);
# ifndef have_cmov
/* For 32-bit, 99% certainty that we're running on hardware that
supports cmov, but we still need to check. In case cmov is not
available, we'll use a small forward branch. */
have_cmov = ret && (d & bit_CMOV);
# endif
# ifndef have_movbe
/* MOVBE is only available on Intel Atom and Haswell CPUs, so we
need to probe for it. */
have_movbe = ret && (c & bit_MOVBE);
# endif
} }
#endif #endif