JIT: support paired load/store with MMU on

Also change the calling convention, to avoid RSCRATCH being clobbered by
memcheck'd loads.
Fiora 2014-08-31 10:37:23 -07:00
parent 2661bc151a
commit 09a62505c5
5 changed files with 166 additions and 99 deletions
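In short, the new register convention for the quantized load/store routines (a summary sketch pieced together from the diff below):

    // psq_l / psq_st now set up the call like this:
    //   RSCRATCH_EXTRA = effective guest address
    //   RSCRATCH2      = GQR masked with 0x3F07 (holds the scale bits, kept live across the call)
    //   RSCRATCH       = low byte of RSCRATCH2 (the type), used to index the routine table
    MOV(32, R(RSCRATCH2), Imm32(0x3F07));
    AND(32, R(RSCRATCH2), PPCSTATE(spr[SPR_GQR0 + inst.I]));
    MOVZX(32, 8, RSCRATCH, R(RSCRATCH2));
    CALLptr(MScaled(RSCRATCH, SCALE_8, (u32)(u64)asm_routines.pairedLoadQuantized));

The routines read the scale index from RSCRATCH2 (SHR by 6), so a memcheck'd SafeLoadToReg is free to clobber RSCRATCH; for the quantized integer loads, RSCRATCH2 is kept alive across that call via the QUANTIZED_REGS_TO_SAVE_LOAD mask.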


@@ -20,30 +20,31 @@ void Jit64::psq_st(UGeckoInstruction inst)
 {
     INSTRUCTION_START
     JITDISABLE(bJITLoadStorePairedOff);
-    FALLBACK_IF(js.memcheck || !inst.RA);
+    FALLBACK_IF(!inst.RA);

     bool update = inst.OPCD == 61;
     int offset = inst.SIMM_12;
     int a = inst.RA;
-    int s = inst.RS; // Fp numbers
+    int s = inst.RS;

-    gpr.FlushLockX(RSCRATCH, RSCRATCH_EXTRA);
+    gpr.FlushLockX(RSCRATCH_EXTRA);
     if (update)
-        gpr.BindToRegister(inst.RA, true, true);
-    fpr.BindToRegister(inst.RS, true, false);
-    MOV(32, R(RSCRATCH_EXTRA), gpr.R(inst.RA));
+        gpr.BindToRegister(a, true, true);
+    fpr.BindToRegister(s, true, false);
+    MOV(32, R(RSCRATCH_EXTRA), gpr.R(a));
     if (offset)
         ADD(32, R(RSCRATCH_EXTRA), Imm32((u32)offset));
-    if (update && offset)
+    // In memcheck mode, don't update the address until the exception check
+    if (update && offset && !js.memcheck)
         MOV(32, gpr.R(a), R(RSCRATCH_EXTRA));

     // Some games (e.g. Dirt 2) incorrectly set the unused bits which breaks the lookup table code.
     // Hence, we need to mask out the unused bits. The layout of the GQR register is
     // UU[SCALE]UUUUU[TYPE] where SCALE is 6 bits and TYPE is 3 bits, so we have to AND with
     // 0b0011111100000111, or 0x3F07.
-    MOV(32, R(RSCRATCH), Imm32(0x3F07));
-    AND(32, R(RSCRATCH), PPCSTATE(spr[SPR_GQR0 + inst.I]));
-    MOVZX(32, 8, RSCRATCH2, R(RSCRATCH));
+    MOV(32, R(RSCRATCH2), Imm32(0x3F07));
+    AND(32, R(RSCRATCH2), PPCSTATE(spr[SPR_GQR0 + inst.I]));
+    MOVZX(32, 8, RSCRATCH, R(RSCRATCH2));

     // FIXME: Fix ModR/M encoding to allow [RSCRATCH2*4+disp32] without a base register!
     if (inst.W)
@@ -51,13 +52,20 @@ void Jit64::psq_st(UGeckoInstruction inst)
         // One value
         PXOR(XMM0, R(XMM0)); // TODO: See if we can get rid of this cheaply by tweaking the code in the singleStore* functions.
         CVTSD2SS(XMM0, fpr.R(s));
-        CALLptr(MScaled(RSCRATCH2, SCALE_8, (u32)(u64)asm_routines.singleStoreQuantized));
+        CALLptr(MScaled(RSCRATCH, SCALE_8, (u32)(u64)asm_routines.singleStoreQuantized));
     }
     else
     {
         // Pair of values
         CVTPD2PS(XMM0, fpr.R(s));
-        CALLptr(MScaled(RSCRATCH2, SCALE_8, (u32)(u64)asm_routines.pairedStoreQuantized));
+        CALLptr(MScaled(RSCRATCH, SCALE_8, (u32)(u64)asm_routines.pairedStoreQuantized));
     }
+
+    if (update && offset && js.memcheck)
+    {
+        MEMCHECK_START
+        ADD(32, gpr.R(a), Imm32((u32)offset));
+        MEMCHECK_END
+    }

     gpr.UnlockAll();
     gpr.UnlockAllX();
@@ -67,33 +75,38 @@ void Jit64::psq_l(UGeckoInstruction inst)
 {
     INSTRUCTION_START
     JITDISABLE(bJITLoadStorePairedOff);
-    FALLBACK_IF(js.memcheck || !inst.RA);
+    FALLBACK_IF(!inst.RA);

     bool update = inst.OPCD == 57;
     int offset = inst.SIMM_12;
+    int a = inst.RA;
+    int s = inst.RS;

-    gpr.FlushLockX(RSCRATCH, RSCRATCH_EXTRA);
-    gpr.BindToRegister(inst.RA, true, update && offset);
-    fpr.BindToRegister(inst.RS, false, true);
+    gpr.FlushLockX(RSCRATCH_EXTRA);
+    gpr.BindToRegister(a, true, update && offset);
+    fpr.BindToRegister(s, false, true);
     if (offset)
-        LEA(32, RSCRATCH_EXTRA, MDisp(gpr.RX(inst.RA), offset));
+        LEA(32, RSCRATCH_EXTRA, MDisp(gpr.RX(a), offset));
     else
-        MOV(32, R(RSCRATCH_EXTRA), gpr.R(inst.RA));
-    if (update && offset)
-        MOV(32, gpr.R(inst.RA), R(RSCRATCH_EXTRA));
-    MOV(32, R(RSCRATCH), Imm32(0x3F07));
-    AND(32, R(RSCRATCH), M(((char *)&GQR(inst.I)) + 2));
-    MOVZX(32, 8, RSCRATCH2, R(RSCRATCH));
+        MOV(32, R(RSCRATCH_EXTRA), gpr.R(a));
+    // In memcheck mode, don't update the address until the exception check
+    if (update && offset && !js.memcheck)
+        MOV(32, gpr.R(a), R(RSCRATCH_EXTRA));
+    MOV(32, R(RSCRATCH2), Imm32(0x3F07));
+    AND(32, R(RSCRATCH2), M(((char *)&GQR(inst.I)) + 2));
+    MOVZX(32, 8, RSCRATCH, R(RSCRATCH2));
     if (inst.W)
-        OR(32, R(RSCRATCH2), Imm8(8));
-    CALLptr(MScaled(RSCRATCH2, SCALE_8, (u32)(u64)asm_routines.pairedLoadQuantized));
+        OR(32, R(RSCRATCH), Imm8(8));
+    CALLptr(MScaled(RSCRATCH, SCALE_8, (u32)(u64)asm_routines.pairedLoadQuantized));

-    // MEMCHECK_START // FIXME: MMU does not work here because of unsafe memory access
-
-    CVTPS2PD(fpr.RX(inst.RS), R(XMM0));
-
-    // MEMCHECK_END
+    MEMCHECK_START
+    CVTPS2PD(fpr.RX(s), R(XMM0));
+    if (update && offset && js.memcheck)
+    {
+        ADD(32, gpr.R(a), Imm32((u32)offset));
+    }
+    MEMCHECK_END

     gpr.UnlockAll();
     gpr.UnlockAllX();


@@ -1590,13 +1590,13 @@ static void DoWriteCode(IRBuilder* ibuild, JitIL* Jit, u32 exitAddress)
     // Hence, we need to mask out the unused bits. The layout of the GQR register is
     // UU[SCALE]UUUUU[TYPE] where SCALE is 6 bits and TYPE is 3 bits, so we have to AND with
     // 0b0011111100000111, or 0x3F07.
-    Jit->MOV(32, R(RSCRATCH), Imm32(0x3F07));
-    Jit->AND(32, R(RSCRATCH), M(((char *)&GQR(quantreg)) + 2));
-    Jit->MOVZX(32, 8, RSCRATCH2, R(RSCRATCH));
-    Jit->OR(32, R(RSCRATCH2), Imm8(w << 3));
+    Jit->MOV(32, R(RSCRATCH2), Imm32(0x3F07));
+    Jit->AND(32, R(RSCRATCH2), M(((char *)&GQR(quantreg)) + 2));
+    Jit->MOVZX(32, 8, RSCRATCH, R(RSCRATCH2));
+    Jit->OR(32, R(RSCRATCH), Imm8(w << 3));
     Jit->MOV(32, R(RSCRATCH_EXTRA), regLocForInst(RI, getOp1(I)));
-    Jit->CALLptr(MScaled(RSCRATCH2, SCALE_8, (u32)(u64)(((JitIL *)jit)->asm_routines.pairedLoadQuantized)));
+    Jit->CALLptr(MScaled(RSCRATCH, SCALE_8, (u32)(u64)(((JitIL *)jit)->asm_routines.pairedLoadQuantized)));
     Jit->MOVAPD(reg, R(XMM0));
     RI.fregs[reg] = I;
     regNormalRegClear(RI, I);
@@ -1641,13 +1641,13 @@ static void DoWriteCode(IRBuilder* ibuild, JitIL* Jit, u32 exitAddress)
     regSpill(RI, RSCRATCH);
     regSpill(RI, RSCRATCH2);
     u32 quantreg = *I >> 24;
-    Jit->MOV(32, R(RSCRATCH), Imm32(0x3F07));
-    Jit->AND(32, R(RSCRATCH), PPCSTATE(spr[SPR_GQR0 + quantreg]));
-    Jit->MOVZX(32, 8, RSCRATCH2, R(RSCRATCH));
+    Jit->MOV(32, R(RSCRATCH2), Imm32(0x3F07));
+    Jit->AND(32, R(RSCRATCH2), PPCSTATE(spr[SPR_GQR0 + quantreg]));
+    Jit->MOVZX(32, 8, RSCRATCH, R(RSCRATCH2));
     Jit->MOV(32, R(RSCRATCH_EXTRA), regLocForInst(RI, getOp2(I)));
     Jit->MOVAPD(XMM0, fregLocForInst(RI, getOp1(I)));
-    Jit->CALLptr(MScaled(RSCRATCH2, SCALE_8, (u32)(u64)(((JitIL *)jit)->asm_routines.pairedStoreQuantized)));
+    Jit->CALLptr(MScaled(RSCRATCH, SCALE_8, (u32)(u64)(((JitIL *)jit)->asm_routines.pairedStoreQuantized)));
     if (RI.IInfo[I - RI.FirstI] & 4)
         fregClearInst(RI, getOp1(I));
     if (RI.IInfo[I - RI.FirstI] & 8)


@@ -17,6 +17,8 @@
     (1 << (XMM0+16)) | \
     (1 << (XMM1+16))))

+#define QUANTIZED_REGS_TO_SAVE_LOAD (QUANTIZED_REGS_TO_SAVE | (1 << RSCRATCH2))
+
 using namespace Gen;

 static int temp32;
@@ -250,24 +252,29 @@ void CommonAsmRoutines::GenQuantizedStores()
     UD2();

     const u8* storePairedFloat = AlignCode4();
+    FixupBranch skip_complex, too_complex;
     SHUFPS(XMM0, R(XMM0), 1);
     MOVQ_xmm(M(&psTemp[0]), XMM0);
-    TEST(32, R(RSCRATCH_EXTRA), Imm32(0x0C000000));
-    FixupBranch too_complex = J_CC(CC_NZ, true);
-    MOV(64, R(RSCRATCH), M(&psTemp[0]));
-    SwapAndStore(64, MComplex(RMEM, RSCRATCH_EXTRA, SCALE_1, 0), RSCRATCH);
-    FixupBranch skip_complex = J(true);
-    SetJumpTarget(too_complex);
+    if (!jit->js.memcheck)
+    {
+        TEST(32, R(RSCRATCH_EXTRA), Imm32(0x0C000000));
+        too_complex = J_CC(CC_NZ, true);
+        MOV(64, R(RSCRATCH), M(&psTemp[0]));
+        SwapAndStore(64, MComplex(RMEM, RSCRATCH_EXTRA, SCALE_1, 0), RSCRATCH);
+        skip_complex = J(true);
+        SetJumpTarget(too_complex);
+    }
     // RSP alignment here is 8 due to the call.
     ABI_PushRegistersAndAdjustStack(QUANTIZED_REGS_TO_SAVE, 8);
     ABI_CallFunctionR((void *)&WriteDual32, RSCRATCH_EXTRA);
     ABI_PopRegistersAndAdjustStack(QUANTIZED_REGS_TO_SAVE, 8);
-    SetJumpTarget(skip_complex);
+    if (!jit->js.memcheck)
+        SetJumpTarget(skip_complex);
     RET();

     const u8* storePairedU8 = AlignCode4();
-    SHR(32, R(RSCRATCH), Imm8(6));
-    MOVSS(XMM1, MDisp(RSCRATCH, (u32)(u64)m_quantizeTableS));
+    SHR(32, R(RSCRATCH2), Imm8(6));
+    MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
     PUNPCKLDQ(XMM1, R(XMM1));
     MULPS(XMM0, R(XMM1));
 #ifdef QUANTIZE_OVERFLOW_SAFE
@@ -284,8 +291,8 @@ void CommonAsmRoutines::GenQuantizedStores()
     RET();

     const u8* storePairedS8 = AlignCode4();
-    SHR(32, R(RSCRATCH), Imm8(6));
-    MOVSS(XMM1, MDisp(RSCRATCH, (u32)(u64)m_quantizeTableS));
+    SHR(32, R(RSCRATCH2), Imm8(6));
+    MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
     PUNPCKLDQ(XMM1, R(XMM1));
     MULPS(XMM0, R(XMM1));
 #ifdef QUANTIZE_OVERFLOW_SAFE
@@ -303,8 +310,8 @@ void CommonAsmRoutines::GenQuantizedStores()
     RET();

     const u8* storePairedU16 = AlignCode4();
-    SHR(32, R(RSCRATCH), Imm8(6));
-    MOVSS(XMM1, MDisp(RSCRATCH, (u32)(u64)m_quantizeTableS));
+    SHR(32, R(RSCRATCH2), Imm8(6));
+    MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
     PUNPCKLDQ(XMM1, R(XMM1));
     MULPS(XMM0, R(XMM1));
@@ -329,8 +336,8 @@ void CommonAsmRoutines::GenQuantizedStores()
     RET();

     const u8* storePairedS16 = AlignCode4();
-    SHR(32, R(RSCRATCH), Imm8(6));
-    MOVSS(XMM1, MDisp(RSCRATCH, (u32)(u64)m_quantizeTableS));
+    SHR(32, R(RSCRATCH2), Imm8(6));
+    MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
     // SHUFPS or UNPCKLPS might be a better choice here. The last one might just be an alias though.
     PUNPCKLDQ(XMM1, R(XMM1));
     MULPS(XMM0, R(XMM1));
@@ -388,8 +395,8 @@ void CommonAsmRoutines::GenQuantizedSingleStores()
     }*/

     const u8* storeSingleU8 = AlignCode4(); // Used by MKWii
-    SHR(32, R(RSCRATCH), Imm8(6));
-    MOVSS(XMM1, MDisp(RSCRATCH, (u32)(u64)m_quantizeTableS));
+    SHR(32, R(RSCRATCH2), Imm8(6));
+    MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
     MULSS(XMM0, R(XMM1));
     PXOR(XMM1, R(XMM1));
     MAXSS(XMM0, R(XMM1));
@@ -399,8 +406,8 @@ void CommonAsmRoutines::GenQuantizedSingleStores()
     RET();

     const u8* storeSingleS8 = AlignCode4();
-    SHR(32, R(RSCRATCH), Imm8(6));
-    MOVSS(XMM1, MDisp(RSCRATCH, (u32)(u64)m_quantizeTableS));
+    SHR(32, R(RSCRATCH2), Imm8(6));
+    MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
     MULSS(XMM0, R(XMM1));
     MAXSS(XMM0, M((void *)&m_m128));
     MINSS(XMM0, M((void *)&m_127));
@@ -409,8 +416,8 @@ void CommonAsmRoutines::GenQuantizedSingleStores()
     RET();

     const u8* storeSingleU16 = AlignCode4(); // Used by MKWii
-    SHR(32, R(RSCRATCH), Imm8(6));
-    MOVSS(XMM1, MDisp(RSCRATCH, (u32)(u64)m_quantizeTableS));
+    SHR(32, R(RSCRATCH2), Imm8(6));
+    MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
     MULSS(XMM0, R(XMM1));
     PXOR(XMM1, R(XMM1));
     MAXSS(XMM0, R(XMM1));
@@ -420,8 +427,8 @@ void CommonAsmRoutines::GenQuantizedSingleStores()
     RET();

     const u8* storeSingleS16 = AlignCode4();
-    SHR(32, R(RSCRATCH), Imm8(6));
-    MOVSS(XMM1, MDisp(RSCRATCH, (u32)(u64)m_quantizeTableS));
+    SHR(32, R(RSCRATCH2), Imm8(6));
+    MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
     MULSS(XMM0, R(XMM1));
     MAXSS(XMM0, M((void *)&m_m32768));
     MINSS(XMM0, M((void *)&m_32767));
@@ -448,7 +455,13 @@ void CommonAsmRoutines::GenQuantizedLoads()
     UD2();

     const u8* loadPairedFloatTwo = AlignCode4();
-    if (cpu_info.bSSSE3)
+    if (jit->js.memcheck)
+    {
+        SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 64, 0, QUANTIZED_REGS_TO_SAVE, false, SAFE_LOADSTORE_NO_PROLOG);
+        ROL(64, R(RSCRATCH_EXTRA), Imm8(32));
+        MOVQ_xmm(XMM0, R(RSCRATCH_EXTRA));
+    }
+    else if (cpu_info.bSSSE3)
     {
         MOVQ_xmm(XMM0, MComplex(RMEM, RSCRATCH_EXTRA, 1, 0));
         PSHUFB(XMM0, M((void *)pbswapShuffle2x4));
@@ -462,7 +475,13 @@ void CommonAsmRoutines::GenQuantizedLoads()
     RET();

     const u8* loadPairedFloatOne = AlignCode4();
-    if (cpu_info.bSSSE3)
+    if (jit->js.memcheck)
+    {
+        SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 32, 0, QUANTIZED_REGS_TO_SAVE, false, SAFE_LOADSTORE_NO_PROLOG);
+        MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
+        UNPCKLPS(XMM0, M((void*)m_one));
+    }
+    else if (cpu_info.bSSSE3)
     {
         MOVD_xmm(XMM0, MComplex(RMEM, RSCRATCH_EXTRA, 1, 0));
         PSHUFB(XMM0, M((void *)pbswapShuffle1x4));
@@ -477,99 +496,130 @@ void CommonAsmRoutines::GenQuantizedLoads()
     RET();

     const u8* loadPairedU8Two = AlignCode4();
-    UnsafeLoadRegToRegNoSwap(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 16, 0);
+    if (jit->js.memcheck)
+    {
+        // TODO: Support not swapping in safeLoadToReg to avoid bswapping twice
+        SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 16, 0, QUANTIZED_REGS_TO_SAVE_LOAD, false, SAFE_LOADSTORE_NO_PROLOG);
+        ROR(16, R(RSCRATCH_EXTRA), Imm8(8));
+    }
+    else
+    {
+        UnsafeLoadRegToRegNoSwap(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 16, 0);
+    }
     MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
     PXOR(XMM1, R(XMM1));
     PUNPCKLBW(XMM0, R(XMM1));
     PUNPCKLWD(XMM0, R(XMM1));
     CVTDQ2PS(XMM0, R(XMM0));
-    SHR(32, R(RSCRATCH), Imm8(6));
-    MOVSS(XMM1, MDisp(RSCRATCH, (u32)(u64)m_dequantizeTableS));
+    SHR(32, R(RSCRATCH2), Imm8(6));
+    MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
     PUNPCKLDQ(XMM1, R(XMM1));
     MULPS(XMM0, R(XMM1));
     RET();

     const u8* loadPairedU8One = AlignCode4();
-    UnsafeLoadRegToRegNoSwap(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 8, 0); // RSCRATCH_EXTRA = 0x000000xx
+    if (jit->js.memcheck)
+        SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 8, 0, QUANTIZED_REGS_TO_SAVE_LOAD, false, SAFE_LOADSTORE_NO_PROLOG);
+    else
+        UnsafeLoadRegToRegNoSwap(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 8, 0); // RSCRATCH_EXTRA = 0x000000xx
     MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
     CVTDQ2PS(XMM0, R(XMM0)); // Is CVTSI2SS better?
-    SHR(32, R(RSCRATCH), Imm8(6));
-    MOVSS(XMM1, MDisp(RSCRATCH, (u32)(u64)m_dequantizeTableS));
+    SHR(32, R(RSCRATCH2), Imm8(6));
+    MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
     MULSS(XMM0, R(XMM1));
     UNPCKLPS(XMM0, M((void*)m_one));
     RET();

     const u8* loadPairedS8Two = AlignCode4();
-    UnsafeLoadRegToRegNoSwap(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 16, 0);
+    if (jit->js.memcheck)
+    {
+        // TODO: Support not swapping in safeLoadToReg to avoid bswapping twice
+        SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 16, 0, QUANTIZED_REGS_TO_SAVE_LOAD, false, SAFE_LOADSTORE_NO_PROLOG);
+        ROR(16, R(RSCRATCH_EXTRA), Imm8(8));
+    }
+    else
+    {
+        UnsafeLoadRegToRegNoSwap(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 16, 0);
+    }
     MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
     PUNPCKLBW(XMM0, R(XMM0));
     PUNPCKLWD(XMM0, R(XMM0));
     PSRAD(XMM0, 24);
     CVTDQ2PS(XMM0, R(XMM0));
-    SHR(32, R(RSCRATCH), Imm8(6));
-    MOVSS(XMM1, MDisp(RSCRATCH, (u32)(u64)m_dequantizeTableS));
+    SHR(32, R(RSCRATCH2), Imm8(6));
+    MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
     PUNPCKLDQ(XMM1, R(XMM1));
     MULPS(XMM0, R(XMM1));
     RET();

     const u8* loadPairedS8One = AlignCode4();
-    UnsafeLoadRegToRegNoSwap(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 8, 0);
-    SHL(32, R(RSCRATCH_EXTRA), Imm8(24));
-    SAR(32, R(RSCRATCH_EXTRA), Imm8(24));
+    if (jit->js.memcheck)
+        SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 8, 0, QUANTIZED_REGS_TO_SAVE_LOAD, true, SAFE_LOADSTORE_NO_PROLOG);
+    else
+        UnsafeLoadRegToRegNoSwap(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 8, 0, true);
     MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
     CVTDQ2PS(XMM0, R(XMM0));
-    SHR(32, R(RSCRATCH), Imm8(6));
-    MOVSS(XMM1, MDisp(RSCRATCH, (u32)(u64)m_dequantizeTableS));
+    SHR(32, R(RSCRATCH2), Imm8(6));
+    MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
     MULSS(XMM0, R(XMM1));
     UNPCKLPS(XMM0, M((void*)m_one));
     RET();

     const u8* loadPairedU16Two = AlignCode4();
-    UnsafeLoadRegToReg(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 32, 0, false);
+    // TODO: Support not swapping in (un)safeLoadToReg to avoid bswapping twice
+    if (jit->js.memcheck)
+        SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 32, 0, QUANTIZED_REGS_TO_SAVE_LOAD, false, SAFE_LOADSTORE_NO_PROLOG);
+    else
+        UnsafeLoadRegToReg(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 32, 0, false);
     ROL(32, R(RSCRATCH_EXTRA), Imm8(16));
     MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
     PXOR(XMM1, R(XMM1));
     PUNPCKLWD(XMM0, R(XMM1));
     CVTDQ2PS(XMM0, R(XMM0));
-    SHR(32, R(RSCRATCH), Imm8(6));
-    MOVSS(XMM1, MDisp(RSCRATCH, (u32)(u64)m_dequantizeTableS));
+    SHR(32, R(RSCRATCH2), Imm8(6));
+    MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
     PUNPCKLDQ(XMM1, R(XMM1));
     MULPS(XMM0, R(XMM1));
     RET();

     const u8* loadPairedU16One = AlignCode4();
-    UnsafeLoadRegToReg(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 32, 0, false);
-    SHR(32, R(RSCRATCH_EXTRA), Imm8(16));
+    if (jit->js.memcheck)
+        SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 16, 0, QUANTIZED_REGS_TO_SAVE_LOAD, false, SAFE_LOADSTORE_NO_PROLOG);
+    else
+        UnsafeLoadRegToReg(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 16, 0, false);
     MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
     CVTDQ2PS(XMM0, R(XMM0));
-    SHR(32, R(RSCRATCH), Imm8(6));
-    MOVSS(XMM1, MDisp(RSCRATCH, (u32)(u64)m_dequantizeTableS));
+    SHR(32, R(RSCRATCH2), Imm8(6));
+    MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
     MULSS(XMM0, R(XMM1));
     UNPCKLPS(XMM0, M((void*)m_one));
     RET();

     const u8* loadPairedS16Two = AlignCode4();
-    UnsafeLoadRegToReg(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 32, 0, false);
+    if (jit->js.memcheck)
+        SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 32, 0, QUANTIZED_REGS_TO_SAVE_LOAD, false, SAFE_LOADSTORE_NO_PROLOG);
+    else
+        UnsafeLoadRegToReg(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 32, 0, false);
     ROL(32, R(RSCRATCH_EXTRA), Imm8(16));
     MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
     PUNPCKLWD(XMM0, R(XMM0));
     PSRAD(XMM0, 16);
     CVTDQ2PS(XMM0, R(XMM0));
-    SHR(32, R(RSCRATCH), Imm8(6));
-    AND(32, R(RSCRATCH), Imm32(0xFC));
-    MOVSS(XMM1, MDisp(RSCRATCH, (u32)(u64)m_dequantizeTableS));
+    SHR(32, R(RSCRATCH2), Imm8(6));
+    MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
     PUNPCKLDQ(XMM1, R(XMM1));
     MULPS(XMM0, R(XMM1));
     RET();

     const u8* loadPairedS16One = AlignCode4();
-    UnsafeLoadRegToReg(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 32, 0, false);
-    SAR(32, R(RSCRATCH_EXTRA), Imm8(16));
+    if (jit->js.memcheck)
+        SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 16, 0, QUANTIZED_REGS_TO_SAVE_LOAD, true, SAFE_LOADSTORE_NO_PROLOG);
+    else
+        UnsafeLoadRegToReg(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 16, 0, true);
     MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
     CVTDQ2PS(XMM0, R(XMM0));
-    SHR(32, R(RSCRATCH), Imm8(6));
-    AND(32, R(RSCRATCH), Imm32(0xFC));
-    MOVSS(XMM1, MDisp(RSCRATCH, (u32)(u64)m_dequantizeTableS));
+    SHR(32, R(RSCRATCH2), Imm8(6));
+    MOVSS(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
     MULSS(XMM0, R(XMM1));
     UNPCKLPS(XMM0, M((void*)m_one));
     RET();


@@ -61,9 +61,12 @@ void EmuCodeBlock::UnsafeLoadRegToReg(X64Reg reg_addr, X64Reg reg_value, int acc
     }
 }

-void EmuCodeBlock::UnsafeLoadRegToRegNoSwap(X64Reg reg_addr, X64Reg reg_value, int accessSize, s32 offset)
+void EmuCodeBlock::UnsafeLoadRegToRegNoSwap(X64Reg reg_addr, X64Reg reg_value, int accessSize, s32 offset, bool signExtend)
 {
-    MOVZX(32, accessSize, reg_value, MComplex(RMEM, reg_addr, SCALE_1, offset));
+    if (signExtend)
+        MOVSX(32, accessSize, reg_value, MComplex(RMEM, reg_addr, SCALE_1, offset));
+    else
+        MOVZX(32, accessSize, reg_value, MComplex(RMEM, reg_addr, SCALE_1, offset));
 }

 u8 *EmuCodeBlock::UnsafeLoadToReg(X64Reg reg_value, OpArg opAddress, int accessSize, s32 offset, bool signExtend)
u8 *EmuCodeBlock::UnsafeLoadToReg(X64Reg reg_value, OpArg opAddress, int accessSize, s32 offset, bool signExtend) u8 *EmuCodeBlock::UnsafeLoadToReg(X64Reg reg_value, OpArg opAddress, int accessSize, s32 offset, bool signExtend)
@@ -350,7 +353,8 @@ void EmuCodeBlock::SafeLoadToReg(X64Reg reg_value, const Gen::OpArg & opAddress,
     FixupBranch fast = J_CC(CC_Z, true);

-    ABI_PushRegistersAndAdjustStack(registersInUse, 0);
+    size_t rsp_alignment = (flags & SAFE_LOADSTORE_NO_PROLOG) ? 8 : 0;
+    ABI_PushRegistersAndAdjustStack(registersInUse, rsp_alignment);
     switch (accessSize)
     {
     case 64:
@@ -366,7 +370,7 @@ void EmuCodeBlock::SafeLoadToReg(X64Reg reg_value, const Gen::OpArg & opAddress,
         ABI_CallFunctionA((void *)&Memory::Read_U8_ZX, addr_loc);
         break;
     }
-    ABI_PopRegistersAndAdjustStack(registersInUse, 0);
+    ABI_PopRegistersAndAdjustStack(registersInUse, rsp_alignment);

     MEMCHECK_START


@@ -40,7 +40,7 @@ public:
     void SwapAndStore(int size, const Gen::OpArg& dst, Gen::X64Reg src);
     void UnsafeLoadRegToReg(Gen::X64Reg reg_addr, Gen::X64Reg reg_value, int accessSize, s32 offset = 0, bool signExtend = false);
-    void UnsafeLoadRegToRegNoSwap(Gen::X64Reg reg_addr, Gen::X64Reg reg_value, int accessSize, s32 offset);
+    void UnsafeLoadRegToRegNoSwap(Gen::X64Reg reg_addr, Gen::X64Reg reg_value, int accessSize, s32 offset, bool signExtend = false);
     // these return the address of the MOV, for backpatching
     u8 *UnsafeWriteRegToReg(Gen::X64Reg reg_value, Gen::X64Reg reg_addr, int accessSize, s32 offset = 0, bool swap = true);
     u8 *UnsafeLoadToReg(Gen::X64Reg reg_value, Gen::OpArg opAddress, int accessSize, s32 offset, bool signExtend);