JIT: cleanups/optimizations for ps loadstore

I'm not quite sure why the float paired stores were written the way they were,
but they should be more consistent now.

Also get rid of the psTemp global, which wasn't really needed.

Add some comments.
Fiora 2014-11-30 09:19:57 -08:00
parent c3d52e0476
commit 33b03fab81
1 changed file with 20 additions and 45 deletions

@@ -212,8 +212,6 @@ static const float GC_ALIGNED16(m_dequantizeTableS[]) =
 	(1ULL << 4), (1ULL << 4), (1ULL << 3), (1ULL << 3), (1ULL << 2), (1ULL << 2), (1ULL << 1), (1ULL << 1),
 };
 
-static float GC_ALIGNED16(psTemp[4]);
-
 static const float GC_ALIGNED16(m_65535[4]) = {65535.0f, 65535.0f, 65535.0f, 65535.0f};
 static const float GC_ALIGNED16(m_32767) = 32767.0f;
 static const float GC_ALIGNED16(m_m32768) = -32768.0f;
@@ -230,36 +228,26 @@ static const float GC_ALIGNED16(m_one[]) = {1.0f, 0.0f, 0.0f, 0.0f};
 // I don't know whether the overflow actually happens in any games
 // but it potentially can cause problems, so we need some clamping
 
-static void WriteDual32(u32 address)
-{
-	Memory::Write_U64(*(u64 *) psTemp, address);
-}
-
 // See comment in header for in/outs.
 void CommonAsmRoutines::GenQuantizedStores()
 {
 	const u8* storePairedIllegal = AlignCode4();
 	UD2();
+
 	const u8* storePairedFloat = AlignCode4();
-
-	FixupBranch skip_complex, too_complex;
-	SHUFPS(XMM0, R(XMM0), 1);
-	MOVQ_xmm(M(&psTemp[0]), XMM0);
-	if (!jit->js.memcheck)
+	if (cpu_info.bSSSE3)
 	{
-		TEST(32, R(RSCRATCH_EXTRA), Imm32(0x0C000000));
-		too_complex = J_CC(CC_NZ, true);
-		MOV(64, R(RSCRATCH), M(&psTemp[0]));
-		SwapAndStore(64, MComplex(RMEM, RSCRATCH_EXTRA, SCALE_1, 0), RSCRATCH);
-		skip_complex = J(true);
-		SetJumpTarget(too_complex);
+		PSHUFB(XMM0, M((void *)pbswapShuffle2x4));
+		MOVQ_xmm(R(RSCRATCH), XMM0);
 	}
-	// RSP alignment here is 8 due to the call.
-	ABI_PushRegistersAndAdjustStack(QUANTIZED_REGS_TO_SAVE, 8);
-	ABI_CallFunctionR((void *)&WriteDual32, RSCRATCH_EXTRA);
-	ABI_PopRegistersAndAdjustStack(QUANTIZED_REGS_TO_SAVE, 8);
-	if (!jit->js.memcheck)
-		SetJumpTarget(skip_complex);
+	else
+	{
+		MOVQ_xmm(R(RSCRATCH), XMM0);
+		ROL(64, R(RSCRATCH), Imm8(32));
+		BSWAP(64, RSCRATCH);
+	}
+
+	SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 64, 0, QUANTIZED_REGS_TO_SAVE, SAFE_LOADSTORE_NO_SWAP | SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);
 	RET();
 
 	const u8* storePairedU8 = AlignCode4();
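
Note on the new storePairedFloat path: both branches leave the same thing in RSCRATCH, a 64-bit value whose in-memory byte order is big-endian ps[0] followed by big-endian ps[1], so SafeWriteRegToReg can store it with SAFE_LOADSTORE_NO_SWAP. A minimal C++ sketch of what the non-SSSE3 branch (MOVQ + ROL + BSWAP) computes, assuming a little-endian host; the helper name is made up, and __builtin_bswap64 (GCC/Clang builtin) just stands in for the emitted BSWAP:

    #include <cstdint>
    #include <cstring>

    // Hypothetical helper, not part of this commit: pack ps[0]/ps[1] so that a
    // single plain 64-bit store writes them big-endian, ps[0] first.
    static uint64_t PackPairedFloatBE(float ps0, float ps1)
    {
        uint32_t lo, hi;
        std::memcpy(&lo, &ps0, sizeof(lo)); // MOVQ_xmm: ps[0] is the low dword of RSCRATCH,
        std::memcpy(&hi, &ps1, sizeof(hi)); // ps[1] the high dword
        uint64_t r = ((uint64_t)hi << 32) | lo;
        r = (r << 32) | (r >> 32);          // ROL(64, R(RSCRATCH), Imm8(32))
        return __builtin_bswap64(r);        // BSWAP(64, RSCRATCH)
    }

The SSSE3 branch reaches the same layout in one PSHUFB by byteswapping each 32-bit lane of XMM0 in place before the MOVQ.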
@@ -316,12 +304,8 @@ void CommonAsmRoutines::GenQuantizedStores()
 		MINPS(XMM0, M(m_65535));
 		CVTTPS2DQ(XMM0, R(XMM0));
-		MOVQ_xmm(M(psTemp), XMM0);
-		// place ps[0] into the higher word, ps[1] into the lower
-		// so no need in ROL after BSWAP
-		MOVZX(32, 16, RSCRATCH, M(&psTemp[0]));
-		SHL(32, R(RSCRATCH), Imm8(16));
-		MOV(16, R(RSCRATCH), M(&psTemp[1]));
+		PSHUFLW(XMM0, R(XMM0), 2); // AABBCCDD -> CCAA____
+		MOVD_xmm(R(RSCRATCH), XMM0);
 		BSWAP(32, RSCRATCH);
 	}
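
Likewise, the storePairedU16 tail now packs the two clamped values entirely in registers instead of bouncing through psTemp; the PSHUFLW immediate of 2 is what the "AABBCCDD -> CCAA____" comment describes. A rough C++ equivalent of PSHUFLW + MOVD + BSWAP (the helper name is made up; __builtin_bswap32 stands in for BSWAP):

    #include <cstdint>

    // Hypothetical helper: ps0/ps1 are already clamped to 0..65535 by the
    // MINPS/CVTTPS2DQ above. Stored as-is on a little-endian host, the result
    // lands big-endian in memory: ps[0] at the address, ps[1] at address + 2.
    static uint32_t PackPairedU16BE(uint32_t ps0, uint32_t ps1)
    {
        uint32_t r = (ps0 << 16) | (ps1 & 0xFFFF); // PSHUFLW(XMM0, R(XMM0), 2) + MOVD_xmm
        return __builtin_bswap32(r);               // BSWAP(32, RSCRATCH)
    }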
@@ -369,21 +353,6 @@ void CommonAsmRoutines::GenQuantizedSingleStores()
 	MOVD_xmm(R(RSCRATCH), XMM0);
 	SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 32, 0, QUANTIZED_REGS_TO_SAVE, SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);
 	RET();
-	/*
-	if (cpu_info.bSSSE3)
-	{
-		PSHUFB(XMM0, M(pbswapShuffle2x4));
-		// TODO: SafeWriteFloat
-		MOVSS(M(&psTemp[0]), XMM0);
-		MOV(32, R(RSCRATCH), M(&psTemp[0]));
-		SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 32, 0, SAFE_LOADSTORE_NO_SWAP | SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);
-	}
-	else
-	{
-		MOVSS(M(&psTemp[0]), XMM0);
-		MOV(32, R(RSCRATCH), M(&psTemp[0]));
-		SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 32, 0, SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);
-	}*/
 
 	const u8* storeSingleU8 = AlignCode4(); // Used by MKWii
 	SHR(32, R(RSCRATCH2), Imm8(5));
@@ -441,6 +410,12 @@ void CommonAsmRoutines::GenQuantizedLoads()
 	const u8* loadPairedIllegal = AlignCode4();
 	UD2();
 
+	// FIXME? This code (in non-MMU mode) assumes all accesses are directly to RAM, i.e.
+	// don't need hardware access handling. This will definitely crash if paired loads occur
+	// from non-RAM areas, but as far as I know, this never happens. I don't know if this is
+	// for a good reason, or merely because no game does this.
+	// If we find something that actually does do this, maybe this should be changed. How
+	// much of a performance hit would it be?
 	const u8* loadPairedFloatTwo = AlignCode4();
 	if (jit->js.memcheck)
 	{