JIT: cleanups/optimizations for ps loadstore
I'm not quite sure why the float paired stores were written the way they were, but they should be more consistent now. Also get rid of the psTemp global, which wasn't really needed. Add some comments.
This commit is contained in:
parent
c3d52e0476
commit
33b03fab81
|
@ -212,8 +212,6 @@ static const float GC_ALIGNED16(m_dequantizeTableS[]) =
|
||||||
(1ULL << 4), (1ULL << 4), (1ULL << 3), (1ULL << 3), (1ULL << 2), (1ULL << 2), (1ULL << 1), (1ULL << 1),
|
(1ULL << 4), (1ULL << 4), (1ULL << 3), (1ULL << 3), (1ULL << 2), (1ULL << 2), (1ULL << 1), (1ULL << 1),
|
||||||
};
|
};
|
||||||
|
|
||||||
static float GC_ALIGNED16(psTemp[4]);
|
|
||||||
|
|
||||||
static const float GC_ALIGNED16(m_65535[4]) = {65535.0f, 65535.0f, 65535.0f, 65535.0f};
|
static const float GC_ALIGNED16(m_65535[4]) = {65535.0f, 65535.0f, 65535.0f, 65535.0f};
|
||||||
static const float GC_ALIGNED16(m_32767) = 32767.0f;
|
static const float GC_ALIGNED16(m_32767) = 32767.0f;
|
||||||
static const float GC_ALIGNED16(m_m32768) = -32768.0f;
|
static const float GC_ALIGNED16(m_m32768) = -32768.0f;
|
||||||
|
@ -230,36 +228,26 @@ static const float GC_ALIGNED16(m_one[]) = {1.0f, 0.0f, 0.0f, 0.0f};
|
||||||
// I don't know whether the overflow actually happens in any games
|
// I don't know whether the overflow actually happens in any games
|
||||||
// but it potentially can cause problems, so we need some clamping
|
// but it potentially can cause problems, so we need some clamping
|
||||||
|
|
||||||
static void WriteDual32(u32 address)
|
|
||||||
{
|
|
||||||
Memory::Write_U64(*(u64 *) psTemp, address);
|
|
||||||
}
|
|
||||||
|
|
||||||
// See comment in header for in/outs.
|
// See comment in header for in/outs.
|
||||||
void CommonAsmRoutines::GenQuantizedStores()
|
void CommonAsmRoutines::GenQuantizedStores()
|
||||||
{
|
{
|
||||||
const u8* storePairedIllegal = AlignCode4();
|
const u8* storePairedIllegal = AlignCode4();
|
||||||
UD2();
|
UD2();
|
||||||
const u8* storePairedFloat = AlignCode4();
|
|
||||||
|
|
||||||
FixupBranch skip_complex, too_complex;
|
const u8* storePairedFloat = AlignCode4();
|
||||||
SHUFPS(XMM0, R(XMM0), 1);
|
if (cpu_info.bSSSE3)
|
||||||
MOVQ_xmm(M(&psTemp[0]), XMM0);
|
|
||||||
if (!jit->js.memcheck)
|
|
||||||
{
|
{
|
||||||
TEST(32, R(RSCRATCH_EXTRA), Imm32(0x0C000000));
|
PSHUFB(XMM0, M((void *)pbswapShuffle2x4));
|
||||||
too_complex = J_CC(CC_NZ, true);
|
MOVQ_xmm(R(RSCRATCH), XMM0);
|
||||||
MOV(64, R(RSCRATCH), M(&psTemp[0]));
|
|
||||||
SwapAndStore(64, MComplex(RMEM, RSCRATCH_EXTRA, SCALE_1, 0), RSCRATCH);
|
|
||||||
skip_complex = J(true);
|
|
||||||
SetJumpTarget(too_complex);
|
|
||||||
}
|
}
|
||||||
// RSP alignment here is 8 due to the call.
|
else
|
||||||
ABI_PushRegistersAndAdjustStack(QUANTIZED_REGS_TO_SAVE, 8);
|
{
|
||||||
ABI_CallFunctionR((void *)&WriteDual32, RSCRATCH_EXTRA);
|
MOVQ_xmm(R(RSCRATCH), XMM0);
|
||||||
ABI_PopRegistersAndAdjustStack(QUANTIZED_REGS_TO_SAVE, 8);
|
ROL(64, R(RSCRATCH), Imm8(32));
|
||||||
if (!jit->js.memcheck)
|
BSWAP(64, RSCRATCH);
|
||||||
SetJumpTarget(skip_complex);
|
}
|
||||||
|
SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 64, 0, QUANTIZED_REGS_TO_SAVE, SAFE_LOADSTORE_NO_SWAP | SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);
|
||||||
|
|
||||||
RET();
|
RET();
|
||||||
|
|
||||||
const u8* storePairedU8 = AlignCode4();
|
const u8* storePairedU8 = AlignCode4();
|
||||||
|
@ -316,12 +304,8 @@ void CommonAsmRoutines::GenQuantizedStores()
|
||||||
MINPS(XMM0, M(m_65535));
|
MINPS(XMM0, M(m_65535));
|
||||||
|
|
||||||
CVTTPS2DQ(XMM0, R(XMM0));
|
CVTTPS2DQ(XMM0, R(XMM0));
|
||||||
MOVQ_xmm(M(psTemp), XMM0);
|
PSHUFLW(XMM0, R(XMM0), 2); // AABBCCDD -> CCAA____
|
||||||
// place ps[0] into the higher word, ps[1] into the lower
|
MOVD_xmm(R(RSCRATCH), XMM0);
|
||||||
// so no need in ROL after BSWAP
|
|
||||||
MOVZX(32, 16, RSCRATCH, M(&psTemp[0]));
|
|
||||||
SHL(32, R(RSCRATCH), Imm8(16));
|
|
||||||
MOV(16, R(RSCRATCH), M(&psTemp[1]));
|
|
||||||
BSWAP(32, RSCRATCH);
|
BSWAP(32, RSCRATCH);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -369,21 +353,6 @@ void CommonAsmRoutines::GenQuantizedSingleStores()
|
||||||
MOVD_xmm(R(RSCRATCH), XMM0);
|
MOVD_xmm(R(RSCRATCH), XMM0);
|
||||||
SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 32, 0, QUANTIZED_REGS_TO_SAVE, SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);
|
SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 32, 0, QUANTIZED_REGS_TO_SAVE, SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);
|
||||||
RET();
|
RET();
|
||||||
/*
|
|
||||||
if (cpu_info.bSSSE3)
|
|
||||||
{
|
|
||||||
PSHUFB(XMM0, M(pbswapShuffle2x4));
|
|
||||||
// TODO: SafeWriteFloat
|
|
||||||
MOVSS(M(&psTemp[0]), XMM0);
|
|
||||||
MOV(32, R(RSCRATCH), M(&psTemp[0]));
|
|
||||||
SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 32, 0, SAFE_LOADSTORE_NO_SWAP | SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
MOVSS(M(&psTemp[0]), XMM0);
|
|
||||||
MOV(32, R(RSCRATCH), M(&psTemp[0]));
|
|
||||||
SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 32, 0, SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);
|
|
||||||
}*/
|
|
||||||
|
|
||||||
const u8* storeSingleU8 = AlignCode4(); // Used by MKWii
|
const u8* storeSingleU8 = AlignCode4(); // Used by MKWii
|
||||||
SHR(32, R(RSCRATCH2), Imm8(5));
|
SHR(32, R(RSCRATCH2), Imm8(5));
|
||||||
|
@ -441,6 +410,12 @@ void CommonAsmRoutines::GenQuantizedLoads()
|
||||||
const u8* loadPairedIllegal = AlignCode4();
|
const u8* loadPairedIllegal = AlignCode4();
|
||||||
UD2();
|
UD2();
|
||||||
|
|
||||||
|
// FIXME? This code (in non-MMU mode) assumes all accesses are directly to RAM, i.e.
|
||||||
|
// that they don't need hardware access handling. This will definitely crash if paired loads occur
|
||||||
|
// from non-RAM areas, but as far as I know, this never happens. I don't know if this is
|
||||||
|
// for a good reason, or merely because no game does this.
|
||||||
|
// If we find something that actually does do this, maybe this should be changed. How
|
||||||
|
// much of a performance hit would it be?
|
||||||
const u8* loadPairedFloatTwo = AlignCode4();
|
const u8* loadPairedFloatTwo = AlignCode4();
|
||||||
if (jit->js.memcheck)
|
if (jit->js.memcheck)
|
||||||
{
|
{
|
||||||
|
|
Loading…
Reference in New Issue