[ARM] Implement psq_l for 2x float loads. Couldn't find a game using quantized loads. Huge speed boost to Ikaruga and THP movies with this one.
This commit is contained in:
parent
614a7c2081
commit
31b69c53f7
|
@ -215,6 +215,7 @@ if(_M_ARM)
|
||||||
Src/PowerPC/JitArm32/JitArm_LoadStore.cpp
|
Src/PowerPC/JitArm32/JitArm_LoadStore.cpp
|
||||||
Src/PowerPC/JitArm32/JitArm_FloatingPoint.cpp
|
Src/PowerPC/JitArm32/JitArm_FloatingPoint.cpp
|
||||||
Src/PowerPC/JitArm32/JitArm_Paired.cpp
|
Src/PowerPC/JitArm32/JitArm_Paired.cpp
|
||||||
|
Src/PowerPC/JitArm32/JitArm_LoadStorePaired.cpp
|
||||||
Src/PowerPC/JitArm32/JitArm_SystemRegisters.cpp
|
Src/PowerPC/JitArm32/JitArm_SystemRegisters.cpp
|
||||||
Src/PowerPC/JitArm32/JitArm_LoadStoreFloating.cpp)
|
Src/PowerPC/JitArm32/JitArm_LoadStoreFloating.cpp)
|
||||||
endif()
|
endif()
|
||||||
|
|
|
@ -218,6 +218,9 @@ public:
|
||||||
void ps_neg(UGeckoInstruction _inst);
|
void ps_neg(UGeckoInstruction _inst);
|
||||||
void ps_abs(UGeckoInstruction _inst);
|
void ps_abs(UGeckoInstruction _inst);
|
||||||
void ps_nabs(UGeckoInstruction _inst);
|
void ps_nabs(UGeckoInstruction _inst);
|
||||||
|
|
||||||
|
// LoadStore paired
|
||||||
|
void psq_l(UGeckoInstruction _inst);
|
||||||
};
|
};
|
||||||
|
|
||||||
#endif // _JIT64_H
|
#endif // _JIT64_H
|
||||||
|
|
|
@ -0,0 +1,59 @@
|
||||||
|
// Copyright 2013 Dolphin Emulator Project
|
||||||
|
// Licensed under GPLv2
|
||||||
|
// Refer to the license.txt file included.
|
||||||
|
#include "Common.h"
|
||||||
|
#include "Thunk.h"
|
||||||
|
|
||||||
|
#include "../../Core.h"
|
||||||
|
#include "../PowerPC.h"
|
||||||
|
#include "../../CoreTiming.h"
|
||||||
|
#include "../PPCTables.h"
|
||||||
|
#include "ArmEmitter.h"
|
||||||
|
|
||||||
|
#include "Jit.h"
|
||||||
|
#include "JitRegCache.h"
|
||||||
|
#include "JitAsm.h"
|
||||||
|
|
||||||
|
// Emits ARM code for psq_l (paired-single quantized load).
//
// Only the two-element form (W == 0) is compiled. Single-element loads and
// blocks compiled with memchecks enabled fall back to Default().
//
// Register usage in the emitted code:
//   R9  - base used for PPCSTATE_OFF() accesses
//   R10 - effective address handed to the load routine
//   R11 - GQR value, then the byte offset into the pairedLoadQuantized table
//   R14 - address of the selected load routine (clobbered by BL as LR)
// The load routine returns its two results in S0 and S1.
//
// NOTE(review): the quantization scale (GQR LD_SCALE, bits 24-29) is not
// extracted here. The only implemented routine visible in this change is the
// float one, which does not read a scale; confirm before adding integer types.
void JitArm::psq_l(UGeckoInstruction inst)
{
	INSTRUCTION_START
	JITDISABLE(bJITLoadStorePairedOff)

	if (js.memcheck) { Default(inst); return; }

	if (inst.W) {
		// Enable when supporting single loads. Those use table entries 8..15,
		// i.e. the type index plus 8 (before scaling by the entry size).
		Default(inst);
		return;
	}

	const bool update = inst.OPCD == 57;
	const s32 offset = inst.SIMM_12;

	// Fetch the quantization register and isolate the load type.
	// GQR layout: ST_TYPE bits 0-2, ST_SCALE bits 8-13,
	//             LD_TYPE bits 16-18, LD_SCALE bits 24-29.
	LDR(R11, R9, PPCSTATE_OFF(spr[SPR_GQR0 + inst.I]));
	UBFX(R11, R11, 16, 3); // Load type
	// The dispatch table holds 32-bit pointers, so turn the index into a
	// byte offset before adding it to the table base.
	LSL(R11, R11, 2);

	// Effective address = (RA ? GPR[RA] : 0) + sign-extended 12-bit offset.
	MOVI2R(R10, (u32)offset);
	if (inst.RA)
		ADD(R10, R10, gpr.R(inst.RA));
	if (update)
		MOV(gpr.R(inst.RA), R10);

	// R14 = pairedLoadQuantized[type]
	MOVI2R(R14, (u32)asm_routines.pairedLoadQuantized);
	ADD(R14, R14, R11);
	LDR(R14, R14);

	// Values returned in S0, S1
	BL(R14); // Jump to the quantizer Load

	// Widen both single-precision results into the destination pair.
	ARMReg vD0 = fpr.R0(inst.RS, false);
	ARMReg vD1 = fpr.R1(inst.RS, false);
	VCVT(vD0, S0, 0);
	VCVT(vD1, S1, 0);
}
|
|
@ -39,9 +39,6 @@ void JitArm::mtspr(UGeckoInstruction inst)
|
||||||
case SPR_LR:
|
case SPR_LR:
|
||||||
case SPR_CTR:
|
case SPR_CTR:
|
||||||
case SPR_XER:
|
case SPR_XER:
|
||||||
// These are safe to do the easy way, see the bottom of this function.
|
|
||||||
break;
|
|
||||||
|
|
||||||
case SPR_GQR0:
|
case SPR_GQR0:
|
||||||
case SPR_GQR0 + 1:
|
case SPR_GQR0 + 1:
|
||||||
case SPR_GQR0 + 2:
|
case SPR_GQR0 + 2:
|
||||||
|
@ -50,19 +47,9 @@ void JitArm::mtspr(UGeckoInstruction inst)
|
||||||
case SPR_GQR0 + 5:
|
case SPR_GQR0 + 5:
|
||||||
case SPR_GQR0 + 6:
|
case SPR_GQR0 + 6:
|
||||||
case SPR_GQR0 + 7:
|
case SPR_GQR0 + 7:
|
||||||
// Prevent recompiler from compiling in old quantizer values.
|
// These are safe to do the easy way, see the bottom of this function.
|
||||||
// If the value changed, destroy all blocks using this quantizer
|
break;
|
||||||
// This will create a little bit of block churn, but hopefully not too bad.
|
|
||||||
{
|
|
||||||
/*
|
|
||||||
MOV(32, R(EAX), M(&PowerPC::ppcState.spr[iIndex])); // Load old value
|
|
||||||
CMP(32, R(EAX), gpr.R(inst.RD));
|
|
||||||
FixupBranch skip_destroy = J_CC(CC_E, false);
|
|
||||||
int gqr = iIndex - SPR_GQR0;
|
|
||||||
ABI_CallFunctionC(ProtectFunction(&Jit64::DestroyBlocksWithFlag, 1), (u32)BLOCK_USE_GQR0 << gqr);
|
|
||||||
SetJumpTarget(skip_destroy);*/
|
|
||||||
}
|
|
||||||
// TODO - break block if quantizers are written to.
|
|
||||||
default:
|
default:
|
||||||
Default(inst);
|
Default(inst);
|
||||||
return;
|
return;
|
||||||
|
|
|
@ -107,7 +107,7 @@ static GekkoOPTemplate primarytable[] =
|
||||||
{54, &JitArm::Default}, //"stfd", OPTYPE_STOREFP, FL_IN_A}},
|
{54, &JitArm::Default}, //"stfd", OPTYPE_STOREFP, FL_IN_A}},
|
||||||
{55, &JitArm::Default}, //"stfdu", OPTYPE_STOREFP, FL_OUT_A | FL_IN_A}},
|
{55, &JitArm::Default}, //"stfdu", OPTYPE_STOREFP, FL_OUT_A | FL_IN_A}},
|
||||||
|
|
||||||
{56, &JitArm::Default}, //"psq_l", OPTYPE_PS, FL_IN_A}},
|
{56, &JitArm::psq_l}, //"psq_l", OPTYPE_PS, FL_IN_A}},
|
||||||
{57, &JitArm::Default}, //"psq_lu", OPTYPE_PS, FL_OUT_A | FL_IN_A}},
|
{57, &JitArm::Default}, //"psq_lu", OPTYPE_PS, FL_OUT_A | FL_IN_A}},
|
||||||
{60, &JitArm::Default}, //"psq_st", OPTYPE_PS, FL_IN_A}},
|
{60, &JitArm::Default}, //"psq_st", OPTYPE_PS, FL_IN_A}},
|
||||||
{61, &JitArm::Default}, //"psq_stu", OPTYPE_PS, FL_OUT_A | FL_IN_A}},
|
{61, &JitArm::Default}, //"psq_stu", OPTYPE_PS, FL_OUT_A | FL_IN_A}},
|
||||||
|
|
|
@ -137,40 +137,73 @@ void JitArmAsmRoutineManager::Generate()
|
||||||
ADD(_SP, _SP, 4);
|
ADD(_SP, _SP, 4);
|
||||||
|
|
||||||
POP(9, R4, R5, R6, R7, R8, R9, R10, R11, _PC); // Returns
|
POP(9, R4, R5, R6, R7, R8, R9, R10, R11, _PC); // Returns
|
||||||
|
|
||||||
|
GenerateCommon();
|
||||||
|
|
||||||
FlushIcache();
|
FlushIcache();
|
||||||
}
|
}
|
||||||
|
|
||||||
void JitArmAsmRoutineManager::GenerateCommon()
|
void JitArmAsmRoutineManager::GenerateCommon()
|
||||||
{
|
{
|
||||||
/* fifoDirectWrite8 = AlignCode4();
|
const u8* loadPairedIllegal = GetCodePtr();
|
||||||
GenFifoWrite(8);
|
BKPT(0x10);
|
||||||
fifoDirectWrite16 = AlignCode4();
|
|
||||||
GenFifoWrite(16);
|
|
||||||
fifoDirectWrite32 = AlignCode4();
|
|
||||||
GenFifoWrite(32);
|
|
||||||
fifoDirectWriteFloat = AlignCode4();
|
|
||||||
GenFifoFloatWrite();
|
|
||||||
fifoDirectWriteXmm64 = AlignCode4();
|
|
||||||
GenFifoXmm64Write();
|
|
||||||
|
|
||||||
GenQuantizedLoads();
|
const u8* loadPairedFloatTwo = GetCodePtr();
|
||||||
GenQuantizedStores();
|
PUSH(2, R12, _LR);
|
||||||
GenQuantizedSingleStores();
|
// R12, R14 is scratch
|
||||||
*/
|
// R10 is the address
|
||||||
//CMPSD(R(XMM0), M(&zero),
|
MOVI2R(R14, Memory::MEMVIEW32_MASK);
|
||||||
// TODO
|
AND(R10, R10, R14);
|
||||||
|
MOVI2R(R14, (u32)Memory::base);
|
||||||
|
ADD(R10, R10, R14);
|
||||||
|
|
||||||
|
LDR(R12, R10);
|
||||||
|
REV(R12, R12);
|
||||||
|
VMOV(S0, R12);
|
||||||
|
|
||||||
|
LDR(R12, R10, 4);
|
||||||
|
REV(R12, R12);
|
||||||
|
VMOV(S1, R12);
|
||||||
|
|
||||||
|
POP(2, R12, _PC);
|
||||||
|
const u8* loadPairedFloatOne = GetCodePtr();
|
||||||
|
BKPT(0x12);
|
||||||
|
const u8* loadPairedU8Two = GetCodePtr();
|
||||||
|
BKPT(0x13);
|
||||||
|
const u8* loadPairedU8One = GetCodePtr();
|
||||||
|
BKPT(0x14);
|
||||||
|
const u8* loadPairedS8Two = GetCodePtr();
|
||||||
|
BKPT(0x15);
|
||||||
|
const u8* loadPairedS8One = GetCodePtr();
|
||||||
|
BKPT(0x16);
|
||||||
|
const u8* loadPairedU16Two = GetCodePtr();
|
||||||
|
BKPT(0x17);
|
||||||
|
const u8* loadPairedU16One = GetCodePtr();
|
||||||
|
BKPT(0x18);
|
||||||
|
const u8* loadPairedS16Two = GetCodePtr();
|
||||||
|
BKPT(0x19);
|
||||||
|
const u8* loadPairedS16One = GetCodePtr();
|
||||||
|
BKPT(0x20);
|
||||||
|
|
||||||
|
pairedLoadQuantized = reinterpret_cast<const u8**>(const_cast<u8*>(AlignCode16()));
|
||||||
|
ReserveCodeSpace(16 * sizeof(u8*));
|
||||||
|
|
||||||
|
pairedLoadQuantized[0] = loadPairedFloatTwo;
|
||||||
|
pairedLoadQuantized[1] = loadPairedIllegal;
|
||||||
|
pairedLoadQuantized[2] = loadPairedIllegal;
|
||||||
|
pairedLoadQuantized[3] = loadPairedIllegal;
|
||||||
|
pairedLoadQuantized[4] = loadPairedU8Two;
|
||||||
|
pairedLoadQuantized[5] = loadPairedU16Two;
|
||||||
|
pairedLoadQuantized[6] = loadPairedS8Two;
|
||||||
|
pairedLoadQuantized[7] = loadPairedS16Two;
|
||||||
|
|
||||||
|
pairedLoadQuantized[8] = loadPairedFloatOne;
|
||||||
|
pairedLoadQuantized[9] = loadPairedIllegal;
|
||||||
|
pairedLoadQuantized[10] = loadPairedIllegal;
|
||||||
|
pairedLoadQuantized[11] = loadPairedIllegal;
|
||||||
|
pairedLoadQuantized[12] = loadPairedU8One;
|
||||||
|
pairedLoadQuantized[13] = loadPairedU16One;
|
||||||
|
pairedLoadQuantized[14] = loadPairedS8One;
|
||||||
|
pairedLoadQuantized[15] = loadPairedS16One;
|
||||||
|
|
||||||
// Fast write routines - special case the most common hardware write
|
|
||||||
// TODO: use this.
|
|
||||||
// Even in x86, the param values will be in the right registers.
|
|
||||||
/*
|
|
||||||
const u8 *fastMemWrite8 = AlignCode16();
|
|
||||||
CMP(32, R(ABI_PARAM2), Imm32(0xCC008000));
|
|
||||||
FixupBranch skip_fast_write = J_CC(CC_NE, false);
|
|
||||||
MOV(32, EAX, M(&m_gatherPipeCount));
|
|
||||||
MOV(8, MDisp(EAX, (u32)&m_gatherPipe), ABI_PARAM1);
|
|
||||||
ADD(32, 1, M(&m_gatherPipeCount));
|
|
||||||
RET();
|
|
||||||
SetJumpTarget(skip_fast_write);
|
|
||||||
CALL((void *)&Memory::Write_U8);*/
|
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue