[ARM] Optimize psq_l: the quantized-load routines no longer need to push/pop registers. Implement support for single-float loads, which gives a decent speedup to Ikaruga both in menus and in-game.
This commit is contained in:
parent
e5b5713d70
commit
ba0c52b104
|
@ -28,26 +28,18 @@ void JitArm::psq_l(UGeckoInstruction inst)
|
|||
|
||||
if (js.memcheck) { Default(inst); return; }
|
||||
|
||||
if (inst.W) {
|
||||
// Enable when supporting single loads
|
||||
Default(inst);
|
||||
return;
|
||||
}
|
||||
|
||||
LDR(R11, R9, PPCSTATE_OFF(spr[SPR_GQR0 + inst.I]));
|
||||
//UBFX(R12, R11, 2, 6); // Scale
|
||||
UBFX(R11, R11, 13, 3); // Type
|
||||
UBFX(R12, R11, 13, 3); // Type
|
||||
UBFX(R11, R11, 2, 6); // Scale
|
||||
|
||||
MOVI2R(R10, (u32)offset);
|
||||
if (inst.RA)
|
||||
ADD(R10, R10, gpr.R(inst.RA));
|
||||
if (update)
|
||||
MOV(gpr.R(inst.RA), R10);
|
||||
if (inst.W)
|
||||
ADD(R11, R11, 8);
|
||||
MOVI2R(R14, (u32)asm_routines.pairedLoadQuantized);
|
||||
ADD(R14, R14, R11);
|
||||
LDR(R14, R14);
|
||||
ADD(R14, R14, R12);
|
||||
LDR(R14, R14, inst.W ? 8 * 4 : 0);
|
||||
|
||||
// Values returned in S0, S1
|
||||
BL(R14); // Jump to the quantizer Load
|
||||
|
|
|
@ -145,25 +145,37 @@ void JitArmAsmRoutineManager::Generate()
|
|||
|
||||
void JitArmAsmRoutineManager::GenerateCommon()
|
||||
{
|
||||
// R14 is LR
|
||||
// R12 is scratch
|
||||
// R11 is scale
|
||||
// R10 is the address
|
||||
Operand2 mask(3, 1); // ~(Memory::MEMVIEW32_MASK)
|
||||
NEONXEmitter nemit(this);
|
||||
|
||||
const u8* loadPairedIllegal = GetCodePtr();
|
||||
BKPT(0x10);
|
||||
|
||||
const u8* loadPairedFloatTwo = GetCodePtr();
|
||||
PUSH(2, R12, _LR);
|
||||
// R12, R14 is scratch
|
||||
// R10 is the address
|
||||
Operand2 mask(3, 1); // ~(Memory::MEMVIEW32_MASK)
|
||||
BIC(R10, R10, mask);
|
||||
MOVI2R(R14, (u32)Memory::base);
|
||||
ADD(R10, R10, R14);
|
||||
MOVI2R(R12, (u32)Memory::base);
|
||||
ADD(R10, R10, R12);
|
||||
|
||||
NEONXEmitter nemit(this);
|
||||
nemit.VLD1(I_32, D0, R10);
|
||||
nemit.VREV32(I_8, D0, D0);
|
||||
|
||||
MOV(_PC, _LR);
|
||||
|
||||
POP(2, R12, _PC);
|
||||
const u8* loadPairedFloatOne = GetCodePtr();
|
||||
BKPT(0x12);
|
||||
BIC(R10, R10, mask);
|
||||
MOVI2R(R12, (u32)Memory::base);
|
||||
ADD(R10, R10, R12);
|
||||
|
||||
nemit.VLD1(I_32, D0, R10);
|
||||
nemit.VREV32(I_8, D0, D0);
|
||||
|
||||
MOVI2F(S1, 1.0f, INVALID_REG); // Temp reg isn't used for 1.0f
|
||||
MOV(_PC, _LR);
|
||||
|
||||
const u8* loadPairedU8Two = GetCodePtr();
|
||||
BKPT(0x13);
|
||||
const u8* loadPairedU8One = GetCodePtr();
|
||||
|
|
Loading…
Reference in New Issue