From ba0c52b104c34755b5d7c27655b75816b98052aa Mon Sep 17 00:00:00 2001 From: Ryan Houdek Date: Sun, 8 Sep 2013 08:18:34 +0000 Subject: [PATCH] [ARM] Optimization to psq_l, no need to push/pop regs anymore. Implement support for single float loading, gives a decent speedup to Ikaruga in menus and game. --- .../JitArm32/JitArm_LoadStorePaired.cpp | 16 +++------- .../Core/Core/Src/PowerPC/JitArm32/JitAsm.cpp | 30 +++++++++++++------ 2 files changed, 25 insertions(+), 21 deletions(-) diff --git a/Source/Core/Core/Src/PowerPC/JitArm32/JitArm_LoadStorePaired.cpp b/Source/Core/Core/Src/PowerPC/JitArm32/JitArm_LoadStorePaired.cpp index 5e2cc1f876..1df10e074e 100644 --- a/Source/Core/Core/Src/PowerPC/JitArm32/JitArm_LoadStorePaired.cpp +++ b/Source/Core/Core/Src/PowerPC/JitArm32/JitArm_LoadStorePaired.cpp @@ -28,26 +28,18 @@ void JitArm::psq_l(UGeckoInstruction inst) if (js.memcheck) { Default(inst); return; } - if (inst.W) { - // Enable when supporting single loads - Default(inst); - return; - } - LDR(R11, R9, PPCSTATE_OFF(spr[SPR_GQR0 + inst.I])); - //UBFX(R12, R11, 2, 6); // Scale - UBFX(R11, R11, 13, 3); // Type + UBFX(R12, R11, 13, 3); // Type + UBFX(R11, R11, 2, 6); // Scale MOVI2R(R10, (u32)offset); if (inst.RA) ADD(R10, R10, gpr.R(inst.RA)); if (update) MOV(gpr.R(inst.RA), R10); - if (inst.W) - ADD(R11, R11, 8); MOVI2R(R14, (u32)asm_routines.pairedLoadQuantized); - ADD(R14, R14, R11); - LDR(R14, R14); + ADD(R14, R14, R12); + LDR(R14, R14, inst.W ? 8 * 4 : 0); // Values returned in S0, S1 BL(R14); // Jump to the quantizer Load diff --git a/Source/Core/Core/Src/PowerPC/JitArm32/JitAsm.cpp b/Source/Core/Core/Src/PowerPC/JitArm32/JitAsm.cpp index 973b7659de..a4df254c97 100644 --- a/Source/Core/Core/Src/PowerPC/JitArm32/JitAsm.cpp +++ b/Source/Core/Core/Src/PowerPC/JitArm32/JitAsm.cpp @@ -145,25 +145,37 @@ void JitArmAsmRoutineManager::Generate() void JitArmAsmRoutineManager::GenerateCommon() { + // R14 is LR + // R12 is scratch + // R11 is scale + // R10 is the address + Operand2 mask(3, 1); // ~(Memory::MEMVIEW32_MASK) + NEONXEmitter nemit(this); + const u8* loadPairedIllegal = GetCodePtr(); BKPT(0x10); const u8* loadPairedFloatTwo = GetCodePtr(); - PUSH(2, R12, _LR); - // R12, R14 is scratch - // R10 is the address - Operand2 mask(3, 1); // ~(Memory::MEMVIEW32_MASK) BIC(R10, R10, mask); - MOVI2R(R14, (u32)Memory::base); - ADD(R10, R10, R14); + MOVI2R(R12, (u32)Memory::base); + ADD(R10, R10, R12); - NEONXEmitter nemit(this); nemit.VLD1(I_32, D0, R10); nemit.VREV32(I_8, D0, D0); + + MOV(_PC, _LR); - POP(2, R12, _PC); const u8* loadPairedFloatOne = GetCodePtr(); - BKPT(0x12); + BIC(R10, R10, mask); + MOVI2R(R12, (u32)Memory::base); + ADD(R10, R10, R12); + + nemit.VLD1(I_32, D0, R10); + nemit.VREV32(I_8, D0, D0); + + MOVI2F(S1, 1.0f, INVALID_REG); // Temp reg isn't used for 1.0f + MOV(_PC, _LR); + const u8* loadPairedU8Two = GetCodePtr(); BKPT(0x13); const u8* loadPairedU8One = GetCodePtr();