diff --git a/Source/Core/Core/Src/PowerPC/JitArm32/Jit.h b/Source/Core/Core/Src/PowerPC/JitArm32/Jit.h
index 5d5d2dd19b..58c6cbda16 100644
--- a/Source/Core/Core/Src/PowerPC/JitArm32/Jit.h
+++ b/Source/Core/Core/Src/PowerPC/JitArm32/Jit.h
@@ -221,6 +221,8 @@ public:
 
 	// LoadStore paired
 	void psq_l(UGeckoInstruction _inst);
+	void psq_st(UGeckoInstruction _inst);
+
 };
 
 #endif // _JIT64_H
diff --git a/Source/Core/Core/Src/PowerPC/JitArm32/JitArm_LoadStorePaired.cpp b/Source/Core/Core/Src/PowerPC/JitArm32/JitArm_LoadStorePaired.cpp
index 1df10e074e..7ed402593c 100644
--- a/Source/Core/Core/Src/PowerPC/JitArm32/JitArm_LoadStorePaired.cpp
+++ b/Source/Core/Core/Src/PowerPC/JitArm32/JitArm_LoadStorePaired.cpp
@@ -18,22 +18,24 @@
 void JitArm::psq_l(UGeckoInstruction inst)
 {
 	INSTRUCTION_START
 	JITDISABLE(bJITLoadStorePairedOff)
-	
+
 	bool update = inst.OPCD == 57;
 	s32 offset = inst.SIMM_12;
 
 	// R12 contains scale
 	// R11 contains type
 	// R10 is the ADDR
-	
+
 	if (js.memcheck) { Default(inst); return; }
 
 	LDR(R11, R9, PPCSTATE_OFF(spr[SPR_GQR0 + inst.I]));
-	UBFX(R12, R11, 13, 3); // Type
-	UBFX(R11, R11, 2, 6); // Scale
+	UBFX(R12, R11, 16, 3); // Type (GQR LD_TYPE, bits 16-18)
+	LSL(R12, R12, 2); // *4 -> byte index into pairedLoadQuantized
+	UBFX(R11, R11, 24, 6); // Scale (GQR LD_SCALE, bits 24-29)
+	LSL(R11, R11, 2); // *4 -> byte index into the dequantize table
 	MOVI2R(R10, (u32)offset);
-	if (inst.RA)
+	if (inst.RA || update) // Always uses the register on update
 		ADD(R10, R10, gpr.R(inst.RA));
 	if (update)
 		MOV(gpr.R(inst.RA), R10);
@@ -47,5 +49,53 @@ void JitArm::psq_l(UGeckoInstruction inst)
 	ARMReg vD0 = fpr.R0(inst.RS, false);
 	ARMReg vD1 = fpr.R1(inst.RS, false);
 	VCVT(vD0, S0, 0);
-	VCVT(vD1, S1, 0);
+	if (!inst.W)
+		VCVT(vD1, S1, 0);
+	else
+		MOVI2F(vD1, 1.0f, INVALID_REG); // No need for temp reg with 1.0f
+}
+
+// psq_st / psq_stu: quantize and store one or two paired singles per GQR[inst.I].
+void JitArm::psq_st(UGeckoInstruction inst)
+{
+	INSTRUCTION_START
+	JITDISABLE(bJITLoadStorePairedOff)
+
+	bool update = inst.OPCD == 61;
+	s32 offset = inst.SIMM_12;
+
+	// R12 contains type
+	// R11 contains scale
+	// R10 is the ADDR
+	if (js.memcheck) { Default(inst); return; }
+
+	LDR(R11, R9, PPCSTATE_OFF(spr[SPR_GQR0 + inst.I]));
+	UBFX(R12, R11, 0, 3); // Type (GQR ST_TYPE, bits 0-2)
+	LSL(R12, R12, 2); // *4 -> byte index into pairedStoreQuantized
+	UBFX(R11, R11, 8, 6); // Scale (GQR ST_SCALE, bits 8-13)
+	LSL(R11, R11, 2); // *4 -> byte index into the quantize table
+
+	if (inst.RA || update) // Always uses the register on update
+	{
+		MOVI2R(R14, (u32)offset);
+		ADD(R10, gpr.R(inst.RA), R14);
+	}
+	else
+		MOVI2R(R10, (u32)offset);
+
+	if (update)
+		MOV(gpr.R(inst.RA), R10);
+
+	MOVI2R(R14, (u32)asm_routines.pairedStoreQuantized);
+	ADD(R14, R14, R12);
+	LDR(R14, R14, inst.W ? 8 * 4 : 0); // W set -> single-value variants at slots [8..15]
+
+	ARMReg vD0 = fpr.R0(inst.RS);
+	VCVT(S0, vD0, 0);
+	if (!inst.W)
+	{
+		ARMReg vD1 = fpr.R1(inst.RS);
+		VCVT(S1, vD1, 0);
+	}
+	// floats passed through D0
+	BL(R14); // Jump to the quantizer Store
+}
diff --git a/Source/Core/Core/Src/PowerPC/JitArm32/JitArm_Tables.cpp b/Source/Core/Core/Src/PowerPC/JitArm32/JitArm_Tables.cpp
index 23c5ab56a5..78e8e9d06b 100644
--- a/Source/Core/Core/Src/PowerPC/JitArm32/JitArm_Tables.cpp
+++ b/Source/Core/Core/Src/PowerPC/JitArm32/JitArm_Tables.cpp
@@ -108,9 +108,9 @@ static GekkoOPTemplate primarytable[] =
 	{55, &JitArm::Default}, //"stfdu", OPTYPE_STOREFP, FL_OUT_A | FL_IN_A}},
 
 	{56, &JitArm::psq_l}, //"psq_l", OPTYPE_PS, FL_IN_A}},
-	{57, &JitArm::Default}, //"psq_lu", OPTYPE_PS, FL_OUT_A | FL_IN_A}},
-	{60, &JitArm::Default}, //"psq_st", OPTYPE_PS, FL_IN_A}},
-	{61, &JitArm::Default}, //"psq_stu", OPTYPE_PS, FL_OUT_A | FL_IN_A}},
+	{57, &JitArm::psq_l}, //"psq_lu", OPTYPE_PS, FL_OUT_A | FL_IN_A}},
+	{60, &JitArm::psq_st}, //"psq_st", OPTYPE_PS, FL_IN_A}},
+	{61, &JitArm::psq_st}, //"psq_stu", OPTYPE_PS, FL_OUT_A | FL_IN_A}},
 
 	//missing: 0, 5, 6, 9, 22, 30, 62, 58
 	{0, &JitArm::Default}, //"unknown_instruction", OPTYPE_UNKNOWN, 0}},
diff --git a/Source/Core/Core/Src/PowerPC/JitArm32/JitAsm.cpp b/Source/Core/Core/Src/PowerPC/JitArm32/JitAsm.cpp
index a4df254c97..d20ff66397 100644
--- a/Source/Core/Core/Src/PowerPC/JitArm32/JitAsm.cpp
+++ b/Source/Core/Core/Src/PowerPC/JitArm32/JitAsm.cpp
@@ -41,6 +41,64 @@
 using namespace ArmGen;
 
 JitArmAsmRoutineManager asm_routines;
+static const float GC_ALIGNED16(m_quantizeTableS[]) = // 2^0..2^31 then 2^-32..2^-1, indexed by (scale * 4) bytes
+{
+	(1 << 0), (1 << 1), (1 << 2), (1 << 3),
+	(1 << 4), (1 << 5), (1 << 6), (1 << 7),
+	(1 << 8), (1 << 9), (1 << 10), (1 << 11),
+	(1 << 12), (1 << 13), (1 << 14), (1 << 15),
+	(1 << 16), (1 << 17), (1 << 18), (1 << 19),
+	(1 << 20), (1 << 21), (1 << 22), (1 << 23),
+	(1 << 24), (1 << 25), (1 << 26), (1 << 27),
+	(1 << 28), (1 << 29), (1 << 30), (1ULL << 31), // (1 << 31) would overflow signed int
+	1.0 / (1ULL << 32), 1.0 / (1ULL << 31), 1.0 / (1 << 30), 1.0 / (1 << 29),
+	1.0 / (1 << 28), 1.0 / (1 << 27), 1.0 / (1 << 26), 1.0 / (1 << 25),
+	1.0 / (1 << 24), 1.0 / (1 << 23), 1.0 / (1 << 22), 1.0 / (1 << 21),
+	1.0 / (1 << 20), 1.0 / (1 << 19), 1.0 / (1 << 18), 1.0 / (1 << 17),
+	1.0 / (1 << 16), 1.0 / (1 << 15), 1.0 / (1 << 14), 1.0 / (1 << 13),
+	1.0 / (1 << 12), 1.0 / (1 << 11), 1.0 / (1 << 10), 1.0 / (1 << 9),
+	1.0 / (1 << 8), 1.0 / (1 << 7), 1.0 / (1 << 6), 1.0 / (1 << 5),
+	1.0 / (1 << 4), 1.0 / (1 << 3), 1.0 / (1 << 2), 1.0 / (1 << 1),
+};
+
+static const float GC_ALIGNED16(m_dequantizeTableS[]) = // reciprocals of m_quantizeTableS, same indexing
+{
+	1.0 / (1 << 0), 1.0 / (1 << 1), 1.0 / (1 << 2), 1.0 / (1 << 3),
+	1.0 / (1 << 4), 1.0 / (1 << 5), 1.0 / (1 << 6), 1.0 / (1 << 7),
+	1.0 / (1 << 8), 1.0 / (1 << 9), 1.0 / (1 << 10), 1.0 / (1 << 11),
+	1.0 / (1 << 12), 1.0 / (1 << 13), 1.0 / (1 << 14), 1.0 / (1 << 15),
+	1.0 / (1 << 16), 1.0 / (1 << 17), 1.0 / (1 << 18), 1.0 / (1 << 19),
+	1.0 / (1 << 20), 1.0 / (1 << 21), 1.0 / (1 << 22), 1.0 / (1 << 23),
+	1.0 / (1 << 24), 1.0 / (1 << 25), 1.0 / (1 << 26), 1.0 / (1 << 27),
+	1.0 / (1 << 28), 1.0 / (1 << 29), 1.0 / (1 << 30), 1.0 / (1ULL << 31), // (1 << 31) would overflow signed int
+	(1ULL << 32), (1ULL << 31), (1 << 30), (1 << 29),
+	(1 << 28), (1 << 27), (1 << 26), (1 << 25),
+	(1 << 24), (1 << 23), (1 << 22), (1 << 21),
+	(1 << 20), (1 << 19), (1 << 18), (1 << 17),
+	(1 << 16), (1 << 15), (1 << 14), (1 << 13),
+	(1 << 12), (1 << 11), (1 << 10), (1 << 9),
+	(1 << 8), (1 << 7), (1 << 6), (1 << 5),
+	(1 << 4), (1 << 3), (1 << 2), (1 << 1),
+};
+
+static void WriteDual32(u32 value1, u32 value2, u32 address) // slow-path paired store via the MMU-aware Memory layer
+{
+	Memory::Write_U32(value1, address);
+	Memory::Write_U32(value2, address + 4);
+}
+
+static void WriteDual16(u32 value1, u32 value2, u32 address)
+{
+	Memory::Write_U16(value1, address);
+	Memory::Write_U16(value2, address + 2);
+}
+
+static void WriteDual8(u32 value1, u32 value2, u32 address)
+{
+	Memory::Write_U8(value1, address);
+	Memory::Write_U8(value2, address + 1);
+}
+
 void JitArmAsmRoutineManager::Generate()
 {
 	enterCode = GetCodePtr();
@@ -150,48 +208,221 @@ void JitArmAsmRoutineManager::GenerateCommon()
 	// R11 is scale
 	// R10 is the address
 	Operand2 mask(3, 1); // ~(Memory::MEMVIEW32_MASK)
+	Operand2 arghmask(3, 3); // 0x0C000000 - any bit set means "not fast-path addressable", take the C++ fallback
 	NEONXEmitter nemit(this);
 
 	const u8* loadPairedIllegal = GetCodePtr();
 	BKPT(0x10);
 	const u8* loadPairedFloatTwo = GetCodePtr();
-	BIC(R10, R10, mask);
-	MOVI2R(R12, (u32)Memory::base);
-	ADD(R10, R10, R12);
-
-	nemit.VLD1(I_32, D0, R10);
-	nemit.VREV32(I_8, D0, D0);
-
-	MOV(_PC, _LR);
+	{
+		BIC(R10, R10, mask);
+		MOVI2R(R12, (u32)Memory::base);
+		ADD(R10, R10, R12);
+		nemit.VLD1(I_32, D0, R10);
+		nemit.VREV32(I_8, D0, D0); // guest memory is big-endian
+
+		MOV(_PC, _LR);
+	}
 	const u8* loadPairedFloatOne = GetCodePtr();
-	BIC(R10, R10, mask);
-	MOVI2R(R12, (u32)Memory::base);
-	ADD(R10, R10, R12);
-
-	nemit.VLD1(I_32, D0, R10);
-	nemit.VREV32(I_8, D0, D0);
-
-	MOVI2F(S1, 1.0f, INVALID_REG); // Temp reg isn't used for 1.0f
-	MOV(_PC, _LR);
+	{
+		BIC(R10, R10, mask);
+		MOVI2R(R12, (u32)Memory::base);
+		ADD(R10, R10, R12);
+		nemit.VLD1(I_32, D0, R10);
+		nemit.VREV32(I_8, D0, D0); // ps1 = 1.0 is now set by the caller (psq_l)
+
+		MOV(_PC, _LR);
+	}
 	const u8* loadPairedU8Two = GetCodePtr();
-	BKPT(0x13);
+	{
+		BIC(R10, R10, mask);
+		MOVI2R(R12, (u32)Memory::base);
+		ADD(R10, R10, R12);
+
+		LDRH(R12, R10);
+		UXTB(R12, R12); // zero-extend: u8 type is unsigned (SXTB would corrupt values >= 0x80)
+		VMOV(S0, R12);
+
+		LDRH(R12, R10, 2);
+		UXTB(R12, R12); // zero-extend: u8 type is unsigned
+		VMOV(S1, R12);
+
+		MOVI2R(R10, (u32)&m_dequantizeTableS);
+		ADD(R10, R10, R11);
+		VLDR(S2, R10, 0);
+
+		VCVT(S0, S0, TO_FLOAT);
+		VCVT(S1, S1, TO_FLOAT);
+
+		VMUL(S0, S0, S2);
+		VMUL(S1, S1, S2);
+
+		MOV(_PC, _LR);
+	}
 	const u8* loadPairedU8One = GetCodePtr();
-	BKPT(0x14);
+	{
+		BIC(R10, R10, mask);
+		MOVI2R(R12, (u32)Memory::base);
+		ADD(R10, R10, R12);
+
+		LDRB(R12, R10);
+		UXTB(R12, R12); // zero-extend: u8 type is unsigned
+		VMOV(S0, R12);
+
+		MOVI2R(R10, (u32)&m_dequantizeTableS);
+		ADD(R10, R10, R11);
+		VLDR(S2, R10, 0);
+
+		VCVT(S0, S0, TO_FLOAT);
+
+		VMUL(S0, S0, S2);
+
+		MOV(_PC, _LR);
+	}
 	const u8* loadPairedS8Two = GetCodePtr();
-	BKPT(0x15);
+	{
+		BIC(R10, R10, mask);
+		MOVI2R(R12, (u32)Memory::base);
+		ADD(R10, R10, R12);
+
+		LDRH(R12, R10);
+		SXTB(R12, R12);
+		VMOV(S0, R12);
+
+		LDRH(R12, R10, 2);
+		SXTB(R12, R12);
+		VMOV(S1, R12);
+
+		MOVI2R(R10, (u32)&m_dequantizeTableS);
+		ADD(R10, R10, R11);
+		VLDR(S2, R10, 0);
+
+		VCVT(S0, S0, TO_FLOAT | IS_SIGNED);
+		VCVT(S1, S1, TO_FLOAT | IS_SIGNED);
+
+		VMUL(S0, S0, S2);
+		VMUL(S1, S1, S2);
+
+		MOV(_PC, _LR);
+	}
 	const u8* loadPairedS8One = GetCodePtr();
-	BKPT(0x16);
+	{
+		BIC(R10, R10, mask);
+		MOVI2R(R12, (u32)Memory::base);
+		ADD(R10, R10, R12);
+
+		LDRB(R12, R10);
+		SXTB(R12, R12);
+		VMOV(S0, R12);
+
+		MOVI2R(R10, (u32)&m_dequantizeTableS);
+		ADD(R10, R10, R11);
+		VLDR(S2, R10, 0);
+
+		VCVT(S0, S0, TO_FLOAT | IS_SIGNED);
+
+		VMUL(S0, S0, S2);
+
+		MOV(_PC, _LR);
+	}
 	const u8* loadPairedU16Two = GetCodePtr();
-	BKPT(0x17);
+	{
+		BIC(R10, R10, mask);
+		MOVI2R(R12, (u32)Memory::base);
+		ADD(R10, R10, R12);
+
+		LDRH(R12, R10);
+		REV16(R12, R12);
+		UXTH(R12, R12); // zero-extend: u16 type is unsigned (SXTH would corrupt values >= 0x8000)
+		VMOV(S0, R12);
+
+		LDRH(R12, R10, 2);
+		REV16(R12, R12);
+		UXTH(R12, R12); // zero-extend: u16 type is unsigned
+		VMOV(S1, R12);
+
+		MOVI2R(R10, (u32)&m_dequantizeTableS);
+		ADD(R10, R10, R11);
+		VLDR(S2, R10, 0);
+
+		VCVT(S0, S0, TO_FLOAT);
+		VCVT(S1, S1, TO_FLOAT);
+
+		VMUL(S0, S0, S2);
+		VMUL(S1, S1, S2);
+
+		MOV(_PC, _LR);
+	}
 	const u8* loadPairedU16One = GetCodePtr();
-	BKPT(0x18);
+	{
+		BIC(R10, R10, mask);
+		MOVI2R(R12, (u32)Memory::base);
+		ADD(R10, R10, R12);
+
+		LDRH(R12, R10);
+		REV16(R12, R12); // top half is already zero from LDRH
+		VMOV(S0, R12);
+
+		MOVI2R(R10, (u32)&m_dequantizeTableS);
+		ADD(R10, R10, R11);
+		VLDR(S2, R10, 0);
+
+		VCVT(S0, S0, TO_FLOAT);
+
+		VMUL(S0, S0, S2);
+		MOV(_PC, _LR);
+	}
 	const u8* loadPairedS16Two = GetCodePtr();
-	BKPT(0x19);
+	{
+		BIC(R10, R10, mask);
+		MOVI2R(R12, (u32)Memory::base);
+		ADD(R10, R10, R12);
+
+		LDRH(R12, R10);
+		REV16(R12, R12);
+		SXTH(R12, R12);
+		VMOV(S0, R12);
+
+		LDRH(R12, R10, 2);
+		REV16(R12, R12);
+		SXTH(R12, R12);
+		VMOV(S1, R12);
+
+		MOVI2R(R10, (u32)&m_dequantizeTableS);
+		ADD(R10, R10, R11);
+		VLDR(S2, R10, 0);
+
+		VCVT(S0, S0, TO_FLOAT | IS_SIGNED);
+		VCVT(S1, S1, TO_FLOAT | IS_SIGNED);
+
+		VMUL(S0, S0, S2);
+		VMUL(S1, S1, S2);
+
+		MOV(_PC, _LR);
+	}
 	const u8* loadPairedS16One = GetCodePtr();
-	BKPT(0x20);
+	{
+		BIC(R10, R10, mask);
+		MOVI2R(R12, (u32)Memory::base);
+		ADD(R10, R10, R12);
+
+		LDRH(R12, R10);
+
+		MOVI2R(R10, (u32)&m_dequantizeTableS);
+		ADD(R10, R10, R11);
+		VLDR(S2, R10, 0);
+
+		REV16(R12, R12);
+		SXTH(R12, R12);
+		VMOV(S0, R12);
+		VCVT(S0, S0, TO_FLOAT | IS_SIGNED);
+
+		VMUL(S0, S0, S2);
+		MOV(_PC, _LR);
+	}
 
 	pairedLoadQuantized = reinterpret_cast<const u8**>(const_cast<u8*>(AlignCode16()));
 	ReserveCodeSpace(16 * sizeof(u8*));
@@ -214,4 +445,182 @@ void JitArmAsmRoutineManager::GenerateCommon()
 	pairedLoadQuantized[14] = loadPairedS8One;
 	pairedLoadQuantized[15] = loadPairedS16One;
 
+	// Stores
+	const u8* storePairedIllegal = GetCodePtr();
+	BKPT(0x21);
+	const u8* storePairedFloat = GetCodePtr();
+	{
+		TST(R10, arghmask);
+		FixupBranch argh = B_CC(CC_NEQ); // MMIO/uncached address -> C++ fallback
+		BIC(R10, R10, mask);
+		MOVI2R(R12, (u32)Memory::base);
+		ADD(R10, R10, R12);
+
+		nemit.VREV32(I_8, D0, D0);
+		nemit.VST1(I_32, D0, R10);
+		MOV(_PC, _LR);
+
+		SetJumpTarget(argh);
+
+		PUSH(5, R0, R1, R2, R3, _LR);
+		VMOV(R0, S0);
+		VMOV(R1, S1);
+		MOV(R2, R10);
+		MOVI2R(R12, (u32)&WriteDual32);
+		BL(R12);
+		POP(5, R0, R1, R2, R3, _PC);
+	}
+	const u8* storePairedU8 = GetCodePtr();
+	const u8* storePairedS8 = GetCodePtr(); // FIXME: shares code with U8 and performs no clamping to the type range
+	{
+		// R10 is the addr
+		// R11 is the scale
+		// R12 is scratch
+		// S0, S1 is the values
+		PUSH(5, R0, R1, R2, R3, _LR);
+
+		MOVI2R(R12, (u32)&m_quantizeTableS);
+		ADD(R12, R12, R11);
+		VLDR(S2, R12, 0);
+		VMUL(S0, S0, S2);
+		VMUL(S1, S1, S2);
+
+		VCVT(S0, S0, TO_INT | ROUND_TO_ZERO);
+		VCVT(S1, S1, TO_INT | ROUND_TO_ZERO);
+
+		VMOV(R0, S0);
+		VMOV(R1, S1);
+		MOV(R2, R10);
+		MOVI2R(R12, (u32)&WriteDual8);
+		BL(R12);
+
+		POP(5, R0, R1, R2, R3, _PC);
+	}
+	const u8* storePairedU16 = GetCodePtr();
+	const u8* storePairedS16 = GetCodePtr(); // FIXME: shares code with U16 and performs no clamping to the type range
+	{
+		PUSH(5, R0, R1, R2, R3, _LR);
+
+		MOVI2R(R12, (u32)&m_quantizeTableS);
+		ADD(R12, R12, R11);
+		VLDR(S2, R12, 0);
+		VMUL(S0, S0, S2);
+		VMUL(S1, S1, S2);
+
+		VCVT(S0, S0, TO_INT | ROUND_TO_ZERO);
+		VCVT(S1, S1, TO_INT | ROUND_TO_ZERO);
+
+		VMOV(R0, S0);
+		VMOV(R1, S1);
+		MOV(R2, R10);
+		MOVI2R(R12, (u32)&WriteDual16);
+		BL(R12);
+
+		POP(5, R0, R1, R2, R3, _PC);
+	}
+	const u8* storeSingleIllegal = GetCodePtr();
+	BKPT(0x27);
+	const u8* storeSingleFloat = GetCodePtr();
+	{
+		TST(R10, arghmask);
+		FixupBranch argh = B_CC(CC_NEQ);
+		BIC(R10, R10, mask);
+		MOVI2R(R12, (u32)Memory::base);
+		ADD(R10, R10, R12);
+
+		VMOV(R12, S0);
+		REV(R12, R12); // big-endian store
+		STR(R12, R10);
+		MOV(_PC, _LR);
+
+		SetJumpTarget(argh);
+
+		PUSH(5, R0, R1, R2, R3, _LR);
+		VMOV(R0, S0);
+		MOV(R1, R10);
+		MOVI2R(R10, (u32)&Memory::Write_U32);
+		BL(R10);
+
+		POP(5, R0, R1, R2, R3, _PC);
+	}
+	const u8* storeSingleU8 = GetCodePtr(); // Used by MKWii
+	const u8* storeSingleS8 = GetCodePtr(); // FIXME: shares code with U8 and performs no clamping to the type range
+	{
+		MOVI2R(R12, (u32)&m_quantizeTableS);
+		ADD(R12, R12, R11);
+		VLDR(S2, R12, 0);
+		VMUL(S0, S0, S2);
+
+		TST(R10, arghmask);
+		FixupBranch argh = B_CC(CC_NEQ);
+		BIC(R10, R10, mask);
+		MOVI2R(R12, (u32)Memory::base);
+		ADD(R10, R10, R12);
+
+		VCVT(S0, S0, TO_INT | ROUND_TO_ZERO);
+		VMOV(R12, S0);
+		STRB(R12, R10);
+		MOV(_PC, _LR);
+
+		SetJumpTarget(argh);
+
+		PUSH(5, R0, R1, R2, R3, _LR);
+		VMOV(R0, S0);
+		MOV(R1, R10);
+		MOVI2R(R10, (u32)&Memory::Write_U8);
+		BL(R10);
+		POP(5, R0, R1, R2, R3, _PC);
+	}
+	const u8* storeSingleU16 = GetCodePtr(); // Used by MKWii
+	const u8* storeSingleS16 = GetCodePtr(); // FIXME: shares code with U16 and performs no clamping to the type range
+	{
+		MOVI2R(R12, (u32)&m_quantizeTableS);
+		ADD(R12, R12, R11);
+		VLDR(S2, R12, 0);
+		VMUL(S0, S0, S2);
+
+		TST(R10, arghmask);
+		FixupBranch argh = B_CC(CC_NEQ);
+		BIC(R10, R10, mask);
+		MOVI2R(R12, (u32)Memory::base);
+		ADD(R10, R10, R12);
+
+		VCVT(S0, S0, TO_INT | ROUND_TO_ZERO);
+		VMOV(R12, S0);
+		REV16(R12, R12);
+		STRH(R12, R10);
+		MOV(_PC, _LR);
+
+		SetJumpTarget(argh);
+
+		PUSH(5, R0, R1, R2, R3, _LR);
+		VMOV(R0, S0);
+		MOV(R1, R10);
+		MOVI2R(R10, (u32)&Memory::Write_U16);
+		BL(R10);
+
+		POP(5, R0, R1, R2, R3, _PC);
+	}
+
+	pairedStoreQuantized = reinterpret_cast<const u8**>(const_cast<u8*>(AlignCode16()));
+	ReserveCodeSpace(16 * sizeof(u8*));
+
+	pairedStoreQuantized[0] = storePairedFloat;
+	pairedStoreQuantized[1] = storePairedIllegal;
+	pairedStoreQuantized[2] = storePairedIllegal;
+	pairedStoreQuantized[3] = storePairedIllegal;
+	pairedStoreQuantized[4] = storePairedU8;
+	pairedStoreQuantized[5] = storePairedU16;
+	pairedStoreQuantized[6] = storePairedS8;
+	pairedStoreQuantized[7] = storePairedS16;
+
+	pairedStoreQuantized[8] = storeSingleFloat;
+	pairedStoreQuantized[9] = storeSingleIllegal;
+	pairedStoreQuantized[10] = storeSingleIllegal;
+	pairedStoreQuantized[11] = storeSingleIllegal;
+	pairedStoreQuantized[12] = storeSingleU8;
+	pairedStoreQuantized[13] = storeSingleU16;
+	pairedStoreQuantized[14] = storeSingleS8;
+	pairedStoreQuantized[15] = storeSingleS16;
+
 }