[ARM] Implement psq_st. Optimizations in psq_l and fix all the remaining bugs...except clamping within the max value range of the value. Causes some minor visual effects mostly.

This commit is contained in:
Ryan Houdek 2013-09-08 21:55:15 +00:00
parent c1aa80cefa
commit 4146e1f3d6
4 changed files with 495 additions and 34 deletions

View File

@ -221,6 +221,8 @@ public:
// LoadStore paired
void psq_l(UGeckoInstruction _inst);
void psq_st(UGeckoInstruction _inst);
};
#endif // _JIT64_H

View File

@ -18,22 +18,24 @@ void JitArm::psq_l(UGeckoInstruction inst)
{
INSTRUCTION_START
JITDISABLE(bJITLoadStorePairedOff)
bool update = inst.OPCD == 57;
s32 offset = inst.SIMM_12;
// R12 contains scale
// R11 contains type
// R10 is the ADDR
if (js.memcheck) { Default(inst); return; }
LDR(R11, R9, PPCSTATE_OFF(spr[SPR_GQR0 + inst.I]));
UBFX(R12, R11, 13, 3); // Type
UBFX(R11, R11, 2, 6); // Scale
UBFX(R12, R11, 16, 3); // Type
LSL(R12, R12, 2);
UBFX(R11, R11, 24, 6); // Scale
LSL(R11, R11, 2);
MOVI2R(R10, (u32)offset);
if (inst.RA)
if (inst.RA || update) // Always uses the register on update
ADD(R10, R10, gpr.R(inst.RA));
if (update)
MOV(gpr.R(inst.RA), R10);
@ -47,5 +49,53 @@ void JitArm::psq_l(UGeckoInstruction inst)
ARMReg vD0 = fpr.R0(inst.RS, false);
ARMReg vD1 = fpr.R1(inst.RS, false);
VCVT(vD0, S0, 0);
VCVT(vD1, S1, 0);
if (!inst.W)
VCVT(vD1, S1, 0);
else
MOVI2F(vD1, 1.0f, INVALID_REG); // No need for temp reg with 1.0f
}
void JitArm::psq_st(UGeckoInstruction inst)
{
INSTRUCTION_START
JITDISABLE(bJITLoadStorePairedOff)
bool update = inst.OPCD == 61;
s32 offset = inst.SIMM_12;
// R12 contains scale
// R11 contains type
// R10 is the ADDR
if (js.memcheck) { Default(inst); return; }
LDR(R11, R9, PPCSTATE_OFF(spr[SPR_GQR0 + inst.I]));
UBFX(R12, R11, 0, 3); // Type
LSL(R12, R12, 2);
UBFX(R11, R11, 8, 6); // Scale
LSL(R11, R11, 2);
if (inst.RA || update) // Always uses the register on update
{
MOVI2R(R14, offset);
ADD(R10, gpr.R(inst.RA), R14);
}
else
MOVI2R(R10, (u32)offset);
if (update)
MOV(gpr.R(inst.RA), R10);
MOVI2R(R14, (u32)asm_routines.pairedStoreQuantized);
ADD(R14, R14, R12);
LDR(R14, R14, inst.W ? 8 * 4 : 0);
ARMReg vD0 = fpr.R0(inst.RS);
VCVT(S0, vD0, 0);
if (!inst.W)
{
ARMReg vD1 = fpr.R1(inst.RS);
VCVT(S1, vD1, 0);
}
// floats passed through D0
BL(R14); // Jump to the quantizer Store
}

View File

@ -108,9 +108,9 @@ static GekkoOPTemplate primarytable[] =
{55, &JitArm::Default}, //"stfdu", OPTYPE_STOREFP, FL_OUT_A | FL_IN_A}},
{56, &JitArm::psq_l}, //"psq_l", OPTYPE_PS, FL_IN_A}},
{57, &JitArm::Default}, //"psq_lu", OPTYPE_PS, FL_OUT_A | FL_IN_A}},
{60, &JitArm::Default}, //"psq_st", OPTYPE_PS, FL_IN_A}},
{61, &JitArm::Default}, //"psq_stu", OPTYPE_PS, FL_OUT_A | FL_IN_A}},
{57, &JitArm::psq_l}, //"psq_lu", OPTYPE_PS, FL_OUT_A | FL_IN_A}},
{60, &JitArm::psq_st}, //"psq_st", OPTYPE_PS, FL_IN_A}},
{61, &JitArm::psq_st}, //"psq_stu", OPTYPE_PS, FL_OUT_A | FL_IN_A}},
//missing: 0, 5, 6, 9, 22, 30, 62, 58
{0, &JitArm::Default}, //"unknown_instruction", OPTYPE_UNKNOWN, 0}},

View File

@ -41,6 +41,64 @@ using namespace ArmGen;
JitArmAsmRoutineManager asm_routines;
static const float GC_ALIGNED16(m_quantizeTableS[]) =
{
(1 << 0), (1 << 1), (1 << 2), (1 << 3),
(1 << 4), (1 << 5), (1 << 6), (1 << 7),
(1 << 8), (1 << 9), (1 << 10), (1 << 11),
(1 << 12), (1 << 13), (1 << 14), (1 << 15),
(1 << 16), (1 << 17), (1 << 18), (1 << 19),
(1 << 20), (1 << 21), (1 << 22), (1 << 23),
(1 << 24), (1 << 25), (1 << 26), (1 << 27),
(1 << 28), (1 << 29), (1 << 30), (1 << 31),
1.0 / (1ULL << 32), 1.0 / (1 << 31), 1.0 / (1 << 30), 1.0 / (1 << 29),
1.0 / (1 << 28), 1.0 / (1 << 27), 1.0 / (1 << 26), 1.0 / (1 << 25),
1.0 / (1 << 24), 1.0 / (1 << 23), 1.0 / (1 << 22), 1.0 / (1 << 21),
1.0 / (1 << 20), 1.0 / (1 << 19), 1.0 / (1 << 18), 1.0 / (1 << 17),
1.0 / (1 << 16), 1.0 / (1 << 15), 1.0 / (1 << 14), 1.0 / (1 << 13),
1.0 / (1 << 12), 1.0 / (1 << 11), 1.0 / (1 << 10), 1.0 / (1 << 9),
1.0 / (1 << 8), 1.0 / (1 << 7), 1.0 / (1 << 6), 1.0 / (1 << 5),
1.0 / (1 << 4), 1.0 / (1 << 3), 1.0 / (1 << 2), 1.0 / (1 << 1),
};
static const float GC_ALIGNED16(m_dequantizeTableS[]) =
{
1.0 / (1 << 0), 1.0 / (1 << 1), 1.0 / (1 << 2), 1.0 / (1 << 3),
1.0 / (1 << 4), 1.0 / (1 << 5), 1.0 / (1 << 6), 1.0 / (1 << 7),
1.0 / (1 << 8), 1.0 / (1 << 9), 1.0 / (1 << 10), 1.0 / (1 << 11),
1.0 / (1 << 12), 1.0 / (1 << 13), 1.0 / (1 << 14), 1.0 / (1 << 15),
1.0 / (1 << 16), 1.0 / (1 << 17), 1.0 / (1 << 18), 1.0 / (1 << 19),
1.0 / (1 << 20), 1.0 / (1 << 21), 1.0 / (1 << 22), 1.0 / (1 << 23),
1.0 / (1 << 24), 1.0 / (1 << 25), 1.0 / (1 << 26), 1.0 / (1 << 27),
1.0 / (1 << 28), 1.0 / (1 << 29), 1.0 / (1 << 30), 1.0 / (1 << 31),
(1ULL << 32), (1 << 31), (1 << 30), (1 << 29),
(1 << 28), (1 << 27), (1 << 26), (1 << 25),
(1 << 24), (1 << 23), (1 << 22), (1 << 21),
(1 << 20), (1 << 19), (1 << 18), (1 << 17),
(1 << 16), (1 << 15), (1 << 14), (1 << 13),
(1 << 12), (1 << 11), (1 << 10), (1 << 9),
(1 << 8), (1 << 7), (1 << 6), (1 << 5),
(1 << 4), (1 << 3), (1 << 2), (1 << 1),
};
static void WriteDual32(u32 value1, u32 value2, u32 address)
{
Memory::Write_U32(value1, address);
Memory::Write_U32(value2, address + 4);
}
static void WriteDual16(u32 value1, u32 value2, u32 address)
{
Memory::Write_U16(value1, address);
Memory::Write_U16(value2, address + 2);
}
static void WriteDual8(u32 value1, u32 value2, u32 address)
{
Memory::Write_U8(value1, address);
Memory::Write_U8(value2, address + 1);
}
void JitArmAsmRoutineManager::Generate()
{
enterCode = GetCodePtr();
@ -150,48 +208,221 @@ void JitArmAsmRoutineManager::GenerateCommon()
// R11 is scale
// R10 is the address
Operand2 mask(3, 1); // ~(Memory::MEMVIEW32_MASK)
Operand2 arghmask(3, 3); // 0x0C000000
NEONXEmitter nemit(this);
const u8* loadPairedIllegal = GetCodePtr();
BKPT(0x10);
const u8* loadPairedFloatTwo = GetCodePtr();
BIC(R10, R10, mask);
MOVI2R(R12, (u32)Memory::base);
ADD(R10, R10, R12);
nemit.VLD1(I_32, D0, R10);
nemit.VREV32(I_8, D0, D0);
MOV(_PC, _LR);
{
BIC(R10, R10, mask);
MOVI2R(R12, (u32)Memory::base);
ADD(R10, R10, R12);
nemit.VLD1(I_32, D0, R10);
nemit.VREV32(I_8, D0, D0);
MOV(_PC, _LR);
}
const u8* loadPairedFloatOne = GetCodePtr();
BIC(R10, R10, mask);
MOVI2R(R12, (u32)Memory::base);
ADD(R10, R10, R12);
nemit.VLD1(I_32, D0, R10);
nemit.VREV32(I_8, D0, D0);
MOVI2F(S1, 1.0f, INVALID_REG); // Temp reg isn't used for 1.0f
MOV(_PC, _LR);
{
BIC(R10, R10, mask);
MOVI2R(R12, (u32)Memory::base);
ADD(R10, R10, R12);
nemit.VLD1(I_32, D0, R10);
nemit.VREV32(I_8, D0, D0);
MOV(_PC, _LR);
}
const u8* loadPairedU8Two = GetCodePtr();
BKPT(0x13);
{
BIC(R10, R10, mask);
MOVI2R(R12, (u32)Memory::base);
ADD(R10, R10, R12);
LDRH(R12, R10);
SXTB(R12, R12);
VMOV(S0, R12);
LDRH(R12, R10, 2);
SXTB(R12, R12);
VMOV(S1, R12);
MOVI2R(R10, (u32)&m_dequantizeTableS);
ADD(R10, R10, R11);
VLDR(S2, R10, 0);
VCVT(S0, S0, TO_FLOAT);
VCVT(S1, S1, TO_FLOAT);
VMUL(S0, S0, S2);
VMUL(S1, S1, S2);
MOV(_PC, _LR);
}
const u8* loadPairedU8One = GetCodePtr();
BKPT(0x14);
{
BIC(R10, R10, mask);
MOVI2R(R12, (u32)Memory::base);
ADD(R10, R10, R12);
LDRB(R12, R10);
SXTB(R12, R12);
VMOV(S0, R12);
MOVI2R(R10, (u32)&m_dequantizeTableS);
ADD(R10, R10, R11);
VLDR(S2, R10, 0);
VCVT(S0, S0, TO_FLOAT);
VMUL(S0, S0, S2);
MOV(_PC, _LR);
}
const u8* loadPairedS8Two = GetCodePtr();
BKPT(0x15);
{
BIC(R10, R10, mask);
MOVI2R(R12, (u32)Memory::base);
ADD(R10, R10, R12);
LDRH(R12, R10);
SXTB(R12, R12);
VMOV(S0, R12);
LDRH(R12, R10, 2);
SXTB(R12, R12);
VMOV(S1, R12);
MOVI2R(R10, (u32)&m_dequantizeTableS);
ADD(R10, R10, R11);
VLDR(S2, R10, 0);
VCVT(S0, S0, TO_FLOAT | IS_SIGNED);
VCVT(S1, S1, TO_FLOAT | IS_SIGNED);
VMUL(S0, S0, S2);
VMUL(S1, S1, S2);
MOV(_PC, _LR);
}
const u8* loadPairedS8One = GetCodePtr();
BKPT(0x16);
{
BIC(R10, R10, mask);
MOVI2R(R12, (u32)Memory::base);
ADD(R10, R10, R12);
LDRB(R12, R10);
SXTB(R12, R12);
VMOV(S0, R12);
MOVI2R(R10, (u32)&m_dequantizeTableS);
ADD(R10, R10, R11);
VLDR(S2, R10, 0);
VCVT(S0, S0, TO_FLOAT | IS_SIGNED);
VMUL(S0, S0, S2);
MOV(_PC, _LR);
}
const u8* loadPairedU16Two = GetCodePtr();
BKPT(0x17);
{
BIC(R10, R10, mask);
MOVI2R(R12, (u32)Memory::base);
ADD(R10, R10, R12);
LDRH(R12, R10);
REV16(R12, R12);
SXTH(R12, R12);
VMOV(S0, R12);
LDRH(R12, R10, 2);
REV16(R12, R12);
SXTH(R12, R12);
VMOV(S1, R12);
MOVI2R(R10, (u32)&m_dequantizeTableS);
ADD(R10, R10, R11);
VLDR(S2, R10, 0);
VCVT(S0, S0, TO_FLOAT);
VCVT(S1, S1, TO_FLOAT);
VMUL(S0, S0, S2);
VMUL(S1, S1, S2);
MOV(_PC, _LR);
}
const u8* loadPairedU16One = GetCodePtr();
BKPT(0x18);
{
BIC(R10, R10, mask);
MOVI2R(R12, (u32)Memory::base);
ADD(R10, R10, R12);
LDRH(R12, R10);
REV16(R12, R12);
VMOV(S0, R12);
MOVI2R(R10, (u32)&m_dequantizeTableS);
ADD(R10, R10, R11);
VLDR(S2, R10, 0);
VCVT(S0, S0, TO_FLOAT);
VMUL(S0, S0, S2);
MOV(_PC, _LR);
}
const u8* loadPairedS16Two = GetCodePtr();
BKPT(0x19);
{
BIC(R10, R10, mask);
MOVI2R(R12, (u32)Memory::base);
ADD(R10, R10, R12);
LDRH(R12, R10);
REV16(R12, R12);
SXTH(R12, R12);
VMOV(S0, R12);
LDRH(R12, R10, 2);
REV16(R12, R12);
SXTH(R12, R12);
VMOV(S1, R12);
MOVI2R(R10, (u32)&m_dequantizeTableS);
ADD(R10, R10, R11);
VLDR(S2, R10, 0);
VCVT(S0, S0, TO_FLOAT | IS_SIGNED);
VCVT(S1, S1, TO_FLOAT | IS_SIGNED);
VMUL(S0, S0, S2);
VMUL(S1, S1, S2);
MOV(_PC, _LR);
}
const u8* loadPairedS16One = GetCodePtr();
BKPT(0x20);
{
BIC(R10, R10, mask);
MOVI2R(R12, (u32)Memory::base);
ADD(R10, R10, R12);
LDRH(R12, R10);
MOVI2R(R10, (u32)&m_dequantizeTableS);
ADD(R10, R10, R11);
VLDR(S2, R10, 0);
REV16(R12, R12);
SXTH(R12, R12);
VMOV(S0, R12);
VCVT(S0, S0, TO_FLOAT | IS_SIGNED);
VMUL(S0, S0, S2);
MOV(_PC, _LR);
}
pairedLoadQuantized = reinterpret_cast<const u8**>(const_cast<u8*>(AlignCode16()));
ReserveCodeSpace(16 * sizeof(u8*));
@ -214,4 +445,182 @@ void JitArmAsmRoutineManager::GenerateCommon()
pairedLoadQuantized[14] = loadPairedS8One;
pairedLoadQuantized[15] = loadPairedS16One;
// Stores
const u8* storePairedIllegal = GetCodePtr();
BKPT(0x21);
const u8* storePairedFloat = GetCodePtr();
{
TST(R10, arghmask);
FixupBranch argh = B_CC(CC_NEQ);
BIC(R10, R10, mask);
MOVI2R(R12, (u32)Memory::base);
ADD(R10, R10, R12);
nemit.VREV32(I_8, D0, D0);
nemit.VST1(I_32, D0, R10);
MOV(_PC, _LR);
SetJumpTarget(argh);
PUSH(5, R0, R1, R2, R3, _LR);
VMOV(R0, S0);
VMOV(R1, S1);
MOV(R2, R10);
MOVI2R(R12, (u32)&WriteDual32);
BL(R12);
POP(5, R0, R1, R2, R3, _PC);
}
const u8* storePairedU8 = GetCodePtr();
const u8* storePairedS8 = GetCodePtr();
{
// R10 is the addr
// R11 is the scale
// R12 is scratch
// S0, S1 is the values
PUSH(5, R0, R1, R2, R3, _LR);
MOVI2R(R12, (u32)&m_quantizeTableS);
ADD(R12, R12, R11);
VLDR(S2, R12, 0);
VMUL(S0, S0, S2);
VMUL(S1, S1, S2);
VCVT(S0, S0, TO_INT | ROUND_TO_ZERO);
VCVT(S1, S1, TO_INT | ROUND_TO_ZERO);
VMOV(R0, S0);
VMOV(R1, S1);
MOV(R2, R10);
MOVI2R(R12, (u32)&WriteDual8);
BL(R12);
POP(5, R0, R1, R2, R3, _PC);
}
const u8* storePairedU16 = GetCodePtr();
const u8* storePairedS16 = GetCodePtr();
{
PUSH(5, R0, R1, R2, R3, _LR);
MOVI2R(R12, (u32)&m_quantizeTableS);
ADD(R12, R12, R11);
VLDR(S2, R12, 0);
VMUL(S0, S0, S2);
VMUL(S1, S1, S2);
VCVT(S0, S0, TO_INT | ROUND_TO_ZERO);
VCVT(S1, S1, TO_INT | ROUND_TO_ZERO);
VMOV(R0, S0);
VMOV(R1, S1);
MOV(R2, R10);
MOVI2R(R12, (u32)&WriteDual16);
BL(R12);
POP(5, R0, R1, R2, R3, _PC);
}
const u8* storeSingleIllegal = GetCodePtr();
BKPT(0x27);
const u8* storeSingleFloat = GetCodePtr();
{
TST(R10, arghmask);
FixupBranch argh = B_CC(CC_NEQ);
BIC(R10, R10, mask);
MOVI2R(R12, (u32)Memory::base);
ADD(R10, R10, R12);
VMOV(R12, S0);
REV(R12, R12);
STR(R12, R10);
MOV(_PC, _LR);
SetJumpTarget(argh);
PUSH(5, R0, R1, R2, R3, _LR);
VMOV(R0, S0);
MOV(R1, R10);
MOVI2R(R10, (u32)&Memory::Write_U32);
BL(R10);
POP(5, R0, R1, R2, R3, _PC);
}
const u8* storeSingleU8 = GetCodePtr(); // Used by MKWii
const u8* storeSingleS8 = GetCodePtr();
{
MOVI2R(R12, (u32)&m_quantizeTableS);
ADD(R12, R12, R11);
VLDR(S2, R12, 0);
VMUL(S0, S0, S2);
TST(R10, arghmask);
FixupBranch argh = B_CC(CC_NEQ);
BIC(R10, R10, mask);
MOVI2R(R12, (u32)Memory::base);
ADD(R10, R10, R12);
VCVT(S0, S0, TO_INT | ROUND_TO_ZERO);
VMOV(R12, S0);
STRB(R12, R10);
MOV(_PC, _LR);
SetJumpTarget(argh);
PUSH(5, R0, R1, R2, R3, _LR);
VMOV(R0, S0);
MOV(R1, R10);
MOVI2R(R10, (u32)&Memory::Write_U8);
BL(R10);
POP(5, R0, R1, R2, R3, _PC);
}
const u8* storeSingleU16 = GetCodePtr(); // Used by MKWii
const u8* storeSingleS16 = GetCodePtr();
{
MOVI2R(R12, (u32)&m_quantizeTableS);
ADD(R12, R12, R11);
VLDR(S2, R12, 0);
VMUL(S0, S0, S2);
TST(R10, arghmask);
FixupBranch argh = B_CC(CC_NEQ);
BIC(R10, R10, mask);
MOVI2R(R12, (u32)Memory::base);
ADD(R10, R10, R12);
VCVT(S0, S0, TO_INT | ROUND_TO_ZERO);
VMOV(R12, S0);
REV16(R12, R12);
STRH(R12, R10);
MOV(_PC, _LR);
SetJumpTarget(argh);
PUSH(5, R0, R1, R2, R3, _LR);
VMOV(R0, S0);
MOV(R1, R10);
MOVI2R(R10, (u32)&Memory::Write_U16);
BL(R10);
POP(5, R0, R1, R2, R3, _PC);
}
pairedStoreQuantized = reinterpret_cast<const u8**>(const_cast<u8*>(AlignCode16()));
ReserveCodeSpace(16 * sizeof(u8*));
pairedStoreQuantized[0] = storePairedFloat;
pairedStoreQuantized[1] = storePairedIllegal;
pairedStoreQuantized[2] = storePairedIllegal;
pairedStoreQuantized[3] = storePairedIllegal;
pairedStoreQuantized[4] = storePairedU8;
pairedStoreQuantized[5] = storePairedU16;
pairedStoreQuantized[6] = storePairedS8;
pairedStoreQuantized[7] = storePairedS16;
pairedStoreQuantized[8] = storeSingleFloat;
pairedStoreQuantized[9] = storeSingleIllegal;
pairedStoreQuantized[10] = storeSingleIllegal;
pairedStoreQuantized[11] = storeSingleIllegal;
pairedStoreQuantized[12] = storeSingleU8;
pairedStoreQuantized[13] = storeSingleU16;
pairedStoreQuantized[14] = storeSingleS8;
pairedStoreQuantized[15] = storeSingleS16;
}