[ARM] Use NEON to load the values for psq_l, which gives a minimal performance increase. This change also introduces a new NEONXEmitter to provide cleaner support for NEON.
This commit is contained in:
parent
40f848d279
commit
e6af4970d8
|
@ -892,54 +892,6 @@ ARMReg ARMXEmitter::SubBase(ARMReg Reg)
|
||||||
return Reg;
|
return Reg;
|
||||||
}
|
}
|
||||||
|
|
||||||
// NEON Specific
|
|
||||||
// Emits a NEON VABD instruction (the assertion messages identify it as the
// floating-point form) with destination Vd and sources Vn and Vm.
//   Size - shifted into bits [21:20] of the instruction word.
//   Vd   - destination; must be >= D0. A register >= Q0 sets the Q bit (bit 6).
//   Vn   - first source register.
//   Vm   - second source register.
// Asserts (debug builds) that Vd is a NEON register and that the CPU reports NEON.
void ARMXEmitter::VABD(IntegerSize Size, ARMReg Vd, ARMReg Vn, ARMReg Vm)
{
	_dbg_assert_msg_(DYNA_REC, Vd >= D0, "Pass invalid register to VABD(float)");
	_dbg_assert_msg_(DYNA_REC, cpu_info.bNEON, "Can't use VABD(float) when CPU doesn't support it");

	// Has to be decided before SubBase() rewrites Vd below.
	bool register_quad = Vd >= Q0;

	// Gets encoded as a double register
	Vd = SubBase(Vd);
	Vn = SubBase(Vn);
	Vm = SubBase(Vm);

	// Register fields: D = bit 22, Vn = bits [19:16], Vd = bits [15:12],
	// N = bit 7, Q = bit 6, M = bit 5, Vm = bits [3:0].
	// The high bit of Vm (0x10) must land in M (bit 5), hence "<< 1"; shifting
	// by 2 would overwrite the Q bit and drop the register's top bit.
	Write32((0xF3 << 24) | ((Vd & 0x10) << 18) | (Size << 20) | ((Vn & 0xF) << 16) \
		| ((Vd & 0xF) << 12) | (0xD << 8) | ((Vn & 0x10) << 3) | (register_quad << 6) \
		| ((Vm & 0x10) << 1) | (Vm & 0xF));
}
|
|
||||||
// Emits a NEON integer VADD with destination Vd and sources Vn and Vm.
//   Size - element size; its numeric value is placed in bits [21:20].
//   Vd   - destination; must be >= D0. A register >= Q0 sets the Q bit.
//   Vn   - first source register.
//   Vm   - second source register.
void ARMXEmitter::VADD(IntegerSize Size, ARMReg Vd, ARMReg Vn, ARMReg Vm)
{
	_dbg_assert_msg_(DYNA_REC, Vd >= D0, "Pass invalid register to VADD(integer)");
	_dbg_assert_msg_(DYNA_REC, cpu_info.bNEON, "Can't use VADD(integer) when CPU doesn't support it");

	// Sample the D-vs-Q choice before the register values are rebased.
	const bool is_quad = Vd >= Q0;

	// All three operands are encoded as double registers.
	Vd = SubBase(Vd);
	Vn = SubBase(Vn);
	Vm = SubBase(Vm);

	// Assemble the word field by field.
	u32 instruction = (0xF2 << 24) | (0x8 << 8);
	instruction |= Size << 20;
	instruction |= ((Vd & 0x10) << 18) | ((Vd & 0xF) << 12); // D : Vd
	instruction |= ((Vn & 0x10) << 3) | ((Vn & 0xF) << 16);  // N : Vn
	instruction |= ((Vm & 0x10) << 1) | (Vm & 0xF);          // M : Vm
	instruction |= is_quad << 6;                             // Q
	Write32(instruction);
}
|
|
||||||
// Emits a NEON integer VSUB with destination Vd and sources Vn and Vm.
// Only the quad-register form is produced: Vd is asserted to be >= Q0 and the
// Q bit (bit 6) is always set.
//   Size - element size; its numeric value is placed in bits [21:20].
//   Vd   - destination register.
//   Vn   - first source register.
//   Vm   - second source register.
void ARMXEmitter::VSUB(IntegerSize Size, ARMReg Vd, ARMReg Vn, ARMReg Vm)
{
	_dbg_assert_msg_(DYNA_REC, Vd >= Q0, "Pass invalid register to VSUB(integer)");
	_dbg_assert_msg_(DYNA_REC, cpu_info.bNEON, "Can't use VSUB(integer) when CPU doesn't support it");

	// Gets encoded as a double register
	Vd = SubBase(Vd);
	Vn = SubBase(Vn);
	Vm = SubBase(Vm);

	// The high bit of Vm (0x10) belongs in M (bit 5), hence "<< 1". Shifting
	// by 2 would merge it into the already-set Q bit and lose it, so any Vm
	// whose encoded index is 16 or above would be emitted as index - 16.
	Write32((0xF3 << 24) | ((Vd & 0x10) << 18) | (Size << 20) | ((Vn & 0xF) << 16) \
		| ((Vd & 0xF) << 12) | (0x8 << 8) | ((Vn & 0x10) << 3) | (1 << 6) \
		| ((Vm & 0x10) << 1) | (Vm & 0xF));
}
|
|
||||||
|
|
||||||
// Double/single, Neon
|
// Double/single, Neon
|
||||||
extern const VFPEnc VFPOps[16][2] = {
|
extern const VFPEnc VFPOps[16][2] = {
|
||||||
{{0xE0, 0xA0}, {0x20, 0xD1}}, // 0: VMLA
|
{{0xE0, 0xA0}, {0x20, 0xD1}}, // 0: VMLA
|
||||||
|
@ -1269,4 +1221,100 @@ void ARMXEmitter::VCVT(ARMReg Dest, ARMReg Source, int flags)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Emits a NEON VABD instruction (the assertion messages identify it as the
// floating-point form) with destination Vd and sources Vn and Vm.
//   Size - element type; encodedSize(Size) is placed in bits [21:20].
//   Vd   - destination; must be >= D0. A register >= Q0 sets the Q bit (bit 6).
//   Vn   - first source register.
//   Vm   - second source register.
// Asserts (debug builds) that Vd is a NEON register and that the CPU reports NEON.
void NEONXEmitter::VABD(NEONElementType Size, ARMReg Vd, ARMReg Vn, ARMReg Vm)
{
	_dbg_assert_msg_(DYNA_REC, Vd >= D0, "Pass invalid register to VABD(float)");
	_dbg_assert_msg_(DYNA_REC, cpu_info.bNEON, "Can't use VABD(float) when CPU doesn't support it");

	// Has to be decided before SubBase() rewrites Vd below.
	bool register_quad = Vd >= Q0;

	// Gets encoded as a double register
	Vd = SubBase(Vd);
	Vn = SubBase(Vn);
	Vm = SubBase(Vm);

	// Register fields: D = bit 22, Vn = bits [19:16], Vd = bits [15:12],
	// N = bit 7, Q = bit 6, M = bit 5, Vm = bits [3:0].
	// The high bit of Vm (0x10) must land in M (bit 5), hence "<< 1"; shifting
	// by 2 would overwrite the Q bit and drop the register's top bit.
	Write32((0xF3 << 24) | ((Vd & 0x10) << 18) | (encodedSize(Size) << 20) | ((Vn & 0xF) << 16) \
		| ((Vd & 0xF) << 12) | (0xD << 8) | ((Vn & 0x10) << 3) | (register_quad << 6) \
		| ((Vm & 0x10) << 1) | (Vm & 0xF));
}
|
||||||
|
// Emits a NEON integer VADD with destination Vd and sources Vn and Vm.
//   Size - element type; encodedSize(Size) is placed in bits [21:20].
//   Vd   - destination; must be >= D0. A register >= Q0 sets the Q bit.
//   Vn   - first source register.
//   Vm   - second source register.
void NEONXEmitter::VADD(NEONElementType Size, ARMReg Vd, ARMReg Vn, ARMReg Vm)
{
	_dbg_assert_msg_(DYNA_REC, Vd >= D0, "Pass invalid register to VADD(integer)");
	_dbg_assert_msg_(DYNA_REC, cpu_info.bNEON, "Can't use VADD(integer) when CPU doesn't support it");

	// Sample the D-vs-Q choice before the register values are rebased.
	const bool is_quad = Vd >= Q0;

	// All three operands are encoded as double registers.
	Vd = SubBase(Vd);
	Vn = SubBase(Vn);
	Vm = SubBase(Vm);

	// Assemble the word field by field.
	u32 instruction = (0xF2 << 24) | (0x8 << 8);
	instruction |= encodedSize(Size) << 20;
	instruction |= ((Vd & 0x10) << 18) | ((Vd & 0xF) << 12); // D : Vd
	instruction |= ((Vn & 0x10) << 3) | ((Vn & 0xF) << 16);  // N : Vn
	instruction |= ((Vm & 0x10) << 1) | (Vm & 0xF);          // M : Vm
	instruction |= is_quad << 6;                             // Q
	Write32(instruction);
}
|
||||||
|
// Emits a NEON integer VSUB with destination Vd and sources Vn and Vm.
// Only the quad-register form is produced: Vd is asserted to be >= Q0 and the
// Q bit (bit 6) is always set.
//   Size - element type; encodedSize(Size) is placed in bits [21:20].
//   Vd   - destination register.
//   Vn   - first source register.
//   Vm   - second source register.
void NEONXEmitter::VSUB(NEONElementType Size, ARMReg Vd, ARMReg Vn, ARMReg Vm)
{
	_dbg_assert_msg_(DYNA_REC, Vd >= Q0, "Pass invalid register to VSUB(integer)");
	_dbg_assert_msg_(DYNA_REC, cpu_info.bNEON, "Can't use VSUB(integer) when CPU doesn't support it");

	// Gets encoded as a double register
	Vd = SubBase(Vd);
	Vn = SubBase(Vn);
	Vm = SubBase(Vm);

	// The high bit of Vm (0x10) belongs in M (bit 5), hence "<< 1". Shifting
	// by 2 would merge it into the already-set Q bit and lose it, so any Vm
	// whose encoded index is 16 or above would be emitted as index - 16.
	Write32((0xF3 << 24) | ((Vd & 0x10) << 18) | (encodedSize(Size) << 20) | ((Vn & 0xF) << 16) \
		| ((Vd & 0xF) << 12) | (0x8 << 8) | ((Vn & 0x10) << 3) | (1 << 6) \
		| ((Vm & 0x10) << 1) | (Vm & 0xF));
}
|
||||||
|
|
||||||
|
// Emits a NEON VLD1 that loads into a single register.
//   Size  - element type; encodedSize(Size) is placed in bits [7:6].
//   Vd    - destination register (rebased with SubBase before encoding).
//   Rn    - register placed in bits [19:16].
//   align - alignment hint, placed in bits [5:4].
//   Rm    - register placed in bits [3:0].
void NEONXEmitter::VLD1(NEONElementType Size, ARMReg Vd, ARMReg Rn, NEONAlignment align, ARMReg Rm)
{
	// Only support loading to 1 reg
	const u32 reg_layout = 0x7;

	// The destination is encoded as a double register.
	Vd = SubBase(Vd);

	u32 instruction = (0xF4 << 24) | (1 << 21);
	instruction |= ((Vd & 0x10) << 18) | ((Vd & 0xF) << 12);
	instruction |= (Rn << 16) | Rm;
	instruction |= reg_layout << 8;
	instruction |= encodedSize(Size) << 6;
	instruction |= align << 4;
	Write32(instruction);
}
|
||||||
|
|
||||||
|
// Emits a NEON VLD2 using the single-spaced register layout.
//   Size  - element type; encodedSize(Size) is placed in bits [7:6].
//   Vd    - first destination register (rebased with SubBase before encoding).
//   Rn    - register placed in bits [19:16].
//   align - alignment hint, placed in bits [5:4].
//   Rm    - register placed in bits [3:0].
void NEONXEmitter::VLD2(NEONElementType Size, ARMReg Vd, ARMReg Rn, NEONAlignment align, ARMReg Rm)
{
	// Single spaced registers
	const u32 reg_layout = 0x8;

	// The destination is encoded as a double register.
	Vd = SubBase(Vd);

	u32 instruction = (0xF4 << 24) | (1 << 21);
	instruction |= ((Vd & 0x10) << 18) | ((Vd & 0xF) << 12);
	instruction |= (Rn << 16) | Rm;
	instruction |= reg_layout << 8;
	instruction |= encodedSize(Size) << 6;
	instruction |= align << 4;
	Write32(instruction);
}
|
||||||
|
|
||||||
|
// Shared encoder for the NEON VREV16 / VREV32 / VREV64 instructions.
//   size - value placed in bits [8:7], the instruction's "op" field. In the
//          ARM encoding 0 selects VREV64, 1 selects VREV32, 2 selects VREV16.
//   Size - element type; encodedSize(Size) is placed in bits [19:18].
//   Vd   - destination register. A register >= Q0 sets the Q bit (bit 6).
//   Vm   - source register.
void NEONXEmitter::VREVX(u32 size, NEONElementType Size, ARMReg Vd, ARMReg Vm)
{
	// Has to be decided before SubBase() rewrites Vd below.
	bool register_quad = Vd >= Q0;

	// Both operands are encoded as double registers.
	Vd = SubBase(Vd);
	Vm = SubBase(Vm);

	// The high bit of Vm (0x10) must land in M (bit 5), hence "<< 1"; shifting
	// by 2 would overwrite the Q bit and drop the register's top bit.
	Write32((0xF3 << 24) | (1 << 23) | ((Vd & 0x10) << 18) | (0x3 << 20)
		| (encodedSize(Size) << 18) | ((Vd & 0xF) << 12) | (size << 7)
		| (register_quad << 6) | ((Vm & 0x10) << 1) | (Vm & 0xF));
}
|
||||||
|
|
||||||
|
// Emits VREV64: reverses the order of Size-sized elements within each
// 64-bit doubleword of Vm and writes the result to Vd.
void NEONXEmitter::VREV64(NEONElementType Size, ARMReg Vd, ARMReg Vm)
{
	// The op field (bits [8:7]) is 0b00 for VREV64; 0b10 would encode VREV16.
	VREVX(0, Size, Vd, Vm);
}
|
||||||
|
|
||||||
|
// Emits VREV32 from Vm into Vd by delegating to VREVX with selector value 1.
//   Size - element type forwarded unchanged to VREVX.
void NEONXEmitter::VREV32(NEONElementType Size, ARMReg Vd, ARMReg Vm)
{
	const u32 op_selector = 1;
	VREVX(op_selector, Size, Vd, Vm);
}
|
||||||
|
|
||||||
|
// Emits VREV16: reverses the order of Size-sized elements within each
// 16-bit halfword of Vm and writes the result to Vd.
void NEONXEmitter::VREV16(NEONElementType Size, ARMReg Vd, ARMReg Vm)
{
	// The op field (bits [8:7]) is 0b10 for VREV16; 0b00 would encode VREV64.
	VREVX(2, Size, Vd, Vm);
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
|
@ -104,13 +104,6 @@ enum ShiftType
|
||||||
ST_ROR = 3,
|
ST_ROR = 3,
|
||||||
ST_RRX = 4
|
ST_RRX = 4
|
||||||
};
|
};
|
||||||
// Integer element sizes. The numeric value of each enumerator is shifted
// directly into the size field of the instruction by the emitter.
enum IntegerSize
{
	I_I8  = 0, // 8-bit elements
	I_I16 = 1, // 16-bit elements
	I_I32 = 2, // 32-bit elements
	I_I64 = 3  // 64-bit elements
};
|
|
||||||
|
|
||||||
enum
|
enum
|
||||||
{
|
{
|
||||||
|
@ -349,6 +342,7 @@ typedef const u8* JumpTarget;
|
||||||
class ARMXEmitter
|
class ARMXEmitter
|
||||||
{
|
{
|
||||||
friend struct OpArg; // for Write8 etc
|
friend struct OpArg; // for Write8 etc
|
||||||
|
friend class NEONXEmitter;
|
||||||
private:
|
private:
|
||||||
u8 *code, *startcode;
|
u8 *code, *startcode;
|
||||||
u8 *lastCacheFlushEnd;
|
u8 *lastCacheFlushEnd;
|
||||||
|
@ -533,11 +527,7 @@ public:
|
||||||
|
|
||||||
// Subtracts the base from the register to give us the real one
|
// Subtracts the base from the register to give us the real one
|
||||||
ARMReg SubBase(ARMReg Reg);
|
ARMReg SubBase(ARMReg Reg);
|
||||||
// NEON Only
|
|
||||||
void VABD(IntegerSize Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
|
|
||||||
void VADD(IntegerSize Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
|
|
||||||
void VSUB(IntegerSize Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
|
|
||||||
|
|
||||||
// VFP Only
|
// VFP Only
|
||||||
void VLDR(ARMReg Dest, ARMReg Base, s16 offset);
|
void VLDR(ARMReg Dest, ARMReg Base, s16 offset);
|
||||||
void VSTR(ARMReg Src, ARMReg Base, s16 offset);
|
void VSTR(ARMReg Src, ARMReg Base, s16 offset);
|
||||||
|
@ -584,6 +574,65 @@ public:
|
||||||
|
|
||||||
}; // class ARMXEmitter
|
}; // class ARMXEmitter
|
||||||
|
|
||||||
|
// Bit flags describing the element type of a NEON operation. Each enumerator
// occupies its own bit so that values can be tested with a bitwise AND.
enum NEONElementType
{
	I_8        = 0x01, // 8-bit integer elements
	I_16       = 0x02, // 16-bit integer elements
	I_32       = 0x04, // 32-bit integer elements
	I_64       = 0x08, // 64-bit integer elements
	I_SIGNED   = 0x10,
	I_UNSIGNED = 0x20,
	F_32       = 0x40  // 32-bit floating-point elements
};
|
||||||
|
|
||||||
|
// Alignment hint for NEON loads. The numeric value is what the emitter
// shifts into the instruction's align field.
enum NEONAlignment
{
	ALIGN_NONE = 0, // no alignment hint
	ALIGN_64,       // = 1
	ALIGN_128,      // = 2
	ALIGN_256       // = 3
};
|
||||||
|
|
||||||
|
|
||||||
|
class NEONXEmitter
|
||||||
|
{
|
||||||
|
private:
|
||||||
|
ARMXEmitter *_emit;
|
||||||
|
ARMReg SubBase(ARMReg Reg) { return _emit->SubBase(Reg); }
|
||||||
|
inline void Write32(u32 value) { _emit->Write32(value); }
|
||||||
|
|
||||||
|
inline u32 encodedSize(u32 value)
|
||||||
|
{
|
||||||
|
if (value & I_8)
|
||||||
|
return 0;
|
||||||
|
else if (value & I_16)
|
||||||
|
return 1;
|
||||||
|
else if (value & I_32)
|
||||||
|
return 2;
|
||||||
|
else if (value & I_64)
|
||||||
|
return 3;
|
||||||
|
else
|
||||||
|
_dbg_assert_msg_(DYNA_REC, false, "Passed invalid size to integer NEON instruction");
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
void VREVX(u32 size, NEONElementType Size, ARMReg Vd, ARMReg Vm);
|
||||||
|
|
||||||
|
public:
|
||||||
|
NEONXEmitter(ARMXEmitter *emit)
|
||||||
|
: _emit(emit)
|
||||||
|
{}
|
||||||
|
|
||||||
|
void VABD(NEONElementType Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
|
||||||
|
void VADD(NEONElementType Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
|
||||||
|
void VSUB(NEONElementType Size, ARMReg Vd, ARMReg Vn, ARMReg Vm);
|
||||||
|
void VREV64(NEONElementType Size, ARMReg Vd, ARMReg Vm);
|
||||||
|
void VREV32(NEONElementType Size, ARMReg Vd, ARMReg Vm);
|
||||||
|
void VREV16(NEONElementType Size, ARMReg Vd, ARMReg Vm);
|
||||||
|
|
||||||
|
void VLD1(NEONElementType Size, ARMReg Vd, ARMReg Rn, NEONAlignment align = ALIGN_NONE, ARMReg Rm = _PC);
|
||||||
|
void VLD2(NEONElementType Size, ARMReg Vd, ARMReg Rn, NEONAlignment align = ALIGN_NONE, ARMReg Rm = _PC);
|
||||||
|
};
|
||||||
|
|
||||||
// Everything that needs to generate X86 code should inherit from this.
|
// Everything that needs to generate X86 code should inherit from this.
|
||||||
// You get memory management for free, plus, you can use all the MOV etc functions without
|
// You get memory management for free, plus, you can use all the MOV etc functions without
|
||||||
|
|
|
@ -467,7 +467,8 @@ const u8* JitArm::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitBlo
|
||||||
MOVI2R(RB, (u32)&One);
|
MOVI2R(RB, (u32)&One);
|
||||||
VLDR(VA, RA, 0);
|
VLDR(VA, RA, 0);
|
||||||
VLDR(VB, RB, 0);
|
VLDR(VB, RB, 0);
|
||||||
VADD(I_I64, VA, VA, VB);
|
NEONXEmitter nemit(this);
|
||||||
|
nemit.VADD(I_64, VA, VA, VB);
|
||||||
VSTR(VA, RA, 0);
|
VSTR(VA, RA, 0);
|
||||||
gpr.Unlock(RA, RB);
|
gpr.Unlock(RA, RB);
|
||||||
fpr.Unlock(VA);
|
fpr.Unlock(VA);
|
||||||
|
|
|
@ -157,13 +157,9 @@ void JitArmAsmRoutineManager::GenerateCommon()
|
||||||
MOVI2R(R14, (u32)Memory::base);
|
MOVI2R(R14, (u32)Memory::base);
|
||||||
ADD(R10, R10, R14);
|
ADD(R10, R10, R14);
|
||||||
|
|
||||||
LDR(R12, R10);
|
NEONXEmitter nemit(this);
|
||||||
REV(R12, R12);
|
nemit.VLD1(I_32, D0, R10);
|
||||||
VMOV(S0, R12);
|
nemit.VREV32(I_8, D0, D0);
|
||||||
|
|
||||||
LDR(R12, R10, 4);
|
|
||||||
REV(R12, R12);
|
|
||||||
VMOV(S1, R12);
|
|
||||||
|
|
||||||
POP(2, R12, _PC);
|
POP(2, R12, _PC);
|
||||||
const u8* loadPairedFloatOne = GetCodePtr();
|
const u8* loadPairedFloatOne = GetCodePtr();
|
||||||
|
|
Loading…
Reference in New Issue