From 4fa23abbe158f7182307fb36063dc6fcac5f8cca Mon Sep 17 00:00:00 2001
From: Ryan Houdek
Date: Sun, 23 Aug 2015 15:34:53 -0500
Subject: [PATCH 1/2] [AArch64] Implement MOVI and BIC(imm) in the NEON
 emitter.

---
Review notes (text between '---' and the diffstat is ignored by `git am`):
- Subject said ORR(imm) but the patch implements BIC (op=1); subject fixed.
- Per the ARM ARM AdvSIMD modified-immediate table: 8bit MOVI needs
  cmode=0b1110 and 16bit MOVI needs cmode=0b10x0; with cmode left at 0 the
  instruction encoded as a 32-bit shifted MOVI instead.
- 16bit BIC likewise needs cmode=0b10x1. Without the fix, fctiwzx's mask in
  patch 2 (MOVI 0xFFFF000000000000 then BIC(16, 0x7)) stayed at
  0xFFFF000000000000 instead of the intended 0xFFF8000000000000.

 Source/Core/Common/Arm64Emitter.cpp | 104 +++++++++++++++++++++++++++++
 Source/Core/Common/Arm64Emitter.h   |   5 ++
 2 files changed, 109 insertions(+)

diff --git a/Source/Core/Common/Arm64Emitter.cpp b/Source/Core/Common/Arm64Emitter.cpp
index a94b2ee11f..9069bedeb7 100644
--- a/Source/Core/Common/Arm64Emitter.cpp
+++ b/Source/Core/Common/Arm64Emitter.cpp
@@ -2505,6 +2505,23 @@ void ARM64FloatEmitter::EncodeLoadStoreRegisterOffset(u32 size, bool load, ARM64
 		Rm.GetData() | (1 << 11) | (Rn << 5) | Rt);
 }
 
+void ARM64FloatEmitter::EncodeModImm(bool Q, u8 op, u8 cmode, u8 o2, ARM64Reg Rd, u8 abcdefgh)
+{
+	union
+	{
+		u8 hex;
+		struct
+		{
+			unsigned defgh : 5;
+			unsigned abc : 3;
+		};
+	} v;
+	v.hex = abcdefgh;
+	Rd = DecodeReg(Rd);
+	Write32((Q << 30) | (op << 29) | (0xF << 24) | (v.abc << 16) | (cmode << 12) | \
+		(o2 << 11) | (1 << 10) | (v.defgh << 5) | Rd);
+}
+
 void ARM64FloatEmitter::LDR(u8 size, IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm)
 {
 	EmitLoadStoreImmediate(size, 1, type, Rt, Rn, imm);
@@ -3630,6 +3647,93 @@ void ARM64FloatEmitter::FMLA(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, u8
 	EmitVectorxElement(0, 2 | (size >> 6), L, 1, H, Rd, Rn, Rm);
 }
 
+// Modified Immediate
+void ARM64FloatEmitter::MOVI(u8 size, ARM64Reg Rd, u64 imm, u8 shift)
+{
+	bool Q = IsQuad(Rd);
+	u8 cmode = 0;
+	u8 op = 0;
+	u8 abcdefgh = imm & 0xFF;
+	if (size == 8)
+	{
+		_assert_msg_(DYNA_REC, shift == 0, "%s(size8) doesn't support shift!", __FUNCTION__);
+		_assert_msg_(DYNA_REC, !(imm & ~0xFFULL), "%s(size8) only supports 8bit values!", __FUNCTION__);
+		cmode = 0xE; // 8bit MOVI requires cmode = 0b1110
+	}
+	else if (size == 16)
+	{
+		_assert_msg_(DYNA_REC, shift == 0 || shift == 8, "%s(size16) only supports shift of {0, 8}!", __FUNCTION__);
+		_assert_msg_(DYNA_REC, !(imm & ~0xFFULL), "%s(size16) only supports 8bit values!", __FUNCTION__);
+		cmode = 8; // 16bit MOVI requires cmode = 0b10x0
+
+		if (shift == 8)
+			cmode |= 2;
+	}
+	else if (size == 32)
+	{
+		_assert_msg_(DYNA_REC,
+			shift == 0 || shift == 8 || shift == 16 || shift == 24,
+			"%s(size32) only supports shift of {0, 8, 16, 24}!", __FUNCTION__);
+		// XXX: Implement support for MOVI - shifting ones variant
+		_assert_msg_(DYNA_REC, !(imm & ~0xFFULL), "%s(size32) only supports 8bit values!", __FUNCTION__);
+		switch (shift)
+		{
+		case 8: cmode |= 2; break;
+		case 16: cmode |= 4; break;
+		case 24: cmode |= 6; break;
+		default: break;
+		}
+	}
+	else // 64
+	{
+		_assert_msg_(DYNA_REC, shift == 0, "%s(size64) doesn't support shift!", __FUNCTION__);
+
+		op = 1;
+		cmode = 0xE;
+		abcdefgh = 0;
+		for (int i = 0; i < 8; ++i)
+		{
+			u8 tmp = (imm >> (i << 3)) & 0xFF;
+			_assert_msg_(DYNA_REC, tmp == 0xFF || tmp == 0, "%s(size64) Invalid immediate!", __FUNCTION__);
+			if (tmp == 0xFF)
+				abcdefgh |= (1 << i);
+		}
+	}
+	EncodeModImm(Q, op, cmode, 0, Rd, abcdefgh);
+}
+
+void ARM64FloatEmitter::BIC(u8 size, ARM64Reg Rd, u8 imm, u8 shift)
+{
+	bool Q = IsQuad(Rd);
+	u8 cmode = 1;
+	u8 op = 1;
+	if (size == 16)
+	{
+		_assert_msg_(DYNA_REC, shift == 0 || shift == 8, "%s(size16) only supports shift of {0, 8}!", __FUNCTION__);
+		cmode |= 8; // 16bit BIC requires cmode = 0b10x1
+
+		if (shift == 8)
+			cmode |= 2;
+	}
+	else if (size == 32)
+	{
+		_assert_msg_(DYNA_REC,
+			shift == 0 || shift == 8 || shift == 16 || shift == 24,
+			"%s(size32) only supports shift of {0, 8, 16, 24}!", __FUNCTION__);
+		// Note: the MSL (shifting ones) cmode encodings do not exist for BIC
+		switch (shift)
+		{
+		case 8: cmode |= 2; break;
+		case 16: cmode |= 4; break;
+		case 24: cmode |= 6; break;
+		default: break;
+		}
+	}
+	else
+		_assert_msg_(DYNA_REC, false, "%s only supports size of {16, 32}!", __FUNCTION__);
+	EncodeModImm(Q, op, cmode, 0, Rd, imm);
+}
+
 void ARM64FloatEmitter::ABI_PushRegisters(BitSet32 registers, ARM64Reg tmp)
 {
 	bool bundled_loadstore = false;
diff --git a/Source/Core/Common/Arm64Emitter.h b/Source/Core/Common/Arm64Emitter.h
index ec3da4cf64..0133d7f0d4 100644
--- a/Source/Core/Common/Arm64Emitter.h
+++ b/Source/Core/Common/Arm64Emitter.h
@@ -901,6 +901,10 @@ public:
 	void FMUL(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, u8 index);
 	void FMLA(u8 esize, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, u8 index);
 
+	// Modified Immediate
+	void MOVI(u8 size, ARM64Reg Rd, u64 imm, u8 shift = 0);
+	void BIC(u8 size, ARM64Reg Rd, u8 imm, u8 shift = 0);
+
 	void MOVI2F(ARM64Reg Rd, float value, ARM64Reg scratch = INVALID_REG, bool negate = false);
 	void MOVI2FDUP(ARM64Reg Rd, float value, ARM64Reg scratch = INVALID_REG);
 
@@ -938,5 +942,6 @@ private:
 	void EmitScalar3Source(bool isDouble, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra, int opcode);
 	void EncodeLoadStorePair(u32 size, bool load, IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm);
 	void EncodeLoadStoreRegisterOffset(u32 size, bool load, ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm);
+	void EncodeModImm(bool Q, u8 op, u8 cmode, u8 o2, ARM64Reg Rd, u8 abcdefgh);
 	void SSHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift, bool upper);
 	void USHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift, bool upper);

From 561744819e534e5d3981f8e1835ecbb2ba7c96c8 Mon Sep 17 00:00:00 2001
From: Ryan Houdek
Date: Sun, 23 Aug 2015 15:35:18 -0500
Subject: [PATCH 2/2] [AArch64] Implement fctiwzx

Improves the povray benchmark time by 5.6%
---
 Source/Core/Core/PowerPC/JitArm64/Jit.h         |  1 +
 .../JitArm64/JitArm64_FloatingPoint.cpp         | 35 +++++++++++++++++++
 .../Core/PowerPC/JitArm64/JitArm64_Tables.cpp   |  2 +-
 3 files changed, 37 insertions(+), 1 deletion(-)

diff --git a/Source/Core/Core/PowerPC/JitArm64/Jit.h b/Source/Core/Core/PowerPC/JitArm64/Jit.h
index 2a851e3339..b403d54103 100644
--- a/Source/Core/Core/PowerPC/JitArm64/Jit.h
+++ b/Source/Core/Core/PowerPC/JitArm64/Jit.h
@@ -147,6 +147,7 @@ public:
 	void fsubx(UGeckoInstruction inst);
 	void fcmpx(UGeckoInstruction inst);
 	void frspx(UGeckoInstruction inst);
+	void fctiwzx(UGeckoInstruction inst);
 
 	// Paired
 	void ps_abs(UGeckoInstruction inst);
diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp
index 690adcc983..c6c00fad3c 100644
--- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_FloatingPoint.cpp
@@ -558,3 +558,38 @@ void JitArm64::fcmpx(UGeckoInstruction inst)
 
 	gpr.Unlock(WA);
 }
+
+void JitArm64::fctiwzx(UGeckoInstruction inst)
+{
+	INSTRUCTION_START
+	JITDISABLE(bJITFloatingPointOff);
+	FALLBACK_IF(inst.Rc);
+
+	u32 b = inst.FB, d = inst.FD;
+	fpr.BindToRegister(d, d == b);
+
+	ARM64Reg VB = fpr.R(b);
+	ARM64Reg VD = fpr.R(d);
+
+	ARM64Reg V0 = fpr.GetReg();
+	// Generate 0xFFF8000000000000ULL
+	m_float_emit.MOVI(64, EncodeRegToDouble(V0), 0xFFFF000000000000ULL);
+	m_float_emit.BIC(16, EncodeRegToDouble(V0), 0x7);
+
+	if (fpr.IsLower(d))
+	{
+		m_float_emit.FCVTN(32, EncodeRegToDouble(VD), EncodeRegToDouble(VB));
+		m_float_emit.FCVTS(EncodeRegToSingle(VD), EncodeRegToSingle(VD), ROUND_Z);
+		m_float_emit.ORR(EncodeRegToDouble(VD), EncodeRegToDouble(VD), EncodeRegToDouble(V0));
+	}
+	else
+	{
+		ARM64Reg V1 = fpr.GetReg();
+		m_float_emit.FCVTN(32, EncodeRegToDouble(V1), EncodeRegToDouble(VB));
+		m_float_emit.FCVTS(EncodeRegToSingle(V1), EncodeRegToSingle(V1), ROUND_Z);
+		m_float_emit.ORR(EncodeRegToDouble(V1), EncodeRegToDouble(V1), EncodeRegToDouble(V0));
+		m_float_emit.INS(64, VD, 0, V1, 0);
+		fpr.Unlock(V1);
+	}
+	fpr.Unlock(V0);
+}
diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Tables.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Tables.cpp
index a2c71acf91..153725a426 100644
--- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_Tables.cpp
+++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_Tables.cpp
@@ -330,7 +330,7 @@ static GekkoOPTemplate table63[] =
 	{32, &JitArm64::fcmpx},    // fcmpo
 	{0, &JitArm64::fcmpx},     // fcmpu
 	{14, &JitArm64::FallBackToInterpreter}, // fctiwx
-	{15, &JitArm64::FallBackToInterpreter}, // fctiwzx
+	{15, &JitArm64::fctiwzx},               // fctiwzx
 	{72, &JitArm64::fmrx},     // fmrx
 	{136, &JitArm64::fnabsx},  // fnabsx
 	{40, &JitArm64::fnegx},    // fnegx