From 57c66798fc533d84534c23757ba325c5598798f3 Mon Sep 17 00:00:00 2001 From: Ryan Houdek Date: Thu, 12 Feb 2015 18:41:52 -0600 Subject: [PATCH 01/12] [Android] Register panic alert handler. This lets me see _assert_msg_ alerts on Android when passing dumb arguments to my AArch64 emitter. --- Source/Core/DolphinWX/MainAndroid.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/Source/Core/DolphinWX/MainAndroid.cpp b/Source/Core/DolphinWX/MainAndroid.cpp index 0d9d4428ce..89ebc8837d 100644 --- a/Source/Core/DolphinWX/MainAndroid.cpp +++ b/Source/Core/DolphinWX/MainAndroid.cpp @@ -105,6 +105,12 @@ void Host_SetWiiMoteConnectionState(int _State) {} void Host_ShowVideoConfig(void*, const std::string&, const std::string&) {} +static bool MsgAlert(const char* caption, const char* text, bool /*yes_no*/, int /*Style*/) +{ + __android_log_print(ANDROID_LOG_INFO, DOLPHIN_TAG, "%s:%s", caption, text); + return false; +} + #define DVD_BANNER_WIDTH 96 #define DVD_BANNER_HEIGHT 32 std::vector m_volume_names; @@ -344,6 +350,8 @@ JNIEXPORT void JNICALL Java_org_dolphinemu_dolphinemu_NativeLibrary_Run(JNIEnv * OSD::AddCallback(OSD::OSD_INIT, ButtonManager::Init); OSD::AddCallback(OSD::OSD_SHUTDOWN, ButtonManager::Shutdown); + RegisterMsgAlertHandler(&MsgAlert); + UICommon::Init(); // No use running the loop when booting fails From 20dae1f2107af4b7763eb8f6436964859565bc61 Mon Sep 17 00:00:00 2001 From: Ryan Houdek Date: Thu, 12 Feb 2015 18:43:50 -0600 Subject: [PATCH 02/12] [AArch64] Fix a bunch of emitter asserts. Since I've added the msg handler. I found all these asserts that were backwards. So they were asserting on the correct arguments. --- Source/Core/Common/Arm64Emitter.cpp | 63 ++++++++++++++--------------- 1 file changed, 30 insertions(+), 33 deletions(-) diff --git a/Source/Core/Common/Arm64Emitter.cpp b/Source/Core/Common/Arm64Emitter.cpp index d8c315016c..57ad0b3215 100644 --- a/Source/Core/Common/Arm64Emitter.cpp +++ b/Source/Core/Common/Arm64Emitter.cpp @@ -188,7 +188,7 @@ static u32 LoadStoreExcEnc[][5] = { void ARM64XEmitter::EncodeCompareBranchInst(u32 op, ARM64Reg Rt, const void* ptr) { bool b64Bit = Is64Bit(Rt); - s64 distance = (s64)ptr - (s64(m_code) + 8); + s64 distance = (s64)ptr - (s64)m_code; _assert_msg_(DYNA_REC, !(distance & 0x3), "%s: distance must be a multiple of 4: %lx", __FUNCTION__, distance); @@ -198,13 +198,13 @@ void ARM64XEmitter::EncodeCompareBranchInst(u32 op, ARM64Reg Rt, const void* ptr Rt = DecodeReg(Rt); Write32((b64Bit << 31) | (0x34 << 24) | (op << 24) | \ - (distance << 5) | Rt); + (((u32)distance << 5) & 0xFFFFE0) | Rt); } void ARM64XEmitter::EncodeTestBranchInst(u32 op, ARM64Reg Rt, u8 bits, const void* ptr) { bool b64Bit = Is64Bit(Rt); - s64 distance = (s64)ptr - (s64(m_code) + 8); + s64 distance = (s64)ptr - (s64)m_code; _assert_msg_(DYNA_REC, !(distance & 0x3), "%s: distance must be a multiple of 4: %lx", __FUNCTION__, distance); @@ -414,7 +414,7 @@ void ARM64XEmitter::EncodeLoadStoreIndexedInst(u32 op, u32 op2, ARM64Reg Rt, ARM u32 offset = imm & 0x1FF; - _assert_msg_(DYNA_REC, imm < -256 || imm > 255, "%s: offset too large %d", __FUNCTION__, imm); + _assert_msg_(DYNA_REC, !(imm < -256 || imm > 255), "%s: offset too large %d", __FUNCTION__, imm); Rt = DecodeReg(Rt); Rn = DecodeReg(Rn); @@ -433,7 +433,7 @@ void ARM64XEmitter::EncodeLoadStoreIndexedInst(u32 op, ARM64Reg Rt, ARM64Reg Rn, else if (size == 16) imm >>= 1; - _assert_msg_(DYNA_REC, imm < 0, "%s(INDEX_UNSIGNED): offset must be positive", __FUNCTION__); + _assert_msg_(DYNA_REC, imm >= 0, 
"%s(INDEX_UNSIGNED): offset must be positive %d", __FUNCTION__, imm); _assert_msg_(DYNA_REC, !(imm & ~0xFFF), "%s(INDEX_UNSIGNED): offset too large %d", __FUNCTION__, imm); Rt = DecodeReg(Rt); @@ -557,7 +557,7 @@ void ARM64XEmitter::SetJumpTarget(FixupBranch const& branch) _assert_msg_(DYNA_REC, distance >= -0xFFFFF && distance < 0xFFFFF, "%s(%d): Received too large distance: %lx", __FUNCTION__, branch.type, distance); bool b64Bit = Is64Bit(branch.reg); ARM64Reg reg = DecodeReg(branch.reg); - inst = (b64Bit << 31) | (0x1A << 25) | (Not << 24) | (distance << 5) | reg; + inst = (b64Bit << 31) | (0x1A << 25) | (Not << 24) | ((distance << 5) & 0xFFFFE0) | reg; } break; case 2: // B (conditional) @@ -1437,9 +1437,6 @@ void ARM64XEmitter::MOVI2R(ARM64Reg Rd, u64 imm, bool optimize) BitSet32 upload_part(0); bool need_movz = false; - if (!Is64Bit(Rd)) - _assert_msg_(DYNA_REC, !(imm >> 32), "%s: immediate doesn't fit in 32bit register: %lx", __FUNCTION__, imm); - if (!imm) { // Zero immediate, just clear the register @@ -1640,8 +1637,8 @@ void ARM64FloatEmitter::EmitLoadStoreImmediate(u8 size, u32 opc, IndexType type, if (type == INDEX_UNSIGNED) { - _assert_msg_(DYNA_REC, imm & (size - 1), "%s(INDEX_UNSIGNED) immediate offset must be aligned to size!", __FUNCTION__); - _assert_msg_(DYNA_REC, imm < 0, "%s(INDEX_UNSIGNED) immediate offset must be positive!", __FUNCTION__); + _assert_msg_(DYNA_REC, !(imm & ((size - 1) >> 3)), "%s(INDEX_UNSIGNED) immediate offset must be aligned to size!", __FUNCTION__); + _assert_msg_(DYNA_REC, imm >= 0, "%s(INDEX_UNSIGNED) immediate offset must be positive!", __FUNCTION__); if (size == 16) imm >>= 1; else if (size == 32) @@ -1654,7 +1651,7 @@ void ARM64FloatEmitter::EmitLoadStoreImmediate(u8 size, u32 opc, IndexType type, } else { - _assert_msg_(DYNA_REC, imm < -256 || imm > 255, "%s immediate offset must be within range of -256 to 256!", __FUNCTION__); + _assert_msg_(DYNA_REC, !(imm < -256 || imm > 255), "%s immediate offset must be within range of -256 to 256!", __FUNCTION__); encoded_imm = (imm & 0x1FF) << 2; if (type == INDEX_POST) encoded_imm |= 1; @@ -1668,7 +1665,7 @@ void ARM64FloatEmitter::EmitLoadStoreImmediate(u8 size, u32 opc, IndexType type, void ARM64FloatEmitter::Emit2Source(bool M, bool S, u32 type, u32 opcode, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { - _assert_msg_(DYNA_REC, IsQuad(Rd), "%s only supports double and single registers!", __FUNCTION__); + _assert_msg_(DYNA_REC, !IsQuad(Rd), "%s only supports double and single registers!", __FUNCTION__); Rd = DecodeReg(Rd); Rn = DecodeReg(Rn); Rm = DecodeReg(Rm); @@ -1679,7 +1676,7 @@ void ARM64FloatEmitter::Emit2Source(bool M, bool S, u32 type, u32 opcode, ARM64R void ARM64FloatEmitter::EmitThreeSame(bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { - _assert_msg_(DYNA_REC, IsSingle(Rd), "%s doesn't support singles!", __FUNCTION__); + _assert_msg_(DYNA_REC, !IsSingle(Rd), "%s doesn't support singles!", __FUNCTION__); bool quad = IsQuad(Rd); Rd = DecodeReg(Rd); Rn = DecodeReg(Rn); @@ -1700,7 +1697,7 @@ void ARM64FloatEmitter::EmitCopy(bool Q, u32 op, u32 imm5, u32 imm4, ARM64Reg Rd void ARM64FloatEmitter::Emit2RegMisc(bool U, u32 size, u32 opcode, ARM64Reg Rd, ARM64Reg Rn) { - _assert_msg_(DYNA_REC, IsSingle(Rd), "%s doesn't support singles!", __FUNCTION__); + _assert_msg_(DYNA_REC, !IsSingle(Rd), "%s doesn't support singles!", __FUNCTION__); bool quad = IsQuad(Rd); Rd = DecodeReg(Rd); Rn = DecodeReg(Rn); @@ -1711,7 +1708,7 @@ void ARM64FloatEmitter::Emit2RegMisc(bool U, u32 size, u32 
opcode, ARM64Reg Rd, void ARM64FloatEmitter::EmitLoadStoreSingleStructure(bool L, bool R, u32 opcode, bool S, u32 size, ARM64Reg Rt, ARM64Reg Rn) { - _assert_msg_(DYNA_REC, IsSingle(Rt), "%s doesn't support singles!", __FUNCTION__); + _assert_msg_(DYNA_REC, !IsSingle(Rt), "%s doesn't support singles!", __FUNCTION__); bool quad = IsQuad(Rt); Rt = DecodeReg(Rt); Rn = DecodeReg(Rn); @@ -1722,7 +1719,7 @@ void ARM64FloatEmitter::EmitLoadStoreSingleStructure(bool L, bool R, u32 opcode, void ARM64FloatEmitter::EmitLoadStoreSingleStructure(bool L, bool R, u32 opcode, bool S, u32 size, ARM64Reg Rt, ARM64Reg Rn, ARM64Reg Rm) { - _assert_msg_(DYNA_REC, IsSingle(Rt), "%s doesn't support singles!", __FUNCTION__); + _assert_msg_(DYNA_REC, !IsSingle(Rt), "%s doesn't support singles!", __FUNCTION__); bool quad = IsQuad(Rt); Rt = DecodeReg(Rt); Rn = DecodeReg(Rn); @@ -1734,7 +1731,7 @@ void ARM64FloatEmitter::EmitLoadStoreSingleStructure(bool L, bool R, u32 opcode, void ARM64FloatEmitter::Emit1Source(bool M, bool S, u32 type, u32 opcode, ARM64Reg Rd, ARM64Reg Rn) { - _assert_msg_(DYNA_REC, IsQuad(Rd), "%s doesn't support vector!", __FUNCTION__); + _assert_msg_(DYNA_REC, !IsQuad(Rd), "%s doesn't support vector!", __FUNCTION__); Rd = DecodeReg(Rd); Rn = DecodeReg(Rn); @@ -1744,7 +1741,7 @@ void ARM64FloatEmitter::Emit1Source(bool M, bool S, u32 type, u32 opcode, ARM64R void ARM64FloatEmitter::EmitConversion(bool sf, bool S, u32 type, u32 rmode, u32 opcode, ARM64Reg Rd, ARM64Reg Rn) { - _assert_msg_(DYNA_REC, !(Rn <= SP), "%s only supports GPR as source!", __FUNCTION__); + _assert_msg_(DYNA_REC, Rn <= SP, "%s only supports GPR as source!", __FUNCTION__); Rd = DecodeReg(Rd); Rn = DecodeReg(Rn); @@ -1754,7 +1751,7 @@ void ARM64FloatEmitter::EmitConversion(bool sf, bool S, u32 type, u32 rmode, u32 void ARM64FloatEmitter::EmitCompare(bool M, bool S, u32 op, u32 opcode2, ARM64Reg Rn, ARM64Reg Rm) { - _assert_msg_(DYNA_REC, IsQuad(Rn), "%s doesn't support vector!", __FUNCTION__); + _assert_msg_(DYNA_REC, !IsQuad(Rn), "%s doesn't support vector!", __FUNCTION__); bool is_double = IsDouble(Rn); Rn = DecodeReg(Rn); @@ -1766,7 +1763,7 @@ void ARM64FloatEmitter::EmitCompare(bool M, bool S, u32 op, u32 opcode2, ARM64Re void ARM64FloatEmitter::EmitCondSelect(bool M, bool S, CCFlags cond, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { - _assert_msg_(DYNA_REC, IsQuad(Rd), "%s doesn't support vector!", __FUNCTION__); + _assert_msg_(DYNA_REC, !IsQuad(Rd), "%s doesn't support vector!", __FUNCTION__); bool is_double = IsDouble(Rd); Rd = DecodeReg(Rd); @@ -1779,7 +1776,7 @@ void ARM64FloatEmitter::EmitCondSelect(bool M, bool S, CCFlags cond, ARM64Reg Rd void ARM64FloatEmitter::EmitPermute(u32 size, u32 op, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) { - _assert_msg_(DYNA_REC, IsSingle(Rd), "%s doesn't support singles!", __FUNCTION__); + _assert_msg_(DYNA_REC, !IsSingle(Rd), "%s doesn't support singles!", __FUNCTION__); bool quad = IsQuad(Rd); @@ -1801,7 +1798,7 @@ void ARM64FloatEmitter::EmitPermute(u32 size, u32 op, ARM64Reg Rd, ARM64Reg Rn, void ARM64FloatEmitter::EmitScalarImm(bool M, bool S, u32 type, u32 imm5, ARM64Reg Rd, u32 imm) { - _assert_msg_(DYNA_REC, IsQuad(Rd), "%s doesn't support vector!", __FUNCTION__); + _assert_msg_(DYNA_REC, !IsQuad(Rd), "%s doesn't support vector!", __FUNCTION__); bool is_double = !IsSingle(Rd); @@ -1815,7 +1812,7 @@ void ARM64FloatEmitter::EmitShiftImm(bool U, u32 immh, u32 immb, u32 opcode, ARM { bool quad = IsQuad(Rd); - _assert_msg_(DYNA_REC, !immh, "%s bad encoding! 
Can't have zero immh", __FUNCTION__); + _assert_msg_(DYNA_REC, immh, "%s bad encoding! Can't have zero immh", __FUNCTION__); Rd = DecodeReg(Rd); Rn = DecodeReg(Rn); @@ -1844,7 +1841,7 @@ void ARM64FloatEmitter::EmitLoadStoreMultipleStructure(u32 size, bool L, u32 opc void ARM64FloatEmitter::EmitScalar1Source(bool M, bool S, u32 type, u32 opcode, ARM64Reg Rd, ARM64Reg Rn) { - _assert_msg_(DYNA_REC, IsQuad(Rd), "%s doesn't support vector!", __FUNCTION__); + _assert_msg_(DYNA_REC, !IsQuad(Rd), "%s doesn't support vector!", __FUNCTION__); Rd = DecodeReg(Rd); Rn = DecodeReg(Rn); @@ -2083,7 +2080,7 @@ void ARM64FloatEmitter::ST1(u8 size, ARM64Reg Rt, u8 index, ARM64Reg Rn, ARM64Re // Loadstore multiple structure void ARM64FloatEmitter::LD1(u8 size, u8 count, ARM64Reg Rt, ARM64Reg Rn) { - _assert_msg_(DYNA_REC, count == 0 || count > 4, "%s must have a count of 1 to 4 registers!", __FUNCTION__); + _assert_msg_(DYNA_REC, !(count == 0 || count > 4), "%s must have a count of 1 to 4 registers!", __FUNCTION__); u32 opcode = 0; if (count == 1) opcode = 0b111; @@ -2317,8 +2314,8 @@ void ARM64FloatEmitter::INS(u8 size, ARM64Reg Rd, u8 index1, ARM64Reg Rn, u8 ind void ARM64FloatEmitter::UMOV(u8 size, ARM64Reg Rd, ARM64Reg Rn, u8 index) { bool b64Bit = Is64Bit(Rd); - _assert_msg_(DYNA_REC, Rd > SP, "%s destination must be a GPR!", __FUNCTION__); - _assert_msg_(DYNA_REC, b64Bit && size != 64, "%s must have a size of 64 when destination is 64bit!", __FUNCTION__); + _assert_msg_(DYNA_REC, Rd < SP, "%s destination must be a GPR!", __FUNCTION__); + _assert_msg_(DYNA_REC, !(b64Bit && size != 64), "%s must have a size of 64 when destination is 64bit!", __FUNCTION__); u32 imm5 = 0; if (size == 8) @@ -2347,8 +2344,8 @@ void ARM64FloatEmitter::UMOV(u8 size, ARM64Reg Rd, ARM64Reg Rn, u8 index) void ARM64FloatEmitter::SMOV(u8 size, ARM64Reg Rd, ARM64Reg Rn, u8 index) { bool b64Bit = Is64Bit(Rd); - _assert_msg_(DYNA_REC, Rd > SP, "%s destination must be a GPR!", __FUNCTION__); - _assert_msg_(DYNA_REC, size == 64, "%s doesn't support 64bit destination. Use UMOV!", __FUNCTION__); + _assert_msg_(DYNA_REC, Rd < SP, "%s destination must be a GPR!", __FUNCTION__); + _assert_msg_(DYNA_REC, size != 64, "%s doesn't support 64bit destination. 
Use UMOV!", __FUNCTION__); u32 imm5 = 0; if (size == 8) @@ -2513,7 +2510,7 @@ void ARM64FloatEmitter::ZIP2(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) // Shift by immediate void ARM64FloatEmitter::SSHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift) { - _assert_msg_(DYNA_REC, shift >= src_size, "%s shift amount must less than the element size!", __FUNCTION__); + _assert_msg_(DYNA_REC, shift < src_size, "%s shift amount must less than the element size!", __FUNCTION__); u32 immh = 0; u32 immb = shift & 0xFFF; @@ -2534,7 +2531,7 @@ void ARM64FloatEmitter::SSHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift) void ARM64FloatEmitter::USHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift) { - _assert_msg_(DYNA_REC, shift >= src_size, "%s shift amount must less than the element size!", __FUNCTION__); + _assert_msg_(DYNA_REC, shift < src_size, "%s shift amount must less than the element size!", __FUNCTION__); u32 immh = 0; u32 immb = shift & 0xFFF; @@ -2555,7 +2552,7 @@ void ARM64FloatEmitter::USHLL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift) void ARM64FloatEmitter::SHRN(u8 dest_size, ARM64Reg Rd, ARM64Reg Rn, u32 shift) { - _assert_msg_(DYNA_REC, shift >= dest_size, "%s shift amount must less than the element size!", __FUNCTION__); + _assert_msg_(DYNA_REC, shift < dest_size, "%s shift amount must less than the element size!", __FUNCTION__); u32 immh = 0; u32 immb = shift & 0xFFF; From 8074d062707330404c2fa4426e217a1dc5051c7c Mon Sep 17 00:00:00 2001 From: Ryan Houdek Date: Thu, 12 Feb 2015 18:45:06 -0600 Subject: [PATCH 03/12] [ARM] Fix poison memory functions. We were poisoning 4x more data and overrunning our buffers. We don't want to do this. --- Source/Core/Common/Arm64Emitter.h | 2 +- Source/Core/Common/ArmEmitter.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Source/Core/Common/Arm64Emitter.h b/Source/Core/Common/Arm64Emitter.h index 78052def11..62c0d6347d 100644 --- a/Source/Core/Common/Arm64Emitter.h +++ b/Source/Core/Common/Arm64Emitter.h @@ -756,7 +756,7 @@ private: void PoisonMemory() override { u32* ptr = (u32*)region; - u32* maxptr = (u32*)region + region_size; + u32* maxptr = (u32*)(region + region_size); // If our memory isn't a multiple of u32 then this won't write the last remaining bytes with anything // Less than optimal, but there would be nothing we could do but throw a runtime warning anyway. // AArch64: 0xD4200000 = BRK 0 diff --git a/Source/Core/Common/ArmEmitter.h b/Source/Core/Common/ArmEmitter.h index 1f9ee751b3..f04232d682 100644 --- a/Source/Core/Common/ArmEmitter.h +++ b/Source/Core/Common/ArmEmitter.h @@ -689,7 +689,7 @@ private: void PoisonMemory() override { u32* ptr = (u32*)region; - u32* maxptr = (u32*)region + region_size; + u32* maxptr = (u32*)(region + region_size); // If our memory isn't a multiple of u32 then this won't write the last remaining bytes with anything // Less than optimal, but there would be nothing we could do but throw a runtime warning anyway. // ARM: 0x01200070 = BKPT 0 From 0252bbb33f6288c74466c364e50fb77b0afed273 Mon Sep 17 00:00:00 2001 From: Ryan Houdek Date: Thu, 12 Feb 2015 18:46:20 -0600 Subject: [PATCH 04/12] [AArch64] Fix non-PCH build. 
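
A note on the poison-memory fix in the previous patch: the old code added region_size (a byte count) to a u32 pointer, and pointer arithmetic on u32* scales by sizeof(u32), so the end pointer landed 4x past the buffer — exactly the overrun described above. A standalone illustration (not Dolphin code; "buffer" stands in for the emitter's region pointer):

    #include <cstdint>
    #include <cstdio>

    int main()
    {
        // Pretend this is the emitter's code region: a byte buffer with a
        // byte-sized length, as in CodeBlock.
        uint8_t buffer[64];
        const unsigned region_size = 16;

        // Old form: arithmetic on u32* scales by sizeof(u32), so this ends
        // up 4 * region_size bytes past the start of the region.
        uint32_t* wrong = (uint32_t*)buffer + region_size;

        // Fixed form: do the byte addition first, then cast.
        uint32_t* right = (uint32_t*)(buffer + region_size);

        printf("wrong end: +%td bytes, right end: +%td bytes\n",
               (uint8_t*)wrong - buffer, (uint8_t*)right - buffer);
        return 0;
    }

Filling u32 words from (u32*)region up to the old maxptr therefore wrote four times the intended amount of poison data.
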
--- Source/Core/Common/Arm64Emitter.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Source/Core/Common/Arm64Emitter.h b/Source/Core/Common/Arm64Emitter.h index 62c0d6347d..b94e306612 100644 --- a/Source/Core/Common/Arm64Emitter.h +++ b/Source/Core/Common/Arm64Emitter.h @@ -4,6 +4,8 @@ #pragma once +#include + #include "Common/ArmCommon.h" #include "Common/BitSet.h" #include "Common/CodeBlock.h" From 814aaaf5381e79230beedc3a76a14ae2baf17857 Mon Sep 17 00:00:00 2001 From: Ryan Houdek Date: Thu, 12 Feb 2015 18:47:06 -0600 Subject: [PATCH 05/12] [AArch64] Implement a couple of emitter instructions. These will be used with the vertex loader JIT recompiler. --- Source/Core/Common/Arm64Emitter.cpp | 52 +++++++++++++++++++++++++++++ Source/Core/Common/Arm64Emitter.h | 10 ++++-- 2 files changed, 60 insertions(+), 2 deletions(-) diff --git a/Source/Core/Common/Arm64Emitter.cpp b/Source/Core/Common/Arm64Emitter.cpp index 57ad0b3215..741bdf8620 100644 --- a/Source/Core/Common/Arm64Emitter.cpp +++ b/Source/Core/Common/Arm64Emitter.cpp @@ -1006,6 +1006,10 @@ void ARM64XEmitter::SMADDL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra) { EncodeData3SrcInst(2, Rd, Rn, Rm, Ra); } +void ARM64XEmitter::SMULL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + SMADDL(Rd, Rn, Rm, SP); +} void ARM64XEmitter::SMSUBL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra) { EncodeData3SrcInst(3, Rd, Rn, Rm, Ra); @@ -1850,6 +1854,18 @@ void ARM64FloatEmitter::EmitScalar1Source(bool M, bool S, u32 type, u32 opcode, (opcode << 15) | (1 << 14) | (Rn << 5) | Rd); } +void ARM64FloatEmitter::EmitVectorxElement(bool U, u32 size, bool L, u32 opcode, bool H, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm) +{ + bool quad = IsQuad(Rd); + + Rd = DecodeReg(Rd); + Rn = DecodeReg(Rn); + Rm = DecodeReg(Rm); + + Write32((quad << 30) | (U << 29) | (0b01111 << 24) | (size << 22) | (L << 21) | \ + (Rm << 16) | (opcode << 12) | (H << 11) | (Rn << 5) | Rd); +} + void ARM64FloatEmitter::LDR(u8 size, IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm) { EmitLoadStoreImmediate(size, 1, type, Rt, Rn, imm); @@ -2092,6 +2108,21 @@ void ARM64FloatEmitter::LD1(u8 size, u8 count, ARM64Reg Rt, ARM64Reg Rn) opcode = 0b0010; EmitLoadStoreMultipleStructure(size, 1, opcode, Rt, Rn); } +void ARM64FloatEmitter::ST1(u8 size, u8 count, ARM64Reg Rt, ARM64Reg Rn) +{ + _assert_msg_(DYNA_REC, !(count == 0 || count > 4), "%s must have a count of 1 to 4 registers!", __FUNCTION__); + u32 opcode = 0; + if (count == 1) + opcode = 0b111; + else if (count == 2) + opcode = 0b1010; + else if (count == 3) + opcode = 0b0110; + else if (count == 4) + opcode = 0b0010; + EmitLoadStoreMultipleStructure(size, 0, opcode, Rt, Rn); +} + // Scalar - 1 Source void ARM64FloatEmitter::FABS(ARM64Reg Rd, ARM64Reg Rn) { @@ -2581,6 +2612,27 @@ void ARM64FloatEmitter::UXTL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn) USHLL(src_size, Rd, Rn, 0); } +// vector x indexed element +void ARM64FloatEmitter::FMUL(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, u8 index) +{ + _assert_msg_(DYNA_REC, size == 32 || size == 64, "%s only supports 32bit or 64bit size!", __FUNCTION__); + + bool L = false; + bool H = false; + + if (size == 32) + { + L = index & 1; + H = (index >> 1) & 1; + } + else if (size == 64) + { + H = index == 1; + } + + EmitVectorxElement(0, 2 | (size >> 6), L, 0b1001, H, Rd, Rn, Rm); +} + void ARM64FloatEmitter::ABI_PushRegisters(BitSet32 registers) { for (auto it : registers) diff --git a/Source/Core/Common/Arm64Emitter.h b/Source/Core/Common/Arm64Emitter.h index b94e306612..7bcbe4a45a 100644 --- 
a/Source/Core/Common/Arm64Emitter.h +++ b/Source/Core/Common/Arm64Emitter.h @@ -78,8 +78,8 @@ enum ARM64Reg }; inline bool Is64Bit(ARM64Reg reg) { return reg & 0x20; } -inline bool IsSingle(ARM64Reg reg) { return reg & 0x40; } -inline bool IsDouble(ARM64Reg reg) { return reg & 0x80; } +inline bool IsSingle(ARM64Reg reg) { return (reg & 0xC0) == 0x40; } +inline bool IsDouble(ARM64Reg reg) { return (reg & 0xC0) == 0x80; } inline bool IsQuad(ARM64Reg reg) { return (reg & 0xC0) == 0xC0; } inline bool IsVector(ARM64Reg reg) { return (reg & 0xC0) != 0; } inline ARM64Reg DecodeReg(ARM64Reg reg) { return (ARM64Reg)(reg & 0x1F); } @@ -479,6 +479,7 @@ public: void MADD(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra); void MSUB(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra); void SMADDL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra); + void SMULL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); void SMSUBL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra); void SMULH(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); void UMADDL(ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, ARM64Reg Ra); @@ -641,6 +642,7 @@ public: // Loadstore multiple structure void LD1(u8 size, u8 count, ARM64Reg Rt, ARM64Reg Rn); + void ST1(u8 size, u8 count, ARM64Reg Rt, ARM64Reg Rn); // Scalar - 1 Source void FABS(ARM64Reg Rd, ARM64Reg Rn); @@ -725,6 +727,9 @@ public: void SXTL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn); void UXTL(u8 src_size, ARM64Reg Rd, ARM64Reg Rn); + // vector x indexed element + void FMUL(u8 size, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm, u8 index); + // ABI related void ABI_PushRegisters(BitSet32 registers); void ABI_PopRegisters(BitSet32 registers, BitSet32 ignore_mask = BitSet32(0)); @@ -750,6 +755,7 @@ private: void EmitShiftImm(bool U, u32 immh, u32 immb, u32 opcode, ARM64Reg Rd, ARM64Reg Rn); void EmitLoadStoreMultipleStructure(u32 size, bool L, u32 opcode, ARM64Reg Rt, ARM64Reg Rn); void EmitScalar1Source(bool M, bool S, u32 type, u32 opcode, ARM64Reg Rd, ARM64Reg Rn); + void EmitVectorxElement(bool U, u32 size, bool L, u32 opcode, bool H, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); }; class ARM64CodeBlock : public CodeBlock From b989c2fd8fc81c6b9f4746a3bed8bee6238d3f14 Mon Sep 17 00:00:00 2001 From: Ryan Houdek Date: Thu, 12 Feb 2015 18:47:54 -0600 Subject: [PATCH 06/12] [AArch64] Fix another assert in the JIT register cache. --- Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.cpp index 0abcb29abb..d94a611498 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_RegCache.cpp @@ -46,14 +46,14 @@ u32 Arm64RegCache::GetUnlockedRegisterCount() void Arm64RegCache::LockRegister(ARM64Reg host_reg) { auto reg = std::find(m_host_registers.begin(), m_host_registers.end(), host_reg); - _assert_msg_(DYNA_REC, reg == m_host_registers.end(), "Don't try locking a register that isn't in the cache"); + _assert_msg_(DYNA_REC, reg != m_host_registers.end(), "Don't try locking a register that isn't in the cache. 
Reg %d", host_reg); reg->Lock(); } void Arm64RegCache::UnlockRegister(ARM64Reg host_reg) { auto reg = std::find(m_host_registers.begin(), m_host_registers.end(), host_reg); - _assert_msg_(DYNA_REC, reg == m_host_registers.end(), "Don't try unlocking a register that isn't in the cache"); + _assert_msg_(DYNA_REC, reg != m_host_registers.end(), "Don't try unlocking a register that isn't in the cache. Reg %d", host_reg); reg->Unlock(); } From 15e41c67f8fdb527af3267790154538454412596 Mon Sep 17 00:00:00 2001 From: Ryan Houdek Date: Thu, 12 Feb 2015 18:50:10 -0600 Subject: [PATCH 07/12] Change RunVertices' function arguments. This reduces some dumb state shuffling when calling the emitted vertex loaders. --- Source/Core/VideoBackends/Software/SWVertexLoader.cpp | 4 ++-- Source/Core/VideoCommon/VertexLoader.cpp | 2 +- Source/Core/VideoCommon/VertexLoader.h | 2 +- Source/Core/VideoCommon/VertexLoaderBase.cpp | 6 +++--- Source/Core/VideoCommon/VertexLoaderBase.h | 2 +- Source/Core/VideoCommon/VertexLoaderManager.cpp | 2 +- Source/Core/VideoCommon/VertexLoaderX64.cpp | 2 +- Source/Core/VideoCommon/VertexLoaderX64.h | 2 +- 8 files changed, 11 insertions(+), 11 deletions(-) diff --git a/Source/Core/VideoBackends/Software/SWVertexLoader.cpp b/Source/Core/VideoBackends/Software/SWVertexLoader.cpp index 752cea9a5b..d824893243 100644 --- a/Source/Core/VideoBackends/Software/SWVertexLoader.cpp +++ b/Source/Core/VideoBackends/Software/SWVertexLoader.cpp @@ -174,9 +174,9 @@ void SWVertexLoader::LoadVertex() // convert the vertex from the gc format to the videocommon (hardware optimized) format u8* old = g_video_buffer_read_ptr; int converted_vertices = m_CurrentLoader->RunVertices( - m_primitiveType, 1, DataReader(g_video_buffer_read_ptr, nullptr), // src - DataReader(m_LoadedVertices.data(), m_LoadedVertices.data() + m_LoadedVertices.size()) // dst + DataReader(m_LoadedVertices.data(), m_LoadedVertices.data() + m_LoadedVertices.size()), // dst + 1, m_primitiveType ); g_video_buffer_read_ptr = old + m_CurrentLoader->m_VertexSize; diff --git a/Source/Core/VideoCommon/VertexLoader.cpp b/Source/Core/VideoCommon/VertexLoader.cpp index 9538b1b11d..881424052c 100644 --- a/Source/Core/VideoCommon/VertexLoader.cpp +++ b/Source/Core/VideoCommon/VertexLoader.cpp @@ -316,7 +316,7 @@ void VertexLoader::WriteCall(TPipelineFunction func) m_PipelineStages[m_numPipelineStages++] = func; } -int VertexLoader::RunVertices(int primitive, int count, DataReader src, DataReader dst) +int VertexLoader::RunVertices(DataReader src, DataReader dst, int count, int primitive) { g_vertex_manager_write_ptr = dst.GetPointer(); g_video_buffer_read_ptr = src.GetPointer(); diff --git a/Source/Core/VideoCommon/VertexLoader.h b/Source/Core/VideoCommon/VertexLoader.h index 009c1ae006..3bae14a2d0 100644 --- a/Source/Core/VideoCommon/VertexLoader.h +++ b/Source/Core/VideoCommon/VertexLoader.h @@ -32,7 +32,7 @@ class VertexLoader : public VertexLoaderBase public: VertexLoader(const TVtxDesc &vtx_desc, const VAT &vtx_attr); - int RunVertices(int primitive, int count, DataReader src, DataReader dst) override; + int RunVertices(DataReader src, DataReader dst, int count, int primitive) override; std::string GetName() const override { return "OldLoader"; } bool IsInitialized() override { return true; } // This vertex loader supports all formats diff --git a/Source/Core/VideoCommon/VertexLoaderBase.cpp b/Source/Core/VideoCommon/VertexLoaderBase.cpp index b17731509d..c8ab14fb3b 100644 --- a/Source/Core/VideoCommon/VertexLoaderBase.cpp +++ 
b/Source/Core/VideoCommon/VertexLoaderBase.cpp @@ -159,13 +159,13 @@ public: delete b; } - int RunVertices(int primitive, int count, DataReader src, DataReader dst) override + int RunVertices(DataReader src, DataReader dst, int count, int primitive) override { buffer_a.resize(count * a->m_native_vtx_decl.stride + 4); buffer_b.resize(count * b->m_native_vtx_decl.stride + 4); - int count_a = a->RunVertices(primitive, count, src, DataReader(buffer_a.data(), buffer_a.data()+buffer_a.size())); - int count_b = b->RunVertices(primitive, count, src, DataReader(buffer_b.data(), buffer_b.data()+buffer_b.size())); + int count_a = a->RunVertices(src, DataReader(buffer_a.data(), buffer_a.data()+buffer_a.size()), count, primitive); + int count_b = b->RunVertices(src, DataReader(buffer_b.data(), buffer_b.data()+buffer_b.size()), count, primitive); if (count_a != count_b) ERROR_LOG(VIDEO, "The two vertex loaders have loaded a different amount of vertices (a: %d, b: %d).", count_a, count_b); diff --git a/Source/Core/VideoCommon/VertexLoaderBase.h b/Source/Core/VideoCommon/VertexLoaderBase.h index c5a11d2e63..fc3737ae69 100644 --- a/Source/Core/VideoCommon/VertexLoaderBase.h +++ b/Source/Core/VideoCommon/VertexLoaderBase.h @@ -74,7 +74,7 @@ public: static VertexLoaderBase* CreateVertexLoader(const TVtxDesc &vtx_desc, const VAT &vtx_attr); virtual ~VertexLoaderBase() {} - virtual int RunVertices(int primitive, int count, DataReader src, DataReader dst) = 0; + virtual int RunVertices(DataReader src, DataReader dst, int count, int primitive) = 0; virtual bool IsInitialized() = 0; diff --git a/Source/Core/VideoCommon/VertexLoaderManager.cpp b/Source/Core/VideoCommon/VertexLoaderManager.cpp index c15cadd267..506e3d92d1 100644 --- a/Source/Core/VideoCommon/VertexLoaderManager.cpp +++ b/Source/Core/VideoCommon/VertexLoaderManager.cpp @@ -164,7 +164,7 @@ int RunVertices(int vtx_attr_group, int primitive, int count, DataReader src, bo DataReader dst = VertexManager::PrepareForAdditionalData(primitive, count, loader->m_native_vtx_decl.stride, cullall); - count = loader->RunVertices(primitive, count, src, dst); + count = loader->RunVertices(src, dst, count, primitive); IndexGenerator::AddIndices(primitive, count); diff --git a/Source/Core/VideoCommon/VertexLoaderX64.cpp b/Source/Core/VideoCommon/VertexLoaderX64.cpp index de4ab58cc6..5c962095bd 100644 --- a/Source/Core/VideoCommon/VertexLoaderX64.cpp +++ b/Source/Core/VideoCommon/VertexLoaderX64.cpp @@ -457,7 +457,7 @@ bool VertexLoaderX64::IsInitialized() return cpu_info.bSSSE3; } -int VertexLoaderX64::RunVertices(int primitive, int count, DataReader src, DataReader dst) +int VertexLoaderX64::RunVertices(DataReader src, DataReader dst, int count, int primitive) { m_numLoadedVertices += count; return ((int (*)(u8* src, u8* dst, int count))region)(src.GetPointer(), dst.GetPointer(), count); diff --git a/Source/Core/VideoCommon/VertexLoaderX64.h b/Source/Core/VideoCommon/VertexLoaderX64.h index da65fb2e34..0139d392b7 100644 --- a/Source/Core/VideoCommon/VertexLoaderX64.h +++ b/Source/Core/VideoCommon/VertexLoaderX64.h @@ -9,7 +9,7 @@ public: protected: std::string GetName() const override { return "VertexLoaderX64"; } bool IsInitialized() override; - int RunVertices(int primitive, int count, DataReader src, DataReader dst) override; + int RunVertices(DataReader src, DataReader dst, int count, int primitive) override; private: u32 m_src_ofs = 0; From b4b03641b3edd060bcb8ba8d8f99fb65ab38c08e Mon Sep 17 00:00:00 2001 From: Ryan Houdek Date: Thu, 12 Feb 2015 18:52:07 -0600 
Subject: [PATCH 08/12] [AArch64] Implement vertex loader recompiler. Shows a noticeable reduction in time spent in the vertex loader. --- Source/Core/VideoCommon/CMakeLists.txt | 9 +- Source/Core/VideoCommon/VertexLoaderARM64.cpp | 487 ++++++++++++++++++ Source/Core/VideoCommon/VertexLoaderARM64.h | 29 ++ Source/Core/VideoCommon/VertexLoaderBase.cpp | 7 + 4 files changed, 527 insertions(+), 5 deletions(-) create mode 100644 Source/Core/VideoCommon/VertexLoaderARM64.cpp create mode 100644 Source/Core/VideoCommon/VertexLoaderARM64.h diff --git a/Source/Core/VideoCommon/CMakeLists.txt b/Source/Core/VideoCommon/CMakeLists.txt index ceb83564f1..4e6dd8a6b6 100644 --- a/Source/Core/VideoCommon/CMakeLists.txt +++ b/Source/Core/VideoCommon/CMakeLists.txt @@ -45,12 +45,11 @@ set(SRCS BoundingBox.cpp set(LIBS core png) if(_M_X86) - set(SRCS ${SRCS} TextureDecoder_x64.cpp VertexLoaderX64.cpp) + set(SRCS ${SRCS} TextureDecoder_x64.cpp VertexLoaderX64.cpp) +elseif(_M_ARM_64) + set(SRCS ${SRCS} VertexLoaderARM64.cpp TextureDecoder_Generic.cpp) else() - set(SRCS ${SRCS} TextureDecoder_Generic.cpp) -endif() -if(NOT ${CL} STREQUAL CL-NOTFOUND) - list(APPEND LIBS ${CL}) + set(SRCS ${SRCS} TextureDecoder_Generic.cpp) endif() if(LIBAV_FOUND OR WIN32) diff --git a/Source/Core/VideoCommon/VertexLoaderARM64.cpp b/Source/Core/VideoCommon/VertexLoaderARM64.cpp new file mode 100644 index 0000000000..8aa30a38f7 --- /dev/null +++ b/Source/Core/VideoCommon/VertexLoaderARM64.cpp @@ -0,0 +1,487 @@ +// Copyright 2013 Dolphin Emulator Project +// Licensed under GPLv2 +// Refer to the license.txt file included. + +#include "Core/PowerPC/JitArm64/Jit.h" +#include "VideoCommon/VertexLoaderARM64.h" + +using namespace Arm64Gen; + +ARM64Reg src_reg = X0; +ARM64Reg dst_reg = X1; +ARM64Reg count_reg = W2; +ARM64Reg skipped_reg = W17; +ARM64Reg scratch1_reg = W16; +ARM64Reg scratch2_reg = W15; +ARM64Reg scratch3_reg = W14; +ARM64Reg scratch4_reg = W13; +ARM64Reg saved_count = W12; + +ARM64Reg stride_reg = X11; +ARM64Reg arraybase_reg = X10; +ARM64Reg scale_reg = X9; + +static const float GC_ALIGNED16(scale_factors[]) = +{ + 1.0 / (1ULL << 0), 1.0 / (1ULL << 1), 1.0 / (1ULL << 2), 1.0 / (1ULL << 3), + 1.0 / (1ULL << 4), 1.0 / (1ULL << 5), 1.0 / (1ULL << 6), 1.0 / (1ULL << 7), + 1.0 / (1ULL << 8), 1.0 / (1ULL << 9), 1.0 / (1ULL << 10), 1.0 / (1ULL << 11), + 1.0 / (1ULL << 12), 1.0 / (1ULL << 13), 1.0 / (1ULL << 14), 1.0 / (1ULL << 15), + 1.0 / (1ULL << 16), 1.0 / (1ULL << 17), 1.0 / (1ULL << 18), 1.0 / (1ULL << 19), + 1.0 / (1ULL << 20), 1.0 / (1ULL << 21), 1.0 / (1ULL << 22), 1.0 / (1ULL << 23), + 1.0 / (1ULL << 24), 1.0 / (1ULL << 25), 1.0 / (1ULL << 26), 1.0 / (1ULL << 27), + 1.0 / (1ULL << 28), 1.0 / (1ULL << 29), 1.0 / (1ULL << 30), 1.0 / (1ULL << 31), +}; + +VertexLoaderARM64::VertexLoaderARM64(const TVtxDesc& vtx_desc, const VAT& vtx_att) + : VertexLoaderBase(vtx_desc, vtx_att), m_float_emit(this) +{ + if (!IsInitialized()) + return; + + AllocCodeSpace(4096); + ClearCodeSpace(); + GenerateVertexLoader(); + WriteProtect(); +} + +void VertexLoaderARM64::GetVertexAddr(int array, u64 attribute, ARM64Reg reg) +{ + ADD(reg, src_reg, m_src_ofs); + if (attribute & MASK_INDEXED) + { + if (attribute == INDEX8) + { + LDRB(INDEX_UNSIGNED, scratch1_reg, reg, 0); + m_src_ofs += 1; + } + else + { + LDRH(INDEX_UNSIGNED, scratch1_reg, reg, 0); + m_src_ofs += 2; + REV16(scratch1_reg, scratch1_reg); + } + + if (array == ARRAY_POSITION) + { + EOR(scratch2_reg, scratch1_reg, 0, attribute == INDEX8 ? 
7 : 15); // 0xFF : 0xFFFF + m_skip_vertex = CBZ(scratch2_reg); + } + + LDR(INDEX_UNSIGNED, scratch2_reg, stride_reg, array * 4); + MUL(scratch1_reg, scratch1_reg, scratch2_reg); + + LDR(INDEX_UNSIGNED, EncodeRegTo64(scratch2_reg), arraybase_reg, array * 8); + ADD(EncodeRegTo64(reg), EncodeRegTo64(scratch1_reg), EncodeRegTo64(scratch2_reg)); + } +} + +s32 VertexLoaderARM64::GetAddressImm(int array, u64 attribute, Arm64Gen::ARM64Reg reg, u32 align) +{ + if (attribute & MASK_INDEXED || (m_src_ofs & (align - 1))) + GetVertexAddr(array, attribute, reg); + else + return m_src_ofs; + return -1; +} + +int VertexLoaderARM64::ReadVertex(u64 attribute, int format, int count_in, int count_out, bool dequantize, u8 scaling_exponent, AttributeFormat* native_format) +{ + ARM64Reg coords = count_in == 3 ? Q31 : D31; + ARM64Reg scale = count_in == 3 ? Q30 : D30; + + int elem_size = 1 << (format / 2); + int load_bytes = elem_size * count_in; + elem_size <<= 3; + + if (count_in == 1) + m_float_emit.LDR(elem_size, INDEX_UNSIGNED, coords, EncodeRegTo64(scratch1_reg), 0); + else + m_float_emit.LD1(elem_size, 1, coords, EncodeRegTo64(scratch1_reg)); + + if (format != FORMAT_FLOAT) + { + // Extend and convert to float + switch (format) + { + case FORMAT_UBYTE: + m_float_emit.UXTL(8, EncodeRegToDouble(coords), EncodeRegToDouble(coords)); + m_float_emit.UXTL(16, EncodeRegToDouble(coords), EncodeRegToDouble(coords)); + break; + case FORMAT_BYTE: + m_float_emit.SXTL(8, EncodeRegToDouble(coords), EncodeRegToDouble(coords)); + m_float_emit.SXTL(16, EncodeRegToDouble(coords), EncodeRegToDouble(coords)); + break; + case FORMAT_USHORT: + m_float_emit.REV16(8, EncodeRegToDouble(coords), EncodeRegToDouble(coords)); + m_float_emit.UXTL(16, EncodeRegToDouble(coords), EncodeRegToDouble(coords)); + break; + case FORMAT_SHORT: + m_float_emit.REV16(8, EncodeRegToDouble(coords), EncodeRegToDouble(coords)); + m_float_emit.SXTL(16, EncodeRegToDouble(coords), EncodeRegToDouble(coords)); + break; + } + + m_float_emit.SCVTF(32, coords, coords); + + if (dequantize && scaling_exponent) + { + m_float_emit.LDR(32, INDEX_UNSIGNED, scale, scale_reg, scaling_exponent * 4); + m_float_emit.FMUL(32, coords, coords, scale, 0); + } + } + else + { + m_float_emit.REV32(8, coords, coords); + } + + const u32 write_size = count_out == 3 ? 128 : count_out * 32; + const u32 mask = count_out == 3 ? 0xF : count_out == 2 ? 
0x7 : 0x3; + if (!(m_dst_ofs & mask)) + { + m_float_emit.STR(write_size, INDEX_UNSIGNED, coords, dst_reg, m_dst_ofs); + + } + else + { + ADD(EncodeRegTo64(scratch2_reg), dst_reg, m_dst_ofs); + m_float_emit.ST1(32, 1, coords, EncodeRegTo64(scratch2_reg)); + } + + native_format->components = count_out; + native_format->enable = true; + native_format->offset = m_dst_ofs; + native_format->type = VAR_FLOAT; + native_format->integer = false; + m_dst_ofs += sizeof(float) * count_out; + + if (attribute == DIRECT) + m_src_ofs += load_bytes; + + return load_bytes; +} + +void VertexLoaderARM64::ReadColor(u64 attribute, int format, s32 offset) +{ + int load_bytes = 0; + switch (format) + { + case FORMAT_24B_888: + case FORMAT_32B_888x: + case FORMAT_32B_8888: + if (offset == -1) + LDR(INDEX_UNSIGNED, scratch2_reg, EncodeRegTo64(scratch1_reg), 0); + else + LDR(INDEX_UNSIGNED, scratch2_reg, src_reg, m_src_ofs); + + if (format != FORMAT_32B_8888) + ORR(scratch2_reg, scratch2_reg, 8, 7); // 0xFF000000 + STR(INDEX_UNSIGNED, scratch2_reg, dst_reg, m_dst_ofs); + load_bytes = 3 + (format != FORMAT_24B_888); + break; + + case FORMAT_16B_565: + // RRRRRGGG GGGBBBBB + // AAAAAAAA BBBBBBBB GGGGGGGG RRRRRRRR + if (offset == -1) + LDRH(INDEX_UNSIGNED, scratch3_reg, EncodeRegTo64(scratch1_reg), 0); + else + LDRH(INDEX_UNSIGNED, scratch3_reg, src_reg, m_src_ofs); + + REV16(scratch3_reg, scratch3_reg); + + // B + AND(scratch2_reg, scratch3_reg, 32, 4); + ORR(scratch2_reg, WSP, scratch2_reg, ArithOption(scratch2_reg, ST_LSL, 3)); + ORR(scratch2_reg, scratch2_reg, scratch2_reg, ArithOption(scratch2_reg, ST_LSR, 5)); + ORR(scratch1_reg, WSP, scratch2_reg, ArithOption(scratch2_reg, ST_LSL, 16)); + + // G + UBFM(scratch2_reg, scratch3_reg, 5, 10); + ORR(scratch2_reg, WSP, scratch2_reg, ArithOption(scratch2_reg, ST_LSL, 2)); + ORR(scratch2_reg, scratch2_reg, scratch2_reg, ArithOption(scratch2_reg, ST_LSR, 6)); + ORR(scratch1_reg, scratch1_reg, scratch2_reg, ArithOption(scratch2_reg, ST_LSL, 8)); + + // R + UBFM(scratch2_reg, scratch3_reg, 11, 15); + ORR(scratch1_reg, scratch1_reg, scratch2_reg, ArithOption(scratch2_reg, ST_LSL, 3)); + ORR(scratch1_reg, scratch1_reg, scratch2_reg, ArithOption(scratch2_reg, ST_LSR, 2)); + + // A + ORR(scratch2_reg, scratch2_reg, 8, 7); // 0xFF000000 + + STR(INDEX_UNSIGNED, scratch1_reg, dst_reg, m_dst_ofs); + load_bytes = 2; + break; + + case FORMAT_16B_4444: + // BBBBAAAA RRRRGGGG + // REV16 - RRRRGGGG BBBBAAAA + // AAAAAAAA BBBBBBBB GGGGGGGG RRRRRRRR + if (offset == -1) + LDRH(INDEX_UNSIGNED, scratch3_reg, EncodeRegTo64(scratch1_reg), 0); + else + LDRH(INDEX_UNSIGNED, scratch3_reg, src_reg, m_src_ofs); + + // R + UBFM(scratch1_reg, scratch3_reg, 4, 7); + + // G + AND(scratch2_reg, scratch3_reg, 32, 3); + ORR(scratch1_reg, scratch1_reg, scratch2_reg, ArithOption(scratch2_reg, ST_LSL, 8)); + + // B + UBFM(scratch2_reg, scratch3_reg, 12, 15); + ORR(scratch1_reg, scratch1_reg, scratch2_reg, ArithOption(scratch2_reg, ST_LSL, 16)); + + // A + UBFM(scratch2_reg, scratch3_reg, 8, 11); + ORR(scratch1_reg, scratch1_reg, scratch2_reg, ArithOption(scratch2_reg, ST_LSL, 24)); + + // Final duplication + ORR(scratch1_reg, scratch1_reg, scratch1_reg, ArithOption(scratch1_reg, ST_LSL, 4)); + + STR(INDEX_UNSIGNED, scratch1_reg, dst_reg, m_dst_ofs); + load_bytes = 2; + break; + + case FORMAT_24B_6666: + // RRRRRRGG GGGGBBBB BBAAAAAA + // AAAAAAAA BBBBBBBB GGGGGGGG RRRRRRRR + if (offset == -1) + LDR(INDEX_UNSIGNED, scratch3_reg, EncodeRegTo64(scratch1_reg), 0); + else + LDR(INDEX_UNSIGNED, scratch3_reg, 
src_reg, m_src_ofs); + + REV32(scratch3_reg, scratch3_reg); + + // A + AND(scratch2_reg, scratch3_reg, 32, 5); + ORR(scratch2_reg, WSP, scratch2_reg, ArithOption(scratch2_reg, ST_LSL, 2)); + ORR(scratch2_reg, scratch2_reg, scratch2_reg, ArithOption(scratch2_reg, ST_LSR, 6)); + ORR(scratch1_reg, WSP, scratch2_reg, ArithOption(scratch2_reg, ST_LSL, 24)); + + // B + UBFM(scratch2_reg, scratch3_reg, 6, 11); + ORR(scratch2_reg, WSP, scratch2_reg, ArithOption(scratch2_reg, ST_LSL, 2)); + ORR(scratch2_reg, scratch2_reg, scratch2_reg, ArithOption(scratch2_reg, ST_LSR, 6)); + ORR(scratch1_reg, scratch1_reg, scratch2_reg, ArithOption(scratch2_reg, ST_LSL, 16)); + + // G + UBFM(scratch2_reg, scratch3_reg, 12, 17); + ORR(scratch2_reg, WSP, scratch2_reg, ArithOption(scratch2_reg, ST_LSL, 2)); + ORR(scratch2_reg, scratch2_reg, scratch2_reg, ArithOption(scratch2_reg, ST_LSR, 6)); + ORR(scratch1_reg, scratch1_reg, scratch2_reg, ArithOption(scratch2_reg, ST_LSL, 8)); + + // R + UBFM(scratch2_reg, scratch3_reg, 18, 23); + ORR(scratch1_reg, scratch1_reg, scratch2_reg, ArithOption(scratch2_reg, ST_LSL, 2)); + ORR(scratch1_reg, scratch1_reg, scratch2_reg, ArithOption(scratch2_reg, ST_LSR, 4)); + + STR(INDEX_UNSIGNED, scratch1_reg, dst_reg, m_dst_ofs); + load_bytes = 3; + break; + } + if (attribute == DIRECT) + m_src_ofs += load_bytes; +} + +void VertexLoaderARM64::GenerateVertexLoader() +{ + // R0 - Source pointer + // R1 - Destination pointer + // R2 - Count + // R30 - LR + // + // R0 return how many + // + // Registers we don't have to worry about saving + // R9-R17 are caller saved temporaries + // R18 is a temporary or platform specific register(iOS) + // + // VFP registers + // We can touch all except v8-v15 + // If we need to use those, we need to retain the lower 64bits(!) of the register + + MOV(skipped_reg, WSP); + MOV(saved_count, count_reg); + + MOVI2R(stride_reg, (u64)&g_main_cp_state.array_strides); + MOVI2R(arraybase_reg, (u64)&cached_arraybases); + MOVI2R(scale_reg, (u64)&scale_factors); + + const u8* loop_start = GetCodePtr(); + + if (m_VtxDesc.PosMatIdx) + { + LDRB(INDEX_UNSIGNED, scratch1_reg, src_reg, m_src_ofs); + AND(scratch1_reg, scratch1_reg, 0, 5); + STR(INDEX_UNSIGNED, scratch1_reg, dst_reg, m_dst_ofs); + m_native_components |= VB_HAS_POSMTXIDX; + m_native_vtx_decl.posmtx.components = 4; + m_native_vtx_decl.posmtx.enable = true; + m_native_vtx_decl.posmtx.offset = m_dst_ofs; + m_native_vtx_decl.posmtx.type = VAR_UNSIGNED_BYTE; + m_native_vtx_decl.posmtx.integer = true; + m_src_ofs += sizeof(u8); + m_dst_ofs += sizeof(u32); + } + + u32 texmatidx_ofs[8]; + const u64 tm[8] = { + m_VtxDesc.Tex0MatIdx, m_VtxDesc.Tex1MatIdx, m_VtxDesc.Tex2MatIdx, m_VtxDesc.Tex3MatIdx, + m_VtxDesc.Tex4MatIdx, m_VtxDesc.Tex5MatIdx, m_VtxDesc.Tex6MatIdx, m_VtxDesc.Tex7MatIdx, + }; + for (int i = 0; i < 8; i++) + { + if (tm[i]) + texmatidx_ofs[i] = m_src_ofs++; + } + + GetVertexAddr(ARRAY_POSITION, m_VtxDesc.Position, EncodeRegTo64(scratch1_reg)); + ReadVertex(m_VtxDesc.Position, m_VtxAttr.PosFormat, m_VtxAttr.PosElements + 2, 3, + m_VtxAttr.ByteDequant, m_VtxAttr.PosFrac, &m_native_vtx_decl.position); + + if (m_VtxDesc.Normal) + { + static const u8 map[8] = {7, 6, 15, 14}; + u8 scaling_exponent = map[m_VtxAttr.NormalFormat]; + + for (int i = 0; i < (m_VtxAttr.NormalElements ? 
3 : 1); i++) + { + if (!i || m_VtxAttr.NormalIndex3) + { + GetVertexAddr(ARRAY_NORMAL, m_VtxDesc.Normal, EncodeRegTo64(scratch1_reg)); + int elem_size = 1 << (m_VtxAttr.NormalFormat / 2); + ADD(EncodeRegTo64(scratch1_reg), EncodeRegTo64(scratch1_reg), i * elem_size * 3); + } + int bytes_read = ReadVertex(m_VtxDesc.Normal, m_VtxAttr.NormalFormat, 3, 3, + true, scaling_exponent, &m_native_vtx_decl.normals[i]); + + ADD(EncodeRegTo64(scratch1_reg), EncodeRegTo64(scratch1_reg), bytes_read); + } + + m_native_components |= VB_HAS_NRM0; + if (m_VtxAttr.NormalElements) + m_native_components |= VB_HAS_NRM1 | VB_HAS_NRM2; + } + + const u64 col[2] = {m_VtxDesc.Color0, m_VtxDesc.Color1}; + for (int i = 0; i < 2; i++) + { + m_native_vtx_decl.colors[i].components = 4; + m_native_vtx_decl.colors[i].type = VAR_UNSIGNED_BYTE; + m_native_vtx_decl.colors[i].integer = false; + + if (col[i]) + { + u32 align = 4; + if (m_VtxAttr.color[i].Comp == FORMAT_16B_565 || + m_VtxAttr.color[i].Comp == FORMAT_16B_4444) + align = 2; + + s32 offset = GetAddressImm(ARRAY_COLOR + i, col[i], EncodeRegTo64(scratch1_reg), align); + ReadColor(col[i], m_VtxAttr.color[i].Comp, offset); + m_native_components |= VB_HAS_COL0 << i; + m_native_vtx_decl.colors[i].components = 4; + m_native_vtx_decl.colors[i].enable = true; + m_native_vtx_decl.colors[i].offset = m_dst_ofs; + m_native_vtx_decl.colors[i].type = VAR_UNSIGNED_BYTE; + m_native_vtx_decl.colors[i].integer = false; + m_dst_ofs += 4; + } + } + + const u64 tc[8] = { + m_VtxDesc.Tex0Coord, m_VtxDesc.Tex1Coord, m_VtxDesc.Tex2Coord, m_VtxDesc.Tex3Coord, + m_VtxDesc.Tex4Coord, m_VtxDesc.Tex5Coord, m_VtxDesc.Tex6Coord, m_VtxDesc.Tex7Coord, + }; + + for (int i = 0; i < 8; i++) + { + m_native_vtx_decl.texcoords[i].offset = m_dst_ofs; + m_native_vtx_decl.texcoords[i].type = VAR_FLOAT; + m_native_vtx_decl.texcoords[i].integer = false; + + int elements = m_VtxAttr.texCoord[i].Elements + 1; + if (tc[i]) + { + m_native_components |= VB_HAS_UV0 << i; + GetVertexAddr(ARRAY_TEXCOORD0 + i, tc[i], EncodeRegTo64(scratch1_reg)); + u8 scaling_exponent = m_VtxAttr.texCoord[i].Frac; + ReadVertex(tc[i], m_VtxAttr.texCoord[i].Format, elements, tm[i] ? 2 : elements, + m_VtxAttr.ByteDequant, scaling_exponent, &m_native_vtx_decl.texcoords[i]); + } + if (tm[i]) + { + m_native_components |= VB_HAS_TEXMTXIDX0 << i; + m_native_vtx_decl.texcoords[i].components = 3; + m_native_vtx_decl.texcoords[i].enable = true; + m_native_vtx_decl.texcoords[i].type = VAR_FLOAT; + m_native_vtx_decl.texcoords[i].integer = false; + + LDRB(INDEX_UNSIGNED, scratch2_reg, src_reg, texmatidx_ofs[i]); + m_float_emit.UCVTF(S31, scratch2_reg); + + if (tc[i]) + { + m_float_emit.STR(32, INDEX_UNSIGNED, D31, dst_reg, m_dst_ofs); + m_dst_ofs += sizeof(float); + } + else + { + m_native_vtx_decl.texcoords[i].offset = m_dst_ofs; + + if (!(m_dst_ofs & 7)) + { + // If m_dst_ofs isn't 8byte aligned we can't store an 8byte zero register + // So store two 4byte zero registers + // The destination is always 4byte aligned + STR(INDEX_UNSIGNED, WSP, dst_reg, m_dst_ofs); + STR(INDEX_UNSIGNED, WSP, dst_reg, m_dst_ofs + 4); + m_float_emit.STR(32, INDEX_UNSIGNED, D31, dst_reg, m_dst_ofs + 8); + } + else + { + STR(INDEX_UNSIGNED, SP, dst_reg, m_dst_ofs); + m_float_emit.STR(32, INDEX_UNSIGNED, D31, dst_reg, m_dst_ofs + 8); + } + m_dst_ofs += sizeof(float) * 3; + } + } + } + + // Prepare for the next vertex. 
+ ADD(dst_reg, dst_reg, m_dst_ofs); + const u8* cont = GetCodePtr(); + ADD(src_reg, src_reg, m_src_ofs); + + SUB(count_reg, count_reg, 1); + CBNZ(count_reg, loop_start); + + if (m_VtxDesc.Position & MASK_INDEXED) + { + SUB(W0, saved_count, skipped_reg); + RET(X30); + + SetJumpTarget(m_skip_vertex); + ADD(skipped_reg, skipped_reg, 1); + B(cont); + } + else + { + MOV(W0, saved_count); + RET(X30); + } + + FlushIcache(); + + m_VertexSize = m_src_ofs; + m_native_vtx_decl.stride = m_dst_ofs; +} + +int VertexLoaderARM64::RunVertices(DataReader src, DataReader dst, int count, int primitive) +{ + m_numLoadedVertices += count; + return ((int (*)(u8* src, u8* dst, int count))region)(src.GetPointer(), dst.GetPointer(), count); +} diff --git a/Source/Core/VideoCommon/VertexLoaderARM64.h b/Source/Core/VideoCommon/VertexLoaderARM64.h new file mode 100644 index 0000000000..5dc0544adf --- /dev/null +++ b/Source/Core/VideoCommon/VertexLoaderARM64.h @@ -0,0 +1,29 @@ +// Copyright 2013 Dolphin Emulator Project +// Licensed under GPLv2 +// Refer to the license.txt file included. + +#pragma once +#include "Common/Arm64Emitter.h" +#include "VideoCommon/VertexLoaderBase.h" + +class VertexLoaderARM64 : public VertexLoaderBase, public Arm64Gen::ARM64CodeBlock +{ +public: + VertexLoaderARM64(const TVtxDesc& vtx_desc, const VAT& vtx_att); + +protected: + std::string GetName() const override { return "VertexLoaderARM64"; } + bool IsInitialized() override { return true; } + int RunVertices(DataReader src, DataReader dst, int count, int primitive) override; + +private: + u32 m_src_ofs = 0; + u32 m_dst_ofs = 0; + Arm64Gen::FixupBranch m_skip_vertex; + Arm64Gen::ARM64FloatEmitter m_float_emit; + void GetVertexAddr(int array, u64 attribute, Arm64Gen::ARM64Reg reg); + s32 GetAddressImm(int array, u64 attribute, Arm64Gen::ARM64Reg reg, u32 align); + int ReadVertex(u64 attribute, int format, int count_in, int count_out, bool dequantize, u8 scaling_exponent, AttributeFormat* native_format); + void ReadColor(u64 attribute, int format, s32 offset); + void GenerateVertexLoader(); +}; diff --git a/Source/Core/VideoCommon/VertexLoaderBase.cpp b/Source/Core/VideoCommon/VertexLoaderBase.cpp index c8ab14fb3b..924a51fc61 100644 --- a/Source/Core/VideoCommon/VertexLoaderBase.cpp +++ b/Source/Core/VideoCommon/VertexLoaderBase.cpp @@ -12,6 +12,8 @@ #ifdef _M_X86_64 #include "VideoCommon/VertexLoaderX64.h" +#elif defined(_M_ARM_64) +#include "VideoCommon/VertexLoaderARM64.h" #endif VertexLoaderBase::VertexLoaderBase(const TVtxDesc &vtx_desc, const VAT &vtx_attr) @@ -208,6 +210,11 @@ VertexLoaderBase* VertexLoaderBase::CreateVertexLoader(const TVtxDesc& vtx_desc, if (loader->IsInitialized()) return loader; delete loader; +#elif defined(_M_ARM_64) + loader = new VertexLoaderARM64(vtx_desc, vtx_attr); + if (loader->IsInitialized()) + return loader; + delete loader; #endif // last try: The old VertexLoader From f54a0d3ff4b7705546d501228286ad931f8fa222 Mon Sep 17 00:00:00 2001 From: Ryan Houdek Date: Thu, 12 Feb 2015 18:53:46 -0600 Subject: [PATCH 09/12] [AArch64] Minor floating point loadstore improvements. These use the Vector x Element FMUL instruction to improve speeds slightly. Should give ~3cycle improvement per quantized loadstore done. 
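
For reference, a minimal plain-C++ model (not the emitted code itself) of what one of these paired dequantize routines computes, assuming a simple 1 / (1 << n) scale as in the vertex loader's scale_factors table; the exact contents of m_dequantizeTableS aren't shown in this series. Both lanes are multiplied by the same scalar, which is why loading the scale into lane 0 and using the vector-by-element form FMUL(32, D0, D0, D1, 0) gives the same result as the old LD1R broadcast followed by a full vector multiply:

    #include <array>
    #include <cstdint>
    #include <cstdio>

    // Dequantize a pair of signed 16-bit fixed-point values with a single
    // scale of 1 / (1 << scale_exponent). In the emitted AArch64 code the
    // two values sit in lanes 0 and 1 of D0 and the scale sits in lane 0
    // of D1, so multiplying by element 0 scales both lanes by the same
    // factor -- no LD1R broadcast needed.
    static std::array<float, 2> DequantizePairS16(int16_t x, int16_t y,
                                                  unsigned scale_exponent)
    {
        const float scale = 1.0f / (float)(1u << scale_exponent);
        return { (float)x * scale, (float)y * scale };
    }

    int main()
    {
        const std::array<float, 2> v = DequantizePairS16(0x4000, -0x2000, 14);
        std::printf("%f %f\n", v[0], v[1]);  // 1.000000 -0.500000
        return 0;
    }

The change in this patch is purely instruction selection: LDR plus FMUL-by-element replaces LD1R plus a full vector FMUL, saving the broadcast.
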
--- .../PowerPC/JitArm64/JitArm64_BackPatch.cpp | 4 +- .../JitArm64/JitArm64_LoadStoreFloating.cpp | 2 +- Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp | 48 +++++++++---------- 3 files changed, 27 insertions(+), 27 deletions(-) diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_BackPatch.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_BackPatch.cpp index 747b30e8e6..2a1548317a 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_BackPatch.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_BackPatch.cpp @@ -137,7 +137,7 @@ u32 JitArm64::EmitBackpatchRoutine(ARM64XEmitter* emit, u32 flags, bool fastmem, ARM64FloatEmitter float_emit(emit); if (flags & BackPatchInfo::FLAG_SIZE_F32) { - float_emit.FCVT(32, 64, Q0, RS); + float_emit.FCVT(32, 64, D0, RS); float_emit.REV32(8, D0, D0); trouble_offset = (emit->GetCodePtr() - code_base) / 4; float_emit.STR(32, INDEX_UNSIGNED, D0, addr, 0); @@ -215,7 +215,7 @@ u32 JitArm64::EmitBackpatchRoutine(ARM64XEmitter* emit, u32 flags, bool fastmem, ARM64FloatEmitter float_emit(emit); if (flags & BackPatchInfo::FLAG_SIZE_F32) { - float_emit.FCVT(32, 64, Q0, RS); + float_emit.FCVT(32, 64, D0, RS); float_emit.UMOV(32, W0, Q0, 0); emit->MOVI2R(X30, (u64)&PowerPC::Write_U32); emit->BLR(X30); diff --git a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStoreFloating.cpp b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStoreFloating.cpp index d2ffcf094f..21b8cbd897 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStoreFloating.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitArm64_LoadStoreFloating.cpp @@ -390,7 +390,7 @@ void JitArm64::stfXX(UGeckoInstruction inst) } else if (accessSize == 32) { - m_float_emit.FCVT(32, 64, Q0, V0); + m_float_emit.FCVT(32, 64, D0, EncodeRegToDouble(V0)); m_float_emit.REV32(8, D0, D0); m_float_emit.STR(32, INDEX_UNSIGNED, D0, X1, 0); } diff --git a/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp b/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp index 81943ce16f..62cce8374b 100644 --- a/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp +++ b/Source/Core/Core/PowerPC/JitArm64/JitAsm.cpp @@ -128,8 +128,8 @@ void JitArm64AsmRoutineManager::GenerateCommon() MOVI2R(addr_reg, (u64)&m_dequantizeTableS); ADD(scale_reg, addr_reg, scale_reg, ArithOption(scale_reg, ST_LSL, 3)); - float_emit.LD1R(32, D1, scale_reg); - float_emit.FMUL(32, D0, D0, D1); + float_emit.LDR(32, INDEX_UNSIGNED, D1, scale_reg, 0); + float_emit.FMUL(32, D0, D0, D1, 0); RET(X30); } const u8* loadPairedS8Two = GetCodePtr(); @@ -142,8 +142,8 @@ void JitArm64AsmRoutineManager::GenerateCommon() MOVI2R(addr_reg, (u64)&m_dequantizeTableS); ADD(scale_reg, addr_reg, scale_reg, ArithOption(scale_reg, ST_LSL, 3)); - float_emit.LD1R(32, D1, scale_reg); - float_emit.FMUL(32, D0, D0, D1); + float_emit.LDR(32, INDEX_UNSIGNED, D1, scale_reg, 0); + float_emit.FMUL(32, D0, D0, D1, 0); RET(X30); } const u8* loadPairedU16Two = GetCodePtr(); @@ -156,8 +156,8 @@ void JitArm64AsmRoutineManager::GenerateCommon() MOVI2R(addr_reg, (u64)&m_dequantizeTableS); ADD(scale_reg, addr_reg, scale_reg, ArithOption(scale_reg, ST_LSL, 3)); - float_emit.LD1R(32, D1, scale_reg); - float_emit.FMUL(32, D0, D0, D1); + float_emit.LDR(32, INDEX_UNSIGNED, D1, scale_reg, 0); + float_emit.FMUL(32, D0, D0, D1, 0); RET(X30); } const u8* loadPairedS16Two = GetCodePtr(); @@ -170,8 +170,8 @@ void JitArm64AsmRoutineManager::GenerateCommon() MOVI2R(addr_reg, (u64)&m_dequantizeTableS); ADD(scale_reg, addr_reg, scale_reg, ArithOption(scale_reg, ST_LSL, 3)); - float_emit.LD1R(32, D1, scale_reg); - float_emit.FMUL(32, D0, 
D0, D1); + float_emit.LDR(32, INDEX_UNSIGNED, D1, scale_reg, 0); + float_emit.FMUL(32, D0, D0, D1, 0); RET(X30); } @@ -192,8 +192,8 @@ void JitArm64AsmRoutineManager::GenerateCommon() MOVI2R(addr_reg, (u64)&m_dequantizeTableS); ADD(scale_reg, addr_reg, scale_reg, ArithOption(scale_reg, ST_LSL, 3)); - float_emit.LD1R(32, D1, scale_reg); - float_emit.FMUL(32, D0, D0, D1); + float_emit.LDR(32, INDEX_UNSIGNED, D1, scale_reg, 0); + float_emit.FMUL(32, D0, D0, D1, 0); RET(X30); } const u8* loadPairedS8One = GetCodePtr(); @@ -206,8 +206,8 @@ void JitArm64AsmRoutineManager::GenerateCommon() MOVI2R(addr_reg, (u64)&m_dequantizeTableS); ADD(scale_reg, addr_reg, scale_reg, ArithOption(scale_reg, ST_LSL, 3)); - float_emit.LD1R(32, D1, scale_reg); - float_emit.FMUL(32, D0, D0, D1); + float_emit.LDR(32, INDEX_UNSIGNED, D1, scale_reg, 0); + float_emit.FMUL(32, D0, D0, D1, 0); RET(X30); } const u8* loadPairedU16One = GetCodePtr(); @@ -220,8 +220,8 @@ void JitArm64AsmRoutineManager::GenerateCommon() MOVI2R(addr_reg, (u64)&m_dequantizeTableS); ADD(scale_reg, addr_reg, scale_reg, ArithOption(scale_reg, ST_LSL, 3)); - float_emit.LD1R(32, D1, scale_reg); - float_emit.FMUL(32, D0, D0, D1); + float_emit.LDR(32, INDEX_UNSIGNED, D1, scale_reg, 0); + float_emit.FMUL(32, D0, D0, D1, 0); RET(X30); } const u8* loadPairedS16One = GetCodePtr(); @@ -234,8 +234,8 @@ void JitArm64AsmRoutineManager::GenerateCommon() MOVI2R(addr_reg, (u64)&m_dequantizeTableS); ADD(scale_reg, addr_reg, scale_reg, ArithOption(scale_reg, ST_LSL, 3)); - float_emit.LD1R(32, D1, scale_reg); - float_emit.FMUL(32, D0, D0, D1); + float_emit.LDR(32, INDEX_UNSIGNED, D1, scale_reg, 0); + float_emit.FMUL(32, D0, D0, D1, 0); RET(X30); } @@ -295,8 +295,8 @@ void JitArm64AsmRoutineManager::GenerateCommon() MOVI2R(X2, (u64)&m_quantizeTableS); ADD(scale_reg, X2, scale_reg, ArithOption(scale_reg, ST_LSL, 3)); - float_emit.LD1R(32, D1, scale_reg); - float_emit.FMUL(32, D0, D0, D1); + float_emit.LDR(32, INDEX_UNSIGNED, D1, scale_reg, 0); + float_emit.FMUL(32, D0, D0, D1, 0); float_emit.FCVTZU(32, D0, D0); float_emit.XTN(16, D0, D0); float_emit.XTN(8, D0, D0); @@ -326,8 +326,8 @@ void JitArm64AsmRoutineManager::GenerateCommon() MOVI2R(X2, (u64)&m_quantizeTableS); ADD(scale_reg, X2, scale_reg, ArithOption(scale_reg, ST_LSL, 3)); - float_emit.LD1R(32, D1, scale_reg); - float_emit.FMUL(32, D0, D0, D1); + float_emit.LDR(32, INDEX_UNSIGNED, D1, scale_reg, 0); + float_emit.FMUL(32, D0, D0, D1, 0); float_emit.FCVTZS(32, D0, D0); float_emit.XTN(16, D0, D0); float_emit.XTN(8, D0, D0); @@ -358,8 +358,8 @@ void JitArm64AsmRoutineManager::GenerateCommon() MOVI2R(X2, (u64)&m_quantizeTableS); ADD(scale_reg, X2, scale_reg, ArithOption(scale_reg, ST_LSL, 3)); - float_emit.LD1R(32, D1, scale_reg); - float_emit.FMUL(32, D0, D0, D1); + float_emit.LDR(32, INDEX_UNSIGNED, D1, scale_reg, 0); + float_emit.FMUL(32, D0, D0, D1, 0); float_emit.FCVTZU(32, D0, D0); float_emit.XTN(16, D0, D0); float_emit.REV16(8, D0, D0); @@ -388,8 +388,8 @@ void JitArm64AsmRoutineManager::GenerateCommon() MOVI2R(X2, (u64)&m_quantizeTableS); ADD(scale_reg, X2, scale_reg, ArithOption(scale_reg, ST_LSL, 3)); - float_emit.LD1R(32, D1, scale_reg); - float_emit.FMUL(32, D0, D0, D1); + float_emit.LDR(32, INDEX_UNSIGNED, D1, scale_reg, 0); + float_emit.FMUL(32, D0, D0, D1, 0); float_emit.FCVTZS(32, D0, D0); float_emit.XTN(16, D0, D0); float_emit.REV16(8, D0, D0); From 2ebe57ed3f63875e37a4d3f844b4cb68c3fc0f4b Mon Sep 17 00:00:00 2001 From: Ryan Houdek Date: Thu, 12 Feb 2015 19:04:07 -0600 Subject: [PATCH 10/12] 
Convert our vertex loader unit test to the new RunVertices arguments arrangement. --- Source/UnitTests/VideoCommon/VertexLoaderTest.cpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/Source/UnitTests/VideoCommon/VertexLoaderTest.cpp b/Source/UnitTests/VideoCommon/VertexLoaderTest.cpp index 864e7de296..631ad463f4 100644 --- a/Source/UnitTests/VideoCommon/VertexLoaderTest.cpp +++ b/Source/UnitTests/VideoCommon/VertexLoaderTest.cpp @@ -105,7 +105,7 @@ TEST_F(VertexLoaderTest, PositionDirectFloatXYZ) Input(0.0f); Input(0.0f); Input(1.0f); // Convert 4 points. "7" -> primitive are points. - int count = loader->RunVertices(7, 4, src, dst); + int count = loader->RunVertices(src, dst, 4, 7); src.Skip(4 * loader->m_VertexSize); dst.Skip(count * loader->m_native_vtx_decl.stride); delete loader; @@ -119,7 +119,7 @@ TEST_F(VertexLoaderTest, PositionDirectFloatXYZ) Input(1.0f); Input(2.0f); Input(4.0f); m_vtx_attr.g0.PosFrac = 1; loader = VertexLoaderBase::CreateVertexLoader(m_vtx_desc, m_vtx_attr); - count = loader->RunVertices(7, 1, src, dst); + count = loader->RunVertices(src, dst, 1, 7); src.Skip(1 * loader->m_VertexSize); dst.Skip(count * loader->m_native_vtx_decl.stride); ExpectOut(1.0f); ExpectOut(2.0f); ExpectOut(4.0f); @@ -145,7 +145,7 @@ TEST_F(VertexLoaderTest, PositionDirectU16XY) Input(12345); Input(54321); // Convert 5 points. "7" -> primitive are points. - int count = loader->RunVertices(7, 5, src, dst); + int count = loader->RunVertices(src, dst, 5, 7); src.Skip(5 * loader->m_VertexSize); dst.Skip(count * loader->m_native_vtx_decl.stride); delete loader; @@ -161,7 +161,7 @@ TEST_F(VertexLoaderTest, PositionDirectU16XY) m_vtx_attr.g0.PosFrac = 1; m_vtx_attr.g0.ByteDequant = 1; loader = VertexLoaderBase::CreateVertexLoader(m_vtx_desc, m_vtx_attr); - count = loader->RunVertices(7, 1, src, dst); + count = loader->RunVertices(src, dst, 1, 7); src.Skip(1 * loader->m_VertexSize); dst.Skip(count * loader->m_native_vtx_decl.stride); ExpectOut(21.0f); ExpectOut(12.0f); ExpectOut(0.0f); @@ -182,7 +182,7 @@ TEST_F(VertexLoaderTest, PositionDirectFloatXYZSpeed) for (int i = 0; i < 1000; ++i) { ResetPointers(); - int count = loader->RunVertices(7, 100000, src, dst); + int count = loader->RunVertices(src, dst, 100000, 7); src.Skip(100000 * loader->m_VertexSize); dst.Skip(count * loader->m_native_vtx_decl.stride); } @@ -203,7 +203,7 @@ TEST_F(VertexLoaderTest, PositionDirectU16XYSpeed) for (int i = 0; i < 1000; ++i) { ResetPointers(); - int count = loader->RunVertices(7, 100000, src, dst); + int count = loader->RunVertices(src, dst, 100000, 7); src.Skip(100000 * loader->m_VertexSize); dst.Skip(count * loader->m_native_vtx_decl.stride); } @@ -267,7 +267,7 @@ TEST_F(VertexLoaderTest, LargeFloatVertexSpeed) for (int i = 0; i < 100; ++i) { ResetPointers(); - int count = loader->RunVertices(7, 100000, src, dst); + int count = loader->RunVertices(src, dst, 100000, 7); src.Skip(100000 * loader->m_VertexSize); dst.Skip(count * loader->m_native_vtx_decl.stride); } From 120df4c68883065235e21c69f33ee47ffbcf2a8f Mon Sep 17 00:00:00 2001 From: Ryan Houdek Date: Mon, 16 Feb 2015 22:00:43 -0600 Subject: [PATCH 11/12] [AArch64] Implement loadstore unscaled. 
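
The STUR/LDUR family takes a 9-bit signed byte offset that is not scaled by the access size, hence the -256..255 range check in the encoder. A standalone sketch of the encoding as this patch writes it (register operands reduced to plain integers, i.e. already passed through DecodeReg):

    #include <cassert>
    #include <cstdint>
    #include <cstdio>

    // Mirrors EncodeLoadStoreUnscaled from this patch: size in bits 31:30,
    // fixed 0b111 in bits 29:27, op in bits 23:22, the 9-bit signed offset
    // in bits 20:12, Rn in bits 9:5 and Rt in bits 4:0.
    static uint32_t EncodeLoadStoreUnscaled(uint32_t size, uint32_t op,
                                            uint32_t rt, uint32_t rn, int32_t imm)
    {
        assert(!(imm < -256 || imm > 255) && "offset out of range");
        return (size << 30) | (0b111 << 27) | (op << 22) |
               (((uint32_t)imm & 0x1FF) << 12) | (rn << 5) | rt;
    }

    int main()
    {
        // LDUR W1, [X2, #-4]: size = 2 (32-bit), op = 1 (load) -> 0xB85FC041
        std::printf("%08X\n", EncodeLoadStoreUnscaled(2, 1, 1, 2, -4));
        return 0;
    }

The ARM64FloatEmitter variant added below differs only in setting the V bit (0b1111 << 26 instead of 0b111 << 27) and in folding the 128-bit case into the size/opc fields.
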
--- Source/Core/Common/Arm64Emitter.cpp | 126 ++++++++++++++++++++++++++++ Source/Core/Common/Arm64Emitter.h | 17 ++++ 2 files changed, 143 insertions(+) diff --git a/Source/Core/Common/Arm64Emitter.cpp b/Source/Core/Common/Arm64Emitter.cpp index 741bdf8620..b8a7ee78a8 100644 --- a/Source/Core/Common/Arm64Emitter.cpp +++ b/Source/Core/Common/Arm64Emitter.cpp @@ -540,6 +540,15 @@ void ARM64XEmitter::EncodeAddressInst(u32 op, ARM64Reg Rd, s32 imm) ((imm & 0x1FFFFC) << 3) | Rd); } +void ARM64XEmitter::EncodeLoadStoreUnscaled(u32 size, u32 op, ARM64Reg Rt, ARM64Reg Rn, s32 imm) +{ + _assert_msg_(DYNA_REC, !(imm < -256 || imm > 255), "%s received too large offset: %d", __FUNCTION__, imm); + Rt = DecodeReg(Rt); + Rn = DecodeReg(Rn); + + Write32((size << 30) | (0b111 << 27) | (op << 22) | ((imm & 0x1FF) << 12) | (Rn << 5) | Rt); +} + // FixupBranch branching void ARM64XEmitter::SetJumpTarget(FixupBranch const& branch) { @@ -1424,6 +1433,45 @@ void ARM64XEmitter::PRFM(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm) EncodeLoadStoreRegisterOffset(3, 2, Rt, Rn, Rm); } +// Load/Store register (unscaled offset) +void ARM64XEmitter::STURB(ARM64Reg Rt, ARM64Reg Rn, s32 imm) +{ + EncodeLoadStoreUnscaled(0, 0, Rt, Rn, imm); +} +void ARM64XEmitter::LDURB(ARM64Reg Rt, ARM64Reg Rn, s32 imm) +{ + EncodeLoadStoreUnscaled(0, 1, Rt, Rn, imm); +} +void ARM64XEmitter::LDURSB(ARM64Reg Rt, ARM64Reg Rn, s32 imm) +{ + EncodeLoadStoreUnscaled(0, Is64Bit(Rt) ? 2 : 3, Rt, Rn, imm); +} +void ARM64XEmitter::STURH(ARM64Reg Rt, ARM64Reg Rn, s32 imm) +{ + EncodeLoadStoreUnscaled(1, 0, Rt, Rn, imm); +} +void ARM64XEmitter::LDURH(ARM64Reg Rt, ARM64Reg Rn, s32 imm) +{ + EncodeLoadStoreUnscaled(1, 1, Rt, Rn, imm); +} +void ARM64XEmitter::LDURSH(ARM64Reg Rt, ARM64Reg Rn, s32 imm) +{ + EncodeLoadStoreUnscaled(1, Is64Bit(Rt) ? 2 : 3, Rt, Rn, imm); +} +void ARM64XEmitter::STUR(ARM64Reg Rt, ARM64Reg Rn, s32 imm) +{ + EncodeLoadStoreUnscaled(Is64Bit(Rt) ? 3 : 2, 0, Rt, Rn, imm); +} +void ARM64XEmitter::LDUR(ARM64Reg Rt, ARM64Reg Rn, s32 imm) +{ + EncodeLoadStoreUnscaled(Is64Bit(Rt) ? 
3 : 2, 1, Rt, Rn, imm); +} +void ARM64XEmitter::LDURSW(ARM64Reg Rt, ARM64Reg Rn, s32 imm) +{ + _assert_msg_(DYNA_REC, !Is64Bit(Rt), "%s must have a 64bit destination register!", __FUNCTION__); + EncodeLoadStoreUnscaled(2, 2, Rt, Rn, imm); +} + // Address of label/page PC-relative void ARM64XEmitter::ADR(ARM64Reg Rd, s32 imm) { @@ -1866,6 +1914,15 @@ void ARM64FloatEmitter::EmitVectorxElement(bool U, u32 size, bool L, u32 opcode, (Rm << 16) | (opcode << 12) | (H << 11) | (Rn << 5) | Rd); } +void ARM64FloatEmitter::EmitLoadStoreUnscaled(u32 size, u32 op, ARM64Reg Rt, ARM64Reg Rn, s32 imm) +{ + _assert_msg_(DYNA_REC, !(imm < -256 || imm > 255), "%s received too large offset: %d", __FUNCTION__, imm); + Rt = DecodeReg(Rt); + Rn = DecodeReg(Rn); + + Write32((size << 30) | (0b1111 << 26) | (op << 22) | ((imm & 0x1FF) << 12) | (Rn << 5) | Rt); +} + void ARM64FloatEmitter::LDR(u8 size, IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm) { EmitLoadStoreImmediate(size, 1, type, Rt, Rn, imm); @@ -1875,6 +1932,75 @@ void ARM64FloatEmitter::STR(u8 size, IndexType type, ARM64Reg Rt, ARM64Reg Rn, s EmitLoadStoreImmediate(size, 0, type, Rt, Rn, imm); } +// Loadstore unscaled +void ARM64FloatEmitter::LDUR(u8 size, ARM64Reg Rt, ARM64Reg Rn, s32 imm) +{ + u32 encoded_size = 0; + u32 encoded_op = 0; + + if (size == 8) + { + encoded_size = 0; + encoded_op = 1; + } + else if (size == 16) + { + encoded_size = 1; + encoded_op = 1; + } + else if (size == 32) + { + encoded_size = 2; + encoded_op = 1; + } + else if (size == 64) + { + encoded_size = 3; + encoded_op = 1; + } + else if (size == 128) + { + encoded_size = 0; + encoded_op = 3; + } + + EmitLoadStoreUnscaled(encoded_size, encoded_op, Rt, Rn, imm); +} +void ARM64FloatEmitter::STUR(u8 size, ARM64Reg Rt, ARM64Reg Rn, s32 imm) +{ + u32 encoded_size = 0; + u32 encoded_op = 0; + + if (size == 8) + { + encoded_size = 0; + encoded_op = 0; + } + else if (size == 16) + { + encoded_size = 1; + encoded_op = 0; + } + else if (size == 32) + { + encoded_size = 2; + encoded_op = 0; + } + else if (size == 64) + { + encoded_size = 3; + encoded_op = 0; + } + else if (size == 128) + { + encoded_size = 0; + encoded_op = 2; + } + + EmitLoadStoreUnscaled(encoded_size, encoded_op, Rt, Rn, imm); + +} + // Loadstore single structure void ARM64FloatEmitter::LD1(u8 size, ARM64Reg Rt, u8 index, ARM64Reg Rn) { diff --git a/Source/Core/Common/Arm64Emitter.h b/Source/Core/Common/Arm64Emitter.h index 7bcbe4a45a..bb3bf770ce 100644 --- a/Source/Core/Common/Arm64Emitter.h +++ b/Source/Core/Common/Arm64Emitter.h @@ -334,6 +334,7 @@ private: void EncodeLogicalImmInst(u32 op, ARM64Reg Rd, ARM64Reg Rn, u32 immr, u32 imms); void EncodeLoadStorePair(u32 op, u32 load, IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm); void EncodeAddressInst(u32 op, ARM64Reg Rd, s32 imm); + void EncodeLoadStoreUnscaled(u32 size, u32 op, ARM64Reg Rt, ARM64Reg Rn, s32 imm); protected: inline void Write32(u32 value) @@ -585,6 +586,17 @@ public: void LDRSW(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm); void PRFM(ARM64Reg Rt, ARM64Reg Rn, ArithOption Rm); + // Load/Store register (unscaled offset) + void STURB(ARM64Reg Rt, ARM64Reg Rn, s32 imm); + void LDURB(ARM64Reg Rt, ARM64Reg Rn, s32 imm); + void LDURSB(ARM64Reg Rt, ARM64Reg Rn, s32 imm); + void STURH(ARM64Reg Rt, ARM64Reg Rn, s32 imm); + void LDURH(ARM64Reg Rt, ARM64Reg Rn, s32 imm); + void LDURSH(ARM64Reg Rt, ARM64Reg Rn, s32 imm); + void STUR(ARM64Reg Rt, ARM64Reg Rn, s32 imm); + void LDUR(ARM64Reg Rt, ARM64Reg Rn, s32 imm); + void LDURSW(ARM64Reg Rt, 
ARM64Reg Rn, s32 imm); + // Load/Store pair void LDP(IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm); void LDPSW(IndexType type, ARM64Reg Rt, ARM64Reg Rt2, ARM64Reg Rn, s32 imm); @@ -633,6 +645,10 @@ public: void LDR(u8 size, IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm); void STR(u8 size, IndexType type, ARM64Reg Rt, ARM64Reg Rn, s32 imm); + // Loadstore unscaled + void LDUR(u8 size, ARM64Reg Rt, ARM64Reg Rn, s32 imm); + void STUR(u8 size, ARM64Reg Rt, ARM64Reg Rn, s32 imm); + // Loadstore single structure void LD1(u8 size, ARM64Reg Rt, u8 index, ARM64Reg Rn); void LD1(u8 size, ARM64Reg Rt, u8 index, ARM64Reg Rn, ARM64Reg Rm); @@ -756,6 +772,7 @@ private: void EmitLoadStoreMultipleStructure(u32 size, bool L, u32 opcode, ARM64Reg Rt, ARM64Reg Rn); void EmitScalar1Source(bool M, bool S, u32 type, u32 opcode, ARM64Reg Rd, ARM64Reg Rn); void EmitVectorxElement(bool U, u32 size, bool L, u32 opcode, bool H, ARM64Reg Rd, ARM64Reg Rn, ARM64Reg Rm); + void EmitLoadStoreUnscaled(u32 size, u32 op, ARM64Reg Rt, ARM64Reg Rn, s32 imm); }; class ARM64CodeBlock : public CodeBlock From ed008c3a695e1f510597604d443264de143afc59 Mon Sep 17 00:00:00 2001 From: Ryan Houdek Date: Mon, 16 Feb 2015 22:01:07 -0600 Subject: [PATCH 12/12] [AArch64] Change the vertex loader over to using unscaled loadstores. In nearly all direct loadstore cases we can use unscaled loadstores. Still have a fallback in case we hit a situation that we /can't/ do a unscaled loadstore. --- Source/Core/VideoCommon/VertexLoaderARM64.cpp | 100 ++++++++++++++---- Source/Core/VideoCommon/VertexLoaderARM64.h | 2 +- 2 files changed, 78 insertions(+), 24 deletions(-) diff --git a/Source/Core/VideoCommon/VertexLoaderARM64.cpp b/Source/Core/VideoCommon/VertexLoaderARM64.cpp index 8aa30a38f7..4a404e9a75 100644 --- a/Source/Core/VideoCommon/VertexLoaderARM64.cpp +++ b/Source/Core/VideoCommon/VertexLoaderARM64.cpp @@ -2,7 +2,6 @@ // Licensed under GPLv2 // Refer to the license.txt file included. -#include "Core/PowerPC/JitArm64/Jit.h" #include "VideoCommon/VertexLoaderARM64.h" using namespace Arm64Gen; @@ -78,26 +77,40 @@ void VertexLoaderARM64::GetVertexAddr(int array, u64 attribute, ARM64Reg reg) s32 VertexLoaderARM64::GetAddressImm(int array, u64 attribute, Arm64Gen::ARM64Reg reg, u32 align) { - if (attribute & MASK_INDEXED || (m_src_ofs & (align - 1))) + if (attribute & MASK_INDEXED || + (m_src_ofs > 255 && (m_src_ofs & (align - 1)))) GetVertexAddr(array, attribute, reg); else return m_src_ofs; return -1; } -int VertexLoaderARM64::ReadVertex(u64 attribute, int format, int count_in, int count_out, bool dequantize, u8 scaling_exponent, AttributeFormat* native_format) +int VertexLoaderARM64::ReadVertex(u64 attribute, int format, int count_in, int count_out, bool dequantize, u8 scaling_exponent, AttributeFormat* native_format, s32 offset) { ARM64Reg coords = count_in == 3 ? Q31 : D31; ARM64Reg scale = count_in == 3 ? Q30 : D30; int elem_size = 1 << (format / 2); int load_bytes = elem_size * count_in; + int load_size = load_bytes == 1 ? 1 : load_bytes <= 2 ? 2 : load_bytes <= 4 ? 4 : load_bytes <= 8 ? 
8 : 16; + load_size <<= 3; elem_size <<= 3; - if (count_in == 1) - m_float_emit.LDR(elem_size, INDEX_UNSIGNED, coords, EncodeRegTo64(scratch1_reg), 0); + if (offset == -1) + { + if (count_in == 1) + m_float_emit.LDR(elem_size, INDEX_UNSIGNED, coords, EncodeRegTo64(scratch1_reg), 0); + else + m_float_emit.LD1(elem_size, 1, coords, EncodeRegTo64(scratch1_reg)); + } + else if (offset & (load_size - 1)) // Not aligned - unscaled + { + m_float_emit.LDUR(load_size, coords, src_reg, offset); + } else - m_float_emit.LD1(elem_size, 1, coords, EncodeRegTo64(scratch1_reg)); + { + m_float_emit.LDR(load_size, INDEX_UNSIGNED, coords, src_reg, offset); + } if (format != FORMAT_FLOAT) { @@ -137,10 +150,13 @@ int VertexLoaderARM64::ReadVertex(u64 attribute, int format, int count_in, int c const u32 write_size = count_out == 3 ? 128 : count_out * 32; const u32 mask = count_out == 3 ? 0xF : count_out == 2 ? 0x7 : 0x3; - if (!(m_dst_ofs & mask)) + if (m_dst_ofs < 256) + { + m_float_emit.STUR(write_size, coords, dst_reg, m_dst_ofs); + } + else if (!(m_dst_ofs & mask)) { m_float_emit.STR(write_size, INDEX_UNSIGNED, coords, dst_reg, m_dst_ofs); - } else { @@ -171,8 +187,10 @@ void VertexLoaderARM64::ReadColor(u64 attribute, int format, s32 offset) case FORMAT_32B_8888: if (offset == -1) LDR(INDEX_UNSIGNED, scratch2_reg, EncodeRegTo64(scratch1_reg), 0); + else if (offset & 3) // Not aligned - unscaled + LDUR(scratch2_reg, src_reg, offset); else - LDR(INDEX_UNSIGNED, scratch2_reg, src_reg, m_src_ofs); + LDR(INDEX_UNSIGNED, scratch2_reg, src_reg, offset); if (format != FORMAT_32B_8888) ORR(scratch2_reg, scratch2_reg, 8, 7); // 0xFF000000 @@ -185,8 +203,10 @@ void VertexLoaderARM64::ReadColor(u64 attribute, int format, s32 offset) // AAAAAAAA BBBBBBBB GGGGGGGG RRRRRRRR if (offset == -1) LDRH(INDEX_UNSIGNED, scratch3_reg, EncodeRegTo64(scratch1_reg), 0); + else if (offset & 1) // Not aligned - unscaled + LDURH(scratch2_reg, src_reg, offset); else - LDRH(INDEX_UNSIGNED, scratch3_reg, src_reg, m_src_ofs); + LDRH(INDEX_UNSIGNED, scratch3_reg, src_reg, offset); REV16(scratch3_reg, scratch3_reg); @@ -220,8 +240,10 @@ void VertexLoaderARM64::ReadColor(u64 attribute, int format, s32 offset) // AAAAAAAA BBBBBBBB GGGGGGGG RRRRRRRR if (offset == -1) LDRH(INDEX_UNSIGNED, scratch3_reg, EncodeRegTo64(scratch1_reg), 0); + else if (offset & 1) // Not aligned - unscaled + LDURH(scratch2_reg, src_reg, offset); else - LDRH(INDEX_UNSIGNED, scratch3_reg, src_reg, m_src_ofs); + LDRH(INDEX_UNSIGNED, scratch3_reg, src_reg, offset); // R UBFM(scratch1_reg, scratch3_reg, 4, 7); @@ -250,6 +272,8 @@ void VertexLoaderARM64::ReadColor(u64 attribute, int format, s32 offset) // AAAAAAAA BBBBBBBB GGGGGGGG RRRRRRRR if (offset == -1) LDR(INDEX_UNSIGNED, scratch3_reg, EncodeRegTo64(scratch1_reg), 0); + else if (offset & 3) // Not aligned - unscaled + LDUR(scratch2_reg, src_reg, offset); else LDR(INDEX_UNSIGNED, scratch3_reg, src_reg, m_src_ofs); @@ -338,27 +362,47 @@ void VertexLoaderARM64::GenerateVertexLoader() texmatidx_ofs[i] = m_src_ofs++; } - GetVertexAddr(ARRAY_POSITION, m_VtxDesc.Position, EncodeRegTo64(scratch1_reg)); - ReadVertex(m_VtxDesc.Position, m_VtxAttr.PosFormat, m_VtxAttr.PosElements + 2, 3, - m_VtxAttr.ByteDequant, m_VtxAttr.PosFrac, &m_native_vtx_decl.position); + // Position + { + int elem_size = 1 << (m_VtxAttr.PosFormat / 2); + int load_bytes = elem_size * (m_VtxAttr.PosElements + 2); + int load_size = load_bytes == 1 ? 1 : load_bytes <= 2 ? 2 : load_bytes <= 4 ? 4 : load_bytes <= 8 ? 
8 : 16; + load_size <<= 3; + + s32 offset = GetAddressImm(ARRAY_POSITION, m_VtxDesc.Position, EncodeRegTo64(scratch1_reg), load_size); + ReadVertex(m_VtxDesc.Position, m_VtxAttr.PosFormat, m_VtxAttr.PosElements + 2, 3, + m_VtxAttr.ByteDequant, m_VtxAttr.PosFrac, &m_native_vtx_decl.position, offset); + } if (m_VtxDesc.Normal) { static const u8 map[8] = {7, 6, 15, 14}; u8 scaling_exponent = map[m_VtxAttr.NormalFormat]; + s32 offset = -1; for (int i = 0; i < (m_VtxAttr.NormalElements ? 3 : 1); i++) { if (!i || m_VtxAttr.NormalIndex3) { - GetVertexAddr(ARRAY_NORMAL, m_VtxDesc.Normal, EncodeRegTo64(scratch1_reg)); int elem_size = 1 << (m_VtxAttr.NormalFormat / 2); - ADD(EncodeRegTo64(scratch1_reg), EncodeRegTo64(scratch1_reg), i * elem_size * 3); + + int load_bytes = elem_size * 3; + int load_size = load_bytes == 1 ? 1 : load_bytes <= 2 ? 2 : load_bytes <= 4 ? 4 : load_bytes <= 8 ? 8 : 16; + + offset = GetAddressImm(ARRAY_NORMAL, m_VtxDesc.Normal, EncodeRegTo64(scratch1_reg), load_size << 3); + + if (offset == -1) + ADD(EncodeRegTo64(scratch1_reg), EncodeRegTo64(scratch1_reg), i * elem_size * 3); + else + offset += i * elem_size * 3; } int bytes_read = ReadVertex(m_VtxDesc.Normal, m_VtxAttr.NormalFormat, 3, 3, - true, scaling_exponent, &m_native_vtx_decl.normals[i]); + true, scaling_exponent, &m_native_vtx_decl.normals[i], offset); - ADD(EncodeRegTo64(scratch1_reg), EncodeRegTo64(scratch1_reg), bytes_read); + if (offset == -1) + ADD(EncodeRegTo64(scratch1_reg), EncodeRegTo64(scratch1_reg), bytes_read); + else + offset += bytes_read; } m_native_components |= VB_HAS_NRM0; @@ -407,10 +451,16 @@ void VertexLoaderARM64::GenerateVertexLoader() if (tc[i]) { m_native_components |= VB_HAS_UV0 << i; - GetVertexAddr(ARRAY_TEXCOORD0 + i, tc[i], EncodeRegTo64(scratch1_reg)); + + int elem_size = 1 << (m_VtxAttr.texCoord[i].Format / 2); + int load_bytes = elem_size * (elements + 2); + int load_size = load_bytes == 1 ? 1 : load_bytes <= 2 ? 2 : load_bytes <= 4 ? 4 : load_bytes <= 8 ? 8 : 16; + load_size <<= 3; + + s32 offset = GetAddressImm(ARRAY_TEXCOORD0 + i, tc[i], EncodeRegTo64(scratch1_reg), load_size); u8 scaling_exponent = m_VtxAttr.texCoord[i].Frac; ReadVertex(tc[i], m_VtxAttr.texCoord[i].Format, elements, tm[i] ? 
2 : elements, - m_VtxAttr.ByteDequant, scaling_exponent, &m_native_vtx_decl.texcoords[i]); + m_VtxAttr.ByteDequant, scaling_exponent, &m_native_vtx_decl.texcoords[i], offset); } if (tm[i]) { @@ -432,20 +482,24 @@ void VertexLoaderARM64::GenerateVertexLoader() { m_native_vtx_decl.texcoords[i].offset = m_dst_ofs; - if (!(m_dst_ofs & 7)) + if (m_dst_ofs < 256) + { + STUR(SP, dst_reg, m_dst_ofs); + } + else if (!(m_dst_ofs & 7)) { // If m_dst_ofs isn't 8byte aligned we can't store an 8byte zero register // So store two 4byte zero registers // The destination is always 4byte aligned STR(INDEX_UNSIGNED, WSP, dst_reg, m_dst_ofs); STR(INDEX_UNSIGNED, WSP, dst_reg, m_dst_ofs + 4); - m_float_emit.STR(32, INDEX_UNSIGNED, D31, dst_reg, m_dst_ofs + 8); } else { STR(INDEX_UNSIGNED, SP, dst_reg, m_dst_ofs); - m_float_emit.STR(32, INDEX_UNSIGNED, D31, dst_reg, m_dst_ofs + 8); } + m_float_emit.STR(32, INDEX_UNSIGNED, D31, dst_reg, m_dst_ofs + 8); + m_dst_ofs += sizeof(float) * 3; } } diff --git a/Source/Core/VideoCommon/VertexLoaderARM64.h b/Source/Core/VideoCommon/VertexLoaderARM64.h index 5dc0544adf..4a6a389201 100644 --- a/Source/Core/VideoCommon/VertexLoaderARM64.h +++ b/Source/Core/VideoCommon/VertexLoaderARM64.h @@ -23,7 +23,7 @@ private: Arm64Gen::ARM64FloatEmitter m_float_emit; void GetVertexAddr(int array, u64 attribute, Arm64Gen::ARM64Reg reg); s32 GetAddressImm(int array, u64 attribute, Arm64Gen::ARM64Reg reg, u32 align); - int ReadVertex(u64 attribute, int format, int count_in, int count_out, bool dequantize, u8 scaling_exponent, AttributeFormat* native_format); + int ReadVertex(u64 attribute, int format, int count_in, int count_out, bool dequantize, u8 scaling_exponent, AttributeFormat* native_format, s32 offset = -1); void ReadColor(u64 attribute, int format, s32 offset); void GenerateVertexLoader(); };
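
Taken together, patch 12 picks one of three addressing forms for each direct attribute load: the scaled unsigned-offset LDR when the immediate offset is suitably aligned, the new unscaled LDUR when it is not, and an address materialised into a scratch register for indexed attributes or offsets that GetAddressImm rejects (stores to the output buffer likewise use STUR while m_dst_ofs stays below 256). A simplified standalone paraphrase of that decision follows; the enum and function names are invented for illustration, and the signed 9-bit range check is omitted here because in the patch GetAddressImm already falls back to a register address for large misaligned offsets.

  #include <cstdio>

  enum class AddrForm { RegisterAddress, UnscaledImm, ScaledUnsignedImm };

  // offset: byte offset from the source pointer, or -1 when the address was
  //         computed into a scratch register (indexed attributes, fallback).
  // align : alignment required for the scaled unsigned-offset form, in bytes.
  static AddrForm PickAddrForm(int offset, int align)
  {
      if (offset < 0)
          return AddrForm::RegisterAddress;    // address already in a register
      if (offset & (align - 1))
          return AddrForm::UnscaledImm;        // LDUR/STUR, imm in [-256, 255]
      return AddrForm::ScaledUnsignedImm;      // LDR/STR with unsigned, scaled imm
  }

  int main()
  {
      printf("%d\n", static_cast<int>(PickAddrForm(-1, 4)));  // register address
      printf("%d\n", static_cast<int>(PickAddrForm(6, 4)));   // unscaled
      printf("%d\n", static_cast<int>(PickAddrForm(8, 4)));   // scaled unsigned
  }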