From 6f34b27323c56b49886de67f937b145e23ff2554 Mon Sep 17 00:00:00 2001 From: degasus Date: Fri, 26 Jun 2015 13:16:38 +0200 Subject: [PATCH 1/3] Jit64: implement dcbf + dcbi --- Source/Core/Core/HW/DSP.cpp | 18 +++++- Source/Core/Core/HW/DSP.h | 3 + .../Interpreter/Interpreter_LoadStore.cpp | 13 +--- Source/Core/Core/PowerPC/Jit64/Jit.h | 2 + .../Core/Core/PowerPC/Jit64/Jit64_Tables.cpp | 10 +-- .../Core/Core/PowerPC/Jit64/Jit_LoadStore.cpp | 64 +++++++++++++++++++ Source/Core/Core/PowerPC/JitCommon/JitCache.h | 7 +- 7 files changed, 98 insertions(+), 19 deletions(-) diff --git a/Source/Core/Core/HW/DSP.cpp b/Source/Core/Core/HW/DSP.cpp index 5ad1f2d143..8935139cef 100644 --- a/Source/Core/Core/HW/DSP.cpp +++ b/Source/Core/Core/HW/DSP.cpp @@ -145,12 +145,12 @@ struct ARAMInfo // STATE_TO_SAVE static ARAMInfo g_ARAM; -static UDSPControl g_dspState; static AudioDMA g_audioDMA; static ARAM_DMA g_arDMA; static u32 last_mmaddr; static u32 last_aram_dma_count; static bool instant_dma; +UDSPControl g_dspState; union ARAM_Info { @@ -216,6 +216,22 @@ void EnableInstantDMA() instant_dma = true; } +void FlushInstantDMA(u32 address) +{ + u64 dma_in_progress = DSP::DMAInProgress(); + if (dma_in_progress != 0) + { + u32 start_addr = (dma_in_progress >> 32) & Memory::RAM_MASK; + u32 end_addr = (dma_in_progress & Memory::RAM_MASK) & 0xffffffff; + u32 invalidated_addr = (address & Memory::RAM_MASK) & ~0x1f; + + if (invalidated_addr >= start_addr && invalidated_addr <= end_addr) + { + DSP::EnableInstantDMA(); + } + } +} + DSPEmulator *GetDSPEmulator() { return dsp_emulator; diff --git a/Source/Core/Core/HW/DSP.h b/Source/Core/Core/HW/DSP.h index 6245c435e0..2152cb41c6 100644 --- a/Source/Core/Core/HW/DSP.h +++ b/Source/Core/Core/HW/DSP.h @@ -56,6 +56,8 @@ union UDSPControl UDSPControl(u16 _Hex = 0) : Hex(_Hex) {} }; +extern UDSPControl g_dspState; + void Init(bool hle); void Shutdown(); @@ -78,5 +80,6 @@ void UpdateAudioDMA(); void UpdateDSPSlice(int cycles); u64 DMAInProgress(); 
void EnableInstantDMA(); +void FlushInstantDMA(u32 address); }// end of namespace DSP diff --git a/Source/Core/Core/PowerPC/Interpreter/Interpreter_LoadStore.cpp b/Source/Core/Core/PowerPC/Interpreter/Interpreter_LoadStore.cpp index 2bd39cafec..43e89a54e3 100644 --- a/Source/Core/Core/PowerPC/Interpreter/Interpreter_LoadStore.cpp +++ b/Source/Core/Core/PowerPC/Interpreter/Interpreter_LoadStore.cpp @@ -340,18 +340,7 @@ void Interpreter::dcbi(UGeckoInstruction _inst) // The following detects a situation where the game is writing to the dcache at the address being DMA'd. As we do not // have dcache emulation, invalid data is being DMA'd causing audio glitches. The following code detects this and // enables the DMA to complete instantly before the invalid data is written. Resident Evil 2 & 3 trigger this. - u64 dma_in_progress = DSP::DMAInProgress(); - if (dma_in_progress != 0) - { - u32 start_addr = (dma_in_progress >> 32) & Memory::RAM_MASK; - u32 end_addr = (dma_in_progress & Memory::RAM_MASK) & 0xffffffff; - u32 invalidated_addr = (address & Memory::RAM_MASK) & ~0x1f; - - if (invalidated_addr >= start_addr && invalidated_addr <= end_addr) - { - DSP::EnableInstantDMA(); - } - } + DSP::FlushInstantDMA(address); } void Interpreter::dcbst(UGeckoInstruction _inst) diff --git a/Source/Core/Core/PowerPC/Jit64/Jit.h b/Source/Core/Core/PowerPC/Jit64/Jit.h index a4449ae256..235713108c 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit.h +++ b/Source/Core/Core/PowerPC/Jit64/Jit.h @@ -255,4 +255,6 @@ public: void lmw(UGeckoInstruction inst); void stmw(UGeckoInstruction inst); + + void dcbx(UGeckoInstruction inst); }; diff --git a/Source/Core/Core/PowerPC/Jit64/Jit64_Tables.cpp b/Source/Core/Core/PowerPC/Jit64/Jit64_Tables.cpp index 90d9405601..674c768f85 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit64_Tables.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit64_Tables.cpp @@ -214,11 +214,11 @@ static GekkoOPTemplate table31[] = {824, &Jit64::srawix}, // srawix {24, &Jit64::slwx}, // 
slwx - {54, &Jit64::FallBackToInterpreter}, // dcbst - {86, &Jit64::FallBackToInterpreter}, // dcbf - {246, &Jit64::dcbt }, // dcbtst - {278, &Jit64::dcbt }, // dcbt - {470, &Jit64::FallBackToInterpreter}, // dcbi + {54, &Jit64::dcbx}, // dcbst + {86, &Jit64::dcbx}, // dcbf + {246, &Jit64::dcbt}, // dcbtst + {278, &Jit64::dcbt}, // dcbt + {470, &Jit64::dcbx}, // dcbi {758, &Jit64::DoNothing}, // dcba {1014, &Jit64::dcbz}, // dcbz diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_LoadStore.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_LoadStore.cpp index d3bd0aa588..a7db1e6dce 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_LoadStore.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_LoadStore.cpp @@ -7,6 +7,8 @@ #include "Common/CommonTypes.h" +#include "Core/HW/DSP.h" +#include "Core/PowerPC/JitInterface.h" #include "Core/PowerPC/Jit64/Jit.h" #include "Core/PowerPC/Jit64/JitAsm.h" #include "Core/PowerPC/Jit64/JitRegCache.h" @@ -290,6 +292,68 @@ void Jit64::lXXx(UGeckoInstruction inst) gpr.UnlockAllX(); } +void Jit64::dcbx(UGeckoInstruction inst) +{ + INSTRUCTION_START + JITDISABLE(bJITLoadStoreOff); + + X64Reg addr = RSCRATCH; + X64Reg value = RSCRATCH2; + X64Reg tmp = ECX; + + PUSH(tmp); + + MOV(32, R(addr), gpr.R(inst.RB)); + if (inst.RA) + { + ADD(32, R(addr), gpr.R(inst.RA)); + } + + MOV(32, R(value), R(addr)); + SHL(32, R(value), Imm8(3)); + SHR(32, R(value), Imm8(13)); + MOV(64, R(tmp), ImmPtr(jit->GetBlockCache()->GetBlockBitSet())); + MOV(32, R(value), MComplex(tmp, value, SCALE_4, 0)); + + MOV(32, R(tmp), R(addr)); + SHR(32, R(tmp), Imm8(5)); + SHR(32, R(value), R(tmp)); + TEST(32, R(value), Imm32(1)); + + FixupBranch c = J_CC(CC_NZ, true); + SwitchToFarCode(); + SetJumpTarget(c); + BitSet32 registersInUse = CallerSavedRegistersInUse(); + ABI_PushRegistersAndAdjustStack(registersInUse, 0); + MOV(32, R(ABI_PARAM1), R(addr)); + MOV(32, R(ABI_PARAM2), Imm32(32)); + XOR(32, R(ABI_PARAM3), R(ABI_PARAM3)); + ABI_CallFunction((void*)JitInterface::InvalidateICache); + 
ABI_PopRegistersAndAdjustStack(registersInUse, 0); + c = J(true); + SwitchToNearCode(); + SetJumpTarget(c); + + // dcbi + if (inst.SUBOP10 == 470) + { + MOV(16, R(tmp), M(&DSP::g_dspState)); + TEST(16, R(tmp), Imm16(1 << 9)); + c = J_CC(CC_NZ, true); + SwitchToFarCode(); + SetJumpTarget(c); + ABI_PushRegistersAndAdjustStack(registersInUse, 0); + MOV(32, R(ABI_PARAM1), R(addr)); + ABI_CallFunction((void*)DSP::FlushInstantDMA); + ABI_PopRegistersAndAdjustStack(registersInUse, 0); + c = J(true); + SwitchToNearCode(); + SetJumpTarget(c); + } + + POP(tmp); +} + void Jit64::dcbt(UGeckoInstruction inst) { INSTRUCTION_START diff --git a/Source/Core/Core/PowerPC/JitCommon/JitCache.h b/Source/Core/Core/PowerPC/JitCommon/JitCache.h index 7e310f6a35..47408f6c7c 100644 --- a/Source/Core/Core/PowerPC/JitCommon/JitCache.h +++ b/Source/Core/Core/PowerPC/JitCommon/JitCache.h @@ -60,6 +60,7 @@ typedef void (*CompiledCode)(); // implementation of std::bitset is slow. class ValidBlockBitSet final { +public: enum { VALID_BLOCK_MASK_SIZE = 0x20000000 / 32, @@ -67,7 +68,6 @@ class ValidBlockBitSet final }; std::unique_ptr<u32[]> m_valid_block; -public: ValidBlockBitSet() { m_valid_block.reset(new u32[VALID_BLOCK_ALLOC_ELEMENTS]); @@ -157,6 +157,11 @@ public: // DOES NOT WORK CORRECTLY WITH INLINING void InvalidateICache(u32 address, const u32 length, bool forced); + + u32* GetBlockBitSet() const + { + return valid_block.m_valid_block.get(); + } }; // x86 BlockCache From ac84d6d0fa7c47543db8e6704c23583c051cc89c Mon Sep 17 00:00:00 2001 From: Tillmann Karras Date: Thu, 6 Aug 2015 22:45:38 +0200 Subject: [PATCH 2/3] Jit64: some cache flush changes - dynamically allocate third scratch register instead of forcing ECX - use LEA as 3 operand add if possible - use BT,JC instead of SHR,TEST,JNZ - merge MOV,TEST - use appropriate ABI function (no asm change) --- .../Core/Core/PowerPC/Jit64/Jit_LoadStore.cpp | 29 ++++++++++--------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git 
a/Source/Core/Core/PowerPC/Jit64/Jit_LoadStore.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_LoadStore.cpp index a7db1e6dce..c3f89a9002 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_LoadStore.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_LoadStore.cpp @@ -299,14 +299,18 @@ void Jit64::dcbx(UGeckoInstruction inst) X64Reg addr = RSCRATCH; X64Reg value = RSCRATCH2; - X64Reg tmp = ECX; + X64Reg tmp = gpr.GetFreeXReg(); + gpr.FlushLockX(tmp); - PUSH(tmp); - - MOV(32, R(addr), gpr.R(inst.RB)); - if (inst.RA) + if (inst.RA && gpr.R(inst.RA).IsSimpleReg() && gpr.R(inst.RB).IsSimpleReg()) { - ADD(32, R(addr), gpr.R(inst.RA)); + LEA(32, addr, MRegSum(gpr.RX(inst.RA), gpr.RX(inst.RB))); + } + else + { + MOV(32, R(addr), gpr.R(inst.RB)); + if (inst.RA) + ADD(32, R(addr), gpr.R(inst.RA)); } MOV(32, R(value), R(addr)); @@ -317,10 +321,9 @@ void Jit64::dcbx(UGeckoInstruction inst) MOV(32, R(tmp), R(addr)); SHR(32, R(tmp), Imm8(5)); - SHR(32, R(value), R(tmp)); - TEST(32, R(value), Imm32(1)); + BT(32, R(value), R(tmp)); - FixupBranch c = J_CC(CC_NZ, true); + FixupBranch c = J_CC(CC_C, true); SwitchToFarCode(); SetJumpTarget(c); BitSet32 registersInUse = CallerSavedRegistersInUse(); @@ -337,21 +340,19 @@ void Jit64::dcbx(UGeckoInstruction inst) // dcbi if (inst.SUBOP10 == 470) { - MOV(16, R(tmp), M(&DSP::g_dspState)); - TEST(16, R(tmp), Imm16(1 << 9)); + TEST(16, M(&DSP::g_dspState), Imm16(1 << 9)); c = J_CC(CC_NZ, true); SwitchToFarCode(); SetJumpTarget(c); ABI_PushRegistersAndAdjustStack(registersInUse, 0); - MOV(32, R(ABI_PARAM1), R(addr)); - ABI_CallFunction((void*)DSP::FlushInstantDMA); + ABI_CallFunctionR((void*)DSP::FlushInstantDMA, addr); ABI_PopRegistersAndAdjustStack(registersInUse, 0); c = J(true); SwitchToNearCode(); SetJumpTarget(c); } - POP(tmp); + gpr.UnlockAllX(); } void Jit64::dcbt(UGeckoInstruction inst) From 0d92c8fb89d949f12eecbabf65989b1ebc9e2e37 Mon Sep 17 00:00:00 2001 From: degasus Date: Sat, 8 Aug 2015 17:23:52 +0200 Subject: [PATCH 3/3] Jit64: Optimize dcbx 
--- Source/Core/Core/PowerPC/Jit64/Jit_LoadStore.cpp | 15 ++++++++------- Source/Core/Core/PowerPC/JitCommon/JitCache.h | 1 + 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/Source/Core/Core/PowerPC/Jit64/Jit_LoadStore.cpp b/Source/Core/Core/PowerPC/Jit64/Jit_LoadStore.cpp index c3f89a9002..8e3e1cff11 100644 --- a/Source/Core/Core/PowerPC/Jit64/Jit_LoadStore.cpp +++ b/Source/Core/Core/PowerPC/Jit64/Jit_LoadStore.cpp @@ -313,15 +313,13 @@ void Jit64::dcbx(UGeckoInstruction inst) ADD(32, R(addr), gpr.R(inst.RA)); } - MOV(32, R(value), R(addr)); - SHL(32, R(value), Imm8(3)); - SHR(32, R(value), Imm8(13)); + // Check whether a JIT cache line needs to be invalidated. + LEA(32, value, MScaled(addr, SCALE_8, 0)); // addr << 3 (masks the first 3 bits) + SHR(32, R(value), Imm8(3 + 5 + 5)); // >> 5 for cache line size, >> 5 for width of bitset MOV(64, R(tmp), ImmPtr(jit->GetBlockCache()->GetBlockBitSet())); MOV(32, R(value), MComplex(tmp, value, SCALE_4, 0)); - - MOV(32, R(tmp), R(addr)); - SHR(32, R(tmp), Imm8(5)); - BT(32, R(value), R(tmp)); + SHR(32, R(addr), Imm8(5)); + BT(32, R(value), R(addr)); FixupBranch c = J_CC(CC_C, true); SwitchToFarCode(); @@ -329,6 +327,7 @@ void Jit64::dcbx(UGeckoInstruction inst) BitSet32 registersInUse = CallerSavedRegistersInUse(); ABI_PushRegistersAndAdjustStack(registersInUse, 0); MOV(32, R(ABI_PARAM1), R(addr)); + SHL(32, R(ABI_PARAM1), Imm8(5)); MOV(32, R(ABI_PARAM2), Imm32(32)); XOR(32, R(ABI_PARAM3), R(ABI_PARAM3)); ABI_CallFunction((void*)JitInterface::InvalidateICache); @@ -340,11 +339,13 @@ void Jit64::dcbx(UGeckoInstruction inst) // dcbi if (inst.SUBOP10 == 470) { + // Flush DSP DMA if DMAState bit is set TEST(16, M(&DSP::g_dspState), Imm16(1 << 9)); c = J_CC(CC_NZ, true); SwitchToFarCode(); SetJumpTarget(c); ABI_PushRegistersAndAdjustStack(registersInUse, 0); + SHL(32, R(addr), Imm8(5)); ABI_CallFunctionR((void*)DSP::FlushInstantDMA, addr); ABI_PopRegistersAndAdjustStack(registersInUse, 0); c = J(true); diff --git 
a/Source/Core/Core/PowerPC/JitCommon/JitCache.h b/Source/Core/Core/PowerPC/JitCommon/JitCache.h index 47408f6c7c..af0580ac85 100644 --- a/Source/Core/Core/PowerPC/JitCommon/JitCache.h +++ b/Source/Core/Core/PowerPC/JitCommon/JitCache.h @@ -66,6 +66,7 @@ public: VALID_BLOCK_MASK_SIZE = 0x20000000 / 32, VALID_BLOCK_ALLOC_ELEMENTS = VALID_BLOCK_MASK_SIZE / 32 }; + // Directly accessed by Jit64. std::unique_ptr<u32[]> m_valid_block; ValidBlockBitSet()