From d23af1a15e07a66bf88352facd527bed6df3828c Mon Sep 17 00:00:00 2001 From: hrydgard Date: Thu, 17 Jul 2008 18:51:53 +0000 Subject: [PATCH] New LockThreads option. Also added new INI core option - OptimizeQuantizers. Set to False to work around Resident Evil 1 bug (this will slow down other games somewhat). git-svn-id: https://dolphin-emu.googlecode.com/svn/trunk@20 8ced0084-cf51-0410-be5f-012b33b47a6e --- Externals/zlib/zlib.vcproj | 12 +- Source/Core/Core/Core.vcproj | 4 + Source/Core/Core/Src/Core.cpp | 8 +- Source/Core/Core/Src/CoreParameter.cpp | 1 + Source/Core/Core/Src/CoreParameter.h | 2 + .../Core/Core/Src/PowerPC/Jit64/JitCache.cpp | 2 +- .../Core/Src/PowerPC/Jit64/Jit_LoadStore.cpp | 272 +-------------- .../Src/PowerPC/Jit64/Jit_LoadStorePaired.cpp | 312 ++++++++++++++++++ Source/Core/DebuggerWX/DebuggerWX.vcproj | 14 +- Source/Core/DolphinWX/src/Config.cpp | 10 +- 10 files changed, 358 insertions(+), 279 deletions(-) create mode 100644 Source/Core/Core/Src/PowerPC/Jit64/Jit_LoadStorePaired.cpp diff --git a/Externals/zlib/zlib.vcproj b/Externals/zlib/zlib.vcproj index 6788cc4d94..1790bedfc2 100644 --- a/Externals/zlib/zlib.vcproj +++ b/Externals/zlib/zlib.vcproj @@ -1,7 +1,7 @@ + + diff --git a/Source/Core/Core/Src/Core.cpp b/Source/Core/Core/Src/Core.cpp index f02dc1f8fd..1d99eb666a 100644 --- a/Source/Core/Core/Src/Core.cpp +++ b/Source/Core/Core/Src/Core.cpp @@ -182,7 +182,8 @@ THREAD_RETURN CpuThread(void *pArg) CPUCompare::ConnectAsClient(); } - Common::Thread::SetCurrentThreadAffinity(1); //Force to first core + if (_CoreParameter.bLockThreads) + Common::Thread::SetCurrentThreadAffinity(1); //Force to first core // Let's run under memory watch EMM::InstallExceptionHandler(); @@ -208,7 +209,8 @@ THREAD_RETURN EmuThread(void *pArg) Common::SetCurrentThreadName("Emuthread - starting"); const SCoreStartupParameter& _CoreParameter = *(SCoreStartupParameter*)pArg; - Common::Thread::SetCurrentThreadAffinity(2); //Force to second core + if 
(_CoreParameter.bLockThreads) + Common::Thread::SetCurrentThreadAffinity(2); //Force to second core LOG(OSREPORT, "Starting core = %s mode", _CoreParameter.bWii ? "Wii" : "Gamecube"); LOG(OSREPORT, "Dualcore = %s", _CoreParameter.bUseDualCore ? "Yes" : "No"); @@ -222,7 +224,7 @@ THREAD_RETURN EmuThread(void *pArg) VideoInitialize.pGetMemoryPointer = Memory::GetPointer; VideoInitialize.pSetPEToken = PixelEngine::SetToken; VideoInitialize.pSetPEFinish = PixelEngine::SetFinish; - VideoInitialize.pWindowHandle = _CoreParameter.hMainWindow; // NULL; // filled by video_initialize + VideoInitialize.pWindowHandle = NULL; // _CoreParameter.hMainWindow; // NULL; // filled by video_initialize VideoInitialize.pLog = Callback_VideoLog; VideoInitialize.pRequestWindowSize = NULL; //Callback_VideoRequestWindowSize; VideoInitialize.pCopiedToXFB = Callback_VideoCopiedToXFB; diff --git a/Source/Core/Core/Src/CoreParameter.cpp b/Source/Core/Core/Src/CoreParameter.cpp index b14e170740..0e76213a6d 100644 --- a/Source/Core/Core/Src/CoreParameter.cpp +++ b/Source/Core/Core/Src/CoreParameter.cpp @@ -32,6 +32,7 @@ void SCoreStartupParameter::LoadDefaults() bUseDynarec = false; bUseDualCore = false; bRunCompareServer = false; + bLockThreads = true; bWii = false; } diff --git a/Source/Core/Core/Src/CoreParameter.h b/Source/Core/Core/Src/CoreParameter.h index f8176fd74f..0a2fab16a8 100644 --- a/Source/Core/Core/Src/CoreParameter.h +++ b/Source/Core/Core/Src/CoreParameter.h @@ -41,6 +41,8 @@ struct SCoreStartupParameter bool bHLEBios; bool bThrottle; bool bUseFastMem; + bool bLockThreads; + bool bOptimizeQuantizers; bool bRunCompareServer; bool bRunCompareClient; diff --git a/Source/Core/Core/Src/PowerPC/Jit64/JitCache.cpp b/Source/Core/Core/Src/PowerPC/Jit64/JitCache.cpp index 52040c5da0..a140de146d 100644 --- a/Source/Core/Core/Src/PowerPC/Jit64/JitCache.cpp +++ b/Source/Core/Core/Src/PowerPC/Jit64/JitCache.cpp @@ -62,7 +62,7 @@ namespace Jit64 JitBlock *blocks; int numBlocks; - //stats + 
//stats int numFlushes; void PrintStats() diff --git a/Source/Core/Core/Src/PowerPC/Jit64/Jit_LoadStore.cpp b/Source/Core/Core/Src/PowerPC/Jit64/Jit_LoadStore.cpp index 8851c3d8eb..0f46e4773c 100644 --- a/Source/Core/Core/Src/PowerPC/Jit64/Jit_LoadStore.cpp +++ b/Source/Core/Core/Src/PowerPC/Jit64/Jit_LoadStore.cpp @@ -43,6 +43,9 @@ namespace Jit64 { + static u64 GC_ALIGNED16(temp64); + static u32 GC_ALIGNED16(temp32); + #ifdef _M_X64 void SafeLoadECXtoEAX(int accessSize, s32 offset) { @@ -119,11 +122,6 @@ namespace Jit64 gpr.UnlockAll(); } - void SafeStoreECXtoEDX(int accessSize, int offset) - { - - } - void lXz(UGeckoInstruction inst) { int d = inst.RD; @@ -209,9 +207,6 @@ namespace Jit64 gpr.UnlockAll(); } - u32 GC_ALIGNED16(temp32); - u64 GC_ALIGNED16(temp64); - void lfs(UGeckoInstruction inst) { // BIT32OLD; @@ -227,15 +222,15 @@ namespace Jit64 gpr.Flush(FLUSH_VOLATILE); gpr.Lock(d, a); - MOV(32,R(ECX),gpr.R(a)); + MOV(32,R(ECX), gpr.R(a)); #ifdef _M_X64 if (!jo.noAssumeFPLoadFromMem) { - MOV(32, R(EAX), MComplex(RBX,ECX,SCALE_1,offset)); + MOV(32, R(EAX), MComplex(RBX, ECX, SCALE_1, offset)); //#else // MOV(32, R(EAX), MDisp(ECX, (u32)Memory::GetMainRAMPtr() + (u32)offset)); //#endif - BSWAP(32,EAX); + BSWAP(32, EAX); } else #endif @@ -243,7 +238,7 @@ namespace Jit64 SafeLoadECXtoEAX(32, offset); } - MOV(32,M(&temp32), R(EAX)); + MOV(32, M(&temp32), R(EAX)); fpr.Lock(d); fpr.LoadToX64(d, false); CVTSS2SD(fpr.RX(d), M(&temp32)); @@ -252,7 +247,6 @@ namespace Jit64 fpr.UnlockAll(); } - void lfd(UGeckoInstruction inst) { BIT32OLD; @@ -301,8 +295,6 @@ namespace Jit64 fpr.UnlockAll(); } - double GC_ALIGNED16(psTemp[2]) = {1.0, 1.0}; - void stfs(UGeckoInstruction inst) { BIT32OLD; @@ -364,255 +356,7 @@ namespace Jit64 fpr.UnlockAll(); } - - // TODO(ector): Improve 64-bit version - void WriteDual32(u64 value, u32 address) - { - Memory::Write_U32((u32)(value>>32), address); - Memory::Write_U32((u32)value, address+4); - } - - const double m_quantizeTableD[] = - { - 
(1 << 0), (1 << 1), (1 << 2), (1 << 3), - (1 << 4), (1 << 5), (1 << 6), (1 << 7), - (1 << 8), (1 << 9), (1 << 10), (1 << 11), - (1 << 12), (1 << 13), (1 << 14), (1 << 15), - (1 << 16), (1 << 17), (1 << 18), (1 << 19), - (1 << 20), (1 << 21), (1 << 22), (1 << 23), - (1 << 24), (1 << 25), (1 << 26), (1 << 27), - (1 << 28), (1 << 29), (1 << 30), (1 << 31), - 1.0 / (1ULL << 32), 1.0 / (1 << 31), 1.0 / (1 << 30), 1.0 / (1 << 29), - 1.0 / (1 << 28), 1.0 / (1 << 27), 1.0 / (1 << 26), 1.0 / (1 << 25), - 1.0 / (1 << 24), 1.0 / (1 << 23), 1.0 / (1 << 22), 1.0 / (1 << 21), - 1.0 / (1 << 20), 1.0 / (1 << 19), 1.0 / (1 << 18), 1.0 / (1 << 17), - 1.0 / (1 << 16), 1.0 / (1 << 15), 1.0 / (1 << 14), 1.0 / (1 << 13), - 1.0 / (1 << 12), 1.0 / (1 << 11), 1.0 / (1 << 10), 1.0 / (1 << 9), - 1.0 / (1 << 8), 1.0 / (1 << 7), 1.0 / (1 << 6), 1.0 / (1 << 5), - 1.0 / (1 << 4), 1.0 / (1 << 3), 1.0 / (1 << 2), 1.0 / (1 << 1), - }; - - const double m_dequantizeTableD[] = - { - 1.0 / (1 << 0), 1.0 / (1 << 1), 1.0 / (1 << 2), 1.0 / (1 << 3), - 1.0 / (1 << 4), 1.0 / (1 << 5), 1.0 / (1 << 6), 1.0 / (1 << 7), - 1.0 / (1 << 8), 1.0 / (1 << 9), 1.0 / (1 << 10), 1.0 / (1 << 11), - 1.0 / (1 << 12), 1.0 / (1 << 13), 1.0 / (1 << 14), 1.0 / (1 << 15), - 1.0 / (1 << 16), 1.0 / (1 << 17), 1.0 / (1 << 18), 1.0 / (1 << 19), - 1.0 / (1 << 20), 1.0 / (1 << 21), 1.0 / (1 << 22), 1.0 / (1 << 23), - 1.0 / (1 << 24), 1.0 / (1 << 25), 1.0 / (1 << 26), 1.0 / (1 << 27), - 1.0 / (1 << 28), 1.0 / (1 << 29), 1.0 / (1 << 30), 1.0 / (1 << 31), - (1ULL << 32), (1 << 31), (1 << 30), (1 << 29), - (1 << 28), (1 << 27), (1 << 26), (1 << 25), - (1 << 24), (1 << 23), (1 << 22), (1 << 21), - (1 << 20), (1 << 19), (1 << 18), (1 << 17), - (1 << 16), (1 << 15), (1 << 14), (1 << 13), - (1 << 12), (1 << 11), (1 << 10), (1 << 9), - (1 << 8), (1 << 7), (1 << 6), (1 << 5), - (1 << 4), (1 << 3), (1 << 2), (1 << 1), - }; - - u32 temp; - void psq_st(UGeckoInstruction inst) - { - BIT32OLD; - OLD; - const UGQR gqr(rSPR(SPR_GQR0 + inst.I)); - 
const EQuantizeType stType = static_cast(gqr.ST_TYPE); - int stScale = gqr.ST_SCALE; - bool update = inst.OPCD == 61; - if (!inst.RA || inst.W) - { - // PanicAlert(inst.RA ? "W" : "inst"); - Default(inst); - return; - } - - int offset = inst.SIMM_12; - int a = inst.RA; - int s = inst.RS; // Fp numbers - - if (stType == QUANTIZE_FLOAT) - { - gpr.Flush(FLUSH_VOLATILE); - gpr.Lock(a); - fpr.Lock(s); - if (update) - gpr.LoadToX64(a, true, true); - MOV(32, R(EDX), gpr.R(a)); - if (offset) - ADD(32, R(EDX), Imm32((u32)offset)); - TEST(32, R(EDX), Imm32(0x0C000000)); - if (update && offset) - MOV(32, gpr.R(a), R(EDX)); - CVTPD2PS(XMM0, fpr.R(s)); - SHUFPS(XMM0, R(XMM0), 1); - MOVAPS(M(&temp64), XMM0); - MOV(64, R(ECX), M(&temp64)); - FixupBranch argh = J_CC(CC_NZ); - BSWAP(64, ECX); - MOV(64, MComplex(RBX, EDX, SCALE_1, 0), R(ECX)); - FixupBranch arg2 = J(); - SetJumpTarget(argh); - CALL((void *)&WriteDual32); - SetJumpTarget(arg2); - if (update) - MOV(32, gpr.R(a), R(EDX)); - gpr.UnlockAll(); - fpr.UnlockAll(); - } - else if (stType == QUANTIZE_U8) - { - gpr.Flush(FLUSH_VOLATILE); - gpr.Lock(a); - fpr.Lock(s); - if (update) - gpr.LoadToX64(a, true, update); - MOV(32, R(EDX), gpr.R(a)); - if (offset) - ADD(32,R(EDX),Imm32((u32)offset)); - MOVAPS(XMM0, fpr.R(s)); - MOVDDUP(XMM1, M((void*)&m_quantizeTableD[stScale])); - MULPD(XMM0, R(XMM1)); - CVTPD2DQ(XMM0, R(XMM0)); - PACKSSDW(XMM0, R(XMM0)); - PACKUSWB(XMM0, R(XMM0)); - MOVAPS(M(&temp64), XMM0); - MOV(16, R(ECX), M(&temp64)); -#ifdef _M_X64 - MOV(16, MComplex(RBX, RDX, SCALE_1, 0), R(ECX)); -#else - BSWAP(32, ECX); - SHR(32, R(ECX), Imm8(16)); - CALL(&Memory::Write_U16); -#endif - if (update) - MOV(32, gpr.R(a), R(EDX)); - gpr.UnlockAll(); - fpr.UnlockAll(); - } - else if (stType == QUANTIZE_S16) - { - gpr.Lock(a); - fpr.Lock(s); - if (update) - gpr.LoadToX64(a, true, update); - MOV(32, R(EDX), gpr.R(a)); - if (offset) - ADD(32,R(EDX),Imm32((u32)offset)); - MOVAPS(XMM0, fpr.R(s)); - MOVDDUP(XMM1, 
M((void*)&m_quantizeTableD[stScale])); - MULPD(XMM0, R(XMM1)); - SHUFPD(XMM0, R(XMM0), 1); - CVTPD2DQ(XMM0, R(XMM0)); - PACKSSDW(XMM0, R(XMM0)); - MOVD_xmm(M(&temp64), XMM0); - MOV(32, R(ECX), M(&temp64)); -#ifdef _M_X64 - BSWAP(32, ECX); - MOV(32, MComplex(RBX, RDX, SCALE_1, 0), R(ECX)); -#else - BSWAP(32, ECX); - CALL(&Memory::Write_U32); -#endif - if (update) - MOV(32, gpr.R(a), R(EDX)); - gpr.UnlockAll(); - fpr.UnlockAll(); - } - else { - // Dodger uses this. - PanicAlert("st %i:%i", stType, inst.W); - Default(inst); - } - } - - - void psq_l(UGeckoInstruction inst) - { - BIT32OLD; - OLD; - const UGQR gqr(rSPR(SPR_GQR0 + inst.I)); - const EQuantizeType ldType = static_cast(gqr.LD_TYPE); - int ldScale = gqr.LD_SCALE; - if (!inst.RA || inst.W) - { - // 0 1 during load - //PanicAlert("ld:%i %i", ldType, (int)inst.W); - Default(inst); - return; - } - bool update = inst.OPCD == 57; - int offset = inst.SIMM_12; - //INT3(); - switch (ldType) { -#ifdef _M_X64 - case QUANTIZE_FLOAT: - { - gpr.LoadToX64(inst.RA); - MOV(64, R(RAX), MComplex(RBX, gpr.R(inst.RA).GetSimpleReg(), 1, offset)); - BSWAP(64, RAX); - MOV(64, M(&psTemp[0]),R(RAX)); - fpr.LoadToX64(inst.RS, false); - X64Reg r = fpr.R(inst.RS).GetSimpleReg(); - CVTPS2PD(r, M(&psTemp[0])); - SHUFPD(r, R(r),1); - if (update) - ADD(32, gpr.R(inst.RA), Imm32(offset)); - break; - } - - case QUANTIZE_U8: - { - gpr.LoadToX64(inst.RA); - XOR(32, R(EAX), R(EAX)); - MOV(16, R(EAX), MComplex(RBX, gpr.R(inst.RA).GetSimpleReg(), 1, offset)); - MOV(32, M(&temp64), R(EAX)); - MOVD_xmm(XMM0, M(&temp64)); - // SSE4 optimization opportunity here. 
- PXOR(XMM1, R(XMM1)); - PUNPCKLBW(XMM0, R(XMM1)); - PUNPCKLWD(XMM0, R(XMM1)); - CVTDQ2PD(XMM0, R(XMM0)); - fpr.LoadToX64(inst.RS, false); - X64Reg r = fpr.R(inst.RS).GetSimpleReg(); - MOVDDUP(r, M((void *)&m_dequantizeTableD[ldScale])); - MULPD(r, R(XMM0)); - if (update) - ADD(32, gpr.R(inst.RA), Imm32(offset)); - } - break; - case QUANTIZE_S16: - { - gpr.LoadToX64(inst.RA); - MOV(32, R(EAX), MComplex(RBX, gpr.R(inst.RA).GetSimpleReg(), 1, offset)); - BSWAP(32, EAX); - MOV(32, M(&temp64), R(EAX)); - //INT3(); - fpr.LoadToX64(inst.RS, false); - X64Reg r = fpr.R(inst.RS).GetSimpleReg(); - MOVD_xmm(XMM0, M(&temp64)); - PUNPCKLWD(XMM0, R(XMM0)); // unpack to higher word in each dword.. - PSRAD(XMM0, 16); // then use this signed shift to sign extend. clever eh? :P - CVTDQ2PD(XMM0, R(XMM0)); - MOVDDUP(r, M((void*)&m_dequantizeTableD[ldScale])); - MULPD(r, R(XMM0)); - SHUFPD(r, R(r), 1); - if (update) - ADD(32, gpr.R(inst.RA), Imm32(offset)); - } - break; -#endif - default: - // 4 0 - PanicAlert("ld:%i %i", ldType, (int)inst.W); - Default(inst); - return; - } - - //u32 EA = (m_GPR[_inst.RA] + _inst.SIMM_12) : _inst.SIMM_12; - } - + // Zero cache line. void dcbz(UGeckoInstruction inst) { #ifdef _M_IX86 diff --git a/Source/Core/Core/Src/PowerPC/Jit64/Jit_LoadStorePaired.cpp b/Source/Core/Core/Src/PowerPC/Jit64/Jit_LoadStorePaired.cpp new file mode 100644 index 0000000000..34d0a4c225 --- /dev/null +++ b/Source/Core/Core/Src/PowerPC/Jit64/Jit_LoadStorePaired.cpp @@ -0,0 +1,312 @@ +// Copyright (C) 2003-2008 Dolphin Project. + +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, version 2.0. + +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License 2.0 for more details. 
+ +// A copy of the GPL 2.0 should have been included with the program. +// If not, see http://www.gnu.org/licenses/ + +// Official SVN repository and contact information can be found at +// http://code.google.com/p/dolphin-emu/ + +// TODO(ector): Tons of pshufb optimization of the loads/stores, for SSSE3+, possibly SSE4, only. +// Should give a very noticeable speed boost to paired single heavy code. + +#include "../PowerPC.h" +#include "../../Core.h" +#include "../../HW/GPFifo.h" +#include "../../HW/CommandProcessor.h" +#include "../../HW/PixelEngine.h" +#include "../../HW/Memmap.h" +#include "../PPCTables.h" +#include "x64Emitter.h" + +#include "Jit.h" +#include "JitCache.h" +#include "JitAsm.h" +#include "JitRegCache.h" + +#define OLD +//#define OLD Default(inst); return; + +#ifdef _M_IX86 +#define BIT32OLD Default(inst); return; +#else +#define BIT32OLD ; +#endif + +namespace Jit64 { + +static double GC_ALIGNED16(psTemp[2]) = {1.0, 1.0}; +static u64 GC_ALIGNED16(temp64); +static u32 GC_ALIGNED16(temp32); + + +// TODO(ector): Improve 64-bit version +void WriteDual32(u64 value, u32 address) +{ + Memory::Write_U32((u32)(value>>32), address); + Memory::Write_U32((u32)value, address+4); +} + +const double m_quantizeTableD[] = +{ + (1 << 0), (1 << 1), (1 << 2), (1 << 3), + (1 << 4), (1 << 5), (1 << 6), (1 << 7), + (1 << 8), (1 << 9), (1 << 10), (1 << 11), + (1 << 12), (1 << 13), (1 << 14), (1 << 15), + (1 << 16), (1 << 17), (1 << 18), (1 << 19), + (1 << 20), (1 << 21), (1 << 22), (1 << 23), + (1 << 24), (1 << 25), (1 << 26), (1 << 27), + (1 << 28), (1 << 29), (1 << 30), (1 << 31), + 1.0 / (1ULL << 32), 1.0 / (1 << 31), 1.0 / (1 << 30), 1.0 / (1 << 29), + 1.0 / (1 << 28), 1.0 / (1 << 27), 1.0 / (1 << 26), 1.0 / (1 << 25), + 1.0 / (1 << 24), 1.0 / (1 << 23), 1.0 / (1 << 22), 1.0 / (1 << 21), + 1.0 / (1 << 20), 1.0 / (1 << 19), 1.0 / (1 << 18), 1.0 / (1 << 17), + 1.0 / (1 << 16), 1.0 / (1 << 15), 1.0 / (1 << 14), 1.0 / (1 << 13), + 1.0 / (1 << 12), 1.0 / (1 << 11), 1.0 
/ (1 << 10), 1.0 / (1 << 9), + 1.0 / (1 << 8), 1.0 / (1 << 7), 1.0 / (1 << 6), 1.0 / (1 << 5), + 1.0 / (1 << 4), 1.0 / (1 << 3), 1.0 / (1 << 2), 1.0 / (1 << 1), +}; + +const double m_dequantizeTableD[] = +{ + 1.0 / (1 << 0), 1.0 / (1 << 1), 1.0 / (1 << 2), 1.0 / (1 << 3), + 1.0 / (1 << 4), 1.0 / (1 << 5), 1.0 / (1 << 6), 1.0 / (1 << 7), + 1.0 / (1 << 8), 1.0 / (1 << 9), 1.0 / (1 << 10), 1.0 / (1 << 11), + 1.0 / (1 << 12), 1.0 / (1 << 13), 1.0 / (1 << 14), 1.0 / (1 << 15), + 1.0 / (1 << 16), 1.0 / (1 << 17), 1.0 / (1 << 18), 1.0 / (1 << 19), + 1.0 / (1 << 20), 1.0 / (1 << 21), 1.0 / (1 << 22), 1.0 / (1 << 23), + 1.0 / (1 << 24), 1.0 / (1 << 25), 1.0 / (1 << 26), 1.0 / (1 << 27), + 1.0 / (1 << 28), 1.0 / (1 << 29), 1.0 / (1 << 30), 1.0 / (1 << 31), + (1ULL << 32), (1 << 31), (1 << 30), (1 << 29), + (1 << 28), (1 << 27), (1 << 26), (1 << 25), + (1 << 24), (1 << 23), (1 << 22), (1 << 21), + (1 << 20), (1 << 19), (1 << 18), (1 << 17), + (1 << 16), (1 << 15), (1 << 14), (1 << 13), + (1 << 12), (1 << 11), (1 << 10), (1 << 9), + (1 << 8), (1 << 7), (1 << 6), (1 << 5), + (1 << 4), (1 << 3), (1 << 2), (1 << 1), +}; + +u32 temp; +void psq_st(UGeckoInstruction inst) +{ + BIT32OLD; + OLD; + if (!Core::GetStartupParameter().bOptimizeQuantizers) + { + Default(inst); + return; + } + const UGQR gqr(rSPR(SPR_GQR0 + inst.I)); + const EQuantizeType stType = static_cast(gqr.ST_TYPE); + int stScale = gqr.ST_SCALE; + bool update = inst.OPCD == 61; + if (!inst.RA || inst.W) + { + // PanicAlert(inst.RA ? 
"W" : "inst"); + Default(inst); + return; + } + + int offset = inst.SIMM_12; + int a = inst.RA; + int s = inst.RS; // Fp numbers + + if (stType == QUANTIZE_FLOAT) + { + gpr.Flush(FLUSH_VOLATILE); + gpr.Lock(a); + fpr.Lock(s); + if (update) + gpr.LoadToX64(a, true, true); + MOV(32, R(EDX), gpr.R(a)); + if (offset) + ADD(32, R(EDX), Imm32((u32)offset)); + TEST(32, R(EDX), Imm32(0x0C000000)); + if (update && offset) + MOV(32, gpr.R(a), R(EDX)); + CVTPD2PS(XMM0, fpr.R(s)); + SHUFPS(XMM0, R(XMM0), 1); + MOVAPS(M(&temp64), XMM0); + MOV(64, R(ECX), M(&temp64)); + FixupBranch argh = J_CC(CC_NZ); + BSWAP(64, ECX); + MOV(64, MComplex(RBX, EDX, SCALE_1, 0), R(ECX)); + FixupBranch arg2 = J(); + SetJumpTarget(argh); + CALL((void *)&WriteDual32); + SetJumpTarget(arg2); + if (update) + MOV(32, gpr.R(a), R(EDX)); + gpr.UnlockAll(); + fpr.UnlockAll(); + } + else if (stType == QUANTIZE_U8) + { + gpr.Flush(FLUSH_VOLATILE); + gpr.Lock(a); + fpr.Lock(s); + if (update) + gpr.LoadToX64(a, true, update); + MOV(32, R(EDX), gpr.R(a)); + if (offset) + ADD(32,R(EDX),Imm32((u32)offset)); + MOVAPS(XMM0, fpr.R(s)); + MOVDDUP(XMM1, M((void*)&m_quantizeTableD[stScale])); + MULPD(XMM0, R(XMM1)); + CVTPD2DQ(XMM0, R(XMM0)); + PACKSSDW(XMM0, R(XMM0)); + PACKUSWB(XMM0, R(XMM0)); + MOVAPS(M(&temp64), XMM0); + MOV(16, R(ECX), M(&temp64)); +#ifdef _M_X64 + MOV(16, MComplex(RBX, RDX, SCALE_1, 0), R(ECX)); +#else + BSWAP(32, ECX); + SHR(32, R(ECX), Imm8(16)); + CALL(&Memory::Write_U16); +#endif + if (update) + MOV(32, gpr.R(a), R(EDX)); + gpr.UnlockAll(); + fpr.UnlockAll(); + } + else if (stType == QUANTIZE_S16) + { + gpr.Lock(a); + fpr.Lock(s); + if (update) + gpr.LoadToX64(a, true, update); + MOV(32, R(EDX), gpr.R(a)); + if (offset) + ADD(32,R(EDX),Imm32((u32)offset)); + MOVAPS(XMM0, fpr.R(s)); + MOVDDUP(XMM1, M((void*)&m_quantizeTableD[stScale])); + MULPD(XMM0, R(XMM1)); + SHUFPD(XMM0, R(XMM0), 1); + CVTPD2DQ(XMM0, R(XMM0)); + PACKSSDW(XMM0, R(XMM0)); + MOVD_xmm(M(&temp64), XMM0); + MOV(32, R(ECX), 
M(&temp64)); +#ifdef _M_X64 + BSWAP(32, ECX); + MOV(32, MComplex(RBX, RDX, SCALE_1, 0), R(ECX)); +#else + BSWAP(32, ECX); + CALL(&Memory::Write_U32); +#endif + if (update) + MOV(32, gpr.R(a), R(EDX)); + gpr.UnlockAll(); + fpr.UnlockAll(); + } + else { + // Dodger uses this. + PanicAlert("st %i:%i", stType, inst.W); + Default(inst); + } +} + + +void psq_l(UGeckoInstruction inst) +{ + BIT32OLD; + OLD; + if (!Core::GetStartupParameter().bOptimizeQuantizers) + { + Default(inst); + return; + } + const UGQR gqr(rSPR(SPR_GQR0 + inst.I)); + const EQuantizeType ldType = static_cast(gqr.LD_TYPE); + int ldScale = gqr.LD_SCALE; + if (!inst.RA || inst.W) + { + // 0 1 during load + //PanicAlert("ld:%i %i", ldType, (int)inst.W); + Default(inst); + return; + } + bool update = inst.OPCD == 57; + int offset = inst.SIMM_12; + //INT3(); + switch (ldType) { +#ifdef _M_X64 + case QUANTIZE_FLOAT: + { + gpr.LoadToX64(inst.RA); + MOV(64, R(RAX), MComplex(RBX, gpr.R(inst.RA).GetSimpleReg(), 1, offset)); + BSWAP(64, RAX); + MOV(64, M(&psTemp[0]),R(RAX)); + fpr.LoadToX64(inst.RS, false); + X64Reg r = fpr.R(inst.RS).GetSimpleReg(); + CVTPS2PD(r, M(&psTemp[0])); + SHUFPD(r, R(r),1); + if (update) + ADD(32, gpr.R(inst.RA), Imm32(offset)); + break; + } + + case QUANTIZE_U8: + { + gpr.LoadToX64(inst.RA); + XOR(32, R(EAX), R(EAX)); + MOV(16, R(EAX), MComplex(RBX, gpr.R(inst.RA).GetSimpleReg(), 1, offset)); + MOV(32, M(&temp64), R(EAX)); + MOVD_xmm(XMM0, M(&temp64)); + // SSE4 optimization opportunity here. 
+ PXOR(XMM1, R(XMM1)); + PUNPCKLBW(XMM0, R(XMM1)); + PUNPCKLWD(XMM0, R(XMM1)); + CVTDQ2PD(XMM0, R(XMM0)); + fpr.LoadToX64(inst.RS, false); + X64Reg r = fpr.R(inst.RS).GetSimpleReg(); + MOVDDUP(r, M((void *)&m_dequantizeTableD[ldScale])); + MULPD(r, R(XMM0)); + if (update) + ADD(32, gpr.R(inst.RA), Imm32(offset)); + } + break; + + case QUANTIZE_S16: + { + gpr.LoadToX64(inst.RA); + MOV(32, R(EAX), MComplex(RBX, gpr.R(inst.RA).GetSimpleReg(), 1, offset)); + BSWAP(32, EAX); + MOV(32, M(&temp64), R(EAX)); + //INT3(); + fpr.LoadToX64(inst.RS, false); + X64Reg r = fpr.R(inst.RS).GetSimpleReg(); + MOVD_xmm(XMM0, M(&temp64)); + PUNPCKLWD(XMM0, R(XMM0)); // unpack to higher word in each dword.. + PSRAD(XMM0, 16); // then use this signed shift to sign extend. clever eh? :P + CVTDQ2PD(XMM0, R(XMM0)); + MOVDDUP(r, M((void*)&m_dequantizeTableD[ldScale])); + MULPD(r, R(XMM0)); + SHUFPD(r, R(r), 1); + if (update) + ADD(32, gpr.R(inst.RA), Imm32(offset)); + } + break; +#endif + default: + // 4 0 + // 6 0 //power tennis + // 5 0 + PanicAlert("ld:%i %i", ldType, (int)inst.W); + Default(inst); + return; + } + + //u32 EA = (m_GPR[_inst.RA] + _inst.SIMM_12) : _inst.SIMM_12; +} + +} // namespace \ No newline at end of file diff --git a/Source/Core/DebuggerWX/DebuggerWX.vcproj b/Source/Core/DebuggerWX/DebuggerWX.vcproj index 4502ea779a..8f8f38e1d0 100644 --- a/Source/Core/DebuggerWX/DebuggerWX.vcproj +++ b/Source/Core/DebuggerWX/DebuggerWX.vcproj @@ -24,6 +24,7 @@ IntermediateDirectory="$(PlatformName)\$(ConfigurationName)" ConfigurationType="4" CharacterSet="2" + WholeProgramOptimization="0" >