From 4a9310755f55648bffdf84df9f6d38661b06c968 Mon Sep 17 00:00:00 2001 From: Nekotekina Date: Sat, 5 Apr 2014 20:30:08 +0400 Subject: [PATCH 01/14] Working on simple SPU JIT No, it doesn't work. --- asmjit.vcxproj | 169 +++ asmjit.vcxproj.filters | 29 + asmjit.vcxproj.user | 4 + rpcs3.sln | 12 +- rpcs3/Emu/Cell/SPURecompiler.h | 1752 ++++++++++++++++++++++++++ rpcs3/Emu/Cell/SPURecompilerCore.cpp | 132 ++ rpcs3/rpcs3.vcxproj | 17 +- rpcs3/rpcs3.vcxproj.filters | 3 + 8 files changed, 2109 insertions(+), 9 deletions(-) create mode 100644 asmjit.vcxproj create mode 100644 asmjit.vcxproj.filters create mode 100644 asmjit.vcxproj.user create mode 100644 rpcs3/Emu/Cell/SPURecompiler.h create mode 100644 rpcs3/Emu/Cell/SPURecompilerCore.cpp diff --git a/asmjit.vcxproj b/asmjit.vcxproj new file mode 100644 index 0000000000..80ffab87f4 --- /dev/null +++ b/asmjit.vcxproj @@ -0,0 +1,169 @@ + + + + + Debug + Win32 + + + Debug + x64 + + + Release + Win32 + + + Release + x64 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + {AC40FF01-426E-4838-A317-66354CEFAE88} + asmjit + + + + StaticLibrary + true + v120 + Unicode + + + StaticLibrary + true + v120 + Unicode + + + StaticLibrary + false + v120 + true + Unicode + + + StaticLibrary + false + v120 + true + Unicode + + + + + + + + + + + + + + + + + + + .\libs\$(Configuration)\ + + + + + .\libs\$(Configuration)\ + + + + + .\libs\$(Configuration)\ + + + + + .\libs\$(Configuration)\ + + + + + + Level3 + Disabled + true + ASMJIT_STATIC;_MBCS;%(PreprocessorDefinitions) + + + true + + + + + Level3 + Disabled + false + ASMJIT_STATIC;_MBCS;%(PreprocessorDefinitions) + + + true + + + + + Level3 + MaxSpeed + true + true + true + ASMJIT_STATIC;_UNICODE;UNICODE;%(PreprocessorDefinitions) + + + true + true + true + + + + + Level3 + MaxSpeed + true + true + true + ASMJIT_STATIC;_UNICODE;UNICODE;%(PreprocessorDefinitions) + + + true + true + true + + + + + + \ No newline at end of file diff --git a/asmjit.vcxproj.filters 
b/asmjit.vcxproj.filters new file mode 100644 index 0000000000..bddd91cf68 --- /dev/null +++ b/asmjit.vcxproj.filters @@ -0,0 +1,29 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/asmjit.vcxproj.user b/asmjit.vcxproj.user new file mode 100644 index 0000000000..ef5ff2a1fa --- /dev/null +++ b/asmjit.vcxproj.user @@ -0,0 +1,4 @@ + + + + \ No newline at end of file diff --git a/rpcs3.sln b/rpcs3.sln index e8513ead3e..ea7660aef6 100644 --- a/rpcs3.sln +++ b/rpcs3.sln @@ -1,6 +1,6 @@ Microsoft Visual Studio Solution File, Format Version 12.00 # Visual Studio 2013 -VisualStudioVersion = 12.0.21005.1 +VisualStudioVersion = 12.0.30110.0 MinimumVisualStudioVersion = 10.0.40219.1 Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "rpcs3", "rpcs3\rpcs3.vcxproj", "{70CD65B0-91D6-4FAE-9A7B-4AF55D0D1B12}" ProjectSection(ProjectDependencies) = postProject @@ -80,6 +80,8 @@ Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "stc", "wxWidgets\build\msw\ EndProject Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "wxscintilla", "wxWidgets\build\msw\wx_vc10_wxscintilla.vcxproj", "{74827EBD-93DC-5110-BA95-3F2AB029B6B0}" EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "asmjit", "asmjit.vcxproj", "{AC40FF01-426E-4838-A317-66354CEFAE88}" +EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|Win32 = Debug|Win32 @@ -272,6 +274,14 @@ Global {74827EBD-93DC-5110-BA95-3F2AB029B6B0}.Release|Win32.Build.0 = Release|Win32 {74827EBD-93DC-5110-BA95-3F2AB029B6B0}.Release|x64.ActiveCfg = Release|x64 {74827EBD-93DC-5110-BA95-3F2AB029B6B0}.Release|x64.Build.0 = Release|x64 + {AC40FF01-426E-4838-A317-66354CEFAE88}.Debug|Win32.ActiveCfg = Debug|Win32 + {AC40FF01-426E-4838-A317-66354CEFAE88}.Debug|Win32.Build.0 = Debug|Win32 + {AC40FF01-426E-4838-A317-66354CEFAE88}.Debug|x64.ActiveCfg = Debug|x64 + {AC40FF01-426E-4838-A317-66354CEFAE88}.Debug|x64.Build.0 = Debug|x64 + 
{AC40FF01-426E-4838-A317-66354CEFAE88}.Release|Win32.ActiveCfg = Release|Win32 + {AC40FF01-426E-4838-A317-66354CEFAE88}.Release|Win32.Build.0 = Release|Win32 + {AC40FF01-426E-4838-A317-66354CEFAE88}.Release|x64.ActiveCfg = Release|x64 + {AC40FF01-426E-4838-A317-66354CEFAE88}.Release|x64.Build.0 = Release|x64 EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE diff --git a/rpcs3/Emu/Cell/SPURecompiler.h b/rpcs3/Emu/Cell/SPURecompiler.h new file mode 100644 index 0000000000..f78cdf8339 --- /dev/null +++ b/rpcs3/Emu/Cell/SPURecompiler.h @@ -0,0 +1,1752 @@ +#pragma once + +#include "Emu/Cell/SPUOpcodes.h" +#include "Emu/Memory/Memory.h" +#include "Emu/Cell/SPUThread.h" +#include "Emu/SysCalls/SysCalls.h" + +#define ASMJIT_STATIC + +#include "asmjit.h" + +using namespace asmjit; +using namespace asmjit::host; + +#define UNIMPLEMENTED() UNK(__FUNCTION__) + +struct SPUImmTable +{ + __m128i s19_to_s32[1 << 19]; /* one entry per 19-bit pattern, so bit 18 (the sign bit) is reachable */ + __m128i fsmbi_mask[1 << 16]; + + SPUImmTable() + { + // signed numbers table: entry i holds i sign-extended from 19 to 32 bits, copied into all four 32-bit lanes + for (u32 i = 0; i < sizeof(s19_to_s32) / sizeof(__m128i); i++) + { + const u32 v = (i & 0x40000) ? (i | 0xfff80000) : i; + s19_to_s32[i].m128i_i32[0] = v; + s19_to_s32[i].m128i_i32[1] = v; + s19_to_s32[i].m128i_i32[2] = v; + s19_to_s32[i].m128i_i32[3] = v; + } + // FSMBI mask table + for (u32 i = 0; i < sizeof(fsmbi_mask) / sizeof(__m128i); i++) + { + for (u32 j = 0; j < 16; j++) + { + fsmbi_mask[i].m128i_i8[j] = ((i >> j) & 0x1) ? 
0xff : 0; + } + } + } +}; + +class SPURecompiler; + +class SPURecompilerCore : public CPUDecoder +{ + SPURecompiler* m_enc; + SPUInterpreter* m_inter; + SPUThread& CPU; + +public: + JitRuntime runtime; + Compiler compiler; + + struct SPURecEntry + { + u16 host; // absolute position of first instruction of current block + u16 count; // count of instructions compiled from current point (and to be checked) + u32 valid; // copy of valid opcode for validation + void* pointer; // pointer to executable memory object + }; + + SPURecEntry entry[0x10000]; + + SPURecompilerCore(SPUThread& cpu); + + ~SPURecompilerCore(); + + void Compile(u16 pos); + + virtual void Decode(const u32 code); + + virtual u8 DecodeMemory(const u64 address); +}; + +#define cpu_xmm(x) oword_ptr(*cpu_var, offsetof(SPUThread, x)) +#define cpu_qword(x) qword_ptr(*cpu_var, offsetof(SPUThread, x)) +#define cpu_dword(x,...) dword_ptr(*cpu_var, __VA_ARGS__, offsetof(SPUThread, x)) +#define cpu_word(x) word_ptr(*cpu_var, offsetof(SPUThread, x)) +#define cpu_byte(x) byte_ptr(*cpu_var, offsetof(SPUThread, x)) + +#define imm_xmm(x) oword_ptr(*imm_var, offsetof(SPUImmTable, x)) + +class SPURecompiler : public SPUOpcodes +{ +private: + SPUThread& CPU; + SPURecompilerCore& rec; + Compiler& c; + +public: + bool do_finalize; + GpVar* cpu_var; + GpVar* ls_var; + GpVar* imm_var; + + SPURecompiler(SPUThread& cpu, SPURecompilerCore& rec) : CPU(cpu), rec(rec), c(rec.compiler) + { + } + +private: + //0 - 10 + void STOP(u32 code) + { + UNIMPLEMENTED(); + } + void LNOP() + { + UNIMPLEMENTED(); + } + void SYNC(u32 Cbit) + { + UNIMPLEMENTED(); + } + void DSYNC() + { + UNIMPLEMENTED(); + } + void MFSPR(u32 rt, u32 sa) + { + UNIMPLEMENTED(); + //If register is a dummy register (register labeled 0x0) + if(sa == 0x0) + { + CPU.GPR[rt]._u128.hi = 0x0; + CPU.GPR[rt]._u128.lo = 0x0; + } + else + { + CPU.GPR[rt]._u128.hi = CPU.SPR[sa]._u128.hi; + CPU.GPR[rt]._u128.lo = CPU.SPR[sa]._u128.lo; + } + } + void RDCH(u32 rt, u32 ra) + { + 
UNIMPLEMENTED(); + CPU.ReadChannel(CPU.GPR[rt], ra); + } + void RCHCNT(u32 rt, u32 ra) + { + UNIMPLEMENTED(); + CPU.GPR[rt].Reset(); + CPU.GPR[rt]._u32[3] = CPU.GetChannelCount(ra); + } + void SF(u32 rt, u32 ra, u32 rb) + { + XmmVar v0(c); + if (ra == rb) + { + // zero: rb - ra is 0 in every word when both operands are the same register + c.xorps(v0, v0); + c.movaps(cpu_xmm(GPR[rt]), v0); + } + else { + // sub from: rt = rb - ra for each 32-bit word (emitted only when the zero shortcut does not apply) + c.movdqa(v0, cpu_xmm(GPR[rb])); + c.psubd(v0, cpu_xmm(GPR[ra])); + c.movdqa(cpu_xmm(GPR[rt]), v0); + } + } + void OR(u32 rt, u32 ra, u32 rb) + { + UNIMPLEMENTED(); + CPU.GPR[rt]._u32[0] = CPU.GPR[ra]._u32[0] | CPU.GPR[rb]._u32[0]; + CPU.GPR[rt]._u32[1] = CPU.GPR[ra]._u32[1] | CPU.GPR[rb]._u32[1]; + CPU.GPR[rt]._u32[2] = CPU.GPR[ra]._u32[2] | CPU.GPR[rb]._u32[2]; + CPU.GPR[rt]._u32[3] = CPU.GPR[ra]._u32[3] | CPU.GPR[rb]._u32[3]; + } + void BG(u32 rt, u32 ra, u32 rb) + { + UNIMPLEMENTED(); + CPU.GPR[rt]._u32[0] = CPU.GPR[ra]._u32[0] > CPU.GPR[rb]._u32[0] ? 0 : 1; + CPU.GPR[rt]._u32[1] = CPU.GPR[ra]._u32[1] > CPU.GPR[rb]._u32[1] ? 0 : 1; + CPU.GPR[rt]._u32[2] = CPU.GPR[ra]._u32[2] > CPU.GPR[rb]._u32[2] ? 0 : 1; + CPU.GPR[rt]._u32[3] = CPU.GPR[ra]._u32[3] > CPU.GPR[rb]._u32[3] ? 0 : 1; + } + void SFH(u32 rt, u32 ra, u32 rb) + { + UNIMPLEMENTED(); + for (int h = 0; h < 8; h++) + CPU.GPR[rt]._u16[h] = CPU.GPR[rb]._u16[h] - CPU.GPR[ra]._u16[h]; + } + void NOR(u32 rt, u32 ra, u32 rb) + { + UNIMPLEMENTED(); + CPU.GPR[rt]._u32[0] = ~(CPU.GPR[ra]._u32[0] | CPU.GPR[rb]._u32[0]); + CPU.GPR[rt]._u32[1] = ~(CPU.GPR[ra]._u32[1] | CPU.GPR[rb]._u32[1]); + CPU.GPR[rt]._u32[2] = ~(CPU.GPR[ra]._u32[2] | CPU.GPR[rb]._u32[2]); + CPU.GPR[rt]._u32[3] = ~(CPU.GPR[ra]._u32[3] | CPU.GPR[rb]._u32[3]); + } + void ABSDB(u32 rt, u32 ra, u32 rb) + { + UNIMPLEMENTED(); + for (int b = 0; b < 16; b++) + CPU.GPR[rt]._u8[b] = CPU.GPR[rb]._u8[b] > CPU.GPR[ra]._u8[b] ? 
CPU.GPR[rb]._u8[b] - CPU.GPR[ra]._u8[b] : CPU.GPR[ra]._u8[b] - CPU.GPR[rb]._u8[b]; + } + void ROT(u32 rt, u32 ra, u32 rb) + { + UNIMPLEMENTED(); + CPU.GPR[rt]._u32[0] = (CPU.GPR[ra]._u32[0] << (CPU.GPR[rb]._u32[0] & 0x1f)) | (CPU.GPR[ra]._u32[0] >> (32 - (CPU.GPR[rb]._u32[0] & 0x1f))); + CPU.GPR[rt]._u32[1] = (CPU.GPR[ra]._u32[1] << (CPU.GPR[rb]._u32[1] & 0x1f)) | (CPU.GPR[ra]._u32[1] >> (32 - (CPU.GPR[rb]._u32[1] & 0x1f))); + CPU.GPR[rt]._u32[2] = (CPU.GPR[ra]._u32[2] << (CPU.GPR[rb]._u32[2] & 0x1f)) | (CPU.GPR[ra]._u32[2] >> (32 - (CPU.GPR[rb]._u32[2] & 0x1f))); + CPU.GPR[rt]._u32[3] = (CPU.GPR[ra]._u32[3] << (CPU.GPR[rb]._u32[3] & 0x1f)) | (CPU.GPR[ra]._u32[3] >> (32 - (CPU.GPR[rb]._u32[3] & 0x1f))); + } + void ROTM(u32 rt, u32 ra, u32 rb) + { + UNIMPLEMENTED(); + CPU.GPR[rt]._u32[0] = ((0 - CPU.GPR[rb]._u32[0]) % 64) < 32 ? CPU.GPR[ra]._u32[0] >> ((0 - CPU.GPR[rb]._u32[0]) % 64) : 0; + CPU.GPR[rt]._u32[1] = ((0 - CPU.GPR[rb]._u32[1]) % 64) < 32 ? CPU.GPR[ra]._u32[1] >> ((0 - CPU.GPR[rb]._u32[1]) % 64) : 0; + CPU.GPR[rt]._u32[2] = ((0 - CPU.GPR[rb]._u32[2]) % 64) < 32 ? CPU.GPR[ra]._u32[2] >> ((0 - CPU.GPR[rb]._u32[2]) % 64) : 0; + CPU.GPR[rt]._u32[3] = ((0 - CPU.GPR[rb]._u32[3]) % 64) < 32 ? CPU.GPR[ra]._u32[3] >> ((0 - CPU.GPR[rb]._u32[3]) % 64) : 0; + } + void ROTMA(u32 rt, u32 ra, u32 rb) + { + UNIMPLEMENTED(); + CPU.GPR[rt]._i32[0] = ((0 - CPU.GPR[rb]._i32[0]) % 64) < 32 ? CPU.GPR[ra]._i32[0] >> ((0 - CPU.GPR[rb]._i32[0]) % 64) : CPU.GPR[ra]._i32[0] >> 31; + CPU.GPR[rt]._i32[1] = ((0 - CPU.GPR[rb]._i32[1]) % 64) < 32 ? CPU.GPR[ra]._i32[1] >> ((0 - CPU.GPR[rb]._i32[1]) % 64) : CPU.GPR[ra]._i32[1] >> 31; + CPU.GPR[rt]._i32[2] = ((0 - CPU.GPR[rb]._i32[2]) % 64) < 32 ? CPU.GPR[ra]._i32[2] >> ((0 - CPU.GPR[rb]._i32[2]) % 64) : CPU.GPR[ra]._i32[2] >> 31; + CPU.GPR[rt]._i32[3] = ((0 - CPU.GPR[rb]._i32[3]) % 64) < 32 ? 
CPU.GPR[ra]._i32[3] >> ((0 - CPU.GPR[rb]._i32[3]) % 64) : CPU.GPR[ra]._i32[3] >> 31; + } + void SHL(u32 rt, u32 ra, u32 rb) + { + UNIMPLEMENTED(); + CPU.GPR[rt]._u32[0] = (CPU.GPR[rb]._u32[0] & 0x3f) > 31 ? 0 : CPU.GPR[ra]._u32[0] << (CPU.GPR[rb]._u32[0] & 0x3f); + CPU.GPR[rt]._u32[1] = (CPU.GPR[rb]._u32[1] & 0x3f) > 31 ? 0 : CPU.GPR[ra]._u32[1] << (CPU.GPR[rb]._u32[1] & 0x3f); + CPU.GPR[rt]._u32[2] = (CPU.GPR[rb]._u32[2] & 0x3f) > 31 ? 0 : CPU.GPR[ra]._u32[2] << (CPU.GPR[rb]._u32[2] & 0x3f); + CPU.GPR[rt]._u32[3] = (CPU.GPR[rb]._u32[3] & 0x3f) > 31 ? 0 : CPU.GPR[ra]._u32[3] << (CPU.GPR[rb]._u32[3] & 0x3f); + } + void ROTH(u32 rt, u32 ra, u32 rb) + { + UNIMPLEMENTED(); + for (int h = 0; h < 8; h++) + CPU.GPR[rt]._u16[h] = (CPU.GPR[ra]._u16[h] << (CPU.GPR[rb]._u16[h] & 0xf)) | (CPU.GPR[ra]._u16[h] >> (16 - (CPU.GPR[rb]._u16[h] & 0xf))); + } + void ROTHM(u32 rt, u32 ra, u32 rb) + { + UNIMPLEMENTED(); + for (int h = 0; h < 8; h++) + CPU.GPR[rt]._u16[h] = ((0 - CPU.GPR[rb]._u16[h]) % 32) < 16 ? CPU.GPR[ra]._u16[h] >> ((0 - CPU.GPR[rb]._u16[h]) % 32) : 0; + } + void ROTMAH(u32 rt, u32 ra, u32 rb) + { + UNIMPLEMENTED(); + for (int h = 0; h < 8; h++) + CPU.GPR[rt]._i16[h] = ((0 - CPU.GPR[rb]._i16[h]) % 32) < 16 ? CPU.GPR[ra]._i16[h] >> ((0 - CPU.GPR[rb]._i16[h]) % 32) : CPU.GPR[ra]._i16[h] >> 15; + } + void SHLH(u32 rt, u32 ra, u32 rb) + { + UNIMPLEMENTED(); + for (int h = 0; h < 8; h++) + CPU.GPR[rt]._u16[h] = (CPU.GPR[rb]._u16[h] & 0x1f) > 15 ? 
0 : CPU.GPR[ra]._u16[h] << (CPU.GPR[rb]._u16[h] & 0x1f); + } + void ROTI(u32 rt, u32 ra, s32 i7) + { + UNIMPLEMENTED(); + const int nRot = i7 & 0x1f; + CPU.GPR[rt]._u32[0] = (CPU.GPR[ra]._u32[0] << nRot) | (CPU.GPR[ra]._u32[0] >> (32 - nRot)); + CPU.GPR[rt]._u32[1] = (CPU.GPR[ra]._u32[1] << nRot) | (CPU.GPR[ra]._u32[1] >> (32 - nRot)); + CPU.GPR[rt]._u32[2] = (CPU.GPR[ra]._u32[2] << nRot) | (CPU.GPR[ra]._u32[2] >> (32 - nRot)); + CPU.GPR[rt]._u32[3] = (CPU.GPR[ra]._u32[3] << nRot) | (CPU.GPR[ra]._u32[3] >> (32 - nRot)); + } + void ROTMI(u32 rt, u32 ra, s32 i7) + { + UNIMPLEMENTED(); + const int nRot = (0 - i7) % 64; + CPU.GPR[rt]._u32[0] = nRot < 32 ? CPU.GPR[ra]._u32[0] >> nRot : 0; + CPU.GPR[rt]._u32[1] = nRot < 32 ? CPU.GPR[ra]._u32[1] >> nRot : 0; + CPU.GPR[rt]._u32[2] = nRot < 32 ? CPU.GPR[ra]._u32[2] >> nRot : 0; + CPU.GPR[rt]._u32[3] = nRot < 32 ? CPU.GPR[ra]._u32[3] >> nRot : 0; + } + void ROTMAI(u32 rt, u32 ra, s32 i7) + { + UNIMPLEMENTED(); + const int nRot = (0 - i7) % 64; + CPU.GPR[rt]._i32[0] = nRot < 32 ? CPU.GPR[ra]._i32[0] >> nRot : CPU.GPR[ra]._i32[0] >> 31; + CPU.GPR[rt]._i32[1] = nRot < 32 ? CPU.GPR[ra]._i32[1] >> nRot : CPU.GPR[ra]._i32[1] >> 31; + CPU.GPR[rt]._i32[2] = nRot < 32 ? CPU.GPR[ra]._i32[2] >> nRot : CPU.GPR[ra]._i32[2] >> 31; + CPU.GPR[rt]._i32[3] = nRot < 32 ? CPU.GPR[ra]._i32[3] >> nRot : CPU.GPR[ra]._i32[3] >> 31; + } + void SHLI(u32 rt, u32 ra, s32 i7) + { + UNIMPLEMENTED(); + const u32 s = i7 & 0x3f; + + for (u32 j = 0; j < 4; ++j) + CPU.GPR[rt]._u32[j] = CPU.GPR[ra]._u32[j] << s; + } + void ROTHI(u32 rt, u32 ra, s32 i7) + { + UNIMPLEMENTED(); + const int nRot = i7 & 0xf; + + for (int h = 0; h < 8; h++) + CPU.GPR[rt]._u16[h] = (CPU.GPR[ra]._u16[h] << nRot) | (CPU.GPR[ra]._u16[h] >> (16 - nRot)); + } + void ROTHMI(u32 rt, u32 ra, s32 i7) + { + UNIMPLEMENTED(); + const int nRot = (0 - i7) % 32; + + for (int h = 0; h < 8; h++) + CPU.GPR[rt]._u16[h] = nRot < 16 ? 
CPU.GPR[ra]._u16[h] >> nRot : 0; + } + void ROTMAHI(u32 rt, u32 ra, s32 i7) + { + UNIMPLEMENTED(); + const int nRot = (0 - i7) % 32; + + for (int h = 0; h < 8; h++) + CPU.GPR[rt]._i16[h] = nRot < 16 ? CPU.GPR[ra]._i16[h] >> nRot : CPU.GPR[ra]._i16[h] >> 15; + } + void SHLHI(u32 rt, u32 ra, s32 i7) + { + UNIMPLEMENTED(); + const int nRot = i7 & 0x1f; /* shift count taken from the low 5 bits of i7; counts above 15 clear the halfword */ + + for (int h = 0; h < 8; h++) + CPU.GPR[rt]._u16[h] = nRot > 15 ? 0 : CPU.GPR[ra]._u16[h] << nRot; /* index by h so all eight halfwords are shifted, not only element 0 */ + } + void A(u32 rt, u32 ra, u32 rb) + { + UNIMPLEMENTED(); + CPU.GPR[rt]._u32[0] = CPU.GPR[ra]._u32[0] + CPU.GPR[rb]._u32[0]; + CPU.GPR[rt]._u32[1] = CPU.GPR[ra]._u32[1] + CPU.GPR[rb]._u32[1]; + CPU.GPR[rt]._u32[2] = CPU.GPR[ra]._u32[2] + CPU.GPR[rb]._u32[2]; + CPU.GPR[rt]._u32[3] = CPU.GPR[ra]._u32[3] + CPU.GPR[rb]._u32[3]; + } + void AND(u32 rt, u32 ra, u32 rb) + { + UNIMPLEMENTED(); + CPU.GPR[rt]._u32[0] = CPU.GPR[ra]._u32[0] & CPU.GPR[rb]._u32[0]; + CPU.GPR[rt]._u32[1] = CPU.GPR[ra]._u32[1] & CPU.GPR[rb]._u32[1]; + CPU.GPR[rt]._u32[2] = CPU.GPR[ra]._u32[2] & CPU.GPR[rb]._u32[2]; + CPU.GPR[rt]._u32[3] = CPU.GPR[ra]._u32[3] & CPU.GPR[rb]._u32[3]; + } + void CG(u32 rt, u32 ra, u32 rb) + { + UNIMPLEMENTED(); + CPU.GPR[rt]._u32[0] = ((CPU.GPR[ra]._u32[0] + CPU.GPR[rb]._u32[0]) < CPU.GPR[ra]._u32[0]) ? 1 : 0; + CPU.GPR[rt]._u32[1] = ((CPU.GPR[ra]._u32[1] + CPU.GPR[rb]._u32[1]) < CPU.GPR[ra]._u32[1]) ? 1 : 0; + CPU.GPR[rt]._u32[2] = ((CPU.GPR[ra]._u32[2] + CPU.GPR[rb]._u32[2]) < CPU.GPR[ra]._u32[2]) ? 1 : 0; + CPU.GPR[rt]._u32[3] = ((CPU.GPR[ra]._u32[3] + CPU.GPR[rb]._u32[3]) < CPU.GPR[ra]._u32[3]) ? 
1 : 0; + } + void AH(u32 rt, u32 ra, u32 rb) + { + UNIMPLEMENTED(); + for (int h = 0; h < 8; h++) + CPU.GPR[rt]._u16[h] = CPU.GPR[ra]._u16[h] + CPU.GPR[rb]._u16[h]; + } + void NAND(u32 rt, u32 ra, u32 rb) + { + UNIMPLEMENTED(); + CPU.GPR[rt]._u32[0] = ~(CPU.GPR[ra]._u32[0] & CPU.GPR[rb]._u32[0]); + CPU.GPR[rt]._u32[1] = ~(CPU.GPR[ra]._u32[1] & CPU.GPR[rb]._u32[1]); + CPU.GPR[rt]._u32[2] = ~(CPU.GPR[ra]._u32[2] & CPU.GPR[rb]._u32[2]); + CPU.GPR[rt]._u32[3] = ~(CPU.GPR[ra]._u32[3] & CPU.GPR[rb]._u32[3]); + } + void AVGB(u32 rt, u32 ra, u32 rb) + { + UNIMPLEMENTED(); + for (int b = 0; b < 16; b++) + CPU.GPR[rt]._u8[b] = (CPU.GPR[ra]._u8[b] + CPU.GPR[rb]._u8[b] + 1) >> 1; + } + void MTSPR(u32 rt, u32 sa) + { + UNIMPLEMENTED(); + if(sa != 0) + { + CPU.SPR[sa]._u128.hi = CPU.GPR[rt]._u128.hi; + CPU.SPR[sa]._u128.lo = CPU.GPR[rt]._u128.lo; + } + } + void WRCH(u32 ra, u32 rt) + { + UNIMPLEMENTED(); + CPU.WriteChannel(ra, CPU.GPR[rt]); + } + void BIZ(u32 rt, u32 ra) + { + UNIMPLEMENTED(); + if(CPU.GPR[rt]._u32[3] == 0) + CPU.SetBranch(branchTarget(CPU.GPR[ra]._u32[3], 0)); + } + void BINZ(u32 rt, u32 ra) + { + UNIMPLEMENTED(); + if(CPU.GPR[rt]._u32[3] != 0) + CPU.SetBranch(branchTarget(CPU.GPR[ra]._u32[3], 0)); + } + void BIHZ(u32 rt, u32 ra) + { + UNIMPLEMENTED(); + if(CPU.GPR[rt]._u16[6] == 0) + CPU.SetBranch(branchTarget(CPU.GPR[ra]._u32[3], 0)); + } + void BIHNZ(u32 rt, u32 ra) + { + UNIMPLEMENTED(); + if(CPU.GPR[rt]._u16[6] != 0) + CPU.SetBranch(branchTarget(CPU.GPR[ra]._u32[3], 0)); + } + void STOPD(u32 rc, u32 ra, u32 rb) + { + UNIMPLEMENTED(); + Emu.Pause(); + } + void STQX(u32 rt, u32 ra, u32 rb) + { + UNIMPLEMENTED(); + u32 lsa = (CPU.GPR[ra]._u32[3] + CPU.GPR[rb]._u32[3]) & 0x3fff0; + if(!CPU.IsGoodLSA(lsa)) + { + ConLog.Error("STQX: bad lsa (0x%x)", lsa); + Emu.Pause(); + return; + } + + CPU.WriteLS128(lsa, CPU.GPR[rt]._u128); + } + void BI(u32 ra) + { + UNIMPLEMENTED(); + CPU.SetBranch(branchTarget(CPU.GPR[ra]._u32[3], 0)); + } + void BISL(u32 rt, u32 ra) + { + 
UNIMPLEMENTED(); + const u32 NewPC = CPU.GPR[ra]._u32[3]; + CPU.GPR[rt].Reset(); + CPU.GPR[rt]._u32[3] = CPU.PC + 4; + CPU.SetBranch(branchTarget(NewPC, 0)); + } + void IRET(u32 ra) + { + UNIMPLEMENTED(); + //SetBranch(SRR0); + } + void BISLED(u32 rt, u32 ra) + { + UNIMPLEMENTED(); + } + void HBR(u32 p, u32 ro, u32 ra) + { + UNIMPLEMENTED(); + } + void GB(u32 rt, u32 ra) + { + UNIMPLEMENTED(); + CPU.GPR[rt]._u32[3] = (CPU.GPR[ra]._u32[0] & 1) | + ((CPU.GPR[ra]._u32[1] & 1) << 1) | + ((CPU.GPR[ra]._u32[2] & 1) << 2) | + ((CPU.GPR[ra]._u32[3] & 1) << 3); + CPU.GPR[rt]._u32[2] = 0; + CPU.GPR[rt]._u64[0] = 0; + } + void GBH(u32 rt, u32 ra) + { + UNIMPLEMENTED(); + u32 temp = 0; + for (int h = 0; h < 8; h++) + temp |= (CPU.GPR[ra]._u16[h] & 1) << h; + CPU.GPR[rt]._u32[3] = temp; + CPU.GPR[rt]._u32[2] = 0; + CPU.GPR[rt]._u64[0] = 0; + } + void GBB(u32 rt, u32 ra) + { + UNIMPLEMENTED(); + u32 temp = 0; + for (int b = 0; b < 16; b++) + temp |= (CPU.GPR[ra]._u8[b] & 1) << b; + CPU.GPR[rt]._u32[3] = temp; + CPU.GPR[rt]._u32[2] = 0; + CPU.GPR[rt]._u64[0] = 0; + } + void FSM(u32 rt, u32 ra) + { + UNIMPLEMENTED(); + const u32 pref = CPU.GPR[ra]._u32[3]; + for (int w = 0; w < 4; w++) + CPU.GPR[rt]._u32[w] = (pref & (1 << w)) ? ~0 : 0; + } + void FSMH(u32 rt, u32 ra) + { + UNIMPLEMENTED(); + const u32 pref = CPU.GPR[ra]._u32[3]; + for (int h = 0; h < 8; h++) + CPU.GPR[rt]._u16[h] = (pref & (1 << h)) ? ~0 : 0; + } + void FSMB(u32 rt, u32 ra) + { + UNIMPLEMENTED(); + const u32 pref = CPU.GPR[ra]._u32[3]; + for (int b = 0; b < 16; b++) + CPU.GPR[rt]._u8[b] = (pref & (1 << b)) ? 
~0 : 0; + } + void FREST(u32 rt, u32 ra) + { + UNIMPLEMENTED(); + //CPU.GPR[rt]._m128 = _mm_rcp_ps(CPU.GPR[ra]._m128); + for (int i = 0; i < 4; i++) + CPU.GPR[rt]._f[i] = 1 / CPU.GPR[ra]._f[i]; + } + void FRSQEST(u32 rt, u32 ra) + { + UNIMPLEMENTED(); + //const __u32x4 FloatAbsMask = {0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff}; + //CPU.GPR[rt]._m128 = _mm_rsqrt_ps(_mm_and_ps(CPU.GPR[ra]._m128, FloatAbsMask.m128)); + for (int i = 0; i < 4; i++) + CPU.GPR[rt]._f[i] = 1 / sqrt(abs(CPU.GPR[ra]._f[i])); + } + void LQX(u32 rt, u32 ra, u32 rb) + { + UNIMPLEMENTED(); + u32 a = CPU.GPR[ra]._u32[3], b = CPU.GPR[rb]._u32[3]; + + u32 lsa = (a + b) & 0x3fff0; + + if(!CPU.IsGoodLSA(lsa)) + { + ConLog.Error("LQX: bad lsa (0x%x)", lsa); + Emu.Pause(); + return; + } + + CPU.GPR[rt]._u128 = CPU.ReadLS128(lsa); + } + void ROTQBYBI(u32 rt, u32 ra, u32 rb) + { + UNIMPLEMENTED(); + const int s = (CPU.GPR[rb]._u32[3] >> 3) & 0xf; + const SPU_GPR_hdr temp = CPU.GPR[ra]; + for (int b = 0; b < 16; b++) + CPU.GPR[rt]._u8[b] = temp._u8[(b - s) & 0xf]; + } + void ROTQMBYBI(u32 rt, u32 ra, u32 rb) + { + UNIMPLEMENTED(); + const int s = (0 - (CPU.GPR[rb]._u32[3] >> 3)) & 0x1f; + const SPU_GPR_hdr temp = CPU.GPR[ra]; + CPU.GPR[rt].Reset(); + for (int b = 0; b < 16 - s; b++) + CPU.GPR[rt]._u8[b] = temp._u8[b + s]; + } + void SHLQBYBI(u32 rt, u32 ra, u32 rb) + { + UNIMPLEMENTED(); + const int s = (CPU.GPR[rb]._u32[3] >> 3) & 0x1f; + const SPU_GPR_hdr temp = CPU.GPR[ra]; + CPU.GPR[rt].Reset(); + for (int b = s; b < 16; b++) + CPU.GPR[rt]._u8[b] = temp._u8[b - s]; + } + void CBX(u32 rt, u32 ra, u32 rb) + { + UNIMPLEMENTED(); + const u32 t = (CPU.GPR[rb]._u32[3] + CPU.GPR[ra]._u32[3]) & 0xF; + + CPU.GPR[rt]._u64[0] = (u64)0x18191A1B1C1D1E1F; + CPU.GPR[rt]._u64[1] = (u64)0x1011121314151617; + CPU.GPR[rt]._u8[15 - t] = 0x03; + } + void CHX(u32 rt, u32 ra, u32 rb) + { + UNIMPLEMENTED(); + const u32 t = (CPU.GPR[rb]._u32[3] + CPU.GPR[ra]._u32[3]) & 0xE; + + CPU.GPR[rt]._u64[0] = 
(u64)0x18191A1B1C1D1E1F; + CPU.GPR[rt]._u64[1] = (u64)0x1011121314151617; + CPU.GPR[rt]._u16[7 - (t >> 1)] = 0x0203; + } + void CWX(u32 rt, u32 ra, u32 rb) + { + UNIMPLEMENTED(); + const u32 t = (CPU.GPR[ra]._u32[3] + CPU.GPR[rb]._u32[3]) & 0xC; + + CPU.GPR[rt]._u64[0] = (u64)0x18191A1B1C1D1E1F; + CPU.GPR[rt]._u64[1] = (u64)0x1011121314151617; + CPU.GPR[rt]._u32[3 - (t >> 2)] = 0x00010203; + } + void CDX(u32 rt, u32 ra, u32 rb) + { + UNIMPLEMENTED(); + const u32 t = (CPU.GPR[rb]._u32[3] + CPU.GPR[ra]._u32[3]) & 0x8; + + CPU.GPR[rt]._u64[0] = (u64)0x18191A1B1C1D1E1F; + CPU.GPR[rt]._u64[1] = (u64)0x1011121314151617; + CPU.GPR[rt]._u64[1 - (t >> 3)] = (u64)0x0001020304050607; + } + void ROTQBI(u32 rt, u32 ra, u32 rb) + { + UNIMPLEMENTED(); + const int t = CPU.GPR[rb]._u32[3] & 0x7; + const SPU_GPR_hdr temp = CPU.GPR[ra]; + CPU.GPR[rt]._u32[0] = (temp._u32[0] << t) | (temp._u32[3] >> (32 - t)); + CPU.GPR[rt]._u32[1] = (temp._u32[1] << t) | (temp._u32[0] >> (32 - t)); + CPU.GPR[rt]._u32[2] = (temp._u32[2] << t) | (temp._u32[1] >> (32 - t)); + CPU.GPR[rt]._u32[3] = (temp._u32[3] << t) | (temp._u32[2] >> (32 - t)); + } + void ROTQMBI(u32 rt, u32 ra, u32 rb) + { + UNIMPLEMENTED(); + const int t = (0 - CPU.GPR[rb]._u32[3]) & 0x7; + const SPU_GPR_hdr temp = CPU.GPR[ra]; + CPU.GPR[rt]._u32[0] = (temp._u32[0] >> t) | (temp._u32[1] << (32 - t)); + CPU.GPR[rt]._u32[1] = (temp._u32[1] >> t) | (temp._u32[2] << (32 - t)); + CPU.GPR[rt]._u32[2] = (temp._u32[2] >> t) | (temp._u32[3] << (32 - t)); + CPU.GPR[rt]._u32[3] = (temp._u32[3] >> t); + } + void SHLQBI(u32 rt, u32 ra, u32 rb) + { + UNIMPLEMENTED(); + const int t = CPU.GPR[rb]._u32[3] & 0x7; + const SPU_GPR_hdr temp = CPU.GPR[ra]; + CPU.GPR[rt]._u32[0] = (temp._u32[0] << t); + CPU.GPR[rt]._u32[1] = (temp._u32[1] << t) | (temp._u32[0] >> (32 - t)); + CPU.GPR[rt]._u32[2] = (temp._u32[2] << t) | (temp._u32[1] >> (32 - t)); + CPU.GPR[rt]._u32[3] = (temp._u32[3] << t) | (temp._u32[2] >> (32 - t)); + } + void ROTQBY(u32 rt, u32 ra, 
u32 rb) + { + UNIMPLEMENTED(); + const int s = CPU.GPR[rb]._u32[3] & 0xf; + const SPU_GPR_hdr temp = CPU.GPR[ra]; + for (int b = 0; b < 16; ++b) + CPU.GPR[rt]._u8[b] = temp._u8[(b - s) & 0xf]; + } + void ROTQMBY(u32 rt, u32 ra, u32 rb) + { + UNIMPLEMENTED(); + const int s = (0 - CPU.GPR[rb]._u32[3]) & 0x1f; + const SPU_GPR_hdr temp = CPU.GPR[ra]; + CPU.GPR[rt].Reset(); + for (int b = 0; b < 16 - s; b++) + CPU.GPR[rt]._u8[b] = temp._u8[b + s]; + } + void SHLQBY(u32 rt, u32 ra, u32 rb) + { + UNIMPLEMENTED(); + const int s = CPU.GPR[rb]._u32[3] & 0x1f; + const SPU_GPR_hdr temp = CPU.GPR[ra]; + CPU.GPR[rt].Reset(); + for (int b = s; b < 16; b++) + CPU.GPR[rt]._u8[b] = temp._u8[b - s]; + } + void ORX(u32 rt, u32 ra) + { + UNIMPLEMENTED(); + CPU.GPR[rt]._u32[3] = CPU.GPR[ra]._u32[0] | CPU.GPR[ra]._u32[1] | CPU.GPR[ra]._u32[2] | CPU.GPR[ra]._u32[3]; + CPU.GPR[rt]._u32[2] = 0; + CPU.GPR[rt]._u64[0] = 0; + } + void CBD(u32 rt, u32 ra, s32 i7) + { + UNIMPLEMENTED(); + const int t = (CPU.GPR[ra]._u32[3] + i7) & 0xF; + + CPU.GPR[rt]._u64[0] = (u64)0x18191A1B1C1D1E1F; + CPU.GPR[rt]._u64[1] = (u64)0x1011121314151617; + CPU.GPR[rt]._u8[15 - t] = 0x03; + } + void CHD(u32 rt, u32 ra, s32 i7) + { + UNIMPLEMENTED(); + const int t = (CPU.GPR[ra]._u32[3] + i7) & 0xE; + + CPU.GPR[rt]._u64[0] = (u64)0x18191A1B1C1D1E1F; + CPU.GPR[rt]._u64[1] = (u64)0x1011121314151617; + CPU.GPR[rt]._u16[7 - (t >> 1)] = 0x0203; + } + void CWD(u32 rt, u32 ra, s32 i7) + { + UNIMPLEMENTED(); + const int t = (CPU.GPR[ra]._u32[3] + i7) & 0xC; + + CPU.GPR[rt]._u64[0] = (u64)0x18191A1B1C1D1E1F; + CPU.GPR[rt]._u64[1] = (u64)0x1011121314151617; + CPU.GPR[rt]._u32[3 - (t >> 2)] = 0x00010203; + } + void CDD(u32 rt, u32 ra, s32 i7) + { + UNIMPLEMENTED(); + const int t = (CPU.GPR[ra]._u32[3] + i7) & 0x8; + + CPU.GPR[rt]._u64[0] = (u64)0x18191A1B1C1D1E1F; + CPU.GPR[rt]._u64[1] = (u64)0x1011121314151617; + CPU.GPR[rt]._u64[1 - (t >> 3)] = (u64)0x0001020304050607; + } + void ROTQBII(u32 rt, u32 ra, s32 i7) + { + 
UNIMPLEMENTED(); + const int s = i7 & 0x7; + const SPU_GPR_hdr temp = CPU.GPR[ra]; + CPU.GPR[rt]._u32[0] = (temp._u32[0] << s) | (temp._u32[3] >> (32 - s)); + CPU.GPR[rt]._u32[1] = (temp._u32[1] << s) | (temp._u32[0] >> (32 - s)); + CPU.GPR[rt]._u32[2] = (temp._u32[2] << s) | (temp._u32[1] >> (32 - s)); + CPU.GPR[rt]._u32[3] = (temp._u32[3] << s) | (temp._u32[2] >> (32 - s)); + } + void ROTQMBII(u32 rt, u32 ra, s32 i7) + { + UNIMPLEMENTED(); + const int s = (0 - i7) & 0x7; + const SPU_GPR_hdr temp = CPU.GPR[ra]; + CPU.GPR[rt]._u32[0] = (temp._u32[0] >> s) | (temp._u32[1] << (32 - s)); + CPU.GPR[rt]._u32[1] = (temp._u32[1] >> s) | (temp._u32[2] << (32 - s)); + CPU.GPR[rt]._u32[2] = (temp._u32[2] >> s) | (temp._u32[3] << (32 - s)); + CPU.GPR[rt]._u32[3] = (temp._u32[3] >> s); + } + void SHLQBII(u32 rt, u32 ra, s32 i7) + { + UNIMPLEMENTED(); + const int s = i7 & 0x7; + const SPU_GPR_hdr temp = CPU.GPR[ra]; + CPU.GPR[rt]._u32[0] = (temp._u32[0] << s); + CPU.GPR[rt]._u32[1] = (temp._u32[1] << s) | (temp._u32[0] >> (32 - s)); + CPU.GPR[rt]._u32[2] = (temp._u32[2] << s) | (temp._u32[1] >> (32 - s)); + CPU.GPR[rt]._u32[3] = (temp._u32[3] << s) | (temp._u32[2] >> (32 - s)); + } + void ROTQBYI(u32 rt, u32 ra, s32 i7) + { + UNIMPLEMENTED(); + const int s = i7 & 0xf; + const SPU_GPR_hdr temp = CPU.GPR[ra]; + for (int b = 0; b < 16; b++) + CPU.GPR[rt]._u8[b] = temp._u8[(b - s) & 0xf]; + } + void ROTQMBYI(u32 rt, u32 ra, s32 i7) + { + UNIMPLEMENTED(); + const int s = (0 - i7) & 0x1f; + const SPU_GPR_hdr temp = CPU.GPR[ra]; + CPU.GPR[rt].Reset(); + for (int b = 0; b < 16 - s; b++) + CPU.GPR[rt]._u8[b] = temp._u8[b + s]; + } + void SHLQBYI(u32 rt, u32 ra, s32 i7) + { + const int s = i7 & 0x1f; + XmmVar v0(c); + if (s == 0) + { + if (ra == rt) + { + // nop + } + else + { + // mov + c.movaps(v0, cpu_xmm(GPR[ra])); + c.movaps(cpu_xmm(GPR[rt]), v0); + } + } + else if (s > 15) + { + // zero + c.xorps(v0, v0); + c.movaps(cpu_xmm(GPR[rt]), v0); + } + else + { + // shift left + 
c.movdqa(v0, cpu_xmm(GPR[ra])); + c.pslldq(v0, s); + c.movdqa(cpu_xmm(GPR[rt]), v0); + } + } + void NOP(u32 rt) + { + UNIMPLEMENTED(); + } + void CGT(u32 rt, u32 ra, u32 rb) + { + UNIMPLEMENTED(); + for (int w = 0; w < 4; w++) + CPU.GPR[rt]._u32[w] = CPU.GPR[ra]._i32[w] > CPU.GPR[rb]._i32[w] ? 0xffffffff : 0; + } + void XOR(u32 rt, u32 ra, u32 rb) + { + UNIMPLEMENTED(); + for (int w = 0; w < 4; w++) + CPU.GPR[rt]._u32[w] = CPU.GPR[ra]._u32[w] ^ CPU.GPR[rb]._u32[w]; + } + void CGTH(u32 rt, u32 ra, u32 rb) + { + UNIMPLEMENTED(); + for (int h = 0; h < 8; h++) + CPU.GPR[rt]._u16[h] = CPU.GPR[ra]._i16[h] > CPU.GPR[rb]._i16[h] ? 0xffff : 0; + } + void EQV(u32 rt, u32 ra, u32 rb) + { + UNIMPLEMENTED(); + for (int w = 0; w < 4; w++) + CPU.GPR[rt]._u32[w] = CPU.GPR[ra]._u32[w] ^ (~CPU.GPR[rb]._u32[w]); + } + void CGTB(u32 rt, u32 ra, u32 rb) + { + UNIMPLEMENTED(); + for (int b = 0; b < 16; b++) + CPU.GPR[rt]._u8[b] = CPU.GPR[ra]._i8[b] > CPU.GPR[rb]._i8[b] ? 0xff : 0; + } + void SUMB(u32 rt, u32 ra, u32 rb) + { + UNIMPLEMENTED(); + const SPU_GPR_hdr _a = CPU.GPR[ra]; + const SPU_GPR_hdr _b = CPU.GPR[rb]; + for (int w = 0; w < 4; w++) + { + CPU.GPR[rt]._u16[w*2] = _a._u8[w*4] + _a._u8[w*4 + 1] + _a._u8[w*4 + 2] + _a._u8[w*4 + 3]; + CPU.GPR[rt]._u16[w*2 + 1] = _b._u8[w*4] + _b._u8[w*4 + 1] + _b._u8[w*4 + 2] + _b._u8[w*4 + 3]; + } + } + //HGT uses signed values. 
HLGT uses unsigned values + void HGT(u32 rt, s32 ra, s32 rb) + { + UNIMPLEMENTED(); + if(CPU.GPR[ra]._i32[3] > CPU.GPR[rb]._i32[3]) CPU.Stop(); + } + void CLZ(u32 rt, u32 ra) + { + UNIMPLEMENTED(); + for (int w = 0; w < 4; w++) + { + int nPos; + + for (nPos = 0; nPos < 32; nPos++) + if (CPU.GPR[ra]._u32[w] & (1 << (31 - nPos))) + break; + + CPU.GPR[rt]._u32[w] = nPos; + } + } + void XSWD(u32 rt, u32 ra) + { + UNIMPLEMENTED(); + CPU.GPR[rt]._i64[0] = (s64)CPU.GPR[ra]._i32[0]; + CPU.GPR[rt]._i64[1] = (s64)CPU.GPR[ra]._i32[2]; + } + void XSHW(u32 rt, u32 ra) + { + UNIMPLEMENTED(); + for (int w = 0; w < 4; w++) + CPU.GPR[rt]._i32[w] = (s32)CPU.GPR[ra]._i16[w*2]; + } + void CNTB(u32 rt, u32 ra) + { + UNIMPLEMENTED(); + const SPU_GPR_hdr temp = CPU.GPR[ra]; + CPU.GPR[rt].Reset(); + for (int b = 0; b < 16; b++) + for (int i = 0; i < 8; i++) + CPU.GPR[rt]._u8[b] += (temp._u8[b] & (1 << i)) ? 1 : 0; + } + void XSBH(u32 rt, u32 ra) + { + UNIMPLEMENTED(); + for (int h = 0; h < 8; h++) + CPU.GPR[rt]._i16[h] = (s16)CPU.GPR[ra]._i8[h*2]; + } + void CLGT(u32 rt, u32 ra, u32 rb) + { + UNIMPLEMENTED(); + for(u32 i = 0; i < 4; ++i) + { + CPU.GPR[rt]._u32[i] = (CPU.GPR[ra]._u32[i] > CPU.GPR[rb]._u32[i]) ? 0xffffffff : 0x00000000; + } + } + void ANDC(u32 rt, u32 ra, u32 rb) + { + UNIMPLEMENTED(); + for (int w = 0; w < 4; w++) + CPU.GPR[rt]._u32[w] = CPU.GPR[ra]._u32[w] & (~CPU.GPR[rb]._u32[w]); + } + void FCGT(u32 rt, u32 ra, u32 rb) + { + UNIMPLEMENTED(); + CPU.GPR[rt]._u32[0] = CPU.GPR[ra]._f[0] > CPU.GPR[rb]._f[0] ? 0xffffffff : 0; + CPU.GPR[rt]._u32[1] = CPU.GPR[ra]._f[1] > CPU.GPR[rb]._f[1] ? 0xffffffff : 0; + CPU.GPR[rt]._u32[2] = CPU.GPR[ra]._f[2] > CPU.GPR[rb]._f[2] ? 0xffffffff : 0; + CPU.GPR[rt]._u32[3] = CPU.GPR[ra]._f[3] > CPU.GPR[rb]._f[3] ? 0xffffffff : 0; + } + void DFCGT(u32 rt, u32 ra, u32 rb) + { + UNIMPLEMENTED(); + CPU.GPR[rt]._u64[0] = CPU.GPR[ra]._d[0] > CPU.GPR[rb]._d[0] ? 0xffffffffffffffff : 0; + CPU.GPR[rt]._u64[1] = CPU.GPR[ra]._d[1] > CPU.GPR[rb]._d[1] ? 
0xffffffffffffffff : 0; + } + void FA(u32 rt, u32 ra, u32 rb) + { + UNIMPLEMENTED(); + CPU.GPR[rt]._f[0] = CPU.GPR[ra]._f[0] + CPU.GPR[rb]._f[0]; + CPU.GPR[rt]._f[1] = CPU.GPR[ra]._f[1] + CPU.GPR[rb]._f[1]; + CPU.GPR[rt]._f[2] = CPU.GPR[ra]._f[2] + CPU.GPR[rb]._f[2]; + CPU.GPR[rt]._f[3] = CPU.GPR[ra]._f[3] + CPU.GPR[rb]._f[3]; + } + void FS(u32 rt, u32 ra, u32 rb) + { + UNIMPLEMENTED(); + CPU.GPR[rt]._f[0] = CPU.GPR[ra]._f[0] - CPU.GPR[rb]._f[0]; + CPU.GPR[rt]._f[1] = CPU.GPR[ra]._f[1] - CPU.GPR[rb]._f[1]; + CPU.GPR[rt]._f[2] = CPU.GPR[ra]._f[2] - CPU.GPR[rb]._f[2]; + CPU.GPR[rt]._f[3] = CPU.GPR[ra]._f[3] - CPU.GPR[rb]._f[3]; + } + void FM(u32 rt, u32 ra, u32 rb) + { + UNIMPLEMENTED(); + CPU.GPR[rt]._f[0] = CPU.GPR[ra]._f[0] * CPU.GPR[rb]._f[0]; + CPU.GPR[rt]._f[1] = CPU.GPR[ra]._f[1] * CPU.GPR[rb]._f[1]; + CPU.GPR[rt]._f[2] = CPU.GPR[ra]._f[2] * CPU.GPR[rb]._f[2]; + CPU.GPR[rt]._f[3] = CPU.GPR[ra]._f[3] * CPU.GPR[rb]._f[3]; + } + void CLGTH(u32 rt, u32 ra, u32 rb) + { + UNIMPLEMENTED(); + for (int h = 0; h < 8; h++) + CPU.GPR[rt]._u16[h] = CPU.GPR[ra]._u16[h] > CPU.GPR[rb]._u16[h] ? 0xffff : 0; + } + void ORC(u32 rt, u32 ra, u32 rb) + { + UNIMPLEMENTED(); + for (int w = 0; w < 4; w++) + CPU.GPR[rt]._u32[w] = CPU.GPR[ra]._u32[w] | (~CPU.GPR[rb]._u32[w]); + } + void FCMGT(u32 rt, u32 ra, u32 rb) + { + UNIMPLEMENTED(); + CPU.GPR[rt]._u32[0] = fabs(CPU.GPR[ra]._f[0]) > fabs(CPU.GPR[rb]._f[0]) ? 0xffffffff : 0; + CPU.GPR[rt]._u32[1] = fabs(CPU.GPR[ra]._f[1]) > fabs(CPU.GPR[rb]._f[1]) ? 0xffffffff : 0; + CPU.GPR[rt]._u32[2] = fabs(CPU.GPR[ra]._f[2]) > fabs(CPU.GPR[rb]._f[2]) ? 0xffffffff : 0; + CPU.GPR[rt]._u32[3] = fabs(CPU.GPR[ra]._f[3]) > fabs(CPU.GPR[rb]._f[3]) ? 0xffffffff : 0; + } + void DFCMGT(u32 rt, u32 ra, u32 rb) + { + UNIMPLEMENTED(); + CPU.GPR[rt]._u64[0] = fabs(CPU.GPR[ra]._d[0]) > fabs(CPU.GPR[rb]._d[0]) ? 0xffffffffffffffff : 0; + CPU.GPR[rt]._u64[1] = fabs(CPU.GPR[ra]._d[1]) > fabs(CPU.GPR[rb]._d[1]) ? 
0xffffffffffffffff : 0; + } + void DFA(u32 rt, u32 ra, u32 rb) + { + UNIMPLEMENTED(); + CPU.GPR[rt]._d[0] = CPU.GPR[ra]._d[0] + CPU.GPR[rb]._d[0]; + CPU.GPR[rt]._d[1] = CPU.GPR[ra]._d[1] + CPU.GPR[rb]._d[1]; + } + void DFS(u32 rt, u32 ra, u32 rb) + { + UNIMPLEMENTED(); + CPU.GPR[rt]._d[0] = CPU.GPR[ra]._d[0] - CPU.GPR[rb]._d[0]; + CPU.GPR[rt]._d[1] = CPU.GPR[ra]._d[1] - CPU.GPR[rb]._d[1]; + } + void DFM(u32 rt, u32 ra, u32 rb) + { + UNIMPLEMENTED(); + CPU.GPR[rt]._d[0] = CPU.GPR[ra]._d[0] * CPU.GPR[rb]._d[0]; + CPU.GPR[rt]._d[1] = CPU.GPR[ra]._d[1] * CPU.GPR[rb]._d[1]; + } + void CLGTB(u32 rt, u32 ra, u32 rb) + { + UNIMPLEMENTED(); + for (int b = 0; b < 16; b++) + CPU.GPR[rt]._u8[b] = CPU.GPR[ra]._u8[b] > CPU.GPR[rb]._u8[b] ? 0xff : 0; + } + void HLGT(u32 rt, u32 ra, u32 rb) + { + UNIMPLEMENTED(); + if(CPU.GPR[ra]._u32[3] > CPU.GPR[rb]._u32[3]) CPU.Stop(); + } + void DFMA(u32 rt, u32 ra, u32 rb) + { + UNIMPLEMENTED(); + CPU.GPR[rt]._d[0] += CPU.GPR[ra]._d[0] * CPU.GPR[rb]._d[0]; + CPU.GPR[rt]._d[1] += CPU.GPR[ra]._d[1] * CPU.GPR[rb]._d[1]; + } + void DFMS(u32 rt, u32 ra, u32 rb) + { + UNIMPLEMENTED(); + CPU.GPR[rt]._d[0] = CPU.GPR[ra]._d[0] * CPU.GPR[rb]._d[0] - CPU.GPR[rt]._d[0]; + CPU.GPR[rt]._d[1] = CPU.GPR[ra]._d[1] * CPU.GPR[rb]._d[1] - CPU.GPR[rt]._d[1]; + } + void DFNMS(u32 rt, u32 ra, u32 rb) + { + UNIMPLEMENTED(); + CPU.GPR[rt]._d[0] -= CPU.GPR[ra]._d[0] * CPU.GPR[rb]._d[0]; + CPU.GPR[rt]._d[1] -= CPU.GPR[ra]._d[1] * CPU.GPR[rb]._d[1]; + } + void DFNMA(u32 rt, u32 ra, u32 rb) + { + UNIMPLEMENTED(); + CPU.GPR[rt]._d[0] = -(CPU.GPR[ra]._d[0] * CPU.GPR[rb]._d[0] + CPU.GPR[rt]._d[0]); + CPU.GPR[rt]._d[1] = -(CPU.GPR[ra]._d[1] * CPU.GPR[rb]._d[1] + CPU.GPR[rt]._d[1]); + } + void CEQ(u32 rt, u32 ra, u32 rb) + { + UNIMPLEMENTED(); + for (int w = 0; w < 4; w++) + CPU.GPR[rt]._u32[w] = CPU.GPR[ra]._i32[w] == CPU.GPR[rb]._i32[w] ? 
0xffffffff : 0; + } + void MPYHHU(u32 rt, u32 ra, u32 rb) + { + UNIMPLEMENTED(); + for (int w = 0; w < 4; w++) + CPU.GPR[rt]._u32[w] = CPU.GPR[ra]._u16[w*2+1] * CPU.GPR[rb]._u16[w*2+1]; + } + void ADDX(u32 rt, u32 ra, u32 rb) + { + UNIMPLEMENTED(); + for (int w = 0; w < 4; w++) + CPU.GPR[rt]._u32[w] = CPU.GPR[ra]._u32[w] + CPU.GPR[rb]._u32[w] + (CPU.GPR[rt]._u32[w] & 1); + } + void SFX(u32 rt, u32 ra, u32 rb) + { + UNIMPLEMENTED(); + for (int w = 0; w < 4; w++) + CPU.GPR[rt]._u32[w] = CPU.GPR[rb]._u32[w] - CPU.GPR[ra]._u32[w] - (1 - (CPU.GPR[rt]._u32[w] & 1)); + } + void CGX(u32 rt, u32 ra, u32 rb) + { + UNIMPLEMENTED(); + for (int w = 0; w < 4; w++) + CPU.GPR[rt]._u32[w] = ((u64)CPU.GPR[ra]._u32[w] + (u64)CPU.GPR[rb]._u32[w] + (u64)(CPU.GPR[rt]._u32[w] & 1)) >> 32; + } + void BGX(u32 rt, u32 ra, u32 rb) + { + UNIMPLEMENTED(); + s64 nResult; + + for (int w = 0; w < 4; w++) + { + nResult = (u64)CPU.GPR[rb]._u32[w] - (u64)CPU.GPR[ra]._u32[w] - (u64)(1 - (CPU.GPR[rt]._u32[w] & 1)); + CPU.GPR[rt]._u32[w] = nResult < 0 ? 
0 : 1; + } + } + void MPYHHA(u32 rt, u32 ra, u32 rb) + { + UNIMPLEMENTED(); + for (int w = 0; w < 4; w++) + CPU.GPR[rt]._i32[w] += CPU.GPR[ra]._i16[w*2+1] * CPU.GPR[rb]._i16[w*2+1]; + } + void MPYHHAU(u32 rt, u32 ra, u32 rb) + { + UNIMPLEMENTED(); + for (int w = 0; w < 4; w++) + CPU.GPR[rt]._u32[w] += CPU.GPR[ra]._u16[w*2+1] * CPU.GPR[rb]._u16[w*2+1]; + } + //Forced bits to 0, hence the shift: + + void FSCRRD(u32 rt) + { + /*CPU.GPR[rt]._u128.lo = + CPU.FPSCR.Exception0 << 20 & + CPU.FPSCR.*/ + UNIMPLEMENTED(); + } + void FESD(u32 rt, u32 ra) + { + UNIMPLEMENTED(); + CPU.GPR[rt]._d[0] = (double)CPU.GPR[ra]._f[1]; + CPU.GPR[rt]._d[1] = (double)CPU.GPR[ra]._f[3]; + } + void FRDS(u32 rt, u32 ra) + { + UNIMPLEMENTED(); + CPU.GPR[rt]._f[1] = (float)CPU.GPR[ra]._d[0]; + CPU.GPR[rt]._u32[0] = 0x00000000; + CPU.GPR[rt]._f[3] = (float)CPU.GPR[ra]._d[1]; + CPU.GPR[rt]._u32[2] = 0x00000000; + } + void FSCRWR(u32 rt, u32 ra) + { + UNIMPLEMENTED(); + } + void DFTSV(u32 rt, u32 ra, s32 i7) + { + UNIMPLEMENTED(); + const u64 DoubleExpMask = 0x7ff0000000000000; + const u64 DoubleFracMask = 0x000fffffffffffff; + const u64 DoubleSignMask = 0x8000000000000000; + const SPU_GPR_hdr temp = CPU.GPR[ra]; + CPU.GPR[rt].Reset(); + if (i7 & 1) //Negative Denorm Check (-, exp is zero, frac is non-zero) + for (int i = 0; i < 2; i++) + { + if (temp._u64[i] & DoubleFracMask) + if ((temp._u64[i] & (DoubleSignMask | DoubleExpMask)) == DoubleSignMask) + CPU.GPR[rt]._u64[i] = 0xffffffffffffffff; + } + if (i7 & 2) //Positive Denorm Check (+, exp is zero, frac is non-zero) + for (int i = 0; i < 2; i++) + { + if (temp._u64[i] & DoubleFracMask) + if ((temp._u64[i] & (DoubleSignMask | DoubleExpMask)) == 0) + CPU.GPR[rt]._u64[i] = 0xffffffffffffffff; + } + if (i7 & 4) //Negative Zero Check (-, exp is zero, frac is zero) + for (int i = 0; i < 2; i++) + { + if (temp._u64[i] == DoubleSignMask) + CPU.GPR[rt]._u64[i] = 0xffffffffffffffff; + } + if (i7 & 8) //Positive Zero Check (+, exp is zero, frac is zero) + 
for (int i = 0; i < 2; i++) + { + if (temp._u64[i] == 0) + CPU.GPR[rt]._u64[i] = 0xffffffffffffffff; + } + if (i7 & 16) //Negative Infinity Check (-, exp is 0x7ff, frac is zero) + for (int i = 0; i < 2; i++) + { + if (temp._u64[i] == (DoubleSignMask | DoubleExpMask)) + CPU.GPR[rt]._u64[i] = 0xffffffffffffffff; + } + if (i7 & 32) //Positive Infinity Check (+, exp is 0x7ff, frac is zero) + for (int i = 0; i < 2; i++) + { + if (temp._u64[i] == DoubleExpMask) + CPU.GPR[rt]._u64[i] = 0xffffffffffffffff; + } + if (i7 & 64) //Not-a-Number Check (any sign, exp is 0x7ff, frac is non-zero) + for (int i = 0; i < 2; i++) + { + if (temp._u64[i] & DoubleFracMask) + if ((temp._u64[i] & DoubleExpMask) == DoubleExpMask) + CPU.GPR[rt]._u64[i] = 0xffffffffffffffff; + } + } + void FCEQ(u32 rt, u32 ra, u32 rb) + { + UNIMPLEMENTED(); + CPU.GPR[rt]._u32[0] = CPU.GPR[ra]._f[0] == CPU.GPR[rb]._f[0] ? 0xffffffff : 0; + CPU.GPR[rt]._u32[1] = CPU.GPR[ra]._f[1] == CPU.GPR[rb]._f[1] ? 0xffffffff : 0; + CPU.GPR[rt]._u32[2] = CPU.GPR[ra]._f[2] == CPU.GPR[rb]._f[2] ? 0xffffffff : 0; + CPU.GPR[rt]._u32[3] = CPU.GPR[ra]._f[3] == CPU.GPR[rb]._f[3] ? 0xffffffff : 0; + } + void DFCEQ(u32 rt, u32 ra, u32 rb) + { + UNIMPLEMENTED(); + CPU.GPR[rt]._u64[0] = CPU.GPR[ra]._d[0] == CPU.GPR[rb]._d[0] ? 0xffffffffffffffff : 0; + CPU.GPR[rt]._u64[1] = CPU.GPR[ra]._d[1] == CPU.GPR[rb]._d[1] ? 
0xffffffffffffffff : 0; + } + void MPY(u32 rt, u32 ra, u32 rb) + { + UNIMPLEMENTED(); + for (int w = 0; w < 4; w++) + CPU.GPR[rt]._i32[w] = CPU.GPR[ra]._i16[w*2] * CPU.GPR[rb]._i16[w*2]; + } + void MPYH(u32 rt, u32 ra, u32 rb) + { + UNIMPLEMENTED(); + for (int w = 0; w < 4; w++) + CPU.GPR[rt]._i32[w] = (CPU.GPR[ra]._i16[w*2+1] * CPU.GPR[rb]._i16[w*2]) << 16; + } + void MPYHH(u32 rt, u32 ra, u32 rb) + { + UNIMPLEMENTED(); + for (int w = 0; w < 4; w++) + CPU.GPR[rt]._i32[w] = CPU.GPR[ra]._i16[w*2+1] * CPU.GPR[rb]._i16[w*2+1]; + } + void MPYS(u32 rt, u32 ra, u32 rb) + { + UNIMPLEMENTED(); + for (int w = 0; w < 4; w++) + CPU.GPR[rt]._i32[w] = (CPU.GPR[ra]._i16[w*2] * CPU.GPR[rb]._i16[w*2]) >> 16; + } + void CEQH(u32 rt, u32 ra, u32 rb) + { + UNIMPLEMENTED(); + for (int h = 0; h < 8; h++) + CPU.GPR[rt]._u16[h] = CPU.GPR[ra]._u16[h] == CPU.GPR[rb]._u16[h] ? 0xffff : 0; + } + void FCMEQ(u32 rt, u32 ra, u32 rb) + { + UNIMPLEMENTED(); + CPU.GPR[rt]._u32[0] = fabs(CPU.GPR[ra]._f[0]) == fabs(CPU.GPR[rb]._f[0]) ? 0xffffffff : 0; + CPU.GPR[rt]._u32[1] = fabs(CPU.GPR[ra]._f[1]) == fabs(CPU.GPR[rb]._f[1]) ? 0xffffffff : 0; + CPU.GPR[rt]._u32[2] = fabs(CPU.GPR[ra]._f[2]) == fabs(CPU.GPR[rb]._f[2]) ? 0xffffffff : 0; + CPU.GPR[rt]._u32[3] = fabs(CPU.GPR[ra]._f[3]) == fabs(CPU.GPR[rb]._f[3]) ? 0xffffffff : 0; + } + void DFCMEQ(u32 rt, u32 ra, u32 rb) + { + UNIMPLEMENTED(); + CPU.GPR[rt]._u64[0] = fabs(CPU.GPR[ra]._d[0]) == fabs(CPU.GPR[rb]._d[0]) ? 0xffffffffffffffff : 0; + CPU.GPR[rt]._u64[1] = fabs(CPU.GPR[ra]._d[1]) == fabs(CPU.GPR[rb]._d[1]) ? 0xffffffffffffffff : 0; + } + void MPYU(u32 rt, u32 ra, u32 rb) + { + UNIMPLEMENTED(); + for (int w = 0; w < 4; w++) + CPU.GPR[rt]._u32[w] = CPU.GPR[ra]._u16[w*2] * CPU.GPR[rb]._u16[w*2]; + } + void CEQB(u32 rt, u32 ra, u32 rb) + { + UNIMPLEMENTED(); + for (int b = 0; b < 16; b++) + CPU.GPR[rt]._u8[b] = CPU.GPR[ra]._u8[b] == CPU.GPR[rb]._u8[b] ? 
0xff : 0; + } + void FI(u32 rt, u32 ra, u32 rb) + { + UNIMPLEMENTED(); + //Floating Interpolation: ra will be ignored. + //It should work correctly if result of preceding FREST or FRSQEST is sufficiently exact + CPU.GPR[rt] = CPU.GPR[rb]; + } + void HEQ(u32 rt, u32 ra, u32 rb) + { + UNIMPLEMENTED(); + if(CPU.GPR[ra]._i32[3] == CPU.GPR[rb]._i32[3]) CPU.Stop(); + } + + //0 - 9 + void CFLTS(u32 rt, u32 ra, s32 i8) + { + UNIMPLEMENTED(); + const u32 scale = 173 - (i8 & 0xff); //unsigned immediate + for (int i = 0; i < 4; i++) + { + u32 exp = ((CPU.GPR[ra]._u32[i] >> 23) & 0xff) + scale; + + if (exp > 255) + exp = 255; + + CPU.GPR[rt]._u32[i] = (CPU.GPR[ra]._u32[i] & 0x807fffff) | (exp << 23); + + CPU.GPR[rt]._u32[i] = (u32)CPU.GPR[rt]._f[i]; //trunc + } + //CPU.GPR[rt]._m128i = _mm_cvttps_epi32(CPU.GPR[rt]._m128); + } + void CFLTU(u32 rt, u32 ra, s32 i8) + { + UNIMPLEMENTED(); + const u32 scale = 173 - (i8 & 0xff); //unsigned immediate + for (int i = 0; i < 4; i++) + { + u32 exp = ((CPU.GPR[ra]._u32[i] >> 23) & 0xff) + scale; + + if (exp > 255) + exp = 255; + + if (CPU.GPR[ra]._u32[i] & 0x80000000) //if negative, result = 0 + CPU.GPR[rt]._u32[i] = 0; + else + { + CPU.GPR[rt]._u32[i] = (CPU.GPR[ra]._u32[i] & 0x807fffff) | (exp << 23); + + if (CPU.GPR[rt]._f[i] > 0xffffffff) //if big, result = max + CPU.GPR[rt]._u32[i] = 0xffffffff; + else + CPU.GPR[rt]._u32[i] = floor(CPU.GPR[rt]._f[i]); + } + } + } + void CSFLT(u32 rt, u32 ra, s32 i8) + { + UNIMPLEMENTED(); + //CPU.GPR[rt]._m128 = _mm_cvtepi32_ps(CPU.GPR[ra]._m128i); + const u32 scale = 155 - (i8 & 0xff); //unsigned immediate + for (int i = 0; i < 4; i++) + { + CPU.GPR[rt]._f[i] = (s32)CPU.GPR[ra]._i32[i]; + + u32 exp = ((CPU.GPR[rt]._u32[i] >> 23) & 0xff) - scale; + + if (exp > 255) //< 0 + exp = 0; + + CPU.GPR[rt]._u32[i] = (CPU.GPR[rt]._u32[i] & 0x807fffff) | (exp << 23); + } + } + void CUFLT(u32 rt, u32 ra, s32 i8) + { + UNIMPLEMENTED(); + const u32 scale = 155 - (i8 & 0xff); //unsigned immediate + for (int i = 0; 
i < 4; i++) + { + CPU.GPR[rt]._f[i] = (float)CPU.GPR[ra]._u32[i]; + u32 exp = ((CPU.GPR[rt]._u32[i] >> 23) & 0xff) - scale; + + if (exp > 255) //< 0 + exp = 0; + + CPU.GPR[rt]._u32[i] = (CPU.GPR[rt]._u32[i] & 0x807fffff) | (exp << 23); + } + } + + //0 - 8 + void BRZ(u32 rt, s32 i16) + { + UNIMPLEMENTED(); + if (CPU.GPR[rt]._u32[3] == 0) + CPU.SetBranch(branchTarget(CPU.PC, i16)); + } + void STQA(u32 rt, s32 i16) + { + UNIMPLEMENTED(); + u32 lsa = (i16 << 2) & 0x3fff0; + if(!CPU.IsGoodLSA(lsa)) + { + ConLog.Error("STQA: bad lsa (0x%x)", lsa); + Emu.Pause(); + return; + } + + CPU.WriteLS128(lsa, CPU.GPR[rt]._u128); + } + void BRNZ(u32 rt, s32 i16) + { + UNIMPLEMENTED(); + if (CPU.GPR[rt]._u32[3] != 0) + CPU.SetBranch(branchTarget(CPU.PC, i16)); + } + void BRHZ(u32 rt, s32 i16) + { + UNIMPLEMENTED(); + if (CPU.GPR[rt]._u16[6] == 0) + CPU.SetBranch(branchTarget(CPU.PC, i16)); + } + void BRHNZ(u32 rt, s32 i16) + { + UNIMPLEMENTED(); + if (CPU.GPR[rt]._u16[6] != 0) + CPU.SetBranch(branchTarget(CPU.PC, i16)); + } + void STQR(u32 rt, s32 i16) + { + UNIMPLEMENTED(); + u32 lsa = branchTarget(CPU.PC, i16) & 0x3fff0; + if(!CPU.IsGoodLSA(lsa)) + { + ConLog.Error("STQR: bad lsa (0x%x)", lsa); + Emu.Pause(); + return; + } + + CPU.WriteLS128(lsa, CPU.GPR[rt]._u128); + } + void BRA(s32 i16) + { + UNIMPLEMENTED(); + CPU.SetBranch(branchTarget(0, i16)); + } + void LQA(u32 rt, s32 i16) + { + UNIMPLEMENTED(); + u32 lsa = (i16 << 2) & 0x3fff0; + if(!CPU.IsGoodLSA(lsa)) + { + ConLog.Error("LQA: bad lsa (0x%x)", lsa); + Emu.Pause(); + return; + } + + CPU.GPR[rt]._u128 = CPU.ReadLS128(lsa); + } + void BRASL(u32 rt, s32 i16) + { + UNIMPLEMENTED(); + CPU.GPR[rt].Reset(); + CPU.GPR[rt]._u32[3] = CPU.PC + 4; + CPU.SetBranch(branchTarget(0, i16)); + } + void BR(s32 i16) + { + UNIMPLEMENTED(); + CPU.SetBranch(branchTarget(CPU.PC, i16)); + } + void FSMBI(u32 rt, s32 i16) + { + XmmVar v0(c); + c.movaps(v0, imm_xmm(fsmbi_mask[i16 & 0xffff])); + c.movaps(cpu_xmm(GPR[rt]), v0); + } + void BRSL(u32 
rt, s32 i16) + { + UNIMPLEMENTED(); + CPU.GPR[rt].Reset(); + CPU.GPR[rt]._u32[3] = CPU.PC + 4; + CPU.SetBranch(branchTarget(CPU.PC, i16)); + } + void LQR(u32 rt, s32 i16) + { + u32 lsa = branchTarget(CPU.PC, i16) & 0x3fff0; + + GpVar v0(c, kVarTypeUInt64); + GpVar v1(c, kVarTypeUInt64); + c.mov(v0, qword_ptr(*ls_var, lsa)); + c.mov(v1, qword_ptr(*ls_var, lsa + 8)); + c.bswap(v0); + c.bswap(v1); + c.mov(cpu_qword(GPR[rt]._u64[0]), v1); + c.mov(cpu_qword(GPR[rt]._u64[1]), v0); + } + void IL(u32 rt, s32 i16) + { + UNIMPLEMENTED(); + CPU.GPR[rt]._i32[0] = + CPU.GPR[rt]._i32[1] = + CPU.GPR[rt]._i32[2] = + CPU.GPR[rt]._i32[3] = i16; + } + void ILHU(u32 rt, s32 i16) + { + UNIMPLEMENTED(); + for (int w = 0; w < 4; w++) + CPU.GPR[rt]._i32[w] = i16 << 16; + } + void ILH(u32 rt, s32 i16) + { + UNIMPLEMENTED(); + for (int h = 0; h < 8; h++) + CPU.GPR[rt]._i16[h] = i16; + } + void IOHL(u32 rt, s32 i16) + { + UNIMPLEMENTED(); + for (int w = 0; w < 4; w++) + CPU.GPR[rt]._i32[w] |= (i16 & 0xFFFF); + } + + + //0 - 7 + void ORI(u32 rt, u32 ra, s32 i10) + { + XmmVar v0(c); + if (i10 == 0) + { + // zero + c.xorps(v0, v0); + c.movaps(cpu_xmm(GPR[rt]), v0); + } + else if (i10 == -1) + { + if (rt == ra) + { + // nop + } + else + { + // mov + c.movaps(v0, cpu_xmm(GPR[ra])); + c.movaps(cpu_xmm(GPR[rt]), v0); + } + } + else + { + c.movaps(v0, cpu_xmm(GPR[ra])); + c.orps(v0, imm_xmm(s19_to_s32[i10 & 0x7ffff])); + c.movaps(cpu_xmm(GPR[rt]), v0); + } + } + void ORHI(u32 rt, u32 ra, s32 i10) + { + UNIMPLEMENTED(); + for (int h = 0; h < 8; h++) + CPU.GPR[rt]._i16[h] = CPU.GPR[ra]._i16[h] | i10; + } + void ORBI(u32 rt, u32 ra, s32 i10) + { + UNIMPLEMENTED(); + for (int b = 0; b < 16; b++) + CPU.GPR[rt]._i8[b] = CPU.GPR[ra]._i8[b] | i10; + } + void SFI(u32 rt, u32 ra, s32 i10) + { + UNIMPLEMENTED(); + for (int w = 0; w < 4; w++) + CPU.GPR[rt]._i32[w] = i10 - CPU.GPR[ra]._i32[w]; + } + void SFHI(u32 rt, u32 ra, s32 i10) + { + UNIMPLEMENTED(); + for (int h = 0; h < 8; h++) + CPU.GPR[rt]._i16[h] = 
i10 - CPU.GPR[ra]._i16[h]; + } + void ANDI(u32 rt, u32 ra, s32 i10) + { + UNIMPLEMENTED(); + for (int w = 0; w < 4; w++) + CPU.GPR[rt]._i32[w] = CPU.GPR[ra]._i32[w] & i10; + } + void ANDHI(u32 rt, u32 ra, s32 i10) + { + UNIMPLEMENTED(); + for (int h = 0; h < 8; h++) + CPU.GPR[rt]._i16[h] = CPU.GPR[ra]._i16[h] & i10; + } + void ANDBI(u32 rt, u32 ra, s32 i10) + { + UNIMPLEMENTED(); + for (int b = 0; b < 16; b++) + CPU.GPR[rt]._i8[b] = CPU.GPR[ra]._i8[b] & i10; + } + void AI(u32 rt, u32 ra, s32 i10) + { + XmmVar v0(c); + if (i10 == 0) + { + if (rt == ra) + { + // nop + } + else + { + // mov + c.movaps(v0, cpu_xmm(GPR[ra])); + c.movaps(cpu_xmm(GPR[rt]), v0); + } + } + else + { + // add + c.movdqa(v0, cpu_xmm(GPR[ra])); + c.paddd(v0, imm_xmm(s19_to_s32[i10 & 0x7ffff])); + c.movdqa(cpu_xmm(GPR[rt]), v0); + } + } + void AHI(u32 rt, u32 ra, s32 i10) + { + UNIMPLEMENTED(); + for(u32 h = 0; h < 8; ++h) + CPU.GPR[rt]._i16[h] = CPU.GPR[ra]._i16[h] + i10; + } + void STQD(u32 rt, s32 i10, u32 ra) //i10 is shifted left by 4 while decoding + { + GpVar lsa(c, kVarTypeUInt32); + GpVar v0(c, kVarTypeUInt64); + GpVar v1(c, kVarTypeUInt64); + + c.mov(lsa, cpu_dword(GPR[ra]._u32[3])); + if (i10) c.add(lsa, i10); + c.and_(lsa, 0x3fff0); + c.mov(v0, cpu_qword(GPR[rt]._u64[0])); + c.mov(v1, cpu_qword(GPR[rt]._u64[1])); + c.bswap(v0); + c.bswap(v1); + c.mov(qword_ptr(*ls_var, lsa, 0, 0), v1); + c.mov(qword_ptr(*ls_var, lsa, 0, 8), v0); + } + void LQD(u32 rt, s32 i10, u32 ra) //i10 is shifted left by 4 while decoding + { + UNIMPLEMENTED(); + const u32 lsa = (CPU.GPR[ra]._i32[3] + i10) & 0x3fff0; + if(!CPU.IsGoodLSA(lsa)) + { + ConLog.Error("LQD: bad lsa (0x%x)", lsa); + Emu.Pause(); + return; + } + + CPU.GPR[rt]._u128 = CPU.ReadLS128(lsa); + } + void XORI(u32 rt, u32 ra, s32 i10) + { + UNIMPLEMENTED(); + for (int w = 0; w < 4; w++) + CPU.GPR[rt]._i32[w] = CPU.GPR[ra]._i32[w] ^ i10; + } + void XORHI(u32 rt, u32 ra, s32 i10) + { + UNIMPLEMENTED(); + for (int h = 0; h < 8; h++) + 
CPU.GPR[rt]._i16[h] = CPU.GPR[ra]._i16[h] ^ i10; + } + void XORBI(u32 rt, u32 ra, s32 i10) + { + UNIMPLEMENTED(); + for (int b = 0; b < 16; b++) + CPU.GPR[rt]._i8[b] = CPU.GPR[ra]._i8[b] ^ i10; + } + void CGTI(u32 rt, u32 ra, s32 i10) + { + UNIMPLEMENTED(); + for (int w = 0; w < 4; w++) + CPU.GPR[rt]._u32[w] = CPU.GPR[ra]._i32[w] > i10 ? 0xffffffff : 0; + } + void CGTHI(u32 rt, u32 ra, s32 i10) + { + UNIMPLEMENTED(); + for (int h = 0; h < 8; h++) + CPU.GPR[rt]._u16[h] = CPU.GPR[ra]._i16[h] > i10 ? 0xffff : 0; + } + void CGTBI(u32 rt, u32 ra, s32 i10) + { + UNIMPLEMENTED(); + for (int b = 0; b < 16; b++) + CPU.GPR[rt]._u8[b] = CPU.GPR[ra]._i8[b] > (s8)(i10 & 0xff) ? 0xff : 0; + } + void HGTI(u32 rt, u32 ra, s32 i10) + { + UNIMPLEMENTED(); + if(CPU.GPR[ra]._i32[3] > i10) CPU.Stop(); + } + void CLGTI(u32 rt, u32 ra, s32 i10) + { + UNIMPLEMENTED(); + for(u32 i = 0; i < 4; ++i) + { + CPU.GPR[rt]._u32[i] = (CPU.GPR[ra]._u32[i] > (u32)i10) ? 0xffffffff : 0x00000000; + } + } + void CLGTHI(u32 rt, u32 ra, s32 i10) + { + UNIMPLEMENTED(); + for(u32 i = 0; i < 8; ++i) + { + CPU.GPR[rt]._u16[i] = (CPU.GPR[ra]._u16[i] > (u16)i10) ? 0xffff : 0x0000; + } + } + void CLGTBI(u32 rt, u32 ra, s32 i10) + { + UNIMPLEMENTED(); + for (int b = 0; b < 16; b++) + CPU.GPR[rt]._u8[b] = CPU.GPR[ra]._u8[b] > (u8)(i10 & 0xff) ? 0xff : 0; + } + void HLGTI(u32 rt, u32 ra, s32 i10) + { + UNIMPLEMENTED(); + if(CPU.GPR[ra]._u32[3] > (u32)i10) CPU.Stop(); + } + void MPYI(u32 rt, u32 ra, s32 i10) + { + UNIMPLEMENTED(); + for (int w = 0; w < 4; w++) + CPU.GPR[rt]._i32[w] = CPU.GPR[ra]._i16[w*2] * i10; + } + void MPYUI(u32 rt, u32 ra, s32 i10) + { + UNIMPLEMENTED(); + for (int w = 0; w < 4; w++) + CPU.GPR[rt]._u32[w] = CPU.GPR[ra]._u16[w*2] * (u16)(i10 & 0xffff); + } + void CEQI(u32 rt, u32 ra, s32 i10) + { + UNIMPLEMENTED(); + for(u32 i = 0; i < 4; ++i) + CPU.GPR[rt]._u32[i] = (CPU.GPR[ra]._i32[i] == i10) ? 
0xffffffff : 0x00000000; + } + void CEQHI(u32 rt, u32 ra, s32 i10) + { + UNIMPLEMENTED(); + for (int h = 0; h < 8; h++) + CPU.GPR[rt]._u16[h] = (CPU.GPR[ra]._i16[h] == (s16)i10) ? 0xffff : 0; + } + void CEQBI(u32 rt, u32 ra, s32 i10) + { + UNIMPLEMENTED(); + for (int b = 0; b < 16; b++) + CPU.GPR[rt]._i8[b] = (CPU.GPR[ra]._i8[b] == (s8)(i10 & 0xff)) ? 0xff : 0; + } + void HEQI(u32 rt, u32 ra, s32 i10) + { + UNIMPLEMENTED(); + if(CPU.GPR[ra]._i32[3] == i10) CPU.Stop(); + } + + + //0 - 6 + void HBRA(s32 ro, s32 i16) + { //i16 is shifted left by 2 while decoding + //UNIMPLEMENTED(); + } + void HBRR(s32 ro, s32 i16) + { + //UNIMPLEMENTED(); + } + void ILA(u32 rt, u32 i18) + { + XmmVar v0(c); + c.movaps(v0, imm_xmm(s19_to_s32[i18 & 0x3ffff])); + c.movaps(cpu_xmm(GPR[rt]), v0); + } + + //0 - 3 + void SELB(u32 rt, u32 ra, u32 rb, u32 rc) + { + UNIMPLEMENTED(); + for(u64 i = 0; i < 2; ++i) + { + CPU.GPR[rt]._u64[i] = + ( CPU.GPR[rc]._u64[i] & CPU.GPR[rb]._u64[i]) | + (~CPU.GPR[rc]._u64[i] & CPU.GPR[ra]._u64[i]); + } + } + void SHUFB(u32 rt, u32 ra, u32 rb, u32 rc) + { + UNIMPLEMENTED(); + const SPU_GPR_hdr _a = CPU.GPR[ra]; + const SPU_GPR_hdr _b = CPU.GPR[rb]; + for (int i = 0; i < 16; i++) + { + u8 b = CPU.GPR[rc]._u8[i]; + if(b & 0x80) + { + if(b & 0x40) + { + if(b & 0x20) + CPU.GPR[rt]._u8[i] = 0x80; + else + CPU.GPR[rt]._u8[i] = 0xFF; + } + else + CPU.GPR[rt]._u8[i] = 0x00; + } + else + { + if(b & 0x10) + CPU.GPR[rt]._u8[i] = _b._u8[15 - (b & 0x0F)]; + else + CPU.GPR[rt]._u8[i] = _a._u8[15 - (b & 0x0F)]; + } + } + } + void MPYA(u32 rt, u32 ra, u32 rb, u32 rc) + { + UNIMPLEMENTED(); + for (int w = 0; w < 4; w++) + CPU.GPR[rt]._i32[w] = CPU.GPR[ra]._i16[w*2] * CPU.GPR[rb]._i16[w*2] + CPU.GPR[rc]._i32[w]; + } + void FNMS(u32 rt, u32 ra, u32 rb, u32 rc) + { + UNIMPLEMENTED(); + CPU.GPR[rt]._f[0] = CPU.GPR[rc]._f[0] - CPU.GPR[ra]._f[0] * CPU.GPR[rb]._f[0]; + CPU.GPR[rt]._f[1] = CPU.GPR[rc]._f[1] - CPU.GPR[ra]._f[1] * CPU.GPR[rb]._f[1]; + CPU.GPR[rt]._f[2] = 
CPU.GPR[rc]._f[2] - CPU.GPR[ra]._f[2] * CPU.GPR[rb]._f[2]; + CPU.GPR[rt]._f[3] = CPU.GPR[rc]._f[3] - CPU.GPR[ra]._f[3] * CPU.GPR[rb]._f[3]; + } + void FMA(u32 rt, u32 ra, u32 rb, u32 rc) + { + UNIMPLEMENTED(); + CPU.GPR[rt]._f[0] = CPU.GPR[ra]._f[0] * CPU.GPR[rb]._f[0] + CPU.GPR[rc]._f[0]; + CPU.GPR[rt]._f[1] = CPU.GPR[ra]._f[1] * CPU.GPR[rb]._f[1] + CPU.GPR[rc]._f[1]; + CPU.GPR[rt]._f[2] = CPU.GPR[ra]._f[2] * CPU.GPR[rb]._f[2] + CPU.GPR[rc]._f[2]; + CPU.GPR[rt]._f[3] = CPU.GPR[ra]._f[3] * CPU.GPR[rb]._f[3] + CPU.GPR[rc]._f[3]; + } + void FMS(u32 rt, u32 ra, u32 rb, u32 rc) + { + UNIMPLEMENTED(); + CPU.GPR[rt]._f[0] = CPU.GPR[ra]._f[0] * CPU.GPR[rb]._f[0] - CPU.GPR[rc]._f[0]; + CPU.GPR[rt]._f[1] = CPU.GPR[ra]._f[1] * CPU.GPR[rb]._f[1] - CPU.GPR[rc]._f[1]; + CPU.GPR[rt]._f[2] = CPU.GPR[ra]._f[2] * CPU.GPR[rb]._f[2] - CPU.GPR[rc]._f[2]; + CPU.GPR[rt]._f[3] = CPU.GPR[ra]._f[3] * CPU.GPR[rb]._f[3] - CPU.GPR[rc]._f[3]; + } + + void UNK(u32 code, u32 opcode, u32 gcode) + { + UNK(fmt::Format("(SPURecompiler) Unimplemented opcode! 
(0x%08x, 0x%x, 0x%x)", code, opcode, gcode)); + } + + void UNK(const std::string& err) + { + ConLog.Error(err + fmt::Format(" #pc: 0x%x", CPU.PC)); + do_finalize = true; + Emu.Pause(); + } +}; \ No newline at end of file diff --git a/rpcs3/Emu/Cell/SPURecompilerCore.cpp b/rpcs3/Emu/Cell/SPURecompilerCore.cpp new file mode 100644 index 0000000000..bf3662c399 --- /dev/null +++ b/rpcs3/Emu/Cell/SPURecompilerCore.cpp @@ -0,0 +1,132 @@ +#include "stdafx.h" +#include "SPUInstrTable.h" +#include "SPUInterpreter.h" +#include "SPURecompiler.h" + +static const SPUImmTable g_spu_imm; + +SPURecompilerCore::SPURecompilerCore(SPUThread& cpu) +: m_enc(new SPURecompiler(cpu, *this)) +, m_inter(new SPUInterpreter(cpu)) +, CPU(cpu) +, compiler(&runtime) +{ + memset(entry, 0, sizeof(entry)); +} + +SPURecompilerCore::~SPURecompilerCore() +{ + delete m_enc; + delete m_inter; +} + +void SPURecompilerCore::Decode(const u32 code) // decode instruction and run with interpreter +{ + (*SPU_instr::rrr_list)(m_inter, code); +} + +void SPURecompilerCore::Compile(u16 pos) +{ + compiler.addFunc(kFuncConvHost, FuncBuilder4()); + entry[pos].host = pos; + + GpVar cpu_var(compiler, kVarTypeIntPtr, "cpu"); + compiler.setArg(0, cpu_var); + compiler.alloc(cpu_var); + m_enc->cpu_var = &cpu_var; + + GpVar ls_var(compiler, kVarTypeIntPtr, "ls"); + compiler.setArg(1, ls_var); + compiler.alloc(ls_var); + m_enc->ls_var = &ls_var; + + GpVar imm_var(compiler, kVarTypeIntPtr, "imm"); + compiler.setArg(2, imm_var); + compiler.alloc(imm_var); + m_enc->imm_var = &imm_var; + + GpVar pos_var(compiler, kVarTypeUInt16, "pos"); + compiler.setArg(3, pos_var); + compiler.alloc(pos_var); + + while (true) + { + const u32 opcode = Memory.Read32(CPU.dmac.ls_offset + pos * 4); + m_enc->do_finalize = false; + (*SPU_instr::rrr_list)(m_enc, opcode); // compile single opcode + bool fin = m_enc->do_finalize; + entry[pos].valid = opcode; + + if (fin) break; + CPU.PC += 4; + pos++; + entry[pos].host = entry[pos - 1].host; + } + + 
compiler.xor_(pos_var, pos_var); + compiler.ret(pos_var); + compiler.endFunc(); + entry[entry[pos].host].pointer = compiler.make(); +} + +u8 SPURecompilerCore::DecodeMemory(const u64 address) +{ + const u64 m_offset = address - CPU.PC; + const u16 pos = (CPU.PC >> 2); + + u32* ls = (u32*)Memory.VirtualToRealAddr(m_offset); + + if (!pos) + { + ConLog.Error("SPURecompilerCore::DecodeMemory(): ls_addr = 0"); + Emu.Pause(); + return 0; + } + + if (entry[pos].pointer) + { + // check data (hard way) + bool is_valid = true; + for (u32 i = pos; i < entry[pos].count + pos; i++) + { + if (entry[i].valid != ls[i]) + { + is_valid = false; + break; + } + } + // invalidate if necessary + if (!is_valid) + { + // TODO + } + } + + if (!entry[pos].pointer) + { + // compile from current position to nearest dynamic or statically unresolved branch, zero data or something other + Compile(pos); + } + + if (!entry[pos].pointer) + { + ConLog.Error("SPURecompilerCore::DecodeMemory(ls_addr=0x%x): compilation failed", pos * sizeof(u32)); + Emu.Pause(); + return 0; + } + // jump + typedef u16(*Func)(void* _cpu, void* _ls, const SPUImmTable* _imm, u16 _pos); + + Func func = asmjit_cast(entry[entry[pos].host].pointer); + + void* cpu = (u8*)&CPU.GPR[0] - offsetof(SPUThread, GPR[0]); // ugly cpu base offset detection + + u16 res = pos == entry[pos].host ? 
0 : pos; + res = func(cpu, ls, &g_spu_imm, res); + + ConLog.Write("func -> %d", res); + + return 0; + /*Decode(Memory.Read32(address)); + return 4;*/ +} \ No newline at end of file diff --git a/rpcs3/rpcs3.vcxproj b/rpcs3/rpcs3.vcxproj index 5aa56519bc..dc4dfcfc0b 100644 --- a/rpcs3/rpcs3.vcxproj +++ b/rpcs3/rpcs3.vcxproj @@ -69,20 +69,20 @@ - .\;..\wxWidgets\include;..\SDL-1.3.0-5538\include;..\SDL_image-1.2.10;..\pthreads-2.8.0;..\;..\ffmpeg\WindowsInclude;..\ffmpeg\Windows\x86\Include;.\OpenAL\include;$(IncludePath) + .\;..\wxWidgets\include;..\SDL-1.3.0-5538\include;..\SDL_image-1.2.10;..\pthreads-2.8.0;..\;..\ffmpeg\WindowsInclude;..\ffmpeg\Windows\x86\Include;.\OpenAL\include;$(IncludePath);..\asmjit\src\asmjit $(SolutionDir)bin\ ..\libs\$(Configuration)\;$(LibraryPath) $(ProjectName)-$(PlatformShortName)-dbg - .\;..\wxWidgets\include;..\SDL-1.3.0-5538\include;..\SDL_image-1.2.10;..\pthreads-2.8.0;..\;..\ffmpeg\WindowsInclude;..\ffmpeg\Windows\x86_64\Include;.\OpenAL\include;$(IncludePath) + .\;..\wxWidgets\include;..\SDL-1.3.0-5538\include;..\SDL_image-1.2.10;..\pthreads-2.8.0;..\;..\ffmpeg\WindowsInclude;..\ffmpeg\Windows\x86_64\Include;.\OpenAL\include;$(IncludePath);..\asmjit\src\asmjit $(SolutionDir)bin\ ..\libs\$(Configuration)\;$(LibraryPath) $(ProjectName)-$(PlatformShortName)-dbg false - .\;..\wxWidgets\include;..\SDL-1.3.0-5538\include;..\SDL_image-1.2.10;..\pthreads-2.8.0;..\;..\ffmpeg\WindowsInclude;..\ffmpeg\Windows\x86\Include;.\OpenAL\include;$(IncludePath) + .\;..\wxWidgets\include;..\SDL-1.3.0-5538\include;..\SDL_image-1.2.10;..\pthreads-2.8.0;..\;..\ffmpeg\WindowsInclude;..\ffmpeg\Windows\x86\Include;.\OpenAL\include;$(IncludePath);..\asmjit\src\asmjit $(SolutionDir)bin\ ..\libs\$(Configuration)\;$(LibraryPath) false @@ -91,7 +91,7 @@ false - .\;..\wxWidgets\include;..\SDL-1.3.0-5538\include;..\SDL_image-1.2.10;..\pthreads-2.8.0;..\;..\ffmpeg\WindowsInclude;..\ffmpeg\Windows\x86_64\Include;.\OpenAL\include;$(IncludePath) + 
.\;..\wxWidgets\include;..\SDL-1.3.0-5538\include;..\SDL_image-1.2.10;..\pthreads-2.8.0;..\;..\ffmpeg\WindowsInclude;..\ffmpeg\Windows\x86_64\Include;.\OpenAL\include;$(IncludePath);..\asmjit\src\asmjit $(SolutionDir)bin\ ..\libs\$(Configuration)\;$(LibraryPath) false @@ -109,7 +109,7 @@ true - wxmsw31ud_adv.lib;wxbase31ud.lib;wxmsw31ud_core.lib;wxmsw31ud_aui.lib;wxtiffd.lib;wxjpegd.lib;wxpngd.lib;wxzlibd.lib;odbc32.lib;odbccp32.lib;comctl32.lib;ws2_32.lib;shlwapi.lib;winmm.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;rpcrt4.lib;avcodec.lib;avformat.lib;avutil.lib;swresample.lib;swscale.lib;OpenAL32.lib;EFX-Util.lib;%(AdditionalDependencies) + wxmsw31ud_adv.lib;wxbase31ud.lib;wxmsw31ud_core.lib;wxmsw31ud_aui.lib;wxtiffd.lib;wxjpegd.lib;wxpngd.lib;wxzlibd.lib;odbc32.lib;odbccp32.lib;comctl32.lib;ws2_32.lib;shlwapi.lib;winmm.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;rpcrt4.lib;avcodec.lib;avformat.lib;avutil.lib;swresample.lib;swscale.lib;OpenAL32.lib;EFX-Util.lib;asmjit.lib;%(AdditionalDependencies) %(IgnoreSpecificDefaultLibraries) false ..\wxWidgets\lib\vc_lib;..\ffmpeg\Windows\x86\lib;..\OpenAL\Win32 @@ -129,7 +129,7 @@ true - wxmsw31ud_adv.lib;wxbase31ud.lib;wxmsw31ud_core.lib;wxmsw31ud_aui.lib;wxtiffd.lib;wxjpegd.lib;wxpngd.lib;wxzlibd.lib;odbc32.lib;odbccp32.lib;comctl32.lib;ws2_32.lib;shlwapi.lib;winmm.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;rpcrt4.lib;avcodec.lib;avformat.lib;avutil.lib;swresample.lib;swscale.lib;OpenAL32.lib;EFX-Util.lib;%(AdditionalDependencies) + 
wxmsw31ud_adv.lib;wxbase31ud.lib;wxmsw31ud_core.lib;wxmsw31ud_aui.lib;wxtiffd.lib;wxjpegd.lib;wxpngd.lib;wxzlibd.lib;odbc32.lib;odbccp32.lib;comctl32.lib;ws2_32.lib;shlwapi.lib;winmm.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;rpcrt4.lib;avcodec.lib;avformat.lib;avutil.lib;swresample.lib;swscale.lib;OpenAL32.lib;EFX-Util.lib;asmjit.lib;%(AdditionalDependencies) %(IgnoreSpecificDefaultLibraries) false ..\wxWidgets\lib\vc_x64_lib;..\ffmpeg\Windows\x86_64\lib;..\OpenAL\Win64 @@ -161,7 +161,7 @@ true true true - wxmsw31u_adv.lib;wxbase31u.lib;wxmsw31u_core.lib;wxmsw31u_aui.lib;odbc32.lib;odbccp32.lib;comctl32.lib;ws2_32.lib;shlwapi.lib;winmm.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;rpcrt4.lib;wxtiff.lib;wxjpeg.lib;wxpng.lib;wxzlib.lib;wxregexu.lib;wxexpat.lib;wsock32.lib;wininet.lib;avcodec.lib;avformat.lib;avutil.lib;swresample.lib;swscale.lib;OpenAL32.lib;EFX-Util.lib + wxmsw31u_adv.lib;wxbase31u.lib;wxmsw31u_core.lib;wxmsw31u_aui.lib;odbc32.lib;odbccp32.lib;comctl32.lib;ws2_32.lib;shlwapi.lib;winmm.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;rpcrt4.lib;wxtiff.lib;wxjpeg.lib;wxpng.lib;wxzlib.lib;wxregexu.lib;wxexpat.lib;wsock32.lib;wininet.lib;avcodec.lib;avformat.lib;avutil.lib;swresample.lib;swscale.lib;OpenAL32.lib;EFX-Util.lib;asmjit.lib %(IgnoreSpecificDefaultLibraries) @@ -193,7 +193,7 @@ true true true - 
wxmsw31u_adv.lib;wxbase31u.lib;wxmsw31u_core.lib;wxmsw31u_aui.lib;odbc32.lib;odbccp32.lib;comctl32.lib;ws2_32.lib;shlwapi.lib;winmm.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;rpcrt4.lib;wxtiff.lib;wxjpeg.lib;wxpng.lib;wxzlib.lib;wxregexu.lib;wxexpat.lib;wsock32.lib;wininet.lib;avcodec.lib;avformat.lib;avutil.lib;swresample.lib;swscale.lib;OpenAL32.lib;EFX-Util.lib;%(AdditionalDependencies) + wxmsw31u_adv.lib;wxbase31u.lib;wxmsw31u_core.lib;wxmsw31u_aui.lib;odbc32.lib;odbccp32.lib;comctl32.lib;ws2_32.lib;shlwapi.lib;winmm.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;rpcrt4.lib;wxtiff.lib;wxjpeg.lib;wxpng.lib;wxzlib.lib;wxregexu.lib;wxexpat.lib;wsock32.lib;wininet.lib;avcodec.lib;avformat.lib;avutil.lib;swresample.lib;swscale.lib;OpenAL32.lib;EFX-Util.lib;asmjit.lib;%(AdditionalDependencies) %(IgnoreSpecificDefaultLibraries) @@ -227,6 +227,7 @@ + diff --git a/rpcs3/rpcs3.vcxproj.filters b/rpcs3/rpcs3.vcxproj.filters index 1c1ed0a365..1e00a296fe 100644 --- a/rpcs3/rpcs3.vcxproj.filters +++ b/rpcs3/rpcs3.vcxproj.filters @@ -487,6 +487,9 @@ Utilities + + Emu\Cell + From e614a7313c7ad4b1fbefb48431835d5f3baf29ae Mon Sep 17 00:00:00 2001 From: Nekotekina Date: Sun, 6 Apr 2014 23:23:32 +0400 Subject: [PATCH 02/14] SPU JIT WIP --- asmjit.vcxproj | 10 +- rpcs3/Emu/Cell/PPCDecoder.h | 12 +- rpcs3/Emu/Cell/SPUInterpreter.h | 109 +-- rpcs3/Emu/Cell/SPURecompiler.h | 1158 ++++++++++++++++++++------ rpcs3/Emu/Cell/SPURecompilerCore.cpp | 33 +- rpcs3/Emu/Cell/SPUThread.cpp | 3 + rpcs3/Emu/Cell/SPUThread.h | 126 ++- rpcs3/rpcs3.vcxproj | 1 + rpcs3/rpcs3.vcxproj.filters | 3 + 9 files changed, 1056 insertions(+), 399 deletions(-) diff --git a/asmjit.vcxproj b/asmjit.vcxproj index 80ffab87f4..98dbfb40b4 100644 --- a/asmjit.vcxproj +++ b/asmjit.vcxproj @@ -92,12 +92,12 @@ - .\libs\$(Configuration)\ + .\libs\$(Configuration)_x86\ - 
.\libs\$(Configuration)\ + .\libs\$(Configuration)_x86\ @@ -115,7 +115,7 @@ Level3 Disabled - true + false ASMJIT_STATIC;_MBCS;%(PreprocessorDefinitions) @@ -139,7 +139,7 @@ MaxSpeed true true - true + false ASMJIT_STATIC;_UNICODE;UNICODE;%(PreprocessorDefinitions) @@ -154,7 +154,7 @@ MaxSpeed true true - true + false ASMJIT_STATIC;_UNICODE;UNICODE;%(PreprocessorDefinitions) diff --git a/rpcs3/Emu/Cell/PPCDecoder.h b/rpcs3/Emu/Cell/PPCDecoder.h index 3185a441a9..fa54d4112c 100644 --- a/rpcs3/Emu/Cell/PPCDecoder.h +++ b/rpcs3/Emu/Cell/PPCDecoder.h @@ -12,19 +12,19 @@ public: template -static InstrList<1 << CodeField::size, TO>* new_list(const CodeField& func, InstrCaller* error_func = nullptr) +static InstrList<(1 << (CodeField::size)), TO>* new_list(const CodeField& func, InstrCaller* error_func = nullptr) { - return new InstrList<1 << CodeField::size, TO>(func, error_func); + return new InstrList<(1 << (CodeField::size)), TO>(func, error_func); } template -static InstrList<1 << CodeField::size, TO>* new_list(InstrList* parent, int opcode, const CodeField& func, InstrCaller* error_func = nullptr) +static InstrList<(1 << (CodeField::size)), TO>* new_list(InstrList* parent, int opcode, const CodeField& func, InstrCaller* error_func = nullptr) { - return connect_list(parent, new InstrList<1 << CodeField::size, TO>(func, error_func), opcode); + return connect_list(parent, new InstrList<(1 << (CodeField::size)), TO>(func, error_func), opcode); } template -static InstrList<1 << CodeField::size, TO>* new_list(InstrList* parent, const CodeField& func, InstrCaller* error_func = nullptr) +static InstrList<(1 << (CodeField::size)), TO>* new_list(InstrList* parent, const CodeField& func, InstrCaller* error_func = nullptr) { - return connect_list(parent, new InstrList<1 << CodeField::size, TO>(func, error_func)); + return connect_list(parent, new InstrList<(1 << (CodeField::size)), TO>(func, error_func)); } \ No newline at end of file diff --git 
a/rpcs3/Emu/Cell/SPUInterpreter.h b/rpcs3/Emu/Cell/SPUInterpreter.h index dfca767c9c..fe55ed219e 100644 --- a/rpcs3/Emu/Cell/SPUInterpreter.h +++ b/rpcs3/Emu/Cell/SPUInterpreter.h @@ -32,121 +32,19 @@ private: //0 - 10 void STOP(u32 code) { - CPU.SetExitStatus(code); // exit code (not status) - - switch (code) - { - case 0x110: /* ===== sys_spu_thread_receive_event ===== */ - { - u32 spuq = 0; - if (!CPU.SPU.Out_MBox.Pop(spuq)) - { - ConLog.Error("sys_spu_thread_receive_event: cannot read Out_MBox"); - CPU.SPU.In_MBox.PushUncond(CELL_EINVAL); // ??? - return; - } - - if (CPU.SPU.In_MBox.GetCount()) - { - ConLog.Error("sys_spu_thread_receive_event(spuq=0x%x): In_MBox is not empty", spuq); - CPU.SPU.In_MBox.PushUncond(CELL_EBUSY); // ??? - return; - } - - if (Ini.HLELogging.GetValue()) - { - ConLog.Write("sys_spu_thread_receive_event(spuq=0x%x)", spuq); - } - - EventQueue* eq; - if (!CPU.SPUQs.GetEventQueue(FIX_SPUQ(spuq), eq)) - { - CPU.SPU.In_MBox.PushUncond(CELL_EINVAL); // TODO: check error value - return; - } - - u32 tid = GetCurrentSPUThread().GetId(); - - eq->sq.push(tid); // add thread to sleep queue - - while (true) - { - switch (eq->owner.trylock(tid)) - { - case SMR_OK: - if (!eq->events.count()) - { - eq->owner.unlock(tid); - break; - } - else - { - u32 next = (eq->protocol == SYS_SYNC_FIFO) ? 
eq->sq.pop() : eq->sq.pop_prio(); - if (next != tid) - { - eq->owner.unlock(tid, next); - break; - } - } - case SMR_SIGNAL: - { - sys_event_data event; - eq->events.pop(event); - eq->owner.unlock(tid); - CPU.SPU.In_MBox.PushUncond(CELL_OK); - CPU.SPU.In_MBox.PushUncond(event.data1); - CPU.SPU.In_MBox.PushUncond(event.data2); - CPU.SPU.In_MBox.PushUncond(event.data3); - return; - } - case SMR_FAILED: break; - default: eq->sq.invalidate(tid); CPU.SPU.In_MBox.PushUncond(CELL_ECANCELED); return; - } - - Sleep(1); - if (Emu.IsStopped()) - { - ConLog.Warning("sys_spu_thread_receive_event(spuq=0x%x) aborted", spuq); - eq->sq.invalidate(tid); - return; - } - } - } - break; - case 0x102: - if (!CPU.SPU.Out_MBox.GetCount()) - { - ConLog.Error("sys_spu_thread_exit (no status, code 0x102)"); - } - else if (Ini.HLELogging.GetValue()) - { - // the real exit status - ConLog.Write("sys_spu_thread_exit (status=0x%x)", CPU.SPU.Out_MBox.GetValue()); - } - CPU.Stop(); - break; - default: - if (!CPU.SPU.Out_MBox.GetCount()) - { - ConLog.Error("Unknown STOP code: 0x%x (no message)", code); - } - else - { - ConLog.Error("Unknown STOP code: 0x%x (message=0x%x)", code, CPU.SPU.Out_MBox.GetValue()); - } - CPU.Stop(); - break; - } + CPU.DoStop(code); } void LNOP() { } void SYNC(u32 Cbit) { + // This instruction must be used following a store instruction that modifies the instruction stream. _mm_mfence(); } void DSYNC() { + // This instruction forces all earlier load, store, and channel instructions to complete before proceeding. 
_mm_mfence(); } void MFSPR(u32 rt, u32 sa) @@ -389,6 +287,7 @@ private: } void STOPD(u32 rc, u32 ra, u32 rb) { + UNIMPLEMENTED(); Emu.Pause(); } void STQX(u32 rt, u32 ra, u32 rb) diff --git a/rpcs3/Emu/Cell/SPURecompiler.h b/rpcs3/Emu/Cell/SPURecompiler.h index f78cdf8339..478d5d3d16 100644 --- a/rpcs3/Emu/Cell/SPURecompiler.h +++ b/rpcs3/Emu/Cell/SPURecompiler.h @@ -16,8 +16,12 @@ using namespace asmjit::host; struct SPUImmTable { - __m128i s19_to_s32[1 << 18]; + __m128i s19_to_s32[1 << 19]; __m128i fsmbi_mask[1 << 16]; + __m128 scale_to_float[256]; + __m128 scale_to_int[256]; + __m128i min_int; + __m128i max_int; SPUImmTable() { @@ -38,6 +42,34 @@ struct SPUImmTable fsmbi_mask[i].m128i_i8[j] = ((i >> j) & 0x1) ? 0xff : 0; } } + // scale table for (u)int -> float conversion + for (s32 i = 0; i < sizeof(scale_to_float) / sizeof(__m128); i++) + { + const float v = pow(2, i - 155); + scale_to_float[i].m128_f32[0] = v; + scale_to_float[i].m128_f32[1] = v; + scale_to_float[i].m128_f32[2] = v; + scale_to_float[i].m128_f32[3] = v; + } + // scale table for float -> (u)int conversion + for (s32 i = 0; i < sizeof(scale_to_int) / sizeof(__m128); i++) + { + const float v = pow(2, 173 - i); + scale_to_int[i].m128_f32[0] = v; + scale_to_int[i].m128_f32[1] = v; + scale_to_int[i].m128_f32[2] = v; + scale_to_int[i].m128_f32[3] = v; + } + // sign bit + min_int.m128i_u32[0] = 0x80000000; + min_int.m128i_u32[1] = 0x80000000; + min_int.m128i_u32[2] = 0x80000000; + min_int.m128i_u32[3] = 0x80000000; + // + max_int.m128i_u32[0] = 0x7fffffff; + max_int.m128i_u32[1] = 0x7fffffff; + max_int.m128i_u32[2] = 0x7fffffff; + max_int.m128i_u32[3] = 0x7fffffff; } }; @@ -46,10 +78,10 @@ class SPURecompiler; class SPURecompilerCore : public CPUDecoder { SPURecompiler* m_enc; - SPUInterpreter* m_inter; SPUThread& CPU; public: + SPUInterpreter* inter; JitRuntime runtime; Compiler compiler; @@ -74,14 +106,29 @@ public: virtual u8 DecodeMemory(const u64 address); }; -#define cpu_xmm(x) 
oword_ptr(*cpu_var, offsetof(SPUThread, x)) -#define cpu_qword(x) qword_ptr(*cpu_var, offsetof(SPUThread, x)) -#define cpu_dword(x,...) dword_ptr(*cpu_var, __VA_ARGS__, offsetof(SPUThread, x)) -#define cpu_word(x) word_ptr(*cpu_var, offsetof(SPUThread, x)) -#define cpu_byte(x) byte_ptr(*cpu_var, offsetof(SPUThread, x)) +#define cpu_xmm(x) oword_ptr(*cpu_var, (sizeof((*(SPUThread*)nullptr).x) == 16) ? offsetof(SPUThread, x) : throw "sizeof("#x") != 16") +#define cpu_qword(x) qword_ptr(*cpu_var, (sizeof((*(SPUThread*)nullptr).x) == 8) ? offsetof(SPUThread, x) : throw "sizeof("#x") != 8") +#define cpu_dword(x) dword_ptr(*cpu_var, (sizeof((*(SPUThread*)nullptr).x) == 4) ? offsetof(SPUThread, x) : throw "sizeof("#x") != 4") +#define cpu_word(x) word_ptr(*cpu_var, (sizeof((*(SPUThread*)nullptr).x) == 2) ? offsetof(SPUThread, x) : throw "sizeof("#x") != 2") +#define cpu_byte(x) byte_ptr(*cpu_var, (sizeof((*(SPUThread*)nullptr).x) == 1) ? offsetof(SPUThread, x) : throw "sizeof("#x") != 1") #define imm_xmm(x) oword_ptr(*imm_var, offsetof(SPUImmTable, x)) +#define WRAPPER_BEGIN(a0, a1, a2, a3) struct opcode_wrapper \ +{ \ + static void opcode(u32 a0, u32 a1, u32 a2, u32 a3) \ +{ \ + SPUThread& CPU = *(SPUThread*)GetCurrentCPUThread(); + +#define WRAPPER_END(a0, a1, a2, a3) } \ +}; \ + X86X64CallNode* call = c.call(imm_ptr(&opcode_wrapper::opcode), kFuncConvHost, FuncBuilder4()); \ + call->setArg(0, imm_u(a0)); \ + call->setArg(1, imm_u(a1)); \ + call->setArg(2, imm_u(a2)); \ + call->setArg(3, imm_u(a3)); + + class SPURecompiler : public SPUOpcodes { private: @@ -94,6 +141,7 @@ public: GpVar* cpu_var; GpVar* ls_var; GpVar* imm_var; + GpVar* pos_var; SPURecompiler(SPUThread& cpu, SPURecompilerCore& rec) : CPU(cpu), rec(rec), c(rec.compiler) { @@ -103,19 +151,31 @@ private: //0 - 10 void STOP(u32 code) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(code, xx, yy, zz); + CPU.DoStop(code); + WRAPPER_END(code, 0, 0, 0); + c.mov(*pos_var, (CPU.PC >> 2) + 1); + do_finalize = true; + 
ConLog.Write("STOP(code=%d)", code); } void LNOP() { - UNIMPLEMENTED(); + /*c.mov(*pos_var, (CPU.PC >> 2) + 1); + do_finalize = true; + ConLog.Write("LNOP()");*/ } void SYNC(u32 Cbit) { - UNIMPLEMENTED(); + // This instruction must be used following a store instruction that modifies the instruction stream. + c.mfence(); + c.mov(*pos_var, (CPU.PC >> 2) + 1); + do_finalize = true; + ConLog.Write("SYNC()"); } void DSYNC() { - UNIMPLEMENTED(); + // This instruction forces all earlier load, store, and channel instructions to complete before proceeding. + c.mfence(); } void MFSPR(u32 rt, u32 sa) { @@ -134,233 +194,326 @@ private: } void RDCH(u32 rt, u32 ra) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, ra, yy, zz); CPU.ReadChannel(CPU.GPR[rt], ra); + WRAPPER_END(rt, ra, 0, 0); + // TODO } void RCHCNT(u32 rt, u32 ra) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, ra, yy, zz); CPU.GPR[rt].Reset(); CPU.GPR[rt]._u32[3] = CPU.GetChannelCount(ra); + WRAPPER_END(rt, ra, 0, 0); + // TODO } void SF(u32 rt, u32 ra, u32 rb) { - XmmVar v0(c); + WRAPPER_BEGIN(rt, ra, rb, zz); + CPU.GPR[rt]._u32[0] = CPU.GPR[rb]._u32[0] - CPU.GPR[ra]._u32[0]; + CPU.GPR[rt]._u32[1] = CPU.GPR[rb]._u32[1] - CPU.GPR[ra]._u32[1]; + CPU.GPR[rt]._u32[2] = CPU.GPR[rb]._u32[2] - CPU.GPR[ra]._u32[2]; + CPU.GPR[rt]._u32[3] = CPU.GPR[rb]._u32[3] - CPU.GPR[ra]._u32[3]; + WRAPPER_END(rt, ra, rb, 0); + /*XmmVar v0(c); if (ra == rb) { // zero c.xorps(v0, v0); c.movaps(cpu_xmm(GPR[rt]), v0); } + else { // sub from c.movdqa(v0, cpu_xmm(GPR[rb])); c.psubd(v0, cpu_xmm(GPR[ra])); c.movdqa(cpu_xmm(GPR[rt]), v0); - } + }*/ } void OR(u32 rt, u32 ra, u32 rb) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, ra, rb, zz); CPU.GPR[rt]._u32[0] = CPU.GPR[ra]._u32[0] | CPU.GPR[rb]._u32[0]; CPU.GPR[rt]._u32[1] = CPU.GPR[ra]._u32[1] | CPU.GPR[rb]._u32[1]; CPU.GPR[rt]._u32[2] = CPU.GPR[ra]._u32[2] | CPU.GPR[rb]._u32[2]; CPU.GPR[rt]._u32[3] = CPU.GPR[ra]._u32[3] | CPU.GPR[rb]._u32[3]; + WRAPPER_END(rt, ra, rb, 0); + // TODO } void BG(u32 rt, u32 ra, u32 
rb) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, ra, rb, zz); CPU.GPR[rt]._u32[0] = CPU.GPR[ra]._u32[0] > CPU.GPR[rb]._u32[0] ? 0 : 1; CPU.GPR[rt]._u32[1] = CPU.GPR[ra]._u32[1] > CPU.GPR[rb]._u32[1] ? 0 : 1; CPU.GPR[rt]._u32[2] = CPU.GPR[ra]._u32[2] > CPU.GPR[rb]._u32[2] ? 0 : 1; CPU.GPR[rt]._u32[3] = CPU.GPR[ra]._u32[3] > CPU.GPR[rb]._u32[3] ? 0 : 1; + WRAPPER_END(rt, ra, rb, 0); + /*XmmVar v0(c); + if (ra == rb) + { + // load {1,1,1,1} + c.movaps(v0, imm_xmm(s19_to_s32[1])); + c.movaps(cpu_xmm(GPR[rt]), v0); + } + else + { + // compare if-greater-then + c.movdqa(v0, cpu_xmm(GPR[rb])); + c.psubd(v0, cpu_xmm(GPR[ra])); + c.psrad(v0, 32); + c.paddd(v0, imm_xmm(s19_to_s32[1])); + c.movdqa(cpu_xmm(GPR[rt]), v0); + }*/ } void SFH(u32 rt, u32 ra, u32 rb) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, ra, rb, zz); for (int h = 0; h < 8; h++) CPU.GPR[rt]._u16[h] = CPU.GPR[rb]._u16[h] - CPU.GPR[ra]._u16[h]; + WRAPPER_END(rt, ra, rb, 0); } void NOR(u32 rt, u32 ra, u32 rb) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, ra, rb, zz); CPU.GPR[rt]._u32[0] = ~(CPU.GPR[ra]._u32[0] | CPU.GPR[rb]._u32[0]); CPU.GPR[rt]._u32[1] = ~(CPU.GPR[ra]._u32[1] | CPU.GPR[rb]._u32[1]); CPU.GPR[rt]._u32[2] = ~(CPU.GPR[ra]._u32[2] | CPU.GPR[rb]._u32[2]); CPU.GPR[rt]._u32[3] = ~(CPU.GPR[ra]._u32[3] | CPU.GPR[rb]._u32[3]); + WRAPPER_END(rt, ra, rb, 0); + // TODO } void ABSDB(u32 rt, u32 ra, u32 rb) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, ra, rb, zz); for (int b = 0; b < 16; b++) CPU.GPR[rt]._u8[b] = CPU.GPR[rb]._u8[b] > CPU.GPR[ra]._u8[b] ? 
CPU.GPR[rb]._u8[b] - CPU.GPR[ra]._u8[b] : CPU.GPR[ra]._u8[b] - CPU.GPR[rb]._u8[b]; + WRAPPER_END(rt, ra, rb, 0); } void ROT(u32 rt, u32 ra, u32 rb) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, ra, rb, zz); CPU.GPR[rt]._u32[0] = (CPU.GPR[ra]._u32[0] << (CPU.GPR[rb]._u32[0] & 0x1f)) | (CPU.GPR[ra]._u32[0] >> (32 - (CPU.GPR[rb]._u32[0] & 0x1f))); CPU.GPR[rt]._u32[1] = (CPU.GPR[ra]._u32[1] << (CPU.GPR[rb]._u32[1] & 0x1f)) | (CPU.GPR[ra]._u32[1] >> (32 - (CPU.GPR[rb]._u32[1] & 0x1f))); CPU.GPR[rt]._u32[2] = (CPU.GPR[ra]._u32[2] << (CPU.GPR[rb]._u32[2] & 0x1f)) | (CPU.GPR[ra]._u32[2] >> (32 - (CPU.GPR[rb]._u32[2] & 0x1f))); CPU.GPR[rt]._u32[3] = (CPU.GPR[ra]._u32[3] << (CPU.GPR[rb]._u32[3] & 0x1f)) | (CPU.GPR[ra]._u32[3] >> (32 - (CPU.GPR[rb]._u32[3] & 0x1f))); + WRAPPER_END(rt, ra, rb, 0); } void ROTM(u32 rt, u32 ra, u32 rb) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, ra, rb, zz); CPU.GPR[rt]._u32[0] = ((0 - CPU.GPR[rb]._u32[0]) % 64) < 32 ? CPU.GPR[ra]._u32[0] >> ((0 - CPU.GPR[rb]._u32[0]) % 64) : 0; CPU.GPR[rt]._u32[1] = ((0 - CPU.GPR[rb]._u32[1]) % 64) < 32 ? CPU.GPR[ra]._u32[1] >> ((0 - CPU.GPR[rb]._u32[1]) % 64) : 0; CPU.GPR[rt]._u32[2] = ((0 - CPU.GPR[rb]._u32[2]) % 64) < 32 ? CPU.GPR[ra]._u32[2] >> ((0 - CPU.GPR[rb]._u32[2]) % 64) : 0; CPU.GPR[rt]._u32[3] = ((0 - CPU.GPR[rb]._u32[3]) % 64) < 32 ? CPU.GPR[ra]._u32[3] >> ((0 - CPU.GPR[rb]._u32[3]) % 64) : 0; + WRAPPER_END(rt, ra, rb, 0); } void ROTMA(u32 rt, u32 ra, u32 rb) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, ra, rb, zz); CPU.GPR[rt]._i32[0] = ((0 - CPU.GPR[rb]._i32[0]) % 64) < 32 ? CPU.GPR[ra]._i32[0] >> ((0 - CPU.GPR[rb]._i32[0]) % 64) : CPU.GPR[ra]._i32[0] >> 31; CPU.GPR[rt]._i32[1] = ((0 - CPU.GPR[rb]._i32[1]) % 64) < 32 ? CPU.GPR[ra]._i32[1] >> ((0 - CPU.GPR[rb]._i32[1]) % 64) : CPU.GPR[ra]._i32[1] >> 31; CPU.GPR[rt]._i32[2] = ((0 - CPU.GPR[rb]._i32[2]) % 64) < 32 ? CPU.GPR[ra]._i32[2] >> ((0 - CPU.GPR[rb]._i32[2]) % 64) : CPU.GPR[ra]._i32[2] >> 31; CPU.GPR[rt]._i32[3] = ((0 - CPU.GPR[rb]._i32[3]) % 64) < 32 ? 
CPU.GPR[ra]._i32[3] >> ((0 - CPU.GPR[rb]._i32[3]) % 64) : CPU.GPR[ra]._i32[3] >> 31; + WRAPPER_END(rt, ra, rb, 0); } void SHL(u32 rt, u32 ra, u32 rb) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, ra, rb, zz); CPU.GPR[rt]._u32[0] = (CPU.GPR[rb]._u32[0] & 0x3f) > 31 ? 0 : CPU.GPR[ra]._u32[0] << (CPU.GPR[rb]._u32[0] & 0x3f); CPU.GPR[rt]._u32[1] = (CPU.GPR[rb]._u32[1] & 0x3f) > 31 ? 0 : CPU.GPR[ra]._u32[1] << (CPU.GPR[rb]._u32[1] & 0x3f); CPU.GPR[rt]._u32[2] = (CPU.GPR[rb]._u32[2] & 0x3f) > 31 ? 0 : CPU.GPR[ra]._u32[2] << (CPU.GPR[rb]._u32[2] & 0x3f); CPU.GPR[rt]._u32[3] = (CPU.GPR[rb]._u32[3] & 0x3f) > 31 ? 0 : CPU.GPR[ra]._u32[3] << (CPU.GPR[rb]._u32[3] & 0x3f); + WRAPPER_END(rt, ra, rb, 0); + // AVX2: masking with 0x3f + VPSLLVD may be better + /*for (u32 i = 0; i < 4; i++) + { + GpVar v0(c, kVarTypeUInt32); + c.mov(v0, cpu_dword(GPR[ra]._u32[i])); + GpVar shift(c, kVarTypeUInt32); + c.mov(shift, cpu_dword(GPR[rb]._u32[i])); + GpVar z(c); + c.xor_(z, z); + c.test(shift, 0x20); + c.cmovnz(v0, z); + c.shl(v0, shift); + c.mov(cpu_dword(GPR[rt]._u32[i]), v0); + }*/ } void ROTH(u32 rt, u32 ra, u32 rb) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, ra, rb, zz); for (int h = 0; h < 8; h++) CPU.GPR[rt]._u16[h] = (CPU.GPR[ra]._u16[h] << (CPU.GPR[rb]._u16[h] & 0xf)) | (CPU.GPR[ra]._u16[h] >> (16 - (CPU.GPR[rb]._u16[h] & 0xf))); + WRAPPER_END(rt, ra, rb, 0); } void ROTHM(u32 rt, u32 ra, u32 rb) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, ra, rb, zz); for (int h = 0; h < 8; h++) CPU.GPR[rt]._u16[h] = ((0 - CPU.GPR[rb]._u16[h]) % 32) < 16 ? CPU.GPR[ra]._u16[h] >> ((0 - CPU.GPR[rb]._u16[h]) % 32) : 0; + WRAPPER_END(rt, ra, rb, 0); } void ROTMAH(u32 rt, u32 ra, u32 rb) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, ra, rb, zz); for (int h = 0; h < 8; h++) CPU.GPR[rt]._i16[h] = ((0 - CPU.GPR[rb]._i16[h]) % 32) < 16 ? 
CPU.GPR[ra]._i16[h] >> ((0 - CPU.GPR[rb]._i16[h]) % 32) : CPU.GPR[ra]._i16[h] >> 15; + WRAPPER_END(rt, ra, rb, 0); } void SHLH(u32 rt, u32 ra, u32 rb) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, ra, rb, zz); for (int h = 0; h < 8; h++) CPU.GPR[rt]._u16[h] = (CPU.GPR[rb]._u16[h] & 0x1f) > 15 ? 0 : CPU.GPR[ra]._u16[h] << (CPU.GPR[rb]._u16[h] & 0x1f); + WRAPPER_END(rt, ra, rb, 0); } void ROTI(u32 rt, u32 ra, s32 i7) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, ra, i7, zz); const int nRot = i7 & 0x1f; CPU.GPR[rt]._u32[0] = (CPU.GPR[ra]._u32[0] << nRot) | (CPU.GPR[ra]._u32[0] >> (32 - nRot)); CPU.GPR[rt]._u32[1] = (CPU.GPR[ra]._u32[1] << nRot) | (CPU.GPR[ra]._u32[1] >> (32 - nRot)); CPU.GPR[rt]._u32[2] = (CPU.GPR[ra]._u32[2] << nRot) | (CPU.GPR[ra]._u32[2] >> (32 - nRot)); CPU.GPR[rt]._u32[3] = (CPU.GPR[ra]._u32[3] << nRot) | (CPU.GPR[ra]._u32[3] >> (32 - nRot)); + WRAPPER_END(rt, ra, i7, 0); } void ROTMI(u32 rt, u32 ra, s32 i7) { - UNIMPLEMENTED(); - const int nRot = (0 - i7) % 64; + WRAPPER_BEGIN(rt, ra, i7, zz); + const int nRot = (0 - (s32)i7) % 64; CPU.GPR[rt]._u32[0] = nRot < 32 ? CPU.GPR[ra]._u32[0] >> nRot : 0; CPU.GPR[rt]._u32[1] = nRot < 32 ? CPU.GPR[ra]._u32[1] >> nRot : 0; CPU.GPR[rt]._u32[2] = nRot < 32 ? CPU.GPR[ra]._u32[2] >> nRot : 0; CPU.GPR[rt]._u32[3] = nRot < 32 ? CPU.GPR[ra]._u32[3] >> nRot : 0; + WRAPPER_END(rt, ra, i7, 0); + // TODO } void ROTMAI(u32 rt, u32 ra, s32 i7) { - UNIMPLEMENTED(); - const int nRot = (0 - i7) % 64; + WRAPPER_BEGIN(rt, ra, i7, zz); + const int nRot = (0 - (s32)i7) % 64; CPU.GPR[rt]._i32[0] = nRot < 32 ? CPU.GPR[ra]._i32[0] >> nRot : CPU.GPR[ra]._i32[0] >> 31; CPU.GPR[rt]._i32[1] = nRot < 32 ? CPU.GPR[ra]._i32[1] >> nRot : CPU.GPR[ra]._i32[1] >> 31; CPU.GPR[rt]._i32[2] = nRot < 32 ? CPU.GPR[ra]._i32[2] >> nRot : CPU.GPR[ra]._i32[2] >> 31; CPU.GPR[rt]._i32[3] = nRot < 32 ? 
CPU.GPR[ra]._i32[3] >> nRot : CPU.GPR[ra]._i32[3] >> 31; + WRAPPER_END(rt, ra, i7, 0); } void SHLI(u32 rt, u32 ra, s32 i7) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, ra, i7, zz); const u32 s = i7 & 0x3f; - for (u32 j = 0; j < 4; ++j) CPU.GPR[rt]._u32[j] = CPU.GPR[ra]._u32[j] << s; + WRAPPER_END(rt, ra, i7, 0); + // TODO } void ROTHI(u32 rt, u32 ra, s32 i7) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, ra, i7, zz); const int nRot = i7 & 0xf; - for (int h = 0; h < 8; h++) CPU.GPR[rt]._u16[h] = (CPU.GPR[ra]._u16[h] << nRot) | (CPU.GPR[ra]._u16[h] >> (16 - nRot)); + WRAPPER_END(rt, ra, i7, 0); } void ROTHMI(u32 rt, u32 ra, s32 i7) { - UNIMPLEMENTED(); - const int nRot = (0 - i7) % 32; - + WRAPPER_BEGIN(rt, ra, i7, zz); + const int nRot = (0 - (s32)i7) % 32; for (int h = 0; h < 8; h++) CPU.GPR[rt]._u16[h] = nRot < 16 ? CPU.GPR[ra]._u16[h] >> nRot : 0; + WRAPPER_END(rt, ra, i7, 0); } void ROTMAHI(u32 rt, u32 ra, s32 i7) { - UNIMPLEMENTED(); - const int nRot = (0 - i7) % 32; - + WRAPPER_BEGIN(rt, ra, i7, zz); + const int nRot = (0 - (s32)i7) % 32; for (int h = 0; h < 8; h++) CPU.GPR[rt]._i16[h] = nRot < 16 ? CPU.GPR[ra]._i16[h] >> nRot : CPU.GPR[ra]._i16[h] >> 15; + WRAPPER_END(rt, ra, i7, 0); } void SHLHI(u32 rt, u32 ra, s32 i7) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, ra, i7, zz); const int nRot = i7 & 0x1f; - for (int h = 0; h < 8; h++) CPU.GPR[rt]._u16[0] = nRot > 15 ? 
0 : CPU.GPR[ra]._u16[0] << nRot; + WRAPPER_END(rt, ra, i7, 0); } void A(u32 rt, u32 ra, u32 rb) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, ra, rb, zz); CPU.GPR[rt]._u32[0] = CPU.GPR[ra]._u32[0] + CPU.GPR[rb]._u32[0]; CPU.GPR[rt]._u32[1] = CPU.GPR[ra]._u32[1] + CPU.GPR[rb]._u32[1]; CPU.GPR[rt]._u32[2] = CPU.GPR[ra]._u32[2] + CPU.GPR[rb]._u32[2]; CPU.GPR[rt]._u32[3] = CPU.GPR[ra]._u32[3] + CPU.GPR[rb]._u32[3]; + WRAPPER_END(rt, ra, rb, 0); + /*XmmVar v0(c); + c.movdqa(v0, cpu_xmm(GPR[ra])); + c.paddd(v0, cpu_xmm(GPR[rb])); + c.movdqa(cpu_xmm(GPR[rt]), v0);*/ } void AND(u32 rt, u32 ra, u32 rb) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, ra, rb, zz); CPU.GPR[rt]._u32[0] = CPU.GPR[ra]._u32[0] & CPU.GPR[rb]._u32[0]; CPU.GPR[rt]._u32[1] = CPU.GPR[ra]._u32[1] & CPU.GPR[rb]._u32[1]; CPU.GPR[rt]._u32[2] = CPU.GPR[ra]._u32[2] & CPU.GPR[rb]._u32[2]; CPU.GPR[rt]._u32[3] = CPU.GPR[ra]._u32[3] & CPU.GPR[rb]._u32[3]; + WRAPPER_END(rt, ra, rb, 0); + /*XmmVar v0(c); + if (ra == rb) + { + if (rt == ra) + { + // nop + } + else + { + // mov + c.movaps(v0, cpu_xmm(GPR[ra])); + c.movaps(cpu_xmm(GPR[rt]), v0); + } + } + else + { + // and + c.movaps(v0, cpu_xmm(GPR[ra])); + c.andps(v0, cpu_xmm(GPR[rb])); + c.movaps(cpu_xmm(GPR[rt]), v0); + }*/ } void CG(u32 rt, u32 ra, u32 rb) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, ra, rb, zz); CPU.GPR[rt]._u32[0] = ((CPU.GPR[ra]._u32[0] + CPU.GPR[rb]._u32[0]) < CPU.GPR[ra]._u32[0]) ? 1 : 0; CPU.GPR[rt]._u32[1] = ((CPU.GPR[ra]._u32[1] + CPU.GPR[rb]._u32[1]) < CPU.GPR[ra]._u32[1]) ? 1 : 0; CPU.GPR[rt]._u32[2] = ((CPU.GPR[ra]._u32[2] + CPU.GPR[rb]._u32[2]) < CPU.GPR[ra]._u32[2]) ? 1 : 0; CPU.GPR[rt]._u32[3] = ((CPU.GPR[ra]._u32[3] + CPU.GPR[rb]._u32[3]) < CPU.GPR[ra]._u32[3]) ? 
1 : 0; + WRAPPER_END(rt, ra, rb, 0); + // TODO } void AH(u32 rt, u32 ra, u32 rb) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, ra, rb, zz); for (int h = 0; h < 8; h++) CPU.GPR[rt]._u16[h] = CPU.GPR[ra]._u16[h] + CPU.GPR[rb]._u16[h]; + WRAPPER_END(rt, ra, rb, 0); } void NAND(u32 rt, u32 ra, u32 rb) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, ra, rb, zz); CPU.GPR[rt]._u32[0] = ~(CPU.GPR[ra]._u32[0] & CPU.GPR[rb]._u32[0]); CPU.GPR[rt]._u32[1] = ~(CPU.GPR[ra]._u32[1] & CPU.GPR[rb]._u32[1]); CPU.GPR[rt]._u32[2] = ~(CPU.GPR[ra]._u32[2] & CPU.GPR[rb]._u32[2]); CPU.GPR[rt]._u32[3] = ~(CPU.GPR[ra]._u32[3] & CPU.GPR[rb]._u32[3]); + WRAPPER_END(rt, ra, rb, 0); } void AVGB(u32 rt, u32 ra, u32 rb) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, ra, rb, zz); for (int b = 0; b < 16; b++) CPU.GPR[rt]._u8[b] = (CPU.GPR[ra]._u8[b] + CPU.GPR[rb]._u8[b] + 1) >> 1; + WRAPPER_END(rt, ra, rb, 0); } void MTSPR(u32 rt, u32 sa) { @@ -373,8 +526,40 @@ private: } void WRCH(u32 ra, u32 rt) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(ra, rt, yy, zz); CPU.WriteChannel(ra, CPU.GPR[rt]); + WRAPPER_END(ra, rt, 0, 0); + /*GpVar v(c, kVarTypeUInt32); + c.mov(v, cpu_dword(GPR[rt]._u32[3])); + switch (ra) + { + case MFC_LSA: + c.mov(cpu_dword(MFC1.LSA.m_value[0]), v); + break; + + case MFC_EAH: + c.mov(cpu_dword(MFC1.EAH.m_value[0]), v); + break; + + case MFC_EAL: + c.mov(cpu_dword(MFC1.EAL.m_value[0]), v); + break; + + case MFC_Size: + c.mov(cpu_word(MFC1.Size_Tag.m_val16[1]), v); + break; + + case MFC_TagID: + c.mov(cpu_word(MFC1.Size_Tag.m_val16[0]), v); + break; + + default: + { + X86X64CallNode* call = c.call(imm_ptr(&WRCH_wrapper::WRCH), kFuncConvHost, FuncBuilder2()); + call->setArg(0, imm_u(ra)); + call->setArg(1, v); + } + }*/ } void BIZ(u32 rt, u32 ra) { @@ -407,7 +592,7 @@ private: } void STQX(u32 rt, u32 ra, u32 rb) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, ra, rb, zz); u32 lsa = (CPU.GPR[ra]._u32[3] + CPU.GPR[rb]._u32[3]) & 0x3fff0; if(!CPU.IsGoodLSA(lsa)) { @@ -417,19 +602,27 @@ private: } CPU.WriteLS128(lsa, 
CPU.GPR[rt]._u128); + WRAPPER_END(rt, ra, rb, 0); } void BI(u32 ra) { - UNIMPLEMENTED(); - CPU.SetBranch(branchTarget(CPU.GPR[ra]._u32[3], 0)); + do_finalize = true; + c.mov(*pos_var, cpu_dword(GPR[ra]._u32[3])); + c.shr(*pos_var, 2); + //ConLog.Write("BI(ra=%d)", ra); } void BISL(u32 rt, u32 ra) { - UNIMPLEMENTED(); - const u32 NewPC = CPU.GPR[ra]._u32[3]; - CPU.GPR[rt].Reset(); - CPU.GPR[rt]._u32[3] = CPU.PC + 4; - CPU.SetBranch(branchTarget(NewPC, 0)); + do_finalize = true; + c.int3(); + c.xor_(*pos_var, *pos_var); + c.mov(cpu_dword(GPR[rt]._u32[0]), *pos_var); + c.mov(cpu_dword(GPR[rt]._u32[1]), *pos_var); + c.mov(cpu_dword(GPR[rt]._u32[2]), *pos_var); + c.mov(*pos_var, cpu_dword(GPR[ra]._u32[3])); + c.mov(cpu_dword(GPR[rt]._u32[3]), (CPU.PC >> 2) + 1); + c.shr(*pos_var, 2); + ConLog.Write("BISL(rt=%d,ra=%d)", rt, ra); } void IRET(u32 ra) { @@ -442,77 +635,90 @@ private: } void HBR(u32 p, u32 ro, u32 ra) { - UNIMPLEMENTED(); } void GB(u32 rt, u32 ra) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, ra, yy, zz); CPU.GPR[rt]._u32[3] = (CPU.GPR[ra]._u32[0] & 1) | ((CPU.GPR[ra]._u32[1] & 1) << 1) | ((CPU.GPR[ra]._u32[2] & 1) << 2) | ((CPU.GPR[ra]._u32[3] & 1) << 3); CPU.GPR[rt]._u32[2] = 0; CPU.GPR[rt]._u64[0] = 0; + WRAPPER_END(rt, ra, 0, 0); + // TODO } void GBH(u32 rt, u32 ra) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, ra, yy, zz); u32 temp = 0; for (int h = 0; h < 8; h++) temp |= (CPU.GPR[ra]._u16[h] & 1) << h; CPU.GPR[rt]._u32[3] = temp; CPU.GPR[rt]._u32[2] = 0; CPU.GPR[rt]._u64[0] = 0; + WRAPPER_END(rt, ra, 0, 0); } void GBB(u32 rt, u32 ra) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, ra, yy, zz); u32 temp = 0; for (int b = 0; b < 16; b++) temp |= (CPU.GPR[ra]._u8[b] & 1) << b; CPU.GPR[rt]._u32[3] = temp; CPU.GPR[rt]._u32[2] = 0; CPU.GPR[rt]._u64[0] = 0; + WRAPPER_END(rt, ra, 0, 0); } void FSM(u32 rt, u32 ra) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, ra, yy, zz); const u32 pref = CPU.GPR[ra]._u32[3]; for (int w = 0; w < 4; w++) CPU.GPR[rt]._u32[w] = (pref & (1 << w)) ? 
~0 : 0; + WRAPPER_END(rt, ra, 0, 0); } void FSMH(u32 rt, u32 ra) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, ra, yy, zz); const u32 pref = CPU.GPR[ra]._u32[3]; for (int h = 0; h < 8; h++) CPU.GPR[rt]._u16[h] = (pref & (1 << h)) ? ~0 : 0; + WRAPPER_END(rt, ra, 0, 0); } void FSMB(u32 rt, u32 ra) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, ra, yy, zz); const u32 pref = CPU.GPR[ra]._u32[3]; for (int b = 0; b < 16; b++) CPU.GPR[rt]._u8[b] = (pref & (1 << b)) ? ~0 : 0; + WRAPPER_END(rt, ra, 0, 0); } void FREST(u32 rt, u32 ra) { - UNIMPLEMENTED(); - //CPU.GPR[rt]._m128 = _mm_rcp_ps(CPU.GPR[ra]._m128); + WRAPPER_BEGIN(rt, ra, yy, zz); for (int i = 0; i < 4; i++) CPU.GPR[rt]._f[i] = 1 / CPU.GPR[ra]._f[i]; + WRAPPER_END(rt, ra, 0, 0); + /*XmmVar v0(c); + c.rcpps(v0, cpu_xmm(GPR[ra])); + c.movaps(cpu_xmm(GPR[rt]), v0);*/ } void FRSQEST(u32 rt, u32 ra) { - UNIMPLEMENTED(); - //const __u32x4 FloatAbsMask = {0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff}; - //CPU.GPR[rt]._m128 = _mm_rsqrt_ps(_mm_and_ps(CPU.GPR[ra]._m128, FloatAbsMask.m128)); + WRAPPER_BEGIN(rt, ra, yy, zz); for (int i = 0; i < 4; i++) CPU.GPR[rt]._f[i] = 1 / sqrt(abs(CPU.GPR[ra]._f[i])); + WRAPPER_END(rt, ra, 0, 0); + /*XmmVar v0(c); + c.movaps(v0, cpu_xmm(GPR[ra])); + c.andps(v0, imm_xmm(max_int)); + c.rsqrtps(v0, v0); + c.movaps(cpu_xmm(GPR[rt]), v0);*/ } void LQX(u32 rt, u32 ra, u32 rb) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, ra, rb, zz); u32 a = CPU.GPR[ra]._u32[3], b = CPU.GPR[rb]._u32[3]; u32 lsa = (a + b) & 0x3fff0; @@ -525,218 +731,290 @@ private: } CPU.GPR[rt]._u128 = CPU.ReadLS128(lsa); + WRAPPER_END(rt, ra, rb, 0); } void ROTQBYBI(u32 rt, u32 ra, u32 rb) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, ra, rb, zz); const int s = (CPU.GPR[rb]._u32[3] >> 3) & 0xf; const SPU_GPR_hdr temp = CPU.GPR[ra]; for (int b = 0; b < 16; b++) CPU.GPR[rt]._u8[b] = temp._u8[(b - s) & 0xf]; + WRAPPER_END(rt, ra, rb, 0); } void ROTQMBYBI(u32 rt, u32 ra, u32 rb) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, ra, rb, zz); const int s = (0 
- (CPU.GPR[rb]._u32[3] >> 3)) & 0x1f; const SPU_GPR_hdr temp = CPU.GPR[ra]; CPU.GPR[rt].Reset(); for (int b = 0; b < 16 - s; b++) CPU.GPR[rt]._u8[b] = temp._u8[b + s]; + WRAPPER_END(rt, ra, rb, 0); } void SHLQBYBI(u32 rt, u32 ra, u32 rb) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, ra, rb, zz); const int s = (CPU.GPR[rb]._u32[3] >> 3) & 0x1f; const SPU_GPR_hdr temp = CPU.GPR[ra]; CPU.GPR[rt].Reset(); for (int b = s; b < 16; b++) CPU.GPR[rt]._u8[b] = temp._u8[b - s]; + WRAPPER_END(rt, ra, rb, 0); } void CBX(u32 rt, u32 ra, u32 rb) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, ra, rb, zz); const u32 t = (CPU.GPR[rb]._u32[3] + CPU.GPR[ra]._u32[3]) & 0xF; CPU.GPR[rt]._u64[0] = (u64)0x18191A1B1C1D1E1F; CPU.GPR[rt]._u64[1] = (u64)0x1011121314151617; CPU.GPR[rt]._u8[15 - t] = 0x03; + WRAPPER_END(rt, ra, rb, 0); } void CHX(u32 rt, u32 ra, u32 rb) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, ra, rb, zz); const u32 t = (CPU.GPR[rb]._u32[3] + CPU.GPR[ra]._u32[3]) & 0xE; CPU.GPR[rt]._u64[0] = (u64)0x18191A1B1C1D1E1F; CPU.GPR[rt]._u64[1] = (u64)0x1011121314151617; CPU.GPR[rt]._u16[7 - (t >> 1)] = 0x0203; + WRAPPER_END(rt, ra, rb, 0); } void CWX(u32 rt, u32 ra, u32 rb) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, ra, rb, zz); const u32 t = (CPU.GPR[ra]._u32[3] + CPU.GPR[rb]._u32[3]) & 0xC; CPU.GPR[rt]._u64[0] = (u64)0x18191A1B1C1D1E1F; CPU.GPR[rt]._u64[1] = (u64)0x1011121314151617; CPU.GPR[rt]._u32[3 - (t >> 2)] = 0x00010203; + WRAPPER_END(rt, ra, rb, 0); } void CDX(u32 rt, u32 ra, u32 rb) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, ra, rb, zz); const u32 t = (CPU.GPR[rb]._u32[3] + CPU.GPR[ra]._u32[3]) & 0x8; CPU.GPR[rt]._u64[0] = (u64)0x18191A1B1C1D1E1F; CPU.GPR[rt]._u64[1] = (u64)0x1011121314151617; CPU.GPR[rt]._u64[1 - (t >> 3)] = (u64)0x0001020304050607; + WRAPPER_END(rt, ra, rb, 0); } void ROTQBI(u32 rt, u32 ra, u32 rb) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, ra, rb, zz); const int t = CPU.GPR[rb]._u32[3] & 0x7; const SPU_GPR_hdr temp = CPU.GPR[ra]; CPU.GPR[rt]._u32[0] = (temp._u32[0] << t) | 
(temp._u32[3] >> (32 - t)); CPU.GPR[rt]._u32[1] = (temp._u32[1] << t) | (temp._u32[0] >> (32 - t)); CPU.GPR[rt]._u32[2] = (temp._u32[2] << t) | (temp._u32[1] >> (32 - t)); CPU.GPR[rt]._u32[3] = (temp._u32[3] << t) | (temp._u32[2] >> (32 - t)); + WRAPPER_END(rt, ra, rb, 0); } void ROTQMBI(u32 rt, u32 ra, u32 rb) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, ra, rb, zz); const int t = (0 - CPU.GPR[rb]._u32[3]) & 0x7; const SPU_GPR_hdr temp = CPU.GPR[ra]; CPU.GPR[rt]._u32[0] = (temp._u32[0] >> t) | (temp._u32[1] << (32 - t)); CPU.GPR[rt]._u32[1] = (temp._u32[1] >> t) | (temp._u32[2] << (32 - t)); CPU.GPR[rt]._u32[2] = (temp._u32[2] >> t) | (temp._u32[3] << (32 - t)); CPU.GPR[rt]._u32[3] = (temp._u32[3] >> t); + WRAPPER_END(rt, ra, rb, 0); } void SHLQBI(u32 rt, u32 ra, u32 rb) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, ra, rb, zz); const int t = CPU.GPR[rb]._u32[3] & 0x7; const SPU_GPR_hdr temp = CPU.GPR[ra]; CPU.GPR[rt]._u32[0] = (temp._u32[0] << t); CPU.GPR[rt]._u32[1] = (temp._u32[1] << t) | (temp._u32[0] >> (32 - t)); CPU.GPR[rt]._u32[2] = (temp._u32[2] << t) | (temp._u32[1] >> (32 - t)); CPU.GPR[rt]._u32[3] = (temp._u32[3] << t) | (temp._u32[2] >> (32 - t)); + WRAPPER_END(rt, ra, rb, 0); } void ROTQBY(u32 rt, u32 ra, u32 rb) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, ra, rb, zz); const int s = CPU.GPR[rb]._u32[3] & 0xf; const SPU_GPR_hdr temp = CPU.GPR[ra]; for (int b = 0; b < 16; ++b) CPU.GPR[rt]._u8[b] = temp._u8[(b - s) & 0xf]; + WRAPPER_END(rt, ra, rb, 0); } void ROTQMBY(u32 rt, u32 ra, u32 rb) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, ra, rb, zz); const int s = (0 - CPU.GPR[rb]._u32[3]) & 0x1f; const SPU_GPR_hdr temp = CPU.GPR[ra]; CPU.GPR[rt].Reset(); for (int b = 0; b < 16 - s; b++) CPU.GPR[rt]._u8[b] = temp._u8[b + s]; + WRAPPER_END(rt, ra, rb, 0); } void SHLQBY(u32 rt, u32 ra, u32 rb) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, ra, rb, zz); const int s = CPU.GPR[rb]._u32[3] & 0x1f; const SPU_GPR_hdr temp = CPU.GPR[ra]; CPU.GPR[rt].Reset(); for (int b = s; b < 16; b++) 
CPU.GPR[rt]._u8[b] = temp._u8[b - s]; + WRAPPER_END(rt, ra, rb, 0); } void ORX(u32 rt, u32 ra) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, ra, yy, zz); CPU.GPR[rt]._u32[3] = CPU.GPR[ra]._u32[0] | CPU.GPR[ra]._u32[1] | CPU.GPR[ra]._u32[2] | CPU.GPR[ra]._u32[3]; CPU.GPR[rt]._u32[2] = 0; CPU.GPR[rt]._u64[0] = 0; + WRAPPER_END(rt, ra, 0, 0); } void CBD(u32 rt, u32 ra, s32 i7) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, ra, i7, zz); const int t = (CPU.GPR[ra]._u32[3] + i7) & 0xF; CPU.GPR[rt]._u64[0] = (u64)0x18191A1B1C1D1E1F; CPU.GPR[rt]._u64[1] = (u64)0x1011121314151617; CPU.GPR[rt]._u8[15 - t] = 0x03; + WRAPPER_END(rt, ra, i7, 0); } void CHD(u32 rt, u32 ra, s32 i7) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, ra, i7, zz); const int t = (CPU.GPR[ra]._u32[3] + i7) & 0xE; CPU.GPR[rt]._u64[0] = (u64)0x18191A1B1C1D1E1F; CPU.GPR[rt]._u64[1] = (u64)0x1011121314151617; CPU.GPR[rt]._u16[7 - (t >> 1)] = 0x0203; + WRAPPER_END(rt, ra, i7, 0); } void CWD(u32 rt, u32 ra, s32 i7) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, ra, i7, zz); const int t = (CPU.GPR[ra]._u32[3] + i7) & 0xC; CPU.GPR[rt]._u64[0] = (u64)0x18191A1B1C1D1E1F; CPU.GPR[rt]._u64[1] = (u64)0x1011121314151617; CPU.GPR[rt]._u32[3 - (t >> 2)] = 0x00010203; + WRAPPER_END(rt, ra, i7, 0); + // TODO } void CDD(u32 rt, u32 ra, s32 i7) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, ra, i7, zz); const int t = (CPU.GPR[ra]._u32[3] + i7) & 0x8; CPU.GPR[rt]._u64[0] = (u64)0x18191A1B1C1D1E1F; CPU.GPR[rt]._u64[1] = (u64)0x1011121314151617; CPU.GPR[rt]._u64[1 - (t >> 3)] = (u64)0x0001020304050607; + WRAPPER_END(rt, ra, i7, 0); } void ROTQBII(u32 rt, u32 ra, s32 i7) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, ra, i7, zz); const int s = i7 & 0x7; const SPU_GPR_hdr temp = CPU.GPR[ra]; CPU.GPR[rt]._u32[0] = (temp._u32[0] << s) | (temp._u32[3] >> (32 - s)); CPU.GPR[rt]._u32[1] = (temp._u32[1] << s) | (temp._u32[0] >> (32 - s)); CPU.GPR[rt]._u32[2] = (temp._u32[2] << s) | (temp._u32[1] >> (32 - s)); CPU.GPR[rt]._u32[3] = (temp._u32[3] << s) | (temp._u32[2] >> 
(32 - s)); + WRAPPER_END(rt, ra, i7, 0); } void ROTQMBII(u32 rt, u32 ra, s32 i7) { - UNIMPLEMENTED(); - const int s = (0 - i7) & 0x7; + WRAPPER_BEGIN(rt, ra, i7, zz); + const int s = (0 - (s32)i7) & 0x7; const SPU_GPR_hdr temp = CPU.GPR[ra]; CPU.GPR[rt]._u32[0] = (temp._u32[0] >> s) | (temp._u32[1] << (32 - s)); CPU.GPR[rt]._u32[1] = (temp._u32[1] >> s) | (temp._u32[2] << (32 - s)); CPU.GPR[rt]._u32[2] = (temp._u32[2] >> s) | (temp._u32[3] << (32 - s)); CPU.GPR[rt]._u32[3] = (temp._u32[3] >> s); + WRAPPER_END(rt, ra, i7, 0); } void SHLQBII(u32 rt, u32 ra, s32 i7) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, ra, i7, zz); const int s = i7 & 0x7; const SPU_GPR_hdr temp = CPU.GPR[ra]; CPU.GPR[rt]._u32[0] = (temp._u32[0] << s); CPU.GPR[rt]._u32[1] = (temp._u32[1] << s) | (temp._u32[0] >> (32 - s)); CPU.GPR[rt]._u32[2] = (temp._u32[2] << s) | (temp._u32[1] >> (32 - s)); CPU.GPR[rt]._u32[3] = (temp._u32[3] << s) | (temp._u32[2] >> (32 - s)); + WRAPPER_END(rt, ra, i7, 0); } void ROTQBYI(u32 rt, u32 ra, s32 i7) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, ra, i7, zz); const int s = i7 & 0xf; const SPU_GPR_hdr temp = CPU.GPR[ra]; for (int b = 0; b < 16; b++) CPU.GPR[rt]._u8[b] = temp._u8[(b - s) & 0xf]; + WRAPPER_END(rt, ra, i7, 0); + /*const int s = i7 & 0xf; + + XmmVar v0(c); + XmmVar v1(c); + c.movdqa(v0, cpu_xmm(GPR[ra])); + c.movdqa(v1, v0); + c.pslldq(v0, s); + c.psrldq(v1, 0xf - s); + c.por(v0, v1); + c.movdqa(cpu_xmm(GPR[rt]), v0);*/ } void ROTQMBYI(u32 rt, u32 ra, s32 i7) { - UNIMPLEMENTED(); - const int s = (0 - i7) & 0x1f; + WRAPPER_BEGIN(rt, ra, i7, zz); + const int s = (0 - (s32)i7) & 0x1f; const SPU_GPR_hdr temp = CPU.GPR[ra]; CPU.GPR[rt].Reset(); for (int b = 0; b < 16 - s; b++) CPU.GPR[rt]._u8[b] = temp._u8[b + s]; + WRAPPER_END(rt, ra, i7, 0); + /*const int s = (0 - i7) & 0x1f; + + XmmVar v0(c); + if (s == 0) + { + if (ra == rt) + { + // nop + } + else + { + // mov + c.movaps(v0, cpu_xmm(GPR[ra])); + c.movaps(cpu_xmm(GPR[rt]), v0); + } + } + else if (s > 15) + { + 
// zero + c.xorps(v0, v0); + c.movaps(cpu_xmm(GPR[rt]), v0); + } + else + { + // shift right + c.movdqa(v0, cpu_xmm(GPR[ra])); + c.psrldq(v0, s); + c.movdqa(cpu_xmm(GPR[rt]), v0); + }*/ } void SHLQBYI(u32 rt, u32 ra, s32 i7) { + WRAPPER_BEGIN(rt, ra, i7, zz); const int s = i7 & 0x1f; + const SPU_GPR_hdr temp = CPU.GPR[ra]; + CPU.GPR[rt].Reset(); + for (int b = s; b < 16; b++) + CPU.GPR[rt]._u8[b] = temp._u8[b - s]; + WRAPPER_END(rt, ra, i7, 0); + /*const int s = i7 & 0x1f; + XmmVar v0(c); if (s == 0) { @@ -763,45 +1041,49 @@ private: c.movdqa(v0, cpu_xmm(GPR[ra])); c.pslldq(v0, s); c.movdqa(cpu_xmm(GPR[rt]), v0); - } + }*/ } void NOP(u32 rt) { - UNIMPLEMENTED(); } void CGT(u32 rt, u32 ra, u32 rb) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, ra, rb, zz); for (int w = 0; w < 4; w++) CPU.GPR[rt]._u32[w] = CPU.GPR[ra]._i32[w] > CPU.GPR[rb]._i32[w] ? 0xffffffff : 0; + WRAPPER_END(rt, ra, rb, 0); } void XOR(u32 rt, u32 ra, u32 rb) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, ra, rb, zz); for (int w = 0; w < 4; w++) CPU.GPR[rt]._u32[w] = CPU.GPR[ra]._u32[w] ^ CPU.GPR[rb]._u32[w]; + WRAPPER_END(rt, ra, rb, 0); } void CGTH(u32 rt, u32 ra, u32 rb) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, ra, rb, zz); for (int h = 0; h < 8; h++) CPU.GPR[rt]._u16[h] = CPU.GPR[ra]._i16[h] > CPU.GPR[rb]._i16[h] ? 0xffff : 0; + WRAPPER_END(rt, ra, rb, 0); } void EQV(u32 rt, u32 ra, u32 rb) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, ra, rb, zz); for (int w = 0; w < 4; w++) CPU.GPR[rt]._u32[w] = CPU.GPR[ra]._u32[w] ^ (~CPU.GPR[rb]._u32[w]); + WRAPPER_END(rt, ra, rb, 0); } void CGTB(u32 rt, u32 ra, u32 rb) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, ra, rb, zz); for (int b = 0; b < 16; b++) CPU.GPR[rt]._u8[b] = CPU.GPR[ra]._i8[b] > CPU.GPR[rb]._i8[b] ? 
0xff : 0; + WRAPPER_END(rt, ra, rb, 0); } void SUMB(u32 rt, u32 ra, u32 rb) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, ra, rb, zz); const SPU_GPR_hdr _a = CPU.GPR[ra]; const SPU_GPR_hdr _b = CPU.GPR[rb]; for (int w = 0; w < 4; w++) @@ -809,6 +1091,7 @@ private: CPU.GPR[rt]._u16[w*2] = _a._u8[w*4] + _a._u8[w*4 + 1] + _a._u8[w*4 + 2] + _a._u8[w*4 + 3]; CPU.GPR[rt]._u16[w*2 + 1] = _b._u8[w*4] + _b._u8[w*4 + 1] + _b._u8[w*4 + 2] + _b._u8[w*4 + 3]; } + WRAPPER_END(rt, ra, rb, 0); } //HGT uses signed values. HLGT uses unsigned values void HGT(u32 rt, s32 ra, s32 rb) @@ -818,7 +1101,7 @@ private: } void CLZ(u32 rt, u32 ra) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, ra, yy, zz); for (int w = 0; w < 4; w++) { int nPos; @@ -829,135 +1112,178 @@ private: CPU.GPR[rt]._u32[w] = nPos; } + WRAPPER_END(rt, ra, 0, 0); } void XSWD(u32 rt, u32 ra) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, ra, yy, zz); CPU.GPR[rt]._i64[0] = (s64)CPU.GPR[ra]._i32[0]; CPU.GPR[rt]._i64[1] = (s64)CPU.GPR[ra]._i32[2]; + WRAPPER_END(rt, ra, 0, 0); } void XSHW(u32 rt, u32 ra) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, ra, yy, zz); for (int w = 0; w < 4; w++) CPU.GPR[rt]._i32[w] = (s32)CPU.GPR[ra]._i16[w*2]; + WRAPPER_END(rt, ra, 0, 0); } void CNTB(u32 rt, u32 ra) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, ra, yy, zz); const SPU_GPR_hdr temp = CPU.GPR[ra]; CPU.GPR[rt].Reset(); for (int b = 0; b < 16; b++) for (int i = 0; i < 8; i++) CPU.GPR[rt]._u8[b] += (temp._u8[b] & (1 << i)) ? 1 : 0; + WRAPPER_END(rt, ra, 0, 0); } void XSBH(u32 rt, u32 ra) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, ra, yy, zz); for (int h = 0; h < 8; h++) CPU.GPR[rt]._i16[h] = (s16)CPU.GPR[ra]._i8[h*2]; + WRAPPER_END(rt, ra, 0, 0); } void CLGT(u32 rt, u32 ra, u32 rb) { - UNIMPLEMENTED(); - for(u32 i = 0; i < 4; ++i) + WRAPPER_BEGIN(rt, ra, rb, zz); + for (u32 i = 0; i < 4; ++i) { CPU.GPR[rt]._u32[i] = (CPU.GPR[ra]._u32[i] > CPU.GPR[rb]._u32[i]) ? 
0xffffffff : 0x00000000; } + WRAPPER_END(rt, ra, rb, 0); + /*XmmVar v0(c); + if (ra == rb) + { + // zero + c.xorps(v0, v0); + c.movaps(cpu_xmm(GPR[rt]), v0); + } + else + { + // compare if-greater-then + c.movdqa(v0, cpu_xmm(GPR[rb])); + c.psubd(v0, cpu_xmm(GPR[ra])); + c.psrad(v0, 32); + c.movdqa(cpu_xmm(GPR[rt]), v0); + }*/ } void ANDC(u32 rt, u32 ra, u32 rb) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, ra, rb, zz); for (int w = 0; w < 4; w++) CPU.GPR[rt]._u32[w] = CPU.GPR[ra]._u32[w] & (~CPU.GPR[rb]._u32[w]); + WRAPPER_END(rt, ra, rb, 0); + // TODO } void FCGT(u32 rt, u32 ra, u32 rb) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, ra, rb, zz); CPU.GPR[rt]._u32[0] = CPU.GPR[ra]._f[0] > CPU.GPR[rb]._f[0] ? 0xffffffff : 0; CPU.GPR[rt]._u32[1] = CPU.GPR[ra]._f[1] > CPU.GPR[rb]._f[1] ? 0xffffffff : 0; CPU.GPR[rt]._u32[2] = CPU.GPR[ra]._f[2] > CPU.GPR[rb]._f[2] ? 0xffffffff : 0; CPU.GPR[rt]._u32[3] = CPU.GPR[ra]._f[3] > CPU.GPR[rb]._f[3] ? 0xffffffff : 0; + WRAPPER_END(rt, ra, rb, 0); + // TODO } void DFCGT(u32 rt, u32 ra, u32 rb) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, ra, rb, zz); CPU.GPR[rt]._u64[0] = CPU.GPR[ra]._d[0] > CPU.GPR[rb]._d[0] ? 0xffffffffffffffff : 0; CPU.GPR[rt]._u64[1] = CPU.GPR[ra]._d[1] > CPU.GPR[rb]._d[1] ? 
0xffffffffffffffff : 0; + WRAPPER_END(rt, ra, rb, 0);; } void FA(u32 rt, u32 ra, u32 rb) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, ra, rb, zz); CPU.GPR[rt]._f[0] = CPU.GPR[ra]._f[0] + CPU.GPR[rb]._f[0]; CPU.GPR[rt]._f[1] = CPU.GPR[ra]._f[1] + CPU.GPR[rb]._f[1]; CPU.GPR[rt]._f[2] = CPU.GPR[ra]._f[2] + CPU.GPR[rb]._f[2]; CPU.GPR[rt]._f[3] = CPU.GPR[ra]._f[3] + CPU.GPR[rb]._f[3]; + WRAPPER_END(rt, ra, rb, 0); + // TODO } void FS(u32 rt, u32 ra, u32 rb) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, ra, rb, zz); CPU.GPR[rt]._f[0] = CPU.GPR[ra]._f[0] - CPU.GPR[rb]._f[0]; CPU.GPR[rt]._f[1] = CPU.GPR[ra]._f[1] - CPU.GPR[rb]._f[1]; CPU.GPR[rt]._f[2] = CPU.GPR[ra]._f[2] - CPU.GPR[rb]._f[2]; CPU.GPR[rt]._f[3] = CPU.GPR[ra]._f[3] - CPU.GPR[rb]._f[3]; + WRAPPER_END(rt, ra, rb, 0); + // TODO } void FM(u32 rt, u32 ra, u32 rb) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, ra, rb, zz); CPU.GPR[rt]._f[0] = CPU.GPR[ra]._f[0] * CPU.GPR[rb]._f[0]; CPU.GPR[rt]._f[1] = CPU.GPR[ra]._f[1] * CPU.GPR[rb]._f[1]; CPU.GPR[rt]._f[2] = CPU.GPR[ra]._f[2] * CPU.GPR[rb]._f[2]; CPU.GPR[rt]._f[3] = CPU.GPR[ra]._f[3] * CPU.GPR[rb]._f[3]; + WRAPPER_END(rt, ra, rb, 0); + /*XmmVar v0(c); + c.movaps(v0, cpu_xmm(GPR[ra])); + c.mulps(v0, cpu_xmm(GPR[rb])); + c.movaps(cpu_xmm(GPR[rt]), v0);*/ } void CLGTH(u32 rt, u32 ra, u32 rb) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, ra, rb, zz); for (int h = 0; h < 8; h++) CPU.GPR[rt]._u16[h] = CPU.GPR[ra]._u16[h] > CPU.GPR[rb]._u16[h] ? 0xffff : 0; + WRAPPER_END(rt, ra, rb, 0); } void ORC(u32 rt, u32 ra, u32 rb) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, ra, rb, zz); for (int w = 0; w < 4; w++) CPU.GPR[rt]._u32[w] = CPU.GPR[ra]._u32[w] | (~CPU.GPR[rb]._u32[w]); + WRAPPER_END(rt, ra, rb, 0); } void FCMGT(u32 rt, u32 ra, u32 rb) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, ra, rb, zz); CPU.GPR[rt]._u32[0] = fabs(CPU.GPR[ra]._f[0]) > fabs(CPU.GPR[rb]._f[0]) ? 0xffffffff : 0; CPU.GPR[rt]._u32[1] = fabs(CPU.GPR[ra]._f[1]) > fabs(CPU.GPR[rb]._f[1]) ? 
0xffffffff : 0; CPU.GPR[rt]._u32[2] = fabs(CPU.GPR[ra]._f[2]) > fabs(CPU.GPR[rb]._f[2]) ? 0xffffffff : 0; CPU.GPR[rt]._u32[3] = fabs(CPU.GPR[ra]._f[3]) > fabs(CPU.GPR[rb]._f[3]) ? 0xffffffff : 0; + WRAPPER_END(rt, ra, rb, 0); } void DFCMGT(u32 rt, u32 ra, u32 rb) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, ra, rb, zz); CPU.GPR[rt]._u64[0] = fabs(CPU.GPR[ra]._d[0]) > fabs(CPU.GPR[rb]._d[0]) ? 0xffffffffffffffff : 0; CPU.GPR[rt]._u64[1] = fabs(CPU.GPR[ra]._d[1]) > fabs(CPU.GPR[rb]._d[1]) ? 0xffffffffffffffff : 0; + WRAPPER_END(rt, ra, rb, 0); } void DFA(u32 rt, u32 ra, u32 rb) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, ra, rb, zz); CPU.GPR[rt]._d[0] = CPU.GPR[ra]._d[0] + CPU.GPR[rb]._d[0]; CPU.GPR[rt]._d[1] = CPU.GPR[ra]._d[1] + CPU.GPR[rb]._d[1]; + WRAPPER_END(rt, ra, rb, 0); } void DFS(u32 rt, u32 ra, u32 rb) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, ra, rb, zz); CPU.GPR[rt]._d[0] = CPU.GPR[ra]._d[0] - CPU.GPR[rb]._d[0]; CPU.GPR[rt]._d[1] = CPU.GPR[ra]._d[1] - CPU.GPR[rb]._d[1]; + WRAPPER_END(rt, ra, rb, 0); } void DFM(u32 rt, u32 ra, u32 rb) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, ra, rb, zz); CPU.GPR[rt]._d[0] = CPU.GPR[ra]._d[0] * CPU.GPR[rb]._d[0]; CPU.GPR[rt]._d[1] = CPU.GPR[ra]._d[1] * CPU.GPR[rb]._d[1]; + WRAPPER_END(rt, ra, rb, 0); } void CLGTB(u32 rt, u32 ra, u32 rb) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, ra, rb, zz); for (int b = 0; b < 16; b++) CPU.GPR[rt]._u8[b] = CPU.GPR[ra]._u8[b] > CPU.GPR[rb]._u8[b] ? 
0xff : 0; + WRAPPER_END(rt, ra, rb, 0); } void HLGT(u32 rt, u32 ra, u32 rb) { @@ -966,61 +1292,80 @@ private: } void DFMA(u32 rt, u32 ra, u32 rb) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, ra, rb, zz); CPU.GPR[rt]._d[0] += CPU.GPR[ra]._d[0] * CPU.GPR[rb]._d[0]; CPU.GPR[rt]._d[1] += CPU.GPR[ra]._d[1] * CPU.GPR[rb]._d[1]; + WRAPPER_END(rt, ra, rb, 0); } void DFMS(u32 rt, u32 ra, u32 rb) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, ra, rb, zz); CPU.GPR[rt]._d[0] = CPU.GPR[ra]._d[0] * CPU.GPR[rb]._d[0] - CPU.GPR[rt]._d[0]; CPU.GPR[rt]._d[1] = CPU.GPR[ra]._d[1] * CPU.GPR[rb]._d[1] - CPU.GPR[rt]._d[1]; + WRAPPER_END(rt, ra, rb, 0); } void DFNMS(u32 rt, u32 ra, u32 rb) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, ra, rb, zz); CPU.GPR[rt]._d[0] -= CPU.GPR[ra]._d[0] * CPU.GPR[rb]._d[0]; CPU.GPR[rt]._d[1] -= CPU.GPR[ra]._d[1] * CPU.GPR[rb]._d[1]; + WRAPPER_END(rt, ra, rb, 0); } void DFNMA(u32 rt, u32 ra, u32 rb) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, ra, rb, zz); CPU.GPR[rt]._d[0] = -(CPU.GPR[ra]._d[0] * CPU.GPR[rb]._d[0] + CPU.GPR[rt]._d[0]); CPU.GPR[rt]._d[1] = -(CPU.GPR[ra]._d[1] * CPU.GPR[rb]._d[1] + CPU.GPR[rt]._d[1]); + WRAPPER_END(rt, ra, rb, 0); } void CEQ(u32 rt, u32 ra, u32 rb) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, ra, rb, zz); for (int w = 0; w < 4; w++) CPU.GPR[rt]._u32[w] = CPU.GPR[ra]._i32[w] == CPU.GPR[rb]._i32[w] ? 
0xffffffff : 0; + WRAPPER_END(rt, ra, rb, 0); } void MPYHHU(u32 rt, u32 ra, u32 rb) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, ra, rb, zz); for (int w = 0; w < 4; w++) CPU.GPR[rt]._u32[w] = CPU.GPR[ra]._u16[w*2+1] * CPU.GPR[rb]._u16[w*2+1]; + WRAPPER_END(rt, ra, rb, 0); } void ADDX(u32 rt, u32 ra, u32 rb) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, ra, rb, zz); for (int w = 0; w < 4; w++) CPU.GPR[rt]._u32[w] = CPU.GPR[ra]._u32[w] + CPU.GPR[rb]._u32[w] + (CPU.GPR[rt]._u32[w] & 1); + WRAPPER_END(rt, ra, rb, 0); + // TODO } void SFX(u32 rt, u32 ra, u32 rb) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, ra, rb, zz); for (int w = 0; w < 4; w++) CPU.GPR[rt]._u32[w] = CPU.GPR[rb]._u32[w] - CPU.GPR[ra]._u32[w] - (1 - (CPU.GPR[rt]._u32[w] & 1)); + WRAPPER_END(rt, ra, rb, 0); + /*XmmVar v0(c), v1(c), v2(c); + c.movdqa(v1, imm_xmm(s19_to_s32[1])); + c.movdqa(v0, cpu_xmm(GPR[rb])); + c.movdqa(v2, cpu_xmm(GPR[rt])); + c.psubd(v0, cpu_xmm(GPR[ra])); + c.pand(v2, v1); + c.paddd(v0, v2); + c.psubd(v0, v1); + c.movdqa(cpu_xmm(GPR[rt]), v0);*/ } void CGX(u32 rt, u32 ra, u32 rb) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, ra, rb, zz); for (int w = 0; w < 4; w++) CPU.GPR[rt]._u32[w] = ((u64)CPU.GPR[ra]._u32[w] + (u64)CPU.GPR[rb]._u32[w] + (u64)(CPU.GPR[rt]._u32[w] & 1)) >> 32; + WRAPPER_END(rt, ra, rb, 0); } void BGX(u32 rt, u32 ra, u32 rb) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, ra, rb, zz); s64 nResult; for (int w = 0; w < 4; w++) @@ -1028,18 +1373,21 @@ private: nResult = (u64)CPU.GPR[rb]._u32[w] - (u64)CPU.GPR[ra]._u32[w] - (u64)(1 - (CPU.GPR[rt]._u32[w] & 1)); CPU.GPR[rt]._u32[w] = nResult < 0 ? 
0 : 1; } + WRAPPER_END(rt, ra, rb, 0); } void MPYHHA(u32 rt, u32 ra, u32 rb) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, ra, rb, zz); for (int w = 0; w < 4; w++) CPU.GPR[rt]._i32[w] += CPU.GPR[ra]._i16[w*2+1] * CPU.GPR[rb]._i16[w*2+1]; + WRAPPER_END(rt, ra, rb, 0); } void MPYHHAU(u32 rt, u32 ra, u32 rb) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, ra, rb, zz); for (int w = 0; w < 4; w++) CPU.GPR[rt]._u32[w] += CPU.GPR[ra]._u16[w*2+1] * CPU.GPR[rb]._u16[w*2+1]; + WRAPPER_END(rt, ra, rb, 0); } //Forced bits to 0, hence the shift: @@ -1052,17 +1400,19 @@ private: } void FESD(u32 rt, u32 ra) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, ra, yy, zz); CPU.GPR[rt]._d[0] = (double)CPU.GPR[ra]._f[1]; CPU.GPR[rt]._d[1] = (double)CPU.GPR[ra]._f[3]; + WRAPPER_END(rt, ra, 0, 0); } void FRDS(u32 rt, u32 ra) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, ra, yy, zz); CPU.GPR[rt]._f[1] = (float)CPU.GPR[ra]._d[0]; CPU.GPR[rt]._u32[0] = 0x00000000; CPU.GPR[rt]._f[3] = (float)CPU.GPR[ra]._d[1]; CPU.GPR[rt]._u32[2] = 0x00000000; + WRAPPER_END(rt, ra, 0, 0); } void FSCRWR(u32 rt, u32 ra) { @@ -1070,7 +1420,7 @@ private: } void DFTSV(u32 rt, u32 ra, s32 i7) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, ra, i7, zz); const u64 DoubleExpMask = 0x7ff0000000000000; const u64 DoubleFracMask = 0x000fffffffffffff; const u64 DoubleSignMask = 0x8000000000000000; @@ -1121,83 +1471,99 @@ private: if ((temp._u64[i] & DoubleExpMask) == DoubleExpMask) CPU.GPR[rt]._u64[i] = 0xffffffffffffffff; } + WRAPPER_END(rt, ra, i7, 0); } void FCEQ(u32 rt, u32 ra, u32 rb) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, ra, rb, zz); CPU.GPR[rt]._u32[0] = CPU.GPR[ra]._f[0] == CPU.GPR[rb]._f[0] ? 0xffffffff : 0; CPU.GPR[rt]._u32[1] = CPU.GPR[ra]._f[1] == CPU.GPR[rb]._f[1] ? 0xffffffff : 0; CPU.GPR[rt]._u32[2] = CPU.GPR[ra]._f[2] == CPU.GPR[rb]._f[2] ? 0xffffffff : 0; CPU.GPR[rt]._u32[3] = CPU.GPR[ra]._f[3] == CPU.GPR[rb]._f[3] ? 
0xffffffff : 0; + WRAPPER_END(rt, ra, rb, 0); } void DFCEQ(u32 rt, u32 ra, u32 rb) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, ra, rb, zz); CPU.GPR[rt]._u64[0] = CPU.GPR[ra]._d[0] == CPU.GPR[rb]._d[0] ? 0xffffffffffffffff : 0; CPU.GPR[rt]._u64[1] = CPU.GPR[ra]._d[1] == CPU.GPR[rb]._d[1] ? 0xffffffffffffffff : 0; + WRAPPER_END(rt, ra, rb, 0); } void MPY(u32 rt, u32 ra, u32 rb) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, ra, rb, zz); for (int w = 0; w < 4; w++) CPU.GPR[rt]._i32[w] = CPU.GPR[ra]._i16[w*2] * CPU.GPR[rb]._i16[w*2]; + WRAPPER_END(rt, ra, rb, 0); } void MPYH(u32 rt, u32 ra, u32 rb) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, ra, rb, zz); for (int w = 0; w < 4; w++) CPU.GPR[rt]._i32[w] = (CPU.GPR[ra]._i16[w*2+1] * CPU.GPR[rb]._i16[w*2]) << 16; + WRAPPER_END(rt, ra, rb, 0); + // TODO } void MPYHH(u32 rt, u32 ra, u32 rb) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, ra, rb, zz); for (int w = 0; w < 4; w++) CPU.GPR[rt]._i32[w] = CPU.GPR[ra]._i16[w*2+1] * CPU.GPR[rb]._i16[w*2+1]; + WRAPPER_END(rt, ra, rb, 0); } void MPYS(u32 rt, u32 ra, u32 rb) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, ra, rb, zz); for (int w = 0; w < 4; w++) CPU.GPR[rt]._i32[w] = (CPU.GPR[ra]._i16[w*2] * CPU.GPR[rb]._i16[w*2]) >> 16; + WRAPPER_END(rt, ra, rb, 0); } void CEQH(u32 rt, u32 ra, u32 rb) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, ra, rb, zz); for (int h = 0; h < 8; h++) CPU.GPR[rt]._u16[h] = CPU.GPR[ra]._u16[h] == CPU.GPR[rb]._u16[h] ? 0xffff : 0; + WRAPPER_END(rt, ra, rb, 0); } void FCMEQ(u32 rt, u32 ra, u32 rb) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, ra, rb, zz); CPU.GPR[rt]._u32[0] = fabs(CPU.GPR[ra]._f[0]) == fabs(CPU.GPR[rb]._f[0]) ? 0xffffffff : 0; CPU.GPR[rt]._u32[1] = fabs(CPU.GPR[ra]._f[1]) == fabs(CPU.GPR[rb]._f[1]) ? 0xffffffff : 0; CPU.GPR[rt]._u32[2] = fabs(CPU.GPR[ra]._f[2]) == fabs(CPU.GPR[rb]._f[2]) ? 0xffffffff : 0; CPU.GPR[rt]._u32[3] = fabs(CPU.GPR[ra]._f[3]) == fabs(CPU.GPR[rb]._f[3]) ? 
0xffffffff : 0; + WRAPPER_END(rt, ra, rb, 0); } void DFCMEQ(u32 rt, u32 ra, u32 rb) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, ra, rb, zz); CPU.GPR[rt]._u64[0] = fabs(CPU.GPR[ra]._d[0]) == fabs(CPU.GPR[rb]._d[0]) ? 0xffffffffffffffff : 0; CPU.GPR[rt]._u64[1] = fabs(CPU.GPR[ra]._d[1]) == fabs(CPU.GPR[rb]._d[1]) ? 0xffffffffffffffff : 0; + WRAPPER_END(rt, ra, rb, 0); } void MPYU(u32 rt, u32 ra, u32 rb) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, ra, rb, zz); for (int w = 0; w < 4; w++) CPU.GPR[rt]._u32[w] = CPU.GPR[ra]._u16[w*2] * CPU.GPR[rb]._u16[w*2]; + WRAPPER_END(rt, ra, rb, 0); + // TODO } void CEQB(u32 rt, u32 ra, u32 rb) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, ra, rb, zz); for (int b = 0; b < 16; b++) CPU.GPR[rt]._u8[b] = CPU.GPR[ra]._u8[b] == CPU.GPR[rb]._u8[b] ? 0xff : 0; + WRAPPER_END(rt, ra, rb, 0); } void FI(u32 rt, u32 ra, u32 rb) { - UNIMPLEMENTED(); - //Floating Interpolation: ra will be ignored. - //It should work correctly if result of preceding FREST or FRSQEST is sufficiently exact + WRAPPER_BEGIN(rt, ra, rb, zz); CPU.GPR[rt] = CPU.GPR[rb]; + WRAPPER_END(rt, ra, rb, 0); + /*XmmVar v0(c); + c.movaps(v0, cpu_xmm(GPR[rb])); + c.movaps(cpu_xmm(GPR[rt]), v0);*/ } void HEQ(u32 rt, u32 ra, u32 rb) { @@ -1208,30 +1574,38 @@ private: //0 - 9 void CFLTS(u32 rt, u32 ra, s32 i8) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, ra, i8, zz); const u32 scale = 173 - (i8 & 0xff); //unsigned immediate for (int i = 0; i < 4; i++) { u32 exp = ((CPU.GPR[ra]._u32[i] >> 23) & 0xff) + scale; - if (exp > 255) + if (exp > 255) exp = 255; CPU.GPR[rt]._u32[i] = (CPU.GPR[ra]._u32[i] & 0x807fffff) | (exp << 23); CPU.GPR[rt]._u32[i] = (u32)CPU.GPR[rt]._f[i]; //trunc } - //CPU.GPR[rt]._m128i = _mm_cvttps_epi32(CPU.GPR[rt]._m128); + WRAPPER_END(rt, ra, i8, 0); + /*XmmVar v0(c); + c.movaps(v0, cpu_xmm(GPR[ra])); + if (i8 != 173) + { + c.mulps(v0, imm_xmm(scale_to_int[i8 & 0xff])); // scale + } + c.cvtps2dq(v0, v0); // convert to ints + c.movdqa(cpu_xmm(GPR[rt]), v0);*/ } void CFLTU(u32 rt, 
u32 ra, s32 i8) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, ra, i8, zz); const u32 scale = 173 - (i8 & 0xff); //unsigned immediate for (int i = 0; i < 4; i++) { u32 exp = ((CPU.GPR[ra]._u32[i] >> 23) & 0xff) + scale; - if (exp > 255) + if (exp > 255) exp = 255; if (CPU.GPR[ra]._u32[i] & 0x80000000) //if negative, result = 0 @@ -1246,11 +1620,21 @@ private: CPU.GPR[rt]._u32[i] = floor(CPU.GPR[rt]._f[i]); } } + WRAPPER_END(rt, ra, i8, 0); + /*XmmVar v0(c); + c.movaps(v0, cpu_xmm(GPR[ra])); + if (i8 != 173) + { + c.mulps(v0, imm_xmm(scale_to_int[i8 & 0xff])); // scale + } + // TODO: handle negative values and convert to unsigned value + // c.int3(); + c.cvtps2dq(v0, v0); // convert to signed ints + c.movdqa(cpu_xmm(GPR[rt]), v0);*/ } void CSFLT(u32 rt, u32 ra, s32 i8) { - UNIMPLEMENTED(); - //CPU.GPR[rt]._m128 = _mm_cvtepi32_ps(CPU.GPR[ra]._m128i); + WRAPPER_BEGIN(rt, ra, i8, zz); const u32 scale = 155 - (i8 & 0xff); //unsigned immediate for (int i = 0; i < 4; i++) { @@ -1263,10 +1647,19 @@ private: CPU.GPR[rt]._u32[i] = (CPU.GPR[rt]._u32[i] & 0x807fffff) | (exp << 23); } + WRAPPER_END(rt, ra, i8, 0); + /*XmmVar v0(c); + c.movdqa(v0, cpu_xmm(GPR[ra])); + c.cvtdq2ps(v0, v0); // convert to floats + if (i8 != 155) + { + c.mulps(v0, imm_xmm(scale_to_float[i8 & 0xff])); // scale + } + c.movaps(cpu_xmm(GPR[rt]), v0);*/ } void CUFLT(u32 rt, u32 ra, s32 i8) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, ra, i8, zz); const u32 scale = 155 - (i8 & 0xff); //unsigned immediate for (int i = 0; i < 4; i++) { @@ -1278,18 +1671,33 @@ private: CPU.GPR[rt]._u32[i] = (CPU.GPR[rt]._u32[i] & 0x807fffff) | (exp << 23); } + WRAPPER_END(rt, ra, i8, 0); + /*XmmVar v0(c); + c.movdqa(v0, cpu_xmm(GPR[ra])); + // TODO: convert from unsigned value + // c.int3(); + c.cvtdq2ps(v0, v0); // convert to floats as signed + if (i8 != 155) + { + c.mulps(v0, imm_xmm(scale_to_float[i8 & 0xff])); // scale + } + c.movaps(cpu_xmm(GPR[rt]), v0);*/ } //0 - 8 void BRZ(u32 rt, s32 i16) { - UNIMPLEMENTED(); - if 
(CPU.GPR[rt]._u32[3] == 0) - CPU.SetBranch(branchTarget(CPU.PC, i16)); + do_finalize = true; + GpVar pos_next(c, kVarTypeUInt32); + c.mov(pos_next, (CPU.PC >> 2) + 1); + c.mov(*pos_var, branchTarget(CPU.PC, i16) >> 2); + c.cmp(cpu_dword(GPR[rt]._u32[3]), 0); + c.cmovnz(*pos_var, pos_next); + //ConLog.Write("BRZ(rt=%d,i16=%d)", rt, i16); } void STQA(u32 rt, s32 i16) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, i16, yy, zz); u32 lsa = (i16 << 2) & 0x3fff0; if(!CPU.IsGoodLSA(lsa)) { @@ -1299,30 +1707,43 @@ private: } CPU.WriteLS128(lsa, CPU.GPR[rt]._u128); + WRAPPER_END(rt, i16, 0, 0); } void BRNZ(u32 rt, s32 i16) { - UNIMPLEMENTED(); - if (CPU.GPR[rt]._u32[3] != 0) - CPU.SetBranch(branchTarget(CPU.PC, i16)); + do_finalize = true; + GpVar pos_next(c, kVarTypeUInt32); + c.mov(pos_next, (CPU.PC >> 2) + 1); + c.mov(*pos_var, branchTarget(CPU.PC, i16) >> 2); + c.cmp(cpu_dword(GPR[rt]._u32[3]), 0); + c.cmovz(*pos_var, pos_next); + //ConLog.Write("BRNZ(rt=%d,i16=%d)", rt, i16); } void BRHZ(u32 rt, s32 i16) { - UNIMPLEMENTED(); - if (CPU.GPR[rt]._u16[6] == 0) - CPU.SetBranch(branchTarget(CPU.PC, i16)); + do_finalize = true; + GpVar pos_next(c, kVarTypeUInt32); + c.mov(pos_next, (CPU.PC >> 2) + 1); + c.mov(*pos_var, branchTarget(CPU.PC, i16) >> 2); + c.cmp(cpu_word(GPR[rt]._u16[6]), 0); + c.cmovnz(*pos_var, pos_next); + ConLog.Write("BRHZ(rt=%d,i16=%d)", rt, i16); } void BRHNZ(u32 rt, s32 i16) { - UNIMPLEMENTED(); - if (CPU.GPR[rt]._u16[6] != 0) - CPU.SetBranch(branchTarget(CPU.PC, i16)); + do_finalize = true; + GpVar pos_next(c, kVarTypeUInt32); + c.mov(pos_next, (CPU.PC >> 2) + 1); + c.mov(*pos_var, branchTarget(CPU.PC, i16) >> 2); + c.cmp(cpu_word(GPR[rt]._u16[6]), 0); + c.cmovz(*pos_var, pos_next); + ConLog.Write("BRHNZ(rt=%d,i16=%d)", rt, i16); } void STQR(u32 rt, s32 i16) { - UNIMPLEMENTED(); - u32 lsa = branchTarget(CPU.PC, i16) & 0x3fff0; - if(!CPU.IsGoodLSA(lsa)) + WRAPPER_BEGIN(rt, i16, PC, zz); + u32 lsa = branchTarget(PC, i16) & 0x3fff0; + if (!CPU.IsGoodLSA(lsa)) { 
ConLog.Error("STQR: bad lsa (0x%x)", lsa); Emu.Pause(); @@ -1330,6 +1751,17 @@ private: } CPU.WriteLS128(lsa, CPU.GPR[rt]._u128); + WRAPPER_END(rt, i16, CPU.PC, 0); + /*u32 lsa = branchTarget(CPU.PC, i16) & 0x3fff0; + + GpVar v0(c, kVarTypeUInt64); + GpVar v1(c, kVarTypeUInt64); + c.mov(v0, cpu_qword(GPR[rt]._u64[0])); + c.mov(v1, cpu_qword(GPR[rt]._u64[1])); + c.bswap(v0); + c.bswap(v1); + c.mov(qword_ptr(*ls_var, lsa), v1); + c.mov(qword_ptr(*ls_var, lsa + 8), v0);*/ } void BRA(s32 i16) { @@ -1338,7 +1770,7 @@ private: } void LQA(u32 rt, s32 i16) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, i16, yy, zz); u32 lsa = (i16 << 2) & 0x3fff0; if(!CPU.IsGoodLSA(lsa)) { @@ -1348,6 +1780,7 @@ private: } CPU.GPR[rt]._u128 = CPU.ReadLS128(lsa); + WRAPPER_END(rt, i16, 0, 0); } void BRASL(u32 rt, s32 i16) { @@ -1358,25 +1791,57 @@ private: } void BR(s32 i16) { - UNIMPLEMENTED(); - CPU.SetBranch(branchTarget(CPU.PC, i16)); + do_finalize = true; + c.mov(*pos_var, branchTarget(CPU.PC, i16) >> 2); + //ConLog.Write("BR(i16=%d)", i16); } void FSMBI(u32 rt, s32 i16) { - XmmVar v0(c); + WRAPPER_BEGIN(rt, i16, yy, zz); + const u32 s = i16; + + for (u32 j = 0; j < 16; ++j) + { + if ((s >> j) & 0x1) + { + CPU.GPR[rt]._u8[j] = 0xFF; + } + else + { + CPU.GPR[rt]._u8[j] = 0x00; + } + } + WRAPPER_END(rt, i16, 0, 0); + /*XmmVar v0(c); c.movaps(v0, imm_xmm(fsmbi_mask[i16 & 0xffff])); - c.movaps(cpu_xmm(GPR[rt]), v0); + c.movaps(cpu_xmm(GPR[rt]), v0);*/ } void BRSL(u32 rt, s32 i16) { - UNIMPLEMENTED(); - CPU.GPR[rt].Reset(); - CPU.GPR[rt]._u32[3] = CPU.PC + 4; - CPU.SetBranch(branchTarget(CPU.PC, i16)); + GpVar v0(c, kVarTypeUInt64); + c.xor_(v0, v0); + c.mov(cpu_qword(GPR[rt]._u64[1]), v0); + c.mov(cpu_qword(GPR[rt]._u64[0]), v0); + c.mov(cpu_dword(GPR[rt]._u32[3]), CPU.PC + 4); + + do_finalize = true; + c.mov(*pos_var, branchTarget(CPU.PC, i16) >> 2); + //ConLog.Write("BRSL(rt=%d,i16=%d)", rt, i16); } void LQR(u32 rt, s32 i16) { - u32 lsa = branchTarget(CPU.PC, i16) & 0x3fff0; + WRAPPER_BEGIN(rt, 
i16, PC, zz); + u32 lsa = branchTarget(PC, i16) & 0x3fff0; + if (!CPU.IsGoodLSA(lsa)) + { + ConLog.Error("LQR: bad lsa (0x%x)", lsa); + Emu.Pause(); + return; + } + + CPU.GPR[rt]._u128 = CPU.ReadLS128(lsa); + WRAPPER_END(rt, i16, CPU.PC, 0); + /*u32 lsa = branchTarget(CPU.PC, i16) & 0x3fff0; GpVar v0(c, kVarTypeUInt64); GpVar v1(c, kVarTypeUInt64); @@ -1385,47 +1850,96 @@ private: c.bswap(v0); c.bswap(v1); c.mov(cpu_qword(GPR[rt]._u64[0]), v1); - c.mov(cpu_qword(GPR[rt]._u64[1]), v0); + c.mov(cpu_qword(GPR[rt]._u64[1]), v0);*/ } void IL(u32 rt, s32 i16) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, i16, yy, zz); CPU.GPR[rt]._i32[0] = CPU.GPR[rt]._i32[1] = CPU.GPR[rt]._i32[2] = CPU.GPR[rt]._i32[3] = i16; + WRAPPER_END(rt, i16, 0, 0); + /*XmmVar v0(c); + if (i16 == 0) + { + c.xorps(v0, v0); + } + else if (i16 == -1) + { + c.cmpps(v0, v0, 0); + } + else + { + c.movaps(v0, imm_xmm(s19_to_s32[i16 & 0x7ffff])); + } + c.movaps(cpu_xmm(GPR[rt]), v0);*/ } void ILHU(u32 rt, s32 i16) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, i16, yy, zz); for (int w = 0; w < 4; w++) CPU.GPR[rt]._i32[w] = i16 << 16; + WRAPPER_END(rt, i16, 0, 0); + /*XmmVar v0(c); + if (i16 == 0) + { + c.xorps(v0, v0); + } + else if (i16 == -1) + { + c.cmpps(v0, v0, 0); + c.pslld(v0, 16); + } + else + { + c.movaps(v0, imm_xmm(s19_to_s32[i16 & 0x7ffff])); + c.pslld(v0, 16); + } + c.movaps(cpu_xmm(GPR[rt]), v0);*/ } void ILH(u32 rt, s32 i16) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, i16, yy, zz); for (int h = 0; h < 8; h++) - CPU.GPR[rt]._i16[h] = i16; + CPU.GPR[rt]._i16[h] = (s32)i16; + WRAPPER_END(rt, i16, 0, 0); } void IOHL(u32 rt, s32 i16) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, i16, yy, zz); for (int w = 0; w < 4; w++) CPU.GPR[rt]._i32[w] |= (i16 & 0xFFFF); + WRAPPER_END(rt, i16, 0, 0); + /*XmmVar v0(c); + if (i16 == 0) + { + // nop + } + else + { + c.movaps(v0, cpu_xmm(GPR[rt])); + c.orps(v0, imm_xmm(s19_to_s32[i16 & 0xffff])); + c.movaps(cpu_xmm(GPR[rt]), v0); + }*/ } //0 - 7 void ORI(u32 rt, u32 ra, s32 i10) { 
- XmmVar v0(c); - if (i10 == 0) + WRAPPER_BEGIN(rt, ra, i10, zz); + for (u32 i = 0; i < 4; ++i) + CPU.GPR[rt]._i32[i] = CPU.GPR[ra]._i32[i] | (s32)i10; + WRAPPER_END(rt, ra, i10, 0); + /*XmmVar v0(c); + if (i10 == -1) { - // zero - c.xorps(v0, v0); + // fill with 1 + c.cmpps(v0, v0, 0); c.movaps(cpu_xmm(GPR[rt]), v0); } - else if (i10 == -1) + else if (i10 == 0) { if (rt == ra) { @@ -1443,53 +1957,68 @@ private: c.movaps(v0, cpu_xmm(GPR[ra])); c.orps(v0, imm_xmm(s19_to_s32[i10 & 0x7ffff])); c.movaps(cpu_xmm(GPR[rt]), v0); - } + }*/ } void ORHI(u32 rt, u32 ra, s32 i10) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, ra, i10, zz); for (int h = 0; h < 8; h++) - CPU.GPR[rt]._i16[h] = CPU.GPR[ra]._i16[h] | i10; + CPU.GPR[rt]._i16[h] = CPU.GPR[ra]._i16[h] | (s32)i10; + WRAPPER_END(rt, ra, i10, 0); } void ORBI(u32 rt, u32 ra, s32 i10) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, ra, i10, zz); for (int b = 0; b < 16; b++) - CPU.GPR[rt]._i8[b] = CPU.GPR[ra]._i8[b] | i10; + CPU.GPR[rt]._i8[b] = CPU.GPR[ra]._i8[b] | (s32)i10; + WRAPPER_END(rt, ra, i10, 0); } void SFI(u32 rt, u32 ra, s32 i10) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, ra, i10, zz); for (int w = 0; w < 4; w++) - CPU.GPR[rt]._i32[w] = i10 - CPU.GPR[ra]._i32[w]; + CPU.GPR[rt]._i32[w] = (s32)i10 - CPU.GPR[ra]._i32[w]; + WRAPPER_END(rt, ra, i10, 0); + // TODO } void SFHI(u32 rt, u32 ra, s32 i10) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, ra, i10, zz); for (int h = 0; h < 8; h++) - CPU.GPR[rt]._i16[h] = i10 - CPU.GPR[ra]._i16[h]; + CPU.GPR[rt]._i16[h] = (s32)i10 - CPU.GPR[ra]._i16[h]; + WRAPPER_END(rt, ra, i10, 0); } void ANDI(u32 rt, u32 ra, s32 i10) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, ra, i10, zz); for (int w = 0; w < 4; w++) - CPU.GPR[rt]._i32[w] = CPU.GPR[ra]._i32[w] & i10; + CPU.GPR[rt]._i32[w] = CPU.GPR[ra]._i32[w] & (s32)i10; + WRAPPER_END(rt, ra, i10, 0); + // TODO } void ANDHI(u32 rt, u32 ra, s32 i10) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, ra, i10, zz); for (int h = 0; h < 8; h++) - CPU.GPR[rt]._i16[h] = 
CPU.GPR[ra]._i16[h] & i10; + CPU.GPR[rt]._i16[h] = CPU.GPR[ra]._i16[h] & (s32)i10; + WRAPPER_END(rt, ra, i10, 0); } void ANDBI(u32 rt, u32 ra, s32 i10) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, ra, i10, zz); for (int b = 0; b < 16; b++) - CPU.GPR[rt]._i8[b] = CPU.GPR[ra]._i8[b] & i10; + CPU.GPR[rt]._i8[b] = CPU.GPR[ra]._i8[b] & (s32)i10; + WRAPPER_END(rt, ra, i10, 0); } void AI(u32 rt, u32 ra, s32 i10) { - XmmVar v0(c); + WRAPPER_BEGIN(rt, ra, i10, zz); + CPU.GPR[rt]._i32[0] = CPU.GPR[ra]._i32[0] + i10; + CPU.GPR[rt]._i32[1] = CPU.GPR[ra]._i32[1] + i10; + CPU.GPR[rt]._i32[2] = CPU.GPR[ra]._i32[2] + i10; + CPU.GPR[rt]._i32[3] = CPU.GPR[ra]._i32[3] + i10; + WRAPPER_END(rt, ra, i10, 0); + /*XmmVar v0(c); if (i10 == 0) { if (rt == ra) @@ -1509,17 +2038,28 @@ private: c.movdqa(v0, cpu_xmm(GPR[ra])); c.paddd(v0, imm_xmm(s19_to_s32[i10 & 0x7ffff])); c.movdqa(cpu_xmm(GPR[rt]), v0); - } + }*/ } void AHI(u32 rt, u32 ra, s32 i10) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, ra, i10, zz); for(u32 h = 0; h < 8; ++h) - CPU.GPR[rt]._i16[h] = CPU.GPR[ra]._i16[h] + i10; + CPU.GPR[rt]._i16[h] = CPU.GPR[ra]._i16[h] + (s32)i10; + WRAPPER_END(rt, ra, i10, 0); } void STQD(u32 rt, s32 i10, u32 ra) //i10 is shifted left by 4 while decoding { - GpVar lsa(c, kVarTypeUInt32); + WRAPPER_BEGIN(rt, i10, ra, zz); + const u32 lsa = (CPU.GPR[ra]._i32[3] + i10) & 0x3fff0; + if (!CPU.IsGoodLSA(lsa)) + { + ConLog.Error("STQD: bad lsa (0x%x)", lsa); + Emu.Pause(); + return; + } + CPU.WriteLS128(lsa, CPU.GPR[rt]._u128); + WRAPPER_END(rt, i10, ra, 0); + /*GpVar lsa(c, kVarTypeUInt32); GpVar v0(c, kVarTypeUInt64); GpVar v1(c, kVarTypeUInt64); @@ -1531,13 +2071,13 @@ private: c.bswap(v0); c.bswap(v1); c.mov(qword_ptr(*ls_var, lsa, 0, 0), v1); - c.mov(qword_ptr(*ls_var, lsa, 0, 8), v0); + c.mov(qword_ptr(*ls_var, lsa, 0, 8), v0);*/ } void LQD(u32 rt, s32 i10, u32 ra) //i10 is shifted left by 4 while decoding { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, i10, ra, zz); const u32 lsa = (CPU.GPR[ra]._i32[3] + i10) & 
0x3fff0; - if(!CPU.IsGoodLSA(lsa)) + if (!CPU.IsGoodLSA(lsa)) { ConLog.Error("LQD: bad lsa (0x%x)", lsa); Emu.Pause(); @@ -1545,42 +2085,63 @@ private: } CPU.GPR[rt]._u128 = CPU.ReadLS128(lsa); + WRAPPER_END(rt, i10, ra, 0); + /*GpVar lsa(c, kVarTypeUInt32); + GpVar v0(c, kVarTypeUInt64); + GpVar v1(c, kVarTypeUInt64); + + c.mov(lsa, cpu_dword(GPR[ra]._u32[3])); + if (i10) c.add(lsa, i10); + c.and_(lsa, 0x3fff0); + c.mov(v0, qword_ptr(*ls_var, lsa, 0, 0)); + c.mov(v1, qword_ptr(*ls_var, lsa, 0, 8)); + c.bswap(v0); + c.bswap(v1); + c.mov(cpu_qword(GPR[rt]._u64[0]), v1); + c.mov(cpu_qword(GPR[rt]._u64[1]), v0);*/ } void XORI(u32 rt, u32 ra, s32 i10) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, ra, i10, zz); for (int w = 0; w < 4; w++) - CPU.GPR[rt]._i32[w] = CPU.GPR[ra]._i32[w] ^ i10; + CPU.GPR[rt]._i32[w] = CPU.GPR[ra]._i32[w] ^ (s32)i10; + WRAPPER_END(rt, ra, i10, 0); } void XORHI(u32 rt, u32 ra, s32 i10) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, ra, i10, zz); for (int h = 0; h < 8; h++) - CPU.GPR[rt]._i16[h] = CPU.GPR[ra]._i16[h] ^ i10; + CPU.GPR[rt]._i16[h] = CPU.GPR[ra]._i16[h] ^ (s32)i10; + WRAPPER_END(rt, ra, i10, 0); } void XORBI(u32 rt, u32 ra, s32 i10) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, ra, i10, zz); for (int b = 0; b < 16; b++) - CPU.GPR[rt]._i8[b] = CPU.GPR[ra]._i8[b] ^ i10; + CPU.GPR[rt]._i8[b] = CPU.GPR[ra]._i8[b] ^ (s32)i10; + WRAPPER_END(rt, ra, i10, 0); } void CGTI(u32 rt, u32 ra, s32 i10) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, ra, i10, zz); for (int w = 0; w < 4; w++) - CPU.GPR[rt]._u32[w] = CPU.GPR[ra]._i32[w] > i10 ? 0xffffffff : 0; + CPU.GPR[rt]._u32[w] = CPU.GPR[ra]._i32[w] > (s32)i10 ? 0xffffffff : 0; + WRAPPER_END(rt, ra, i10, 0); + // TODO } void CGTHI(u32 rt, u32 ra, s32 i10) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, ra, i10, zz); for (int h = 0; h < 8; h++) - CPU.GPR[rt]._u16[h] = CPU.GPR[ra]._i16[h] > i10 ? 0xffff : 0; + CPU.GPR[rt]._u16[h] = CPU.GPR[ra]._i16[h] > (s32)i10 ? 
0xffff : 0; + WRAPPER_END(rt, ra, i10, 0); } void CGTBI(u32 rt, u32 ra, s32 i10) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, ra, i10, zz); for (int b = 0; b < 16; b++) CPU.GPR[rt]._u8[b] = CPU.GPR[ra]._i8[b] > (s8)(i10 & 0xff) ? 0xff : 0; + WRAPPER_END(rt, ra, i10, 0); } void HGTI(u32 rt, u32 ra, s32 i10) { @@ -1589,25 +2150,48 @@ private: } void CLGTI(u32 rt, u32 ra, s32 i10) { - UNIMPLEMENTED(); - for(u32 i = 0; i < 4; ++i) + WRAPPER_BEGIN(rt, ra, i10, zz); + for (int w = 0; w < 4; w++) + CPU.GPR[rt]._u32[w] = CPU.GPR[ra]._i32[w] > (s32)i10 ? 0xffffffff : 0; + WRAPPER_END(rt, ra, i10, 0); + /*XmmVar v0(c); + if (i10 == -1) { - CPU.GPR[rt]._u32[i] = (CPU.GPR[ra]._u32[i] > (u32)i10) ? 0xffffffff : 0x00000000; + // zero result + c.xorps(v0, v0); + c.movaps(cpu_xmm(GPR[rt]), v0); } + else + { + if (i10 == 0) + { + // load zero + c.pxor(v0, v0); + } + else + { + c.movdqa(v0, imm_xmm(s19_to_s32[i10 & 0x7ffff])); + } + c.psubd(v0, cpu_xmm(GPR[ra])); + c.psrad(v0, 32); + c.movdqa(cpu_xmm(GPR[rt]), v0); + }*/ } void CLGTHI(u32 rt, u32 ra, s32 i10) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, ra, i10, zz); for(u32 i = 0; i < 8; ++i) { - CPU.GPR[rt]._u16[i] = (CPU.GPR[ra]._u16[i] > (u16)i10) ? 0xffff : 0x0000; + CPU.GPR[rt]._u16[i] = (CPU.GPR[ra]._u16[i] > (u16)(s32)i10) ? 0xffff : 0x0000; } + WRAPPER_END(rt, ra, i10, 0); } void CLGTBI(u32 rt, u32 ra, s32 i10) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, ra, i10, zz); for (int b = 0; b < 16; b++) CPU.GPR[rt]._u8[b] = CPU.GPR[ra]._u8[b] > (u8)(i10 & 0xff) ? 
0xff : 0; + WRAPPER_END(rt, ra, i10, 0); } void HLGTI(u32 rt, u32 ra, s32 i10) { @@ -1616,36 +2200,43 @@ private: } void MPYI(u32 rt, u32 ra, s32 i10) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, ra, i10, zz); for (int w = 0; w < 4; w++) - CPU.GPR[rt]._i32[w] = CPU.GPR[ra]._i16[w*2] * i10; + CPU.GPR[rt]._i32[w] = CPU.GPR[ra]._i16[w*2] * (s32)i10; + WRAPPER_END(rt, ra, i10, 0); } void MPYUI(u32 rt, u32 ra, s32 i10) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, ra, i10, zz); for (int w = 0; w < 4; w++) CPU.GPR[rt]._u32[w] = CPU.GPR[ra]._u16[w*2] * (u16)(i10 & 0xffff); + WRAPPER_END(rt, ra, i10, 0); } void CEQI(u32 rt, u32 ra, s32 i10) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, ra, i10, zz); for(u32 i = 0; i < 4; ++i) - CPU.GPR[rt]._u32[i] = (CPU.GPR[ra]._i32[i] == i10) ? 0xffffffff : 0x00000000; + CPU.GPR[rt]._u32[i] = (CPU.GPR[ra]._i32[i] == (s32)i10) ? 0xffffffff : 0x00000000; + WRAPPER_END(rt, ra, i10, 0); + // TODO } void CEQHI(u32 rt, u32 ra, s32 i10) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, ra, i10, zz); for (int h = 0; h < 8; h++) - CPU.GPR[rt]._u16[h] = (CPU.GPR[ra]._i16[h] == (s16)i10) ? 0xffff : 0; + CPU.GPR[rt]._u16[h] = (CPU.GPR[ra]._i16[h] == (s16)(s32)i10) ? 0xffff : 0; + WRAPPER_END(rt, ra, i10, 0); } void CEQBI(u32 rt, u32 ra, s32 i10) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, ra, i10, zz); for (int b = 0; b < 16; b++) CPU.GPR[rt]._i8[b] = (CPU.GPR[ra]._i8[b] == (s8)(i10 & 0xff)) ? 
0xff : 0; + WRAPPER_END(rt, ra, i10, 0); } void HEQI(u32 rt, u32 ra, s32 i10) { + // TODO UNIMPLEMENTED(); if(CPU.GPR[ra]._i32[3] == i10) CPU.Stop(); } @@ -1662,35 +2253,57 @@ private: } void ILA(u32 rt, u32 i18) { - XmmVar v0(c); - c.movaps(v0, imm_xmm(s19_to_s32[i18 & 0x3ffff])); - c.movaps(cpu_xmm(GPR[rt]), v0); + WRAPPER_BEGIN(rt, i18, yy, zz); + CPU.GPR[rt]._u32[0] = + CPU.GPR[rt]._u32[1] = + CPU.GPR[rt]._u32[2] = + CPU.GPR[rt]._u32[3] = i18 & 0x3FFFF; + WRAPPER_END(rt, i18, 0, 0); + /*XmmVar v0(c); + if (i18 == 0) + { + c.xorps(v0, v0); + } + else + { + c.movaps(v0, imm_xmm(s19_to_s32[i18 & 0x3ffff])); + } + c.movaps(cpu_xmm(GPR[rt]), v0);*/ } //0 - 3 void SELB(u32 rt, u32 ra, u32 rb, u32 rc) { - UNIMPLEMENTED(); - for(u64 i = 0; i < 2; ++i) + WRAPPER_BEGIN(rt, ra, rb, rc); + for (u64 i = 0; i < 2; ++i) { CPU.GPR[rt]._u64[i] = - ( CPU.GPR[rc]._u64[i] & CPU.GPR[rb]._u64[i]) | + (CPU.GPR[rc]._u64[i] & CPU.GPR[rb]._u64[i]) | (~CPU.GPR[rc]._u64[i] & CPU.GPR[ra]._u64[i]); } + WRAPPER_END(rt, ra, rb, rc); + /*XmmVar v0(c); + XmmVar v1(c); + c.movaps(v0, cpu_xmm(GPR[ra])); + c.movaps(v1, cpu_xmm(GPR[rc])); + c.andnps(v0, v1); + c.andps(v1, cpu_xmm(GPR[rb])); + c.orps(v0, v1); + c.movaps(cpu_xmm(GPR[rt]), v0);*/ } void SHUFB(u32 rt, u32 ra, u32 rb, u32 rc) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, ra, rb, rc); const SPU_GPR_hdr _a = CPU.GPR[ra]; const SPU_GPR_hdr _b = CPU.GPR[rb]; for (int i = 0; i < 16; i++) { u8 b = CPU.GPR[rc]._u8[i]; - if(b & 0x80) + if (b & 0x80) { - if(b & 0x40) + if (b & 0x40) { - if(b & 0x20) + if (b & 0x20) CPU.GPR[rt]._u8[i] = 0x80; else CPU.GPR[rt]._u8[i] = 0xFF; @@ -1700,42 +2313,59 @@ private: } else { - if(b & 0x10) + if (b & 0x10) CPU.GPR[rt]._u8[i] = _b._u8[15 - (b & 0x0F)]; else CPU.GPR[rt]._u8[i] = _a._u8[15 - (b & 0x0F)]; } } + WRAPPER_END(rt, ra, rb, rc); + // TODO } void MPYA(u32 rt, u32 ra, u32 rb, u32 rc) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, ra, rb, rc); for (int w = 0; w < 4; w++) CPU.GPR[rt]._i32[w] = CPU.GPR[ra]._i16[w*2] 
* CPU.GPR[rb]._i16[w*2] + CPU.GPR[rc]._i32[w]; + WRAPPER_END(rt, ra, rb, rc); } void FNMS(u32 rt, u32 ra, u32 rb, u32 rc) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, ra, rb, rc); CPU.GPR[rt]._f[0] = CPU.GPR[rc]._f[0] - CPU.GPR[ra]._f[0] * CPU.GPR[rb]._f[0]; CPU.GPR[rt]._f[1] = CPU.GPR[rc]._f[1] - CPU.GPR[ra]._f[1] * CPU.GPR[rb]._f[1]; CPU.GPR[rt]._f[2] = CPU.GPR[rc]._f[2] - CPU.GPR[ra]._f[2] * CPU.GPR[rb]._f[2]; CPU.GPR[rt]._f[3] = CPU.GPR[rc]._f[3] - CPU.GPR[ra]._f[3] * CPU.GPR[rb]._f[3]; + WRAPPER_END(rt, ra, rb, rc); + /*XmmVar v0(c), v1(c); + c.movaps(v0, cpu_xmm(GPR[ra])); + c.mulps(v0, cpu_xmm(GPR[rb])); + c.movaps(v1, cpu_xmm(GPR[rc])); + c.subps(v1, v0); + c.movaps(cpu_xmm(GPR[rt]), v1);*/ } void FMA(u32 rt, u32 ra, u32 rb, u32 rc) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, ra, rb, rc); CPU.GPR[rt]._f[0] = CPU.GPR[ra]._f[0] * CPU.GPR[rb]._f[0] + CPU.GPR[rc]._f[0]; CPU.GPR[rt]._f[1] = CPU.GPR[ra]._f[1] * CPU.GPR[rb]._f[1] + CPU.GPR[rc]._f[1]; CPU.GPR[rt]._f[2] = CPU.GPR[ra]._f[2] * CPU.GPR[rb]._f[2] + CPU.GPR[rc]._f[2]; CPU.GPR[rt]._f[3] = CPU.GPR[ra]._f[3] * CPU.GPR[rb]._f[3] + CPU.GPR[rc]._f[3]; + WRAPPER_END(rt, ra, rb, rc); + /*XmmVar v0(c); + c.movaps(v0, cpu_xmm(GPR[ra])); + c.mulps(v0, cpu_xmm(GPR[rb])); + c.addps(v0, cpu_xmm(GPR[rc])); + c.movaps(cpu_xmm(GPR[rt]), v0);*/ } void FMS(u32 rt, u32 ra, u32 rb, u32 rc) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, ra, rb, rc); CPU.GPR[rt]._f[0] = CPU.GPR[ra]._f[0] * CPU.GPR[rb]._f[0] - CPU.GPR[rc]._f[0]; CPU.GPR[rt]._f[1] = CPU.GPR[ra]._f[1] * CPU.GPR[rb]._f[1] - CPU.GPR[rc]._f[1]; CPU.GPR[rt]._f[2] = CPU.GPR[ra]._f[2] * CPU.GPR[rb]._f[2] - CPU.GPR[rc]._f[2]; CPU.GPR[rt]._f[3] = CPU.GPR[ra]._f[3] * CPU.GPR[rb]._f[3] - CPU.GPR[rc]._f[3]; + WRAPPER_END(rt, ra, rb, rc); } void UNK(u32 code, u32 opcode, u32 gcode) diff --git a/rpcs3/Emu/Cell/SPURecompilerCore.cpp b/rpcs3/Emu/Cell/SPURecompilerCore.cpp index bf3662c399..940ef44a4a 100644 --- a/rpcs3/Emu/Cell/SPURecompilerCore.cpp +++ 
b/rpcs3/Emu/Cell/SPURecompilerCore.cpp @@ -7,7 +7,7 @@ static const SPUImmTable g_spu_imm; SPURecompilerCore::SPURecompilerCore(SPUThread& cpu) : m_enc(new SPURecompiler(cpu, *this)) -, m_inter(new SPUInterpreter(cpu)) +, inter(new SPUInterpreter(cpu)) , CPU(cpu) , compiler(&runtime) { @@ -17,17 +17,17 @@ SPURecompilerCore::SPURecompilerCore(SPUThread& cpu) SPURecompilerCore::~SPURecompilerCore() { delete m_enc; - delete m_inter; + delete inter; } void SPURecompilerCore::Decode(const u32 code) // decode instruction and run with interpreter { - (*SPU_instr::rrr_list)(m_inter, code); + (*SPU_instr::rrr_list)(inter, code); } void SPURecompilerCore::Compile(u16 pos) { - compiler.addFunc(kFuncConvHost, FuncBuilder4()); + compiler.addFunc(kFuncConvHost, FuncBuilder4()); entry[pos].host = pos; GpVar cpu_var(compiler, kVarTypeIntPtr, "cpu"); @@ -45,15 +45,26 @@ void SPURecompilerCore::Compile(u16 pos) compiler.alloc(imm_var); m_enc->imm_var = &imm_var; - GpVar pos_var(compiler, kVarTypeUInt16, "pos"); + GpVar pos_var(compiler, kVarTypeUInt32, "pos"); compiler.setArg(3, pos_var); compiler.alloc(pos_var); + m_enc->pos_var = &pos_var; + + compiler.xor_(pos_var, pos_var); + while (true) { const u32 opcode = Memory.Read32(CPU.dmac.ls_offset + pos * 4); m_enc->do_finalize = false; - (*SPU_instr::rrr_list)(m_enc, opcode); // compile single opcode + if (opcode) + { + (*SPU_instr::rrr_list)(m_enc, opcode); // compile single opcode + } + else + { + m_enc->do_finalize = true; + } bool fin = m_enc->do_finalize; entry[pos].valid = opcode; @@ -63,7 +74,6 @@ void SPURecompilerCore::Compile(u16 pos) entry[pos].host = entry[pos - 1].host; } - compiler.xor_(pos_var, pos_var); compiler.ret(pos_var); compiler.endFunc(); entry[entry[pos].host].pointer = compiler.make(); @@ -74,6 +84,7 @@ u8 SPURecompilerCore::DecodeMemory(const u64 address) const u64 m_offset = address - CPU.PC; const u16 pos = (CPU.PC >> 2); + //ConLog.Write("DecodeMemory: pos=%d", pos); u32* ls = 
(u32*)Memory.VirtualToRealAddr(m_offset); if (!pos) @@ -115,16 +126,16 @@ u8 SPURecompilerCore::DecodeMemory(const u64 address) return 0; } // jump - typedef u16(*Func)(void* _cpu, void* _ls, const SPUImmTable* _imm, u16 _pos); + typedef u32(*Func)(void* _cpu, void* _ls, const SPUImmTable* _imm, u32 _pos); Func func = asmjit_cast(entry[entry[pos].host].pointer); void* cpu = (u8*)&CPU.GPR[0] - offsetof(SPUThread, GPR[0]); // ugly cpu base offset detection - u16 res = pos == entry[pos].host ? 0 : pos; - res = func(cpu, ls, &g_spu_imm, res); + u16 res = (pos == entry[pos].host) ? 0 : pos; + res = (u16)func(cpu, ls, &g_spu_imm, res); - ConLog.Write("func -> %d", res); + CPU.SetBranch((u64)res << 2); return 0; /*Decode(Memory.Read32(address)); diff --git a/rpcs3/Emu/Cell/SPUThread.cpp b/rpcs3/Emu/Cell/SPUThread.cpp index e3cc2b8db0..f93e39f4d5 100644 --- a/rpcs3/Emu/Cell/SPUThread.cpp +++ b/rpcs3/Emu/Cell/SPUThread.cpp @@ -3,6 +3,7 @@ #include "Emu/Cell/SPUDecoder.h" #include "Emu/Cell/SPUInterpreter.h" #include "Emu/Cell/SPUDisAsm.h" +#include "Emu/Cell/SPURecompiler.h" SPUThread& GetCurrentSPUThread() { @@ -75,6 +76,8 @@ void SPUThread::DoRun() break; case 1: + m_dec = new SPURecompilerCore(*this); + break; case 2: m_dec = new SPUDecoder(*new SPUInterpreter(*this)); break; diff --git a/rpcs3/Emu/Cell/SPUThread.h b/rpcs3/Emu/Cell/SPUThread.h index 5c947664c6..a581130473 100644 --- a/rpcs3/Emu/Cell/SPUThread.h +++ b/rpcs3/Emu/Cell/SPUThread.h @@ -213,20 +213,21 @@ public: union SPU_GPR_hdr { + u32 _u32[4]; + float _f[4]; u128 _u128; s128 _i128; __m128 _m128; __m128i _m128i; u64 _u64[2]; s64 _i64[2]; - u32 _u32[4]; s32 _i32[4]; u16 _u16[8]; s16 _i16[8]; u8 _u8[16]; s8 _i8[16]; double _d[2]; - float _f[4]; + SPU_GPR_hdr() {} @@ -243,9 +244,9 @@ union SPU_GPR_hdr union SPU_SPR_hdr { + u32 _u32[4]; u128 _u128; s128 _i128; - u32 _u32[4]; SPU_SPR_hdr() {} @@ -299,19 +300,19 @@ public: #else static const bool x86 = true; #endif - - private: union _CRT_ALIGN(8) { struct { 
volatile u32 m_index; u32 m_value[max_count]; }; + struct { + volatile u32 m_index2; + u16 m_val16[max_count * 2]; + }; volatile u64 m_indval; }; std::mutex m_lock; - public: - Channel() { Init(); @@ -586,7 +587,7 @@ public: } } - Sleep(1); // hack + //Sleep(1); // hack switch(cmd & ~(MFC_BARRIER_MASK | MFC_FENCE_MASK | MFC_LIST_MASK | MFC_RESULT_MASK)) { @@ -1125,6 +1126,115 @@ public: if (Emu.IsStopped()) ConLog.Warning("%s(%s) aborted", __FUNCTION__, spu_ch_name[ch]); } + void DoStop(u32 code) + { + SetExitStatus(code); // exit code (not status) + + switch (code) + { + case 0x110: /* ===== sys_spu_thread_receive_event ===== */ + { + u32 spuq = 0; + if (!SPU.Out_MBox.Pop(spuq)) + { + ConLog.Error("sys_spu_thread_receive_event: cannot read Out_MBox"); + SPU.In_MBox.PushUncond(CELL_EINVAL); // ??? + return; + } + + if (SPU.In_MBox.GetCount()) + { + ConLog.Error("sys_spu_thread_receive_event(spuq=0x%x): In_MBox is not empty", spuq); + SPU.In_MBox.PushUncond(CELL_EBUSY); // ??? + return; + } + + if (Ini.HLELogging.GetValue()) + { + ConLog.Write("sys_spu_thread_receive_event(spuq=0x%x)", spuq); + } + + EventQueue* eq; + if (!SPUQs.GetEventQueue(FIX_SPUQ(spuq), eq)) + { + SPU.In_MBox.PushUncond(CELL_EINVAL); // TODO: check error value + return; + } + + u32 tid = GetId(); + + eq->sq.push(tid); // add thread to sleep queue + + while (true) + { + switch (eq->owner.trylock(tid)) + { + case SMR_OK: + if (!eq->events.count()) + { + eq->owner.unlock(tid); + break; + } + else + { + u32 next = (eq->protocol == SYS_SYNC_FIFO) ? 
eq->sq.pop() : eq->sq.pop_prio(); + if (next != tid) + { + eq->owner.unlock(tid, next); + break; + } + } + case SMR_SIGNAL: + { + sys_event_data event; + eq->events.pop(event); + eq->owner.unlock(tid); + SPU.In_MBox.PushUncond(CELL_OK); + SPU.In_MBox.PushUncond(event.data1); + SPU.In_MBox.PushUncond(event.data2); + SPU.In_MBox.PushUncond(event.data3); + return; + } + case SMR_FAILED: break; + default: eq->sq.invalidate(tid); SPU.In_MBox.PushUncond(CELL_ECANCELED); return; + } + + Sleep(1); + if (Emu.IsStopped()) + { + ConLog.Warning("sys_spu_thread_receive_event(spuq=0x%x) aborted", spuq); + eq->sq.invalidate(tid); + return; + } + } + } + break; + case 0x102: + if (!SPU.Out_MBox.GetCount()) + { + ConLog.Error("sys_spu_thread_exit (no status, code 0x102)"); + } + else if (Ini.HLELogging.GetValue()) + { + // the real exit status + ConLog.Write("sys_spu_thread_exit (status=0x%x)", SPU.Out_MBox.GetValue()); + } + Stop(); + break; + default: + if (!SPU.Out_MBox.GetCount()) + { + ConLog.Error("Unknown STOP code: 0x%x (no message)", code); + } + else + { + ConLog.Error("Unknown STOP code: 0x%x (message=0x%x)", code, SPU.Out_MBox.GetValue()); + } + Stop(); + break; + } + } + bool IsGoodLSA(const u32 lsa) const { return Memory.IsGoodAddr(lsa + m_offset) && lsa < 0x40000; } virtual u8 ReadLS8 (const u32 lsa) const { return Memory.Read8 (lsa + m_offset); } // m_offset & 0x3fffc ????? 
virtual u16 ReadLS16 (const u32 lsa) const { return Memory.Read16 (lsa + m_offset); } diff --git a/rpcs3/rpcs3.vcxproj b/rpcs3/rpcs3.vcxproj index dc4dfcfc0b..cf6ff47c3d 100644 --- a/rpcs3/rpcs3.vcxproj +++ b/rpcs3/rpcs3.vcxproj @@ -393,6 +393,7 @@ + diff --git a/rpcs3/rpcs3.vcxproj.filters b/rpcs3/rpcs3.vcxproj.filters index 1e00a296fe..3589bf81ee 100644 --- a/rpcs3/rpcs3.vcxproj.filters +++ b/rpcs3/rpcs3.vcxproj.filters @@ -702,5 +702,8 @@ Utilities + + Include + \ No newline at end of file From f9b68bc01266847201d9f255f785b4af3c09b843 Mon Sep 17 00:00:00 2001 From: Nekotekina Date: Mon, 7 Apr 2014 17:06:13 +0400 Subject: [PATCH 03/14] SPU JIT fix Nothing changed in interpreter. --- rpcs3/Emu/Cell/SPUInterpreter.h | 121 ++++++++++++--- rpcs3/Emu/Cell/SPURecompiler.h | 215 ++++++++++++++++++--------- rpcs3/Emu/Cell/SPURecompilerCore.cpp | 43 ++++-- rpcs3/Emu/Cell/SPUThread.h | 6 +- 4 files changed, 280 insertions(+), 105 deletions(-) diff --git a/rpcs3/Emu/Cell/SPUInterpreter.h b/rpcs3/Emu/Cell/SPUInterpreter.h index fe55ed219e..353cccb537 100644 --- a/rpcs3/Emu/Cell/SPUInterpreter.h +++ b/rpcs3/Emu/Cell/SPUInterpreter.h @@ -4,6 +4,7 @@ #include "Emu/Memory/Memory.h" #include "Emu/Cell/SPUThread.h" #include "Emu/SysCalls/SysCalls.h" +#include "Crypto/sha1.h" #define UNIMPLEMENTED() UNK(__FUNCTION__) @@ -14,6 +15,8 @@ __m128d m128d; } __u32x4; */ +#define LOG2_OPCODE(...) 
//unsigned char cs[20]; sha1(&Memory[CPU.dmac.ls_offset], 256*1024, cs); ConLog.Write("Mem Dump: 0x%llx", *(u64*)cs); ConLog.Write(__FUNCTION__ "(): " __VA_ARGS__) + class SPUInterpreter : public SPUOpcodes { private: @@ -267,23 +270,55 @@ private: } void BIZ(u32 rt, u32 ra) { - if(CPU.GPR[rt]._u32[3] == 0) - CPU.SetBranch(branchTarget(CPU.GPR[ra]._u32[3], 0)); + u64 target = branchTarget(CPU.GPR[ra]._u32[3], 0); + if (CPU.GPR[rt]._u32[3] == 0) + { + LOG2_OPCODE("taken (0x%llx)", target); + CPU.SetBranch(target); + } + else + { + LOG2_OPCODE("not taken (0x%llx)", target); + } } void BINZ(u32 rt, u32 ra) { - if(CPU.GPR[rt]._u32[3] != 0) - CPU.SetBranch(branchTarget(CPU.GPR[ra]._u32[3], 0)); + u64 target = branchTarget(CPU.GPR[ra]._u32[3], 0); + if (CPU.GPR[rt]._u32[3] != 0) + { + LOG2_OPCODE("taken (0x%llx)", target); + CPU.SetBranch(target); + } + else + { + LOG2_OPCODE("not taken (0x%llx)", target); + } } void BIHZ(u32 rt, u32 ra) { - if(CPU.GPR[rt]._u16[6] == 0) - CPU.SetBranch(branchTarget(CPU.GPR[ra]._u32[3], 0)); + u64 target = branchTarget(CPU.GPR[ra]._u32[3], 0); + if (CPU.GPR[rt]._u16[6] == 0) + { + LOG2_OPCODE("taken (0x%llx)", target); + CPU.SetBranch(target); + } + else + { + LOG2_OPCODE("not taken (0x%llx)", target); + } } void BIHNZ(u32 rt, u32 ra) { - if(CPU.GPR[rt]._u16[6] != 0) - CPU.SetBranch(branchTarget(CPU.GPR[ra]._u32[3], 0)); + u64 target = branchTarget(CPU.GPR[ra]._u32[3], 0); + if (CPU.GPR[rt]._u16[6] != 0) + { + LOG2_OPCODE("taken (0x%llx)", target); + CPU.SetBranch(target); + } + else + { + LOG2_OPCODE("not taken (0x%llx)", target); + } } void STOPD(u32 rc, u32 ra, u32 rb) { @@ -304,14 +339,17 @@ private: } void BI(u32 ra) { - CPU.SetBranch(branchTarget(CPU.GPR[ra]._u32[3], 0)); + u64 target = branchTarget(CPU.GPR[ra]._u32[3], 0); + LOG2_OPCODE("branch (0x%llx)", target); + CPU.SetBranch(target); } void BISL(u32 rt, u32 ra) { - const u32 NewPC = CPU.GPR[ra]._u32[3]; + u64 target = branchTarget(CPU.GPR[ra]._u32[3], 0); CPU.GPR[rt].Reset(); 
CPU.GPR[rt]._u32[3] = CPU.PC + 4; - CPU.SetBranch(branchTarget(NewPC, 0)); + LOG2_OPCODE("branch (0x%llx)", target); + CPU.SetBranch(target); } void IRET(u32 ra) { @@ -1048,8 +1086,16 @@ private: //0 - 8 void BRZ(u32 rt, s32 i16) { + u64 target = branchTarget(CPU.PC, i16); if (CPU.GPR[rt]._u32[3] == 0) - CPU.SetBranch(branchTarget(CPU.PC, i16)); + { + LOG2_OPCODE("taken (0x%llx)", target); + CPU.SetBranch(target); + } + else + { + LOG2_OPCODE("not taken (0x%llx)", target); + } } void STQA(u32 rt, s32 i16) { @@ -1065,18 +1111,42 @@ private: } void BRNZ(u32 rt, s32 i16) { + u64 target = branchTarget(CPU.PC, i16); if (CPU.GPR[rt]._u32[3] != 0) - CPU.SetBranch(branchTarget(CPU.PC, i16)); + { + LOG2_OPCODE("taken (0x%llx)", target); + CPU.SetBranch(target); + } + else + { + LOG2_OPCODE("not taken (0x%llx)", target); + } } void BRHZ(u32 rt, s32 i16) { - if (CPU.GPR[rt]._u16[6] == 0) - CPU.SetBranch(branchTarget(CPU.PC, i16)); + u64 target = branchTarget(CPU.PC, i16); + if (CPU.GPR[rt]._u16[6] == 0) + { + LOG2_OPCODE("taken (0x%llx)", target); + CPU.SetBranch(target); + } + else + { + LOG2_OPCODE("not taken (0x%llx)", target); + } } void BRHNZ(u32 rt, s32 i16) { - if (CPU.GPR[rt]._u16[6] != 0) - CPU.SetBranch(branchTarget(CPU.PC, i16)); + u64 target = branchTarget(CPU.PC, i16); + if (CPU.GPR[rt]._u16[6] != 0) + { + LOG2_OPCODE("taken (0x%llx)", target); + CPU.SetBranch(target); + } + else + { + LOG2_OPCODE("not taken (0x%llx)", target); + } } void STQR(u32 rt, s32 i16) { @@ -1092,7 +1162,9 @@ private: } void BRA(s32 i16) { - CPU.SetBranch(branchTarget(0, i16)); + u64 target = branchTarget(0, i16); + LOG2_OPCODE("branch (0x%llx)", target); + CPU.SetBranch(target); } void LQA(u32 rt, s32 i16) { @@ -1108,13 +1180,17 @@ private: } void BRASL(u32 rt, s32 i16) { + u64 target = branchTarget(0, i16); CPU.GPR[rt].Reset(); CPU.GPR[rt]._u32[3] = CPU.PC + 4; - CPU.SetBranch(branchTarget(0, i16)); + LOG2_OPCODE("branch (0x%llx)", target); + CPU.SetBranch(target); } void BR(s32 i16) { 
- CPU.SetBranch(branchTarget(CPU.PC, i16)); + u64 target = branchTarget(CPU.PC, i16); + LOG2_OPCODE("branch (0x%llx)", target); + CPU.SetBranch(target); } void FSMBI(u32 rt, s32 i16) { @@ -1134,9 +1210,11 @@ private: } void BRSL(u32 rt, s32 i16) { + u64 target = branchTarget(CPU.PC, i16); CPU.GPR[rt].Reset(); CPU.GPR[rt]._u32[3] = CPU.PC + 4; - CPU.SetBranch(branchTarget(CPU.PC, i16)); + LOG2_OPCODE("branch (0x%llx)", target); + CPU.SetBranch(target); } void LQR(u32 rt, s32 i16) { @@ -1236,6 +1314,7 @@ private: Emu.Pause(); return; } + //ConLog.Write("STQD(lsa=0x%x): GPR[%d] (0x%llx%llx)", lsa, rt, CPU.GPR[rt]._u64[1], CPU.GPR[rt]._u64[0]); CPU.WriteLS128(lsa, CPU.GPR[rt]._u128); } void LQD(u32 rt, s32 i10, u32 ra) //i10 is shifted left by 4 while decoding diff --git a/rpcs3/Emu/Cell/SPURecompiler.h b/rpcs3/Emu/Cell/SPURecompiler.h index 478d5d3d16..a1e636ee2b 100644 --- a/rpcs3/Emu/Cell/SPURecompiler.h +++ b/rpcs3/Emu/Cell/SPURecompiler.h @@ -87,7 +87,7 @@ public: struct SPURecEntry { - u16 host; // absolute position of first instruction of current block + //u16 host; // absolute position of first instruction of current block (not used now) u16 count; // count of instructions compiled from current point (and to be checked) u32 valid; // copy of valid opcode for validation void* pointer; // pointer to executable memory object @@ -114,6 +114,8 @@ public: #define imm_xmm(x) oword_ptr(*imm_var, offsetof(SPUImmTable, x)) +#define LOG_OPCODE(...) 
//ConLog.Write(__FUNCTION__ "()" __VA_ARGS__) + #define WRAPPER_BEGIN(a0, a1, a2, a3) struct opcode_wrapper \ { \ static void opcode(u32 a0, u32 a1, u32 a2, u32 a3) \ @@ -122,11 +124,13 @@ public: #define WRAPPER_END(a0, a1, a2, a3) } \ }; \ + c.mov(cpu_qword(PC), (u32)CPU.PC); \ X86X64CallNode* call = c.call(imm_ptr(&opcode_wrapper::opcode), kFuncConvHost, FuncBuilder4()); \ call->setArg(0, imm_u(a0)); \ call->setArg(1, imm_u(a1)); \ call->setArg(2, imm_u(a2)); \ - call->setArg(3, imm_u(a3)); + call->setArg(3, imm_u(a3)); \ + LOG_OPCODE(); class SPURecompiler : public SPUOpcodes @@ -156,26 +160,31 @@ private: WRAPPER_END(code, 0, 0, 0); c.mov(*pos_var, (CPU.PC >> 2) + 1); do_finalize = true; - ConLog.Write("STOP(code=%d)", code); } void LNOP() { - /*c.mov(*pos_var, (CPU.PC >> 2) + 1); + c.mov(cpu_qword(PC), (u32)CPU.PC); + /* do_finalize = true; - ConLog.Write("LNOP()");*/ + c.mov(*pos_var, (CPU.PC >> 2) + 1); + */ + LOG_OPCODE(); } void SYNC(u32 Cbit) { + c.mov(cpu_qword(PC), (u32)CPU.PC); // This instruction must be used following a store instruction that modifies the instruction stream. c.mfence(); c.mov(*pos_var, (CPU.PC >> 2) + 1); do_finalize = true; - ConLog.Write("SYNC()"); + LOG_OPCODE(); } void DSYNC() { + c.mov(cpu_qword(PC), (u32)CPU.PC); // This instruction forces all earlier load, store, and channel instructions to complete before proceeding. 
c.mfence(); + LOG_OPCODE(); } void MFSPR(u32 rt, u32 sa) { @@ -563,27 +572,55 @@ private: } void BIZ(u32 rt, u32 ra) { - UNIMPLEMENTED(); - if(CPU.GPR[rt]._u32[3] == 0) - CPU.SetBranch(branchTarget(CPU.GPR[ra]._u32[3], 0)); + c.mov(cpu_qword(PC), (u32)CPU.PC); + do_finalize = true; + + GpVar pos_next(c, kVarTypeUInt32); + c.mov(pos_next, (u32)CPU.PC + 4); + c.mov(*pos_var, cpu_dword(GPR[ra]._u32[3])); + c.cmp(cpu_dword(GPR[rt]._u32[3]), 0); + c.cmovne(*pos_var, pos_next); + c.shr(*pos_var, 2); + LOG_OPCODE(); } void BINZ(u32 rt, u32 ra) { - UNIMPLEMENTED(); - if(CPU.GPR[rt]._u32[3] != 0) - CPU.SetBranch(branchTarget(CPU.GPR[ra]._u32[3], 0)); + c.mov(cpu_qword(PC), (u32)CPU.PC); + do_finalize = true; + + GpVar pos_next(c, kVarTypeUInt32); + c.mov(pos_next, (u32)CPU.PC + 4); + c.mov(*pos_var, cpu_dword(GPR[ra]._u32[3])); + c.cmp(cpu_dword(GPR[rt]._u32[3]), 0); + c.cmove(*pos_var, pos_next); + c.shr(*pos_var, 2); + LOG_OPCODE(); } void BIHZ(u32 rt, u32 ra) { - UNIMPLEMENTED(); - if(CPU.GPR[rt]._u16[6] == 0) - CPU.SetBranch(branchTarget(CPU.GPR[ra]._u32[3], 0)); + c.mov(cpu_qword(PC), (u32)CPU.PC); + do_finalize = true; + + GpVar pos_next(c, kVarTypeUInt32); + c.mov(pos_next, (u32)CPU.PC + 4); + c.mov(*pos_var, cpu_dword(GPR[ra]._u32[3])); + c.cmp(cpu_word(GPR[rt]._u16[6]), 0); + c.cmovne(*pos_var, pos_next); + c.shr(*pos_var, 2); + LOG_OPCODE(); } void BIHNZ(u32 rt, u32 ra) { - UNIMPLEMENTED(); - if(CPU.GPR[rt]._u16[6] != 0) - CPU.SetBranch(branchTarget(CPU.GPR[ra]._u32[3], 0)); + c.mov(cpu_qword(PC), (u32)CPU.PC); + do_finalize = true; + + GpVar pos_next(c, kVarTypeUInt32); + c.mov(pos_next, (u32)CPU.PC + 4); + c.mov(*pos_var, cpu_dword(GPR[ra]._u32[3])); + c.cmp(cpu_word(GPR[rt]._u16[6]), 0); + c.cmove(*pos_var, pos_next); + c.shr(*pos_var, 2); + LOG_OPCODE(); } void STOPD(u32 rc, u32 ra, u32 rb) { @@ -606,23 +643,26 @@ private: } void BI(u32 ra) { + c.mov(cpu_qword(PC), (u32)CPU.PC); do_finalize = true; + c.mov(*pos_var, cpu_dword(GPR[ra]._u32[3])); c.shr(*pos_var, 
2); - //ConLog.Write("BI(ra=%d)", ra); + LOG_OPCODE(); } void BISL(u32 rt, u32 ra) { + c.mov(cpu_qword(PC), (u32)CPU.PC); do_finalize = true; - c.int3(); + c.xor_(*pos_var, *pos_var); c.mov(cpu_dword(GPR[rt]._u32[0]), *pos_var); c.mov(cpu_dword(GPR[rt]._u32[1]), *pos_var); c.mov(cpu_dword(GPR[rt]._u32[2]), *pos_var); c.mov(*pos_var, cpu_dword(GPR[ra]._u32[3])); - c.mov(cpu_dword(GPR[rt]._u32[3]), (CPU.PC >> 2) + 1); + c.mov(cpu_dword(GPR[rt]._u32[3]), (u32)CPU.PC + 4); c.shr(*pos_var, 2); - ConLog.Write("BISL(rt=%d,ra=%d)", rt, ra); + LOG_OPCODE(); } void IRET(u32 ra) { @@ -635,6 +675,7 @@ private: } void HBR(u32 p, u32 ro, u32 ra) { + LOG_OPCODE(); } void GB(u32 rt, u32 ra) { @@ -885,7 +926,7 @@ private: void CHD(u32 rt, u32 ra, s32 i7) { WRAPPER_BEGIN(rt, ra, i7, zz); - const int t = (CPU.GPR[ra]._u32[3] + i7) & 0xE; + const int t = (CPU.GPR[ra]._u32[3] + (s32)i7) & 0xE; CPU.GPR[rt]._u64[0] = (u64)0x18191A1B1C1D1E1F; CPU.GPR[rt]._u64[1] = (u64)0x1011121314151617; @@ -895,7 +936,7 @@ private: void CWD(u32 rt, u32 ra, s32 i7) { WRAPPER_BEGIN(rt, ra, i7, zz); - const int t = (CPU.GPR[ra]._u32[3] + i7) & 0xC; + const int t = (CPU.GPR[ra]._u32[3] + (s32)i7) & 0xC; CPU.GPR[rt]._u64[0] = (u64)0x18191A1B1C1D1E1F; CPU.GPR[rt]._u64[1] = (u64)0x1011121314151617; @@ -1045,6 +1086,7 @@ private: } void NOP(u32 rt) { + LOG_OPCODE(); } void CGT(u32 rt, u32 ra, u32 rb) { @@ -1096,8 +1138,11 @@ private: //HGT uses signed values. 
HLGT uses unsigned values void HGT(u32 rt, s32 ra, s32 rb) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, ra, rb, zz); if(CPU.GPR[ra]._i32[3] > CPU.GPR[rb]._i32[3]) CPU.Stop(); + WRAPPER_END(rt, ra, rb, 0); + c.mov(*pos_var, (CPU.PC >> 2) + 1); + do_finalize = true; } void CLZ(u32 rt, u32 ra) { @@ -1287,8 +1332,11 @@ private: } void HLGT(u32 rt, u32 ra, u32 rb) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, ra, rb, zz); if(CPU.GPR[ra]._u32[3] > CPU.GPR[rb]._u32[3]) CPU.Stop(); + WRAPPER_END(rt, ra, rb, 0); + c.mov(*pos_var, (CPU.PC >> 2) + 1); + do_finalize = true; } void DFMA(u32 rt, u32 ra, u32 rb) { @@ -1567,8 +1615,11 @@ private: } void HEQ(u32 rt, u32 ra, u32 rb) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, ra, rb, zz); if(CPU.GPR[ra]._i32[3] == CPU.GPR[rb]._i32[3]) CPU.Stop(); + WRAPPER_END(rt, ra, rb, 0); + c.mov(*pos_var, (CPU.PC >> 2) + 1); + do_finalize = true; } //0 - 9 @@ -1687,13 +1738,15 @@ private: //0 - 8 void BRZ(u32 rt, s32 i16) { + c.mov(cpu_qword(PC), (u32)CPU.PC); do_finalize = true; + GpVar pos_next(c, kVarTypeUInt32); c.mov(pos_next, (CPU.PC >> 2) + 1); c.mov(*pos_var, branchTarget(CPU.PC, i16) >> 2); c.cmp(cpu_dword(GPR[rt]._u32[3]), 0); - c.cmovnz(*pos_var, pos_next); - //ConLog.Write("BRZ(rt=%d,i16=%d)", rt, i16); + c.cmovne(*pos_var, pos_next); + LOG_OPCODE(); } void STQA(u32 rt, s32 i16) { @@ -1711,45 +1764,50 @@ private: } void BRNZ(u32 rt, s32 i16) { + c.mov(cpu_qword(PC), (u32)CPU.PC); do_finalize = true; + GpVar pos_next(c, kVarTypeUInt32); c.mov(pos_next, (CPU.PC >> 2) + 1); c.mov(*pos_var, branchTarget(CPU.PC, i16) >> 2); c.cmp(cpu_dword(GPR[rt]._u32[3]), 0); - c.cmovz(*pos_var, pos_next); - //ConLog.Write("BRNZ(rt=%d,i16=%d)", rt, i16); + c.cmove(*pos_var, pos_next); + LOG_OPCODE(); } void BRHZ(u32 rt, s32 i16) { + c.mov(cpu_qword(PC), (u32)CPU.PC); do_finalize = true; + GpVar pos_next(c, kVarTypeUInt32); c.mov(pos_next, (CPU.PC >> 2) + 1); c.mov(*pos_var, branchTarget(CPU.PC, i16) >> 2); c.cmp(cpu_word(GPR[rt]._u16[6]), 0); c.cmovnz(*pos_var, 
pos_next); - ConLog.Write("BRHZ(rt=%d,i16=%d)", rt, i16); + LOG_OPCODE(); } void BRHNZ(u32 rt, s32 i16) { + c.mov(cpu_qword(PC), (u32)CPU.PC); do_finalize = true; + GpVar pos_next(c, kVarTypeUInt32); c.mov(pos_next, (CPU.PC >> 2) + 1); c.mov(*pos_var, branchTarget(CPU.PC, i16) >> 2); c.cmp(cpu_word(GPR[rt]._u16[6]), 0); c.cmovz(*pos_var, pos_next); - ConLog.Write("BRHNZ(rt=%d,i16=%d)", rt, i16); + LOG_OPCODE(); } void STQR(u32 rt, s32 i16) { WRAPPER_BEGIN(rt, i16, PC, zz); - u32 lsa = branchTarget(PC, i16) & 0x3fff0; + u32 lsa = branchTarget(PC, (s32)i16) & 0x3fff0; if (!CPU.IsGoodLSA(lsa)) { ConLog.Error("STQR: bad lsa (0x%x)", lsa); Emu.Pause(); return; } - CPU.WriteLS128(lsa, CPU.GPR[rt]._u128); WRAPPER_END(rt, i16, CPU.PC, 0); /*u32 lsa = branchTarget(CPU.PC, i16) & 0x3fff0; @@ -1765,8 +1823,11 @@ private: } void BRA(s32 i16) { - UNIMPLEMENTED(); - CPU.SetBranch(branchTarget(0, i16)); + c.mov(cpu_qword(PC), (u32)CPU.PC); + do_finalize = true; + + c.mov(*pos_var, branchTarget(0, i16) >> 2); + LOG_OPCODE(); } void LQA(u32 rt, s32 i16) { @@ -1784,16 +1845,24 @@ private: } void BRASL(u32 rt, s32 i16) { - UNIMPLEMENTED(); - CPU.GPR[rt].Reset(); - CPU.GPR[rt]._u32[3] = CPU.PC + 4; - CPU.SetBranch(branchTarget(0, i16)); + c.mov(cpu_qword(PC), (u32)CPU.PC); + do_finalize = true; + + GpVar v0(c, kVarTypeUInt64); + c.xor_(v0, v0); + c.mov(cpu_qword(GPR[rt]._u64[1]), v0); + c.mov(cpu_qword(GPR[rt]._u64[0]), v0); + c.mov(cpu_dword(GPR[rt]._u32[3]), (u32)CPU.PC + 4); + c.mov(*pos_var, branchTarget(0, i16) >> 2); + LOG_OPCODE(); } void BR(s32 i16) { + c.mov(cpu_qword(PC), (u32)CPU.PC); do_finalize = true; + c.mov(*pos_var, branchTarget(CPU.PC, i16) >> 2); - //ConLog.Write("BR(i16=%d)", i16); + LOG_OPCODE(); } void FSMBI(u32 rt, s32 i16) { @@ -1818,27 +1887,27 @@ private: } void BRSL(u32 rt, s32 i16) { + c.mov(cpu_qword(PC), (u32)CPU.PC); + do_finalize = true; + GpVar v0(c, kVarTypeUInt64); c.xor_(v0, v0); c.mov(cpu_qword(GPR[rt]._u64[1]), v0); 
c.mov(cpu_qword(GPR[rt]._u64[0]), v0); - c.mov(cpu_dword(GPR[rt]._u32[3]), CPU.PC + 4); - - do_finalize = true; + c.mov(cpu_dword(GPR[rt]._u32[3]), (u32)CPU.PC + 4); c.mov(*pos_var, branchTarget(CPU.PC, i16) >> 2); - //ConLog.Write("BRSL(rt=%d,i16=%d)", rt, i16); + LOG_OPCODE(); } void LQR(u32 rt, s32 i16) { WRAPPER_BEGIN(rt, i16, PC, zz); - u32 lsa = branchTarget(PC, i16) & 0x3fff0; + u32 lsa = branchTarget(PC, (s32)i16) & 0x3fff0; if (!CPU.IsGoodLSA(lsa)) { ConLog.Error("LQR: bad lsa (0x%x)", lsa); Emu.Pause(); return; } - CPU.GPR[rt]._u128 = CPU.ReadLS128(lsa); WRAPPER_END(rt, i16, CPU.PC, 0); /*u32 lsa = branchTarget(CPU.PC, i16) & 0x3fff0; @@ -1858,7 +1927,7 @@ private: CPU.GPR[rt]._i32[0] = CPU.GPR[rt]._i32[1] = CPU.GPR[rt]._i32[2] = - CPU.GPR[rt]._i32[3] = i16; + CPU.GPR[rt]._i32[3] = (s32)i16; WRAPPER_END(rt, i16, 0, 0); /*XmmVar v0(c); if (i16 == 0) @@ -1879,7 +1948,7 @@ private: { WRAPPER_BEGIN(rt, i16, yy, zz); for (int w = 0; w < 4; w++) - CPU.GPR[rt]._i32[w] = i16 << 16; + CPU.GPR[rt]._i32[w] = (s32)i16 << 16; WRAPPER_END(rt, i16, 0, 0); /*XmmVar v0(c); if (i16 == 0) @@ -2013,10 +2082,10 @@ private: void AI(u32 rt, u32 ra, s32 i10) { WRAPPER_BEGIN(rt, ra, i10, zz); - CPU.GPR[rt]._i32[0] = CPU.GPR[ra]._i32[0] + i10; - CPU.GPR[rt]._i32[1] = CPU.GPR[ra]._i32[1] + i10; - CPU.GPR[rt]._i32[2] = CPU.GPR[ra]._i32[2] + i10; - CPU.GPR[rt]._i32[3] = CPU.GPR[ra]._i32[3] + i10; + CPU.GPR[rt]._i32[0] = CPU.GPR[ra]._i32[0] + (s32)i10; + CPU.GPR[rt]._i32[1] = CPU.GPR[ra]._i32[1] + (s32)i10; + CPU.GPR[rt]._i32[2] = CPU.GPR[ra]._i32[2] + (s32)i10; + CPU.GPR[rt]._i32[3] = CPU.GPR[ra]._i32[3] + (s32)i10; WRAPPER_END(rt, ra, i10, 0); /*XmmVar v0(c); if (i10 == 0) @@ -2050,13 +2119,14 @@ private: void STQD(u32 rt, s32 i10, u32 ra) //i10 is shifted left by 4 while decoding { WRAPPER_BEGIN(rt, i10, ra, zz); - const u32 lsa = (CPU.GPR[ra]._i32[3] + i10) & 0x3fff0; + const u32 lsa = (CPU.GPR[ra]._i32[3] + (s32)i10) & 0x3fff0; if (!CPU.IsGoodLSA(lsa)) { ConLog.Error("STQD: bad 
lsa (0x%x)", lsa); Emu.Pause(); return; } + //ConLog.Write("wrapper::STQD (lsa=0x%x): GPR[%d] (0x%llx%llx)", lsa, rt, CPU.GPR[rt]._u64[1], CPU.GPR[rt]._u64[0]); CPU.WriteLS128(lsa, CPU.GPR[rt]._u128); WRAPPER_END(rt, i10, ra, 0); /*GpVar lsa(c, kVarTypeUInt32); @@ -2076,7 +2146,7 @@ private: void LQD(u32 rt, s32 i10, u32 ra) //i10 is shifted left by 4 while decoding { WRAPPER_BEGIN(rt, i10, ra, zz); - const u32 lsa = (CPU.GPR[ra]._i32[3] + i10) & 0x3fff0; + const u32 lsa = (CPU.GPR[ra]._i32[3] + (s32)i10) & 0x3fff0; if (!CPU.IsGoodLSA(lsa)) { ConLog.Error("LQD: bad lsa (0x%x)", lsa); @@ -2145,14 +2215,19 @@ private: } void HGTI(u32 rt, u32 ra, s32 i10) { - UNIMPLEMENTED(); - if(CPU.GPR[ra]._i32[3] > i10) CPU.Stop(); + WRAPPER_BEGIN(rt, ra, i10, zz); + if(CPU.GPR[ra]._i32[3] > (s32)i10) CPU.Stop(); + WRAPPER_END(rt, ra, i10, 0); + c.mov(*pos_var, (CPU.PC >> 2) + 1); + do_finalize = true; } void CLGTI(u32 rt, u32 ra, s32 i10) { WRAPPER_BEGIN(rt, ra, i10, zz); - for (int w = 0; w < 4; w++) - CPU.GPR[rt]._u32[w] = CPU.GPR[ra]._i32[w] > (s32)i10 ? 0xffffffff : 0; + for (u32 i = 0; i < 4; ++i) + { + CPU.GPR[rt]._u32[i] = (CPU.GPR[ra]._u32[i] > (u32)i10) ? 0xffffffff : 0x00000000; + } WRAPPER_END(rt, ra, i10, 0); /*XmmVar v0(c); if (i10 == -1) @@ -2182,7 +2257,7 @@ private: WRAPPER_BEGIN(rt, ra, i10, zz); for(u32 i = 0; i < 8; ++i) { - CPU.GPR[rt]._u16[i] = (CPU.GPR[ra]._u16[i] > (u16)(s32)i10) ? 0xffff : 0x0000; + CPU.GPR[rt]._u16[i] = (CPU.GPR[ra]._u16[i] > (u16)i10) ? 
0xffff : 0x0000; } WRAPPER_END(rt, ra, i10, 0); } @@ -2195,8 +2270,11 @@ private: } void HLGTI(u32 rt, u32 ra, s32 i10) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, ra, i10, zz); if(CPU.GPR[ra]._u32[3] > (u32)i10) CPU.Stop(); + WRAPPER_END(rt, ra, i10, 0); + c.mov(*pos_var, (CPU.PC >> 2) + 1); + do_finalize = true; } void MPYI(u32 rt, u32 ra, s32 i10) { @@ -2236,20 +2314,22 @@ private: } void HEQI(u32 rt, u32 ra, s32 i10) { - // TODO - UNIMPLEMENTED(); - if(CPU.GPR[ra]._i32[3] == i10) CPU.Stop(); + WRAPPER_BEGIN(rt, ra, i10, zz); + if(CPU.GPR[ra]._i32[3] == (s32)i10) CPU.Stop(); + WRAPPER_END(rt, ra, i10, 0); + c.mov(*pos_var, (CPU.PC >> 2) + 1); + do_finalize = true; } //0 - 6 void HBRA(s32 ro, s32 i16) { //i16 is shifted left by 2 while decoding - //UNIMPLEMENTED(); + LOG_OPCODE(); } void HBRR(s32 ro, s32 i16) { - //UNIMPLEMENTED(); + LOG_OPCODE(); } void ILA(u32 rt, u32 i18) { @@ -2376,6 +2456,7 @@ private: void UNK(const std::string& err) { ConLog.Error(err + fmt::Format(" #pc: 0x%x", CPU.PC)); + c.mov(cpu_qword(PC), (u32)CPU.PC); do_finalize = true; Emu.Pause(); } diff --git a/rpcs3/Emu/Cell/SPURecompilerCore.cpp b/rpcs3/Emu/Cell/SPURecompilerCore.cpp index 940ef44a4a..e7ecf4755e 100644 --- a/rpcs3/Emu/Cell/SPURecompilerCore.cpp +++ b/rpcs3/Emu/Cell/SPURecompilerCore.cpp @@ -28,7 +28,8 @@ void SPURecompilerCore::Decode(const u32 code) // decode instruction and run wit void SPURecompilerCore::Compile(u16 pos) { compiler.addFunc(kFuncConvHost, FuncBuilder4()); - entry[pos].host = pos; + const u16 start = pos; + entry[start].count = 0; GpVar cpu_var(compiler, kVarTypeIntPtr, "cpu"); compiler.setArg(0, cpu_var); @@ -60,32 +61,32 @@ void SPURecompilerCore::Compile(u16 pos) if (opcode) { (*SPU_instr::rrr_list)(m_enc, opcode); // compile single opcode + entry[start].count++; } else { m_enc->do_finalize = true; } bool fin = m_enc->do_finalize; - entry[pos].valid = opcode; + entry[pos].valid = re(opcode); if (fin) break; CPU.PC += 4; pos++; - entry[pos].host = entry[pos - 
1].host; } compiler.ret(pos_var); compiler.endFunc(); - entry[entry[pos].host].pointer = compiler.make(); + entry[start].pointer = compiler.make(); } u8 SPURecompilerCore::DecodeMemory(const u64 address) { - const u64 m_offset = address - CPU.PC; + const u64 m_offset = CPU.dmac.ls_offset; const u16 pos = (CPU.PC >> 2); //ConLog.Write("DecodeMemory: pos=%d", pos); - u32* ls = (u32*)Memory.VirtualToRealAddr(m_offset); + u32* ls = (u32*)&Memory[m_offset]; if (!pos) { @@ -98,7 +99,7 @@ u8 SPURecompilerCore::DecodeMemory(const u64 address) { // check data (hard way) bool is_valid = true; - for (u32 i = pos; i < entry[pos].count + pos; i++) + for (u32 i = pos; i < (u32)(entry[pos].count + pos); i++) { if (entry[i].valid != ls[i]) { @@ -110,6 +111,9 @@ u8 SPURecompilerCore::DecodeMemory(const u64 address) if (!is_valid) { // TODO + ConLog.Error("SPURecompilerCore::DecodeMemory(ls_addr=0x%x): code has changed", pos * sizeof(u32)); + Emu.Pause(); + return 0; } } @@ -117,6 +121,12 @@ u8 SPURecompilerCore::DecodeMemory(const u64 address) { // compile from current position to nearest dynamic or statically unresolved branch, zero data or something other Compile(pos); + if (entry[pos].valid == 0) + { + ConLog.Error("SPURecompilerCore::Compile(ls_addr=0x%x): branch to 0x0 opcode", pos * sizeof(u32)); + Emu.Pause(); + return 0; + } } if (!entry[pos].pointer) @@ -128,16 +138,23 @@ u8 SPURecompilerCore::DecodeMemory(const u64 address) // jump typedef u32(*Func)(void* _cpu, void* _ls, const SPUImmTable* _imm, u32 _pos); - Func func = asmjit_cast(entry[entry[pos].host].pointer); + Func func = asmjit_cast(entry[pos].pointer); void* cpu = (u8*)&CPU.GPR[0] - offsetof(SPUThread, GPR[0]); // ugly cpu base offset detection - u16 res = (pos == entry[pos].host) ? 
0 : pos; - res = (u16)func(cpu, ls, &g_spu_imm, res); + u16 res = pos; + res = (u16)func(cpu, &Memory[m_offset], &g_spu_imm, res); - CPU.SetBranch((u64)res << 2); - - return 0; + LOG2_OPCODE("SPURecompilerCore::DecodeMemory(ls_addr=0x%x): NewPC = 0x%llx", address, (u64)res << 2); + if ((res - 1) == (CPU.PC >> 2)) + { + return 4; + } + else + { + CPU.SetBranch((u64)res << 2); + return 0; + } /*Decode(Memory.Read32(address)); return 4;*/ } \ No newline at end of file diff --git a/rpcs3/Emu/Cell/SPUThread.h b/rpcs3/Emu/Cell/SPUThread.h index a581130473..71d7bbc05e 100644 --- a/rpcs3/Emu/Cell/SPUThread.h +++ b/rpcs3/Emu/Cell/SPUThread.h @@ -300,19 +300,17 @@ public: #else static const bool x86 = true; #endif + private: union _CRT_ALIGN(8) { struct { volatile u32 m_index; u32 m_value[max_count]; }; - struct { - volatile u32 m_index2; - u16 m_val16[max_count * 2]; - }; volatile u64 m_indval; }; std::mutex m_lock; + public: Channel() { Init(); From 4e9dc8ab9c34813c270031979235c5ae1f3e98d1 Mon Sep 17 00:00:00 2001 From: Nekotekina Date: Mon, 7 Apr 2014 22:27:30 +0400 Subject: [PATCH 04/14] SPU JIT: more asm --- rpcs3/Emu/Cell/SPURecompiler.h | 729 +++++++++++++++++++-------- rpcs3/Emu/Cell/SPURecompilerCore.cpp | 7 + 2 files changed, 512 insertions(+), 224 deletions(-) diff --git a/rpcs3/Emu/Cell/SPURecompiler.h b/rpcs3/Emu/Cell/SPURecompiler.h index a1e636ee2b..cc14ca3435 100644 --- a/rpcs3/Emu/Cell/SPURecompiler.h +++ b/rpcs3/Emu/Cell/SPURecompiler.h @@ -28,7 +28,7 @@ struct SPUImmTable // signed numbers table for (u32 i = 0; i < sizeof(s19_to_s32) / sizeof(__m128i); i++) { - const u32 v = (i & 0x40000) ? (i | 0xfff8000) : i; + const u32 v = (i & 0x40000) ? 
(i | 0xfff80000) : i; s19_to_s32[i].m128i_i32[0] = v; s19_to_s32[i].m128i_i32[1] = v; s19_to_s32[i].m128i_i32[2] = v; @@ -122,7 +122,7 @@ public: { \ SPUThread& CPU = *(SPUThread*)GetCurrentCPUThread(); -#define WRAPPER_END(a0, a1, a2, a3) } \ +#define WRAPPER_END(a0, a1, a2, a3) LOG2_OPCODE(); } \ }; \ c.mov(cpu_qword(PC), (u32)CPU.PC); \ X86X64CallNode* call = c.call(imm_ptr(&opcode_wrapper::opcode), kFuncConvHost, FuncBuilder4()); \ @@ -155,19 +155,24 @@ private: //0 - 10 void STOP(u32 code) { - WRAPPER_BEGIN(code, xx, yy, zz); - CPU.DoStop(code); - WRAPPER_END(code, 0, 0, 0); + struct STOP_wrapper + { + static void STOP(u32 code) + { + SPUThread& CPU = *(SPUThread*)GetCurrentCPUThread(); + CPU.DoStop(code); + LOG2_OPCODE(); + } + }; + c.mov(cpu_qword(PC), (u32)CPU.PC); + X86X64CallNode* call = c.call(imm_ptr(&STOP_wrapper::STOP), kFuncConvHost, FuncBuilder1()); + call->setArg(0, imm_u(code)); c.mov(*pos_var, (CPU.PC >> 2) + 1); do_finalize = true; + LOG_OPCODE(); } void LNOP() { - c.mov(cpu_qword(PC), (u32)CPU.PC); - /* - do_finalize = true; - c.mov(*pos_var, (CPU.PC >> 2) + 1); - */ LOG_OPCODE(); } void SYNC(u32 Cbit) @@ -181,14 +186,13 @@ private: } void DSYNC() { - c.mov(cpu_qword(PC), (u32)CPU.PC); // This instruction forces all earlier load, store, and channel instructions to complete before proceeding. 
c.mfence(); LOG_OPCODE(); } void MFSPR(u32 rt, u32 sa) { - UNIMPLEMENTED(); + WRAPPER_BEGIN(rt, sa, yy, zz); //If register is a dummy register (register labeled 0x0) if(sa == 0x0) { @@ -200,6 +204,7 @@ private: CPU.GPR[rt]._u128.hi = CPU.SPR[sa]._u128.hi; CPU.GPR[rt]._u128.lo = CPU.SPR[sa]._u128.lo; } + WRAPPER_END(rt, sa, 0, 0); } void RDCH(u32 rt, u32 ra) { @@ -218,13 +223,13 @@ private: } void SF(u32 rt, u32 ra, u32 rb) { - WRAPPER_BEGIN(rt, ra, rb, zz); + /*WRAPPER_BEGIN(rt, ra, rb, zz); CPU.GPR[rt]._u32[0] = CPU.GPR[rb]._u32[0] - CPU.GPR[ra]._u32[0]; CPU.GPR[rt]._u32[1] = CPU.GPR[rb]._u32[1] - CPU.GPR[ra]._u32[1]; CPU.GPR[rt]._u32[2] = CPU.GPR[rb]._u32[2] - CPU.GPR[ra]._u32[2]; CPU.GPR[rt]._u32[3] = CPU.GPR[rb]._u32[3] - CPU.GPR[ra]._u32[3]; - WRAPPER_END(rt, ra, rb, 0); - /*XmmVar v0(c); + WRAPPER_END(rt, ra, rb, 0);*/ + XmmVar v0(c); if (ra == rb) { // zero @@ -237,30 +242,49 @@ private: c.movdqa(v0, cpu_xmm(GPR[rb])); c.psubd(v0, cpu_xmm(GPR[ra])); c.movdqa(cpu_xmm(GPR[rt]), v0); - }*/ + } + LOG_OPCODE(); } void OR(u32 rt, u32 ra, u32 rb) { - WRAPPER_BEGIN(rt, ra, rb, zz); + /*WRAPPER_BEGIN(rt, ra, rb, zz); CPU.GPR[rt]._u32[0] = CPU.GPR[ra]._u32[0] | CPU.GPR[rb]._u32[0]; CPU.GPR[rt]._u32[1] = CPU.GPR[ra]._u32[1] | CPU.GPR[rb]._u32[1]; CPU.GPR[rt]._u32[2] = CPU.GPR[ra]._u32[2] | CPU.GPR[rb]._u32[2]; CPU.GPR[rt]._u32[3] = CPU.GPR[ra]._u32[3] | CPU.GPR[rb]._u32[3]; - WRAPPER_END(rt, ra, rb, 0); - // TODO + WRAPPER_END(rt, ra, rb, 0);*/ + XmmVar v0(c); + if (ra == rb) + { + // mov + if (ra != rt) + { + c.movaps(v0, cpu_xmm(GPR[ra])); + c.movaps(cpu_xmm(GPR[rt]), v0); + } + // else nop + } + else + { + // or + c.movaps(v0, cpu_xmm(GPR[ra])); + c.orps(v0, cpu_xmm(GPR[rb])); + c.movaps(cpu_xmm(GPR[rt]), v0); + } + LOG_OPCODE(); } void BG(u32 rt, u32 ra, u32 rb) { - WRAPPER_BEGIN(rt, ra, rb, zz); + /*WRAPPER_BEGIN(rt, ra, rb, zz); CPU.GPR[rt]._u32[0] = CPU.GPR[ra]._u32[0] > CPU.GPR[rb]._u32[0] ? 
0 : 1; CPU.GPR[rt]._u32[1] = CPU.GPR[ra]._u32[1] > CPU.GPR[rb]._u32[1] ? 0 : 1; CPU.GPR[rt]._u32[2] = CPU.GPR[ra]._u32[2] > CPU.GPR[rb]._u32[2] ? 0 : 1; CPU.GPR[rt]._u32[3] = CPU.GPR[ra]._u32[3] > CPU.GPR[rb]._u32[3] ? 0 : 1; - WRAPPER_END(rt, ra, rb, 0); - /*XmmVar v0(c); + WRAPPER_END(rt, ra, rb, 0);*/ + XmmVar v0(c); if (ra == rb) { - // load {1,1,1,1} + // load { 1, 1, 1, 1 } c.movaps(v0, imm_xmm(s19_to_s32[1])); c.movaps(cpu_xmm(GPR[rt]), v0); } @@ -270,9 +294,11 @@ private: c.movdqa(v0, cpu_xmm(GPR[rb])); c.psubd(v0, cpu_xmm(GPR[ra])); c.psrad(v0, 32); + // add 1 c.paddd(v0, imm_xmm(s19_to_s32[1])); c.movdqa(cpu_xmm(GPR[rt]), v0); - }*/ + } + LOG_OPCODE(); } void SFH(u32 rt, u32 ra, u32 rb) { @@ -283,13 +309,18 @@ private: } void NOR(u32 rt, u32 ra, u32 rb) { - WRAPPER_BEGIN(rt, ra, rb, zz); + /*WRAPPER_BEGIN(rt, ra, rb, zz); CPU.GPR[rt]._u32[0] = ~(CPU.GPR[ra]._u32[0] | CPU.GPR[rb]._u32[0]); CPU.GPR[rt]._u32[1] = ~(CPU.GPR[ra]._u32[1] | CPU.GPR[rb]._u32[1]); CPU.GPR[rt]._u32[2] = ~(CPU.GPR[ra]._u32[2] | CPU.GPR[rb]._u32[2]); CPU.GPR[rt]._u32[3] = ~(CPU.GPR[ra]._u32[3] | CPU.GPR[rb]._u32[3]); - WRAPPER_END(rt, ra, rb, 0); - // TODO + WRAPPER_END(rt, ra, rb, 0);*/ + XmmVar v0(c); + c.movaps(v0, cpu_xmm(GPR[ra])); + if (ra != rb) c.orps(v0, cpu_xmm(GPR[rb])); + c.xorps(v0, imm_xmm(s19_to_s32[0x7ffff])); + c.movaps(cpu_xmm(GPR[rt]), v0); + LOG_OPCODE(); } void ABSDB(u32 rt, u32 ra, u32 rb) { @@ -346,7 +377,8 @@ private: c.cmovnz(v0, z); c.shl(v0, shift); c.mov(cpu_dword(GPR[rt]._u32[i]), v0); - }*/ + } + LOG_OPCODE();*/ } void ROTH(u32 rt, u32 ra, u32 rb) { @@ -388,14 +420,39 @@ private: } void ROTMI(u32 rt, u32 ra, s32 i7) { - WRAPPER_BEGIN(rt, ra, i7, zz); - const int nRot = (0 - (s32)i7) % 64; + /*WRAPPER_BEGIN(rt, ra, i7, zz); + const int nRot = (0 - (s32)i7) % 64; // ??? CPU.GPR[rt]._u32[0] = nRot < 32 ? CPU.GPR[ra]._u32[0] >> nRot : 0; CPU.GPR[rt]._u32[1] = nRot < 32 ? CPU.GPR[ra]._u32[1] >> nRot : 0; CPU.GPR[rt]._u32[2] = nRot < 32 ? 
CPU.GPR[ra]._u32[2] >> nRot : 0; CPU.GPR[rt]._u32[3] = nRot < 32 ? CPU.GPR[ra]._u32[3] >> nRot : 0; - WRAPPER_END(rt, ra, i7, 0); - // TODO + WRAPPER_END(rt, ra, i7, 0);*/ + const int nRot = (0 - i7) & 0x3f; // !!! + XmmVar v0(c); + if (nRot > 31) + { + // zero + c.xorps(v0, v0); + c.movaps(cpu_xmm(GPR[rt]), v0); + } + else if (nRot == 0) + { + // mov + if (ra != rt) + { + c.movaps(v0, cpu_xmm(GPR[ra])); + c.movaps(cpu_xmm(GPR[rt]), v0); + } + // else nop + } + else + { + // shift right logical + c.movdqa(v0, cpu_xmm(GPR[ra])); + c.psrld(v0, nRot); + c.movdqa(cpu_xmm(GPR[rt]), v0); + } + LOG_OPCODE(); } void ROTMAI(u32 rt, u32 ra, s32 i7) { @@ -409,12 +466,37 @@ private: } void SHLI(u32 rt, u32 ra, s32 i7) { - WRAPPER_BEGIN(rt, ra, i7, zz); + /*WRAPPER_BEGIN(rt, ra, i7, zz); const u32 s = i7 & 0x3f; for (u32 j = 0; j < 4; ++j) CPU.GPR[rt]._u32[j] = CPU.GPR[ra]._u32[j] << s; - WRAPPER_END(rt, ra, i7, 0); - // TODO + WRAPPER_END(rt, ra, i7, 0);*/ + const int s = i7 & 0x3f; + XmmVar v0(c); + if (s > 31) + { + // zero + c.xorps(v0, v0); + c.movaps(cpu_xmm(GPR[rt]), v0); + } + else if (s == 0) + { + // mov + if (ra != rt) + { + c.movaps(v0, cpu_xmm(GPR[ra])); + c.movaps(cpu_xmm(GPR[rt]), v0); + } + // else nop + } + else + { + // shift left + c.movdqa(v0, cpu_xmm(GPR[ra])); + c.pslld(v0, s); + c.movdqa(cpu_xmm(GPR[rt]), v0); + } + LOG_OPCODE(); } void ROTHI(u32 rt, u32 ra, s32 i7) { @@ -450,38 +532,43 @@ private: } void A(u32 rt, u32 ra, u32 rb) { - WRAPPER_BEGIN(rt, ra, rb, zz); + /*WRAPPER_BEGIN(rt, ra, rb, zz); CPU.GPR[rt]._u32[0] = CPU.GPR[ra]._u32[0] + CPU.GPR[rb]._u32[0]; CPU.GPR[rt]._u32[1] = CPU.GPR[ra]._u32[1] + CPU.GPR[rb]._u32[1]; CPU.GPR[rt]._u32[2] = CPU.GPR[ra]._u32[2] + CPU.GPR[rb]._u32[2]; CPU.GPR[rt]._u32[3] = CPU.GPR[ra]._u32[3] + CPU.GPR[rb]._u32[3]; - WRAPPER_END(rt, ra, rb, 0); - /*XmmVar v0(c); + WRAPPER_END(rt, ra, rb, 0);*/ + XmmVar v0(c); c.movdqa(v0, cpu_xmm(GPR[ra])); - c.paddd(v0, cpu_xmm(GPR[rb])); - c.movdqa(cpu_xmm(GPR[rt]), v0);*/ + if 
(ra == rb) + { + c.paddd(v0, v0); + } + else + { + c.paddd(v0, cpu_xmm(GPR[rb])); + } + c.movdqa(cpu_xmm(GPR[rt]), v0); + LOG_OPCODE(); } void AND(u32 rt, u32 ra, u32 rb) { - WRAPPER_BEGIN(rt, ra, rb, zz); + /*WRAPPER_BEGIN(rt, ra, rb, zz); CPU.GPR[rt]._u32[0] = CPU.GPR[ra]._u32[0] & CPU.GPR[rb]._u32[0]; CPU.GPR[rt]._u32[1] = CPU.GPR[ra]._u32[1] & CPU.GPR[rb]._u32[1]; CPU.GPR[rt]._u32[2] = CPU.GPR[ra]._u32[2] & CPU.GPR[rb]._u32[2]; CPU.GPR[rt]._u32[3] = CPU.GPR[ra]._u32[3] & CPU.GPR[rb]._u32[3]; - WRAPPER_END(rt, ra, rb, 0); - /*XmmVar v0(c); + WRAPPER_END(rt, ra, rb, 0);*/ + XmmVar v0(c); if (ra == rb) { - if (rt == ra) - { - // nop - } - else + if (rt != ra) { // mov c.movaps(v0, cpu_xmm(GPR[ra])); c.movaps(cpu_xmm(GPR[rt]), v0); } + // else nop } else { @@ -489,7 +576,8 @@ private: c.movaps(v0, cpu_xmm(GPR[ra])); c.andps(v0, cpu_xmm(GPR[rb])); c.movaps(cpu_xmm(GPR[rt]), v0); - }*/ + } + LOG_OPCODE(); } void CG(u32 rt, u32 ra, u32 rb) { @@ -737,25 +825,27 @@ private: } void FREST(u32 rt, u32 ra) { - WRAPPER_BEGIN(rt, ra, yy, zz); + /*WRAPPER_BEGIN(rt, ra, yy, zz); for (int i = 0; i < 4; i++) CPU.GPR[rt]._f[i] = 1 / CPU.GPR[ra]._f[i]; - WRAPPER_END(rt, ra, 0, 0); - /*XmmVar v0(c); + WRAPPER_END(rt, ra, 0, 0);*/ + XmmVar v0(c); c.rcpps(v0, cpu_xmm(GPR[ra])); - c.movaps(cpu_xmm(GPR[rt]), v0);*/ + c.movaps(cpu_xmm(GPR[rt]), v0); + LOG_OPCODE(); } void FRSQEST(u32 rt, u32 ra) { - WRAPPER_BEGIN(rt, ra, yy, zz); + /*WRAPPER_BEGIN(rt, ra, yy, zz); for (int i = 0; i < 4; i++) CPU.GPR[rt]._f[i] = 1 / sqrt(abs(CPU.GPR[ra]._f[i])); - WRAPPER_END(rt, ra, 0, 0); - /*XmmVar v0(c); + WRAPPER_END(rt, ra, 0, 0);*/ + XmmVar v0(c); c.movaps(v0, cpu_xmm(GPR[ra])); c.andps(v0, imm_xmm(max_int)); c.rsqrtps(v0, v0); - c.movaps(cpu_xmm(GPR[rt]), v0);*/ + c.movaps(cpu_xmm(GPR[rt]), v0); + LOG_OPCODE(); } void LQX(u32 rt, u32 ra, u32 rb) { @@ -989,47 +1079,42 @@ private: } void ROTQBYI(u32 rt, u32 ra, s32 i7) { - WRAPPER_BEGIN(rt, ra, i7, zz); + /*WRAPPER_BEGIN(rt, ra, i7, zz); const int s 
= i7 & 0xf; const SPU_GPR_hdr temp = CPU.GPR[ra]; for (int b = 0; b < 16; b++) CPU.GPR[rt]._u8[b] = temp._u8[(b - s) & 0xf]; - WRAPPER_END(rt, ra, i7, 0); - /*const int s = i7 & 0xf; - - XmmVar v0(c); - XmmVar v1(c); + WRAPPER_END(rt, ra, i7, 0);*/ + const int s = i7 & 0xf; + XmmVar v0(c), v1(c); c.movdqa(v0, cpu_xmm(GPR[ra])); c.movdqa(v1, v0); c.pslldq(v0, s); c.psrldq(v1, 0xf - s); c.por(v0, v1); - c.movdqa(cpu_xmm(GPR[rt]), v0);*/ + c.movdqa(cpu_xmm(GPR[rt]), v0); + LOG_OPCODE(); } void ROTQMBYI(u32 rt, u32 ra, s32 i7) { - WRAPPER_BEGIN(rt, ra, i7, zz); + /*WRAPPER_BEGIN(rt, ra, i7, zz); const int s = (0 - (s32)i7) & 0x1f; const SPU_GPR_hdr temp = CPU.GPR[ra]; CPU.GPR[rt].Reset(); for (int b = 0; b < 16 - s; b++) CPU.GPR[rt]._u8[b] = temp._u8[b + s]; - WRAPPER_END(rt, ra, i7, 0); - /*const int s = (0 - i7) & 0x1f; - + WRAPPER_END(rt, ra, i7, 0);*/ + const int s = (0 - i7) & 0x1f; XmmVar v0(c); if (s == 0) { - if (ra == rt) - { - // nop - } - else + if (ra != rt) { // mov c.movaps(v0, cpu_xmm(GPR[ra])); c.movaps(cpu_xmm(GPR[rt]), v0); } + // else nop } else if (s > 15) { @@ -1043,32 +1128,29 @@ private: c.movdqa(v0, cpu_xmm(GPR[ra])); c.psrldq(v0, s); c.movdqa(cpu_xmm(GPR[rt]), v0); - }*/ + } + LOG_OPCODE(); } void SHLQBYI(u32 rt, u32 ra, s32 i7) { - WRAPPER_BEGIN(rt, ra, i7, zz); + /*WRAPPER_BEGIN(rt, ra, i7, zz); const int s = i7 & 0x1f; const SPU_GPR_hdr temp = CPU.GPR[ra]; CPU.GPR[rt].Reset(); for (int b = s; b < 16; b++) CPU.GPR[rt]._u8[b] = temp._u8[b - s]; - WRAPPER_END(rt, ra, i7, 0); - /*const int s = i7 & 0x1f; - + WRAPPER_END(rt, ra, i7, 0);*/ + const int s = i7 & 0x1f; XmmVar v0(c); if (s == 0) { - if (ra == rt) - { - // nop - } - else + if (ra != rt) { // mov c.movaps(v0, cpu_xmm(GPR[ra])); c.movaps(cpu_xmm(GPR[rt]), v0); } + // else nop } else if (s > 15) { @@ -1082,7 +1164,8 @@ private: c.movdqa(v0, cpu_xmm(GPR[ra])); c.pslldq(v0, s); c.movdqa(cpu_xmm(GPR[rt]), v0); - }*/ + } + LOG_OPCODE(); } void NOP(u32 rt) { @@ -1209,28 +1292,57 @@ private: { 
// compare if-greater-then c.movdqa(v0, cpu_xmm(GPR[rb])); - c.psubd(v0, cpu_xmm(GPR[ra])); - c.psrad(v0, 32); + // (not implemented) c.movdqa(cpu_xmm(GPR[rt]), v0); - }*/ + } + LOG_OPCODE(); + */ } void ANDC(u32 rt, u32 ra, u32 rb) { - WRAPPER_BEGIN(rt, ra, rb, zz); + /*WRAPPER_BEGIN(rt, ra, rb, zz); for (int w = 0; w < 4; w++) CPU.GPR[rt]._u32[w] = CPU.GPR[ra]._u32[w] & (~CPU.GPR[rb]._u32[w]); - WRAPPER_END(rt, ra, rb, 0); - // TODO + WRAPPER_END(rt, ra, rb, 0);*/ + XmmVar v0(c); + if (ra == rb) + { + // zero + c.xorps(v0, v0); + c.movaps(cpu_xmm(GPR[rt]), v0); + } + else + { + // and not + c.movaps(v0, cpu_xmm(GPR[rb])); + c.andnps(v0, cpu_xmm(GPR[ra])); + c.movaps(cpu_xmm(GPR[rt]), v0); + } + LOG_OPCODE(); } void FCGT(u32 rt, u32 ra, u32 rb) { - WRAPPER_BEGIN(rt, ra, rb, zz); + /*WRAPPER_BEGIN(rt, ra, rb, zz); CPU.GPR[rt]._u32[0] = CPU.GPR[ra]._f[0] > CPU.GPR[rb]._f[0] ? 0xffffffff : 0; CPU.GPR[rt]._u32[1] = CPU.GPR[ra]._f[1] > CPU.GPR[rb]._f[1] ? 0xffffffff : 0; CPU.GPR[rt]._u32[2] = CPU.GPR[ra]._f[2] > CPU.GPR[rb]._f[2] ? 0xffffffff : 0; CPU.GPR[rt]._u32[3] = CPU.GPR[ra]._f[3] > CPU.GPR[rb]._f[3] ? 
0xffffffff : 0; - WRAPPER_END(rt, ra, rb, 0); - // TODO + WRAPPER_END(rt, ra, rb, 0);*/ + XmmVar v0(c); + if (ra == rb) + { + // zero + c.xorps(v0, v0); + c.movaps(cpu_xmm(GPR[rt]), v0); + } + else + { + // not-less-or-equal + c.movaps(v0, cpu_xmm(GPR[ra])); + c.cmpps(v0, cpu_xmm(GPR[rb]), 6); + c.movaps(cpu_xmm(GPR[rt]), v0); + } + LOG_OPCODE(); } void DFCGT(u32 rt, u32 ra, u32 rb) { @@ -1241,36 +1353,68 @@ private: } void FA(u32 rt, u32 ra, u32 rb) { - WRAPPER_BEGIN(rt, ra, rb, zz); + /*WRAPPER_BEGIN(rt, ra, rb, zz); CPU.GPR[rt]._f[0] = CPU.GPR[ra]._f[0] + CPU.GPR[rb]._f[0]; CPU.GPR[rt]._f[1] = CPU.GPR[ra]._f[1] + CPU.GPR[rb]._f[1]; CPU.GPR[rt]._f[2] = CPU.GPR[ra]._f[2] + CPU.GPR[rb]._f[2]; CPU.GPR[rt]._f[3] = CPU.GPR[ra]._f[3] + CPU.GPR[rb]._f[3]; - WRAPPER_END(rt, ra, rb, 0); - // TODO + WRAPPER_END(rt, ra, rb, 0);*/ + XmmVar v0(c); + c.movaps(v0, cpu_xmm(GPR[ra])); + if (ra == rb) + { + c.addps(v0, v0); + } + else + { + c.addps(v0, cpu_xmm(GPR[rb])); + } + c.movaps(cpu_xmm(GPR[rt]), v0); + LOG_OPCODE(); } void FS(u32 rt, u32 ra, u32 rb) { - WRAPPER_BEGIN(rt, ra, rb, zz); + /*WRAPPER_BEGIN(rt, ra, rb, zz); CPU.GPR[rt]._f[0] = CPU.GPR[ra]._f[0] - CPU.GPR[rb]._f[0]; CPU.GPR[rt]._f[1] = CPU.GPR[ra]._f[1] - CPU.GPR[rb]._f[1]; CPU.GPR[rt]._f[2] = CPU.GPR[ra]._f[2] - CPU.GPR[rb]._f[2]; CPU.GPR[rt]._f[3] = CPU.GPR[ra]._f[3] - CPU.GPR[rb]._f[3]; - WRAPPER_END(rt, ra, rb, 0); - // TODO + WRAPPER_END(rt, ra, rb, 0);*/ + XmmVar v0(c); + if (ra == rb) + { + // zero + c.xorps(v0, v0); + c.movaps(cpu_xmm(GPR[rt]), v0); + } + else + { + c.movaps(v0, cpu_xmm(GPR[ra])); + c.subps(v0, cpu_xmm(GPR[rb])); + c.movaps(cpu_xmm(GPR[rt]), v0); + } + LOG_OPCODE(); } void FM(u32 rt, u32 ra, u32 rb) { - WRAPPER_BEGIN(rt, ra, rb, zz); + /*WRAPPER_BEGIN(rt, ra, rb, zz); CPU.GPR[rt]._f[0] = CPU.GPR[ra]._f[0] * CPU.GPR[rb]._f[0]; CPU.GPR[rt]._f[1] = CPU.GPR[ra]._f[1] * CPU.GPR[rb]._f[1]; CPU.GPR[rt]._f[2] = CPU.GPR[ra]._f[2] * CPU.GPR[rb]._f[2]; CPU.GPR[rt]._f[3] = CPU.GPR[ra]._f[3] * 
CPU.GPR[rb]._f[3]; - WRAPPER_END(rt, ra, rb, 0); - /*XmmVar v0(c); + WRAPPER_END(rt, ra, rb, 0);*/ + XmmVar v0(c); c.movaps(v0, cpu_xmm(GPR[ra])); - c.mulps(v0, cpu_xmm(GPR[rb])); - c.movaps(cpu_xmm(GPR[rt]), v0);*/ + if (ra == rb) + { + c.mulps(v0, v0); + } + else + { + c.mulps(v0, cpu_xmm(GPR[rb])); + } + c.movaps(cpu_xmm(GPR[rt]), v0); + LOG_OPCODE(); } void CLGTH(u32 rt, u32 ra, u32 rb) { @@ -1382,27 +1526,41 @@ private: } void ADDX(u32 rt, u32 ra, u32 rb) { - WRAPPER_BEGIN(rt, ra, rb, zz); + /*WRAPPER_BEGIN(rt, ra, rb, zz); for (int w = 0; w < 4; w++) CPU.GPR[rt]._u32[w] = CPU.GPR[ra]._u32[w] + CPU.GPR[rb]._u32[w] + (CPU.GPR[rt]._u32[w] & 1); - WRAPPER_END(rt, ra, rb, 0); - // TODO + WRAPPER_END(rt, ra, rb, 0);*/ + XmmVar v0(c); + c.movdqa(v0, cpu_xmm(GPR[rt])); + c.pand(v0, imm_xmm(s19_to_s32[1])); + c.paddd(v0, cpu_xmm(GPR[ra])); + c.paddd(v0, cpu_xmm(GPR[rb])); + c.movdqa(cpu_xmm(GPR[rt]), v0); + LOG_OPCODE(); } void SFX(u32 rt, u32 ra, u32 rb) { - WRAPPER_BEGIN(rt, ra, rb, zz); + /*WRAPPER_BEGIN(rt, ra, rb, zz); for (int w = 0; w < 4; w++) CPU.GPR[rt]._u32[w] = CPU.GPR[rb]._u32[w] - CPU.GPR[ra]._u32[w] - (1 - (CPU.GPR[rt]._u32[w] & 1)); - WRAPPER_END(rt, ra, rb, 0); - /*XmmVar v0(c), v1(c), v2(c); - c.movdqa(v1, imm_xmm(s19_to_s32[1])); - c.movdqa(v0, cpu_xmm(GPR[rb])); - c.movdqa(v2, cpu_xmm(GPR[rt])); - c.psubd(v0, cpu_xmm(GPR[ra])); - c.pand(v2, v1); - c.paddd(v0, v2); + WRAPPER_END(rt, ra, rb, 0);*/ + XmmVar v0(c), v1(c); + c.movdqa(v1, cpu_xmm(GPR[rt])); + c.pandn(v1, imm_xmm(s19_to_s32[1])); + if (ra == rb) + { + // load zero + c.pxor(v0, v0); + } + else + { + // sub + c.movdqa(v0, cpu_xmm(GPR[rb])); + c.psubd(v0, cpu_xmm(GPR[ra])); + } c.psubd(v0, v1); - c.movdqa(cpu_xmm(GPR[rt]), v0);*/ + c.movdqa(cpu_xmm(GPR[rt]), v0); + LOG_OPCODE(); } void CGX(u32 rt, u32 ra, u32 rb) { @@ -1591,11 +1749,27 @@ private: } void MPYU(u32 rt, u32 ra, u32 rb) { - WRAPPER_BEGIN(rt, ra, rb, zz); + /*WRAPPER_BEGIN(rt, ra, rb, zz); for (int w = 0; w < 4; w++) 
CPU.GPR[rt]._u32[w] = CPU.GPR[ra]._u16[w*2] * CPU.GPR[rb]._u16[w*2]; - WRAPPER_END(rt, ra, rb, 0); - // TODO + WRAPPER_END(rt, ra, rb, 0);*/ + XmmVar v0(c); + c.movdqa(v0, cpu_xmm(GPR[ra])); + if (ra == rb) + { + c.pand(v0, imm_xmm(s19_to_s32[0xffff])); + c.pmulld(v0, v0); + } + else + { + XmmVar v1(c); + c.movdqa(v1, imm_xmm(s19_to_s32[0xffff])); // load mask + c.pand(v0, v1); // clear high words of each dword + c.pand(v1, cpu_xmm(GPR[rb])); + c.pmulld(v0, v1); + } + c.movdqa(cpu_xmm(GPR[rt]), v0); + LOG_OPCODE(); } void CEQB(u32 rt, u32 ra, u32 rb) { @@ -1606,12 +1780,13 @@ private: } void FI(u32 rt, u32 ra, u32 rb) { - WRAPPER_BEGIN(rt, ra, rb, zz); + /*WRAPPER_BEGIN(rt, ra, rb, zz); CPU.GPR[rt] = CPU.GPR[rb]; - WRAPPER_END(rt, ra, rb, 0); - /*XmmVar v0(c); + WRAPPER_END(rt, ra, rb, 0);*/ + XmmVar v0(c); c.movaps(v0, cpu_xmm(GPR[rb])); - c.movaps(cpu_xmm(GPR[rt]), v0);*/ + c.movaps(cpu_xmm(GPR[rt]), v0); + LOG_OPCODE(); } void HEQ(u32 rt, u32 ra, u32 rb) { @@ -1625,7 +1800,7 @@ private: //0 - 9 void CFLTS(u32 rt, u32 ra, s32 i8) { - WRAPPER_BEGIN(rt, ra, i8, zz); + /*WRAPPER_BEGIN(rt, ra, i8, zz); const u32 scale = 173 - (i8 & 0xff); //unsigned immediate for (int i = 0; i < 4; i++) { @@ -1638,15 +1813,16 @@ private: CPU.GPR[rt]._u32[i] = (u32)CPU.GPR[rt]._f[i]; //trunc } - WRAPPER_END(rt, ra, i8, 0); - /*XmmVar v0(c); + WRAPPER_END(rt, ra, i8, 0);*/ + XmmVar v0(c); c.movaps(v0, cpu_xmm(GPR[ra])); if (i8 != 173) { c.mulps(v0, imm_xmm(scale_to_int[i8 & 0xff])); // scale } c.cvtps2dq(v0, v0); // convert to ints - c.movdqa(cpu_xmm(GPR[rt]), v0);*/ + c.movdqa(cpu_xmm(GPR[rt]), v0); + LOG_OPCODE(); } void CFLTU(u32 rt, u32 ra, s32 i8) { @@ -1681,11 +1857,12 @@ private: // TODO: handle negative values and convert to unsigned value // c.int3(); c.cvtps2dq(v0, v0); // convert to signed ints - c.movdqa(cpu_xmm(GPR[rt]), v0);*/ + c.movdqa(cpu_xmm(GPR[rt]), v0); + LOG_OPCODE();*/ } void CSFLT(u32 rt, u32 ra, s32 i8) { - WRAPPER_BEGIN(rt, ra, i8, zz); + /*WRAPPER_BEGIN(rt, 
ra, i8, zz); const u32 scale = 155 - (i8 & 0xff); //unsigned immediate for (int i = 0; i < 4; i++) { @@ -1698,15 +1875,16 @@ private: CPU.GPR[rt]._u32[i] = (CPU.GPR[rt]._u32[i] & 0x807fffff) | (exp << 23); } - WRAPPER_END(rt, ra, i8, 0); - /*XmmVar v0(c); + WRAPPER_END(rt, ra, i8, 0);*/ + XmmVar v0(c); c.movdqa(v0, cpu_xmm(GPR[ra])); c.cvtdq2ps(v0, v0); // convert to floats if (i8 != 155) { c.mulps(v0, imm_xmm(scale_to_float[i8 & 0xff])); // scale } - c.movaps(cpu_xmm(GPR[rt]), v0);*/ + c.movaps(cpu_xmm(GPR[rt]), v0); + LOG_OPCODE(); } void CUFLT(u32 rt, u32 ra, s32 i8) { @@ -1732,7 +1910,8 @@ private: { c.mulps(v0, imm_xmm(scale_to_float[i8 & 0xff])); // scale } - c.movaps(cpu_xmm(GPR[rt]), v0);*/ + c.movaps(cpu_xmm(GPR[rt]), v0); + LOG_OPCODE();*/ } //0 - 8 @@ -1800,7 +1979,7 @@ private: } void STQR(u32 rt, s32 i16) { - WRAPPER_BEGIN(rt, i16, PC, zz); + /*WRAPPER_BEGIN(rt, i16, PC, zz); u32 lsa = branchTarget(PC, (s32)i16) & 0x3fff0; if (!CPU.IsGoodLSA(lsa)) { @@ -1809,9 +1988,8 @@ private: return; } CPU.WriteLS128(lsa, CPU.GPR[rt]._u128); - WRAPPER_END(rt, i16, CPU.PC, 0); - /*u32 lsa = branchTarget(CPU.PC, i16) & 0x3fff0; - + WRAPPER_END(rt, i16, CPU.PC, 0);*/ + u32 lsa = branchTarget(CPU.PC, i16) & 0x3fff0; GpVar v0(c, kVarTypeUInt64); GpVar v1(c, kVarTypeUInt64); c.mov(v0, cpu_qword(GPR[rt]._u64[0])); @@ -1819,7 +1997,8 @@ private: c.bswap(v0); c.bswap(v1); c.mov(qword_ptr(*ls_var, lsa), v1); - c.mov(qword_ptr(*ls_var, lsa + 8), v0);*/ + c.mov(qword_ptr(*ls_var, lsa + 8), v0); + LOG_OPCODE(); } void BRA(s32 i16) { @@ -1866,7 +2045,7 @@ private: } void FSMBI(u32 rt, s32 i16) { - WRAPPER_BEGIN(rt, i16, yy, zz); + /*WRAPPER_BEGIN(rt, i16, yy, zz); const u32 s = i16; for (u32 j = 0; j < 16; ++j) @@ -1880,10 +2059,11 @@ private: CPU.GPR[rt]._u8[j] = 0x00; } } - WRAPPER_END(rt, i16, 0, 0); - /*XmmVar v0(c); + WRAPPER_END(rt, i16, 0, 0);*/ + XmmVar v0(c); c.movaps(v0, imm_xmm(fsmbi_mask[i16 & 0xffff])); - c.movaps(cpu_xmm(GPR[rt]), v0);*/ + 
c.movaps(cpu_xmm(GPR[rt]), v0); + LOG_OPCODE(); } void BRSL(u32 rt, s32 i16) { @@ -1900,7 +2080,7 @@ private: } void LQR(u32 rt, s32 i16) { - WRAPPER_BEGIN(rt, i16, PC, zz); + /*WRAPPER_BEGIN(rt, i16, PC, zz); u32 lsa = branchTarget(PC, (s32)i16) & 0x3fff0; if (!CPU.IsGoodLSA(lsa)) { @@ -1909,9 +2089,8 @@ private: return; } CPU.GPR[rt]._u128 = CPU.ReadLS128(lsa); - WRAPPER_END(rt, i16, CPU.PC, 0); - /*u32 lsa = branchTarget(CPU.PC, i16) & 0x3fff0; - + WRAPPER_END(rt, i16, CPU.PC, 0);*/ + u32 lsa = branchTarget(CPU.PC, i16) & 0x3fff0; GpVar v0(c, kVarTypeUInt64); GpVar v1(c, kVarTypeUInt64); c.mov(v0, qword_ptr(*ls_var, lsa)); @@ -1919,17 +2098,18 @@ private: c.bswap(v0); c.bswap(v1); c.mov(cpu_qword(GPR[rt]._u64[0]), v1); - c.mov(cpu_qword(GPR[rt]._u64[1]), v0);*/ + c.mov(cpu_qword(GPR[rt]._u64[1]), v0); + LOG_OPCODE(); } void IL(u32 rt, s32 i16) { - WRAPPER_BEGIN(rt, i16, yy, zz); + /*WRAPPER_BEGIN(rt, i16, yy, zz); CPU.GPR[rt]._i32[0] = CPU.GPR[rt]._i32[1] = CPU.GPR[rt]._i32[2] = CPU.GPR[rt]._i32[3] = (s32)i16; - WRAPPER_END(rt, i16, 0, 0); - /*XmmVar v0(c); + WRAPPER_END(rt, i16, 0, 0);*/ + XmmVar v0(c); if (i16 == 0) { c.xorps(v0, v0); @@ -1942,15 +2122,16 @@ private: { c.movaps(v0, imm_xmm(s19_to_s32[i16 & 0x7ffff])); } - c.movaps(cpu_xmm(GPR[rt]), v0);*/ + c.movaps(cpu_xmm(GPR[rt]), v0); + LOG_OPCODE(); } void ILHU(u32 rt, s32 i16) { - WRAPPER_BEGIN(rt, i16, yy, zz); + /*WRAPPER_BEGIN(rt, i16, yy, zz); for (int w = 0; w < 4; w++) CPU.GPR[rt]._i32[w] = (s32)i16 << 16; - WRAPPER_END(rt, i16, 0, 0); - /*XmmVar v0(c); + WRAPPER_END(rt, i16, 0, 0);*/ + XmmVar v0(c); if (i16 == 0) { c.xorps(v0, v0); @@ -1965,7 +2146,7 @@ private: c.movaps(v0, imm_xmm(s19_to_s32[i16 & 0x7ffff])); c.pslld(v0, 16); } - c.movaps(cpu_xmm(GPR[rt]), v0);*/ + c.movaps(cpu_xmm(GPR[rt]), v0); } void ILH(u32 rt, s32 i16) { @@ -1976,11 +2157,11 @@ private: } void IOHL(u32 rt, s32 i16) { - WRAPPER_BEGIN(rt, i16, yy, zz); + /*WRAPPER_BEGIN(rt, i16, yy, zz); for (int w = 0; w < 4; w++) 
CPU.GPR[rt]._i32[w] |= (i16 & 0xFFFF); - WRAPPER_END(rt, i16, 0, 0); - /*XmmVar v0(c); + WRAPPER_END(rt, i16, 0, 0);*/ + XmmVar v0(c); if (i16 == 0) { // nop @@ -1990,18 +2171,19 @@ private: c.movaps(v0, cpu_xmm(GPR[rt])); c.orps(v0, imm_xmm(s19_to_s32[i16 & 0xffff])); c.movaps(cpu_xmm(GPR[rt]), v0); - }*/ + } + LOG_OPCODE(); } //0 - 7 void ORI(u32 rt, u32 ra, s32 i10) { - WRAPPER_BEGIN(rt, ra, i10, zz); + /*WRAPPER_BEGIN(rt, ra, i10, zz); for (u32 i = 0; i < 4; ++i) CPU.GPR[rt]._i32[i] = CPU.GPR[ra]._i32[i] | (s32)i10; - WRAPPER_END(rt, ra, i10, 0); - /*XmmVar v0(c); + WRAPPER_END(rt, ra, i10, 0);*/ + XmmVar v0(c); if (i10 == -1) { // fill with 1 @@ -2010,23 +2192,21 @@ private: } else if (i10 == 0) { - if (rt == ra) - { - // nop - } - else + if (rt != ra) { // mov c.movaps(v0, cpu_xmm(GPR[ra])); c.movaps(cpu_xmm(GPR[rt]), v0); } + // else nop } else { c.movaps(v0, cpu_xmm(GPR[ra])); c.orps(v0, imm_xmm(s19_to_s32[i10 & 0x7ffff])); c.movaps(cpu_xmm(GPR[rt]), v0); - }*/ + } + LOG_OPCODE(); } void ORHI(u32 rt, u32 ra, s32 i10) { @@ -2044,11 +2224,26 @@ private: } void SFI(u32 rt, u32 ra, s32 i10) { - WRAPPER_BEGIN(rt, ra, i10, zz); + /*WRAPPER_BEGIN(rt, ra, i10, zz); for (int w = 0; w < 4; w++) CPU.GPR[rt]._i32[w] = (s32)i10 - CPU.GPR[ra]._i32[w]; - WRAPPER_END(rt, ra, i10, 0); - // TODO + WRAPPER_END(rt, ra, i10, 0);*/ + XmmVar v0(c); + if (i10 == 0) + { + c.pxor(v0, v0); + } + else if (i10 == -1) + { + c.pcmpeqd(v0, v0); + } + else + { + c.movdqa(v0, imm_xmm(s19_to_s32[i10 & 0x7ffff])); + } + c.psubd(v0, cpu_xmm(GPR[ra])); + c.movdqa(cpu_xmm(GPR[rt]), v0); + LOG_OPCODE(); } void SFHI(u32 rt, u32 ra, s32 i10) { @@ -2059,11 +2254,34 @@ private: } void ANDI(u32 rt, u32 ra, s32 i10) { - WRAPPER_BEGIN(rt, ra, i10, zz); + /*WRAPPER_BEGIN(rt, ra, i10, zz); for (int w = 0; w < 4; w++) CPU.GPR[rt]._i32[w] = CPU.GPR[ra]._i32[w] & (s32)i10; - WRAPPER_END(rt, ra, i10, 0); - // TODO + WRAPPER_END(rt, ra, i10, 0);*/ + XmmVar v0(c); + if (i10 == 0) + { + // zero + c.xorps(v0, 
v0); + c.movaps(v0, cpu_xmm(GPR[ra])); + } + else if (i10 == -1) + { + // mov + if (ra != rt) + { + c.movaps(v0, cpu_xmm(GPR[ra])); + c.movaps(cpu_xmm(GPR[rt]), v0); + } + // else nop + } + else + { + c.movaps(v0, cpu_xmm(GPR[ra])); + c.andps(v0, imm_xmm(s19_to_s32[i10 & 0x7ffff])); + c.movaps(cpu_xmm(GPR[rt]), v0); + } + LOG_OPCODE(); } void ANDHI(u32 rt, u32 ra, s32 i10) { @@ -2081,25 +2299,22 @@ private: } void AI(u32 rt, u32 ra, s32 i10) { - WRAPPER_BEGIN(rt, ra, i10, zz); + /*WRAPPER_BEGIN(rt, ra, i10, zz); CPU.GPR[rt]._i32[0] = CPU.GPR[ra]._i32[0] + (s32)i10; CPU.GPR[rt]._i32[1] = CPU.GPR[ra]._i32[1] + (s32)i10; CPU.GPR[rt]._i32[2] = CPU.GPR[ra]._i32[2] + (s32)i10; CPU.GPR[rt]._i32[3] = CPU.GPR[ra]._i32[3] + (s32)i10; - WRAPPER_END(rt, ra, i10, 0); - /*XmmVar v0(c); + WRAPPER_END(rt, ra, i10, 0);*/ + XmmVar v0(c); if (i10 == 0) { - if (rt == ra) - { - // nop - } - else + if (rt != ra) { // mov c.movaps(v0, cpu_xmm(GPR[ra])); c.movaps(cpu_xmm(GPR[rt]), v0); } + // else nop } else { @@ -2107,7 +2322,8 @@ private: c.movdqa(v0, cpu_xmm(GPR[ra])); c.paddd(v0, imm_xmm(s19_to_s32[i10 & 0x7ffff])); c.movdqa(cpu_xmm(GPR[rt]), v0); - }*/ + } + LOG_OPCODE(); } void AHI(u32 rt, u32 ra, s32 i10) { @@ -2118,7 +2334,7 @@ private: } void STQD(u32 rt, s32 i10, u32 ra) //i10 is shifted left by 4 while decoding { - WRAPPER_BEGIN(rt, i10, ra, zz); + /*WRAPPER_BEGIN(rt, i10, ra, zz); const u32 lsa = (CPU.GPR[ra]._i32[3] + (s32)i10) & 0x3fff0; if (!CPU.IsGoodLSA(lsa)) { @@ -2128,8 +2344,8 @@ private: } //ConLog.Write("wrapper::STQD (lsa=0x%x): GPR[%d] (0x%llx%llx)", lsa, rt, CPU.GPR[rt]._u64[1], CPU.GPR[rt]._u64[0]); CPU.WriteLS128(lsa, CPU.GPR[rt]._u128); - WRAPPER_END(rt, i10, ra, 0); - /*GpVar lsa(c, kVarTypeUInt32); + WRAPPER_END(rt, i10, ra, 0);*/ + GpVar lsa(c, kVarTypeUInt32); GpVar v0(c, kVarTypeUInt64); GpVar v1(c, kVarTypeUInt64); @@ -2141,11 +2357,12 @@ private: c.bswap(v0); c.bswap(v1); c.mov(qword_ptr(*ls_var, lsa, 0, 0), v1); - c.mov(qword_ptr(*ls_var, lsa, 0, 8), 
v0);*/ + c.mov(qword_ptr(*ls_var, lsa, 0, 8), v0); + LOG_OPCODE(); } void LQD(u32 rt, s32 i10, u32 ra) //i10 is shifted left by 4 while decoding { - WRAPPER_BEGIN(rt, i10, ra, zz); + /*WRAPPER_BEGIN(rt, i10, ra, zz); const u32 lsa = (CPU.GPR[ra]._i32[3] + (s32)i10) & 0x3fff0; if (!CPU.IsGoodLSA(lsa)) { @@ -2155,8 +2372,8 @@ private: } CPU.GPR[rt]._u128 = CPU.ReadLS128(lsa); - WRAPPER_END(rt, i10, ra, 0); - /*GpVar lsa(c, kVarTypeUInt32); + WRAPPER_END(rt, i10, ra, 0);*/ + GpVar lsa(c, kVarTypeUInt32); GpVar v0(c, kVarTypeUInt64); GpVar v1(c, kVarTypeUInt64); @@ -2168,7 +2385,8 @@ private: c.bswap(v0); c.bswap(v1); c.mov(cpu_qword(GPR[rt]._u64[0]), v1); - c.mov(cpu_qword(GPR[rt]._u64[1]), v0);*/ + c.mov(cpu_qword(GPR[rt]._u64[1]), v0); + LOG_OPCODE(); } void XORI(u32 rt, u32 ra, s32 i10) { @@ -2193,11 +2411,15 @@ private: } void CGTI(u32 rt, u32 ra, s32 i10) { - WRAPPER_BEGIN(rt, ra, i10, zz); + /*WRAPPER_BEGIN(rt, ra, i10, zz); for (int w = 0; w < 4; w++) CPU.GPR[rt]._u32[w] = CPU.GPR[ra]._i32[w] > (s32)i10 ? 0xffffffff : 0; - WRAPPER_END(rt, ra, i10, 0); - // TODO + WRAPPER_END(rt, ra, i10, 0);*/ + XmmVar v0(c); + c.movdqa(v0, cpu_xmm(GPR[ra])); + c.pcmpgtd(v0, imm_xmm(s19_to_s32[i10 & 0x7ffff])); + c.movdqa(cpu_xmm(GPR[rt]), v0); + LOG_OPCODE(); } void CGTHI(u32 rt, u32 ra, s32 i10) { @@ -2247,8 +2469,7 @@ private: { c.movdqa(v0, imm_xmm(s19_to_s32[i10 & 0x7ffff])); } - c.psubd(v0, cpu_xmm(GPR[ra])); - c.psrad(v0, 32); + // (not implemented) c.movdqa(cpu_xmm(GPR[rt]), v0); }*/ } @@ -2292,11 +2513,15 @@ private: } void CEQI(u32 rt, u32 ra, s32 i10) { - WRAPPER_BEGIN(rt, ra, i10, zz); + /*WRAPPER_BEGIN(rt, ra, i10, zz); for(u32 i = 0; i < 4; ++i) CPU.GPR[rt]._u32[i] = (CPU.GPR[ra]._i32[i] == (s32)i10) ? 
0xffffffff : 0x00000000; - WRAPPER_END(rt, ra, i10, 0); - // TODO + WRAPPER_END(rt, ra, i10, 0);*/ + XmmVar v0(c); + c.movdqa(v0, cpu_xmm(GPR[ra])); + c.pcmpeqd(v0, imm_xmm(s19_to_s32[i10 & 0x7ffff])); + c.movdqa(cpu_xmm(GPR[rt]), v0); + LOG_OPCODE(); } void CEQHI(u32 rt, u32 ra, s32 i10) { @@ -2333,13 +2558,13 @@ private: } void ILA(u32 rt, u32 i18) { - WRAPPER_BEGIN(rt, i18, yy, zz); + /*WRAPPER_BEGIN(rt, i18, yy, zz); CPU.GPR[rt]._u32[0] = CPU.GPR[rt]._u32[1] = CPU.GPR[rt]._u32[2] = CPU.GPR[rt]._u32[3] = i18 & 0x3FFFF; - WRAPPER_END(rt, i18, 0, 0); - /*XmmVar v0(c); + WRAPPER_END(rt, i18, 0, 0);*/ + XmmVar v0(c); if (i18 == 0) { c.xorps(v0, v0); @@ -2348,28 +2573,29 @@ private: { c.movaps(v0, imm_xmm(s19_to_s32[i18 & 0x3ffff])); } - c.movaps(cpu_xmm(GPR[rt]), v0);*/ + c.movaps(cpu_xmm(GPR[rt]), v0); + LOG_OPCODE(); } //0 - 3 void SELB(u32 rt, u32 ra, u32 rb, u32 rc) { - WRAPPER_BEGIN(rt, ra, rb, rc); + /*WRAPPER_BEGIN(rt, ra, rb, rc); for (u64 i = 0; i < 2; ++i) { CPU.GPR[rt]._u64[i] = (CPU.GPR[rc]._u64[i] & CPU.GPR[rb]._u64[i]) | (~CPU.GPR[rc]._u64[i] & CPU.GPR[ra]._u64[i]); } - WRAPPER_END(rt, ra, rb, rc); - /*XmmVar v0(c); - XmmVar v1(c); - c.movaps(v0, cpu_xmm(GPR[ra])); + WRAPPER_END(rt, ra, rb, rc);*/ + XmmVar v0(c), v1(c); + c.movaps(v0, cpu_xmm(GPR[rb])); c.movaps(v1, cpu_xmm(GPR[rc])); - c.andnps(v0, v1); - c.andps(v1, cpu_xmm(GPR[rb])); + c.andps(v0, v1); + c.andnps(v1, cpu_xmm(GPR[ra])); c.orps(v0, v1); - c.movaps(cpu_xmm(GPR[rt]), v0);*/ + c.movaps(cpu_xmm(GPR[rt]), v0); + LOG_OPCODE(); } void SHUFB(u32 rt, u32 ra, u32 rb, u32 rc) { @@ -2411,32 +2637,87 @@ private: } void FNMS(u32 rt, u32 ra, u32 rb, u32 rc) { - WRAPPER_BEGIN(rt, ra, rb, rc); + /*WRAPPER_BEGIN(rt, ra, rb, rc); CPU.GPR[rt]._f[0] = CPU.GPR[rc]._f[0] - CPU.GPR[ra]._f[0] * CPU.GPR[rb]._f[0]; CPU.GPR[rt]._f[1] = CPU.GPR[rc]._f[1] - CPU.GPR[ra]._f[1] * CPU.GPR[rb]._f[1]; CPU.GPR[rt]._f[2] = CPU.GPR[rc]._f[2] - CPU.GPR[ra]._f[2] * CPU.GPR[rb]._f[2]; CPU.GPR[rt]._f[3] = CPU.GPR[rc]._f[3] - 
CPU.GPR[ra]._f[3] * CPU.GPR[rb]._f[3]; - WRAPPER_END(rt, ra, rb, rc); - /*XmmVar v0(c), v1(c); + WRAPPER_END(rt, ra, rb, rc);*/ + XmmVar v0(c), v1(c); c.movaps(v0, cpu_xmm(GPR[ra])); - c.mulps(v0, cpu_xmm(GPR[rb])); - c.movaps(v1, cpu_xmm(GPR[rc])); + if (ra == rc) + { + c.movaps(v1, v0); + } + else + { + c.movaps(v1, cpu_xmm(GPR[rc])); + } + if (ra == rb) + { + c.mulps(v0, v0); + } + else + { + if (rb == rc) + { + c.mulps(v0, v1); + } + else + { + c.mulps(v0, cpu_xmm(GPR[rb])); + } + } c.subps(v1, v0); - c.movaps(cpu_xmm(GPR[rt]), v1);*/ + c.movaps(cpu_xmm(GPR[rt]), v1); + LOG_OPCODE(); } void FMA(u32 rt, u32 ra, u32 rb, u32 rc) { - WRAPPER_BEGIN(rt, ra, rb, rc); + /*WRAPPER_BEGIN(rt, ra, rb, rc); CPU.GPR[rt]._f[0] = CPU.GPR[ra]._f[0] * CPU.GPR[rb]._f[0] + CPU.GPR[rc]._f[0]; CPU.GPR[rt]._f[1] = CPU.GPR[ra]._f[1] * CPU.GPR[rb]._f[1] + CPU.GPR[rc]._f[1]; CPU.GPR[rt]._f[2] = CPU.GPR[ra]._f[2] * CPU.GPR[rb]._f[2] + CPU.GPR[rc]._f[2]; CPU.GPR[rt]._f[3] = CPU.GPR[ra]._f[3] * CPU.GPR[rb]._f[3] + CPU.GPR[rc]._f[3]; - WRAPPER_END(rt, ra, rb, rc); - /*XmmVar v0(c); + WRAPPER_END(rt, ra, rb, rc);*/ + XmmVar v0(c); c.movaps(v0, cpu_xmm(GPR[ra])); - c.mulps(v0, cpu_xmm(GPR[rb])); - c.addps(v0, cpu_xmm(GPR[rc])); - c.movaps(cpu_xmm(GPR[rt]), v0);*/ + if (ra == rc || rb == rc) + { + XmmVar v1(c); + if (ra == rc) + { + c.movaps(v1, v0); + if (ra == rb) // == rc + { + c.mulps(v0, v0); + c.addps(v0, v1); + } + else + { + c.mulps(v0, cpu_xmm(GPR[rb])); + c.addps(v0, v1); + } + } + else // rb == rc + { + c.movaps(v1, cpu_xmm(GPR[rb])); + c.mulps(v0, v1); + c.addps(v0, v1); + } + } + else if (ra == rb) + { + c.mulps(v0, v0); + c.addps(v0, cpu_xmm(GPR[rc])); + } + else + { + c.mulps(v0, cpu_xmm(GPR[rb])); + c.addps(v0, cpu_xmm(GPR[rc])); + } + c.movaps(cpu_xmm(GPR[rt]), v0); + LOG_OPCODE(); } void FMS(u32 rt, u32 ra, u32 rb, u32 rc) { diff --git a/rpcs3/Emu/Cell/SPURecompilerCore.cpp b/rpcs3/Emu/Cell/SPURecompilerCore.cpp index e7ecf4755e..daf7551c62 100644 --- 
a/rpcs3/Emu/Cell/SPURecompilerCore.cpp +++ b/rpcs3/Emu/Cell/SPURecompilerCore.cpp @@ -29,6 +29,7 @@ void SPURecompilerCore::Compile(u16 pos) { compiler.addFunc(kFuncConvHost, FuncBuilder4()); const u16 start = pos; + u32 excess = 0; entry[start].count = 0; GpVar cpu_var(compiler, kVarTypeIntPtr, "cpu"); @@ -68,6 +69,10 @@ void SPURecompilerCore::Compile(u16 pos) m_enc->do_finalize = true; } bool fin = m_enc->do_finalize; + if (entry[pos].valid == re(opcode)) + { + excess++; + } entry[pos].valid = re(opcode); if (fin) break; @@ -78,6 +83,8 @@ void SPURecompilerCore::Compile(u16 pos) compiler.ret(pos_var); compiler.endFunc(); entry[start].pointer = compiler.make(); + + //ConLog.Write("Compiled: %d (excess %d), ls_addr = 0x%x", entry[start].count, excess, pos * 4); } u8 SPURecompilerCore::DecodeMemory(const u64 address) From 33f7afd6e606ba8005c4e042bcba8ef277a9b29a Mon Sep 17 00:00:00 2001 From: Nekotekina Date: Tue, 8 Apr 2014 19:10:07 +0400 Subject: [PATCH 05/14] Some bugs fixed --- rpcs3/Emu/Cell/SPURecompiler.h | 211 ++++++++++++++++++++++----- rpcs3/Emu/Cell/SPURecompilerCore.cpp | 21 ++- 2 files changed, 196 insertions(+), 36 deletions(-) diff --git a/rpcs3/Emu/Cell/SPURecompiler.h b/rpcs3/Emu/Cell/SPURecompiler.h index cc14ca3435..337cb2a81d 100644 --- a/rpcs3/Emu/Cell/SPURecompiler.h +++ b/rpcs3/Emu/Cell/SPURecompiler.h @@ -83,7 +83,7 @@ class SPURecompilerCore : public CPUDecoder public: SPUInterpreter* inter; JitRuntime runtime; - Compiler compiler; + //Compiler compiler; struct SPURecEntry { @@ -106,6 +106,8 @@ public: virtual u8 DecodeMemory(const u64 address); }; +#define c (*compiler) + #define cpu_xmm(x) oword_ptr(*cpu_var, (sizeof((*(SPUThread*)nullptr).x) == 16) ? offsetof(SPUThread, x) : throw "sizeof("#x") != 16") #define cpu_qword(x) qword_ptr(*cpu_var, (sizeof((*(SPUThread*)nullptr).x) == 8) ? offsetof(SPUThread, x) : throw "sizeof("#x") != 8") #define cpu_dword(x) dword_ptr(*cpu_var, (sizeof((*(SPUThread*)nullptr).x) == 4) ? 
offsetof(SPUThread, x) : throw "sizeof("#x") != 4") @@ -114,7 +116,9 @@ public: #define imm_xmm(x) oword_ptr(*imm_var, offsetof(SPUImmTable, x)) -#define LOG_OPCODE(...) //ConLog.Write(__FUNCTION__ "()" __VA_ARGS__) +#define LOG_OPCODE(...) //ConLog.Write("Compiled "__FUNCTION__"()"__VA_ARGS__) + +#define LOG3_OPCODE(...) // ConLog.Write("Linked "__FUNCTION__"()"__VA_ARGS__) #define WRAPPER_BEGIN(a0, a1, a2, a3) struct opcode_wrapper \ { \ @@ -130,7 +134,7 @@ public: call->setArg(1, imm_u(a1)); \ call->setArg(2, imm_u(a2)); \ call->setArg(3, imm_u(a3)); \ - LOG_OPCODE(); + LOG3_OPCODE(); class SPURecompiler : public SPUOpcodes @@ -138,16 +142,16 @@ class SPURecompiler : public SPUOpcodes private: SPUThread& CPU; SPURecompilerCore& rec; - Compiler& c; public: + Compiler* compiler; bool do_finalize; GpVar* cpu_var; GpVar* ls_var; GpVar* imm_var; GpVar* pos_var; - SPURecompiler(SPUThread& cpu, SPURecompilerCore& rec) : CPU(cpu), rec(rec), c(rec.compiler) + SPURecompiler(SPUThread& cpu, SPURecompilerCore& rec) : CPU(cpu), rec(rec), compiler(nullptr) { } @@ -229,6 +233,7 @@ private: CPU.GPR[rt]._u32[2] = CPU.GPR[rb]._u32[2] - CPU.GPR[ra]._u32[2]; CPU.GPR[rt]._u32[3] = CPU.GPR[rb]._u32[3] - CPU.GPR[ra]._u32[3]; WRAPPER_END(rt, ra, rb, 0);*/ + XmmVar v0(c); if (ra == rb) { @@ -253,6 +258,7 @@ private: CPU.GPR[rt]._u32[2] = CPU.GPR[ra]._u32[2] | CPU.GPR[rb]._u32[2]; CPU.GPR[rt]._u32[3] = CPU.GPR[ra]._u32[3] | CPU.GPR[rb]._u32[3]; WRAPPER_END(rt, ra, rb, 0);*/ + XmmVar v0(c); if (ra == rb) { @@ -275,13 +281,14 @@ private: } void BG(u32 rt, u32 ra, u32 rb) { - /*WRAPPER_BEGIN(rt, ra, rb, zz); + WRAPPER_BEGIN(rt, ra, rb, zz); CPU.GPR[rt]._u32[0] = CPU.GPR[ra]._u32[0] > CPU.GPR[rb]._u32[0] ? 0 : 1; CPU.GPR[rt]._u32[1] = CPU.GPR[ra]._u32[1] > CPU.GPR[rb]._u32[1] ? 0 : 1; CPU.GPR[rt]._u32[2] = CPU.GPR[ra]._u32[2] > CPU.GPR[rb]._u32[2] ? 0 : 1; CPU.GPR[rt]._u32[3] = CPU.GPR[ra]._u32[3] > CPU.GPR[rb]._u32[3] ? 
0 : 1; - WRAPPER_END(rt, ra, rb, 0);*/ - XmmVar v0(c); + WRAPPER_END(rt, ra, rb, 0); + + /*XmmVar v0(c); if (ra == rb) { // load { 1, 1, 1, 1 } @@ -292,13 +299,10 @@ private: { // compare if-greater-then c.movdqa(v0, cpu_xmm(GPR[rb])); - c.psubd(v0, cpu_xmm(GPR[ra])); - c.psrad(v0, 32); - // add 1 - c.paddd(v0, imm_xmm(s19_to_s32[1])); + // (not implemented) c.movdqa(cpu_xmm(GPR[rt]), v0); } - LOG_OPCODE(); + LOG_OPCODE();*/ } void SFH(u32 rt, u32 ra, u32 rb) { @@ -315,6 +319,7 @@ private: CPU.GPR[rt]._u32[2] = ~(CPU.GPR[ra]._u32[2] | CPU.GPR[rb]._u32[2]); CPU.GPR[rt]._u32[3] = ~(CPU.GPR[ra]._u32[3] | CPU.GPR[rb]._u32[3]); WRAPPER_END(rt, ra, rb, 0);*/ + XmmVar v0(c); c.movaps(v0, cpu_xmm(GPR[ra])); if (ra != rb) c.orps(v0, cpu_xmm(GPR[rb])); @@ -364,6 +369,7 @@ private: CPU.GPR[rt]._u32[2] = (CPU.GPR[rb]._u32[2] & 0x3f) > 31 ? 0 : CPU.GPR[ra]._u32[2] << (CPU.GPR[rb]._u32[2] & 0x3f); CPU.GPR[rt]._u32[3] = (CPU.GPR[rb]._u32[3] & 0x3f) > 31 ? 0 : CPU.GPR[ra]._u32[3] << (CPU.GPR[rb]._u32[3] & 0x3f); WRAPPER_END(rt, ra, rb, 0); + // AVX2: masking with 0x3f + VPSLLVD may be better /*for (u32 i = 0; i < 4; i++) { @@ -427,6 +433,7 @@ private: CPU.GPR[rt]._u32[2] = nRot < 32 ? CPU.GPR[ra]._u32[2] >> nRot : 0; CPU.GPR[rt]._u32[3] = nRot < 32 ? CPU.GPR[ra]._u32[3] >> nRot : 0; WRAPPER_END(rt, ra, i7, 0);*/ + const int nRot = (0 - i7) & 0x3f; // !!! 
XmmVar v0(c); if (nRot > 31) @@ -471,6 +478,7 @@ private: for (u32 j = 0; j < 4; ++j) CPU.GPR[rt]._u32[j] = CPU.GPR[ra]._u32[j] << s; WRAPPER_END(rt, ra, i7, 0);*/ + const int s = i7 & 0x3f; XmmVar v0(c); if (s > 31) @@ -538,6 +546,7 @@ private: CPU.GPR[rt]._u32[2] = CPU.GPR[ra]._u32[2] + CPU.GPR[rb]._u32[2]; CPU.GPR[rt]._u32[3] = CPU.GPR[ra]._u32[3] + CPU.GPR[rb]._u32[3]; WRAPPER_END(rt, ra, rb, 0);*/ + XmmVar v0(c); c.movdqa(v0, cpu_xmm(GPR[ra])); if (ra == rb) @@ -559,6 +568,7 @@ private: CPU.GPR[rt]._u32[2] = CPU.GPR[ra]._u32[2] & CPU.GPR[rb]._u32[2]; CPU.GPR[rt]._u32[3] = CPU.GPR[ra]._u32[3] & CPU.GPR[rb]._u32[3]; WRAPPER_END(rt, ra, rb, 0);*/ + XmmVar v0(c); if (ra == rb) { @@ -626,6 +636,7 @@ private: WRAPPER_BEGIN(ra, rt, yy, zz); CPU.WriteChannel(ra, CPU.GPR[rt]); WRAPPER_END(ra, rt, 0, 0); + /*GpVar v(c, kVarTypeUInt32); c.mov(v, cpu_dword(GPR[rt]._u32[3])); switch (ra) @@ -717,7 +728,7 @@ private: } void STQX(u32 rt, u32 ra, u32 rb) { - WRAPPER_BEGIN(rt, ra, rb, zz); + /*WRAPPER_BEGIN(rt, ra, rb, zz); u32 lsa = (CPU.GPR[ra]._u32[3] + CPU.GPR[rb]._u32[3]) & 0x3fff0; if(!CPU.IsGoodLSA(lsa)) { @@ -727,7 +738,28 @@ private: } CPU.WriteLS128(lsa, CPU.GPR[rt]._u128); - WRAPPER_END(rt, ra, rb, 0); + WRAPPER_END(rt, ra, rb, 0);*/ + + GpVar lsa(c, kVarTypeUInt32); + GpVar v0(c, kVarTypeUInt64); + GpVar v1(c, kVarTypeUInt64); + c.mov(lsa, cpu_dword(GPR[ra]._u32[3])); + if (ra == rb) + { + c.add(lsa, lsa); + } + else + { + c.add(lsa, cpu_dword(GPR[rb]._u32[3])); + } + c.and_(lsa, 0x3fff0); + c.mov(v0, cpu_qword(GPR[rt]._u64[0])); + c.mov(v1, cpu_qword(GPR[rt]._u64[1])); + c.bswap(v0); + c.bswap(v1); + c.mov(qword_ptr(*ls_var, lsa, 0, 0), v1); + c.mov(qword_ptr(*ls_var, lsa, 0, 8), v0); + LOG_OPCODE(); } void BI(u32 ra) { @@ -829,6 +861,7 @@ private: for (int i = 0; i < 4; i++) CPU.GPR[rt]._f[i] = 1 / CPU.GPR[ra]._f[i]; WRAPPER_END(rt, ra, 0, 0);*/ + XmmVar v0(c); c.rcpps(v0, cpu_xmm(GPR[ra])); c.movaps(cpu_xmm(GPR[rt]), v0); @@ -840,16 +873,17 @@ private: for (int 
i = 0; i < 4; i++) CPU.GPR[rt]._f[i] = 1 / sqrt(abs(CPU.GPR[ra]._f[i])); WRAPPER_END(rt, ra, 0, 0);*/ + XmmVar v0(c); c.movaps(v0, cpu_xmm(GPR[ra])); - c.andps(v0, imm_xmm(max_int)); + c.andps(v0, imm_xmm(max_int)); // abs c.rsqrtps(v0, v0); c.movaps(cpu_xmm(GPR[rt]), v0); LOG_OPCODE(); } void LQX(u32 rt, u32 ra, u32 rb) { - WRAPPER_BEGIN(rt, ra, rb, zz); + /*WRAPPER_BEGIN(rt, ra, rb, zz); u32 a = CPU.GPR[ra]._u32[3], b = CPU.GPR[rb]._u32[3]; u32 lsa = (a + b) & 0x3fff0; @@ -862,7 +896,28 @@ private: } CPU.GPR[rt]._u128 = CPU.ReadLS128(lsa); - WRAPPER_END(rt, ra, rb, 0); + WRAPPER_END(rt, ra, rb, 0);*/ + + GpVar lsa(c, kVarTypeUInt32); + GpVar v0(c, kVarTypeUInt64); + GpVar v1(c, kVarTypeUInt64); + c.mov(lsa, cpu_dword(GPR[ra]._u32[3])); + if (ra == rb) + { + c.add(lsa, lsa); + } + else + { + c.add(lsa, cpu_dword(GPR[rb]._u32[3])); + } + c.and_(lsa, 0x3fff0); + c.mov(v0, qword_ptr(*ls_var, lsa, 0, 0)); + c.mov(v1, qword_ptr(*ls_var, lsa, 0, 8)); + c.bswap(v0); + c.bswap(v1); + c.mov(cpu_qword(GPR[rt]._u64[0]), v1); + c.mov(cpu_qword(GPR[rt]._u64[1]), v0); + LOG_OPCODE(); } void ROTQBYBI(u32 rt, u32 ra, u32 rb) { @@ -1085,14 +1140,29 @@ private: for (int b = 0; b < 16; b++) CPU.GPR[rt]._u8[b] = temp._u8[(b - s) & 0xf]; WRAPPER_END(rt, ra, i7, 0);*/ + const int s = i7 & 0xf; - XmmVar v0(c), v1(c); - c.movdqa(v0, cpu_xmm(GPR[ra])); - c.movdqa(v1, v0); - c.pslldq(v0, s); - c.psrldq(v1, 0xf - s); - c.por(v0, v1); - c.movdqa(cpu_xmm(GPR[rt]), v0); + XmmVar v0(c); + if (s == 0) + { + // mov + if (ra != rt) + { + c.movaps(v0, cpu_xmm(GPR[ra])); + c.movaps(cpu_xmm(GPR[rt]), v0); + } + // else nop + } + else + { + XmmVar v1(c); + c.movdqa(v0, cpu_xmm(GPR[ra])); + c.movdqa(v1, v0); + c.pslldq(v0, s); + c.psrldq(v1, 16 - s); + c.por(v0, v1); + c.movdqa(cpu_xmm(GPR[rt]), v0); + } LOG_OPCODE(); } void ROTQMBYI(u32 rt, u32 ra, s32 i7) @@ -1104,6 +1174,7 @@ private: for (int b = 0; b < 16 - s; b++) CPU.GPR[rt]._u8[b] = temp._u8[b + s]; WRAPPER_END(rt, ra, i7, 0);*/ + const int s = 
(0 - i7) & 0x1f; XmmVar v0(c); if (s == 0) @@ -1140,6 +1211,7 @@ private: for (int b = s; b < 16; b++) CPU.GPR[rt]._u8[b] = temp._u8[b - s]; WRAPPER_END(rt, ra, i7, 0);*/ + const int s = i7 & 0x1f; XmmVar v0(c); if (s == 0) @@ -1180,10 +1252,26 @@ private: } void XOR(u32 rt, u32 ra, u32 rb) { - WRAPPER_BEGIN(rt, ra, rb, zz); + /*WRAPPER_BEGIN(rt, ra, rb, zz); for (int w = 0; w < 4; w++) CPU.GPR[rt]._u32[w] = CPU.GPR[ra]._u32[w] ^ CPU.GPR[rb]._u32[w]; - WRAPPER_END(rt, ra, rb, 0); + WRAPPER_END(rt, ra, rb, 0);*/ + + XmmVar v0(c); + if (ra == rb) + { + // zero + c.xorps(v0, v0); + c.movaps(cpu_xmm(GPR[rt]), v0); + } + else + { + // xor + c.movaps(v0, cpu_xmm(GPR[ra])); + c.xorps(v0, cpu_xmm(GPR[rb])); + c.movaps(cpu_xmm(GPR[rt]), v0); + } + LOG_OPCODE(); } void CGTH(u32 rt, u32 ra, u32 rb) { @@ -1281,6 +1369,7 @@ private: CPU.GPR[rt]._u32[i] = (CPU.GPR[ra]._u32[i] > CPU.GPR[rb]._u32[i]) ? 0xffffffff : 0x00000000; } WRAPPER_END(rt, ra, rb, 0); + /*XmmVar v0(c); if (ra == rb) { @@ -1304,6 +1393,7 @@ private: for (int w = 0; w < 4; w++) CPU.GPR[rt]._u32[w] = CPU.GPR[ra]._u32[w] & (~CPU.GPR[rb]._u32[w]); WRAPPER_END(rt, ra, rb, 0);*/ + XmmVar v0(c); if (ra == rb) { @@ -1328,6 +1418,7 @@ private: CPU.GPR[rt]._u32[2] = CPU.GPR[ra]._f[2] > CPU.GPR[rb]._f[2] ? 0xffffffff : 0; CPU.GPR[rt]._u32[3] = CPU.GPR[ra]._f[3] > CPU.GPR[rb]._f[3] ? 
0xffffffff : 0; WRAPPER_END(rt, ra, rb, 0);*/ + XmmVar v0(c); if (ra == rb) { @@ -1359,6 +1450,7 @@ private: CPU.GPR[rt]._f[2] = CPU.GPR[ra]._f[2] + CPU.GPR[rb]._f[2]; CPU.GPR[rt]._f[3] = CPU.GPR[ra]._f[3] + CPU.GPR[rb]._f[3]; WRAPPER_END(rt, ra, rb, 0);*/ + XmmVar v0(c); c.movaps(v0, cpu_xmm(GPR[ra])); if (ra == rb) @@ -1380,6 +1472,7 @@ private: CPU.GPR[rt]._f[2] = CPU.GPR[ra]._f[2] - CPU.GPR[rb]._f[2]; CPU.GPR[rt]._f[3] = CPU.GPR[ra]._f[3] - CPU.GPR[rb]._f[3]; WRAPPER_END(rt, ra, rb, 0);*/ + XmmVar v0(c); if (ra == rb) { @@ -1403,6 +1496,7 @@ private: CPU.GPR[rt]._f[2] = CPU.GPR[ra]._f[2] * CPU.GPR[rb]._f[2]; CPU.GPR[rt]._f[3] = CPU.GPR[ra]._f[3] * CPU.GPR[rb]._f[3]; WRAPPER_END(rt, ra, rb, 0);*/ + XmmVar v0(c); c.movaps(v0, cpu_xmm(GPR[ra])); if (ra == rb) @@ -1530,6 +1624,7 @@ private: for (int w = 0; w < 4; w++) CPU.GPR[rt]._u32[w] = CPU.GPR[ra]._u32[w] + CPU.GPR[rb]._u32[w] + (CPU.GPR[rt]._u32[w] & 1); WRAPPER_END(rt, ra, rb, 0);*/ + XmmVar v0(c); c.movdqa(v0, cpu_xmm(GPR[rt])); c.pand(v0, imm_xmm(s19_to_s32[1])); @@ -1544,6 +1639,7 @@ private: for (int w = 0; w < 4; w++) CPU.GPR[rt]._u32[w] = CPU.GPR[rb]._u32[w] - CPU.GPR[ra]._u32[w] - (1 - (CPU.GPR[rt]._u32[w] & 1)); WRAPPER_END(rt, ra, rb, 0);*/ + XmmVar v0(c), v1(c); c.movdqa(v1, cpu_xmm(GPR[rt])); c.pandn(v1, imm_xmm(s19_to_s32[1])); @@ -1753,6 +1849,7 @@ private: for (int w = 0; w < 4; w++) CPU.GPR[rt]._u32[w] = CPU.GPR[ra]._u16[w*2] * CPU.GPR[rb]._u16[w*2]; WRAPPER_END(rt, ra, rb, 0);*/ + XmmVar v0(c); c.movdqa(v0, cpu_xmm(GPR[ra])); if (ra == rb) @@ -1783,6 +1880,7 @@ private: /*WRAPPER_BEGIN(rt, ra, rb, zz); CPU.GPR[rt] = CPU.GPR[rb]; WRAPPER_END(rt, ra, rb, 0);*/ + XmmVar v0(c); c.movaps(v0, cpu_xmm(GPR[rb])); c.movaps(cpu_xmm(GPR[rt]), v0); @@ -1814,6 +1912,7 @@ private: CPU.GPR[rt]._u32[i] = (u32)CPU.GPR[rt]._f[i]; //trunc } WRAPPER_END(rt, ra, i8, 0);*/ + XmmVar v0(c); c.movaps(v0, cpu_xmm(GPR[ra])); if (i8 != 173) @@ -1848,6 +1947,7 @@ private: } } WRAPPER_END(rt, ra, i8, 0); + /*XmmVar v0(c); 
c.movaps(v0, cpu_xmm(GPR[ra])); if (i8 != 173) @@ -1876,6 +1976,7 @@ private: CPU.GPR[rt]._u32[i] = (CPU.GPR[rt]._u32[i] & 0x807fffff) | (exp << 23); } WRAPPER_END(rt, ra, i8, 0);*/ + XmmVar v0(c); c.movdqa(v0, cpu_xmm(GPR[ra])); c.cvtdq2ps(v0, v0); // convert to floats @@ -1901,6 +2002,7 @@ private: CPU.GPR[rt]._u32[i] = (CPU.GPR[rt]._u32[i] & 0x807fffff) | (exp << 23); } WRAPPER_END(rt, ra, i8, 0); + /*XmmVar v0(c); c.movdqa(v0, cpu_xmm(GPR[ra])); // TODO: convert from unsigned value @@ -1929,7 +2031,7 @@ private: } void STQA(u32 rt, s32 i16) { - WRAPPER_BEGIN(rt, i16, yy, zz); + /*WRAPPER_BEGIN(rt, i16, yy, zz); u32 lsa = (i16 << 2) & 0x3fff0; if(!CPU.IsGoodLSA(lsa)) { @@ -1939,7 +2041,18 @@ private: } CPU.WriteLS128(lsa, CPU.GPR[rt]._u128); - WRAPPER_END(rt, i16, 0, 0); + WRAPPER_END(rt, i16, 0, 0);*/ + + u32 lsa = (i16 << 2) & 0x3fff0; + GpVar v0(c, kVarTypeUInt64); + GpVar v1(c, kVarTypeUInt64); + c.mov(v0, cpu_qword(GPR[rt]._u64[0])); + c.mov(v1, cpu_qword(GPR[rt]._u64[1])); + c.bswap(v0); + c.bswap(v1); + c.mov(qword_ptr(*ls_var, lsa), v1); + c.mov(qword_ptr(*ls_var, lsa + 8), v0); + LOG_OPCODE(); } void BRNZ(u32 rt, s32 i16) { @@ -1989,6 +2102,7 @@ private: } CPU.WriteLS128(lsa, CPU.GPR[rt]._u128); WRAPPER_END(rt, i16, CPU.PC, 0);*/ + u32 lsa = branchTarget(CPU.PC, i16) & 0x3fff0; GpVar v0(c, kVarTypeUInt64); GpVar v1(c, kVarTypeUInt64); @@ -2010,7 +2124,7 @@ private: } void LQA(u32 rt, s32 i16) { - WRAPPER_BEGIN(rt, i16, yy, zz); + /*WRAPPER_BEGIN(rt, i16, yy, zz); u32 lsa = (i16 << 2) & 0x3fff0; if(!CPU.IsGoodLSA(lsa)) { @@ -2020,7 +2134,18 @@ private: } CPU.GPR[rt]._u128 = CPU.ReadLS128(lsa); - WRAPPER_END(rt, i16, 0, 0); + WRAPPER_END(rt, i16, 0, 0);*/ + + u32 lsa = (i16 << 2) & 0x3fff0; + GpVar v0(c, kVarTypeUInt64); + GpVar v1(c, kVarTypeUInt64); + c.mov(v0, qword_ptr(*ls_var, lsa)); + c.mov(v1, qword_ptr(*ls_var, lsa + 8)); + c.bswap(v0); + c.bswap(v1); + c.mov(cpu_qword(GPR[rt]._u64[0]), v1); + c.mov(cpu_qword(GPR[rt]._u64[1]), v0); + LOG_OPCODE(); 
} void BRASL(u32 rt, s32 i16) { @@ -2060,6 +2185,7 @@ private: } } WRAPPER_END(rt, i16, 0, 0);*/ + XmmVar v0(c); c.movaps(v0, imm_xmm(fsmbi_mask[i16 & 0xffff])); c.movaps(cpu_xmm(GPR[rt]), v0); @@ -2090,6 +2216,7 @@ private: } CPU.GPR[rt]._u128 = CPU.ReadLS128(lsa); WRAPPER_END(rt, i16, CPU.PC, 0);*/ + u32 lsa = branchTarget(CPU.PC, i16) & 0x3fff0; GpVar v0(c, kVarTypeUInt64); GpVar v1(c, kVarTypeUInt64); @@ -2109,6 +2236,7 @@ private: CPU.GPR[rt]._i32[2] = CPU.GPR[rt]._i32[3] = (s32)i16; WRAPPER_END(rt, i16, 0, 0);*/ + XmmVar v0(c); if (i16 == 0) { @@ -2131,6 +2259,7 @@ private: for (int w = 0; w < 4; w++) CPU.GPR[rt]._i32[w] = (s32)i16 << 16; WRAPPER_END(rt, i16, 0, 0);*/ + XmmVar v0(c); if (i16 == 0) { @@ -2161,6 +2290,7 @@ private: for (int w = 0; w < 4; w++) CPU.GPR[rt]._i32[w] |= (i16 & 0xFFFF); WRAPPER_END(rt, i16, 0, 0);*/ + XmmVar v0(c); if (i16 == 0) { @@ -2183,6 +2313,7 @@ private: for (u32 i = 0; i < 4; ++i) CPU.GPR[rt]._i32[i] = CPU.GPR[ra]._i32[i] | (s32)i10; WRAPPER_END(rt, ra, i10, 0);*/ + XmmVar v0(c); if (i10 == -1) { @@ -2228,6 +2359,7 @@ private: for (int w = 0; w < 4; w++) CPU.GPR[rt]._i32[w] = (s32)i10 - CPU.GPR[ra]._i32[w]; WRAPPER_END(rt, ra, i10, 0);*/ + XmmVar v0(c); if (i10 == 0) { @@ -2258,6 +2390,7 @@ private: for (int w = 0; w < 4; w++) CPU.GPR[rt]._i32[w] = CPU.GPR[ra]._i32[w] & (s32)i10; WRAPPER_END(rt, ra, i10, 0);*/ + XmmVar v0(c); if (i10 == 0) { @@ -2305,6 +2438,7 @@ private: CPU.GPR[rt]._i32[2] = CPU.GPR[ra]._i32[2] + (s32)i10; CPU.GPR[rt]._i32[3] = CPU.GPR[ra]._i32[3] + (s32)i10; WRAPPER_END(rt, ra, i10, 0);*/ + XmmVar v0(c); if (i10 == 0) { @@ -2345,10 +2479,10 @@ private: //ConLog.Write("wrapper::STQD (lsa=0x%x): GPR[%d] (0x%llx%llx)", lsa, rt, CPU.GPR[rt]._u64[1], CPU.GPR[rt]._u64[0]); CPU.WriteLS128(lsa, CPU.GPR[rt]._u128); WRAPPER_END(rt, i10, ra, 0);*/ + GpVar lsa(c, kVarTypeUInt32); GpVar v0(c, kVarTypeUInt64); GpVar v1(c, kVarTypeUInt64); - c.mov(lsa, cpu_dword(GPR[ra]._u32[3])); if (i10) c.add(lsa, i10); c.and_(lsa, 
0x3fff0); @@ -2373,10 +2507,10 @@ private: CPU.GPR[rt]._u128 = CPU.ReadLS128(lsa); WRAPPER_END(rt, i10, ra, 0);*/ + GpVar lsa(c, kVarTypeUInt32); GpVar v0(c, kVarTypeUInt64); GpVar v1(c, kVarTypeUInt64); - c.mov(lsa, cpu_dword(GPR[ra]._u32[3])); if (i10) c.add(lsa, i10); c.and_(lsa, 0x3fff0); @@ -2415,6 +2549,7 @@ private: for (int w = 0; w < 4; w++) CPU.GPR[rt]._u32[w] = CPU.GPR[ra]._i32[w] > (s32)i10 ? 0xffffffff : 0; WRAPPER_END(rt, ra, i10, 0);*/ + XmmVar v0(c); c.movdqa(v0, cpu_xmm(GPR[ra])); c.pcmpgtd(v0, imm_xmm(s19_to_s32[i10 & 0x7ffff])); @@ -2451,6 +2586,7 @@ private: CPU.GPR[rt]._u32[i] = (CPU.GPR[ra]._u32[i] > (u32)i10) ? 0xffffffff : 0x00000000; } WRAPPER_END(rt, ra, i10, 0); + /*XmmVar v0(c); if (i10 == -1) { @@ -2517,6 +2653,7 @@ private: for(u32 i = 0; i < 4; ++i) CPU.GPR[rt]._u32[i] = (CPU.GPR[ra]._i32[i] == (s32)i10) ? 0xffffffff : 0x00000000; WRAPPER_END(rt, ra, i10, 0);*/ + XmmVar v0(c); c.movdqa(v0, cpu_xmm(GPR[ra])); c.pcmpeqd(v0, imm_xmm(s19_to_s32[i10 & 0x7ffff])); @@ -2564,6 +2701,7 @@ private: CPU.GPR[rt]._u32[2] = CPU.GPR[rt]._u32[3] = i18 & 0x3FFFF; WRAPPER_END(rt, i18, 0, 0);*/ + XmmVar v0(c); if (i18 == 0) { @@ -2588,6 +2726,7 @@ private: (~CPU.GPR[rc]._u64[i] & CPU.GPR[ra]._u64[i]); } WRAPPER_END(rt, ra, rb, rc);*/ + XmmVar v0(c), v1(c); c.movaps(v0, cpu_xmm(GPR[rb])); c.movaps(v1, cpu_xmm(GPR[rc])); @@ -2643,6 +2782,7 @@ private: CPU.GPR[rt]._f[2] = CPU.GPR[rc]._f[2] - CPU.GPR[ra]._f[2] * CPU.GPR[rb]._f[2]; CPU.GPR[rt]._f[3] = CPU.GPR[rc]._f[3] - CPU.GPR[ra]._f[3] * CPU.GPR[rb]._f[3]; WRAPPER_END(rt, ra, rb, rc);*/ + XmmVar v0(c), v1(c); c.movaps(v0, cpu_xmm(GPR[ra])); if (ra == rc) @@ -2680,6 +2820,7 @@ private: CPU.GPR[rt]._f[2] = CPU.GPR[ra]._f[2] * CPU.GPR[rb]._f[2] + CPU.GPR[rc]._f[2]; CPU.GPR[rt]._f[3] = CPU.GPR[ra]._f[3] * CPU.GPR[rb]._f[3] + CPU.GPR[rc]._f[3]; WRAPPER_END(rt, ra, rb, rc);*/ + XmmVar v0(c); c.movaps(v0, cpu_xmm(GPR[ra])); if (ra == rc || rb == rc) @@ -2741,4 +2882,6 @@ private: do_finalize = true; Emu.Pause(); 
} -}; \ No newline at end of file +}; + +#undef c \ No newline at end of file diff --git a/rpcs3/Emu/Cell/SPURecompilerCore.cpp b/rpcs3/Emu/Cell/SPURecompilerCore.cpp index daf7551c62..120d55af8e 100644 --- a/rpcs3/Emu/Cell/SPURecompilerCore.cpp +++ b/rpcs3/Emu/Cell/SPURecompilerCore.cpp @@ -9,7 +9,7 @@ SPURecompilerCore::SPURecompilerCore(SPUThread& cpu) : m_enc(new SPURecompiler(cpu, *this)) , inter(new SPUInterpreter(cpu)) , CPU(cpu) -, compiler(&runtime) +//, compiler(&runtime) { memset(entry, 0, sizeof(entry)); } @@ -27,6 +27,12 @@ void SPURecompilerCore::Decode(const u32 code) // decode instruction and run wit void SPURecompilerCore::Compile(u16 pos) { + const u64 stamp0 = get_system_time(); + u64 time0 = 0; + + Compiler compiler(&runtime); + m_enc->compiler = &compiler; + compiler.addFunc(kFuncConvHost, FuncBuilder4()); const u16 start = pos; u32 excess = 0; @@ -61,8 +67,16 @@ void SPURecompilerCore::Compile(u16 pos) m_enc->do_finalize = false; if (opcode) { + const u64 stamp1 = get_system_time(); (*SPU_instr::rrr_list)(m_enc, opcode); // compile single opcode + /*if ((pos % 128 == 127) && !m_enc->do_finalize) + { + // force finalization between every slice using absolute alignment + compiler.mov(pos_var, pos + 1); + m_enc->do_finalize = true; + }*/ entry[start].count++; + time0 += get_system_time() - stamp1; } else { @@ -80,11 +94,14 @@ void SPURecompilerCore::Compile(u16 pos) pos++; } + const u64 stamp1 = get_system_time(); compiler.ret(pos_var); compiler.endFunc(); entry[start].pointer = compiler.make(); - //ConLog.Write("Compiled: %d (excess %d), ls_addr = 0x%x", entry[start].count, excess, pos * 4); + //ConLog.Write("Compiled: %d (excess %d), addr=0x%x, time: [start=%d (decoding=%d), finalize=%d]", + //entry[start].count, excess, start * 4, stamp1 - stamp0, time0, get_system_time() - stamp1); + m_enc->compiler = nullptr; } u8 SPURecompilerCore::DecodeMemory(const u64 address) From 180f8aac5d2d64c79703503fb7c3e504640d9f9a Mon Sep 17 00:00:00 2001 From: 
Nekotekina Date: Sat, 12 Apr 2014 14:27:27 +0400 Subject: [PATCH 06/14] SPU Interpreter fix --- asmjit | 2 +- rpcs3/Emu/Cell/SPUInterpreter.h | 39 ++-- rpcs3/Emu/Cell/SPURecompiler.h | 207 ++++++++++++++++----- rpcs3/Emu/Io/PadHandler.h | 1 + rpcs3/Emu/SysCalls/Modules/cellSysutil.cpp | 4 +- rpcs3/Emu/SysCalls/lv2/SC_SPU_Thread.cpp | 4 +- rpcs3/rpcs3.vcxproj.filters | 4 +- 7 files changed, 194 insertions(+), 67 deletions(-) diff --git a/asmjit b/asmjit index 5ac69447dc..906f89bfc5 160000 --- a/asmjit +++ b/asmjit @@ -1 +1 @@ -Subproject commit 5ac69447dc2b7bca332be552cbe747051641f9e9 +Subproject commit 906f89bfc59138f0e4c7c43551f16f8c43887572 diff --git a/rpcs3/Emu/Cell/SPUInterpreter.h b/rpcs3/Emu/Cell/SPUInterpreter.h index 353cccb537..66f259e591 100644 --- a/rpcs3/Emu/Cell/SPUInterpreter.h +++ b/rpcs3/Emu/Cell/SPUInterpreter.h @@ -15,7 +15,12 @@ __m128d m128d; } __u32x4; */ -#define LOG2_OPCODE(...) //unsigned char cs[20]; sha1(&Memory[CPU.dmac.ls_offset], 256*1024, cs); ConLog.Write("Mem Dump: 0x%llx", *(u64*)cs); ConLog.Write(__FUNCTION__ "(): " __VA_ARGS__) +#define MEM_AND_REG_HASH() \ + unsigned char mem_h[20]; sha1(&Memory[CPU.dmac.ls_offset], 256*1024, mem_h); \ + unsigned char reg_h[20]; sha1((const unsigned char*)CPU.GPR, sizeof(CPU.GPR), reg_h); \ + ConLog.Write("Mem hash: 0x%llx, reg hash: 0x%llx", *(u64*)mem_h, *(u64*)reg_h); + +#define LOG2_OPCODE(...) // ConLog.Write(__FUNCTION__ "(): " __VA_ARGS__) class SPUInterpreter : public SPUOpcodes { @@ -120,17 +125,17 @@ private: } void ROTM(u32 rt, u32 ra, u32 rb) { - CPU.GPR[rt]._u32[0] = ((0 - CPU.GPR[rb]._u32[0]) % 64) < 32 ? CPU.GPR[ra]._u32[0] >> ((0 - CPU.GPR[rb]._u32[0]) % 64) : 0; - CPU.GPR[rt]._u32[1] = ((0 - CPU.GPR[rb]._u32[1]) % 64) < 32 ? CPU.GPR[ra]._u32[1] >> ((0 - CPU.GPR[rb]._u32[1]) % 64) : 0; - CPU.GPR[rt]._u32[2] = ((0 - CPU.GPR[rb]._u32[2]) % 64) < 32 ? CPU.GPR[ra]._u32[2] >> ((0 - CPU.GPR[rb]._u32[2]) % 64) : 0; - CPU.GPR[rt]._u32[3] = ((0 - CPU.GPR[rb]._u32[3]) % 64) < 32 ? 
CPU.GPR[ra]._u32[3] >> ((0 - CPU.GPR[rb]._u32[3]) % 64) : 0; + CPU.GPR[rt]._u32[0] = ((0 - CPU.GPR[rb]._u32[0]) & 0x3f) < 32 ? CPU.GPR[ra]._u32[0] >> ((0 - CPU.GPR[rb]._u32[0]) & 0x3f) : 0; + CPU.GPR[rt]._u32[1] = ((0 - CPU.GPR[rb]._u32[1]) & 0x3f) < 32 ? CPU.GPR[ra]._u32[1] >> ((0 - CPU.GPR[rb]._u32[1]) & 0x3f) : 0; + CPU.GPR[rt]._u32[2] = ((0 - CPU.GPR[rb]._u32[2]) & 0x3f) < 32 ? CPU.GPR[ra]._u32[2] >> ((0 - CPU.GPR[rb]._u32[2]) & 0x3f) : 0; + CPU.GPR[rt]._u32[3] = ((0 - CPU.GPR[rb]._u32[3]) & 0x3f) < 32 ? CPU.GPR[ra]._u32[3] >> ((0 - CPU.GPR[rb]._u32[3]) & 0x3f) : 0; } void ROTMA(u32 rt, u32 ra, u32 rb) { - CPU.GPR[rt]._i32[0] = ((0 - CPU.GPR[rb]._i32[0]) % 64) < 32 ? CPU.GPR[ra]._i32[0] >> ((0 - CPU.GPR[rb]._i32[0]) % 64) : CPU.GPR[ra]._i32[0] >> 31; - CPU.GPR[rt]._i32[1] = ((0 - CPU.GPR[rb]._i32[1]) % 64) < 32 ? CPU.GPR[ra]._i32[1] >> ((0 - CPU.GPR[rb]._i32[1]) % 64) : CPU.GPR[ra]._i32[1] >> 31; - CPU.GPR[rt]._i32[2] = ((0 - CPU.GPR[rb]._i32[2]) % 64) < 32 ? CPU.GPR[ra]._i32[2] >> ((0 - CPU.GPR[rb]._i32[2]) % 64) : CPU.GPR[ra]._i32[2] >> 31; - CPU.GPR[rt]._i32[3] = ((0 - CPU.GPR[rb]._i32[3]) % 64) < 32 ? CPU.GPR[ra]._i32[3] >> ((0 - CPU.GPR[rb]._i32[3]) % 64) : CPU.GPR[ra]._i32[3] >> 31; + CPU.GPR[rt]._i32[0] = ((0 - CPU.GPR[rb]._u32[0]) & 0x3f) < 32 ? CPU.GPR[ra]._i32[0] >> ((0 - CPU.GPR[rb]._u32[0]) & 0x3f) : CPU.GPR[ra]._i32[0] >> 31; + CPU.GPR[rt]._i32[1] = ((0 - CPU.GPR[rb]._u32[1]) & 0x3f) < 32 ? CPU.GPR[ra]._i32[1] >> ((0 - CPU.GPR[rb]._u32[1]) & 0x3f) : CPU.GPR[ra]._i32[1] >> 31; + CPU.GPR[rt]._i32[2] = ((0 - CPU.GPR[rb]._u32[2]) & 0x3f) < 32 ? CPU.GPR[ra]._i32[2] >> ((0 - CPU.GPR[rb]._u32[2]) & 0x3f) : CPU.GPR[ra]._i32[2] >> 31; + CPU.GPR[rt]._i32[3] = ((0 - CPU.GPR[rb]._u32[3]) & 0x3f) < 32 ? 
CPU.GPR[ra]._i32[3] >> ((0 - CPU.GPR[rb]._u32[3]) & 0x3f) : CPU.GPR[ra]._i32[3] >> 31; } void SHL(u32 rt, u32 ra, u32 rb) { @@ -147,12 +152,12 @@ private: void ROTHM(u32 rt, u32 ra, u32 rb) { for (int h = 0; h < 8; h++) - CPU.GPR[rt]._u16[h] = ((0 - CPU.GPR[rb]._u16[h]) % 32) < 16 ? CPU.GPR[ra]._u16[h] >> ((0 - CPU.GPR[rb]._u16[h]) % 32) : 0; + CPU.GPR[rt]._u16[h] = ((0 - CPU.GPR[rb]._u16[h]) & 0x1f) < 16 ? CPU.GPR[ra]._u16[h] >> ((0 - CPU.GPR[rb]._u16[h]) & 0x1f) : 0; } void ROTMAH(u32 rt, u32 ra, u32 rb) { for (int h = 0; h < 8; h++) - CPU.GPR[rt]._i16[h] = ((0 - CPU.GPR[rb]._i16[h]) % 32) < 16 ? CPU.GPR[ra]._i16[h] >> ((0 - CPU.GPR[rb]._i16[h]) % 32) : CPU.GPR[ra]._i16[h] >> 15; + CPU.GPR[rt]._i16[h] = ((0 - CPU.GPR[rb]._u16[h]) & 0x1f) < 16 ? CPU.GPR[ra]._i16[h] >> ((0 - CPU.GPR[rb]._u16[h]) & 0x1f) : CPU.GPR[ra]._i16[h] >> 15; } void SHLH(u32 rt, u32 ra, u32 rb) { @@ -169,7 +174,7 @@ private: } void ROTMI(u32 rt, u32 ra, s32 i7) { - const int nRot = (0 - i7) % 64; + const int nRot = (0 - i7) & 0x3f; CPU.GPR[rt]._u32[0] = nRot < 32 ? CPU.GPR[ra]._u32[0] >> nRot : 0; CPU.GPR[rt]._u32[1] = nRot < 32 ? CPU.GPR[ra]._u32[1] >> nRot : 0; CPU.GPR[rt]._u32[2] = nRot < 32 ? CPU.GPR[ra]._u32[2] >> nRot : 0; @@ -177,7 +182,7 @@ private: } void ROTMAI(u32 rt, u32 ra, s32 i7) { - const int nRot = (0 - i7) % 64; + const int nRot = (0 - i7) & 0x3f; CPU.GPR[rt]._i32[0] = nRot < 32 ? CPU.GPR[ra]._i32[0] >> nRot : CPU.GPR[ra]._i32[0] >> 31; CPU.GPR[rt]._i32[1] = nRot < 32 ? CPU.GPR[ra]._i32[1] >> nRot : CPU.GPR[ra]._i32[1] >> 31; CPU.GPR[rt]._i32[2] = nRot < 32 ? CPU.GPR[ra]._i32[2] >> nRot : CPU.GPR[ra]._i32[2] >> 31; @@ -188,7 +193,7 @@ private: const u32 s = i7 & 0x3f; for (u32 j = 0; j < 4; ++j) - CPU.GPR[rt]._u32[j] = CPU.GPR[ra]._u32[j] << s; + CPU.GPR[rt]._u32[j] = (s >= 32) ? 
0 : CPU.GPR[ra]._u32[j] << s; } void ROTHI(u32 rt, u32 ra, s32 i7) { @@ -199,14 +204,14 @@ private: } void ROTHMI(u32 rt, u32 ra, s32 i7) { - const int nRot = (0 - i7) % 32; + const int nRot = (0 - i7) & 0x1f; for (int h = 0; h < 8; h++) CPU.GPR[rt]._u16[h] = nRot < 16 ? CPU.GPR[ra]._u16[h] >> nRot : 0; } void ROTMAHI(u32 rt, u32 ra, s32 i7) { - const int nRot = (0 - i7) % 32; + const int nRot = (0 - i7) & 0x1f; for (int h = 0; h < 8; h++) CPU.GPR[rt]._i16[h] = nRot < 16 ? CPU.GPR[ra]._i16[h] >> nRot : CPU.GPR[ra]._i16[h] >> 15; @@ -216,7 +221,7 @@ private: const int nRot = i7 & 0x1f; for (int h = 0; h < 8; h++) - CPU.GPR[rt]._u16[0] = nRot > 15 ? 0 : CPU.GPR[ra]._u16[0] << nRot; + CPU.GPR[rt]._u16[h] = nRot > 15 ? 0 : CPU.GPR[ra]._u16[h] << nRot; } void A(u32 rt, u32 ra, u32 rb) { diff --git a/rpcs3/Emu/Cell/SPURecompiler.h b/rpcs3/Emu/Cell/SPURecompiler.h index 337cb2a81d..a5b77f7cdd 100644 --- a/rpcs3/Emu/Cell/SPURecompiler.h +++ b/rpcs3/Emu/Cell/SPURecompiler.h @@ -18,6 +18,7 @@ struct SPUImmTable { __m128i s19_to_s32[1 << 19]; __m128i fsmbi_mask[1 << 16]; + __m128i u8_to_u8[256]; __m128 scale_to_float[256]; __m128 scale_to_int[256]; __m128i min_int; @@ -70,9 +71,19 @@ struct SPUImmTable max_int.m128i_u32[1] = 0x7fffffff; max_int.m128i_u32[2] = 0x7fffffff; max_int.m128i_u32[3] = 0x7fffffff; + // table for byte consts + for (u32 i = 0; i < sizeof(u8_to_u8) / sizeof(__m128i); i++) + { + for (u32 j = 0; j < 16; j++) + { + u8_to_u8[i].m128i_u8[j] = i; + } + } } }; +extern const SPUImmTable g_spu_imm; + class SPURecompiler; class SPURecompilerCore : public CPUDecoder @@ -118,22 +129,22 @@ public: #define LOG_OPCODE(...) //ConLog.Write("Compiled "__FUNCTION__"()"__VA_ARGS__) -#define LOG3_OPCODE(...) // ConLog.Write("Linked "__FUNCTION__"()"__VA_ARGS__) +#define LOG3_OPCODE(...) 
//ConLog.Write("Linked "__FUNCTION__"()"__VA_ARGS__) -#define WRAPPER_BEGIN(a0, a1, a2, a3) struct opcode_wrapper \ +#define WRAPPER_BEGIN(a0, a1, a2, a3) struct opwr_##a0 \ { \ static void opcode(u32 a0, u32 a1, u32 a2, u32 a3) \ { \ SPUThread& CPU = *(SPUThread*)GetCurrentCPUThread(); -#define WRAPPER_END(a0, a1, a2, a3) LOG2_OPCODE(); } \ +#define WRAPPER_END(a0, a1, a2, a3) /*LOG2_OPCODE();*/ } \ }; \ c.mov(cpu_qword(PC), (u32)CPU.PC); \ - X86X64CallNode* call = c.call(imm_ptr(&opcode_wrapper::opcode), kFuncConvHost, FuncBuilder4()); \ - call->setArg(0, imm_u(a0)); \ - call->setArg(1, imm_u(a1)); \ - call->setArg(2, imm_u(a2)); \ - call->setArg(3, imm_u(a3)); \ + X86X64CallNode* call##a0 = c.call(imm_ptr(&opwr_##a0::opcode), kFuncConvHost, FuncBuilder4()); \ + call##a0->setArg(0, imm_u(a0)); \ + call##a0->setArg(1, imm_u(a1)); \ + call##a0->setArg(2, imm_u(a2)); \ + call##a0->setArg(3, imm_u(a3)); \ LOG3_OPCODE(); @@ -151,7 +162,10 @@ public: GpVar* imm_var; GpVar* pos_var; - SPURecompiler(SPUThread& cpu, SPURecompilerCore& rec) : CPU(cpu), rec(rec), compiler(nullptr) + SPURecompiler(SPUThread& cpu, SPURecompilerCore& rec) + : CPU(cpu) + , rec(rec) + , compiler(nullptr) { } @@ -291,25 +305,51 @@ private: /*XmmVar v0(c); if (ra == rb) { - // load { 1, 1, 1, 1 } c.movaps(v0, imm_xmm(s19_to_s32[1])); c.movaps(cpu_xmm(GPR[rt]), v0); } else { - // compare if-greater-then - c.movdqa(v0, cpu_xmm(GPR[rb])); - // (not implemented) + XmmVar v1(c), v2(c); + c.movdqa(v0, cpu_xmm(GPR[ra])); + c.movdqa(v1, cpu_xmm(GPR[rb])); + // compare if-greater-than + c.movdqa(cpu_xmm(GPR[rt]), v0); + + // sign bits: + // a b (b-a) -> (result of BG) + // 0 0 0 -> 1 + // 0 0 1 -> 0 + // 0 1 0 -> 1 + // 0 1 1 -> 1 + // 1 0 0 -> 0 + // 1 0 1 -> 0 + // 1 1 0 -> 0 + // 1 1 1 -> 1 } LOG_OPCODE();*/ } void SFH(u32 rt, u32 ra, u32 rb) { - WRAPPER_BEGIN(rt, ra, rb, zz); + /*WRAPPER_BEGIN(rt, ra, rb, zz); for (int h = 0; h < 8; h++) CPU.GPR[rt]._u16[h] = CPU.GPR[rb]._u16[h] - CPU.GPR[ra]._u16[h]; 
- WRAPPER_END(rt, ra, rb, 0); + WRAPPER_END(rt, ra, rb, 0);*/ + + XmmVar v0(c); + if (ra == rb) + { + // zero + c.xorps(v0, v0); + c.movaps(cpu_xmm(GPR[rt]), v0); + } + else + { + c.movaps(v0, cpu_xmm(GPR[rb])); + c.psubw(v0, cpu_xmm(GPR[ra])); + c.movaps(cpu_xmm(GPR[rt]), v0); + } } void NOR(u32 rt, u32 ra, u32 rb) { @@ -346,19 +386,19 @@ private: void ROTM(u32 rt, u32 ra, u32 rb) { WRAPPER_BEGIN(rt, ra, rb, zz); - CPU.GPR[rt]._u32[0] = ((0 - CPU.GPR[rb]._u32[0]) % 64) < 32 ? CPU.GPR[ra]._u32[0] >> ((0 - CPU.GPR[rb]._u32[0]) % 64) : 0; - CPU.GPR[rt]._u32[1] = ((0 - CPU.GPR[rb]._u32[1]) % 64) < 32 ? CPU.GPR[ra]._u32[1] >> ((0 - CPU.GPR[rb]._u32[1]) % 64) : 0; - CPU.GPR[rt]._u32[2] = ((0 - CPU.GPR[rb]._u32[2]) % 64) < 32 ? CPU.GPR[ra]._u32[2] >> ((0 - CPU.GPR[rb]._u32[2]) % 64) : 0; - CPU.GPR[rt]._u32[3] = ((0 - CPU.GPR[rb]._u32[3]) % 64) < 32 ? CPU.GPR[ra]._u32[3] >> ((0 - CPU.GPR[rb]._u32[3]) % 64) : 0; + CPU.GPR[rt]._u32[0] = ((0 - CPU.GPR[rb]._u32[0]) & 0x3f) < 32 ? CPU.GPR[ra]._u32[0] >> ((0 - CPU.GPR[rb]._u32[0]) & 0x3f) : 0; + CPU.GPR[rt]._u32[1] = ((0 - CPU.GPR[rb]._u32[1]) & 0x3f) < 32 ? CPU.GPR[ra]._u32[1] >> ((0 - CPU.GPR[rb]._u32[1]) & 0x3f) : 0; + CPU.GPR[rt]._u32[2] = ((0 - CPU.GPR[rb]._u32[2]) & 0x3f) < 32 ? CPU.GPR[ra]._u32[2] >> ((0 - CPU.GPR[rb]._u32[2]) & 0x3f) : 0; + CPU.GPR[rt]._u32[3] = ((0 - CPU.GPR[rb]._u32[3]) & 0x3f) < 32 ? CPU.GPR[ra]._u32[3] >> ((0 - CPU.GPR[rb]._u32[3]) & 0x3f) : 0; WRAPPER_END(rt, ra, rb, 0); } void ROTMA(u32 rt, u32 ra, u32 rb) { WRAPPER_BEGIN(rt, ra, rb, zz); - CPU.GPR[rt]._i32[0] = ((0 - CPU.GPR[rb]._i32[0]) % 64) < 32 ? CPU.GPR[ra]._i32[0] >> ((0 - CPU.GPR[rb]._i32[0]) % 64) : CPU.GPR[ra]._i32[0] >> 31; - CPU.GPR[rt]._i32[1] = ((0 - CPU.GPR[rb]._i32[1]) % 64) < 32 ? CPU.GPR[ra]._i32[1] >> ((0 - CPU.GPR[rb]._i32[1]) % 64) : CPU.GPR[ra]._i32[1] >> 31; - CPU.GPR[rt]._i32[2] = ((0 - CPU.GPR[rb]._i32[2]) % 64) < 32 ? 
CPU.GPR[ra]._i32[2] >> ((0 - CPU.GPR[rb]._i32[2]) % 64) : CPU.GPR[ra]._i32[2] >> 31; - CPU.GPR[rt]._i32[3] = ((0 - CPU.GPR[rb]._i32[3]) % 64) < 32 ? CPU.GPR[ra]._i32[3] >> ((0 - CPU.GPR[rb]._i32[3]) % 64) : CPU.GPR[ra]._i32[3] >> 31; + CPU.GPR[rt]._i32[0] = ((0 - CPU.GPR[rb]._u32[0]) & 0x3f) < 32 ? CPU.GPR[ra]._i32[0] >> ((0 - CPU.GPR[rb]._u32[0]) & 0x3f) : CPU.GPR[ra]._i32[0] >> 31; + CPU.GPR[rt]._i32[1] = ((0 - CPU.GPR[rb]._u32[1]) & 0x3f) < 32 ? CPU.GPR[ra]._i32[1] >> ((0 - CPU.GPR[rb]._u32[1]) & 0x3f) : CPU.GPR[ra]._i32[1] >> 31; + CPU.GPR[rt]._i32[2] = ((0 - CPU.GPR[rb]._u32[2]) & 0x3f) < 32 ? CPU.GPR[ra]._i32[2] >> ((0 - CPU.GPR[rb]._u32[2]) & 0x3f) : CPU.GPR[ra]._i32[2] >> 31; + CPU.GPR[rt]._i32[3] = ((0 - CPU.GPR[rb]._u32[3]) & 0x3f) < 32 ? CPU.GPR[ra]._i32[3] >> ((0 - CPU.GPR[rb]._u32[3]) & 0x3f) : CPU.GPR[ra]._i32[3] >> 31; WRAPPER_END(rt, ra, rb, 0); } void SHL(u32 rt, u32 ra, u32 rb) @@ -397,14 +437,14 @@ private: { WRAPPER_BEGIN(rt, ra, rb, zz); for (int h = 0; h < 8; h++) - CPU.GPR[rt]._u16[h] = ((0 - CPU.GPR[rb]._u16[h]) % 32) < 16 ? CPU.GPR[ra]._u16[h] >> ((0 - CPU.GPR[rb]._u16[h]) % 32) : 0; + CPU.GPR[rt]._u16[h] = ((0 - CPU.GPR[rb]._u16[h]) & 0x1f) < 16 ? CPU.GPR[ra]._u16[h] >> ((0 - CPU.GPR[rb]._u16[h]) & 0x1f) : 0; WRAPPER_END(rt, ra, rb, 0); } void ROTMAH(u32 rt, u32 ra, u32 rb) { WRAPPER_BEGIN(rt, ra, rb, zz); for (int h = 0; h < 8; h++) - CPU.GPR[rt]._i16[h] = ((0 - CPU.GPR[rb]._i16[h]) % 32) < 16 ? CPU.GPR[ra]._i16[h] >> ((0 - CPU.GPR[rb]._i16[h]) % 32) : CPU.GPR[ra]._i16[h] >> 15; + CPU.GPR[rt]._i16[h] = ((0 - CPU.GPR[rb]._u16[h]) & 0x1f) < 16 ? CPU.GPR[ra]._i16[h] >> ((0 - CPU.GPR[rb]._u16[h]) & 0x1f) : CPU.GPR[ra]._i16[h] >> 15; WRAPPER_END(rt, ra, rb, 0); } void SHLH(u32 rt, u32 ra, u32 rb) @@ -427,14 +467,14 @@ private: void ROTMI(u32 rt, u32 ra, s32 i7) { /*WRAPPER_BEGIN(rt, ra, i7, zz); - const int nRot = (0 - (s32)i7) % 64; // ??? + const int nRot = (0 - (s32)i7) & 0x3f; CPU.GPR[rt]._u32[0] = nRot < 32 ? 
CPU.GPR[ra]._u32[0] >> nRot : 0; CPU.GPR[rt]._u32[1] = nRot < 32 ? CPU.GPR[ra]._u32[1] >> nRot : 0; CPU.GPR[rt]._u32[2] = nRot < 32 ? CPU.GPR[ra]._u32[2] >> nRot : 0; CPU.GPR[rt]._u32[3] = nRot < 32 ? CPU.GPR[ra]._u32[3] >> nRot : 0; WRAPPER_END(rt, ra, i7, 0);*/ - const int nRot = (0 - i7) & 0x3f; // !!! + const int nRot = (0 - i7) & 0x3f; XmmVar v0(c); if (nRot > 31) { @@ -463,20 +503,41 @@ private: } void ROTMAI(u32 rt, u32 ra, s32 i7) { - WRAPPER_BEGIN(rt, ra, i7, zz); - const int nRot = (0 - (s32)i7) % 64; + /*WRAPPER_BEGIN(rt, ra, i7, zz); + const int nRot = (0 - (s32)i7) & 0x3f; CPU.GPR[rt]._i32[0] = nRot < 32 ? CPU.GPR[ra]._i32[0] >> nRot : CPU.GPR[ra]._i32[0] >> 31; CPU.GPR[rt]._i32[1] = nRot < 32 ? CPU.GPR[ra]._i32[1] >> nRot : CPU.GPR[ra]._i32[1] >> 31; CPU.GPR[rt]._i32[2] = nRot < 32 ? CPU.GPR[ra]._i32[2] >> nRot : CPU.GPR[ra]._i32[2] >> 31; CPU.GPR[rt]._i32[3] = nRot < 32 ? CPU.GPR[ra]._i32[3] >> nRot : CPU.GPR[ra]._i32[3] >> 31; - WRAPPER_END(rt, ra, i7, 0); + WRAPPER_END(rt, ra, i7, 0);*/ + + const int nRot = (0 - i7) & 0x3f; + XmmVar v0(c); + if (nRot == 0) + { + // mov + if (ra != rt) + { + c.movaps(v0, cpu_xmm(GPR[ra])); + c.movaps(cpu_xmm(GPR[rt]), v0); + } + // else nop + } + else + { + // shift right arithmetical + c.movdqa(v0, cpu_xmm(GPR[ra])); + c.psrad(v0, nRot); + c.movdqa(cpu_xmm(GPR[rt]), v0); + } + LOG_OPCODE(); } void SHLI(u32 rt, u32 ra, s32 i7) { /*WRAPPER_BEGIN(rt, ra, i7, zz); const u32 s = i7 & 0x3f; for (u32 j = 0; j < 4; ++j) - CPU.GPR[rt]._u32[j] = CPU.GPR[ra]._u32[j] << s; + CPU.GPR[rt]._u32[j] = (s >= 32) ? 0 : CPU.GPR[ra]._u32[j] << s; WRAPPER_END(rt, ra, i7, 0);*/ const int s = i7 & 0x3f; @@ -517,7 +578,7 @@ private: void ROTHMI(u32 rt, u32 ra, s32 i7) { WRAPPER_BEGIN(rt, ra, i7, zz); - const int nRot = (0 - (s32)i7) % 32; + const int nRot = (0 - (s32)i7) & 0x1f; for (int h = 0; h < 8; h++) CPU.GPR[rt]._u16[h] = nRot < 16 ? 
CPU.GPR[ra]._u16[h] >> nRot : 0; WRAPPER_END(rt, ra, i7, 0); @@ -525,7 +586,7 @@ private: void ROTMAHI(u32 rt, u32 ra, s32 i7) { WRAPPER_BEGIN(rt, ra, i7, zz); - const int nRot = (0 - (s32)i7) % 32; + const int nRot = (0 - (s32)i7) & 0x1f; for (int h = 0; h < 8; h++) CPU.GPR[rt]._i16[h] = nRot < 16 ? CPU.GPR[ra]._i16[h] >> nRot : CPU.GPR[ra]._i16[h] >> 15; WRAPPER_END(rt, ra, i7, 0); @@ -535,7 +596,7 @@ private: WRAPPER_BEGIN(rt, ra, i7, zz); const int nRot = i7 & 0x1f; for (int h = 0; h < 8; h++) - CPU.GPR[rt]._u16[0] = nRot > 15 ? 0 : CPU.GPR[ra]._u16[0] << nRot; + CPU.GPR[rt]._u16[h] = nRot > 15 ? 0 : CPU.GPR[ra]._u16[h] << nRot; WRAPPER_END(rt, ra, i7, 0); } void A(u32 rt, u32 ra, u32 rb) @@ -617,10 +678,15 @@ private: } void AVGB(u32 rt, u32 ra, u32 rb) { - WRAPPER_BEGIN(rt, ra, rb, zz); + /*WRAPPER_BEGIN(rt, ra, rb, zz); for (int b = 0; b < 16; b++) CPU.GPR[rt]._u8[b] = (CPU.GPR[ra]._u8[b] + CPU.GPR[rb]._u8[b] + 1) >> 1; - WRAPPER_END(rt, ra, rb, 0); + WRAPPER_END(rt, ra, rb, 0);*/ + + XmmVar v0(c); + c.movdqa(v0, cpu_xmm(GPR[ra])); + c.pavgb(v0, cpu_xmm(GPR[rb])); + c.movdqa(cpu_xmm(GPR[rt]), v0); } void MTSPR(u32 rt, u32 sa) { @@ -1245,10 +1311,23 @@ private: } void CGT(u32 rt, u32 ra, u32 rb) { - WRAPPER_BEGIN(rt, ra, rb, zz); + /*WRAPPER_BEGIN(rt, ra, rb, zz); for (int w = 0; w < 4; w++) CPU.GPR[rt]._u32[w] = CPU.GPR[ra]._i32[w] > CPU.GPR[rb]._i32[w] ? 
0xffffffff : 0; - WRAPPER_END(rt, ra, rb, 0); + WRAPPER_END(rt, ra, rb, 0);*/ + + XmmVar v0(c); + if (ra == rb) + { + c.xorps(v0, v0); + c.movaps(cpu_xmm(GPR[rt]), v0); + } + else + { + c.movdqa(v0, cpu_xmm(GPR[ra])); + c.pcmpgtd(v0, cpu_xmm(GPR[rb])); + c.movdqa(cpu_xmm(GPR[rt]), v0); + } } void XOR(u32 rt, u32 ra, u32 rb) { @@ -1919,7 +1998,7 @@ private: { c.mulps(v0, imm_xmm(scale_to_int[i8 & 0xff])); // scale } - c.cvtps2dq(v0, v0); // convert to ints + c.cvttps2dq(v0, v0); // convert to ints with truncation c.movdqa(cpu_xmm(GPR[rt]), v0); LOG_OPCODE(); } @@ -2396,7 +2475,7 @@ private: { // zero c.xorps(v0, v0); - c.movaps(v0, cpu_xmm(GPR[ra])); + c.movaps(cpu_xmm(GPR[rt]), v0); } else if (i10 == -1) { @@ -2738,7 +2817,13 @@ private: } void SHUFB(u32 rt, u32 ra, u32 rb, u32 rc) { - WRAPPER_BEGIN(rt, ra, rb, rc); + /*WRAPPER_BEGIN(ra, rb, rc, zz); + ConLog.Write("SHUFB: input ra=%d, value=0x%016llx%016llx", ra, CPU.GPR[ra]._u64[1], CPU.GPR[ra]._u64[0]); + ConLog.Write("SHUFB: input rb=%d, value=0x%016llx%016llx", rb, CPU.GPR[rb]._u64[1], CPU.GPR[rb]._u64[0]); + ConLog.Write("SHUFB: input rc=%d, value=0x%016llx%016llx", rc, CPU.GPR[rc]._u64[1], CPU.GPR[rc]._u64[0]); + WRAPPER_END(ra, rb, rc, 0);*/ + + WRAPPER_BEGIN(rc, rt, ra, rb); const SPU_GPR_hdr _a = CPU.GPR[ra]; const SPU_GPR_hdr _b = CPU.GPR[rb]; for (int i = 0; i < 16; i++) @@ -2764,8 +2849,44 @@ private: CPU.GPR[rt]._u8[i] = _a._u8[15 - (b & 0x0F)]; } } - WRAPPER_END(rt, ra, rb, rc); - // TODO + WRAPPER_END(rc, rt, ra, rb); + + /*XmmVar v0(c), v1(c), v2(c), v3(c), v4(c), vFF(c); + c.movdqa(v0, cpu_xmm(GPR[rc])); // v0 = mask + // generate specific values: + c.movdqa(v1, imm_xmm(u8_to_u8[0xe0])); // v1 = 11100000 + c.movdqa(v2, v0); // copy mask v2 = mask + c.movdqa(v3, imm_xmm(u8_to_u8[0x80])); // v3 = 10000000 + c.pand(v2, v1); // filter mask v2 = mask & 11100000 + c.movdqa(vFF, v2); // load filtered mask vFF = mask & 11100000 + c.movdqa(v4, imm_xmm(u8_to_u8[0xc0])); // v4 = 11000000 + c.pcmpeqb(vFF, 
v4); // gen 0xff values vFF = (mask & 11100000 == 11000000) ? 0xff : 0 + c.movdqa(v4, v2); // load filtered mask v4 = mask & 11100000 + c.pand(v4, v3); // filter mask again v4 = mask & 10000000 + c.pcmpeqb(v2, v1); // v2 = (mask & 11100000 == 11100000) ? 0xff : 0 + c.pcmpeqb(v4, v3); // v4 = (mask & 10000000 == 10000000) ? 0xff : 0 + c.pand(v2, v3); // generate 0x80 values v2 = (mask & 11100000 == 11100000) ? 0x80 : 0 + c.por(vFF, v2); // merge 0xff and 0x80 vFF = (mask & 11100000 == 11000000) ? 0xff : (mask & 11100000 == 11100000) ? 0x80 : 0 + c.pandn(v1, v0); // filter mask v1 = mask & 00011111 + // select bytes from [rb]: + c.movdqa(v2, imm_xmm(u8_to_u8[15])); // v2 = 00001111 + c.pxor(v1, imm_xmm(u8_to_u8[0x10])); // v1 = (mask & 00011111) ^ 00010000 + c.psubb(v2, v1); // v2 = 00001111 - ((mask & 00011111) ^ 00010000) + c.movdqa(v1, cpu_xmm(GPR[rb])); // v1 = rb + c.pshufb(v1, v2); // v1 = select(rb, 00001111 - ((mask & 00011111) ^ 00010000)) + // select bytes from [ra]: + c.pxor(v2, imm_xmm(u8_to_u8[0xf0])); // v2 = (00001111 - ((mask & 00011111) ^ 00010000)) ^ 11110000 + c.movdqa(v3, cpu_xmm(GPR[ra])); // v3 = ra + c.pshufb(v3, v2); // v3 = select(ra, (00001111 - ((mask & 00011111) ^ 00010000)) ^ 11110000) + c.por(v1, v3); // v1 = select(rb, 00001111 - ((mask & 00011111) ^ 00010000)) | (v3) + c.pandn(v4, v1); // filter result v4 = v1 & ((mask & 10000000 == 10000000) ? 0 : 0xff) + c.por(vFF, v4); // final merge vFF = (mask & 10000000 == 10000000) ? ((mask & 11100000 == 11000000) ? 0xff : (mask & 11100000 == 11100000) ? 
0x80 : 0) : (v1) + c.movdqa(cpu_xmm(GPR[rt]), vFF); + LOG_OPCODE();*/ + + /*WRAPPER_BEGIN(rt, xx, yy, zz); + //ConLog.Write("SHUFB: output=%d, value=0x%016llx%016llx", rt, CPU.GPR[rt]._u64[1], CPU.GPR[rt]._u64[0]); + WRAPPER_END(rt, 0, 0, 0);*/ } void MPYA(u32 rt, u32 ra, u32 rb, u32 rc) { diff --git a/rpcs3/Emu/Io/PadHandler.h b/rpcs3/Emu/Io/PadHandler.h index d07c6026b7..4f97edab75 100644 --- a/rpcs3/Emu/Io/PadHandler.h +++ b/rpcs3/Emu/Io/PadHandler.h @@ -202,6 +202,7 @@ protected: std::vector m_pads; public: + virtual ~PadHandlerBase() = default; virtual void Init(const u32 max_connect)=0; virtual void Close()=0; diff --git a/rpcs3/Emu/SysCalls/Modules/cellSysutil.cpp b/rpcs3/Emu/SysCalls/Modules/cellSysutil.cpp index 87b5c68e01..e8e92d9fb4 100644 --- a/rpcs3/Emu/SysCalls/Modules/cellSysutil.cpp +++ b/rpcs3/Emu/SysCalls/Modules/cellSysutil.cpp @@ -532,7 +532,7 @@ int cellAudioOutGetSoundAvailability(u32 audioOut, u32 type, u32 fs, u32 option) option = 0; - int available = 8; // should be at least 2 + int available = 2; // should be at least 2 switch(fs) { @@ -573,7 +573,7 @@ int cellAudioOutGetSoundAvailability2(u32 audioOut, u32 type, u32 fs, u32 ch, u3 option = 0; - int available = 8; // should be at least 2 + int available = 2; // should be at least 2 switch(fs) { diff --git a/rpcs3/Emu/SysCalls/lv2/SC_SPU_Thread.cpp b/rpcs3/Emu/SysCalls/lv2/SC_SPU_Thread.cpp index ba5c8023d8..535a01fcab 100644 --- a/rpcs3/Emu/SysCalls/lv2/SC_SPU_Thread.cpp +++ b/rpcs3/Emu/SysCalls/lv2/SC_SPU_Thread.cpp @@ -397,7 +397,7 @@ int sys_spu_initialize(u32 max_usable_spu, u32 max_raw_spu) //181 int sys_spu_thread_write_ls(u32 id, u32 address, u64 value, u32 type) { - sc_spu.Warning("sys_spu_thread_write_ls(id=%d, address=0x%x, value=0x%llx, type=0x%x)", + sc_spu.Log("sys_spu_thread_write_ls(id=%d, address=0x%x, value=0x%llx, type=0x%x)", id, address, value, type); CPUThread* thr = Emu.GetCPU().GetThread(id); @@ -430,7 +430,7 @@ int sys_spu_thread_write_ls(u32 id, u32 address, u64 
value, u32 type) //182 int sys_spu_thread_read_ls(u32 id, u32 address, mem64_t value, u32 type) { - sc_spu.Warning("sys_spu_thread_read_ls(id=%d, address=0x%x, value_addr=0x%x, type=0x%x)", + sc_spu.Log("sys_spu_thread_read_ls(id=%d, address=0x%x, value_addr=0x%x, type=0x%x)", id, address, value.GetAddr(), type); CPUThread* thr = Emu.GetCPU().GetThread(id); diff --git a/rpcs3/rpcs3.vcxproj.filters b/rpcs3/rpcs3.vcxproj.filters index 8776fa3b4a..8574f6f5e4 100644 --- a/rpcs3/rpcs3.vcxproj.filters +++ b/rpcs3/rpcs3.vcxproj.filters @@ -1,4 +1,4 @@ - + @@ -982,7 +982,7 @@ Emu\SysCalls\Modules - Include + Emu\Cell \ No newline at end of file From c41317dd82b68b7cf35b07186c4337b29a691a20 Mon Sep 17 00:00:00 2001 From: Nekotekina Date: Mon, 14 Apr 2014 13:42:55 +0400 Subject: [PATCH 07/14] New bugs added --- rpcs3/Emu/Cell/SPURecompiler.h | 962 ++++++++++++++++++--------- rpcs3/Emu/Cell/SPURecompilerCore.cpp | 15 +- 2 files changed, 647 insertions(+), 330 deletions(-) diff --git a/rpcs3/Emu/Cell/SPURecompiler.h b/rpcs3/Emu/Cell/SPURecompiler.h index a5b77f7cdd..6aeaa8810c 100644 --- a/rpcs3/Emu/Cell/SPURecompiler.h +++ b/rpcs3/Emu/Cell/SPURecompiler.h @@ -140,7 +140,11 @@ public: #define WRAPPER_END(a0, a1, a2, a3) /*LOG2_OPCODE();*/ } \ }; \ c.mov(cpu_qword(PC), (u32)CPU.PC); \ - X86X64CallNode* call##a0 = c.call(imm_ptr(&opwr_##a0::opcode), kFuncConvHost, FuncBuilder4()); \ + if (#a0[0] == 'r') XmmInvalidate(a0); \ + if (#a1[0] == 'r') XmmInvalidate(a1); \ + if (#a2[0] == 'r') XmmInvalidate(a2); \ + if (#a3[0] == 'r') XmmInvalidate(a3); \ + X86X64CallNode* call##a0 = c.call(imm_ptr(&opwr_##a0::opcode), kFuncConvHost, FuncBuilder4()); \ call##a0->setArg(0, imm_u(a0)); \ call##a0->setArg(1, imm_u(a1)); \ call##a0->setArg(2, imm_u(a2)); \ @@ -162,6 +166,28 @@ public: GpVar* imm_var; GpVar* pos_var; + struct XmmLink + { + XmmVar* data; + s8 reg; + bool taken; + mutable bool got; + + XmmLink() + : data(nullptr) + , reg(-1) + , taken(false) + { + } + + const XmmVar& get() 
const + { + assert(data); + got = true; + return *data; + } + } xmm_var[16]; + SPURecompiler(SPUThread& cpu, SPURecompilerCore& rec) : CPU(cpu) , rec(rec) @@ -169,6 +195,131 @@ public: { } + const XmmLink& XmmAlloc() // get empty xmm register + { + for (u32 i = 15; ~i; i--) + { + if ((xmm_var[i].reg == -1) && !xmm_var[i].taken) + { + xmm_var[i].taken = true; + xmm_var[i].got = false; + return xmm_var[i]; + } + } + for (u32 i = 0; i < 16; i++) + { + if (!xmm_var[i].taken) + { + c.movaps(cpu_xmm(GPR[xmm_var[i].reg]), *xmm_var[i].data); + xmm_var[i].taken = true; + xmm_var[i].got = false; + return xmm_var[i]; + } + } + assert(false); + return *(XmmLink*)nullptr; + } + + const XmmLink& XmmGet(s8 reg) // get xmm register with specific SPU reg + { + assert(reg >= 0); + XmmLink* res = nullptr; + for (u32 i = 0; i < 16; i++) + { + if (xmm_var[i].reg == reg) + { + res = &xmm_var[i]; + if (xmm_var[i].taken) throw "XmmGet(): xmm_var is taken"; + xmm_var[i].taken = true; + xmm_var[i].got = false; + for (u32 j = i + 1; j < 16; j++) + { + if (xmm_var[j].reg == reg) throw "XmmGet(): xmm_var duplicate"; + } + break; + } + } + if (!res) + { + res = &(XmmLink&)XmmAlloc(); + c.movaps(*res->data, cpu_xmm(GPR[reg])); + } + return *res; + } + + const XmmLink& XmmCopy(const XmmLink& from) // XmmAlloc + mov + { + const XmmLink& res = XmmAlloc(); + c.movaps(*res.data, *from.data); + return res; + } + + const XmmLink* XmmRead(const s8 reg) const // get xmm register with specific SPU reg or nullptr + { + assert(reg >= 0); + for (u32 i = 0; i < 16; i++) + { + if (xmm_var[i].reg == reg) + { + if (xmm_var[i].got && xmm_var[i].taken) throw "XmmRead(): wrong reuse"; + return &xmm_var[i]; + } + } + return nullptr; + } + + void XmmInvalidate(const s8 reg) // invalidate cached register + { + assert(reg >= 0); + for (u32 i = 0; i < 16; i++) + { + if (xmm_var[i].reg == reg) + { + if (xmm_var[i].taken) throw "XmmInvalidate(): xmm_var is taken"; + xmm_var[i].reg = -1; + } + } + } + + void 
XmmFinalize(const XmmLink& var, s8 reg = -1) + { + // invalidation + for (u32 i = 0; i < 16; i++) + { + if (xmm_var[i].reg == reg) + { + xmm_var[i].reg = -1; + } + } + for (u32 i = 0; i < 16; i++) + { + if (xmm_var[i].data == var.data) + { + assert(xmm_var[i].taken); + // save immediately: + if (reg >= 0) c.movaps(cpu_xmm(GPR[reg]), *xmm_var[i].data); + // (to disable caching:) + //reg = -1; + xmm_var[i].reg = reg; + xmm_var[i].taken = false; + return; + } + } + assert(false); + } + + void XmmRelease() + { + for (u32 i = 0; i < 16; i++) + { + if (xmm_var[i].reg >= 0) + { + //c.movaps(cpu_xmm(GPR[xmm_var[i].reg]), *xmm_var[i].data); + xmm_var[i].reg = -1; + } + } + } + private: //0 - 10 void STOP(u32 code) @@ -183,7 +334,7 @@ private: } }; c.mov(cpu_qword(PC), (u32)CPU.PC); - X86X64CallNode* call = c.call(imm_ptr(&STOP_wrapper::STOP), kFuncConvHost, FuncBuilder1()); + X86X64CallNode* call = c.call(imm_ptr(&STOP_wrapper::STOP), kFuncConvHost, FuncBuilder1()); call->setArg(0, imm_u(code)); c.mov(*pos_var, (CPU.PC >> 2) + 1); do_finalize = true; @@ -248,19 +399,26 @@ private: CPU.GPR[rt]._u32[3] = CPU.GPR[rb]._u32[3] - CPU.GPR[ra]._u32[3]; WRAPPER_END(rt, ra, rb, 0);*/ - XmmVar v0(c); if (ra == rb) { // zero - c.xorps(v0, v0); - c.movaps(cpu_xmm(GPR[rt]), v0); + const XmmLink& v0 = XmmAlloc(); + c.xorps(v0.get(), v0.get()); + XmmFinalize(v0, rt); } else { // sub from - c.movdqa(v0, cpu_xmm(GPR[rb])); - c.psubd(v0, cpu_xmm(GPR[ra])); - c.movdqa(cpu_xmm(GPR[rt]), v0); + const XmmLink& vb = XmmGet(rb); + if (const XmmLink* va = XmmRead(ra)) + { + c.psubd(vb.get(), va->get()); + } + else + { + c.psubd(vb.get(), cpu_xmm(GPR[ra])); + } + XmmFinalize(vb, rt); } LOG_OPCODE(); } @@ -273,23 +431,29 @@ private: CPU.GPR[rt]._u32[3] = CPU.GPR[ra]._u32[3] | CPU.GPR[rb]._u32[3]; WRAPPER_END(rt, ra, rb, 0);*/ - XmmVar v0(c); if (ra == rb) { // mov if (ra != rt) { - c.movaps(v0, cpu_xmm(GPR[ra])); - c.movaps(cpu_xmm(GPR[rt]), v0); + const XmmLink& va = XmmGet(ra); + XmmFinalize(va, 
rt); } // else nop } else { // or - c.movaps(v0, cpu_xmm(GPR[ra])); - c.orps(v0, cpu_xmm(GPR[rb])); - c.movaps(cpu_xmm(GPR[rt]), v0); + const XmmLink& vb = XmmGet(rb); + if (const XmmLink* va = XmmRead(ra)) + { + c.orps(vb.get(), va->get()); + } + else + { + c.orps(vb.get(), cpu_xmm(GPR[ra])); + } + XmmFinalize(vb, rt); } LOG_OPCODE(); } @@ -337,35 +501,43 @@ private: CPU.GPR[rt]._u16[h] = CPU.GPR[rb]._u16[h] - CPU.GPR[ra]._u16[h]; WRAPPER_END(rt, ra, rb, 0);*/ - XmmVar v0(c); if (ra == rb) { // zero - c.xorps(v0, v0); - c.movaps(cpu_xmm(GPR[rt]), v0); + const XmmLink& v0 = XmmAlloc(); + c.xorps(v0.get(), v0.get()); + XmmFinalize(v0, rt); } else { - c.movaps(v0, cpu_xmm(GPR[rb])); - c.psubw(v0, cpu_xmm(GPR[ra])); - c.movaps(cpu_xmm(GPR[rt]), v0); + const XmmLink& vb = XmmGet(rb); + if (const XmmLink* va = XmmRead(ra)) + { + c.psubw(vb.get(), va->get()); + } + else + { + c.psubw(vb.get(), cpu_xmm(GPR[ra])); + } + XmmFinalize(vb, rt); } + LOG_OPCODE(); } void NOR(u32 rt, u32 ra, u32 rb) { - /*WRAPPER_BEGIN(rt, ra, rb, zz); + WRAPPER_BEGIN(rt, ra, rb, zz); CPU.GPR[rt]._u32[0] = ~(CPU.GPR[ra]._u32[0] | CPU.GPR[rb]._u32[0]); CPU.GPR[rt]._u32[1] = ~(CPU.GPR[ra]._u32[1] | CPU.GPR[rb]._u32[1]); CPU.GPR[rt]._u32[2] = ~(CPU.GPR[ra]._u32[2] | CPU.GPR[rb]._u32[2]); CPU.GPR[rt]._u32[3] = ~(CPU.GPR[ra]._u32[3] | CPU.GPR[rb]._u32[3]); - WRAPPER_END(rt, ra, rb, 0);*/ + WRAPPER_END(rt, ra, rb, 0); - XmmVar v0(c); + /*XmmVar v0(c); c.movaps(v0, cpu_xmm(GPR[ra])); if (ra != rb) c.orps(v0, cpu_xmm(GPR[rb])); c.xorps(v0, imm_xmm(s19_to_s32[0x7ffff])); c.movaps(cpu_xmm(GPR[rt]), v0); - LOG_OPCODE(); + LOG_OPCODE();*/ } void ABSDB(u32 rt, u32 ra, u32 rb) { @@ -475,29 +647,29 @@ private: WRAPPER_END(rt, ra, i7, 0);*/ const int nRot = (0 - i7) & 0x3f; - XmmVar v0(c); if (nRot > 31) { // zero - c.xorps(v0, v0); - c.movaps(cpu_xmm(GPR[rt]), v0); + const XmmLink& v0 = XmmAlloc(); + c.xorps(v0.get(), v0.get()); + XmmFinalize(v0, rt); } else if (nRot == 0) { // mov if (ra != rt) { - 
c.movaps(v0, cpu_xmm(GPR[ra])); - c.movaps(cpu_xmm(GPR[rt]), v0); + const XmmLink& va = XmmGet(ra); + XmmFinalize(va, rt); } // else nop } else { // shift right logical - c.movdqa(v0, cpu_xmm(GPR[ra])); - c.psrld(v0, nRot); - c.movdqa(cpu_xmm(GPR[rt]), v0); + const XmmLink& va = XmmGet(ra); + c.psrld(va.get(), nRot); + XmmFinalize(va, rt); } LOG_OPCODE(); } @@ -512,23 +684,22 @@ private: WRAPPER_END(rt, ra, i7, 0);*/ const int nRot = (0 - i7) & 0x3f; - XmmVar v0(c); if (nRot == 0) { // mov if (ra != rt) { - c.movaps(v0, cpu_xmm(GPR[ra])); - c.movaps(cpu_xmm(GPR[rt]), v0); + const XmmLink& va = XmmGet(ra); + XmmFinalize(va, rt); } // else nop } else { // shift right arithmetical - c.movdqa(v0, cpu_xmm(GPR[ra])); - c.psrad(v0, nRot); - c.movdqa(cpu_xmm(GPR[rt]), v0); + const XmmLink& va = XmmGet(ra); + c.psrad(va.get(), nRot); + XmmFinalize(va, rt); } LOG_OPCODE(); } @@ -541,29 +712,29 @@ private: WRAPPER_END(rt, ra, i7, 0);*/ const int s = i7 & 0x3f; - XmmVar v0(c); if (s > 31) { // zero - c.xorps(v0, v0); - c.movaps(cpu_xmm(GPR[rt]), v0); + const XmmLink& v0 = XmmAlloc(); + c.xorps(v0.get(), v0.get()); + XmmFinalize(v0, rt); } else if (s == 0) { // mov if (ra != rt) { - c.movaps(v0, cpu_xmm(GPR[ra])); - c.movaps(cpu_xmm(GPR[rt]), v0); + const XmmLink& va = XmmGet(ra); + XmmFinalize(va, rt); } // else nop } else { // shift left - c.movdqa(v0, cpu_xmm(GPR[ra])); - c.pslld(v0, s); - c.movdqa(cpu_xmm(GPR[rt]), v0); + const XmmLink& va = XmmGet(ra); + c.pslld(va.get(), s); + XmmFinalize(va, rt); } LOG_OPCODE(); } @@ -601,25 +772,30 @@ private: } void A(u32 rt, u32 ra, u32 rb) { - /*WRAPPER_BEGIN(rt, ra, rb, zz); + WRAPPER_BEGIN(rt, ra, rb, zz); CPU.GPR[rt]._u32[0] = CPU.GPR[ra]._u32[0] + CPU.GPR[rb]._u32[0]; CPU.GPR[rt]._u32[1] = CPU.GPR[ra]._u32[1] + CPU.GPR[rb]._u32[1]; CPU.GPR[rt]._u32[2] = CPU.GPR[ra]._u32[2] + CPU.GPR[rb]._u32[2]; CPU.GPR[rt]._u32[3] = CPU.GPR[ra]._u32[3] + CPU.GPR[rb]._u32[3]; - WRAPPER_END(rt, ra, rb, 0);*/ + WRAPPER_END(rt, ra, rb, 0); - XmmVar 
v0(c); - c.movdqa(v0, cpu_xmm(GPR[ra])); + // !!! + + /*const XmmLink& vb = XmmGet(rb); if (ra == rb) { - c.paddd(v0, v0); + c.paddd(vb.get(), vb.get()); + } + else if (const XmmLink* va = XmmRead(ra)) + { + c.paddd(vb.get(), va->get()); } else { - c.paddd(v0, cpu_xmm(GPR[rb])); + c.paddd(vb.get(), cpu_xmm(GPR[ra])); } - c.movdqa(cpu_xmm(GPR[rt]), v0); - LOG_OPCODE(); + XmmFinalize(vb, rt); + LOG_OPCODE();*/ } void AND(u32 rt, u32 ra, u32 rb) { @@ -630,23 +806,29 @@ private: CPU.GPR[rt]._u32[3] = CPU.GPR[ra]._u32[3] & CPU.GPR[rb]._u32[3]; WRAPPER_END(rt, ra, rb, 0);*/ - XmmVar v0(c); if (ra == rb) { if (rt != ra) { // mov - c.movaps(v0, cpu_xmm(GPR[ra])); - c.movaps(cpu_xmm(GPR[rt]), v0); + const XmmLink& v0 = XmmGet(ra); + XmmFinalize(v0, rt); } // else nop } else { // and - c.movaps(v0, cpu_xmm(GPR[ra])); - c.andps(v0, cpu_xmm(GPR[rb])); - c.movaps(cpu_xmm(GPR[rt]), v0); + const XmmLink& vb = XmmGet(rb); + if (const XmmLink* va = XmmRead(ra)) + { + c.andps(vb.get(), va->get()); + } + else + { + c.andps(vb.get(), cpu_xmm(GPR[ra])); + } + XmmFinalize(vb, rt); } LOG_OPCODE(); } @@ -683,10 +865,16 @@ private: CPU.GPR[rt]._u8[b] = (CPU.GPR[ra]._u8[b] + CPU.GPR[rb]._u8[b] + 1) >> 1; WRAPPER_END(rt, ra, rb, 0);*/ - XmmVar v0(c); - c.movdqa(v0, cpu_xmm(GPR[ra])); - c.pavgb(v0, cpu_xmm(GPR[rb])); - c.movdqa(cpu_xmm(GPR[rt]), v0); + const XmmLink& vb = XmmGet(rb); + if (const XmmLink* va = XmmRead(ra)) + { + c.pavgb(vb.get(), va->get()); + } + else + { + c.pavgb(vb.get(), cpu_xmm(GPR[ra])); + } + XmmFinalize(vb, rt); } void MTSPR(u32 rt, u32 sa) { @@ -729,7 +917,7 @@ private: default: { - X86X64CallNode* call = c.call(imm_ptr(&WRCH_wrapper::WRCH), kFuncConvHost, FuncBuilder2()); + X86X64CallNode* call = c.call(imm_ptr(&WRCH_wrapper::WRCH), kFuncConvHost, FuncBuilder2()); call->setArg(0, imm_u(ra)); call->setArg(1, v); } @@ -928,9 +1116,16 @@ private: CPU.GPR[rt]._f[i] = 1 / CPU.GPR[ra]._f[i]; WRAPPER_END(rt, ra, 0, 0);*/ - XmmVar v0(c); - c.rcpps(v0, cpu_xmm(GPR[ra])); - 
c.movaps(cpu_xmm(GPR[rt]), v0); + const XmmLink& vr = XmmAlloc(); + if (const XmmLink* va = XmmRead(ra)) + { + c.rcpps(vr.get(), va->get()); + } + else + { + c.rcpps(vr.get(), cpu_xmm(GPR[ra])); + } + XmmFinalize(vr, rt); LOG_OPCODE(); } void FRSQEST(u32 rt, u32 ra) @@ -940,11 +1135,10 @@ private: CPU.GPR[rt]._f[i] = 1 / sqrt(abs(CPU.GPR[ra]._f[i])); WRAPPER_END(rt, ra, 0, 0);*/ - XmmVar v0(c); - c.movaps(v0, cpu_xmm(GPR[ra])); - c.andps(v0, imm_xmm(max_int)); // abs - c.rsqrtps(v0, v0); - c.movaps(cpu_xmm(GPR[rt]), v0); + const XmmLink& vr = XmmGet(ra); + c.andps(vr.get(), imm_xmm(max_int)); // abs + c.rsqrtps(vr.get(), vr.get()); + XmmFinalize(vr, rt); LOG_OPCODE(); } void LQX(u32 rt, u32 ra, u32 rb) @@ -1001,7 +1195,7 @@ private: const SPU_GPR_hdr temp = CPU.GPR[ra]; CPU.GPR[rt].Reset(); for (int b = 0; b < 16 - s; b++) - CPU.GPR[rt]._u8[b] = temp._u8[b + s]; + CPU.GPR[rt]._u8[b] = temp._u8[(b + s) & 0xf]; WRAPPER_END(rt, ra, rb, 0); } void SHLQBYBI(u32 rt, u32 ra, u32 rb) @@ -1103,7 +1297,7 @@ private: const SPU_GPR_hdr temp = CPU.GPR[ra]; CPU.GPR[rt].Reset(); for (int b = 0; b < 16 - s; b++) - CPU.GPR[rt]._u8[b] = temp._u8[b + s]; + CPU.GPR[rt]._u8[b] = temp._u8[(b + s) & 0xf]; WRAPPER_END(rt, ra, rb, 0); } void SHLQBY(u32 rt, u32 ra, u32 rb) @@ -1208,26 +1402,25 @@ private: WRAPPER_END(rt, ra, i7, 0);*/ const int s = i7 & 0xf; - XmmVar v0(c); if (s == 0) { // mov if (ra != rt) { - c.movaps(v0, cpu_xmm(GPR[ra])); - c.movaps(cpu_xmm(GPR[rt]), v0); + const XmmLink& v0 = XmmGet(ra); + XmmFinalize(v0, rt); } // else nop } else { - XmmVar v1(c); - c.movdqa(v0, cpu_xmm(GPR[ra])); - c.movdqa(v1, v0); - c.pslldq(v0, s); - c.psrldq(v1, 16 - s); - c.por(v0, v1); - c.movdqa(cpu_xmm(GPR[rt]), v0); + const XmmLink& va = XmmGet(ra); + const XmmLink& v1 = XmmCopy(va); + c.pslldq(va.get(), s); + c.psrldq(v1.get(), 16 - s); + c.por(va.get(), v1.get()); + XmmFinalize(va, rt); + XmmFinalize(v1); } LOG_OPCODE(); } @@ -1242,29 +1435,29 @@ private: WRAPPER_END(rt, ra, i7, 0);*/ 
const int s = (0 - i7) & 0x1f; - XmmVar v0(c); if (s == 0) { if (ra != rt) { // mov - c.movaps(v0, cpu_xmm(GPR[ra])); - c.movaps(cpu_xmm(GPR[rt]), v0); + const XmmLink& v0 = XmmGet(ra); + XmmFinalize(v0, rt); } // else nop } else if (s > 15) { // zero - c.xorps(v0, v0); - c.movaps(cpu_xmm(GPR[rt]), v0); + const XmmLink& v0 = XmmAlloc(); + c.xorps(v0.get(), v0.get()); + XmmFinalize(v0, rt); } else { // shift right - c.movdqa(v0, cpu_xmm(GPR[ra])); - c.psrldq(v0, s); - c.movdqa(cpu_xmm(GPR[rt]), v0); + const XmmLink& va = XmmGet(ra); + c.psrldq(va.get(), s); + XmmFinalize(va, rt); } LOG_OPCODE(); } @@ -1277,31 +1470,31 @@ private: for (int b = s; b < 16; b++) CPU.GPR[rt]._u8[b] = temp._u8[b - s]; WRAPPER_END(rt, ra, i7, 0);*/ - + const int s = i7 & 0x1f; - XmmVar v0(c); if (s == 0) { if (ra != rt) { // mov - c.movaps(v0, cpu_xmm(GPR[ra])); - c.movaps(cpu_xmm(GPR[rt]), v0); + const XmmLink& va = XmmGet(ra); + XmmFinalize(va, rt); } // else nop } else if (s > 15) { // zero - c.xorps(v0, v0); - c.movaps(cpu_xmm(GPR[rt]), v0); + const XmmLink& v0 = XmmAlloc(); + c.xorps(v0.get(), v0.get()); + XmmFinalize(v0, rt); } else { // shift left - c.movdqa(v0, cpu_xmm(GPR[ra])); - c.pslldq(v0, s); - c.movdqa(cpu_xmm(GPR[rt]), v0); + const XmmLink& va = XmmGet(ra); + c.pslldq(va.get(), s); + XmmFinalize(va, rt); } LOG_OPCODE(); } @@ -1316,18 +1509,27 @@ private: CPU.GPR[rt]._u32[w] = CPU.GPR[ra]._i32[w] > CPU.GPR[rb]._i32[w] ? 
0xffffffff : 0; WRAPPER_END(rt, ra, rb, 0);*/ - XmmVar v0(c); if (ra == rb) { - c.xorps(v0, v0); - c.movaps(cpu_xmm(GPR[rt]), v0); + // zero + const XmmLink& v0 = XmmAlloc(); + c.xorps(v0.get(), v0.get()); + XmmFinalize(v0, rt); } else { - c.movdqa(v0, cpu_xmm(GPR[ra])); - c.pcmpgtd(v0, cpu_xmm(GPR[rb])); - c.movdqa(cpu_xmm(GPR[rt]), v0); + const XmmLink& va = XmmGet(ra); + if (const XmmLink* vb = XmmRead(rb)) + { + c.pcmpgtd(va.get(), vb->get()); + } + else + { + c.pcmpgtd(va.get(), cpu_xmm(GPR[rb])); + } + XmmFinalize(va, rt); } + LOG_OPCODE(); } void XOR(u32 rt, u32 ra, u32 rb) { @@ -1336,19 +1538,26 @@ private: CPU.GPR[rt]._u32[w] = CPU.GPR[ra]._u32[w] ^ CPU.GPR[rb]._u32[w]; WRAPPER_END(rt, ra, rb, 0);*/ - XmmVar v0(c); if (ra == rb) { // zero - c.xorps(v0, v0); - c.movaps(cpu_xmm(GPR[rt]), v0); + const XmmLink& v0 = XmmAlloc(); + c.xorps(v0.get(), v0.get()); + XmmFinalize(v0, rt); } else { // xor - c.movaps(v0, cpu_xmm(GPR[ra])); - c.xorps(v0, cpu_xmm(GPR[rb])); - c.movaps(cpu_xmm(GPR[rt]), v0); + const XmmLink& va = XmmGet(ra); + if (const XmmLink* vb = XmmRead(rb)) + { + c.xorps(va.get(), vb->get()); + } + else + { + c.xorps(va.get(), cpu_xmm(GPR[rb])); + } + XmmFinalize(va, rt); } LOG_OPCODE(); } @@ -1453,8 +1662,9 @@ private: if (ra == rb) { // zero - c.xorps(v0, v0); - c.movaps(cpu_xmm(GPR[rt]), v0); + const XmmLink& v0 = XmmAlloc(); + c.xorps(v0.get(), v0.get()); + XmmFinalize(v0, rt); } else { @@ -1463,8 +1673,7 @@ private: // (not implemented) c.movdqa(cpu_xmm(GPR[rt]), v0); } - LOG_OPCODE(); - */ + LOG_OPCODE();*/ } void ANDC(u32 rt, u32 ra, u32 rb) { @@ -1473,19 +1682,26 @@ private: CPU.GPR[rt]._u32[w] = CPU.GPR[ra]._u32[w] & (~CPU.GPR[rb]._u32[w]); WRAPPER_END(rt, ra, rb, 0);*/ - XmmVar v0(c); if (ra == rb) { // zero - c.xorps(v0, v0); - c.movaps(cpu_xmm(GPR[rt]), v0); + const XmmLink& v0 = XmmAlloc(); + c.xorps(v0.get(), v0.get()); + XmmFinalize(v0, rt); } else { // and not - c.movaps(v0, cpu_xmm(GPR[rb])); - c.andnps(v0, cpu_xmm(GPR[ra])); - 
c.movaps(cpu_xmm(GPR[rt]), v0); + const XmmLink& vb = XmmGet(rb); + if (const XmmLink* va = XmmRead(ra)) + { + c.andnps(vb.get(), va->get()); + } + else + { + c.andnps(vb.get(), cpu_xmm(GPR[ra])); + } + XmmFinalize(vb, rt); } LOG_OPCODE(); } @@ -1498,19 +1714,26 @@ private: CPU.GPR[rt]._u32[3] = CPU.GPR[ra]._f[3] > CPU.GPR[rb]._f[3] ? 0xffffffff : 0; WRAPPER_END(rt, ra, rb, 0);*/ - XmmVar v0(c); if (ra == rb) { // zero - c.xorps(v0, v0); - c.movaps(cpu_xmm(GPR[rt]), v0); + const XmmLink& v0 = XmmAlloc(); + c.xorps(v0.get(), v0.get()); + XmmFinalize(v0, rt); } else { // not-less-or-equal - c.movaps(v0, cpu_xmm(GPR[ra])); - c.cmpps(v0, cpu_xmm(GPR[rb]), 6); - c.movaps(cpu_xmm(GPR[rt]), v0); + const XmmLink& va = XmmGet(ra); + if (const XmmLink* vb = XmmRead(rb)) + { + c.cmpps(va.get(), vb->get(), 6); + } + else + { + c.cmpps(va.get(), cpu_xmm(GPR[rb]), 6); + } + XmmFinalize(va, rt); } LOG_OPCODE(); } @@ -1530,17 +1753,16 @@ private: CPU.GPR[rt]._f[3] = CPU.GPR[ra]._f[3] + CPU.GPR[rb]._f[3]; WRAPPER_END(rt, ra, rb, 0);*/ - XmmVar v0(c); - c.movaps(v0, cpu_xmm(GPR[ra])); - if (ra == rb) + const XmmLink& va = XmmGet(ra); + if (const XmmLink* vb = XmmRead(rb)) { - c.addps(v0, v0); + c.addps(va.get(), vb->get()); } else { - c.addps(v0, cpu_xmm(GPR[rb])); + c.addps(va.get(), cpu_xmm(GPR[rb])); } - c.movaps(cpu_xmm(GPR[rt]), v0); + XmmFinalize(va, rt); LOG_OPCODE(); } void FS(u32 rt, u32 ra, u32 rb) @@ -1552,42 +1774,54 @@ private: CPU.GPR[rt]._f[3] = CPU.GPR[ra]._f[3] - CPU.GPR[rb]._f[3]; WRAPPER_END(rt, ra, rb, 0);*/ - XmmVar v0(c); if (ra == rb) { // zero - c.xorps(v0, v0); - c.movaps(cpu_xmm(GPR[rt]), v0); + const XmmLink& v0 = XmmAlloc(); + c.xorps(v0.get(), v0.get()); + XmmFinalize(v0, rt); } else { - c.movaps(v0, cpu_xmm(GPR[ra])); - c.subps(v0, cpu_xmm(GPR[rb])); - c.movaps(cpu_xmm(GPR[rt]), v0); + const XmmLink& va = XmmGet(ra); + if (const XmmLink* vb = XmmRead(rb)) + { + c.subps(va.get(), vb->get()); + } + else + { + c.subps(va.get(), cpu_xmm(GPR[rb])); + } + 
XmmFinalize(va, rt); } LOG_OPCODE(); } void FM(u32 rt, u32 ra, u32 rb) { - /*WRAPPER_BEGIN(rt, ra, rb, zz); + WRAPPER_BEGIN(rt, ra, rb, zz); CPU.GPR[rt]._f[0] = CPU.GPR[ra]._f[0] * CPU.GPR[rb]._f[0]; CPU.GPR[rt]._f[1] = CPU.GPR[ra]._f[1] * CPU.GPR[rb]._f[1]; CPU.GPR[rt]._f[2] = CPU.GPR[ra]._f[2] * CPU.GPR[rb]._f[2]; CPU.GPR[rt]._f[3] = CPU.GPR[ra]._f[3] * CPU.GPR[rb]._f[3]; - WRAPPER_END(rt, ra, rb, 0);*/ + WRAPPER_END(rt, ra, rb, 0); - XmmVar v0(c); - c.movaps(v0, cpu_xmm(GPR[ra])); + // !!! + + /*const XmmLink& va = XmmGet(ra); if (ra == rb) { - c.mulps(v0, v0); + c.mulps(va.get(), va.get()); + } + else if (const XmmLink* vb = XmmRead(rb)) + { + c.mulps(va.get(), vb->get()); } else { - c.mulps(v0, cpu_xmm(GPR[rb])); + c.mulps(va.get(), cpu_xmm(GPR[rb])); } - c.movaps(cpu_xmm(GPR[rt]), v0); - LOG_OPCODE(); + XmmFinalize(va, rt); + LOG_OPCODE();*/ } void CLGTH(u32 rt, u32 ra, u32 rb) { @@ -1704,12 +1938,11 @@ private: CPU.GPR[rt]._u32[w] = CPU.GPR[ra]._u32[w] + CPU.GPR[rb]._u32[w] + (CPU.GPR[rt]._u32[w] & 1); WRAPPER_END(rt, ra, rb, 0);*/ - XmmVar v0(c); - c.movdqa(v0, cpu_xmm(GPR[rt])); - c.pand(v0, imm_xmm(s19_to_s32[1])); - c.paddd(v0, cpu_xmm(GPR[ra])); - c.paddd(v0, cpu_xmm(GPR[rb])); - c.movdqa(cpu_xmm(GPR[rt]), v0); + const XmmLink& vt = XmmGet(rt); + c.pand(vt.get(), imm_xmm(s19_to_s32[1])); + c.paddd(vt.get(), cpu_xmm(GPR[ra])); + c.paddd(vt.get(), cpu_xmm(GPR[rb])); + XmmFinalize(vt, rt); LOG_OPCODE(); } void SFX(u32 rt, u32 ra, u32 rb) @@ -1719,22 +1952,25 @@ private: CPU.GPR[rt]._u32[w] = CPU.GPR[rb]._u32[w] - CPU.GPR[ra]._u32[w] - (1 - (CPU.GPR[rt]._u32[w] & 1)); WRAPPER_END(rt, ra, rb, 0);*/ - XmmVar v0(c), v1(c); - c.movdqa(v1, cpu_xmm(GPR[rt])); - c.pandn(v1, imm_xmm(s19_to_s32[1])); + const XmmLink& vt = XmmGet(rt); + c.pandn(vt.get(), imm_xmm(s19_to_s32[1])); if (ra == rb) { // load zero - c.pxor(v0, v0); + const XmmLink& v0 = XmmAlloc(); + c.pxor(v0.get(), v0.get()); + c.psubd(v0.get(), vt.get()); + XmmFinalize(v0, rt); } else { // sub - 
c.movdqa(v0, cpu_xmm(GPR[rb])); - c.psubd(v0, cpu_xmm(GPR[ra])); + const XmmLink& vb = XmmGet(rb); + c.psubd(vb.get(), cpu_xmm(GPR[ra])); + c.psubd(vb.get(), vt.get()); + XmmFinalize(vb, rt); } - c.psubd(v0, v1); - c.movdqa(cpu_xmm(GPR[rt]), v0); + XmmFinalize(vt); LOG_OPCODE(); } void CGX(u32 rt, u32 ra, u32 rb) @@ -1929,22 +2165,22 @@ private: CPU.GPR[rt]._u32[w] = CPU.GPR[ra]._u16[w*2] * CPU.GPR[rb]._u16[w*2]; WRAPPER_END(rt, ra, rb, 0);*/ - XmmVar v0(c); - c.movdqa(v0, cpu_xmm(GPR[ra])); + const XmmLink& va = XmmGet(ra); if (ra == rb) { - c.pand(v0, imm_xmm(s19_to_s32[0xffff])); - c.pmulld(v0, v0); + c.pand(va.get(), imm_xmm(s19_to_s32[0xffff])); + c.pmulld(va.get(), va.get()); } else { - XmmVar v1(c); - c.movdqa(v1, imm_xmm(s19_to_s32[0xffff])); // load mask - c.pand(v0, v1); // clear high words of each dword - c.pand(v1, cpu_xmm(GPR[rb])); - c.pmulld(v0, v1); + const XmmLink& v1 = XmmAlloc(); + c.movdqa(v1.get(), imm_xmm(s19_to_s32[0xffff])); // load mask + c.pand(va.get(), v1.get()); // clear high words of each dword + c.pand(v1.get(), cpu_xmm(GPR[rb])); + c.pmulld(va.get(), v1.get()); + XmmFinalize(v1); } - c.movdqa(cpu_xmm(GPR[rt]), v0); + XmmFinalize(va, rt); LOG_OPCODE(); } void CEQB(u32 rt, u32 ra, u32 rb) @@ -1960,9 +2196,8 @@ private: CPU.GPR[rt] = CPU.GPR[rb]; WRAPPER_END(rt, ra, rb, 0);*/ - XmmVar v0(c); - c.movaps(v0, cpu_xmm(GPR[rb])); - c.movaps(cpu_xmm(GPR[rt]), v0); + const XmmLink& vb = XmmGet(rb); + XmmFinalize(vb, rt); LOG_OPCODE(); } void HEQ(u32 rt, u32 ra, u32 rb) @@ -1992,14 +2227,13 @@ private: } WRAPPER_END(rt, ra, i8, 0);*/ - XmmVar v0(c); - c.movaps(v0, cpu_xmm(GPR[ra])); + const XmmLink& va = XmmGet(ra); if (i8 != 173) { - c.mulps(v0, imm_xmm(scale_to_int[i8 & 0xff])); // scale + c.mulps(va.get(), imm_xmm(scale_to_int[i8 & 0xff])); // scale } - c.cvttps2dq(v0, v0); // convert to ints with truncation - c.movdqa(cpu_xmm(GPR[rt]), v0); + c.cvttps2dq(va.get(), va.get()); // convert to ints with truncation + XmmFinalize(va, rt); 
LOG_OPCODE(); } void CFLTU(u32 rt, u32 ra, s32 i8) @@ -2056,14 +2290,13 @@ private: } WRAPPER_END(rt, ra, i8, 0);*/ - XmmVar v0(c); - c.movdqa(v0, cpu_xmm(GPR[ra])); - c.cvtdq2ps(v0, v0); // convert to floats + const XmmLink& va = XmmGet(ra); + c.cvtdq2ps(va.get(), va.get()); // convert to floats if (i8 != 155) { - c.mulps(v0, imm_xmm(scale_to_float[i8 & 0xff])); // scale + c.mulps(va.get(), imm_xmm(scale_to_float[i8 & 0xff])); // scale } - c.movaps(cpu_xmm(GPR[rt]), v0); + XmmFinalize(va, rt); LOG_OPCODE(); } void CUFLT(u32 rt, u32 ra, s32 i8) @@ -2265,9 +2498,19 @@ private: } WRAPPER_END(rt, i16, 0, 0);*/ - XmmVar v0(c); - c.movaps(v0, imm_xmm(fsmbi_mask[i16 & 0xffff])); - c.movaps(cpu_xmm(GPR[rt]), v0); + if (i16 == 0) + { + // zero + const XmmLink& v0 = XmmAlloc(); + c.xorps(v0.get(), v0.get()); + XmmFinalize(v0, rt); + } + else + { + const XmmLink& vr = XmmAlloc(); + c.movaps(vr.get(), imm_xmm(fsmbi_mask[i16 & 0xffff])); + XmmFinalize(vr, rt); + } LOG_OPCODE(); } void BRSL(u32 rt, s32 i16) @@ -2316,20 +2559,20 @@ private: CPU.GPR[rt]._i32[3] = (s32)i16; WRAPPER_END(rt, i16, 0, 0);*/ - XmmVar v0(c); + const XmmLink& vr = XmmAlloc(); if (i16 == 0) { - c.xorps(v0, v0); + c.xorps(vr.get(), vr.get()); } else if (i16 == -1) { - c.cmpps(v0, v0, 0); + c.cmpps(vr.get(), vr.get(), 0); } else { - c.movaps(v0, imm_xmm(s19_to_s32[i16 & 0x7ffff])); + c.movaps(vr.get(), imm_xmm(s19_to_s32[i16 & 0x7ffff])); } - c.movaps(cpu_xmm(GPR[rt]), v0); + XmmFinalize(vr, rt); LOG_OPCODE(); } void ILHU(u32 rt, s32 i16) @@ -2339,22 +2582,23 @@ private: CPU.GPR[rt]._i32[w] = (s32)i16 << 16; WRAPPER_END(rt, i16, 0, 0);*/ - XmmVar v0(c); + const XmmLink& vr = XmmAlloc(); if (i16 == 0) { - c.xorps(v0, v0); + c.xorps(vr.get(), vr.get()); } else if (i16 == -1) { - c.cmpps(v0, v0, 0); - c.pslld(v0, 16); + c.cmpps(vr.get(), vr.get(), 0); + c.pslld(vr.get(), 16); } else { - c.movaps(v0, imm_xmm(s19_to_s32[i16 & 0x7ffff])); - c.pslld(v0, 16); + c.movaps(vr.get(), imm_xmm(s19_to_s32[i16 & 
0x7ffff])); + c.pslld(vr.get(), 16); } - c.movaps(cpu_xmm(GPR[rt]), v0); + XmmFinalize(vr, rt); + LOG_OPCODE(); } void ILH(u32 rt, s32 i16) { @@ -2370,16 +2614,15 @@ private: CPU.GPR[rt]._i32[w] |= (i16 & 0xFFFF); WRAPPER_END(rt, i16, 0, 0);*/ - XmmVar v0(c); if (i16 == 0) { // nop } else { - c.movaps(v0, cpu_xmm(GPR[rt])); - c.orps(v0, imm_xmm(s19_to_s32[i16 & 0xffff])); - c.movaps(cpu_xmm(GPR[rt]), v0); + const XmmLink& vt = XmmGet(rt); + c.orps(vt.get(), imm_xmm(s19_to_s32[i16 & 0xffff])); + XmmFinalize(vt, rt); } LOG_OPCODE(); } @@ -2393,28 +2636,28 @@ private: CPU.GPR[rt]._i32[i] = CPU.GPR[ra]._i32[i] | (s32)i10; WRAPPER_END(rt, ra, i10, 0);*/ - XmmVar v0(c); if (i10 == -1) { // fill with 1 - c.cmpps(v0, v0, 0); - c.movaps(cpu_xmm(GPR[rt]), v0); + const XmmLink& v1 = XmmAlloc(); + c.cmpps(v1.get(), v1.get(), 0); + XmmFinalize(v1, rt); } else if (i10 == 0) { if (rt != ra) { // mov - c.movaps(v0, cpu_xmm(GPR[ra])); - c.movaps(cpu_xmm(GPR[rt]), v0); + const XmmLink& v0 = XmmGet(ra); + XmmFinalize(v0, rt); } // else nop } else { - c.movaps(v0, cpu_xmm(GPR[ra])); - c.orps(v0, imm_xmm(s19_to_s32[i10 & 0x7ffff])); - c.movaps(cpu_xmm(GPR[rt]), v0); + const XmmLink& va = XmmGet(ra); + c.orps(va.get(), imm_xmm(s19_to_s32[i10 & 0x7ffff])); + XmmFinalize(va, rt); } LOG_OPCODE(); } @@ -2439,21 +2682,50 @@ private: CPU.GPR[rt]._i32[w] = (s32)i10 - CPU.GPR[ra]._i32[w]; WRAPPER_END(rt, ra, i10, 0);*/ - XmmVar v0(c); if (i10 == 0) { - c.pxor(v0, v0); + // zero + const XmmLink& v0 = XmmAlloc(); + c.xorps(v0.get(), v0.get()); + if (const XmmLink* va = XmmRead(ra)) + { + c.psubd(v0.get(), va->get()); + } + else + { + c.psubd(v0.get(), cpu_xmm(GPR[ra])); + } + XmmFinalize(v0, rt); } else if (i10 == -1) { - c.pcmpeqd(v0, v0); + // fill with 1 + const XmmLink& v1 = XmmAlloc(); + c.pcmpeqd(v1.get(), v1.get()); + if (const XmmLink* va = XmmRead(ra)) + { + c.psubd(v1.get(), va->get()); + } + else + { + c.psubd(v1.get(), cpu_xmm(GPR[ra])); + } + XmmFinalize(v1, rt); } else { - 
c.movdqa(v0, imm_xmm(s19_to_s32[i10 & 0x7ffff])); + const XmmLink& vr = XmmAlloc(); + c.movdqa(vr.get(), imm_xmm(s19_to_s32[i10 & 0x7ffff])); + if (const XmmLink* va = XmmRead(ra)) + { + c.psubd(vr.get(), va->get()); + } + else + { + c.psubd(vr.get(), cpu_xmm(GPR[ra])); + } + XmmFinalize(vr, rt); } - c.psubd(v0, cpu_xmm(GPR[ra])); - c.movdqa(cpu_xmm(GPR[rt]), v0); LOG_OPCODE(); } void SFHI(u32 rt, u32 ra, s32 i10) @@ -2470,28 +2742,28 @@ private: CPU.GPR[rt]._i32[w] = CPU.GPR[ra]._i32[w] & (s32)i10; WRAPPER_END(rt, ra, i10, 0);*/ - XmmVar v0(c); if (i10 == 0) { // zero - c.xorps(v0, v0); - c.movaps(cpu_xmm(GPR[rt]), v0); + const XmmLink& v0 = XmmAlloc(); + c.xorps(v0.get(), v0.get()); + XmmFinalize(v0, rt); } else if (i10 == -1) { // mov if (ra != rt) { - c.movaps(v0, cpu_xmm(GPR[ra])); - c.movaps(cpu_xmm(GPR[rt]), v0); + const XmmLink& va = XmmGet(ra); + XmmFinalize(va, rt); } // else nop } else { - c.movaps(v0, cpu_xmm(GPR[ra])); - c.andps(v0, imm_xmm(s19_to_s32[i10 & 0x7ffff])); - c.movaps(cpu_xmm(GPR[rt]), v0); + const XmmLink& va = XmmGet(ra); + c.andps(va.get(), imm_xmm(s19_to_s32[i10 & 0x7ffff])); + XmmFinalize(va, rt); } LOG_OPCODE(); } @@ -2518,23 +2790,22 @@ private: CPU.GPR[rt]._i32[3] = CPU.GPR[ra]._i32[3] + (s32)i10; WRAPPER_END(rt, ra, i10, 0);*/ - XmmVar v0(c); if (i10 == 0) { if (rt != ra) { // mov - c.movaps(v0, cpu_xmm(GPR[ra])); - c.movaps(cpu_xmm(GPR[rt]), v0); + const XmmLink& v0 = XmmGet(ra); + XmmFinalize(v0, rt); } // else nop } else { // add - c.movdqa(v0, cpu_xmm(GPR[ra])); - c.paddd(v0, imm_xmm(s19_to_s32[i10 & 0x7ffff])); - c.movdqa(cpu_xmm(GPR[rt]), v0); + const XmmLink& va = XmmGet(ra); + c.paddd(va.get(), imm_xmm(s19_to_s32[i10 & 0x7ffff])); + XmmFinalize(va, rt); } LOG_OPCODE(); } @@ -2629,10 +2900,9 @@ private: CPU.GPR[rt]._u32[w] = CPU.GPR[ra]._i32[w] > (s32)i10 ? 
0xffffffff : 0; WRAPPER_END(rt, ra, i10, 0);*/ - XmmVar v0(c); - c.movdqa(v0, cpu_xmm(GPR[ra])); - c.pcmpgtd(v0, imm_xmm(s19_to_s32[i10 & 0x7ffff])); - c.movdqa(cpu_xmm(GPR[rt]), v0); + const XmmLink& va = XmmGet(ra); + c.pcmpgtd(va.get(), imm_xmm(s19_to_s32[i10 & 0x7ffff])); + XmmFinalize(va, rt); LOG_OPCODE(); } void CGTHI(u32 rt, u32 ra, s32 i10) @@ -2670,8 +2940,9 @@ private: if (i10 == -1) { // zero result - c.xorps(v0, v0); - c.movaps(cpu_xmm(GPR[rt]), v0); + const XmmLink& v0 = XmmAlloc(); + c.xorps(v0.get(), v0.get()); + XmmFinalize(v0, rt); } else { @@ -2733,10 +3004,9 @@ private: CPU.GPR[rt]._u32[i] = (CPU.GPR[ra]._i32[i] == (s32)i10) ? 0xffffffff : 0x00000000; WRAPPER_END(rt, ra, i10, 0);*/ - XmmVar v0(c); - c.movdqa(v0, cpu_xmm(GPR[ra])); - c.pcmpeqd(v0, imm_xmm(s19_to_s32[i10 & 0x7ffff])); - c.movdqa(cpu_xmm(GPR[rt]), v0); + const XmmLink& va = XmmGet(ra); + c.pcmpeqd(va.get(), imm_xmm(s19_to_s32[i10 & 0x7ffff])); + XmmFinalize(va, rt); LOG_OPCODE(); } void CEQHI(u32 rt, u32 ra, s32 i10) @@ -2781,39 +3051,41 @@ private: CPU.GPR[rt]._u32[3] = i18 & 0x3FFFF; WRAPPER_END(rt, i18, 0, 0);*/ - XmmVar v0(c); + const XmmLink& vr = XmmAlloc(); if (i18 == 0) { - c.xorps(v0, v0); + c.xorps(vr.get(), vr.get()); } else { - c.movaps(v0, imm_xmm(s19_to_s32[i18 & 0x3ffff])); + c.movaps(vr.get(), imm_xmm(s19_to_s32[i18 & 0x3ffff])); } - c.movaps(cpu_xmm(GPR[rt]), v0); + XmmFinalize(vr, rt); LOG_OPCODE(); } //0 - 3 void SELB(u32 rt, u32 ra, u32 rb, u32 rc) { - /*WRAPPER_BEGIN(rt, ra, rb, rc); + WRAPPER_BEGIN(rt, ra, rb, rc); for (u64 i = 0; i < 2; ++i) { CPU.GPR[rt]._u64[i] = (CPU.GPR[rc]._u64[i] & CPU.GPR[rb]._u64[i]) | (~CPU.GPR[rc]._u64[i] & CPU.GPR[ra]._u64[i]); } - WRAPPER_END(rt, ra, rb, rc);*/ + WRAPPER_END(rt, ra, rb, rc); - XmmVar v0(c), v1(c); - c.movaps(v0, cpu_xmm(GPR[rb])); - c.movaps(v1, cpu_xmm(GPR[rc])); - c.andps(v0, v1); - c.andnps(v1, cpu_xmm(GPR[ra])); - c.orps(v0, v1); - c.movaps(cpu_xmm(GPR[rt]), v0); - LOG_OPCODE(); + // !!! 
+ + /*const XmmLink& vb = XmmGet(rb); + const XmmLink& vc = XmmGet(rc); + c.andps(vb.get(), vc.get()); + c.andnps(vc.get(), cpu_xmm(GPR[ra])); + c.orps(vb.get(), vc.get()); + XmmFinalize(vb, rt); + XmmFinalize(vc); + LOG_OPCODE();*/ } void SHUFB(u32 rt, u32 ra, u32 rb, u32 rc) { @@ -2851,37 +3123,45 @@ private: } WRAPPER_END(rc, rt, ra, rb); - /*XmmVar v0(c), v1(c), v2(c), v3(c), v4(c), vFF(c); - c.movdqa(v0, cpu_xmm(GPR[rc])); // v0 = mask + /*const XmmLink& v0 = XmmGet(rc); // v0 = mask + const XmmLink& v1 = XmmAlloc(); + const XmmLink& v2 = XmmCopy(v0); // v2 = mask + const XmmLink& v3 = XmmAlloc(); + const XmmLink& v4 = XmmAlloc(); + const XmmLink& vFF = XmmAlloc(); // generate specific values: - c.movdqa(v1, imm_xmm(u8_to_u8[0xe0])); // v1 = 11100000 - c.movdqa(v2, v0); // copy mask v2 = mask - c.movdqa(v3, imm_xmm(u8_to_u8[0x80])); // v3 = 10000000 - c.pand(v2, v1); // filter mask v2 = mask & 11100000 - c.movdqa(vFF, v2); // load filtered mask vFF = mask & 11100000 - c.movdqa(v4, imm_xmm(u8_to_u8[0xc0])); // v4 = 11000000 - c.pcmpeqb(vFF, v4); // gen 0xff values vFF = (mask & 11100000 == 11000000) ? 0xff : 0 - c.movdqa(v4, v2); // load filtered mask v4 = mask & 11100000 - c.pand(v4, v3); // filter mask again v4 = mask & 10000000 - c.pcmpeqb(v2, v1); // v2 = (mask & 11100000 == 11100000) ? 0xff : 0 - c.pcmpeqb(v4, v3); // v4 = (mask & 10000000 == 10000000) ? 0xff : 0 - c.pand(v2, v3); // generate 0x80 values v2 = (mask & 11100000 == 11100000) ? 0x80 : 0 - c.por(vFF, v2); // merge 0xff and 0x80 vFF = (mask & 11100000 == 11000000) ? 0xff : (mask & 11100000 == 11100000) ? 
0x80 : 0 - c.pandn(v1, v0); // filter mask v1 = mask & 00011111 + c.movdqa(v1.get(), imm_xmm(u8_to_u8[0xe0])); // v1 = 11100000 + c.movdqa(v3.get(), imm_xmm(u8_to_u8[0x80])); // v3 = 10000000 + c.pand(v2.get(), v1.get()); // filter mask v2 = mask & 11100000 + c.movdqa(vFF.get(), v2.get()); // and copy vFF = mask & 11100000 + c.movdqa(v4.get(), imm_xmm(u8_to_u8[0xc0])); // v4 = 11000000 + c.pcmpeqb(vFF.get(), v4.get()); // gen 0xff vFF = (mask & 11100000 == 11000000) ? 0xff : 0 + c.movdqa(v4.get(), v2.get()); // copy again v4 = mask & 11100000 + c.pand(v4.get(), v3.get()); // filter mask v4 = mask & 10000000 + c.pcmpeqb(v2.get(), v1.get()); // v2 = (mask & 11100000 == 11100000) ? 0xff : 0 + c.pcmpeqb(v4.get(), v3.get()); // v4 = (mask & 10000000 == 10000000) ? 0xff : 0 + c.pand(v2.get(), v3.get()); // generate 0x80 v2 = (mask & 11100000 == 11100000) ? 0x80 : 0 + c.por(vFF.get(), v2.get()); // merge 0xff, 0x80 vFF = (mask & 11100000 == 11000000) ? 0xff : (mask & 11100000 == 11100000) ? 0x80 : 0 + c.pandn(v1.get(), v0.get()); // filter mask v1 = mask & 00011111 // select bytes from [rb]: - c.movdqa(v2, imm_xmm(u8_to_u8[15])); // v2 = 00001111 - c.pxor(v1, imm_xmm(u8_to_u8[0x10])); // v1 = (mask & 00011111) ^ 00010000 - c.psubb(v2, v1); // v2 = 00001111 - ((mask & 00011111) ^ 00010000) - c.movdqa(v1, cpu_xmm(GPR[rb])); // v1 = rb - c.pshufb(v1, v2); // v1 = select(rb, 00001111 - ((mask & 00011111) ^ 00010000)) + c.movdqa(v2.get(), imm_xmm(u8_to_u8[15])); // v2 = 00001111 + c.pxor(v1.get(), imm_xmm(u8_to_u8[0x10])); // v1 = (mask & 00011111) ^ 00010000 + c.psubb(v2.get(), v1.get()); // v2 = 00001111 - ((mask & 00011111) ^ 00010000) + c.movdqa(v1.get(), cpu_xmm(GPR[rb])); // v1 = rb + c.pshufb(v1.get(), v2.get()); // v1 = select(rb, 00001111 - ((mask & 00011111) ^ 00010000)) // select bytes from [ra]: - c.pxor(v2, imm_xmm(u8_to_u8[0xf0])); // v2 = (00001111 - ((mask & 00011111) ^ 00010000)) ^ 11110000 - c.movdqa(v3, cpu_xmm(GPR[ra])); // v3 = ra - c.pshufb(v3, v2); // v3 
= select(ra, (00001111 - ((mask & 00011111) ^ 00010000)) ^ 11110000) - c.por(v1, v3); // v1 = select(rb, 00001111 - ((mask & 00011111) ^ 00010000)) | (v3) - c.pandn(v4, v1); // filter result v4 = v1 & ((mask & 10000000 == 10000000) ? 0 : 0xff) - c.por(vFF, v4); // final merge vFF = (mask & 10000000 == 10000000) ? ((mask & 11100000 == 11000000) ? 0xff : (mask & 11100000 == 11100000) ? 0x80 : 0) : (v1) - c.movdqa(cpu_xmm(GPR[rt]), vFF); + c.pxor(v2.get(), imm_xmm(u8_to_u8[0xf0])); // v2 = (00001111 - ((mask & 00011111) ^ 00010000)) ^ 11110000 + c.movdqa(v3.get(), cpu_xmm(GPR[ra])); // v3 = ra + c.pshufb(v3.get(), v2.get()); // v3 = select(ra, (00001111 - ((mask & 00011111) ^ 00010000)) ^ 11110000) + c.por(v1.get(), v3.get()); // v1 = select(rb, 00001111 - ((mask & 00011111) ^ 00010000)) | (v3) + c.pandn(v4.get(), v1.get()); // filter result v4 = v1 & ((mask & 10000000 == 10000000) ? 0 : 0xff) + c.por(vFF.get(), v4.get()); // final merge vFF = (mask & 10000000 == 10000000) ? ((mask & 11100000 == 11000000) ? 0xff : (mask & 11100000 == 11100000) ? 0x80 : 0) : (v1) + XmmFinalize(vFF, rt); + XmmFinalize(v4); + XmmFinalize(v3); + XmmFinalize(v2); + XmmFinalize(v1); + XmmFinalize(v0); LOG_OPCODE();*/ /*WRAPPER_BEGIN(rt, xx, yy, zz); @@ -2897,89 +3177,113 @@ private: } void FNMS(u32 rt, u32 ra, u32 rb, u32 rc) { - /*WRAPPER_BEGIN(rt, ra, rb, rc); + WRAPPER_BEGIN(rt, ra, rb, rc); CPU.GPR[rt]._f[0] = CPU.GPR[rc]._f[0] - CPU.GPR[ra]._f[0] * CPU.GPR[rb]._f[0]; CPU.GPR[rt]._f[1] = CPU.GPR[rc]._f[1] - CPU.GPR[ra]._f[1] * CPU.GPR[rb]._f[1]; CPU.GPR[rt]._f[2] = CPU.GPR[rc]._f[2] - CPU.GPR[ra]._f[2] * CPU.GPR[rb]._f[2]; CPU.GPR[rt]._f[3] = CPU.GPR[rc]._f[3] - CPU.GPR[ra]._f[3] * CPU.GPR[rb]._f[3]; - WRAPPER_END(rt, ra, rb, rc);*/ + WRAPPER_END(rt, ra, rb, rc); + + // !!! + + /*const XmmLink& va = XmmGet(ra); + const XmmLink& vc = (ra == rc) ? 
XmmCopy(va) : XmmGet(rc); - XmmVar v0(c), v1(c); - c.movaps(v0, cpu_xmm(GPR[ra])); - if (ra == rc) - { - c.movaps(v1, v0); - } - else - { - c.movaps(v1, cpu_xmm(GPR[rc])); - } if (ra == rb) { - c.mulps(v0, v0); + c.mulps(va.get(), va.get()); + } + else if (rb == rc) + { + c.mulps(va.get(), vc.get()); + } + else if (const XmmLink* vb = XmmRead(rb)) + { + c.mulps(va.get(), vb->get()); } else { - if (rb == rc) - { - c.mulps(v0, v1); - } - else - { - c.mulps(v0, cpu_xmm(GPR[rb])); - } + c.mulps(va.get(), cpu_xmm(GPR[rb])); } - c.subps(v1, v0); - c.movaps(cpu_xmm(GPR[rt]), v1); - LOG_OPCODE(); + c.subps(vc.get(), va.get()); + XmmFinalize(vc, rt); + XmmFinalize(va); + LOG_OPCODE();*/ } void FMA(u32 rt, u32 ra, u32 rb, u32 rc) { - /*WRAPPER_BEGIN(rt, ra, rb, rc); + WRAPPER_BEGIN(rt, ra, rb, rc); CPU.GPR[rt]._f[0] = CPU.GPR[ra]._f[0] * CPU.GPR[rb]._f[0] + CPU.GPR[rc]._f[0]; CPU.GPR[rt]._f[1] = CPU.GPR[ra]._f[1] * CPU.GPR[rb]._f[1] + CPU.GPR[rc]._f[1]; CPU.GPR[rt]._f[2] = CPU.GPR[ra]._f[2] * CPU.GPR[rb]._f[2] + CPU.GPR[rc]._f[2]; CPU.GPR[rt]._f[3] = CPU.GPR[ra]._f[3] * CPU.GPR[rb]._f[3] + CPU.GPR[rc]._f[3]; - WRAPPER_END(rt, ra, rb, rc);*/ + WRAPPER_END(rt, ra, rb, rc); - XmmVar v0(c); - c.movaps(v0, cpu_xmm(GPR[ra])); + // !!! 
+ + /*const XmmLink& va = XmmGet(ra); if (ra == rc || rb == rc) { - XmmVar v1(c); if (ra == rc) { - c.movaps(v1, v0); + const XmmLink& vc = XmmCopy(va); if (ra == rb) // == rc { - c.mulps(v0, v0); - c.addps(v0, v1); + c.mulps(va.get(), va.get()); + } + else if (const XmmLink* vb = XmmRead(rb)) + { + c.mulps(va.get(), vb->get()); } else { - c.mulps(v0, cpu_xmm(GPR[rb])); - c.addps(v0, v1); + c.mulps(va.get(), cpu_xmm(GPR[rb])); } + c.addps(va.get(), vc.get()); + XmmFinalize(vc); } else // rb == rc { - c.movaps(v1, cpu_xmm(GPR[rb])); - c.mulps(v0, v1); - c.addps(v0, v1); + const XmmLink& vb = XmmGet(rb); + c.mulps(va.get(), vb.get()); + c.addps(va.get(), vb.get()); + XmmFinalize(vb); } } else if (ra == rb) { - c.mulps(v0, v0); - c.addps(v0, cpu_xmm(GPR[rc])); + c.mulps(va.get(), va.get()); + if (const XmmLink* vc = XmmRead(rc)) + { + c.addps(va.get(), vc->get()); + } + else + { + c.addps(va.get(), cpu_xmm(GPR[rc])); + } } else { - c.mulps(v0, cpu_xmm(GPR[rb])); - c.addps(v0, cpu_xmm(GPR[rc])); + if (const XmmLink* vb = XmmRead(rb)) + { + c.mulps(va.get(), vb->get()); + } + else + { + c.mulps(va.get(), cpu_xmm(GPR[rb])); + } + + if (const XmmLink* vc = XmmRead(rc)) // !!! 
+ { + c.addps(va.get(), vc->get()); + } + else + { + c.addps(va.get(), cpu_xmm(GPR[rc])); + } } - c.movaps(cpu_xmm(GPR[rt]), v0); - LOG_OPCODE(); + XmmFinalize(va, rt); + LOG_OPCODE();*/ } void FMS(u32 rt, u32 ra, u32 rb, u32 rc) { diff --git a/rpcs3/Emu/Cell/SPURecompilerCore.cpp b/rpcs3/Emu/Cell/SPURecompilerCore.cpp index 120d55af8e..d59ed5b93b 100644 --- a/rpcs3/Emu/Cell/SPURecompilerCore.cpp +++ b/rpcs3/Emu/Cell/SPURecompilerCore.cpp @@ -56,9 +56,13 @@ void SPURecompilerCore::Compile(u16 pos) GpVar pos_var(compiler, kVarTypeUInt32, "pos"); compiler.setArg(3, pos_var); compiler.alloc(pos_var); - m_enc->pos_var = &pos_var; + for (u32 i = 0; i < 16; i++) + { + m_enc->xmm_var[i].data = new XmmVar(compiler); + } + compiler.xor_(pos_var, pos_var); while (true) @@ -94,6 +98,15 @@ void SPURecompilerCore::Compile(u16 pos) pos++; } + m_enc->XmmRelease(); + + for (u32 i = 0; i < 16; i++) + { + assert(!m_enc->xmm_var[i].taken); + delete m_enc->xmm_var[i].data; + m_enc->xmm_var[i].data = nullptr; + } + const u64 stamp1 = get_system_time(); compiler.ret(pos_var); compiler.endFunc(); From dab07513009f539508009fc3b53516d3488bead4 Mon Sep 17 00:00:00 2001 From: Nekotekina Date: Wed, 16 Apr 2014 15:09:06 +0400 Subject: [PATCH 08/14] ... 
--- rpcs3/Emu/ARMv7/ARMv7Decoder.h | 2 +- rpcs3/Emu/ARMv7/ARMv7Interpreter.h | 6 +- rpcs3/Emu/Cell/SPURecompiler.h | 628 +++++++++++++---------- rpcs3/Emu/Cell/SPURecompilerCore.cpp | 35 +- rpcs3/Emu/Cell/SPUThread.h | 2 +- rpcs3/Emu/SysCalls/lv2/SC_SPU_Thread.cpp | 2 +- 6 files changed, 394 insertions(+), 281 deletions(-) diff --git a/rpcs3/Emu/ARMv7/ARMv7Decoder.h b/rpcs3/Emu/ARMv7/ARMv7Decoder.h index 71f37e3b38..b19bf94cd3 100644 --- a/rpcs3/Emu/ARMv7/ARMv7Decoder.h +++ b/rpcs3/Emu/ARMv7/ARMv7Decoder.h @@ -59,7 +59,7 @@ public: u8 I1 = 1 - (J1 ^ S); u8 I2 = 1 - (J2 ^ S); u16 imm11 = code1 & 0x7ff; - u32 imm32; + u32 imm32 = 0; switch(code1 >> 14) { diff --git a/rpcs3/Emu/ARMv7/ARMv7Interpreter.h b/rpcs3/Emu/ARMv7/ARMv7Interpreter.h index a11144e9b6..9aabb473fc 100644 --- a/rpcs3/Emu/ARMv7/ARMv7Interpreter.h +++ b/rpcs3/Emu/ARMv7/ARMv7Interpreter.h @@ -95,7 +95,7 @@ public: SRType DecodeImmShift(u8 type, u8 imm5, uint* shift_n) { - SRType shift_t; + SRType shift_t = SRType_None; switch(type) { @@ -119,7 +119,7 @@ public: SRType DecodeRegShift(u8 type) { - SRType shift_t; + SRType shift_t = SRType_None; switch(type) { @@ -235,7 +235,7 @@ public: bool ConditionPassed(u8 cond) { - bool result; + bool result = false; switch(cond >> 1) { diff --git a/rpcs3/Emu/Cell/SPURecompiler.h b/rpcs3/Emu/Cell/SPURecompiler.h index 6aeaa8810c..b6aafbfdc5 100644 --- a/rpcs3/Emu/Cell/SPURecompiler.h +++ b/rpcs3/Emu/Cell/SPURecompiler.h @@ -94,7 +94,7 @@ class SPURecompilerCore : public CPUDecoder public: SPUInterpreter* inter; JitRuntime runtime; - //Compiler compiler; + bool first; struct SPURecEntry { @@ -127,9 +127,11 @@ public: #define imm_xmm(x) oword_ptr(*imm_var, offsetof(SPUImmTable, x)) -#define LOG_OPCODE(...) //ConLog.Write("Compiled "__FUNCTION__"()"__VA_ARGS__) +#define LOG_OPCODE(...) //ConLog.Write("Compiled "__FUNCTION__"(): "__VA_ARGS__) -#define LOG3_OPCODE(...) //ConLog.Write("Linked "__FUNCTION__"()"__VA_ARGS__) +#define LOG3_OPCODE(...) 
//ConLog.Write("Linked "__FUNCTION__"(): "__VA_ARGS__) + +#define LOG4_OPCODE(...) //c.addComment(fmt::Format("SPU info: "__FUNCTION__"(): "__VA_ARGS__).c_str()) #define WRAPPER_BEGIN(a0, a1, a2, a3) struct opwr_##a0 \ { \ @@ -139,7 +141,7 @@ public: #define WRAPPER_END(a0, a1, a2, a3) /*LOG2_OPCODE();*/ } \ }; \ - c.mov(cpu_qword(PC), (u32)CPU.PC); \ + /*XmmRelease();*/ \ if (#a0[0] == 'r') XmmInvalidate(a0); \ if (#a1[0] == 'r') XmmInvalidate(a1); \ if (#a2[0] == 'r') XmmInvalidate(a2); \ @@ -149,7 +151,7 @@ public: call##a0->setArg(1, imm_u(a1)); \ call##a0->setArg(2, imm_u(a2)); \ call##a0->setArg(3, imm_u(a3)); \ - LOG3_OPCODE(); + LOG3_OPCODE(/*#a0"=%d, "#a1"=%d, "#a2"=%d, "#a3"=%d", a0, a1, a2, a3*/); class SPURecompiler : public SPUOpcodes @@ -161,10 +163,16 @@ private: public: Compiler* compiler; bool do_finalize; + // input: GpVar* cpu_var; GpVar* ls_var; GpVar* imm_var; + // (input) output: GpVar* pos_var; + // temporary: + GpVar* addr; + GpVar* qw0; + GpVar* qw1; struct XmmLink { @@ -197,12 +205,14 @@ public: const XmmLink& XmmAlloc() // get empty xmm register { - for (u32 i = 15; ~i; i--) + for (u32 i = 0; i < 16; i++) { if ((xmm_var[i].reg == -1) && !xmm_var[i].taken) { xmm_var[i].taken = true; xmm_var[i].got = false; + LOG4_OPCODE("free reg taken (i=%d)", i); + xmm_var[i].reg = -1; return xmm_var[i]; } } @@ -210,9 +220,12 @@ public: { if (!xmm_var[i].taken) { - c.movaps(cpu_xmm(GPR[xmm_var[i].reg]), *xmm_var[i].data); + //(saving cached data?) 
+ //c.movaps(cpu_xmm(GPR[xmm_var[i].reg]), *xmm_var[i].data); xmm_var[i].taken = true; xmm_var[i].got = false; + LOG4_OPCODE("cached reg taken (i=%d): GPR[%d] lost", i, xmm_var[i].reg); + xmm_var[i].reg = -1; return xmm_var[i]; } } @@ -232,10 +245,12 @@ public: if (xmm_var[i].taken) throw "XmmGet(): xmm_var is taken"; xmm_var[i].taken = true; xmm_var[i].got = false; + xmm_var[i].reg = -1; for (u32 j = i + 1; j < 16; j++) { if (xmm_var[j].reg == reg) throw "XmmGet(): xmm_var duplicate"; } + LOG4_OPCODE("cached GPR[%d] used (i=%d)", reg, i); break; } } @@ -243,15 +258,19 @@ public: { res = &(XmmLink&)XmmAlloc(); c.movaps(*res->data, cpu_xmm(GPR[reg])); + res->reg = -1; + LOG4_OPCODE("* cached GPR[%d] not found", reg); } return *res; } const XmmLink& XmmCopy(const XmmLink& from) // XmmAlloc + mov { - const XmmLink& res = XmmAlloc(); - c.movaps(*res.data, *from.data); - return res; + XmmLink* res = &(XmmLink&)XmmAlloc(); + c.movaps(*res->data, *from.data); + res->reg = -1; + LOG4_OPCODE("*"); + return *res; } const XmmLink* XmmRead(const s8 reg) const // get xmm register with specific SPU reg or nullptr @@ -262,9 +281,11 @@ public: if (xmm_var[i].reg == reg) { if (xmm_var[i].got && xmm_var[i].taken) throw "XmmRead(): wrong reuse"; + LOG4_OPCODE("GPR[%d] has been read (i=%d)", reg, i); return &xmm_var[i]; } } + LOG4_OPCODE("GPR[%d] not found", reg); return nullptr; } @@ -276,6 +297,7 @@ public: if (xmm_var[i].reg == reg) { if (xmm_var[i].taken) throw "XmmInvalidate(): xmm_var is taken"; + LOG4_OPCODE("GPR[%d] invalidated (i=%d)", reg, i); xmm_var[i].reg = -1; } } @@ -284,10 +306,11 @@ public: void XmmFinalize(const XmmLink& var, s8 reg = -1) { // invalidation - for (u32 i = 0; i < 16; i++) + if (reg >= 0) for (u32 i = 0; i < 16; i++) { if (xmm_var[i].reg == reg) { + LOG4_OPCODE("GPR[%d] invalidated (i=%d)", reg, i); xmm_var[i].reg = -1; } } @@ -297,9 +320,16 @@ public: { assert(xmm_var[i].taken); // save immediately: - if (reg >= 0) c.movaps(cpu_xmm(GPR[reg]), 
*xmm_var[i].data); + if (reg >= 0) + { + c.movaps(cpu_xmm(GPR[reg]), *xmm_var[i].data); + } + else + { + } + LOG4_OPCODE("GPR[%d] finalized (i=%d), GPR[%d] replaced", reg, i, xmm_var[i].reg); // (to disable caching:) - //reg = -1; + reg = -1; xmm_var[i].reg = reg; xmm_var[i].taken = false; return; @@ -315,6 +345,7 @@ public: if (xmm_var[i].reg >= 0) { //c.movaps(cpu_xmm(GPR[xmm_var[i].reg]), *xmm_var[i].data); + LOG4_OPCODE("GPR[%d] released (i=%d)", xmm_var[i].reg, i); xmm_var[i].reg = -1; } } @@ -377,6 +408,7 @@ private: } void RDCH(u32 rt, u32 ra) { + c.mov(cpu_qword(PC), (u32)CPU.PC); WRAPPER_BEGIN(rt, ra, yy, zz); CPU.ReadChannel(CPU.GPR[rt], ra); WRAPPER_END(rt, ra, 0, 0); @@ -384,6 +416,7 @@ private: } void RCHCNT(u32 rt, u32 ra) { + c.mov(cpu_qword(PC), (u32)CPU.PC); WRAPPER_BEGIN(rt, ra, yy, zz); CPU.GPR[rt].Reset(); CPU.GPR[rt]._u32[3] = CPU.GetChannelCount(ra); @@ -525,19 +558,18 @@ private: } void NOR(u32 rt, u32 ra, u32 rb) { - WRAPPER_BEGIN(rt, ra, rb, zz); + /*WRAPPER_BEGIN(rt, ra, rb, zz); CPU.GPR[rt]._u32[0] = ~(CPU.GPR[ra]._u32[0] | CPU.GPR[rb]._u32[0]); CPU.GPR[rt]._u32[1] = ~(CPU.GPR[ra]._u32[1] | CPU.GPR[rb]._u32[1]); CPU.GPR[rt]._u32[2] = ~(CPU.GPR[ra]._u32[2] | CPU.GPR[rb]._u32[2]); CPU.GPR[rt]._u32[3] = ~(CPU.GPR[ra]._u32[3] | CPU.GPR[rb]._u32[3]); - WRAPPER_END(rt, ra, rb, 0); + WRAPPER_END(rt, ra, rb, 0);*/ - /*XmmVar v0(c); - c.movaps(v0, cpu_xmm(GPR[ra])); - if (ra != rb) c.orps(v0, cpu_xmm(GPR[rb])); - c.xorps(v0, imm_xmm(s19_to_s32[0x7ffff])); - c.movaps(cpu_xmm(GPR[rt]), v0); - LOG_OPCODE();*/ + const XmmLink& va = XmmGet(ra); + if (ra != rb) c.orps(va.get(), cpu_xmm(GPR[rb])); + c.xorps(va.get(), imm_xmm(s19_to_s32[0x7ffff])); + XmmFinalize(va, rt); + LOG_OPCODE(); } void ABSDB(u32 rt, u32 ra, u32 rb) { @@ -772,30 +804,35 @@ private: } void A(u32 rt, u32 ra, u32 rb) { - WRAPPER_BEGIN(rt, ra, rb, zz); + /*WRAPPER_BEGIN(rt, ra, rb, zz); CPU.GPR[rt]._u32[0] = CPU.GPR[ra]._u32[0] + CPU.GPR[rb]._u32[0]; CPU.GPR[rt]._u32[1] = 
CPU.GPR[ra]._u32[1] + CPU.GPR[rb]._u32[1]; CPU.GPR[rt]._u32[2] = CPU.GPR[ra]._u32[2] + CPU.GPR[rb]._u32[2]; CPU.GPR[rt]._u32[3] = CPU.GPR[ra]._u32[3] + CPU.GPR[rb]._u32[3]; - WRAPPER_END(rt, ra, rb, 0); + WRAPPER_END(rt, ra, rb, 0);*/ // !!! - /*const XmmLink& vb = XmmGet(rb); if (ra == rb) { + const XmmLink& vb = XmmGet(rb); c.paddd(vb.get(), vb.get()); - } - else if (const XmmLink* va = XmmRead(ra)) - { - c.paddd(vb.get(), va->get()); + XmmFinalize(vb, rt); } else { - c.paddd(vb.get(), cpu_xmm(GPR[ra])); + const XmmLink& vb = XmmGet(rb); + if (const XmmLink* va = XmmRead(ra)) + { + c.paddd(vb.get(), va->get()); + } + else + { + c.paddd(vb.get(), cpu_xmm(GPR[ra])); + } + XmmFinalize(vb, rt); } - XmmFinalize(vb, rt); - LOG_OPCODE();*/ + LOG_OPCODE(); } void AND(u32 rt, u32 ra, u32 rb) { @@ -887,9 +924,11 @@ private: } void WRCH(u32 ra, u32 rt) { + c.mov(cpu_qword(PC), (u32)CPU.PC); WRAPPER_BEGIN(ra, rt, yy, zz); CPU.WriteChannel(ra, CPU.GPR[rt]); WRAPPER_END(ra, rt, 0, 0); + // TODO /*GpVar v(c, kVarTypeUInt32); c.mov(v, cpu_dword(GPR[rt]._u32[3])); @@ -928,11 +967,10 @@ private: c.mov(cpu_qword(PC), (u32)CPU.PC); do_finalize = true; - GpVar pos_next(c, kVarTypeUInt32); - c.mov(pos_next, (u32)CPU.PC + 4); + c.mov(*addr, (u32)CPU.PC + 4); c.mov(*pos_var, cpu_dword(GPR[ra]._u32[3])); c.cmp(cpu_dword(GPR[rt]._u32[3]), 0); - c.cmovne(*pos_var, pos_next); + c.cmovne(*pos_var, *addr); c.shr(*pos_var, 2); LOG_OPCODE(); } @@ -941,11 +979,10 @@ private: c.mov(cpu_qword(PC), (u32)CPU.PC); do_finalize = true; - GpVar pos_next(c, kVarTypeUInt32); - c.mov(pos_next, (u32)CPU.PC + 4); + c.mov(*addr, (u32)CPU.PC + 4); c.mov(*pos_var, cpu_dword(GPR[ra]._u32[3])); c.cmp(cpu_dword(GPR[rt]._u32[3]), 0); - c.cmove(*pos_var, pos_next); + c.cmove(*pos_var, *addr); c.shr(*pos_var, 2); LOG_OPCODE(); } @@ -954,11 +991,10 @@ private: c.mov(cpu_qword(PC), (u32)CPU.PC); do_finalize = true; - GpVar pos_next(c, kVarTypeUInt32); - c.mov(pos_next, (u32)CPU.PC + 4); + c.mov(*addr, (u32)CPU.PC + 4); 
c.mov(*pos_var, cpu_dword(GPR[ra]._u32[3])); c.cmp(cpu_word(GPR[rt]._u16[6]), 0); - c.cmovne(*pos_var, pos_next); + c.cmovne(*pos_var, *addr); c.shr(*pos_var, 2); LOG_OPCODE(); } @@ -967,11 +1003,10 @@ private: c.mov(cpu_qword(PC), (u32)CPU.PC); do_finalize = true; - GpVar pos_next(c, kVarTypeUInt32); - c.mov(pos_next, (u32)CPU.PC + 4); + c.mov(*addr, (u32)CPU.PC + 4); c.mov(*pos_var, cpu_dword(GPR[ra]._u32[3])); c.cmp(cpu_word(GPR[rt]._u16[6]), 0); - c.cmove(*pos_var, pos_next); + c.cmove(*pos_var, *addr); c.shr(*pos_var, 2); LOG_OPCODE(); } @@ -994,25 +1029,22 @@ private: CPU.WriteLS128(lsa, CPU.GPR[rt]._u128); WRAPPER_END(rt, ra, rb, 0);*/ - GpVar lsa(c, kVarTypeUInt32); - GpVar v0(c, kVarTypeUInt64); - GpVar v1(c, kVarTypeUInt64); - c.mov(lsa, cpu_dword(GPR[ra]._u32[3])); + c.mov(*addr, cpu_dword(GPR[ra]._u32[3])); if (ra == rb) { - c.add(lsa, lsa); + c.add(*addr, *addr); } else { - c.add(lsa, cpu_dword(GPR[rb]._u32[3])); + c.add(*addr, cpu_dword(GPR[rb]._u32[3])); } - c.and_(lsa, 0x3fff0); - c.mov(v0, cpu_qword(GPR[rt]._u64[0])); - c.mov(v1, cpu_qword(GPR[rt]._u64[1])); - c.bswap(v0); - c.bswap(v1); - c.mov(qword_ptr(*ls_var, lsa, 0, 0), v1); - c.mov(qword_ptr(*ls_var, lsa, 0, 8), v0); + c.and_(*addr, 0x3fff0); + c.mov(*qw0, cpu_qword(GPR[rt]._u64[0])); + c.mov(*qw1, cpu_qword(GPR[rt]._u64[1])); + c.bswap(*qw0); + c.bswap(*qw1); + c.mov(qword_ptr(*ls_var, *addr, 0, 0), *qw1); + c.mov(qword_ptr(*ls_var, *addr, 0, 8), *qw0); LOG_OPCODE(); } void BI(u32 ra) @@ -1158,25 +1190,22 @@ private: CPU.GPR[rt]._u128 = CPU.ReadLS128(lsa); WRAPPER_END(rt, ra, rb, 0);*/ - GpVar lsa(c, kVarTypeUInt32); - GpVar v0(c, kVarTypeUInt64); - GpVar v1(c, kVarTypeUInt64); - c.mov(lsa, cpu_dword(GPR[ra]._u32[3])); + c.mov(*addr, cpu_dword(GPR[ra]._u32[3])); if (ra == rb) { - c.add(lsa, lsa); + c.add(*addr, *addr); } else { - c.add(lsa, cpu_dword(GPR[rb]._u32[3])); + c.add(*addr, cpu_dword(GPR[rb]._u32[3])); } - c.and_(lsa, 0x3fff0); - c.mov(v0, qword_ptr(*ls_var, lsa, 0, 0)); - 
c.mov(v1, qword_ptr(*ls_var, lsa, 0, 8)); - c.bswap(v0); - c.bswap(v1); - c.mov(cpu_qword(GPR[rt]._u64[0]), v1); - c.mov(cpu_qword(GPR[rt]._u64[1]), v0); + c.and_(*addr, 0x3fff0); + c.mov(*qw0, qword_ptr(*ls_var, *addr, 0, 0)); + c.mov(*qw1, qword_ptr(*ls_var, *addr, 0, 8)); + c.bswap(*qw0); + c.bswap(*qw1); + c.mov(cpu_qword(GPR[rt]._u64[0]), *qw1); + c.mov(cpu_qword(GPR[rt]._u64[1]), *qw0); LOG_OPCODE(); } void ROTQBYBI(u32 rt, u32 ra, u32 rb) @@ -1669,9 +1698,9 @@ private: else { // compare if-greater-then - c.movdqa(v0, cpu_xmm(GPR[rb])); - // (not implemented) - c.movdqa(cpu_xmm(GPR[rt]), v0); + // c.movdqa(v0, cpu_xmm(GPR[rb])); + // TODO + // c.movdqa(cpu_xmm(GPR[rt]), v0); } LOG_OPCODE();*/ } @@ -1798,30 +1827,35 @@ private: } void FM(u32 rt, u32 ra, u32 rb) { - WRAPPER_BEGIN(rt, ra, rb, zz); + /*WRAPPER_BEGIN(rt, ra, rb, zz); CPU.GPR[rt]._f[0] = CPU.GPR[ra]._f[0] * CPU.GPR[rb]._f[0]; CPU.GPR[rt]._f[1] = CPU.GPR[ra]._f[1] * CPU.GPR[rb]._f[1]; CPU.GPR[rt]._f[2] = CPU.GPR[ra]._f[2] * CPU.GPR[rb]._f[2]; CPU.GPR[rt]._f[3] = CPU.GPR[ra]._f[3] * CPU.GPR[rb]._f[3]; - WRAPPER_END(rt, ra, rb, 0); + WRAPPER_END(rt, ra, rb, 0);*/ // !!! 
- /*const XmmLink& va = XmmGet(ra); if (ra == rb) { + const XmmLink& va = XmmGet(ra); c.mulps(va.get(), va.get()); - } - else if (const XmmLink* vb = XmmRead(rb)) - { - c.mulps(va.get(), vb->get()); + XmmFinalize(va, rt); } else { - c.mulps(va.get(), cpu_xmm(GPR[rb])); + const XmmLink& va = XmmGet(ra); + if (const XmmLink* vb = XmmRead(rb)) + { + c.mulps(va.get(), vb->get()); + } + else + { + c.mulps(va.get(), cpu_xmm(GPR[rb])); + } + XmmFinalize(va, rt); } - XmmFinalize(va, rt); - LOG_OPCODE();*/ + LOG_OPCODE(); } void CLGTH(u32 rt, u32 ra, u32 rb) { @@ -2334,11 +2368,10 @@ private: c.mov(cpu_qword(PC), (u32)CPU.PC); do_finalize = true; - GpVar pos_next(c, kVarTypeUInt32); - c.mov(pos_next, (CPU.PC >> 2) + 1); + c.mov(*addr, (CPU.PC >> 2) + 1); c.mov(*pos_var, branchTarget(CPU.PC, i16) >> 2); c.cmp(cpu_dword(GPR[rt]._u32[3]), 0); - c.cmovne(*pos_var, pos_next); + c.cmovne(*pos_var, *addr); LOG_OPCODE(); } void STQA(u32 rt, s32 i16) @@ -2356,14 +2389,12 @@ private: WRAPPER_END(rt, i16, 0, 0);*/ u32 lsa = (i16 << 2) & 0x3fff0; - GpVar v0(c, kVarTypeUInt64); - GpVar v1(c, kVarTypeUInt64); - c.mov(v0, cpu_qword(GPR[rt]._u64[0])); - c.mov(v1, cpu_qword(GPR[rt]._u64[1])); - c.bswap(v0); - c.bswap(v1); - c.mov(qword_ptr(*ls_var, lsa), v1); - c.mov(qword_ptr(*ls_var, lsa + 8), v0); + c.mov(*qw0, cpu_qword(GPR[rt]._u64[0])); + c.mov(*qw1, cpu_qword(GPR[rt]._u64[1])); + c.bswap(*qw0); + c.bswap(*qw1); + c.mov(qword_ptr(*ls_var, lsa), *qw1); + c.mov(qword_ptr(*ls_var, lsa + 8), *qw0); LOG_OPCODE(); } void BRNZ(u32 rt, s32 i16) @@ -2371,11 +2402,10 @@ private: c.mov(cpu_qword(PC), (u32)CPU.PC); do_finalize = true; - GpVar pos_next(c, kVarTypeUInt32); - c.mov(pos_next, (CPU.PC >> 2) + 1); + c.mov(*addr, (CPU.PC >> 2) + 1); c.mov(*pos_var, branchTarget(CPU.PC, i16) >> 2); c.cmp(cpu_dword(GPR[rt]._u32[3]), 0); - c.cmove(*pos_var, pos_next); + c.cmove(*pos_var, *addr); LOG_OPCODE(); } void BRHZ(u32 rt, s32 i16) @@ -2383,11 +2413,10 @@ private: c.mov(cpu_qword(PC), (u32)CPU.PC); 
do_finalize = true; - GpVar pos_next(c, kVarTypeUInt32); - c.mov(pos_next, (CPU.PC >> 2) + 1); + c.mov(*addr, (CPU.PC >> 2) + 1); c.mov(*pos_var, branchTarget(CPU.PC, i16) >> 2); c.cmp(cpu_word(GPR[rt]._u16[6]), 0); - c.cmovnz(*pos_var, pos_next); + c.cmovnz(*pos_var, *addr); LOG_OPCODE(); } void BRHNZ(u32 rt, s32 i16) @@ -2395,11 +2424,10 @@ private: c.mov(cpu_qword(PC), (u32)CPU.PC); do_finalize = true; - GpVar pos_next(c, kVarTypeUInt32); - c.mov(pos_next, (CPU.PC >> 2) + 1); + c.mov(*addr, (CPU.PC >> 2) + 1); c.mov(*pos_var, branchTarget(CPU.PC, i16) >> 2); c.cmp(cpu_word(GPR[rt]._u16[6]), 0); - c.cmovz(*pos_var, pos_next); + c.cmovz(*pos_var, *addr); LOG_OPCODE(); } void STQR(u32 rt, s32 i16) @@ -2416,14 +2444,12 @@ private: WRAPPER_END(rt, i16, CPU.PC, 0);*/ u32 lsa = branchTarget(CPU.PC, i16) & 0x3fff0; - GpVar v0(c, kVarTypeUInt64); - GpVar v1(c, kVarTypeUInt64); - c.mov(v0, cpu_qword(GPR[rt]._u64[0])); - c.mov(v1, cpu_qword(GPR[rt]._u64[1])); - c.bswap(v0); - c.bswap(v1); - c.mov(qword_ptr(*ls_var, lsa), v1); - c.mov(qword_ptr(*ls_var, lsa + 8), v0); + c.mov(*qw0, cpu_qword(GPR[rt]._u64[0])); + c.mov(*qw1, cpu_qword(GPR[rt]._u64[1])); + c.bswap(*qw0); + c.bswap(*qw1); + c.mov(qword_ptr(*ls_var, lsa), *qw1); + c.mov(qword_ptr(*ls_var, lsa + 8), *qw0); LOG_OPCODE(); } void BRA(s32 i16) @@ -2449,14 +2475,12 @@ private: WRAPPER_END(rt, i16, 0, 0);*/ u32 lsa = (i16 << 2) & 0x3fff0; - GpVar v0(c, kVarTypeUInt64); - GpVar v1(c, kVarTypeUInt64); - c.mov(v0, qword_ptr(*ls_var, lsa)); - c.mov(v1, qword_ptr(*ls_var, lsa + 8)); - c.bswap(v0); - c.bswap(v1); - c.mov(cpu_qword(GPR[rt]._u64[0]), v1); - c.mov(cpu_qword(GPR[rt]._u64[1]), v0); + c.mov(*qw0, qword_ptr(*ls_var, lsa)); + c.mov(*qw1, qword_ptr(*ls_var, lsa + 8)); + c.bswap(*qw0); + c.bswap(*qw1); + c.mov(cpu_qword(GPR[rt]._u64[0]), *qw1); + c.mov(cpu_qword(GPR[rt]._u64[1]), *qw0); LOG_OPCODE(); } void BRASL(u32 rt, s32 i16) @@ -2464,10 +2488,10 @@ private: c.mov(cpu_qword(PC), (u32)CPU.PC); do_finalize = true; 
- GpVar v0(c, kVarTypeUInt64); - c.xor_(v0, v0); - c.mov(cpu_qword(GPR[rt]._u64[1]), v0); - c.mov(cpu_qword(GPR[rt]._u64[0]), v0); + c.xor_(*addr, *addr); // zero + c.mov(cpu_dword(GPR[rt]._u32[0]), *addr); + c.mov(cpu_dword(GPR[rt]._u32[1]), *addr); + c.mov(cpu_dword(GPR[rt]._u32[2]), *addr); c.mov(cpu_dword(GPR[rt]._u32[3]), (u32)CPU.PC + 4); c.mov(*pos_var, branchTarget(0, i16) >> 2); LOG_OPCODE(); @@ -2518,10 +2542,10 @@ private: c.mov(cpu_qword(PC), (u32)CPU.PC); do_finalize = true; - GpVar v0(c, kVarTypeUInt64); - c.xor_(v0, v0); - c.mov(cpu_qword(GPR[rt]._u64[1]), v0); - c.mov(cpu_qword(GPR[rt]._u64[0]), v0); + c.xor_(*addr, *addr); // zero + c.mov(cpu_dword(GPR[rt]._u32[0]), *addr); + c.mov(cpu_dword(GPR[rt]._u32[1]), *addr); + c.mov(cpu_dword(GPR[rt]._u32[2]), *addr); c.mov(cpu_dword(GPR[rt]._u32[3]), (u32)CPU.PC + 4); c.mov(*pos_var, branchTarget(CPU.PC, i16) >> 2); LOG_OPCODE(); @@ -2540,14 +2564,12 @@ private: WRAPPER_END(rt, i16, CPU.PC, 0);*/ u32 lsa = branchTarget(CPU.PC, i16) & 0x3fff0; - GpVar v0(c, kVarTypeUInt64); - GpVar v1(c, kVarTypeUInt64); - c.mov(v0, qword_ptr(*ls_var, lsa)); - c.mov(v1, qword_ptr(*ls_var, lsa + 8)); - c.bswap(v0); - c.bswap(v1); - c.mov(cpu_qword(GPR[rt]._u64[0]), v1); - c.mov(cpu_qword(GPR[rt]._u64[1]), v0); + c.mov(*qw0, qword_ptr(*ls_var, lsa)); + c.mov(*qw1, qword_ptr(*ls_var, lsa + 8)); + c.bswap(*qw0); + c.bswap(*qw1); + c.mov(cpu_qword(GPR[rt]._u64[0]), *qw1); + c.mov(cpu_qword(GPR[rt]._u64[1]), *qw0); LOG_OPCODE(); } void IL(u32 rt, s32 i16) @@ -2687,14 +2709,7 @@ private: // zero const XmmLink& v0 = XmmAlloc(); c.xorps(v0.get(), v0.get()); - if (const XmmLink* va = XmmRead(ra)) - { - c.psubd(v0.get(), va->get()); - } - else - { - c.psubd(v0.get(), cpu_xmm(GPR[ra])); - } + c.psubd(v0.get(), cpu_xmm(GPR[ra])); XmmFinalize(v0, rt); } else if (i10 == -1) @@ -2702,28 +2717,14 @@ private: // fill with 1 const XmmLink& v1 = XmmAlloc(); c.pcmpeqd(v1.get(), v1.get()); - if (const XmmLink* va = XmmRead(ra)) - { - 
c.psubd(v1.get(), va->get()); - } - else - { - c.psubd(v1.get(), cpu_xmm(GPR[ra])); - } + c.psubd(v1.get(), cpu_xmm(GPR[ra])); XmmFinalize(v1, rt); } else { const XmmLink& vr = XmmAlloc(); c.movdqa(vr.get(), imm_xmm(s19_to_s32[i10 & 0x7ffff])); - if (const XmmLink* va = XmmRead(ra)) - { - c.psubd(vr.get(), va->get()); - } - else - { - c.psubd(vr.get(), cpu_xmm(GPR[ra])); - } + c.psubd(vr.get(), cpu_xmm(GPR[ra])); XmmFinalize(vr, rt); } LOG_OPCODE(); @@ -2830,18 +2831,15 @@ private: CPU.WriteLS128(lsa, CPU.GPR[rt]._u128); WRAPPER_END(rt, i10, ra, 0);*/ - GpVar lsa(c, kVarTypeUInt32); - GpVar v0(c, kVarTypeUInt64); - GpVar v1(c, kVarTypeUInt64); - c.mov(lsa, cpu_dword(GPR[ra]._u32[3])); - if (i10) c.add(lsa, i10); - c.and_(lsa, 0x3fff0); - c.mov(v0, cpu_qword(GPR[rt]._u64[0])); - c.mov(v1, cpu_qword(GPR[rt]._u64[1])); - c.bswap(v0); - c.bswap(v1); - c.mov(qword_ptr(*ls_var, lsa, 0, 0), v1); - c.mov(qword_ptr(*ls_var, lsa, 0, 8), v0); + c.mov(*addr, cpu_dword(GPR[ra]._u32[3])); + if (i10) c.add(*addr, i10); + c.and_(*addr, 0x3fff0); + c.mov(*qw0, cpu_qword(GPR[rt]._u64[0])); + c.mov(*qw1, cpu_qword(GPR[rt]._u64[1])); + c.bswap(*qw0); + c.bswap(*qw1); + c.mov(qword_ptr(*ls_var, *addr, 0, 0), *qw1); + c.mov(qword_ptr(*ls_var, *addr, 0, 8), *qw0); LOG_OPCODE(); } void LQD(u32 rt, s32 i10, u32 ra) //i10 is shifted left by 4 while decoding @@ -2858,18 +2856,15 @@ private: CPU.GPR[rt]._u128 = CPU.ReadLS128(lsa); WRAPPER_END(rt, i10, ra, 0);*/ - GpVar lsa(c, kVarTypeUInt32); - GpVar v0(c, kVarTypeUInt64); - GpVar v1(c, kVarTypeUInt64); - c.mov(lsa, cpu_dword(GPR[ra]._u32[3])); - if (i10) c.add(lsa, i10); - c.and_(lsa, 0x3fff0); - c.mov(v0, qword_ptr(*ls_var, lsa, 0, 0)); - c.mov(v1, qword_ptr(*ls_var, lsa, 0, 8)); - c.bswap(v0); - c.bswap(v1); - c.mov(cpu_qword(GPR[rt]._u64[0]), v1); - c.mov(cpu_qword(GPR[rt]._u64[1]), v0); + c.mov(*addr, cpu_dword(GPR[ra]._u32[3])); + if (i10) c.add(*addr, i10); + c.and_(*addr, 0x3fff0); + c.mov(*qw0, qword_ptr(*ls_var, *addr, 0, 0)); + 
c.mov(*qw1, qword_ptr(*ls_var, *addr, 0, 8)); + c.bswap(*qw0); + c.bswap(*qw1); + c.mov(cpu_qword(GPR[rt]._u64[0]), *qw1); + c.mov(cpu_qword(GPR[rt]._u64[1]), *qw0); LOG_OPCODE(); } void XORI(u32 rt, u32 ra, s32 i10) @@ -2946,17 +2941,9 @@ private: } else { - if (i10 == 0) - { - // load zero - c.pxor(v0, v0); - } - else - { - c.movdqa(v0, imm_xmm(s19_to_s32[i10 & 0x7ffff])); - } - // (not implemented) - c.movdqa(cpu_xmm(GPR[rt]), v0); + //c.movdqa(v0, imm_xmm(s19_to_s32[i10 & 0x7ffff])); + // TODO + //c.movdqa(cpu_xmm(GPR[rt]), v0); }*/ } void CLGTHI(u32 rt, u32 ra, s32 i10) @@ -3067,25 +3054,25 @@ private: //0 - 3 void SELB(u32 rt, u32 ra, u32 rb, u32 rc) { - WRAPPER_BEGIN(rt, ra, rb, rc); + /*WRAPPER_BEGIN(rt, ra, rb, rc); for (u64 i = 0; i < 2; ++i) { CPU.GPR[rt]._u64[i] = (CPU.GPR[rc]._u64[i] & CPU.GPR[rb]._u64[i]) | (~CPU.GPR[rc]._u64[i] & CPU.GPR[ra]._u64[i]); } - WRAPPER_END(rt, ra, rb, rc); + WRAPPER_END(rt, ra, rb, rc);*/ // !!! - /*const XmmLink& vb = XmmGet(rb); + const XmmLink& vb = XmmGet(rb); const XmmLink& vc = XmmGet(rc); c.andps(vb.get(), vc.get()); c.andnps(vc.get(), cpu_xmm(GPR[ra])); c.orps(vb.get(), vc.get()); XmmFinalize(vb, rt); XmmFinalize(vc); - LOG_OPCODE();*/ + LOG_OPCODE(); } void SHUFB(u32 rt, u32 ra, u32 rb, u32 rc) { @@ -3177,16 +3164,16 @@ private: } void FNMS(u32 rt, u32 ra, u32 rb, u32 rc) { - WRAPPER_BEGIN(rt, ra, rb, rc); + /*WRAPPER_BEGIN(rt, ra, rb, rc); CPU.GPR[rt]._f[0] = CPU.GPR[rc]._f[0] - CPU.GPR[ra]._f[0] * CPU.GPR[rb]._f[0]; CPU.GPR[rt]._f[1] = CPU.GPR[rc]._f[1] - CPU.GPR[ra]._f[1] * CPU.GPR[rb]._f[1]; CPU.GPR[rt]._f[2] = CPU.GPR[rc]._f[2] - CPU.GPR[ra]._f[2] * CPU.GPR[rb]._f[2]; CPU.GPR[rt]._f[3] = CPU.GPR[rc]._f[3] - CPU.GPR[ra]._f[3] * CPU.GPR[rb]._f[3]; - WRAPPER_END(rt, ra, rb, rc); + WRAPPER_END(rt, ra, rb, rc);*/ // !!! - /*const XmmLink& va = XmmGet(ra); + const XmmLink& va = XmmGet(ra); const XmmLink& vc = (ra == rc) ? 
XmmCopy(va) : XmmGet(rc); if (ra == rb) @@ -3197,71 +3184,6 @@ private: { c.mulps(va.get(), vc.get()); } - else if (const XmmLink* vb = XmmRead(rb)) - { - c.mulps(va.get(), vb->get()); - } - else - { - c.mulps(va.get(), cpu_xmm(GPR[rb])); - } - c.subps(vc.get(), va.get()); - XmmFinalize(vc, rt); - XmmFinalize(va); - LOG_OPCODE();*/ - } - void FMA(u32 rt, u32 ra, u32 rb, u32 rc) - { - WRAPPER_BEGIN(rt, ra, rb, rc); - CPU.GPR[rt]._f[0] = CPU.GPR[ra]._f[0] * CPU.GPR[rb]._f[0] + CPU.GPR[rc]._f[0]; - CPU.GPR[rt]._f[1] = CPU.GPR[ra]._f[1] * CPU.GPR[rb]._f[1] + CPU.GPR[rc]._f[1]; - CPU.GPR[rt]._f[2] = CPU.GPR[ra]._f[2] * CPU.GPR[rb]._f[2] + CPU.GPR[rc]._f[2]; - CPU.GPR[rt]._f[3] = CPU.GPR[ra]._f[3] * CPU.GPR[rb]._f[3] + CPU.GPR[rc]._f[3]; - WRAPPER_END(rt, ra, rb, rc); - - // !!! - - /*const XmmLink& va = XmmGet(ra); - if (ra == rc || rb == rc) - { - if (ra == rc) - { - const XmmLink& vc = XmmCopy(va); - if (ra == rb) // == rc - { - c.mulps(va.get(), va.get()); - } - else if (const XmmLink* vb = XmmRead(rb)) - { - c.mulps(va.get(), vb->get()); - } - else - { - c.mulps(va.get(), cpu_xmm(GPR[rb])); - } - c.addps(va.get(), vc.get()); - XmmFinalize(vc); - } - else // rb == rc - { - const XmmLink& vb = XmmGet(rb); - c.mulps(va.get(), vb.get()); - c.addps(va.get(), vb.get()); - XmmFinalize(vb); - } - } - else if (ra == rb) - { - c.mulps(va.get(), va.get()); - if (const XmmLink* vc = XmmRead(rc)) - { - c.addps(va.get(), vc->get()); - } - else - { - c.addps(va.get(), cpu_xmm(GPR[rc])); - } - } else { if (const XmmLink* vb = XmmRead(rb)) @@ -3272,8 +3194,35 @@ private: { c.mulps(va.get(), cpu_xmm(GPR[rb])); } + } + c.subps(vc.get(), va.get()); + XmmFinalize(vc, rt); + XmmFinalize(va); + LOG_OPCODE(); + } + void FMA(u32 rt, u32 ra, u32 rb, u32 rc) + { + /*WRAPPER_BEGIN(rt, ra, rb, rc); + CPU.GPR[rt]._f[0] = CPU.GPR[ra]._f[0] * CPU.GPR[rb]._f[0] + CPU.GPR[rc]._f[0]; + CPU.GPR[rt]._f[1] = CPU.GPR[ra]._f[1] * CPU.GPR[rb]._f[1] + CPU.GPR[rc]._f[1]; + CPU.GPR[rt]._f[2] = 
CPU.GPR[ra]._f[2] * CPU.GPR[rb]._f[2] + CPU.GPR[rc]._f[2]; + CPU.GPR[rt]._f[3] = CPU.GPR[ra]._f[3] * CPU.GPR[rb]._f[3] + CPU.GPR[rc]._f[3]; + WRAPPER_END(rt, ra, rb, rc);*/ - if (const XmmLink* vc = XmmRead(rc)) // !!! + // !!! + + if (ra != rb && rb != rc && rc != ra) + { + const XmmLink& va = XmmGet(ra); + if (const XmmLink* vb = XmmRead(rb)) + { + c.mulps(va.get(), vb->get()); + } + else + { + c.mulps(va.get(), cpu_xmm(GPR[rb])); + } + if (const XmmLink* vc = XmmRead(rc)) { c.addps(va.get(), vc->get()); } @@ -3281,18 +3230,157 @@ private: { c.addps(va.get(), cpu_xmm(GPR[rc])); } + XmmFinalize(va, rt); } - XmmFinalize(va, rt); - LOG_OPCODE();*/ + else if (ra == rb && rb == rc) + { + const XmmLink& va = XmmGet(ra); + const XmmLink& vc = XmmCopy(va); + c.mulps(va.get(), va.get()); + c.addps(va.get(), vc.get()); + XmmFinalize(va, rt); + XmmFinalize(vc); + } + else if (ra == rb) + { + const XmmLink& va = XmmGet(ra); + c.mulps(va.get(), va.get()); + if (const XmmLink* vc = XmmRead(rc)) + { + c.addps(va.get(), vc->get()); + } + else + { + c.addps(va.get(), cpu_xmm(GPR[rc])); + } + XmmFinalize(va, rt); + } + else if (rb == rc) + { + const XmmLink& va = XmmGet(ra); + if (const XmmLink* vc = XmmRead(rc)) + { + c.mulps(va.get(), vc->get()); + c.addps(va.get(), vc->get()); + } + else + { + c.mulps(va.get(), cpu_xmm(GPR[rb])); // == rc, not optimal + c.addps(va.get(), cpu_xmm(GPR[rc])); + } + XmmFinalize(va, rt); + } + else if (ra == rc) + { + const XmmLink& va = XmmGet(ra); + const XmmLink& vc = XmmCopy(va); + if (const XmmLink* vb = XmmRead(rb)) + { + c.mulps(va.get(), vb->get()); + } + else + { + c.mulps(va.get(), cpu_xmm(GPR[rb])); + } + c.addps(va.get(), vc.get()); + XmmFinalize(va, rt); + XmmFinalize(vc); + } + else + { + throw __FUNCTION__"(): invalid case"; + } + LOG_OPCODE(); } void FMS(u32 rt, u32 ra, u32 rb, u32 rc) { - WRAPPER_BEGIN(rt, ra, rb, rc); + /*WRAPPER_BEGIN(rt, ra, rb, rc); CPU.GPR[rt]._f[0] = CPU.GPR[ra]._f[0] * CPU.GPR[rb]._f[0] - CPU.GPR[rc]._f[0]; 
CPU.GPR[rt]._f[1] = CPU.GPR[ra]._f[1] * CPU.GPR[rb]._f[1] - CPU.GPR[rc]._f[1]; CPU.GPR[rt]._f[2] = CPU.GPR[ra]._f[2] * CPU.GPR[rb]._f[2] - CPU.GPR[rc]._f[2]; CPU.GPR[rt]._f[3] = CPU.GPR[ra]._f[3] * CPU.GPR[rb]._f[3] - CPU.GPR[rc]._f[3]; - WRAPPER_END(rt, ra, rb, rc); + WRAPPER_END(rt, ra, rb, rc);*/ + + if (ra != rb && rb != rc && rc != ra) + { + const XmmLink& va = XmmGet(ra); + if (const XmmLink* vb = XmmRead(rb)) + { + c.mulps(va.get(), vb->get()); + } + else + { + c.mulps(va.get(), cpu_xmm(GPR[rb])); + } + if (const XmmLink* vc = XmmRead(rc)) + { + c.subps(va.get(), vc->get()); + } + else + { + c.subps(va.get(), cpu_xmm(GPR[rc])); + } + XmmFinalize(va, rt); + } + else if (ra == rb && rb == rc) + { + const XmmLink& va = XmmGet(ra); + const XmmLink& vc = XmmCopy(va); + c.mulps(va.get(), va.get()); + c.subps(va.get(), vc.get()); + XmmFinalize(va, rt); + XmmFinalize(vc); + } + else if (ra == rb) + { + const XmmLink& va = XmmGet(ra); + c.mulps(va.get(), va.get()); + if (const XmmLink* vc = XmmRead(rc)) + { + c.subps(va.get(), vc->get()); + } + else + { + c.subps(va.get(), cpu_xmm(GPR[rc])); + } + XmmFinalize(va, rt); + } + else if (rb == rc) + { + const XmmLink& va = XmmGet(ra); + if (const XmmLink* vc = XmmRead(rc)) + { + c.mulps(va.get(), vc->get()); + c.subps(va.get(), vc->get()); + } + else + { + c.mulps(va.get(), cpu_xmm(GPR[rb])); // == rc, not optimal + c.subps(va.get(), cpu_xmm(GPR[rc])); + } + XmmFinalize(va, rt); + } + else if (ra == rc) + { + const XmmLink& va = XmmGet(ra); + const XmmLink& vc = XmmCopy(va); + if (const XmmLink* vb = XmmRead(rb)) + { + c.mulps(va.get(), vb->get()); + } + else + { + c.mulps(va.get(), cpu_xmm(GPR[rb])); + } + c.subps(va.get(), vc.get()); + XmmFinalize(va, rt); + XmmFinalize(vc); + } + else + { + throw __FUNCTION__"(): invalid case"; + } + LOG_OPCODE(); } void UNK(u32 code, u32 opcode, u32 gcode) diff --git a/rpcs3/Emu/Cell/SPURecompilerCore.cpp b/rpcs3/Emu/Cell/SPURecompilerCore.cpp index d59ed5b93b..a5519a9342 100644 --- 
a/rpcs3/Emu/Cell/SPURecompilerCore.cpp +++ b/rpcs3/Emu/Cell/SPURecompilerCore.cpp @@ -1,5 +1,6 @@ #include "stdafx.h" #include "SPUInstrTable.h" +#include "SPUDisAsm.h" #include "SPUInterpreter.h" #include "SPURecompiler.h" @@ -9,7 +10,7 @@ SPURecompilerCore::SPURecompilerCore(SPUThread& cpu) : m_enc(new SPURecompiler(cpu, *this)) , inter(new SPUInterpreter(cpu)) , CPU(cpu) -//, compiler(&runtime) +, first(true) { memset(entry, 0, sizeof(entry)); } @@ -30,8 +31,14 @@ void SPURecompilerCore::Compile(u16 pos) const u64 stamp0 = get_system_time(); u64 time0 = 0; + SPUDisAsm dis_asm(CPUDisAsm_InterpreterMode); + + StringLogger stringLogger; + stringLogger.setOption(kLoggerOptionBinaryForm, true); + Compiler compiler(&runtime); m_enc->compiler = &compiler; + compiler.setLogger(&stringLogger); compiler.addFunc(kFuncConvHost, FuncBuilder4()); const u16 start = pos; @@ -55,12 +62,18 @@ void SPURecompilerCore::Compile(u16 pos) GpVar pos_var(compiler, kVarTypeUInt32, "pos"); compiler.setArg(3, pos_var); - compiler.alloc(pos_var); m_enc->pos_var = &pos_var; + GpVar addr_var(compiler, kVarTypeUInt32, "addr"); + m_enc->addr = &addr_var; + GpVar qw0_var(compiler, kVarTypeUInt64, "qw0"); + m_enc->qw0 = &qw0_var; + GpVar qw1_var(compiler, kVarTypeUInt64, "qw1"); + m_enc->qw1 = &qw1_var; + for (u32 i = 0; i < 16; i++) { - m_enc->xmm_var[i].data = new XmmVar(compiler); + m_enc->xmm_var[i].data = new XmmVar(compiler, kVarTypeXmm, fmt::Format("reg_%d", i).c_str()); } compiler.xor_(pos_var, pos_var); @@ -72,10 +85,15 @@ void SPURecompilerCore::Compile(u16 pos) if (opcode) { const u64 stamp1 = get_system_time(); - (*SPU_instr::rrr_list)(m_enc, opcode); // compile single opcode + // disasm for logging: + dis_asm.dump_pc = CPU.dmac.ls_offset + pos * 4; + (*SPU_instr::rrr_list)(&dis_asm, opcode); + compiler.addComment(fmt::Format("SPU data: PC=0x%05x %s", pos * 4, dis_asm.last_opcode.c_str()).c_str()); + // compile single opcode: + (*SPU_instr::rrr_list)(m_enc, opcode); + // force 
finalization between every slice using absolute alignment /*if ((pos % 128 == 127) && !m_enc->do_finalize) { - // force finalization between every slice using absolute alignment compiler.mov(pos_var, pos + 1); m_enc->do_finalize = true; }*/ @@ -111,10 +129,17 @@ void SPURecompilerCore::Compile(u16 pos) compiler.ret(pos_var); compiler.endFunc(); entry[start].pointer = compiler.make(); + compiler.setLogger(nullptr); // crashes without it + wxFile log; + log.Open(wxString::Format("SPUjit_%d.log", GetCurrentSPUThread().GetId()), first ? wxFile::write : wxFile::write_append); + log.Write(wxString::Format("========== START POSITION 0x%x ==========\n\n", start * 4)); + log.Write(wxString(stringLogger.getString())); + log.Close(); //ConLog.Write("Compiled: %d (excess %d), addr=0x%x, time: [start=%d (decoding=%d), finalize=%d]", //entry[start].count, excess, start * 4, stamp1 - stamp0, time0, get_system_time() - stamp1); m_enc->compiler = nullptr; + first = false; } u8 SPURecompilerCore::DecodeMemory(const u64 address) diff --git a/rpcs3/Emu/Cell/SPUThread.h b/rpcs3/Emu/Cell/SPUThread.h index 71d7bbc05e..2b1e95d40e 100644 --- a/rpcs3/Emu/Cell/SPUThread.h +++ b/rpcs3/Emu/Cell/SPUThread.h @@ -331,7 +331,7 @@ public: return false; } res = m_value[0]; - for (u32 i = 1; i < max_count; i++) // FIFO + if (max_count > 1) for (u32 i = 1; i < max_count; i++) // FIFO { m_value[i-1] = m_value[i]; } diff --git a/rpcs3/Emu/SysCalls/lv2/SC_SPU_Thread.cpp b/rpcs3/Emu/SysCalls/lv2/SC_SPU_Thread.cpp index 535a01fcab..4966e53781 100644 --- a/rpcs3/Emu/SysCalls/lv2/SC_SPU_Thread.cpp +++ b/rpcs3/Emu/SysCalls/lv2/SC_SPU_Thread.cpp @@ -468,7 +468,7 @@ int sys_spu_thread_read_ls(u32 id, u32 address, mem64_t value, u32 type) //190 int sys_spu_thread_write_spu_mb(u32 id, u32 value) { - sc_spu.Warning("sys_spu_thread_write_spu_mb(id=%d, value=0x%x)", id, value); + sc_spu.Log("sys_spu_thread_write_spu_mb(id=%d, value=0x%x)", id, value); CPUThread* thr = Emu.GetCPU().GetThread(id); From 
a57841d00605fe8475004bd22851d9f7570d2774 Mon Sep 17 00:00:00 2001 From: Nekotekina Date: Wed, 16 Apr 2014 15:36:20 +0400 Subject: [PATCH 09/14] Some comments cleared --- rpcs3/Emu/Cell/SPURecompiler.h | 476 +++------------------------------ 1 file changed, 38 insertions(+), 438 deletions(-) diff --git a/rpcs3/Emu/Cell/SPURecompiler.h b/rpcs3/Emu/Cell/SPURecompiler.h index b6aafbfdc5..99a0d66c0c 100644 --- a/rpcs3/Emu/Cell/SPURecompiler.h +++ b/rpcs3/Emu/Cell/SPURecompiler.h @@ -425,13 +425,6 @@ private: } void SF(u32 rt, u32 ra, u32 rb) { - /*WRAPPER_BEGIN(rt, ra, rb, zz); - CPU.GPR[rt]._u32[0] = CPU.GPR[rb]._u32[0] - CPU.GPR[ra]._u32[0]; - CPU.GPR[rt]._u32[1] = CPU.GPR[rb]._u32[1] - CPU.GPR[ra]._u32[1]; - CPU.GPR[rt]._u32[2] = CPU.GPR[rb]._u32[2] - CPU.GPR[ra]._u32[2]; - CPU.GPR[rt]._u32[3] = CPU.GPR[rb]._u32[3] - CPU.GPR[ra]._u32[3]; - WRAPPER_END(rt, ra, rb, 0);*/ - if (ra == rb) { // zero @@ -457,13 +450,6 @@ private: } void OR(u32 rt, u32 ra, u32 rb) { - /*WRAPPER_BEGIN(rt, ra, rb, zz); - CPU.GPR[rt]._u32[0] = CPU.GPR[ra]._u32[0] | CPU.GPR[rb]._u32[0]; - CPU.GPR[rt]._u32[1] = CPU.GPR[ra]._u32[1] | CPU.GPR[rb]._u32[1]; - CPU.GPR[rt]._u32[2] = CPU.GPR[ra]._u32[2] | CPU.GPR[rb]._u32[2]; - CPU.GPR[rt]._u32[3] = CPU.GPR[ra]._u32[3] | CPU.GPR[rb]._u32[3]; - WRAPPER_END(rt, ra, rb, 0);*/ - if (ra == rb) { // mov @@ -529,11 +515,6 @@ private: } void SFH(u32 rt, u32 ra, u32 rb) { - /*WRAPPER_BEGIN(rt, ra, rb, zz); - for (int h = 0; h < 8; h++) - CPU.GPR[rt]._u16[h] = CPU.GPR[rb]._u16[h] - CPU.GPR[ra]._u16[h]; - WRAPPER_END(rt, ra, rb, 0);*/ - if (ra == rb) { // zero @@ -558,13 +539,6 @@ private: } void NOR(u32 rt, u32 ra, u32 rb) { - /*WRAPPER_BEGIN(rt, ra, rb, zz); - CPU.GPR[rt]._u32[0] = ~(CPU.GPR[ra]._u32[0] | CPU.GPR[rb]._u32[0]); - CPU.GPR[rt]._u32[1] = ~(CPU.GPR[ra]._u32[1] | CPU.GPR[rb]._u32[1]); - CPU.GPR[rt]._u32[2] = ~(CPU.GPR[ra]._u32[2] | CPU.GPR[rb]._u32[2]); - CPU.GPR[rt]._u32[3] = ~(CPU.GPR[ra]._u32[3] | CPU.GPR[rb]._u32[3]); - WRAPPER_END(rt, ra, rb, 
0);*/ - const XmmLink& va = XmmGet(ra); if (ra != rb) c.orps(va.get(), cpu_xmm(GPR[rb])); c.xorps(va.get(), imm_xmm(s19_to_s32[0x7ffff])); @@ -670,14 +644,6 @@ private: } void ROTMI(u32 rt, u32 ra, s32 i7) { - /*WRAPPER_BEGIN(rt, ra, i7, zz); - const int nRot = (0 - (s32)i7) & 0x3f; - CPU.GPR[rt]._u32[0] = nRot < 32 ? CPU.GPR[ra]._u32[0] >> nRot : 0; - CPU.GPR[rt]._u32[1] = nRot < 32 ? CPU.GPR[ra]._u32[1] >> nRot : 0; - CPU.GPR[rt]._u32[2] = nRot < 32 ? CPU.GPR[ra]._u32[2] >> nRot : 0; - CPU.GPR[rt]._u32[3] = nRot < 32 ? CPU.GPR[ra]._u32[3] >> nRot : 0; - WRAPPER_END(rt, ra, i7, 0);*/ - const int nRot = (0 - i7) & 0x3f; if (nRot > 31) { @@ -707,14 +673,6 @@ private: } void ROTMAI(u32 rt, u32 ra, s32 i7) { - /*WRAPPER_BEGIN(rt, ra, i7, zz); - const int nRot = (0 - (s32)i7) & 0x3f; - CPU.GPR[rt]._i32[0] = nRot < 32 ? CPU.GPR[ra]._i32[0] >> nRot : CPU.GPR[ra]._i32[0] >> 31; - CPU.GPR[rt]._i32[1] = nRot < 32 ? CPU.GPR[ra]._i32[1] >> nRot : CPU.GPR[ra]._i32[1] >> 31; - CPU.GPR[rt]._i32[2] = nRot < 32 ? CPU.GPR[ra]._i32[2] >> nRot : CPU.GPR[ra]._i32[2] >> 31; - CPU.GPR[rt]._i32[3] = nRot < 32 ? CPU.GPR[ra]._i32[3] >> nRot : CPU.GPR[ra]._i32[3] >> 31; - WRAPPER_END(rt, ra, i7, 0);*/ - const int nRot = (0 - i7) & 0x3f; if (nRot == 0) { @@ -737,12 +695,6 @@ private: } void SHLI(u32 rt, u32 ra, s32 i7) { - /*WRAPPER_BEGIN(rt, ra, i7, zz); - const u32 s = i7 & 0x3f; - for (u32 j = 0; j < 4; ++j) - CPU.GPR[rt]._u32[j] = (s >= 32) ? 0 : CPU.GPR[ra]._u32[j] << s; - WRAPPER_END(rt, ra, i7, 0);*/ - const int s = i7 & 0x3f; if (s > 31) { @@ -804,15 +756,6 @@ private: } void A(u32 rt, u32 ra, u32 rb) { - /*WRAPPER_BEGIN(rt, ra, rb, zz); - CPU.GPR[rt]._u32[0] = CPU.GPR[ra]._u32[0] + CPU.GPR[rb]._u32[0]; - CPU.GPR[rt]._u32[1] = CPU.GPR[ra]._u32[1] + CPU.GPR[rb]._u32[1]; - CPU.GPR[rt]._u32[2] = CPU.GPR[ra]._u32[2] + CPU.GPR[rb]._u32[2]; - CPU.GPR[rt]._u32[3] = CPU.GPR[ra]._u32[3] + CPU.GPR[rb]._u32[3]; - WRAPPER_END(rt, ra, rb, 0);*/ - - // !!! 
- if (ra == rb) { const XmmLink& vb = XmmGet(rb); @@ -836,20 +779,13 @@ private: } void AND(u32 rt, u32 ra, u32 rb) { - /*WRAPPER_BEGIN(rt, ra, rb, zz); - CPU.GPR[rt]._u32[0] = CPU.GPR[ra]._u32[0] & CPU.GPR[rb]._u32[0]; - CPU.GPR[rt]._u32[1] = CPU.GPR[ra]._u32[1] & CPU.GPR[rb]._u32[1]; - CPU.GPR[rt]._u32[2] = CPU.GPR[ra]._u32[2] & CPU.GPR[rb]._u32[2]; - CPU.GPR[rt]._u32[3] = CPU.GPR[ra]._u32[3] & CPU.GPR[rb]._u32[3]; - WRAPPER_END(rt, ra, rb, 0);*/ - if (ra == rb) { if (rt != ra) { // mov - const XmmLink& v0 = XmmGet(ra); - XmmFinalize(v0, rt); + const XmmLink& va = XmmGet(ra); + XmmFinalize(va, rt); } // else nop } @@ -897,11 +833,6 @@ private: } void AVGB(u32 rt, u32 ra, u32 rb) { - /*WRAPPER_BEGIN(rt, ra, rb, zz); - for (int b = 0; b < 16; b++) - CPU.GPR[rt]._u8[b] = (CPU.GPR[ra]._u8[b] + CPU.GPR[rb]._u8[b] + 1) >> 1; - WRAPPER_END(rt, ra, rb, 0);*/ - const XmmLink& vb = XmmGet(rb); if (const XmmLink* va = XmmRead(ra)) { @@ -1017,18 +948,6 @@ private: } void STQX(u32 rt, u32 ra, u32 rb) { - /*WRAPPER_BEGIN(rt, ra, rb, zz); - u32 lsa = (CPU.GPR[ra]._u32[3] + CPU.GPR[rb]._u32[3]) & 0x3fff0; - if(!CPU.IsGoodLSA(lsa)) - { - ConLog.Error("STQX: bad lsa (0x%x)", lsa); - Emu.Pause(); - return; - } - - CPU.WriteLS128(lsa, CPU.GPR[rt]._u128); - WRAPPER_END(rt, ra, rb, 0);*/ - c.mov(*addr, cpu_dword(GPR[ra]._u32[3])); if (ra == rb) { @@ -1143,11 +1062,6 @@ private: } void FREST(u32 rt, u32 ra) { - /*WRAPPER_BEGIN(rt, ra, yy, zz); - for (int i = 0; i < 4; i++) - CPU.GPR[rt]._f[i] = 1 / CPU.GPR[ra]._f[i]; - WRAPPER_END(rt, ra, 0, 0);*/ - const XmmLink& vr = XmmAlloc(); if (const XmmLink* va = XmmRead(ra)) { @@ -1162,11 +1076,6 @@ private: } void FRSQEST(u32 rt, u32 ra) { - /*WRAPPER_BEGIN(rt, ra, yy, zz); - for (int i = 0; i < 4; i++) - CPU.GPR[rt]._f[i] = 1 / sqrt(abs(CPU.GPR[ra]._f[i])); - WRAPPER_END(rt, ra, 0, 0);*/ - const XmmLink& vr = XmmGet(ra); c.andps(vr.get(), imm_xmm(max_int)); // abs c.rsqrtps(vr.get(), vr.get()); @@ -1175,21 +1084,6 @@ private: } void LQX(u32 
rt, u32 ra, u32 rb) { - /*WRAPPER_BEGIN(rt, ra, rb, zz); - u32 a = CPU.GPR[ra]._u32[3], b = CPU.GPR[rb]._u32[3]; - - u32 lsa = (a + b) & 0x3fff0; - - if(!CPU.IsGoodLSA(lsa)) - { - ConLog.Error("LQX: bad lsa (0x%x)", lsa); - Emu.Pause(); - return; - } - - CPU.GPR[rt]._u128 = CPU.ReadLS128(lsa); - WRAPPER_END(rt, ra, rb, 0);*/ - c.mov(*addr, cpu_dword(GPR[ra]._u32[3])); if (ra == rb) { @@ -1423,21 +1317,14 @@ private: } void ROTQBYI(u32 rt, u32 ra, s32 i7) { - /*WRAPPER_BEGIN(rt, ra, i7, zz); - const int s = i7 & 0xf; - const SPU_GPR_hdr temp = CPU.GPR[ra]; - for (int b = 0; b < 16; b++) - CPU.GPR[rt]._u8[b] = temp._u8[(b - s) & 0xf]; - WRAPPER_END(rt, ra, i7, 0);*/ - const int s = i7 & 0xf; if (s == 0) { // mov if (ra != rt) { - const XmmLink& v0 = XmmGet(ra); - XmmFinalize(v0, rt); + const XmmLink& va = XmmGet(ra); + XmmFinalize(va, rt); } // else nop } @@ -1455,22 +1342,14 @@ private: } void ROTQMBYI(u32 rt, u32 ra, s32 i7) { - /*WRAPPER_BEGIN(rt, ra, i7, zz); - const int s = (0 - (s32)i7) & 0x1f; - const SPU_GPR_hdr temp = CPU.GPR[ra]; - CPU.GPR[rt].Reset(); - for (int b = 0; b < 16 - s; b++) - CPU.GPR[rt]._u8[b] = temp._u8[b + s]; - WRAPPER_END(rt, ra, i7, 0);*/ - const int s = (0 - i7) & 0x1f; if (s == 0) { if (ra != rt) { // mov - const XmmLink& v0 = XmmGet(ra); - XmmFinalize(v0, rt); + const XmmLink& va = XmmGet(ra); + XmmFinalize(va, rt); } // else nop } @@ -1492,14 +1371,6 @@ private: } void SHLQBYI(u32 rt, u32 ra, s32 i7) { - /*WRAPPER_BEGIN(rt, ra, i7, zz); - const int s = i7 & 0x1f; - const SPU_GPR_hdr temp = CPU.GPR[ra]; - CPU.GPR[rt].Reset(); - for (int b = s; b < 16; b++) - CPU.GPR[rt]._u8[b] = temp._u8[b - s]; - WRAPPER_END(rt, ra, i7, 0);*/ - const int s = i7 & 0x1f; if (s == 0) { @@ -1533,11 +1404,6 @@ private: } void CGT(u32 rt, u32 ra, u32 rb) { - /*WRAPPER_BEGIN(rt, ra, rb, zz); - for (int w = 0; w < 4; w++) - CPU.GPR[rt]._u32[w] = CPU.GPR[ra]._i32[w] > CPU.GPR[rb]._i32[w] ? 
0xffffffff : 0; - WRAPPER_END(rt, ra, rb, 0);*/ - if (ra == rb) { // zero @@ -1562,11 +1428,6 @@ private: } void XOR(u32 rt, u32 ra, u32 rb) { - /*WRAPPER_BEGIN(rt, ra, rb, zz); - for (int w = 0; w < 4; w++) - CPU.GPR[rt]._u32[w] = CPU.GPR[ra]._u32[w] ^ CPU.GPR[rb]._u32[w]; - WRAPPER_END(rt, ra, rb, 0);*/ - if (ra == rb) { // zero @@ -1706,11 +1567,6 @@ private: } void ANDC(u32 rt, u32 ra, u32 rb) { - /*WRAPPER_BEGIN(rt, ra, rb, zz); - for (int w = 0; w < 4; w++) - CPU.GPR[rt]._u32[w] = CPU.GPR[ra]._u32[w] & (~CPU.GPR[rb]._u32[w]); - WRAPPER_END(rt, ra, rb, 0);*/ - if (ra == rb) { // zero @@ -1736,13 +1592,6 @@ private: } void FCGT(u32 rt, u32 ra, u32 rb) { - /*WRAPPER_BEGIN(rt, ra, rb, zz); - CPU.GPR[rt]._u32[0] = CPU.GPR[ra]._f[0] > CPU.GPR[rb]._f[0] ? 0xffffffff : 0; - CPU.GPR[rt]._u32[1] = CPU.GPR[ra]._f[1] > CPU.GPR[rb]._f[1] ? 0xffffffff : 0; - CPU.GPR[rt]._u32[2] = CPU.GPR[ra]._f[2] > CPU.GPR[rb]._f[2] ? 0xffffffff : 0; - CPU.GPR[rt]._u32[3] = CPU.GPR[ra]._f[3] > CPU.GPR[rb]._f[3] ? 
0xffffffff : 0; - WRAPPER_END(rt, ra, rb, 0);*/ - if (ra == rb) { // zero @@ -1775,34 +1624,27 @@ private: } void FA(u32 rt, u32 ra, u32 rb) { - /*WRAPPER_BEGIN(rt, ra, rb, zz); - CPU.GPR[rt]._f[0] = CPU.GPR[ra]._f[0] + CPU.GPR[rb]._f[0]; - CPU.GPR[rt]._f[1] = CPU.GPR[ra]._f[1] + CPU.GPR[rb]._f[1]; - CPU.GPR[rt]._f[2] = CPU.GPR[ra]._f[2] + CPU.GPR[rb]._f[2]; - CPU.GPR[rt]._f[3] = CPU.GPR[ra]._f[3] + CPU.GPR[rb]._f[3]; - WRAPPER_END(rt, ra, rb, 0);*/ - const XmmLink& va = XmmGet(ra); - if (const XmmLink* vb = XmmRead(rb)) + if (ra == rb) { - c.addps(va.get(), vb->get()); + c.addps(va.get(), va.get()); } else { - c.addps(va.get(), cpu_xmm(GPR[rb])); + if (const XmmLink* vb = XmmRead(rb)) + { + c.addps(va.get(), vb->get()); + } + else + { + c.addps(va.get(), cpu_xmm(GPR[rb])); + } } XmmFinalize(va, rt); LOG_OPCODE(); } void FS(u32 rt, u32 ra, u32 rb) { - /*WRAPPER_BEGIN(rt, ra, rb, zz); - CPU.GPR[rt]._f[0] = CPU.GPR[ra]._f[0] - CPU.GPR[rb]._f[0]; - CPU.GPR[rt]._f[1] = CPU.GPR[ra]._f[1] - CPU.GPR[rb]._f[1]; - CPU.GPR[rt]._f[2] = CPU.GPR[ra]._f[2] - CPU.GPR[rb]._f[2]; - CPU.GPR[rt]._f[3] = CPU.GPR[ra]._f[3] - CPU.GPR[rb]._f[3]; - WRAPPER_END(rt, ra, rb, 0);*/ - if (ra == rb) { // zero @@ -1827,15 +1669,6 @@ private: } void FM(u32 rt, u32 ra, u32 rb) { - /*WRAPPER_BEGIN(rt, ra, rb, zz); - CPU.GPR[rt]._f[0] = CPU.GPR[ra]._f[0] * CPU.GPR[rb]._f[0]; - CPU.GPR[rt]._f[1] = CPU.GPR[ra]._f[1] * CPU.GPR[rb]._f[1]; - CPU.GPR[rt]._f[2] = CPU.GPR[ra]._f[2] * CPU.GPR[rb]._f[2]; - CPU.GPR[rt]._f[3] = CPU.GPR[ra]._f[3] * CPU.GPR[rb]._f[3]; - WRAPPER_END(rt, ra, rb, 0);*/ - - // !!! 
- if (ra == rb) { const XmmLink& va = XmmGet(ra); @@ -1967,11 +1800,6 @@ private: } void ADDX(u32 rt, u32 ra, u32 rb) { - /*WRAPPER_BEGIN(rt, ra, rb, zz); - for (int w = 0; w < 4; w++) - CPU.GPR[rt]._u32[w] = CPU.GPR[ra]._u32[w] + CPU.GPR[rb]._u32[w] + (CPU.GPR[rt]._u32[w] & 1); - WRAPPER_END(rt, ra, rb, 0);*/ - const XmmLink& vt = XmmGet(rt); c.pand(vt.get(), imm_xmm(s19_to_s32[1])); c.paddd(vt.get(), cpu_xmm(GPR[ra])); @@ -1981,11 +1809,6 @@ private: } void SFX(u32 rt, u32 ra, u32 rb) { - /*WRAPPER_BEGIN(rt, ra, rb, zz); - for (int w = 0; w < 4; w++) - CPU.GPR[rt]._u32[w] = CPU.GPR[rb]._u32[w] - CPU.GPR[ra]._u32[w] - (1 - (CPU.GPR[rt]._u32[w] & 1)); - WRAPPER_END(rt, ra, rb, 0);*/ - const XmmLink& vt = XmmGet(rt); c.pandn(vt.get(), imm_xmm(s19_to_s32[1])); if (ra == rb) @@ -2194,11 +2017,6 @@ private: } void MPYU(u32 rt, u32 ra, u32 rb) { - /*WRAPPER_BEGIN(rt, ra, rb, zz); - for (int w = 0; w < 4; w++) - CPU.GPR[rt]._u32[w] = CPU.GPR[ra]._u16[w*2] * CPU.GPR[rb]._u16[w*2]; - WRAPPER_END(rt, ra, rb, 0);*/ - const XmmLink& va = XmmGet(ra); if (ra == rb) { @@ -2226,10 +2044,6 @@ private: } void FI(u32 rt, u32 ra, u32 rb) { - /*WRAPPER_BEGIN(rt, ra, rb, zz); - CPU.GPR[rt] = CPU.GPR[rb]; - WRAPPER_END(rt, ra, rb, 0);*/ - const XmmLink& vb = XmmGet(rb); XmmFinalize(vb, rt); LOG_OPCODE(); @@ -2246,21 +2060,6 @@ private: //0 - 9 void CFLTS(u32 rt, u32 ra, s32 i8) { - /*WRAPPER_BEGIN(rt, ra, i8, zz); - const u32 scale = 173 - (i8 & 0xff); //unsigned immediate - for (int i = 0; i < 4; i++) - { - u32 exp = ((CPU.GPR[ra]._u32[i] >> 23) & 0xff) + scale; - - if (exp > 255) - exp = 255; - - CPU.GPR[rt]._u32[i] = (CPU.GPR[ra]._u32[i] & 0x807fffff) | (exp << 23); - - CPU.GPR[rt]._u32[i] = (u32)CPU.GPR[rt]._f[i]; //trunc - } - WRAPPER_END(rt, ra, i8, 0);*/ - const XmmLink& va = XmmGet(ra); if (i8 != 173) { @@ -2309,21 +2108,6 @@ private: } void CSFLT(u32 rt, u32 ra, s32 i8) { - /*WRAPPER_BEGIN(rt, ra, i8, zz); - const u32 scale = 155 - (i8 & 0xff); //unsigned immediate - for (int i 
= 0; i < 4; i++) - { - CPU.GPR[rt]._f[i] = (s32)CPU.GPR[ra]._i32[i]; - - u32 exp = ((CPU.GPR[rt]._u32[i] >> 23) & 0xff) - scale; - - if (exp > 255) //< 0 - exp = 0; - - CPU.GPR[rt]._u32[i] = (CPU.GPR[rt]._u32[i] & 0x807fffff) | (exp << 23); - } - WRAPPER_END(rt, ra, i8, 0);*/ - const XmmLink& va = XmmGet(ra); c.cvtdq2ps(va.get(), va.get()); // convert to floats if (i8 != 155) @@ -2376,19 +2160,7 @@ private: } void STQA(u32 rt, s32 i16) { - /*WRAPPER_BEGIN(rt, i16, yy, zz); - u32 lsa = (i16 << 2) & 0x3fff0; - if(!CPU.IsGoodLSA(lsa)) - { - ConLog.Error("STQA: bad lsa (0x%x)", lsa); - Emu.Pause(); - return; - } - - CPU.WriteLS128(lsa, CPU.GPR[rt]._u128); - WRAPPER_END(rt, i16, 0, 0);*/ - - u32 lsa = (i16 << 2) & 0x3fff0; + const u32 lsa = (i16 << 2) & 0x3fff0; c.mov(*qw0, cpu_qword(GPR[rt]._u64[0])); c.mov(*qw1, cpu_qword(GPR[rt]._u64[1])); c.bswap(*qw0); @@ -2432,18 +2204,7 @@ private: } void STQR(u32 rt, s32 i16) { - /*WRAPPER_BEGIN(rt, i16, PC, zz); - u32 lsa = branchTarget(PC, (s32)i16) & 0x3fff0; - if (!CPU.IsGoodLSA(lsa)) - { - ConLog.Error("STQR: bad lsa (0x%x)", lsa); - Emu.Pause(); - return; - } - CPU.WriteLS128(lsa, CPU.GPR[rt]._u128); - WRAPPER_END(rt, i16, CPU.PC, 0);*/ - - u32 lsa = branchTarget(CPU.PC, i16) & 0x3fff0; + const u32 lsa = branchTarget(CPU.PC, i16) & 0x3fff0; c.mov(*qw0, cpu_qword(GPR[rt]._u64[0])); c.mov(*qw1, cpu_qword(GPR[rt]._u64[1])); c.bswap(*qw0); @@ -2462,19 +2223,7 @@ private: } void LQA(u32 rt, s32 i16) { - /*WRAPPER_BEGIN(rt, i16, yy, zz); - u32 lsa = (i16 << 2) & 0x3fff0; - if(!CPU.IsGoodLSA(lsa)) - { - ConLog.Error("LQA: bad lsa (0x%x)", lsa); - Emu.Pause(); - return; - } - - CPU.GPR[rt]._u128 = CPU.ReadLS128(lsa); - WRAPPER_END(rt, i16, 0, 0);*/ - - u32 lsa = (i16 << 2) & 0x3fff0; + const u32 lsa = (i16 << 2) & 0x3fff0; c.mov(*qw0, qword_ptr(*ls_var, lsa)); c.mov(*qw1, qword_ptr(*ls_var, lsa + 8)); c.bswap(*qw0); @@ -2506,22 +2255,6 @@ private: } void FSMBI(u32 rt, s32 i16) { - /*WRAPPER_BEGIN(rt, i16, yy, zz); - const u32 s = 
i16; - - for (u32 j = 0; j < 16; ++j) - { - if ((s >> j) & 0x1) - { - CPU.GPR[rt]._u8[j] = 0xFF; - } - else - { - CPU.GPR[rt]._u8[j] = 0x00; - } - } - WRAPPER_END(rt, i16, 0, 0);*/ - if (i16 == 0) { // zero @@ -2552,18 +2285,7 @@ private: } void LQR(u32 rt, s32 i16) { - /*WRAPPER_BEGIN(rt, i16, PC, zz); - u32 lsa = branchTarget(PC, (s32)i16) & 0x3fff0; - if (!CPU.IsGoodLSA(lsa)) - { - ConLog.Error("LQR: bad lsa (0x%x)", lsa); - Emu.Pause(); - return; - } - CPU.GPR[rt]._u128 = CPU.ReadLS128(lsa); - WRAPPER_END(rt, i16, CPU.PC, 0);*/ - - u32 lsa = branchTarget(CPU.PC, i16) & 0x3fff0; + const u32 lsa = branchTarget(CPU.PC, i16) & 0x3fff0; c.mov(*qw0, qword_ptr(*ls_var, lsa)); c.mov(*qw1, qword_ptr(*ls_var, lsa + 8)); c.bswap(*qw0); @@ -2574,13 +2296,6 @@ private: } void IL(u32 rt, s32 i16) { - /*WRAPPER_BEGIN(rt, i16, yy, zz); - CPU.GPR[rt]._i32[0] = - CPU.GPR[rt]._i32[1] = - CPU.GPR[rt]._i32[2] = - CPU.GPR[rt]._i32[3] = (s32)i16; - WRAPPER_END(rt, i16, 0, 0);*/ - const XmmLink& vr = XmmAlloc(); if (i16 == 0) { @@ -2599,11 +2314,6 @@ private: } void ILHU(u32 rt, s32 i16) { - /*WRAPPER_BEGIN(rt, i16, yy, zz); - for (int w = 0; w < 4; w++) - CPU.GPR[rt]._i32[w] = (s32)i16 << 16; - WRAPPER_END(rt, i16, 0, 0);*/ - const XmmLink& vr = XmmAlloc(); if (i16 == 0) { @@ -2631,11 +2341,6 @@ private: } void IOHL(u32 rt, s32 i16) { - /*WRAPPER_BEGIN(rt, i16, yy, zz); - for (int w = 0; w < 4; w++) - CPU.GPR[rt]._i32[w] |= (i16 & 0xFFFF); - WRAPPER_END(rt, i16, 0, 0);*/ - if (i16 == 0) { // nop @@ -2653,11 +2358,6 @@ private: //0 - 7 void ORI(u32 rt, u32 ra, s32 i10) { - /*WRAPPER_BEGIN(rt, ra, i10, zz); - for (u32 i = 0; i < 4; ++i) - CPU.GPR[rt]._i32[i] = CPU.GPR[ra]._i32[i] | (s32)i10; - WRAPPER_END(rt, ra, i10, 0);*/ - if (i10 == -1) { // fill with 1 @@ -2670,8 +2370,8 @@ private: if (rt != ra) { // mov - const XmmLink& v0 = XmmGet(ra); - XmmFinalize(v0, rt); + const XmmLink& va = XmmGet(ra); + XmmFinalize(va, rt); } // else nop } @@ -2699,11 +2399,6 @@ private: } void SFI(u32 
rt, u32 ra, s32 i10) { - /*WRAPPER_BEGIN(rt, ra, i10, zz); - for (int w = 0; w < 4; w++) - CPU.GPR[rt]._i32[w] = (s32)i10 - CPU.GPR[ra]._i32[w]; - WRAPPER_END(rt, ra, i10, 0);*/ - if (i10 == 0) { // zero @@ -2738,11 +2433,6 @@ private: } void ANDI(u32 rt, u32 ra, s32 i10) { - /*WRAPPER_BEGIN(rt, ra, i10, zz); - for (int w = 0; w < 4; w++) - CPU.GPR[rt]._i32[w] = CPU.GPR[ra]._i32[w] & (s32)i10; - WRAPPER_END(rt, ra, i10, 0);*/ - if (i10 == 0) { // zero @@ -2784,20 +2474,13 @@ private: } void AI(u32 rt, u32 ra, s32 i10) { - /*WRAPPER_BEGIN(rt, ra, i10, zz); - CPU.GPR[rt]._i32[0] = CPU.GPR[ra]._i32[0] + (s32)i10; - CPU.GPR[rt]._i32[1] = CPU.GPR[ra]._i32[1] + (s32)i10; - CPU.GPR[rt]._i32[2] = CPU.GPR[ra]._i32[2] + (s32)i10; - CPU.GPR[rt]._i32[3] = CPU.GPR[ra]._i32[3] + (s32)i10; - WRAPPER_END(rt, ra, i10, 0);*/ - if (i10 == 0) { if (rt != ra) { // mov - const XmmLink& v0 = XmmGet(ra); - XmmFinalize(v0, rt); + const XmmLink& va = XmmGet(ra); + XmmFinalize(va, rt); } // else nop } @@ -2817,20 +2500,8 @@ private: CPU.GPR[rt]._i16[h] = CPU.GPR[ra]._i16[h] + (s32)i10; WRAPPER_END(rt, ra, i10, 0); } - void STQD(u32 rt, s32 i10, u32 ra) //i10 is shifted left by 4 while decoding + void STQD(u32 rt, s32 i10, u32 ra) // i10 is shifted left by 4 while decoding { - /*WRAPPER_BEGIN(rt, i10, ra, zz); - const u32 lsa = (CPU.GPR[ra]._i32[3] + (s32)i10) & 0x3fff0; - if (!CPU.IsGoodLSA(lsa)) - { - ConLog.Error("STQD: bad lsa (0x%x)", lsa); - Emu.Pause(); - return; - } - //ConLog.Write("wrapper::STQD (lsa=0x%x): GPR[%d] (0x%llx%llx)", lsa, rt, CPU.GPR[rt]._u64[1], CPU.GPR[rt]._u64[0]); - CPU.WriteLS128(lsa, CPU.GPR[rt]._u128); - WRAPPER_END(rt, i10, ra, 0);*/ - c.mov(*addr, cpu_dword(GPR[ra]._u32[3])); if (i10) c.add(*addr, i10); c.and_(*addr, 0x3fff0); @@ -2842,20 +2513,8 @@ private: c.mov(qword_ptr(*ls_var, *addr, 0, 8), *qw0); LOG_OPCODE(); } - void LQD(u32 rt, s32 i10, u32 ra) //i10 is shifted left by 4 while decoding + void LQD(u32 rt, s32 i10, u32 ra) // i10 is shifted left by 4 
while decoding { - /*WRAPPER_BEGIN(rt, i10, ra, zz); - const u32 lsa = (CPU.GPR[ra]._i32[3] + (s32)i10) & 0x3fff0; - if (!CPU.IsGoodLSA(lsa)) - { - ConLog.Error("LQD: bad lsa (0x%x)", lsa); - Emu.Pause(); - return; - } - - CPU.GPR[rt]._u128 = CPU.ReadLS128(lsa); - WRAPPER_END(rt, i10, ra, 0);*/ - c.mov(*addr, cpu_dword(GPR[ra]._u32[3])); if (i10) c.add(*addr, i10); c.and_(*addr, 0x3fff0); @@ -2890,11 +2549,6 @@ private: } void CGTI(u32 rt, u32 ra, s32 i10) { - /*WRAPPER_BEGIN(rt, ra, i10, zz); - for (int w = 0; w < 4; w++) - CPU.GPR[rt]._u32[w] = CPU.GPR[ra]._i32[w] > (s32)i10 ? 0xffffffff : 0; - WRAPPER_END(rt, ra, i10, 0);*/ - const XmmLink& va = XmmGet(ra); c.pcmpgtd(va.get(), imm_xmm(s19_to_s32[i10 & 0x7ffff])); XmmFinalize(va, rt); @@ -2986,11 +2640,6 @@ private: } void CEQI(u32 rt, u32 ra, s32 i10) { - /*WRAPPER_BEGIN(rt, ra, i10, zz); - for(u32 i = 0; i < 4; ++i) - CPU.GPR[rt]._u32[i] = (CPU.GPR[ra]._i32[i] == (s32)i10) ? 0xffffffff : 0x00000000; - WRAPPER_END(rt, ra, i10, 0);*/ - const XmmLink& va = XmmGet(ra); c.pcmpeqd(va.get(), imm_xmm(s19_to_s32[i10 & 0x7ffff])); XmmFinalize(va, rt); @@ -3031,13 +2680,6 @@ private: } void ILA(u32 rt, u32 i18) { - /*WRAPPER_BEGIN(rt, i18, yy, zz); - CPU.GPR[rt]._u32[0] = - CPU.GPR[rt]._u32[1] = - CPU.GPR[rt]._u32[2] = - CPU.GPR[rt]._u32[3] = i18 & 0x3FFFF; - WRAPPER_END(rt, i18, 0, 0);*/ - const XmmLink& vr = XmmAlloc(); if (i18 == 0) { @@ -3054,17 +2696,6 @@ private: //0 - 3 void SELB(u32 rt, u32 ra, u32 rb, u32 rc) { - /*WRAPPER_BEGIN(rt, ra, rb, rc); - for (u64 i = 0; i < 2; ++i) - { - CPU.GPR[rt]._u64[i] = - (CPU.GPR[rc]._u64[i] & CPU.GPR[rb]._u64[i]) | - (~CPU.GPR[rc]._u64[i] & CPU.GPR[ra]._u64[i]); - } - WRAPPER_END(rt, ra, rb, rc);*/ - - // !!! 
- const XmmLink& vb = XmmGet(rb); const XmmLink& vc = XmmGet(rc); c.andps(vb.get(), vc.get()); @@ -3076,13 +2707,7 @@ private: } void SHUFB(u32 rt, u32 ra, u32 rb, u32 rc) { - /*WRAPPER_BEGIN(ra, rb, rc, zz); - ConLog.Write("SHUFB: input ra=%d, value=0x%016llx%016llx", ra, CPU.GPR[ra]._u64[1], CPU.GPR[ra]._u64[0]); - ConLog.Write("SHUFB: input rb=%d, value=0x%016llx%016llx", rb, CPU.GPR[rb]._u64[1], CPU.GPR[rb]._u64[0]); - ConLog.Write("SHUFB: input rc=%d, value=0x%016llx%016llx", rc, CPU.GPR[rc]._u64[1], CPU.GPR[rc]._u64[0]); - WRAPPER_END(ra, rb, rc, 0);*/ - - WRAPPER_BEGIN(rc, rt, ra, rb); + /*WRAPPER_BEGIN(rc, rt, ra, rb); const SPU_GPR_hdr _a = CPU.GPR[ra]; const SPU_GPR_hdr _b = CPU.GPR[rb]; for (int i = 0; i < 16; i++) @@ -3108,9 +2733,9 @@ private: CPU.GPR[rt]._u8[i] = _a._u8[15 - (b & 0x0F)]; } } - WRAPPER_END(rc, rt, ra, rb); + WRAPPER_END(rc, rt, ra, rb);*/ - /*const XmmLink& v0 = XmmGet(rc); // v0 = mask + const XmmLink& v0 = XmmGet(rc); // v0 = mask const XmmLink& v1 = XmmAlloc(); const XmmLink& v2 = XmmCopy(v0); // v2 = mask const XmmLink& v3 = XmmAlloc(); @@ -3149,11 +2774,7 @@ private: XmmFinalize(v2); XmmFinalize(v1); XmmFinalize(v0); - LOG_OPCODE();*/ - - /*WRAPPER_BEGIN(rt, xx, yy, zz); - //ConLog.Write("SHUFB: output=%d, value=0x%016llx%016llx", rt, CPU.GPR[rt]._u64[1], CPU.GPR[rt]._u64[0]); - WRAPPER_END(rt, 0, 0, 0);*/ + LOG_OPCODE(); } void MPYA(u32 rt, u32 ra, u32 rb, u32 rc) { @@ -3164,15 +2785,6 @@ private: } void FNMS(u32 rt, u32 ra, u32 rb, u32 rc) { - /*WRAPPER_BEGIN(rt, ra, rb, rc); - CPU.GPR[rt]._f[0] = CPU.GPR[rc]._f[0] - CPU.GPR[ra]._f[0] * CPU.GPR[rb]._f[0]; - CPU.GPR[rt]._f[1] = CPU.GPR[rc]._f[1] - CPU.GPR[ra]._f[1] * CPU.GPR[rb]._f[1]; - CPU.GPR[rt]._f[2] = CPU.GPR[rc]._f[2] - CPU.GPR[ra]._f[2] * CPU.GPR[rb]._f[2]; - CPU.GPR[rt]._f[3] = CPU.GPR[rc]._f[3] - CPU.GPR[ra]._f[3] * CPU.GPR[rb]._f[3]; - WRAPPER_END(rt, ra, rb, rc);*/ - - // !!! - const XmmLink& va = XmmGet(ra); const XmmLink& vc = (ra == rc) ? 
XmmCopy(va) : XmmGet(rc); @@ -3202,15 +2814,6 @@ private: } void FMA(u32 rt, u32 ra, u32 rb, u32 rc) { - /*WRAPPER_BEGIN(rt, ra, rb, rc); - CPU.GPR[rt]._f[0] = CPU.GPR[ra]._f[0] * CPU.GPR[rb]._f[0] + CPU.GPR[rc]._f[0]; - CPU.GPR[rt]._f[1] = CPU.GPR[ra]._f[1] * CPU.GPR[rb]._f[1] + CPU.GPR[rc]._f[1]; - CPU.GPR[rt]._f[2] = CPU.GPR[ra]._f[2] * CPU.GPR[rb]._f[2] + CPU.GPR[rc]._f[2]; - CPU.GPR[rt]._f[3] = CPU.GPR[ra]._f[3] * CPU.GPR[rb]._f[3] + CPU.GPR[rc]._f[3]; - WRAPPER_END(rt, ra, rb, rc);*/ - - // !!! - if (ra != rb && rb != rc && rc != ra) { const XmmLink& va = XmmGet(ra); @@ -3265,8 +2868,10 @@ private: } else { - c.mulps(va.get(), cpu_xmm(GPR[rb])); // == rc, not optimal - c.addps(va.get(), cpu_xmm(GPR[rc])); + const XmmLink& vb = XmmGet(rb); + c.mulps(va.get(), vb.get()); + c.addps(va.get(), vb.get()); + XmmFinalize(vb); } XmmFinalize(va, rt); } @@ -3294,13 +2899,6 @@ private: } void FMS(u32 rt, u32 ra, u32 rb, u32 rc) { - /*WRAPPER_BEGIN(rt, ra, rb, rc); - CPU.GPR[rt]._f[0] = CPU.GPR[ra]._f[0] * CPU.GPR[rb]._f[0] - CPU.GPR[rc]._f[0]; - CPU.GPR[rt]._f[1] = CPU.GPR[ra]._f[1] * CPU.GPR[rb]._f[1] - CPU.GPR[rc]._f[1]; - CPU.GPR[rt]._f[2] = CPU.GPR[ra]._f[2] * CPU.GPR[rb]._f[2] - CPU.GPR[rc]._f[2]; - CPU.GPR[rt]._f[3] = CPU.GPR[ra]._f[3] * CPU.GPR[rb]._f[3] - CPU.GPR[rc]._f[3]; - WRAPPER_END(rt, ra, rb, rc);*/ - if (ra != rb && rb != rc && rc != ra) { const XmmLink& va = XmmGet(ra); @@ -3355,8 +2953,10 @@ private: } else { - c.mulps(va.get(), cpu_xmm(GPR[rb])); // == rc, not optimal - c.subps(va.get(), cpu_xmm(GPR[rc])); + const XmmLink& vb = XmmGet(rb); + c.mulps(va.get(), vb.get()); + c.subps(va.get(), vb.get()); + XmmFinalize(vb); } XmmFinalize(va, rt); } From 29d2ea7513fc18ddc700446114520a79af9fa05f Mon Sep 17 00:00:00 2001 From: Nekotekina Date: Thu, 17 Apr 2014 03:28:21 +0400 Subject: [PATCH 10/14] Some bugs fixed --- asmjit | 2 +- rpcs3/Emu/Cell/SPUInterpreter.h | 49 +++++++++++++++------------- rpcs3/Emu/Cell/SPURecompiler.h | 41 ++++++++++++++++------- 
rpcs3/Emu/Cell/SPURecompilerCore.cpp | 27 +++++++++++++-- 4 files changed, 80 insertions(+), 39 deletions(-) diff --git a/asmjit b/asmjit index 906f89bfc5..6c50029aa0 160000 --- a/asmjit +++ b/asmjit @@ -1 +1 @@ -Subproject commit 906f89bfc59138f0e4c7c43551f16f8c43887572 +Subproject commit 6c50029aa0aa23722b3c4c507113afa04191e5df diff --git a/rpcs3/Emu/Cell/SPUInterpreter.h b/rpcs3/Emu/Cell/SPUInterpreter.h index 66f259e591..269fcd2043 100644 --- a/rpcs3/Emu/Cell/SPUInterpreter.h +++ b/rpcs3/Emu/Cell/SPUInterpreter.h @@ -20,7 +20,9 @@ unsigned char reg_h[20]; sha1((const unsigned char*)CPU.GPR, sizeof(CPU.GPR), reg_h); \ ConLog.Write("Mem hash: 0x%llx, reg hash: 0x%llx", *(u64*)mem_h, *(u64*)reg_h); -#define LOG2_OPCODE(...) // ConLog.Write(__FUNCTION__ "(): " __VA_ARGS__) +#define LOG2_OPCODE(...) //MEM_AND_REG_HASH(); ConLog.Write(__FUNCTION__ "(): " __VA_ARGS__) + +#define LOG5_OPCODE(...) /// class SPUInterpreter : public SPUOpcodes { @@ -41,6 +43,7 @@ private: void STOP(u32 code) { CPU.DoStop(code); + LOG2_OPCODE(); } void LNOP() { @@ -278,12 +281,12 @@ private: u64 target = branchTarget(CPU.GPR[ra]._u32[3], 0); if (CPU.GPR[rt]._u32[3] == 0) { - LOG2_OPCODE("taken (0x%llx)", target); + LOG5_OPCODE("taken (0x%llx)", target); CPU.SetBranch(target); } else { - LOG2_OPCODE("not taken (0x%llx)", target); + LOG5_OPCODE("not taken (0x%llx)", target); } } void BINZ(u32 rt, u32 ra) @@ -291,12 +294,12 @@ private: u64 target = branchTarget(CPU.GPR[ra]._u32[3], 0); if (CPU.GPR[rt]._u32[3] != 0) { - LOG2_OPCODE("taken (0x%llx)", target); + LOG5_OPCODE("taken (0x%llx)", target); CPU.SetBranch(target); } else { - LOG2_OPCODE("not taken (0x%llx)", target); + LOG5_OPCODE("not taken (0x%llx)", target); } } void BIHZ(u32 rt, u32 ra) @@ -304,12 +307,12 @@ private: u64 target = branchTarget(CPU.GPR[ra]._u32[3], 0); if (CPU.GPR[rt]._u16[6] == 0) { - LOG2_OPCODE("taken (0x%llx)", target); + LOG5_OPCODE("taken (0x%llx)", target); CPU.SetBranch(target); } else { - LOG2_OPCODE("not 
taken (0x%llx)", target); + LOG5_OPCODE("not taken (0x%llx)", target); } } void BIHNZ(u32 rt, u32 ra) @@ -317,12 +320,12 @@ private: u64 target = branchTarget(CPU.GPR[ra]._u32[3], 0); if (CPU.GPR[rt]._u16[6] != 0) { - LOG2_OPCODE("taken (0x%llx)", target); + LOG5_OPCODE("taken (0x%llx)", target); CPU.SetBranch(target); } else { - LOG2_OPCODE("not taken (0x%llx)", target); + LOG5_OPCODE("not taken (0x%llx)", target); } } void STOPD(u32 rc, u32 ra, u32 rb) @@ -345,7 +348,7 @@ private: void BI(u32 ra) { u64 target = branchTarget(CPU.GPR[ra]._u32[3], 0); - LOG2_OPCODE("branch (0x%llx)", target); + LOG5_OPCODE("branch (0x%llx)", target); CPU.SetBranch(target); } void BISL(u32 rt, u32 ra) @@ -353,7 +356,7 @@ private: u64 target = branchTarget(CPU.GPR[ra]._u32[3], 0); CPU.GPR[rt].Reset(); CPU.GPR[rt]._u32[3] = CPU.PC + 4; - LOG2_OPCODE("branch (0x%llx)", target); + LOG5_OPCODE("branch (0x%llx)", target); CPU.SetBranch(target); } void IRET(u32 ra) @@ -1094,12 +1097,12 @@ private: u64 target = branchTarget(CPU.PC, i16); if (CPU.GPR[rt]._u32[3] == 0) { - LOG2_OPCODE("taken (0x%llx)", target); + LOG5_OPCODE("taken (0x%llx)", target); CPU.SetBranch(target); } else { - LOG2_OPCODE("not taken (0x%llx)", target); + LOG5_OPCODE("not taken (0x%llx)", target); } } void STQA(u32 rt, s32 i16) @@ -1119,12 +1122,12 @@ private: u64 target = branchTarget(CPU.PC, i16); if (CPU.GPR[rt]._u32[3] != 0) { - LOG2_OPCODE("taken (0x%llx)", target); + LOG5_OPCODE("taken (0x%llx)", target); CPU.SetBranch(target); } else { - LOG2_OPCODE("not taken (0x%llx)", target); + LOG5_OPCODE("not taken (0x%llx)", target); } } void BRHZ(u32 rt, s32 i16) @@ -1132,12 +1135,12 @@ private: u64 target = branchTarget(CPU.PC, i16); if (CPU.GPR[rt]._u16[6] == 0) { - LOG2_OPCODE("taken (0x%llx)", target); + LOG5_OPCODE("taken (0x%llx)", target); CPU.SetBranch(target); } else { - LOG2_OPCODE("not taken (0x%llx)", target); + LOG5_OPCODE("not taken (0x%llx)", target); } } void BRHNZ(u32 rt, s32 i16) @@ -1145,12 +1148,12 @@ 
private: u64 target = branchTarget(CPU.PC, i16); if (CPU.GPR[rt]._u16[6] != 0) { - LOG2_OPCODE("taken (0x%llx)", target); + LOG5_OPCODE("taken (0x%llx)", target); CPU.SetBranch(target); } else { - LOG2_OPCODE("not taken (0x%llx)", target); + LOG5_OPCODE("not taken (0x%llx)", target); } } void STQR(u32 rt, s32 i16) @@ -1168,7 +1171,7 @@ private: void BRA(s32 i16) { u64 target = branchTarget(0, i16); - LOG2_OPCODE("branch (0x%llx)", target); + LOG5_OPCODE("branch (0x%llx)", target); CPU.SetBranch(target); } void LQA(u32 rt, s32 i16) @@ -1188,13 +1191,13 @@ private: u64 target = branchTarget(0, i16); CPU.GPR[rt].Reset(); CPU.GPR[rt]._u32[3] = CPU.PC + 4; - LOG2_OPCODE("branch (0x%llx)", target); + LOG5_OPCODE("branch (0x%llx)", target); CPU.SetBranch(target); } void BR(s32 i16) { u64 target = branchTarget(CPU.PC, i16); - LOG2_OPCODE("branch (0x%llx)", target); + LOG5_OPCODE("branch (0x%llx)", target); CPU.SetBranch(target); } void FSMBI(u32 rt, s32 i16) @@ -1218,7 +1221,7 @@ private: u64 target = branchTarget(CPU.PC, i16); CPU.GPR[rt].Reset(); CPU.GPR[rt]._u32[3] = CPU.PC + 4; - LOG2_OPCODE("branch (0x%llx)", target); + LOG5_OPCODE("branch (0x%llx)", target); CPU.SetBranch(target); } void LQR(u32 rt, s32 i16) diff --git a/rpcs3/Emu/Cell/SPURecompiler.h b/rpcs3/Emu/Cell/SPURecompiler.h index 99a0d66c0c..82468f002b 100644 --- a/rpcs3/Emu/Cell/SPURecompiler.h +++ b/rpcs3/Emu/Cell/SPURecompiler.h @@ -212,7 +212,6 @@ public: xmm_var[i].taken = true; xmm_var[i].got = false; LOG4_OPCODE("free reg taken (i=%d)", i); - xmm_var[i].reg = -1; return xmm_var[i]; } } @@ -220,12 +219,12 @@ public: { if (!xmm_var[i].taken) { - //(saving cached data?) + // (saving cached data?) //c.movaps(cpu_xmm(GPR[xmm_var[i].reg]), *xmm_var[i].data); xmm_var[i].taken = true; xmm_var[i].got = false; LOG4_OPCODE("cached reg taken (i=%d): GPR[%d] lost", i, xmm_var[i].reg); - xmm_var[i].reg = -1; + xmm_var[i].reg = -1; // ??? 
return xmm_var[i]; } } @@ -245,7 +244,7 @@ public: if (xmm_var[i].taken) throw "XmmGet(): xmm_var is taken"; xmm_var[i].taken = true; xmm_var[i].got = false; - xmm_var[i].reg = -1; + //xmm_var[i].reg = -1; for (u32 j = i + 1; j < 16; j++) { if (xmm_var[j].reg == reg) throw "XmmGet(): xmm_var duplicate"; @@ -258,7 +257,7 @@ public: { res = &(XmmLink&)XmmAlloc(); c.movaps(*res->data, cpu_xmm(GPR[reg])); - res->reg = -1; + res->reg = -1; // ??? LOG4_OPCODE("* cached GPR[%d] not found", reg); } return *res; @@ -268,7 +267,7 @@ public: { XmmLink* res = &(XmmLink&)XmmAlloc(); c.movaps(*res->data, *from.data); - res->reg = -1; + res->reg = -1; // ??? LOG4_OPCODE("*"); return *res; } @@ -329,7 +328,7 @@ public: } LOG4_OPCODE("GPR[%d] finalized (i=%d), GPR[%d] replaced", reg, i, xmm_var[i].reg); // (to disable caching:) - reg = -1; + //reg = -1; xmm_var[i].reg = reg; xmm_var[i].taken = false; return; @@ -589,7 +588,9 @@ private: WRAPPER_END(rt, ra, rb, 0); // AVX2: masking with 0x3f + VPSLLVD may be better - /*for (u32 i = 0; i < 4; i++) + /*XmmInvalidate(rt); + + for (u32 i = 0; i < 4; i++) { GpVar v0(c, kVarTypeUInt32); c.mov(v0, cpu_dword(GPR[ra]._u32[i])); @@ -861,7 +862,9 @@ private: WRAPPER_END(ra, rt, 0, 0); // TODO - /*GpVar v(c, kVarTypeUInt32); + /*XmmInvalidate(rt); + + GpVar v(c, kVarTypeUInt32); c.mov(v, cpu_dword(GPR[rt]._u32[3])); switch (ra) { @@ -977,6 +980,8 @@ private: } void BISL(u32 rt, u32 ra) { + XmmInvalidate(rt); + c.mov(cpu_qword(PC), (u32)CPU.PC); do_finalize = true; @@ -1084,6 +1089,8 @@ private: } void LQX(u32 rt, u32 ra, u32 rb) { + XmmInvalidate(rt); + c.mov(*addr, cpu_dword(GPR[ra]._u32[3])); if (ra == rb) { @@ -2223,6 +2230,8 @@ private: } void LQA(u32 rt, s32 i16) { + XmmInvalidate(rt); + const u32 lsa = (i16 << 2) & 0x3fff0; c.mov(*qw0, qword_ptr(*ls_var, lsa)); c.mov(*qw1, qword_ptr(*ls_var, lsa + 8)); @@ -2234,6 +2243,8 @@ private: } void BRASL(u32 rt, s32 i16) { + XmmInvalidate(rt); + c.mov(cpu_qword(PC), (u32)CPU.PC); do_finalize = 
true; @@ -2272,6 +2283,8 @@ private: } void BRSL(u32 rt, s32 i16) { + XmmInvalidate(rt); + c.mov(cpu_qword(PC), (u32)CPU.PC); do_finalize = true; @@ -2285,6 +2298,8 @@ private: } void LQR(u32 rt, s32 i16) { + XmmInvalidate(rt); + const u32 lsa = branchTarget(CPU.PC, i16) & 0x3fff0; c.mov(*qw0, qword_ptr(*ls_var, lsa)); c.mov(*qw1, qword_ptr(*ls_var, lsa + 8)); @@ -2303,7 +2318,7 @@ private: } else if (i16 == -1) { - c.cmpps(vr.get(), vr.get(), 0); + c.pcmpeqd(vr.get(), vr.get()); } else { @@ -2321,7 +2336,7 @@ private: } else if (i16 == -1) { - c.cmpps(vr.get(), vr.get(), 0); + c.pcmpeqd(vr.get(), vr.get()); c.pslld(vr.get(), 16); } else @@ -2362,7 +2377,7 @@ private: { // fill with 1 const XmmLink& v1 = XmmAlloc(); - c.cmpps(v1.get(), v1.get(), 0); + c.pcmpeqd(v1.get(), v1.get()); XmmFinalize(v1, rt); } else if (i10 == 0) @@ -2515,6 +2530,8 @@ private: } void LQD(u32 rt, s32 i10, u32 ra) // i10 is shifted left by 4 while decoding { + XmmInvalidate(rt); + c.mov(*addr, cpu_dword(GPR[ra]._u32[3])); if (i10) c.add(*addr, i10); c.and_(*addr, 0x3fff0); diff --git a/rpcs3/Emu/Cell/SPURecompilerCore.cpp b/rpcs3/Emu/Cell/SPURecompilerCore.cpp index a5519a9342..35f784d857 100644 --- a/rpcs3/Emu/Cell/SPURecompilerCore.cpp +++ b/rpcs3/Emu/Cell/SPURecompilerCore.cpp @@ -144,6 +144,7 @@ void SPURecompilerCore::Compile(u16 pos) u8 SPURecompilerCore::DecodeMemory(const u64 address) { + assert(CPU.dmac.ls_offset == address - CPU.PC); const u64 m_offset = CPU.dmac.ls_offset; const u16 pos = (CPU.PC >> 2); @@ -179,10 +180,11 @@ u8 SPURecompilerCore::DecodeMemory(const u64 address) } } + bool did_compile = false; if (!entry[pos].pointer) { - // compile from current position to nearest dynamic or statically unresolved branch, zero data or something other Compile(pos); + did_compile = true; if (entry[pos].valid == 0) { ConLog.Error("SPURecompilerCore::Compile(ls_addr=0x%x): branch to 0x0 opcode", pos * sizeof(u32)); @@ -197,17 +199,36 @@ u8 SPURecompilerCore::DecodeMemory(const u64 
address) Emu.Pause(); return 0; } - // jump + typedef u32(*Func)(void* _cpu, void* _ls, const SPUImmTable* _imm, u32 _pos); Func func = asmjit_cast(entry[pos].pointer); void* cpu = (u8*)&CPU.GPR[0] - offsetof(SPUThread, GPR[0]); // ugly cpu base offset detection + //if (did_compile) + { + //LOG2_OPCODE("SPURecompilerCore::DecodeMemory(ls_addr=0x%x): NewPC = 0x%llx", address, (u64)res << 2); + //if (pos == 0x19c >> 2) + { + //Emu.Pause(); + //for (uint i = 0; i < 128; ++i) ConLog.Write("r%d = 0x%s", i, CPU.GPR[i].ToString().c_str()); + } + } + u16 res = pos; res = (u16)func(cpu, &Memory[m_offset], &g_spu_imm, res); - LOG2_OPCODE("SPURecompilerCore::DecodeMemory(ls_addr=0x%x): NewPC = 0x%llx", address, (u64)res << 2); + if (did_compile) + { + //LOG2_OPCODE("SPURecompilerCore::DecodeMemory(ls_addr=0x%x): NewPC = 0x%llx", address, (u64)res << 2); + //if (pos == 0x340 >> 2) + { + //Emu.Pause(); + //for (uint i = 0; i < 128; ++i) ConLog.Write("r%d = 0x%s", i, CPU.GPR[i].ToString().c_str()); + } + } + if ((res - 1) == (CPU.PC >> 2)) { return 4; From 5d091411a30ac56648c9e13b50898584d85b5b36 Mon Sep 17 00:00:00 2001 From: Nekotekina Date: Fri, 18 Apr 2014 00:25:02 +0400 Subject: [PATCH 11/14] More opcodes --- asmjit | 2 +- rpcs3/Emu/Cell/SPURecompiler.h | 978 ++++++++++++++++----------- rpcs3/Emu/Cell/SPURecompilerCore.cpp | 12 +- 3 files changed, 572 insertions(+), 420 deletions(-) diff --git a/asmjit b/asmjit index 6c50029aa0..316812daf0 160000 --- a/asmjit +++ b/asmjit @@ -1 +1 @@ -Subproject commit 6c50029aa0aa23722b3c4c507113afa04191e5df +Subproject commit 316812daf0d734f1e3dc3abb05785737513274f0 diff --git a/rpcs3/Emu/Cell/SPURecompiler.h b/rpcs3/Emu/Cell/SPURecompiler.h index 82468f002b..7a2932bd0f 100644 --- a/rpcs3/Emu/Cell/SPURecompiler.h +++ b/rpcs3/Emu/Cell/SPURecompiler.h @@ -14,76 +14,6 @@ using namespace asmjit::host; #define UNIMPLEMENTED() UNK(__FUNCTION__) -struct SPUImmTable -{ - __m128i s19_to_s32[1 << 19]; - __m128i fsmbi_mask[1 << 16]; - __m128i 
u8_to_u8[256]; - __m128 scale_to_float[256]; - __m128 scale_to_int[256]; - __m128i min_int; - __m128i max_int; - - SPUImmTable() - { - // signed numbers table - for (u32 i = 0; i < sizeof(s19_to_s32) / sizeof(__m128i); i++) - { - const u32 v = (i & 0x40000) ? (i | 0xfff80000) : i; - s19_to_s32[i].m128i_i32[0] = v; - s19_to_s32[i].m128i_i32[1] = v; - s19_to_s32[i].m128i_i32[2] = v; - s19_to_s32[i].m128i_i32[3] = v; - } - // FSMBI mask table - for (u32 i = 0; i < sizeof(fsmbi_mask) / sizeof(__m128i); i++) - { - for (u32 j = 0; j < 16; j++) - { - fsmbi_mask[i].m128i_i8[j] = ((i >> j) & 0x1) ? 0xff : 0; - } - } - // scale table for (u)int -> float conversion - for (s32 i = 0; i < sizeof(scale_to_float) / sizeof(__m128); i++) - { - const float v = pow(2, i - 155); - scale_to_float[i].m128_f32[0] = v; - scale_to_float[i].m128_f32[1] = v; - scale_to_float[i].m128_f32[2] = v; - scale_to_float[i].m128_f32[3] = v; - } - // scale table for float -> (u)int conversion - for (s32 i = 0; i < sizeof(scale_to_int) / sizeof(__m128); i++) - { - const float v = pow(2, 173 - i); - scale_to_int[i].m128_f32[0] = v; - scale_to_int[i].m128_f32[1] = v; - scale_to_int[i].m128_f32[2] = v; - scale_to_int[i].m128_f32[3] = v; - } - // sign bit - min_int.m128i_u32[0] = 0x80000000; - min_int.m128i_u32[1] = 0x80000000; - min_int.m128i_u32[2] = 0x80000000; - min_int.m128i_u32[3] = 0x80000000; - // - max_int.m128i_u32[0] = 0x7fffffff; - max_int.m128i_u32[1] = 0x7fffffff; - max_int.m128i_u32[2] = 0x7fffffff; - max_int.m128i_u32[3] = 0x7fffffff; - // table for byte consts - for (u32 i = 0; i < sizeof(u8_to_u8) / sizeof(__m128i); i++) - { - for (u32 j = 0; j < 16; j++) - { - u8_to_u8[i].m128i_u8[j] = i; - } - } - } -}; - -extern const SPUImmTable g_spu_imm; - class SPURecompiler; class SPURecompilerCore : public CPUDecoder @@ -106,6 +36,8 @@ public: SPURecEntry entry[0x10000]; + std::vector<__m128i> imm_table; + SPURecompilerCore(SPUThread& cpu); ~SPURecompilerCore(); @@ -125,8 +57,6 @@ public: #define 
cpu_word(x) word_ptr(*cpu_var, (sizeof((*(SPUThread*)nullptr).x) == 2) ? offsetof(SPUThread, x) : throw "sizeof("#x") != 2") #define cpu_byte(x) byte_ptr(*cpu_var, (sizeof((*(SPUThread*)nullptr).x) == 1) ? offsetof(SPUThread, x) : throw "sizeof("#x") != 1") -#define imm_xmm(x) oword_ptr(*imm_var, offsetof(SPUImmTable, x)) - #define LOG_OPCODE(...) //ConLog.Write("Compiled "__FUNCTION__"(): "__VA_ARGS__) #define LOG3_OPCODE(...) //ConLog.Write("Linked "__FUNCTION__"(): "__VA_ARGS__) @@ -180,20 +110,31 @@ public: s8 reg; bool taken; mutable bool got; + mutable u32 access; XmmLink() : data(nullptr) , reg(-1) , taken(false) + , got(false) + , access(0) { } const XmmVar& get() const { assert(data); + assert(taken); + if (!taken) throw "XmmLink::get(): wrong use"; got = true; return *data; } + + const XmmVar& read() const + { + assert(data); + return *data; + } } xmm_var[16]; SPURecompiler(SPUThread& cpu, SPURecompilerCore& rec) @@ -203,73 +144,54 @@ public: { } - const XmmLink& XmmAlloc() // get empty xmm register + const XmmLink& XmmAlloc(s8 pref = -1) // get empty xmm register { + if (pref >= 0) for (u32 i = 0; i < 16; i++) + { + if ((xmm_var[i].reg == pref) && !xmm_var[i].taken) + { + xmm_var[i].taken = true; + xmm_var[i].got = false; + xmm_var[i].access = 0; + LOG4_OPCODE("pref(%d) reg taken (i=%d)", pref, i); + return xmm_var[i]; + } + } for (u32 i = 0; i < 16; i++) { if ((xmm_var[i].reg == -1) && !xmm_var[i].taken) { xmm_var[i].taken = true; xmm_var[i].got = false; + xmm_var[i].access = 0; LOG4_OPCODE("free reg taken (i=%d)", i); return xmm_var[i]; } } + int last = -1, max = -1; for (u32 i = 0; i < 16; i++) { if (!xmm_var[i].taken) { - // (saving cached data?) - //c.movaps(cpu_xmm(GPR[xmm_var[i].reg]), *xmm_var[i].data); - xmm_var[i].taken = true; - xmm_var[i].got = false; - LOG4_OPCODE("cached reg taken (i=%d): GPR[%d] lost", i, xmm_var[i].reg); - xmm_var[i].reg = -1; // ??? 
- return xmm_var[i]; - } - } - assert(false); - return *(XmmLink*)nullptr; - } - - const XmmLink& XmmGet(s8 reg) // get xmm register with specific SPU reg - { - assert(reg >= 0); - XmmLink* res = nullptr; - for (u32 i = 0; i < 16; i++) - { - if (xmm_var[i].reg == reg) - { - res = &xmm_var[i]; - if (xmm_var[i].taken) throw "XmmGet(): xmm_var is taken"; - xmm_var[i].taken = true; - xmm_var[i].got = false; - //xmm_var[i].reg = -1; - for (u32 j = i + 1; j < 16; j++) + if ((int)xmm_var[i].access > max) { - if (xmm_var[j].reg == reg) throw "XmmGet(): xmm_var duplicate"; + last = i; + max = xmm_var[i].access; } - LOG4_OPCODE("cached GPR[%d] used (i=%d)", reg, i); - break; } } - if (!res) + if (last >= 0) { - res = &(XmmLink&)XmmAlloc(); - c.movaps(*res->data, cpu_xmm(GPR[reg])); - res->reg = -1; // ??? - LOG4_OPCODE("* cached GPR[%d] not found", reg); + // (saving cached data?) + //c.movaps(cpu_xmm(GPR[xmm_var[last].reg]), *xmm_var[last].data); + xmm_var[last].taken = true; + xmm_var[last].got = false; + LOG4_OPCODE("cached reg taken (i=%d): GPR[%d] lost", last, xmm_var[last].reg); + xmm_var[last].reg = -1; // ??? + xmm_var[last].access = 0; + return xmm_var[last]; } - return *res; - } - - const XmmLink& XmmCopy(const XmmLink& from) // XmmAlloc + mov - { - XmmLink* res = &(XmmLink&)XmmAlloc(); - c.movaps(*res->data, *from.data); - res->reg = -1; // ??? 
- LOG4_OPCODE("*"); - return *res; + throw "XmmAlloc() failed"; } const XmmLink* XmmRead(const s8 reg) const // get xmm register with specific SPU reg or nullptr @@ -279,8 +201,10 @@ public: { if (xmm_var[i].reg == reg) { - if (xmm_var[i].got && xmm_var[i].taken) throw "XmmRead(): wrong reuse"; + assert(!xmm_var[i].got); + if (xmm_var[i].got) throw "XmmRead(): wrong reuse"; LOG4_OPCODE("GPR[%d] has been read (i=%d)", reg, i); + xmm_var[i].access++; return &xmm_var[i]; } } @@ -288,6 +212,63 @@ public: return nullptr; } + const XmmLink& XmmGet(s8 reg, s8 target = -1) // get xmm register with specific SPU reg + { + assert(reg >= 0); + XmmLink* res = nullptr; + if (reg == target) + { + for (u32 i = 0; i < 16; i++) + { + if (xmm_var[i].reg == reg) + { + res = &xmm_var[i]; + if (xmm_var[i].taken) throw "XmmGet(): xmm_var is taken"; + xmm_var[i].taken = true; + xmm_var[i].got = false; + //xmm_var[i].reg = -1; + for (u32 j = i + 1; j < 16; j++) + { + if (xmm_var[j].reg == reg) throw "XmmGet(): xmm_var duplicate"; + } + LOG4_OPCODE("cached GPR[%d] used (i=%d)", reg, i); + break; + } + } + } + if (!res) + { + res = &(XmmLink&)XmmAlloc(target); + /*if (target != res->reg) + { + c.movaps(*res->data, cpu_xmm(GPR[reg])); + } + else*/ + { + if (const XmmLink* source = XmmRead(reg)) + { + c.movaps(*res->data, source->read()); + } + else + { + c.movaps(*res->data, cpu_xmm(GPR[reg])); + } + } + res->reg = -1; // ??? + LOG4_OPCODE("* cached GPR[%d] not found", reg); + } + return *res; + } + + const XmmLink& XmmCopy(const XmmLink& from, s8 pref = -1) // XmmAlloc + mov + { + XmmLink* res = &(XmmLink&)XmmAlloc(pref); + c.movaps(*res->data, *from.data); + res->reg = -1; // ??? 
+ LOG4_OPCODE("*"); + return *res; + } + void XmmInvalidate(const s8 reg) // invalidate cached register { assert(reg >= 0); @@ -298,6 +279,7 @@ public: if (xmm_var[i].taken) throw "XmmInvalidate(): xmm_var is taken"; LOG4_OPCODE("GPR[%d] invalidated (i=%d)", reg, i); xmm_var[i].reg = -1; + xmm_var[i].access = 0; } } } @@ -311,6 +293,7 @@ public: { LOG4_OPCODE("GPR[%d] invalidated (i=%d)", reg, i); xmm_var[i].reg = -1; + xmm_var[i].access = 0; } } for (u32 i = 0; i < 16; i++) @@ -331,6 +314,8 @@ public: //reg = -1; xmm_var[i].reg = reg; xmm_var[i].taken = false; + xmm_var[i].got = false; + xmm_var[i].access = 0; return; } } @@ -346,10 +331,30 @@ public: //c.movaps(cpu_xmm(GPR[xmm_var[i].reg]), *xmm_var[i].data); LOG4_OPCODE("GPR[%d] released (i=%d)", xmm_var[i].reg, i); xmm_var[i].reg = -1; + xmm_var[i].access = 0; } } } + Mem XmmConst(const __m128i data) + { + for (u32 i = 0; i < rec.imm_table.size(); i++) + { + if (rec.imm_table[i].m128i_u64[0] == data.m128i_u64[0] && rec.imm_table[i].m128i_u64[1] == data.m128i_u64[1]) + { + return oword_ptr(*imm_var, i * sizeof(__m128i)); + } + } + const int shift = rec.imm_table.size() * sizeof(__m128i); + rec.imm_table.push_back(data); + return oword_ptr(*imm_var, shift); + } + + Mem XmmConst(const __m128 data) + { + return XmmConst((__m128i&)data); + } + private: //0 - 10 void STOP(u32 code) @@ -427,17 +432,17 @@ private: if (ra == rb) { // zero - const XmmLink& v0 = XmmAlloc(); + const XmmLink& v0 = XmmAlloc(rt); c.xorps(v0.get(), v0.get()); XmmFinalize(v0, rt); } else { // sub from - const XmmLink& vb = XmmGet(rb); + const XmmLink& vb = XmmGet(rb, rt); if (const XmmLink* va = XmmRead(ra)) { - c.psubd(vb.get(), va->get()); + c.psubd(vb.get(), va->read()); } else { @@ -454,7 +459,7 @@ private: // mov if (ra != rt) { - const XmmLink& va = XmmGet(ra); + const XmmLink& va = XmmGet(ra, rt); XmmFinalize(va, rt); } // else nop @@ -462,10 +467,10 @@ private: else { // or - const XmmLink& vb = XmmGet(rb); + const XmmLink& vb = 
XmmGet(rb, rt); if (const XmmLink* va = XmmRead(ra)) { - c.orps(vb.get(), va->get()); + c.orps(vb.get(), va->read()); } else { @@ -477,28 +482,23 @@ private: } void BG(u32 rt, u32 ra, u32 rb) { - WRAPPER_BEGIN(rt, ra, rb, zz); - CPU.GPR[rt]._u32[0] = CPU.GPR[ra]._u32[0] > CPU.GPR[rb]._u32[0] ? 0 : 1; - CPU.GPR[rt]._u32[1] = CPU.GPR[ra]._u32[1] > CPU.GPR[rb]._u32[1] ? 0 : 1; - CPU.GPR[rt]._u32[2] = CPU.GPR[ra]._u32[2] > CPU.GPR[rb]._u32[2] ? 0 : 1; - CPU.GPR[rt]._u32[3] = CPU.GPR[ra]._u32[3] > CPU.GPR[rb]._u32[3] ? 0 : 1; - WRAPPER_END(rt, ra, rb, 0); - - /*XmmVar v0(c); if (ra == rb) { - c.movaps(v0, imm_xmm(s19_to_s32[1])); - c.movaps(cpu_xmm(GPR[rt]), v0); + const XmmLink& v1 = XmmAlloc(rt); + c.movaps(v1.get(), XmmConst(_mm_set1_epi32(1))); + XmmFinalize(v1, rt); } else { - XmmVar v1(c), v2(c); - c.movdqa(v0, cpu_xmm(GPR[ra])); - c.movdqa(v1, cpu_xmm(GPR[rb])); // compare if-greater-than - - c.movdqa(cpu_xmm(GPR[rt]), v0); - + const XmmLink& va = XmmGet(ra, rt); + const XmmLink& vb = XmmGet(rb); + c.psubd(va.get(), XmmConst(_mm_set1_epi32(0x80000000))); + c.psubd(vb.get(), XmmConst(_mm_set1_epi32(0x80000000))); + c.pcmpgtd(va.get(), vb.get()); + c.paddd(va.get(), XmmConst(_mm_set1_epi32(1))); + XmmFinalize(va, rt); + XmmFinalize(vb); // sign bits: // a b (b-a) -> (result of BG) // 0 0 0 -> 1 @@ -510,23 +510,23 @@ private: // 1 1 0 -> 0 // 1 1 1 -> 1 } - LOG_OPCODE();*/ + LOG_OPCODE(); } void SFH(u32 rt, u32 ra, u32 rb) { if (ra == rb) { // zero - const XmmLink& v0 = XmmAlloc(); + const XmmLink& v0 = XmmAlloc(rt); c.xorps(v0.get(), v0.get()); XmmFinalize(v0, rt); } else { - const XmmLink& vb = XmmGet(rb); + const XmmLink& vb = XmmGet(rb, rt); if (const XmmLink* va = XmmRead(ra)) { - c.psubw(vb.get(), va->get()); + c.psubw(vb.get(), va->read()); } else { @@ -538,18 +538,44 @@ private: } void NOR(u32 rt, u32 ra, u32 rb) { - const XmmLink& va = XmmGet(ra); - if (ra != rb) c.orps(va.get(), cpu_xmm(GPR[rb])); - c.xorps(va.get(), imm_xmm(s19_to_s32[0x7ffff])); + const 
XmmLink& va = XmmGet(ra, rt); + if (ra != rb) + { + if (const XmmLink* vb = XmmRead(rb)) + { + c.orps(va.get(), vb->read()); + } + else + { + c.orps(va.get(), cpu_xmm(GPR[rb])); + } + } + c.xorps(va.get(), XmmConst(_mm_set1_epi32(-1))); XmmFinalize(va, rt); LOG_OPCODE(); } void ABSDB(u32 rt, u32 ra, u32 rb) { - WRAPPER_BEGIN(rt, ra, rb, zz); - for (int b = 0; b < 16; b++) - CPU.GPR[rt]._u8[b] = CPU.GPR[rb]._u8[b] > CPU.GPR[ra]._u8[b] ? CPU.GPR[rb]._u8[b] - CPU.GPR[ra]._u8[b] : CPU.GPR[ra]._u8[b] - CPU.GPR[rb]._u8[b]; - WRAPPER_END(rt, ra, rb, 0); + if (ra == rb) + { + // zero + const XmmLink& v0 = XmmAlloc(rt); + c.xorps(v0.get(), v0.get()); + XmmFinalize(v0, rt); + } + else + { + const XmmLink& va = XmmGet(ra, rt); + const XmmLink& vb = XmmGet(rb); + const XmmLink& vm = XmmCopy(va); + c.pmaxub(va.get(), vb.get()); + c.pminub(vb.get(), vm.get()); + c.psubb(va.get(), vb.get()); + XmmFinalize(va, rt); + XmmFinalize(vb); + XmmFinalize(vm); + } + LOG_OPCODE(); } void ROT(u32 rt, u32 ra, u32 rb) { @@ -589,7 +615,6 @@ private: // AVX2: masking with 0x3f + VPSLLVD may be better /*XmmInvalidate(rt); - for (u32 i = 0; i < 4; i++) { GpVar v0(c, kVarTypeUInt32); @@ -635,30 +660,45 @@ private: } void ROTI(u32 rt, u32 ra, s32 i7) { - WRAPPER_BEGIN(rt, ra, i7, zz); - const int nRot = i7 & 0x1f; - CPU.GPR[rt]._u32[0] = (CPU.GPR[ra]._u32[0] << nRot) | (CPU.GPR[ra]._u32[0] >> (32 - nRot)); - CPU.GPR[rt]._u32[1] = (CPU.GPR[ra]._u32[1] << nRot) | (CPU.GPR[ra]._u32[1] >> (32 - nRot)); - CPU.GPR[rt]._u32[2] = (CPU.GPR[ra]._u32[2] << nRot) | (CPU.GPR[ra]._u32[2] >> (32 - nRot)); - CPU.GPR[rt]._u32[3] = (CPU.GPR[ra]._u32[3] << nRot) | (CPU.GPR[ra]._u32[3] >> (32 - nRot)); - WRAPPER_END(rt, ra, i7, 0); - } - void ROTMI(u32 rt, u32 ra, s32 i7) - { - const int nRot = (0 - i7) & 0x3f; - if (nRot > 31) - { - // zero - const XmmLink& v0 = XmmAlloc(); - c.xorps(v0.get(), v0.get()); - XmmFinalize(v0, rt); - } - else if (nRot == 0) + const int s = i7 & 0x1f; + if (s == 0) { // mov if (ra != rt) { 
- const XmmLink& va = XmmGet(ra); + const XmmLink& va = XmmGet(ra, rt); + XmmFinalize(va, rt); + } + // else nop + } + else + { + const XmmLink& va = XmmGet(ra, rt); + const XmmLink& v1 = XmmCopy(va); + c.pslld(va.get(), s); + c.psrld(v1.get(), 32 - s); + c.por(va.get(), v1.get()); + XmmFinalize(va, rt); + XmmFinalize(v1); + } + LOG_OPCODE(); + } + void ROTMI(u32 rt, u32 ra, s32 i7) + { + const int s = (0 - i7) & 0x3f; + if (s > 31) + { + // zero + const XmmLink& v0 = XmmAlloc(rt); + c.xorps(v0.get(), v0.get()); + XmmFinalize(v0, rt); + } + else if (s == 0) + { + // mov + if (ra != rt) + { + const XmmLink& va = XmmGet(ra, rt); XmmFinalize(va, rt); } // else nop @@ -666,21 +706,21 @@ private: else { // shift right logical - const XmmLink& va = XmmGet(ra); - c.psrld(va.get(), nRot); + const XmmLink& va = XmmGet(ra, rt); + c.psrld(va.get(), s); XmmFinalize(va, rt); } LOG_OPCODE(); } void ROTMAI(u32 rt, u32 ra, s32 i7) { - const int nRot = (0 - i7) & 0x3f; - if (nRot == 0) + const int s = (0 - i7) & 0x3f; + if (s == 0) { // mov if (ra != rt) { - const XmmLink& va = XmmGet(ra); + const XmmLink& va = XmmGet(ra, rt); XmmFinalize(va, rt); } // else nop @@ -688,8 +728,8 @@ private: else { // shift right arithmetical - const XmmLink& va = XmmGet(ra); - c.psrad(va.get(), nRot); + const XmmLink& va = XmmGet(ra, rt); + c.psrad(va.get(), s); XmmFinalize(va, rt); } LOG_OPCODE(); @@ -700,7 +740,7 @@ private: if (s > 31) { // zero - const XmmLink& v0 = XmmAlloc(); + const XmmLink& v0 = XmmAlloc(rt); c.xorps(v0.get(), v0.get()); XmmFinalize(v0, rt); } @@ -709,7 +749,7 @@ private: // mov if (ra != rt) { - const XmmLink& va = XmmGet(ra); + const XmmLink& va = XmmGet(ra, rt); XmmFinalize(va, rt); } // else nop @@ -717,7 +757,7 @@ private: else { // shift left - const XmmLink& va = XmmGet(ra); + const XmmLink& va = XmmGet(ra, rt); c.pslld(va.get(), s); XmmFinalize(va, rt); } @@ -725,50 +765,123 @@ private: } void ROTHI(u32 rt, u32 ra, s32 i7) { - WRAPPER_BEGIN(rt, ra, i7, zz); - const 
int nRot = i7 & 0xf; - for (int h = 0; h < 8; h++) - CPU.GPR[rt]._u16[h] = (CPU.GPR[ra]._u16[h] << nRot) | (CPU.GPR[ra]._u16[h] >> (16 - nRot)); - WRAPPER_END(rt, ra, i7, 0); + const int s = i7 & 0xf; + if (s == 0) + { + // mov + if (ra != rt) + { + const XmmLink& va = XmmGet(ra, rt); + XmmFinalize(va, rt); + } + // else nop + } + else + { + const XmmLink& va = XmmGet(ra, rt); + const XmmLink& v1 = XmmCopy(va); + c.psllw(va.get(), s); + c.psrlw(v1.get(), 16 - s); + c.por(va.get(), v1.get()); + XmmFinalize(va, rt); + XmmFinalize(v1); + } + LOG_OPCODE(); } void ROTHMI(u32 rt, u32 ra, s32 i7) { - WRAPPER_BEGIN(rt, ra, i7, zz); - const int nRot = (0 - (s32)i7) & 0x1f; - for (int h = 0; h < 8; h++) - CPU.GPR[rt]._u16[h] = nRot < 16 ? CPU.GPR[ra]._u16[h] >> nRot : 0; - WRAPPER_END(rt, ra, i7, 0); + const int s = (0 - i7) & 0x1f; + if (s > 15) + { + // zero + const XmmLink& v0 = XmmAlloc(rt); + c.xorps(v0.get(), v0.get()); + XmmFinalize(v0, rt); + } + else if (s == 0) + { + // mov + if (ra != rt) + { + const XmmLink& va = XmmGet(ra, rt); + XmmFinalize(va, rt); + } + // else nop + } + else + { + // shift right logical + const XmmLink& va = XmmGet(ra, rt); + c.psrlw(va.get(), s); + XmmFinalize(va, rt); + } + LOG_OPCODE(); } void ROTMAHI(u32 rt, u32 ra, s32 i7) { - WRAPPER_BEGIN(rt, ra, i7, zz); - const int nRot = (0 - (s32)i7) & 0x1f; - for (int h = 0; h < 8; h++) - CPU.GPR[rt]._i16[h] = nRot < 16 ? CPU.GPR[ra]._i16[h] >> nRot : CPU.GPR[ra]._i16[h] >> 15; - WRAPPER_END(rt, ra, i7, 0); + const int s = (0 - i7) & 0x1f; + if (s == 0) + { + // mov + if (ra != rt) + { + const XmmLink& va = XmmGet(ra, rt); + XmmFinalize(va, rt); + } + // else nop + } + else + { + // shift right arithmetical + const XmmLink& va = XmmGet(ra, rt); + c.psraw(va.get(), s); + XmmFinalize(va, rt); + } + LOG_OPCODE(); } void SHLHI(u32 rt, u32 ra, s32 i7) { - WRAPPER_BEGIN(rt, ra, i7, zz); - const int nRot = i7 & 0x1f; - for (int h = 0; h < 8; h++) - CPU.GPR[rt]._u16[h] = nRot > 15 ? 
0 : CPU.GPR[ra]._u16[h] << nRot; - WRAPPER_END(rt, ra, i7, 0); + const int s = i7 & 0x1f; + if (s > 15) + { + // zero + const XmmLink& v0 = XmmAlloc(rt); + c.xorps(v0.get(), v0.get()); + XmmFinalize(v0, rt); + } + else if (s == 0) + { + // mov + if (ra != rt) + { + const XmmLink& va = XmmGet(ra, rt); + XmmFinalize(va, rt); + } + // else nop + } + else + { + // shift left + const XmmLink& va = XmmGet(ra, rt); + c.psllw(va.get(), s); + XmmFinalize(va, rt); + } + LOG_OPCODE(); } void A(u32 rt, u32 ra, u32 rb) { if (ra == rb) { - const XmmLink& vb = XmmGet(rb); + const XmmLink& vb = XmmGet(rb, rt); c.paddd(vb.get(), vb.get()); XmmFinalize(vb, rt); } else { - const XmmLink& vb = XmmGet(rb); + const XmmLink& vb = XmmGet(rb, rt); if (const XmmLink* va = XmmRead(ra)) { - c.paddd(vb.get(), va->get()); + c.paddd(vb.get(), va->read()); } else { @@ -785,7 +898,7 @@ private: if (rt != ra) { // mov - const XmmLink& va = XmmGet(ra); + const XmmLink& va = XmmGet(ra, rt); XmmFinalize(va, rt); } // else nop @@ -793,10 +906,10 @@ private: else { // and - const XmmLink& vb = XmmGet(rb); + const XmmLink& vb = XmmGet(rb, rt); if (const XmmLink* va = XmmRead(ra)) { - c.andps(vb.get(), va->get()); + c.andps(vb.get(), va->read()); } else { @@ -808,36 +921,81 @@ private: } void CG(u32 rt, u32 ra, u32 rb) { - WRAPPER_BEGIN(rt, ra, rb, zz); - CPU.GPR[rt]._u32[0] = ((CPU.GPR[ra]._u32[0] + CPU.GPR[rb]._u32[0]) < CPU.GPR[ra]._u32[0]) ? 1 : 0; - CPU.GPR[rt]._u32[1] = ((CPU.GPR[ra]._u32[1] + CPU.GPR[rb]._u32[1]) < CPU.GPR[ra]._u32[1]) ? 1 : 0; - CPU.GPR[rt]._u32[2] = ((CPU.GPR[ra]._u32[2] + CPU.GPR[rb]._u32[2]) < CPU.GPR[ra]._u32[2]) ? 1 : 0; - CPU.GPR[rt]._u32[3] = ((CPU.GPR[ra]._u32[3] + CPU.GPR[rb]._u32[3]) < CPU.GPR[ra]._u32[3]) ? 
1 : 0; - WRAPPER_END(rt, ra, rb, 0); - // TODO + if (ra == rb) + { + const XmmLink& va = XmmGet(ra, rt); + c.psrld(va.get(), 31); + XmmFinalize(va, rt); + } + else + { + const XmmLink& va = XmmGet(ra, rt); + const XmmLink& vb = XmmGet(rb); + c.paddd(vb.get(), va.get()); + c.psubd(va.get(), XmmConst(_mm_set1_epi32(0x80000000))); + c.psubd(vb.get(), XmmConst(_mm_set1_epi32(0x80000000))); + c.pcmpgtd(va.get(), vb.get()); + c.psrld(va.get(), 31); + XmmFinalize(va, rt); + XmmFinalize(vb); + } + LOG_OPCODE(); } void AH(u32 rt, u32 ra, u32 rb) { - WRAPPER_BEGIN(rt, ra, rb, zz); - for (int h = 0; h < 8; h++) - CPU.GPR[rt]._u16[h] = CPU.GPR[ra]._u16[h] + CPU.GPR[rb]._u16[h]; - WRAPPER_END(rt, ra, rb, 0); + if (ra == rb) + { + const XmmLink& va = XmmGet(ra, rt); + c.paddw(va.get(), va.get()); + XmmFinalize(va, rt); + } + else + { + const XmmLink& va = XmmGet(ra, rt); + if (const XmmLink* vb = XmmRead(rb)) + { + c.paddw(va.get(), vb->read()); + } + else + { + c.paddw(va.get(), cpu_xmm(GPR[rb])); + } + XmmFinalize(va, rt); + } + LOG_OPCODE(); } void NAND(u32 rt, u32 ra, u32 rb) { - WRAPPER_BEGIN(rt, ra, rb, zz); - CPU.GPR[rt]._u32[0] = ~(CPU.GPR[ra]._u32[0] & CPU.GPR[rb]._u32[0]); - CPU.GPR[rt]._u32[1] = ~(CPU.GPR[ra]._u32[1] & CPU.GPR[rb]._u32[1]); - CPU.GPR[rt]._u32[2] = ~(CPU.GPR[ra]._u32[2] & CPU.GPR[rb]._u32[2]); - CPU.GPR[rt]._u32[3] = ~(CPU.GPR[ra]._u32[3] & CPU.GPR[rb]._u32[3]); - WRAPPER_END(rt, ra, rb, 0); + if (ra == rb) + { + // not + const XmmLink& va = XmmGet(ra, rt); + c.xorps(va.get(), XmmConst(_mm_set1_epi32(-1))); + XmmFinalize(va, rt); + } + else + { + // nand + const XmmLink& va = XmmGet(ra, rt); + if (const XmmLink* vb = XmmRead(rb)) + { + c.andps(va.get(), vb->read()); + } + else + { + c.andps(va.get(), cpu_xmm(GPR[rb])); + } + c.xorps(va.get(), XmmConst(_mm_set1_epi32(-1))); + XmmFinalize(va, rt); + } + LOG_OPCODE(); } void AVGB(u32 rt, u32 ra, u32 rb) { const XmmLink& vb = XmmGet(rb); if (const XmmLink* va = XmmRead(ra)) { - c.pavgb(vb.get(), 
va->get()); + c.pavgb(vb.get(), va->read()); } else { @@ -1009,37 +1167,37 @@ private: } void GB(u32 rt, u32 ra) { - WRAPPER_BEGIN(rt, ra, yy, zz); - CPU.GPR[rt]._u32[3] = (CPU.GPR[ra]._u32[0] & 1) | - ((CPU.GPR[ra]._u32[1] & 1) << 1) | - ((CPU.GPR[ra]._u32[2] & 1) << 2) | - ((CPU.GPR[ra]._u32[3] & 1) << 3); - CPU.GPR[rt]._u32[2] = 0; - CPU.GPR[rt]._u64[0] = 0; - WRAPPER_END(rt, ra, 0, 0); - // TODO + const XmmLink& va = XmmGet(ra, rt); + c.pand(va.get(), XmmConst(_mm_set1_epi32(1))); + c.pmullw(va.get(), XmmConst(_mm_set_epi32(8, 4, 2, 1))); + c.phaddd(va.get(), va.get()); + c.phaddd(va.get(), va.get()); + c.pand(va.get(), XmmConst(_mm_set_epi32(0xffffffff, 0, 0, 0))); + XmmFinalize(va, rt); + LOG_OPCODE(); } void GBH(u32 rt, u32 ra) { - WRAPPER_BEGIN(rt, ra, yy, zz); - u32 temp = 0; - for (int h = 0; h < 8; h++) - temp |= (CPU.GPR[ra]._u16[h] & 1) << h; - CPU.GPR[rt]._u32[3] = temp; - CPU.GPR[rt]._u32[2] = 0; - CPU.GPR[rt]._u64[0] = 0; - WRAPPER_END(rt, ra, 0, 0); + const XmmLink& va = XmmGet(ra, rt); + c.pand(va.get(), XmmConst(_mm_set1_epi16(1))); + c.pmullw(va.get(), XmmConst(_mm_set_epi16(128, 64, 32, 16, 8, 4, 2, 1))); + c.phaddw(va.get(), va.get()); + c.phaddw(va.get(), va.get()); + c.phaddw(va.get(), va.get()); + c.pand(va.get(), XmmConst(_mm_set_epi32(0xffff, 0, 0, 0))); + XmmFinalize(va, rt); + LOG_OPCODE(); } void GBB(u32 rt, u32 ra) { - WRAPPER_BEGIN(rt, ra, yy, zz); - u32 temp = 0; - for (int b = 0; b < 16; b++) - temp |= (CPU.GPR[ra]._u8[b] & 1) << b; - CPU.GPR[rt]._u32[3] = temp; - CPU.GPR[rt]._u32[2] = 0; - CPU.GPR[rt]._u64[0] = 0; - WRAPPER_END(rt, ra, 0, 0); + const XmmLink& va = XmmGet(ra, rt); + //c.pand(va.get(), XmmConst(_mm_set1_epi8(1))); // ??? 
+ c.pslld(va.get(), 7); + c.pmovmskb(*addr, va.get()); + c.pxor(va.get(), va.get()); + c.pinsrw(va.get(), *addr, 6); + XmmFinalize(va, rt); + LOG_OPCODE(); } void FSM(u32 rt, u32 ra) { @@ -1067,24 +1225,17 @@ private: } void FREST(u32 rt, u32 ra) { - const XmmLink& vr = XmmAlloc(); - if (const XmmLink* va = XmmRead(ra)) - { - c.rcpps(vr.get(), va->get()); - } - else - { - c.rcpps(vr.get(), cpu_xmm(GPR[ra])); - } - XmmFinalize(vr, rt); + const XmmLink& va = XmmGet(ra, rt); + c.rcpps(va.get(), va.get()); + XmmFinalize(va, rt); LOG_OPCODE(); } void FRSQEST(u32 rt, u32 ra) { - const XmmLink& vr = XmmGet(ra); - c.andps(vr.get(), imm_xmm(max_int)); // abs - c.rsqrtps(vr.get(), vr.get()); - XmmFinalize(vr, rt); + const XmmLink& va = XmmGet(ra, rt); + c.andps(va.get(), XmmConst(_mm_set1_epi32(0x7fffffff))); // abs + c.rsqrtps(va.get(), va.get()); + XmmFinalize(va, rt); LOG_OPCODE(); } void LQX(u32 rt, u32 ra, u32 rb) @@ -1270,14 +1421,25 @@ private: } void CWD(u32 rt, u32 ra, s32 i7) { - WRAPPER_BEGIN(rt, ra, i7, zz); + /*WRAPPER_BEGIN(rt, ra, i7, zz); const int t = (CPU.GPR[ra]._u32[3] + (s32)i7) & 0xC; CPU.GPR[rt]._u64[0] = (u64)0x18191A1B1C1D1E1F; CPU.GPR[rt]._u64[1] = (u64)0x1011121314151617; CPU.GPR[rt]._u32[3 - (t >> 2)] = 0x00010203; - WRAPPER_END(rt, ra, i7, 0); - // TODO + WRAPPER_END(rt, ra, i7, 0);*/ + + c.mov(*addr, cpu_dword(GPR[ra]._u32[3])); + c.add(*addr, i7); + c.and_(*addr, 3 << 2); + c.neg(*addr); + c.add(*addr, 3 << 2); + const XmmLink& vr = XmmAlloc(rt); + c.movaps(vr.get(), XmmConst(_mm_set_epi32(0x10111213, 0x14151617, 0x18191a1b, 0x1c1d1e1f))); + XmmFinalize(vr, rt); + XmmInvalidate(rt); + c.mov(dword_ptr(*cpu_var, *addr, 0, offsetof(SPUThread, GPR[rt]._u32[0])), 0x00010203); + LOG_OPCODE(); } void CDD(u32 rt, u32 ra, s32 i7) { @@ -1330,14 +1492,14 @@ private: // mov if (ra != rt) { - const XmmLink& va = XmmGet(ra); + const XmmLink& va = XmmGet(ra, rt); XmmFinalize(va, rt); } // else nop } else { - const XmmLink& va = XmmGet(ra); + const XmmLink& va 
= XmmGet(ra, rt); const XmmLink& v1 = XmmCopy(va); c.pslldq(va.get(), s); c.psrldq(v1.get(), 16 - s); @@ -1355,7 +1517,7 @@ private: if (ra != rt) { // mov - const XmmLink& va = XmmGet(ra); + const XmmLink& va = XmmGet(ra, rt); XmmFinalize(va, rt); } // else nop @@ -1363,14 +1525,14 @@ private: else if (s > 15) { // zero - const XmmLink& v0 = XmmAlloc(); + const XmmLink& v0 = XmmAlloc(rt); c.xorps(v0.get(), v0.get()); XmmFinalize(v0, rt); } else { // shift right - const XmmLink& va = XmmGet(ra); + const XmmLink& va = XmmGet(ra, rt); c.psrldq(va.get(), s); XmmFinalize(va, rt); } @@ -1384,7 +1546,7 @@ private: if (ra != rt) { // mov - const XmmLink& va = XmmGet(ra); + const XmmLink& va = XmmGet(ra, rt); XmmFinalize(va, rt); } // else nop @@ -1392,14 +1554,14 @@ private: else if (s > 15) { // zero - const XmmLink& v0 = XmmAlloc(); + const XmmLink& v0 = XmmAlloc(rt); c.xorps(v0.get(), v0.get()); XmmFinalize(v0, rt); } else { // shift left - const XmmLink& va = XmmGet(ra); + const XmmLink& va = XmmGet(ra, rt); c.pslldq(va.get(), s); XmmFinalize(va, rt); } @@ -1414,16 +1576,16 @@ private: if (ra == rb) { // zero - const XmmLink& v0 = XmmAlloc(); + const XmmLink& v0 = XmmAlloc(rt); c.xorps(v0.get(), v0.get()); XmmFinalize(v0, rt); } else { - const XmmLink& va = XmmGet(ra); + const XmmLink& va = XmmGet(ra, rt); if (const XmmLink* vb = XmmRead(rb)) { - c.pcmpgtd(va.get(), vb->get()); + c.pcmpgtd(va.get(), vb->read()); } else { @@ -1438,17 +1600,17 @@ private: if (ra == rb) { // zero - const XmmLink& v0 = XmmAlloc(); + const XmmLink& v0 = XmmAlloc(rt); c.xorps(v0.get(), v0.get()); XmmFinalize(v0, rt); } else { // xor - const XmmLink& va = XmmGet(ra); + const XmmLink& va = XmmGet(ra, rt); if (const XmmLink* vb = XmmRead(rb)) { - c.xorps(va.get(), vb->get()); + c.xorps(va.get(), vb->read()); } else { @@ -1548,46 +1710,42 @@ private: } void CLGT(u32 rt, u32 ra, u32 rb) { - WRAPPER_BEGIN(rt, ra, rb, zz); - for (u32 i = 0; i < 4; ++i) - { - CPU.GPR[rt]._u32[i] = 
(CPU.GPR[ra]._u32[i] > CPU.GPR[rb]._u32[i]) ? 0xffffffff : 0x00000000; - } - WRAPPER_END(rt, ra, rb, 0); - - /*XmmVar v0(c); if (ra == rb) { // zero - const XmmLink& v0 = XmmAlloc(); + const XmmLink& v0 = XmmAlloc(rt); c.xorps(v0.get(), v0.get()); XmmFinalize(v0, rt); } else { - // compare if-greater-then - // c.movdqa(v0, cpu_xmm(GPR[rb])); - // TODO - // c.movdqa(cpu_xmm(GPR[rt]), v0); + // compare if-greater-than + const XmmLink& va = XmmGet(ra, rt); + const XmmLink& vb = XmmGet(rb); + c.psubd(va.get(), XmmConst(_mm_set1_epi32(0x80000000))); + c.psubd(vb.get(), XmmConst(_mm_set1_epi32(0x80000000))); + c.pcmpgtd(va.get(), vb.get()); + XmmFinalize(va, rt); + XmmFinalize(vb); } - LOG_OPCODE();*/ + LOG_OPCODE(); } void ANDC(u32 rt, u32 ra, u32 rb) { if (ra == rb) { // zero - const XmmLink& v0 = XmmAlloc(); + const XmmLink& v0 = XmmAlloc(rt); c.xorps(v0.get(), v0.get()); XmmFinalize(v0, rt); } else { // and not - const XmmLink& vb = XmmGet(rb); + const XmmLink& vb = XmmGet(rb, rt); if (const XmmLink* va = XmmRead(ra)) { - c.andnps(vb.get(), va->get()); + c.andnps(vb.get(), va->read()); } else { @@ -1602,17 +1760,17 @@ private: if (ra == rb) { // zero - const XmmLink& v0 = XmmAlloc(); + const XmmLink& v0 = XmmAlloc(rt); c.xorps(v0.get(), v0.get()); XmmFinalize(v0, rt); } else { // not-less-or-equal - const XmmLink& va = XmmGet(ra); + const XmmLink& va = XmmGet(ra, rt); if (const XmmLink* vb = XmmRead(rb)) { - c.cmpps(va.get(), vb->get(), 6); + c.cmpps(va.get(), vb->read(), 6); } else { @@ -1631,7 +1789,7 @@ private: } void FA(u32 rt, u32 ra, u32 rb) { - const XmmLink& va = XmmGet(ra); + const XmmLink& va = XmmGet(ra, rt); if (ra == rb) { c.addps(va.get(), va.get()); @@ -1640,7 +1798,7 @@ private: { if (const XmmLink* vb = XmmRead(rb)) { - c.addps(va.get(), vb->get()); + c.addps(va.get(), vb->read()); } else { @@ -1655,16 +1813,16 @@ private: if (ra == rb) { // zero - const XmmLink& v0 = XmmAlloc(); + const XmmLink& v0 = XmmAlloc(rt); c.xorps(v0.get(), v0.get()); 
XmmFinalize(v0, rt); } else { - const XmmLink& va = XmmGet(ra); + const XmmLink& va = XmmGet(ra, rt); if (const XmmLink* vb = XmmRead(rb)) { - c.subps(va.get(), vb->get()); + c.subps(va.get(), vb->read()); } else { @@ -1678,16 +1836,16 @@ private: { if (ra == rb) { - const XmmLink& va = XmmGet(ra); + const XmmLink& va = XmmGet(ra, rt); c.mulps(va.get(), va.get()); XmmFinalize(va, rt); } else { - const XmmLink& va = XmmGet(ra); + const XmmLink& va = XmmGet(ra, rt); if (const XmmLink* vb = XmmRead(rb)) { - c.mulps(va.get(), vb->get()); + c.mulps(va.get(), vb->read()); } else { @@ -1808,7 +1966,7 @@ private: void ADDX(u32 rt, u32 ra, u32 rb) { const XmmLink& vt = XmmGet(rt); - c.pand(vt.get(), imm_xmm(s19_to_s32[1])); + c.pand(vt.get(), XmmConst(_mm_set1_epi32(1))); c.paddd(vt.get(), cpu_xmm(GPR[ra])); c.paddd(vt.get(), cpu_xmm(GPR[rb])); XmmFinalize(vt, rt); @@ -1817,11 +1975,11 @@ private: void SFX(u32 rt, u32 ra, u32 rb) { const XmmLink& vt = XmmGet(rt); - c.pandn(vt.get(), imm_xmm(s19_to_s32[1])); if (ra == rb) { // load zero - const XmmLink& v0 = XmmAlloc(); + const XmmLink& v0 = XmmAlloc(rt); + c.pandn(vt.get(), XmmConst(_mm_set1_epi32(1))); c.pxor(v0.get(), v0.get()); c.psubd(v0.get(), vt.get()); XmmFinalize(v0, rt); @@ -1829,7 +1987,8 @@ private: else { // sub - const XmmLink& vb = XmmGet(rb); + const XmmLink& vb = XmmGet(rb, rt); + c.pandn(vt.get(), XmmConst(_mm_set1_epi32(1))); c.psubd(vb.get(), cpu_xmm(GPR[ra])); c.psubd(vb.get(), vt.get()); XmmFinalize(vb, rt); @@ -1979,11 +2138,15 @@ private: } void MPYH(u32 rt, u32 ra, u32 rb) { - WRAPPER_BEGIN(rt, ra, rb, zz); - for (int w = 0; w < 4; w++) - CPU.GPR[rt]._i32[w] = (CPU.GPR[ra]._i16[w*2+1] * CPU.GPR[rb]._i16[w*2]) << 16; - WRAPPER_END(rt, ra, rb, 0); - // TODO + const XmmLink& va = XmmGet(ra, rt); + const XmmLink& vb = (ra == rb) ? 
XmmCopy(va) : XmmGet(rb); + c.psrld(va.get(), 16); + c.pand(vb.get(), XmmConst(_mm_set1_epi32(0xffff))); + c.pmulld(va.get(), vb.get()); + c.pslld(va.get(), 16); + XmmFinalize(va, rt); + XmmFinalize(vb); + LOG_OPCODE(); } void MPYHH(u32 rt, u32 ra, u32 rb) { @@ -2024,16 +2187,16 @@ private: } void MPYU(u32 rt, u32 ra, u32 rb) { - const XmmLink& va = XmmGet(ra); + const XmmLink& va = XmmGet(ra, rt); if (ra == rb) { - c.pand(va.get(), imm_xmm(s19_to_s32[0xffff])); + c.pand(va.get(), XmmConst(_mm_set1_epi32(0xffff))); c.pmulld(va.get(), va.get()); } else { const XmmLink& v1 = XmmAlloc(); - c.movdqa(v1.get(), imm_xmm(s19_to_s32[0xffff])); // load mask + c.movdqa(v1.get(), XmmConst(_mm_set1_epi32(0xffff))); // load mask c.pand(va.get(), v1.get()); // clear high words of each dword c.pand(v1.get(), cpu_xmm(GPR[rb])); c.pmulld(va.get(), v1.get()); @@ -2067,10 +2230,10 @@ private: //0 - 9 void CFLTS(u32 rt, u32 ra, s32 i8) { - const XmmLink& va = XmmGet(ra); + const XmmLink& va = XmmGet(ra, rt); if (i8 != 173) { - c.mulps(va.get(), imm_xmm(scale_to_int[i8 & 0xff])); // scale + c.mulps(va.get(), XmmConst(_mm_set1_ps(pow(2, 173 - (i8 & 0xff))))); // scale } c.cvttps2dq(va.get(), va.get()); // convert to ints with truncation XmmFinalize(va, rt); @@ -2105,7 +2268,7 @@ private: c.movaps(v0, cpu_xmm(GPR[ra])); if (i8 != 173) { - c.mulps(v0, imm_xmm(scale_to_int[i8 & 0xff])); // scale + c.mulps(v0, XmmConst(_mm_set1_ps(pow(2, 173 - (i8 & 0xff))))); // scale } // TODO: handle negative values and convert to unsigned value // c.int3(); @@ -2115,11 +2278,11 @@ private: } void CSFLT(u32 rt, u32 ra, s32 i8) { - const XmmLink& va = XmmGet(ra); + const XmmLink& va = XmmGet(ra, rt); c.cvtdq2ps(va.get(), va.get()); // convert to floats if (i8 != 155) { - c.mulps(va.get(), imm_xmm(scale_to_float[i8 & 0xff])); // scale + c.mulps(va.get(), XmmConst(_mm_set1_ps(pow(2, (i8 & 0xff) - 155)))); // scale } XmmFinalize(va, rt); LOG_OPCODE(); @@ -2147,7 +2310,7 @@ private: c.cvtdq2ps(v0, v0); // 
convert to floats as signed if (i8 != 155) { - c.mulps(v0, imm_xmm(scale_to_float[i8 & 0xff])); // scale + c.mulps(v0, XmmConst(_mm_set1_ps(pow(2, (i8 & 0xff) - 155)))); // scale } c.movaps(cpu_xmm(GPR[rt]), v0); LOG_OPCODE();*/ @@ -2269,14 +2432,19 @@ private: if (i16 == 0) { // zero - const XmmLink& v0 = XmmAlloc(); + const XmmLink& v0 = XmmAlloc(rt); c.xorps(v0.get(), v0.get()); XmmFinalize(v0, rt); } else { - const XmmLink& vr = XmmAlloc(); - c.movaps(vr.get(), imm_xmm(fsmbi_mask[i16 & 0xffff])); + const XmmLink& vr = XmmAlloc(rt); + __m128i fsmbi_mask; + for (u32 j = 0; j < 16; j++) + { + fsmbi_mask.m128i_i8[j] = ((i16 >> j) & 0x1) ? 0xff : 0; + } + c.movaps(vr.get(), XmmConst(fsmbi_mask)); XmmFinalize(vr, rt); } LOG_OPCODE(); @@ -2311,7 +2479,7 @@ private: } void IL(u32 rt, s32 i16) { - const XmmLink& vr = XmmAlloc(); + const XmmLink& vr = XmmAlloc(rt); if (i16 == 0) { c.xorps(vr.get(), vr.get()); @@ -2322,27 +2490,21 @@ private: } else { - c.movaps(vr.get(), imm_xmm(s19_to_s32[i16 & 0x7ffff])); + c.movaps(vr.get(), XmmConst(_mm_set1_epi32(i16))); } XmmFinalize(vr, rt); LOG_OPCODE(); } void ILHU(u32 rt, s32 i16) { - const XmmLink& vr = XmmAlloc(); + const XmmLink& vr = XmmAlloc(rt); if (i16 == 0) { c.xorps(vr.get(), vr.get()); } - else if (i16 == -1) - { - c.pcmpeqd(vr.get(), vr.get()); - c.pslld(vr.get(), 16); - } else { - c.movaps(vr.get(), imm_xmm(s19_to_s32[i16 & 0x7ffff])); - c.pslld(vr.get(), 16); + c.movaps(vr.get(), XmmConst(_mm_set1_epi32(i16 << 16))); } XmmFinalize(vr, rt); LOG_OPCODE(); @@ -2362,8 +2524,8 @@ private: } else { - const XmmLink& vt = XmmGet(rt); - c.orps(vt.get(), imm_xmm(s19_to_s32[i16 & 0xffff])); + const XmmLink& vt = XmmGet(rt, rt); + c.orps(vt.get(), XmmConst(_mm_set1_epi32(i16 & 0xffff))); XmmFinalize(vt, rt); } LOG_OPCODE(); @@ -2376,7 +2538,7 @@ private: if (i10 == -1) { // fill with 1 - const XmmLink& v1 = XmmAlloc(); + const XmmLink& v1 = XmmAlloc(rt); c.pcmpeqd(v1.get(), v1.get()); XmmFinalize(v1, rt); } @@ -2385,15 
+2547,15 @@ private: if (rt != ra) { // mov - const XmmLink& va = XmmGet(ra); + const XmmLink& va = XmmGet(ra, rt); XmmFinalize(va, rt); } // else nop } else { - const XmmLink& va = XmmGet(ra); - c.orps(va.get(), imm_xmm(s19_to_s32[i10 & 0x7ffff])); + const XmmLink& va = XmmGet(ra, rt); + c.orps(va.get(), XmmConst(_mm_set1_epi32(i10))); XmmFinalize(va, rt); } LOG_OPCODE(); @@ -2417,7 +2579,7 @@ private: if (i10 == 0) { // zero - const XmmLink& v0 = XmmAlloc(); + const XmmLink& v0 = XmmAlloc(rt); c.xorps(v0.get(), v0.get()); c.psubd(v0.get(), cpu_xmm(GPR[ra])); XmmFinalize(v0, rt); @@ -2425,15 +2587,15 @@ private: else if (i10 == -1) { // fill with 1 - const XmmLink& v1 = XmmAlloc(); + const XmmLink& v1 = XmmAlloc(rt); c.pcmpeqd(v1.get(), v1.get()); c.psubd(v1.get(), cpu_xmm(GPR[ra])); XmmFinalize(v1, rt); } else { - const XmmLink& vr = XmmAlloc(); - c.movdqa(vr.get(), imm_xmm(s19_to_s32[i10 & 0x7ffff])); + const XmmLink& vr = XmmAlloc(rt); + c.movdqa(vr.get(), XmmConst(_mm_set1_epi32(i10))); c.psubd(vr.get(), cpu_xmm(GPR[ra])); XmmFinalize(vr, rt); } @@ -2451,7 +2613,7 @@ private: if (i10 == 0) { // zero - const XmmLink& v0 = XmmAlloc(); + const XmmLink& v0 = XmmAlloc(rt); c.xorps(v0.get(), v0.get()); XmmFinalize(v0, rt); } @@ -2460,15 +2622,15 @@ private: // mov if (ra != rt) { - const XmmLink& va = XmmGet(ra); + const XmmLink& va = XmmGet(ra, rt); XmmFinalize(va, rt); } // else nop } else { - const XmmLink& va = XmmGet(ra); - c.andps(va.get(), imm_xmm(s19_to_s32[i10 & 0x7ffff])); + const XmmLink& va = XmmGet(ra, rt); + c.andps(va.get(), XmmConst(_mm_set1_epi32(i10))); XmmFinalize(va, rt); } LOG_OPCODE(); @@ -2494,7 +2656,7 @@ private: if (rt != ra) { // mov - const XmmLink& va = XmmGet(ra); + const XmmLink& va = XmmGet(ra, rt); XmmFinalize(va, rt); } // else nop @@ -2502,8 +2664,8 @@ private: else { // add - const XmmLink& va = XmmGet(ra); - c.paddd(va.get(), imm_xmm(s19_to_s32[i10 & 0x7ffff])); + const XmmLink& va = XmmGet(ra, rt); + c.paddd(va.get(), 
XmmConst(_mm_set1_epi32(i10))); XmmFinalize(va, rt); } LOG_OPCODE(); @@ -2567,7 +2729,7 @@ private: void CGTI(u32 rt, u32 ra, s32 i10) { const XmmLink& va = XmmGet(ra); - c.pcmpgtd(va.get(), imm_xmm(s19_to_s32[i10 & 0x7ffff])); + c.pcmpgtd(va.get(), XmmConst(_mm_set1_epi32(i10))); XmmFinalize(va, rt); LOG_OPCODE(); } @@ -2595,27 +2757,21 @@ private: } void CLGTI(u32 rt, u32 ra, s32 i10) { - WRAPPER_BEGIN(rt, ra, i10, zz); - for (u32 i = 0; i < 4; ++i) - { - CPU.GPR[rt]._u32[i] = (CPU.GPR[ra]._u32[i] > (u32)i10) ? 0xffffffff : 0x00000000; - } - WRAPPER_END(rt, ra, i10, 0); - - /*XmmVar v0(c); if (i10 == -1) { // zero result - const XmmLink& v0 = XmmAlloc(); + const XmmLink& v0 = XmmAlloc(rt); c.xorps(v0.get(), v0.get()); XmmFinalize(v0, rt); } else { - //c.movdqa(v0, imm_xmm(s19_to_s32[i10 & 0x7ffff])); - // TODO - //c.movdqa(cpu_xmm(GPR[rt]), v0); - }*/ + const XmmLink& va = XmmGet(ra); + c.psubd(va.get(), XmmConst(_mm_set1_epi32(0x80000000))); + c.pcmpgtd(va.get(), XmmConst(_mm_set1_epi32((u32)i10 - 0x80000000))); + XmmFinalize(va, rt); + } + LOG_OPCODE(); } void CLGTHI(u32 rt, u32 ra, s32 i10) { @@ -2658,7 +2814,7 @@ private: void CEQI(u32 rt, u32 ra, s32 i10) { const XmmLink& va = XmmGet(ra); - c.pcmpeqd(va.get(), imm_xmm(s19_to_s32[i10 & 0x7ffff])); + c.pcmpeqd(va.get(), XmmConst(_mm_set1_epi32(i10))); XmmFinalize(va, rt); LOG_OPCODE(); } @@ -2697,14 +2853,14 @@ private: } void ILA(u32 rt, u32 i18) { - const XmmLink& vr = XmmAlloc(); + const XmmLink& vr = XmmAlloc(rt); if (i18 == 0) { c.xorps(vr.get(), vr.get()); } else { - c.movaps(vr.get(), imm_xmm(s19_to_s32[i18 & 0x3ffff])); + c.movaps(vr.get(), XmmConst(_mm_set1_epi32(i18 & 0x3ffff))); } XmmFinalize(vr, rt); LOG_OPCODE(); @@ -2757,13 +2913,13 @@ private: const XmmLink& v2 = XmmCopy(v0); // v2 = mask const XmmLink& v3 = XmmAlloc(); const XmmLink& v4 = XmmAlloc(); - const XmmLink& vFF = XmmAlloc(); + const XmmLink& vFF = XmmAlloc(rt); // generate specific values: - c.movdqa(v1.get(), 
imm_xmm(u8_to_u8[0xe0])); // v1 = 11100000 - c.movdqa(v3.get(), imm_xmm(u8_to_u8[0x80])); // v3 = 10000000 + c.movdqa(v1.get(), XmmConst(_mm_set1_epi32(0xe0e0e0e0))); // v1 = 11100000 + c.movdqa(v3.get(), XmmConst(_mm_set1_epi32(0x80808080))); // v3 = 10000000 c.pand(v2.get(), v1.get()); // filter mask v2 = mask & 11100000 c.movdqa(vFF.get(), v2.get()); // and copy vFF = mask & 11100000 - c.movdqa(v4.get(), imm_xmm(u8_to_u8[0xc0])); // v4 = 11000000 + c.movdqa(v4.get(), XmmConst(_mm_set1_epi32(0xc0c0c0c0))); // v4 = 11000000 c.pcmpeqb(vFF.get(), v4.get()); // gen 0xff vFF = (mask & 11100000 == 11000000) ? 0xff : 0 c.movdqa(v4.get(), v2.get()); // copy again v4 = mask & 11100000 c.pand(v4.get(), v3.get()); // filter mask v4 = mask & 10000000 @@ -2773,13 +2929,13 @@ private: c.por(vFF.get(), v2.get()); // merge 0xff, 0x80 vFF = (mask & 11100000 == 11000000) ? 0xff : (mask & 11100000 == 11100000) ? 0x80 : 0 c.pandn(v1.get(), v0.get()); // filter mask v1 = mask & 00011111 // select bytes from [rb]: - c.movdqa(v2.get(), imm_xmm(u8_to_u8[15])); // v2 = 00001111 - c.pxor(v1.get(), imm_xmm(u8_to_u8[0x10])); // v1 = (mask & 00011111) ^ 00010000 + c.movdqa(v2.get(), XmmConst(_mm_set1_epi8(15))); // v2 = 00001111 + c.pxor(v1.get(), XmmConst(_mm_set1_epi8(0x10))); // v1 = (mask & 00011111) ^ 00010000 c.psubb(v2.get(), v1.get()); // v2 = 00001111 - ((mask & 00011111) ^ 00010000) c.movdqa(v1.get(), cpu_xmm(GPR[rb])); // v1 = rb c.pshufb(v1.get(), v2.get()); // v1 = select(rb, 00001111 - ((mask & 00011111) ^ 00010000)) // select bytes from [ra]: - c.pxor(v2.get(), imm_xmm(u8_to_u8[0xf0])); // v2 = (00001111 - ((mask & 00011111) ^ 00010000)) ^ 11110000 + c.pxor(v2.get(), XmmConst(_mm_set1_epi32(0xf0f0f0f0))); // v2 = (00001111 - ((mask & 00011111) ^ 00010000)) ^ 11110000 c.movdqa(v3.get(), cpu_xmm(GPR[ra])); // v3 = ra c.pshufb(v3.get(), v2.get()); // v3 = select(ra, (00001111 - ((mask & 00011111) ^ 00010000)) ^ 11110000) c.por(v1.get(), v3.get()); // v1 = select(rb, 00001111 - 
((mask & 00011111) ^ 00010000)) | (v3) @@ -2803,7 +2959,7 @@ private: void FNMS(u32 rt, u32 ra, u32 rb, u32 rc) { const XmmLink& va = XmmGet(ra); - const XmmLink& vc = (ra == rc) ? XmmCopy(va) : XmmGet(rc); + const XmmLink& vc = (ra == rc) ? XmmCopy(va, rt) : XmmGet(rc, rt); if (ra == rb) { @@ -2817,7 +2973,7 @@ private: { if (const XmmLink* vb = XmmRead(rb)) { - c.mulps(va.get(), vb->get()); + c.mulps(va.get(), vb->read()); } else { @@ -2833,10 +2989,10 @@ private: { if (ra != rb && rb != rc && rc != ra) { - const XmmLink& va = XmmGet(ra); + const XmmLink& va = XmmGet(ra, rt); if (const XmmLink* vb = XmmRead(rb)) { - c.mulps(va.get(), vb->get()); + c.mulps(va.get(), vb->read()); } else { @@ -2844,7 +3000,7 @@ private: } if (const XmmLink* vc = XmmRead(rc)) { - c.addps(va.get(), vc->get()); + c.addps(va.get(), vc->read()); } else { @@ -2854,7 +3010,7 @@ private: } else if (ra == rb && rb == rc) { - const XmmLink& va = XmmGet(ra); + const XmmLink& va = XmmGet(ra, rt); const XmmLink& vc = XmmCopy(va); c.mulps(va.get(), va.get()); c.addps(va.get(), vc.get()); @@ -2863,11 +3019,11 @@ private: } else if (ra == rb) { - const XmmLink& va = XmmGet(ra); + const XmmLink& va = XmmGet(ra, rt); c.mulps(va.get(), va.get()); if (const XmmLink* vc = XmmRead(rc)) { - c.addps(va.get(), vc->get()); + c.addps(va.get(), vc->read()); } else { @@ -2877,28 +3033,28 @@ private: } else if (rb == rc) { - const XmmLink& va = XmmGet(ra); + const XmmLink& va = XmmGet(ra, rt); if (const XmmLink* vc = XmmRead(rc)) { - c.mulps(va.get(), vc->get()); - c.addps(va.get(), vc->get()); + c.mulps(va.get(), vc->read()); + c.addps(va.get(), vc->read()); } else { - const XmmLink& vb = XmmGet(rb); + const XmmLink& vb = XmmGet(rb, rb); c.mulps(va.get(), vb.get()); c.addps(va.get(), vb.get()); - XmmFinalize(vb); + XmmFinalize(vb, rb); } XmmFinalize(va, rt); } else if (ra == rc) { - const XmmLink& va = XmmGet(ra); + const XmmLink& va = XmmGet(ra, rt); const XmmLink& vc = XmmCopy(va); if (const XmmLink* vb = 
XmmRead(rb)) { - c.mulps(va.get(), vb->get()); + c.mulps(va.get(), vb->read()); } else { @@ -2918,10 +3074,10 @@ private: { if (ra != rb && rb != rc && rc != ra) { - const XmmLink& va = XmmGet(ra); + const XmmLink& va = XmmGet(ra, rt); if (const XmmLink* vb = XmmRead(rb)) { - c.mulps(va.get(), vb->get()); + c.mulps(va.get(), vb->read()); } else { @@ -2929,7 +3085,7 @@ private: } if (const XmmLink* vc = XmmRead(rc)) { - c.subps(va.get(), vc->get()); + c.subps(va.get(), vc->read()); } else { @@ -2939,7 +3095,7 @@ private: } else if (ra == rb && rb == rc) { - const XmmLink& va = XmmGet(ra); + const XmmLink& va = XmmGet(ra, rt); const XmmLink& vc = XmmCopy(va); c.mulps(va.get(), va.get()); c.subps(va.get(), vc.get()); @@ -2948,11 +3104,11 @@ private: } else if (ra == rb) { - const XmmLink& va = XmmGet(ra); + const XmmLink& va = XmmGet(ra, rt); c.mulps(va.get(), va.get()); if (const XmmLink* vc = XmmRead(rc)) { - c.subps(va.get(), vc->get()); + c.subps(va.get(), vc->read()); } else { @@ -2962,28 +3118,28 @@ private: } else if (rb == rc) { - const XmmLink& va = XmmGet(ra); + const XmmLink& va = XmmGet(ra, rt); if (const XmmLink* vc = XmmRead(rc)) { - c.mulps(va.get(), vc->get()); - c.subps(va.get(), vc->get()); + c.mulps(va.get(), vc->read()); + c.subps(va.get(), vc->read()); } else { - const XmmLink& vb = XmmGet(rb); + const XmmLink& vb = XmmGet(rb, rb); c.mulps(va.get(), vb.get()); c.subps(va.get(), vb.get()); - XmmFinalize(vb); + XmmFinalize(vb, rb); } XmmFinalize(va, rt); } else if (ra == rc) { - const XmmLink& va = XmmGet(ra); + const XmmLink& va = XmmGet(ra, rt); const XmmLink& vc = XmmCopy(va); if (const XmmLink* vb = XmmRead(rb)) { - c.mulps(va.get(), vb->get()); + c.mulps(va.get(), vb->read()); } else { diff --git a/rpcs3/Emu/Cell/SPURecompilerCore.cpp b/rpcs3/Emu/Cell/SPURecompilerCore.cpp index 35f784d857..f1a5801950 100644 --- a/rpcs3/Emu/Cell/SPURecompilerCore.cpp +++ b/rpcs3/Emu/Cell/SPURecompilerCore.cpp @@ -4,8 +4,6 @@ #include "SPUInterpreter.h" #include 
"SPURecompiler.h" -static const SPUImmTable g_spu_imm; - SPURecompilerCore::SPURecompilerCore(SPUThread& cpu) : m_enc(new SPURecompiler(cpu, *this)) , inter(new SPUInterpreter(cpu)) @@ -135,9 +133,9 @@ void SPURecompilerCore::Compile(u16 pos) log.Open(wxString::Format("SPUjit_%d.log", GetCurrentSPUThread().GetId()), first ? wxFile::write : wxFile::write_append); log.Write(wxString::Format("========== START POSITION 0x%x ==========\n\n", start * 4)); log.Write(wxString(stringLogger.getString())); + log.Write(wxString::Format("========== COMPILED %d (excess %d), time: [start=%lld (decoding=%lld), finalize=%lld]\n\n", + entry[start].count, excess, stamp1 - stamp0, time0, get_system_time() - stamp1)); log.Close(); - //ConLog.Write("Compiled: %d (excess %d), addr=0x%x, time: [start=%d (decoding=%d), finalize=%d]", - //entry[start].count, excess, start * 4, stamp1 - stamp0, time0, get_system_time() - stamp1); m_enc->compiler = nullptr; first = false; } @@ -200,7 +198,7 @@ u8 SPURecompilerCore::DecodeMemory(const u64 address) return 0; } - typedef u32(*Func)(void* _cpu, void* _ls, const SPUImmTable* _imm, u32 _pos); + typedef u32(*Func)(void* _cpu, void* _ls, const void* _imm, u32 _pos); Func func = asmjit_cast(entry[pos].pointer); @@ -217,7 +215,7 @@ u8 SPURecompilerCore::DecodeMemory(const u64 address) } u16 res = pos; - res = (u16)func(cpu, &Memory[m_offset], &g_spu_imm, res); + res = (u16)func(cpu, &Memory[m_offset], imm_table.data(), res); if (did_compile) { @@ -238,6 +236,4 @@ u8 SPURecompilerCore::DecodeMemory(const u64 address) CPU.SetBranch((u64)res << 2); return 0; } - /*Decode(Memory.Read32(address)); - return 4;*/ } \ No newline at end of file From 78757383911f7e0f8b7b40b98df6f1ea3f6d97ef Mon Sep 17 00:00:00 2001 From: Nekotekina Date: Sat, 19 Apr 2014 02:11:07 +0400 Subject: [PATCH 12/14] More opcodes --- rpcs3/Emu/Cell/SPUInterpreter.h | 1 + rpcs3/Emu/Cell/SPURecompiler.h | 1209 ++++++++++++++++++-------- rpcs3/Emu/Cell/SPURecompilerCore.cpp | 10 +- 3 files 
changed, 857 insertions(+), 363 deletions(-) diff --git a/rpcs3/Emu/Cell/SPUInterpreter.h b/rpcs3/Emu/Cell/SPUInterpreter.h index 269fcd2043..e93e1e1fa7 100644 --- a/rpcs3/Emu/Cell/SPUInterpreter.h +++ b/rpcs3/Emu/Cell/SPUInterpreter.h @@ -60,6 +60,7 @@ private: } void MFSPR(u32 rt, u32 sa) { + UNIMPLEMENTED(); //If register is a dummy register (register labeled 0x0) if(sa == 0x0) { diff --git a/rpcs3/Emu/Cell/SPURecompiler.h b/rpcs3/Emu/Cell/SPURecompiler.h index 7a2932bd0f..51c909b575 100644 --- a/rpcs3/Emu/Cell/SPURecompiler.h +++ b/rpcs3/Emu/Cell/SPURecompiler.h @@ -183,7 +183,7 @@ public: if (last >= 0) { // (saving cached data?) - //c.movaps(cpu_xmm(GPR[xmm_var[last].reg]), *xmm_var[last].data); + //c.movdqa(cpu_xmm(GPR[xmm_var[last].reg]), *xmm_var[last].data); xmm_var[last].taken = true; xmm_var[last].got = false; LOG4_OPCODE("cached reg taken (i=%d): GPR[%d] lost", last, xmm_var[last].reg); @@ -241,17 +241,17 @@ public: res = &(XmmLink&)XmmAlloc(target); /*if (target != res->reg) { - c.movaps(*res->data, cpu_xmm(GPR[reg])); + c.movdqa(*res->data, cpu_xmm(GPR[reg])); } else*/ { if (const XmmLink* source = XmmRead(reg)) { - c.movaps(*res->data, source->read()); + c.movdqa(*res->data, source->read()); } else { - c.movaps(*res->data, cpu_xmm(GPR[reg])); + c.movdqa(*res->data, cpu_xmm(GPR[reg])); } } res->reg = -1; // ??? @@ -263,7 +263,7 @@ public: const XmmLink& XmmCopy(const XmmLink& from, s8 pref = -1) // XmmAlloc + mov { XmmLink* res = &(XmmLink&)XmmAlloc(pref); - c.movaps(*res->data, *from.data); + c.movdqa(*res->data, *from.data); res->reg = -1; // ??? 
LOG4_OPCODE("*"); return *res; @@ -304,7 +304,7 @@ public: // save immediately: if (reg >= 0) { - c.movaps(cpu_xmm(GPR[reg]), *xmm_var[i].data); + c.movdqa(cpu_xmm(GPR[reg]), *xmm_var[i].data); } else { @@ -328,7 +328,7 @@ public: { if (xmm_var[i].reg >= 0) { - //c.movaps(cpu_xmm(GPR[xmm_var[i].reg]), *xmm_var[i].data); + //c.movdqa(cpu_xmm(GPR[xmm_var[i].reg]), *xmm_var[i].data); LOG4_OPCODE("GPR[%d] released (i=%d)", xmm_var[i].reg, i); xmm_var[i].reg = -1; xmm_var[i].access = 0; @@ -396,7 +396,7 @@ private: } void MFSPR(u32 rt, u32 sa) { - WRAPPER_BEGIN(rt, sa, yy, zz); + UNIMPLEMENTED(); //If register is a dummy register (register labeled 0x0) if(sa == 0x0) { @@ -408,7 +408,6 @@ private: CPU.GPR[rt]._u128.hi = CPU.SPR[sa]._u128.hi; CPU.GPR[rt]._u128.lo = CPU.SPR[sa]._u128.lo; } - WRAPPER_END(rt, sa, 0, 0); } void RDCH(u32 rt, u32 ra) { @@ -433,7 +432,7 @@ private: { // zero const XmmLink& v0 = XmmAlloc(rt); - c.xorps(v0.get(), v0.get()); + c.pxor(v0.get(), v0.get()); XmmFinalize(v0, rt); } else @@ -470,11 +469,11 @@ private: const XmmLink& vb = XmmGet(rb, rt); if (const XmmLink* va = XmmRead(ra)) { - c.orps(vb.get(), va->read()); + c.por(vb.get(), va->read()); } else { - c.orps(vb.get(), cpu_xmm(GPR[ra])); + c.por(vb.get(), cpu_xmm(GPR[ra])); } XmmFinalize(vb, rt); } @@ -485,7 +484,7 @@ private: if (ra == rb) { const XmmLink& v1 = XmmAlloc(rt); - c.movaps(v1.get(), XmmConst(_mm_set1_epi32(1))); + c.movdqa(v1.get(), XmmConst(_mm_set1_epi32(1))); XmmFinalize(v1, rt); } else @@ -518,7 +517,7 @@ private: { // zero const XmmLink& v0 = XmmAlloc(rt); - c.xorps(v0.get(), v0.get()); + c.pxor(v0.get(), v0.get()); XmmFinalize(v0, rt); } else @@ -543,14 +542,14 @@ private: { if (const XmmLink* vb = XmmRead(rb)) { - c.orps(va.get(), vb->read()); + c.por(va.get(), vb->read()); } else { - c.orps(va.get(), cpu_xmm(GPR[rb])); + c.por(va.get(), cpu_xmm(GPR[rb])); } } - c.xorps(va.get(), XmmConst(_mm_set1_epi32(-1))); + c.pxor(va.get(), XmmConst(_mm_set1_epi32(-1))); 
XmmFinalize(va, rt); LOG_OPCODE(); } @@ -560,7 +559,7 @@ private: { // zero const XmmLink& v0 = XmmAlloc(rt); - c.xorps(v0.get(), v0.get()); + c.pxor(v0.get(), v0.get()); XmmFinalize(v0, rt); } else @@ -690,7 +689,7 @@ private: { // zero const XmmLink& v0 = XmmAlloc(rt); - c.xorps(v0.get(), v0.get()); + c.pxor(v0.get(), v0.get()); XmmFinalize(v0, rt); } else if (s == 0) @@ -741,7 +740,7 @@ private: { // zero const XmmLink& v0 = XmmAlloc(rt); - c.xorps(v0.get(), v0.get()); + c.pxor(v0.get(), v0.get()); XmmFinalize(v0, rt); } else if (s == 0) @@ -795,7 +794,7 @@ private: { // zero const XmmLink& v0 = XmmAlloc(rt); - c.xorps(v0.get(), v0.get()); + c.pxor(v0.get(), v0.get()); XmmFinalize(v0, rt); } else if (s == 0) @@ -846,7 +845,7 @@ private: { // zero const XmmLink& v0 = XmmAlloc(rt); - c.xorps(v0.get(), v0.get()); + c.pxor(v0.get(), v0.get()); XmmFinalize(v0, rt); } else if (s == 0) @@ -909,11 +908,11 @@ private: const XmmLink& vb = XmmGet(rb, rt); if (const XmmLink* va = XmmRead(ra)) { - c.andps(vb.get(), va->read()); + c.pand(vb.get(), va->read()); } else { - c.andps(vb.get(), cpu_xmm(GPR[ra])); + c.pand(vb.get(), cpu_xmm(GPR[ra])); } XmmFinalize(vb, rt); } @@ -970,7 +969,7 @@ private: { // not const XmmLink& va = XmmGet(ra, rt); - c.xorps(va.get(), XmmConst(_mm_set1_epi32(-1))); + c.pxor(va.get(), XmmConst(_mm_set1_epi32(-1))); XmmFinalize(va, rt); } else @@ -979,13 +978,13 @@ private: const XmmLink& va = XmmGet(ra, rt); if (const XmmLink* vb = XmmRead(rb)) { - c.andps(va.get(), vb->read()); + c.pand(va.get(), vb->read()); } else { - c.andps(va.get(), cpu_xmm(GPR[rb])); + c.pand(va.get(), cpu_xmm(GPR[rb])); } - c.xorps(va.get(), XmmConst(_mm_set1_epi32(-1))); + c.pxor(va.get(), XmmConst(_mm_set1_epi32(-1))); XmmFinalize(va, rt); } LOG_OPCODE(); @@ -1291,43 +1290,88 @@ private: } void CBX(u32 rt, u32 ra, u32 rb) { - WRAPPER_BEGIN(rt, ra, rb, zz); - const u32 t = (CPU.GPR[rb]._u32[3] + CPU.GPR[ra]._u32[3]) & 0xF; - - CPU.GPR[rt]._u64[0] = (u64)0x18191A1B1C1D1E1F; - 
CPU.GPR[rt]._u64[1] = (u64)0x1011121314151617; - CPU.GPR[rt]._u8[15 - t] = 0x03; - WRAPPER_END(rt, ra, rb, 0); + c.mov(*addr, cpu_dword(GPR[ra]._u32[3])); + if (ra == rb) + { + c.add(*addr, *addr); + } + else + { + c.add(*addr, cpu_dword(GPR[rb]._u32[3])); + } + c.and_(*addr, 0xf); + c.neg(*addr); + c.add(*addr, 0xf); + const XmmLink& vr = XmmAlloc(rt); + c.movdqa(vr.get(), XmmConst(_mm_set_epi32(0x10111213, 0x14151617, 0x18191a1b, 0x1c1d1e1f))); + XmmFinalize(vr, rt); + XmmInvalidate(rt); + c.mov(byte_ptr(*cpu_var, *addr, 0, offsetof(SPUThread, GPR[rt]._u8[0])), 0x03); + LOG_OPCODE(); } void CHX(u32 rt, u32 ra, u32 rb) { - WRAPPER_BEGIN(rt, ra, rb, zz); - const u32 t = (CPU.GPR[rb]._u32[3] + CPU.GPR[ra]._u32[3]) & 0xE; - - CPU.GPR[rt]._u64[0] = (u64)0x18191A1B1C1D1E1F; - CPU.GPR[rt]._u64[1] = (u64)0x1011121314151617; - CPU.GPR[rt]._u16[7 - (t >> 1)] = 0x0203; - WRAPPER_END(rt, ra, rb, 0); + c.mov(*addr, cpu_dword(GPR[ra]._u32[3])); + if (ra == rb) + { + c.add(*addr, *addr); + } + else + { + c.add(*addr, cpu_dword(GPR[rb]._u32[3])); + } + c.and_(*addr, 0xe); + c.neg(*addr); + c.add(*addr, 0xe); + const XmmLink& vr = XmmAlloc(rt); + c.movdqa(vr.get(), XmmConst(_mm_set_epi32(0x10111213, 0x14151617, 0x18191a1b, 0x1c1d1e1f))); + XmmFinalize(vr, rt); + XmmInvalidate(rt); + c.mov(word_ptr(*cpu_var, *addr, 0, offsetof(SPUThread, GPR[rt]._u16[0])), 0x0203); + LOG_OPCODE(); } void CWX(u32 rt, u32 ra, u32 rb) { - WRAPPER_BEGIN(rt, ra, rb, zz); - const u32 t = (CPU.GPR[ra]._u32[3] + CPU.GPR[rb]._u32[3]) & 0xC; - - CPU.GPR[rt]._u64[0] = (u64)0x18191A1B1C1D1E1F; - CPU.GPR[rt]._u64[1] = (u64)0x1011121314151617; - CPU.GPR[rt]._u32[3 - (t >> 2)] = 0x00010203; - WRAPPER_END(rt, ra, rb, 0); + c.mov(*addr, cpu_dword(GPR[ra]._u32[3])); + if (ra == rb) + { + c.add(*addr, *addr); + } + else + { + c.add(*addr, cpu_dword(GPR[rb]._u32[3])); + } + c.and_(*addr, 0xc); + c.neg(*addr); + c.add(*addr, 0xc); + const XmmLink& vr = XmmAlloc(rt); + c.movdqa(vr.get(), 
XmmConst(_mm_set_epi32(0x10111213, 0x14151617, 0x18191a1b, 0x1c1d1e1f))); + XmmFinalize(vr, rt); + XmmInvalidate(rt); + c.mov(dword_ptr(*cpu_var, *addr, 0, offsetof(SPUThread, GPR[rt]._u32[0])), 0x00010203); + LOG_OPCODE(); } void CDX(u32 rt, u32 ra, u32 rb) { - WRAPPER_BEGIN(rt, ra, rb, zz); - const u32 t = (CPU.GPR[rb]._u32[3] + CPU.GPR[ra]._u32[3]) & 0x8; - - CPU.GPR[rt]._u64[0] = (u64)0x18191A1B1C1D1E1F; - CPU.GPR[rt]._u64[1] = (u64)0x1011121314151617; - CPU.GPR[rt]._u64[1 - (t >> 3)] = (u64)0x0001020304050607; - WRAPPER_END(rt, ra, rb, 0); + c.mov(*addr, cpu_dword(GPR[ra]._u32[3])); + if (ra == rb) + { + c.add(*addr, *addr); + } + else + { + c.add(*addr, cpu_dword(GPR[rb]._u32[3])); + } + c.and_(*addr, 0x8); + c.neg(*addr); + c.add(*addr, 0x8); + const XmmLink& vr = XmmAlloc(rt); + c.movdqa(vr.get(), XmmConst(_mm_set_epi32(0x10111213, 0x14151617, 0x18191a1b, 0x1c1d1e1f))); + XmmFinalize(vr, rt); + XmmInvalidate(rt); + c.mov(dword_ptr(*cpu_var, *addr, 0, offsetof(SPUThread, GPR[rt]._u32[0])), 0x00010203); + c.mov(dword_ptr(*cpu_var, *addr, 0, offsetof(SPUThread, GPR[rt]._u32[1])), 0x04050607); + LOG_OPCODE(); } void ROTQBI(u32 rt, u32 ra, u32 rb) { @@ -1401,41 +1445,41 @@ private: } void CBD(u32 rt, u32 ra, s32 i7) { - WRAPPER_BEGIN(rt, ra, i7, zz); - const int t = (CPU.GPR[ra]._u32[3] + i7) & 0xF; - - CPU.GPR[rt]._u64[0] = (u64)0x18191A1B1C1D1E1F; - CPU.GPR[rt]._u64[1] = (u64)0x1011121314151617; - CPU.GPR[rt]._u8[15 - t] = 0x03; - WRAPPER_END(rt, ra, i7, 0); + c.mov(*addr, cpu_dword(GPR[ra]._u32[3])); + c.add(*addr, i7); + c.and_(*addr, 0xf); + c.neg(*addr); + c.add(*addr, 0xf); + const XmmLink& vr = XmmAlloc(rt); + c.movdqa(vr.get(), XmmConst(_mm_set_epi32(0x10111213, 0x14151617, 0x18191a1b, 0x1c1d1e1f))); + XmmFinalize(vr, rt); + XmmInvalidate(rt); + c.mov(byte_ptr(*cpu_var, *addr, 0, offsetof(SPUThread, GPR[rt]._u8[0])), 0x03); + LOG_OPCODE(); } void CHD(u32 rt, u32 ra, s32 i7) { - WRAPPER_BEGIN(rt, ra, i7, zz); - const int t = (CPU.GPR[ra]._u32[3] + 
(s32)i7) & 0xE; - - CPU.GPR[rt]._u64[0] = (u64)0x18191A1B1C1D1E1F; - CPU.GPR[rt]._u64[1] = (u64)0x1011121314151617; - CPU.GPR[rt]._u16[7 - (t >> 1)] = 0x0203; - WRAPPER_END(rt, ra, i7, 0); + c.mov(*addr, cpu_dword(GPR[ra]._u32[3])); + c.add(*addr, i7); + c.and_(*addr, 0xe); + c.neg(*addr); + c.add(*addr, 0xe); + const XmmLink& vr = XmmAlloc(rt); + c.movdqa(vr.get(), XmmConst(_mm_set_epi32(0x10111213, 0x14151617, 0x18191a1b, 0x1c1d1e1f))); + XmmFinalize(vr, rt); + XmmInvalidate(rt); + c.mov(word_ptr(*cpu_var, *addr, 0, offsetof(SPUThread, GPR[rt]._u16[0])), 0x0203); + LOG_OPCODE(); } void CWD(u32 rt, u32 ra, s32 i7) { - /*WRAPPER_BEGIN(rt, ra, i7, zz); - const int t = (CPU.GPR[ra]._u32[3] + (s32)i7) & 0xC; - - CPU.GPR[rt]._u64[0] = (u64)0x18191A1B1C1D1E1F; - CPU.GPR[rt]._u64[1] = (u64)0x1011121314151617; - CPU.GPR[rt]._u32[3 - (t >> 2)] = 0x00010203; - WRAPPER_END(rt, ra, i7, 0);*/ - c.mov(*addr, cpu_dword(GPR[ra]._u32[3])); c.add(*addr, i7); - c.and_(*addr, 3 << 2); + c.and_(*addr, 0xc); c.neg(*addr); - c.add(*addr, 3 << 2); + c.add(*addr, 0xc); const XmmLink& vr = XmmAlloc(rt); - c.movaps(vr.get(), XmmConst(_mm_set_epi32(0x10111213, 0x14151617, 0x18191a1b, 0x1c1d1e1f))); + c.movdqa(vr.get(), XmmConst(_mm_set_epi32(0x10111213, 0x14151617, 0x18191a1b, 0x1c1d1e1f))); XmmFinalize(vr, rt); XmmInvalidate(rt); c.mov(dword_ptr(*cpu_var, *addr, 0, offsetof(SPUThread, GPR[rt]._u32[0])), 0x00010203); @@ -1443,13 +1487,18 @@ private: } void CDD(u32 rt, u32 ra, s32 i7) { - WRAPPER_BEGIN(rt, ra, i7, zz); - const int t = (CPU.GPR[ra]._u32[3] + i7) & 0x8; - - CPU.GPR[rt]._u64[0] = (u64)0x18191A1B1C1D1E1F; - CPU.GPR[rt]._u64[1] = (u64)0x1011121314151617; - CPU.GPR[rt]._u64[1 - (t >> 3)] = (u64)0x0001020304050607; - WRAPPER_END(rt, ra, i7, 0); + c.mov(*addr, cpu_dword(GPR[ra]._u32[3])); + c.add(*addr, i7); + c.and_(*addr, 0x8); + c.neg(*addr); + c.add(*addr, 0x8); + const XmmLink& vr = XmmAlloc(rt); + c.movdqa(vr.get(), XmmConst(_mm_set_epi32(0x10111213, 0x14151617, 0x18191a1b, 
0x1c1d1e1f))); + XmmFinalize(vr, rt); + XmmInvalidate(rt); + c.mov(dword_ptr(*cpu_var, *addr, 0, offsetof(SPUThread, GPR[rt]._u32[0])), 0x00010203); + c.mov(dword_ptr(*cpu_var, *addr, 0, offsetof(SPUThread, GPR[rt]._u32[1])), 0x04050607); + LOG_OPCODE(); } void ROTQBII(u32 rt, u32 ra, s32 i7) { @@ -1526,7 +1575,7 @@ private: { // zero const XmmLink& v0 = XmmAlloc(rt); - c.xorps(v0.get(), v0.get()); + c.pxor(v0.get(), v0.get()); XmmFinalize(v0, rt); } else @@ -1555,7 +1604,7 @@ private: { // zero const XmmLink& v0 = XmmAlloc(rt); - c.xorps(v0.get(), v0.get()); + c.pxor(v0.get(), v0.get()); XmmFinalize(v0, rt); } else @@ -1577,7 +1626,7 @@ private: { // zero const XmmLink& v0 = XmmAlloc(rt); - c.xorps(v0.get(), v0.get()); + c.pxor(v0.get(), v0.get()); XmmFinalize(v0, rt); } else @@ -1601,7 +1650,7 @@ private: { // zero const XmmLink& v0 = XmmAlloc(rt); - c.xorps(v0.get(), v0.get()); + c.pxor(v0.get(), v0.get()); XmmFinalize(v0, rt); } else @@ -1610,11 +1659,11 @@ private: const XmmLink& va = XmmGet(ra, rt); if (const XmmLink* vb = XmmRead(rb)) { - c.xorps(va.get(), vb->read()); + c.pxor(va.get(), vb->read()); } else { - c.xorps(va.get(), cpu_xmm(GPR[rb])); + c.pxor(va.get(), cpu_xmm(GPR[rb])); } XmmFinalize(va, rt); } @@ -1622,24 +1671,75 @@ private: } void CGTH(u32 rt, u32 ra, u32 rb) { - WRAPPER_BEGIN(rt, ra, rb, zz); - for (int h = 0; h < 8; h++) - CPU.GPR[rt]._u16[h] = CPU.GPR[ra]._i16[h] > CPU.GPR[rb]._i16[h] ? 
0xffff : 0; - WRAPPER_END(rt, ra, rb, 0); + if (ra == rb) + { + // zero + const XmmLink& v0 = XmmAlloc(rt); + c.pxor(v0.get(), v0.get()); + XmmFinalize(v0, rt); + } + else + { + const XmmLink& va = XmmGet(ra, rt); + if (const XmmLink* vb = XmmRead(rb)) + { + c.pcmpgtw(va.get(), vb->read()); + } + else + { + c.pcmpgtw(va.get(), cpu_xmm(GPR[rb])); + } + XmmFinalize(va, rt); + } + LOG_OPCODE(); } void EQV(u32 rt, u32 ra, u32 rb) { - WRAPPER_BEGIN(rt, ra, rb, zz); - for (int w = 0; w < 4; w++) - CPU.GPR[rt]._u32[w] = CPU.GPR[ra]._u32[w] ^ (~CPU.GPR[rb]._u32[w]); - WRAPPER_END(rt, ra, rb, 0); + if (ra == rb) + { + const XmmLink& v1 = XmmAlloc(rt); + c.pcmpeqd(v1.get(), v1.get()); + XmmFinalize(v1, rt); + } + else + { + const XmmLink& vb = XmmGet(rb, rt); + c.pxor(vb.get(), XmmConst(_mm_set1_epi32(-1))); + if (const XmmLink* va = XmmRead(ra)) + { + c.pxor(vb.get(), va->read()); + } + else + { + c.pxor(vb.get(), cpu_xmm(GPR[ra])); + } + XmmFinalize(vb, rt); + } + LOG_OPCODE(); } void CGTB(u32 rt, u32 ra, u32 rb) { - WRAPPER_BEGIN(rt, ra, rb, zz); - for (int b = 0; b < 16; b++) - CPU.GPR[rt]._u8[b] = CPU.GPR[ra]._i8[b] > CPU.GPR[rb]._i8[b] ? 0xff : 0; - WRAPPER_END(rt, ra, rb, 0); + if (ra == rb) + { + // zero + const XmmLink& v0 = XmmAlloc(rt); + c.pxor(v0.get(), v0.get()); + XmmFinalize(v0, rt); + } + else + { + const XmmLink& va = XmmGet(ra, rt); + if (const XmmLink* vb = XmmRead(rb)) + { + c.pcmpgtb(va.get(), vb->read()); + } + else + { + c.pcmpgtb(va.get(), cpu_xmm(GPR[rb])); + } + XmmFinalize(va, rt); + } + LOG_OPCODE(); } void SUMB(u32 rt, u32 ra, u32 rb) { @@ -1656,11 +1756,15 @@ private: //HGT uses signed values. 
HLGT uses unsigned values void HGT(u32 rt, s32 ra, s32 rb) { - WRAPPER_BEGIN(rt, ra, rb, zz); - if(CPU.GPR[ra]._i32[3] > CPU.GPR[rb]._i32[3]) CPU.Stop(); - WRAPPER_END(rt, ra, rb, 0); + c.mov(*addr, cpu_dword(GPR[ra]._i32[3])); + c.cmp(*addr, cpu_dword(GPR[rb]._i32[3])); + c.mov(*addr, 0); + c.setg(*addr); + c.neg(*addr); c.mov(*pos_var, (CPU.PC >> 2) + 1); + c.xor_(*pos_var, *addr); do_finalize = true; + LOG_OPCODE(); } void CLZ(u32 rt, u32 ra) { @@ -1679,17 +1783,20 @@ private: } void XSWD(u32 rt, u32 ra) { - WRAPPER_BEGIN(rt, ra, yy, zz); - CPU.GPR[rt]._i64[0] = (s64)CPU.GPR[ra]._i32[0]; - CPU.GPR[rt]._i64[1] = (s64)CPU.GPR[ra]._i32[2]; - WRAPPER_END(rt, ra, 0, 0); + c.movsxd(*qw0, cpu_dword(GPR[ra]._i32[0])); + c.movsxd(*qw1, cpu_dword(GPR[ra]._i32[2])); + c.mov(cpu_qword(GPR[rt]._i64[0]), *qw0); + c.mov(cpu_qword(GPR[rt]._i64[1]), *qw1); + XmmInvalidate(rt); + LOG_OPCODE(); } void XSHW(u32 rt, u32 ra) { - WRAPPER_BEGIN(rt, ra, yy, zz); - for (int w = 0; w < 4; w++) - CPU.GPR[rt]._i32[w] = (s32)CPU.GPR[ra]._i16[w*2]; - WRAPPER_END(rt, ra, 0, 0); + const XmmLink& va = XmmGet(ra, rt); + c.pslld(va.get(), 16); + c.psrad(va.get(), 16); + XmmFinalize(va, rt); + LOG_OPCODE(); } void CNTB(u32 rt, u32 ra) { @@ -1703,10 +1810,11 @@ private: } void XSBH(u32 rt, u32 ra) { - WRAPPER_BEGIN(rt, ra, yy, zz); - for (int h = 0; h < 8; h++) - CPU.GPR[rt]._i16[h] = (s16)CPU.GPR[ra]._i8[h*2]; - WRAPPER_END(rt, ra, 0, 0); + const XmmLink& va = XmmGet(ra, rt); + c.psllw(va.get(), 8); + c.psraw(va.get(), 8); + XmmFinalize(va, rt); + LOG_OPCODE(); } void CLGT(u32 rt, u32 ra, u32 rb) { @@ -1714,7 +1822,7 @@ private: { // zero const XmmLink& v0 = XmmAlloc(rt); - c.xorps(v0.get(), v0.get()); + c.pxor(v0.get(), v0.get()); XmmFinalize(v0, rt); } else @@ -1736,7 +1844,7 @@ private: { // zero const XmmLink& v0 = XmmAlloc(rt); - c.xorps(v0.get(), v0.get()); + c.pxor(v0.get(), v0.get()); XmmFinalize(v0, rt); } else @@ -1745,11 +1853,11 @@ private: const XmmLink& vb = XmmGet(rb, rt); if (const 
XmmLink* va = XmmRead(ra)) { - c.andnps(vb.get(), va->read()); + c.pandn(vb.get(), va->read()); } else { - c.andnps(vb.get(), cpu_xmm(GPR[ra])); + c.pandn(vb.get(), cpu_xmm(GPR[ra])); } XmmFinalize(vb, rt); } @@ -1757,35 +1865,33 @@ private: } void FCGT(u32 rt, u32 ra, u32 rb) { - if (ra == rb) + // reverted less-than + const XmmLink& vb = XmmGet(rb, rt); + if (const XmmLink* va = XmmRead(ra)) { - // zero - const XmmLink& v0 = XmmAlloc(rt); - c.xorps(v0.get(), v0.get()); - XmmFinalize(v0, rt); + c.cmpps(vb.get(), va->read(), 1); } else { - // not-less-or-equal - const XmmLink& va = XmmGet(ra, rt); - if (const XmmLink* vb = XmmRead(rb)) - { - c.cmpps(va.get(), vb->read(), 6); - } - else - { - c.cmpps(va.get(), cpu_xmm(GPR[rb]), 6); - } - XmmFinalize(va, rt); + c.cmpps(vb.get(), cpu_xmm(GPR[ra]), 1); } + XmmFinalize(vb, rt); LOG_OPCODE(); } void DFCGT(u32 rt, u32 ra, u32 rb) { - WRAPPER_BEGIN(rt, ra, rb, zz); - CPU.GPR[rt]._u64[0] = CPU.GPR[ra]._d[0] > CPU.GPR[rb]._d[0] ? 0xffffffffffffffff : 0; - CPU.GPR[rt]._u64[1] = CPU.GPR[ra]._d[1] > CPU.GPR[rb]._d[1] ? 0xffffffffffffffff : 0; - WRAPPER_END(rt, ra, rb, 0);; + // reverted less-than + const XmmLink& vb = XmmGet(rb, rt); + if (const XmmLink* va = XmmRead(ra)) + { + c.cmppd(vb.get(), va->read(), 1); + } + else + { + c.cmppd(vb.get(), cpu_xmm(GPR[ra]), 1); + } + XmmFinalize(vb, rt); + LOG_OPCODE(); } void FA(u32 rt, u32 ra, u32 rb) { @@ -1812,9 +1918,9 @@ private: { if (ra == rb) { - // zero + // zero (?) const XmmLink& v0 = XmmAlloc(rt); - c.xorps(v0.get(), v0.get()); + c.subps(v0.get(), v0.get()); XmmFinalize(v0, rt); } else @@ -1857,111 +1963,251 @@ private: } void CLGTH(u32 rt, u32 ra, u32 rb) { - WRAPPER_BEGIN(rt, ra, rb, zz); - for (int h = 0; h < 8; h++) - CPU.GPR[rt]._u16[h] = CPU.GPR[ra]._u16[h] > CPU.GPR[rb]._u16[h] ? 
0xffff : 0; - WRAPPER_END(rt, ra, rb, 0); + if (ra == rb) + { + // zero + const XmmLink& v0 = XmmAlloc(rt); + c.pxor(v0.get(), v0.get()); + XmmFinalize(v0, rt); + } + else + { + // compare if-greater-than + const XmmLink& va = XmmGet(ra, rt); + const XmmLink& vb = XmmGet(rb); + c.psubw(va.get(), XmmConst(_mm_set1_epi32(0x80008000))); + c.psubw(vb.get(), XmmConst(_mm_set1_epi32(0x80008000))); + c.pcmpgtw(va.get(), vb.get()); + XmmFinalize(va, rt); + XmmFinalize(vb); + } + LOG_OPCODE(); } void ORC(u32 rt, u32 ra, u32 rb) { - WRAPPER_BEGIN(rt, ra, rb, zz); - for (int w = 0; w < 4; w++) - CPU.GPR[rt]._u32[w] = CPU.GPR[ra]._u32[w] | (~CPU.GPR[rb]._u32[w]); - WRAPPER_END(rt, ra, rb, 0); + if (ra == rb) + { + const XmmLink& v1 = XmmAlloc(rt); + c.pcmpeqd(v1.get(), v1.get()); + XmmFinalize(v1, rt); + } + else + { + const XmmLink& vb = XmmGet(rb, rt); + c.pxor(vb.get(), XmmConst(_mm_set1_epi32(-1))); + if (const XmmLink* va = XmmRead(ra)) + { + c.por(vb.get(), va->read()); + } + else + { + c.por(vb.get(), cpu_xmm(GPR[ra])); + } + XmmFinalize(vb, rt); + } + LOG_OPCODE(); } void FCMGT(u32 rt, u32 ra, u32 rb) { - WRAPPER_BEGIN(rt, ra, rb, zz); - CPU.GPR[rt]._u32[0] = fabs(CPU.GPR[ra]._f[0]) > fabs(CPU.GPR[rb]._f[0]) ? 0xffffffff : 0; - CPU.GPR[rt]._u32[1] = fabs(CPU.GPR[ra]._f[1]) > fabs(CPU.GPR[rb]._f[1]) ? 0xffffffff : 0; - CPU.GPR[rt]._u32[2] = fabs(CPU.GPR[ra]._f[2]) > fabs(CPU.GPR[rb]._f[2]) ? 0xffffffff : 0; - CPU.GPR[rt]._u32[3] = fabs(CPU.GPR[ra]._f[3]) > fabs(CPU.GPR[rb]._f[3]) ? 
0xffffffff : 0; - WRAPPER_END(rt, ra, rb, 0); + // reverted less-than + const XmmLink& vb = XmmGet(rb, rt); + const XmmLink& va = XmmGet(ra); + c.andps(vb.get(), XmmConst(_mm_set1_epi32(0x7fffffff))); // abs + c.andps(va.get(), XmmConst(_mm_set1_epi32(0x7fffffff))); // abs + c.cmpps(vb.get(), va.get(), 1); + XmmFinalize(vb, rt); + XmmFinalize(va); + LOG_OPCODE(); } void DFCMGT(u32 rt, u32 ra, u32 rb) { - WRAPPER_BEGIN(rt, ra, rb, zz); - CPU.GPR[rt]._u64[0] = fabs(CPU.GPR[ra]._d[0]) > fabs(CPU.GPR[rb]._d[0]) ? 0xffffffffffffffff : 0; - CPU.GPR[rt]._u64[1] = fabs(CPU.GPR[ra]._d[1]) > fabs(CPU.GPR[rb]._d[1]) ? 0xffffffffffffffff : 0; - WRAPPER_END(rt, ra, rb, 0); + // reverted less-than + const XmmLink& vb = XmmGet(rb, rt); + const XmmLink& va = XmmGet(ra); + c.andpd(vb.get(), XmmConst(_mm_set_epi32(0x7fffffff, 0xffffffff, 0x7fffffff, 0xffffffff))); // abs + c.andpd(va.get(), XmmConst(_mm_set_epi32(0x7fffffff, 0xffffffff, 0x7fffffff, 0xffffffff))); // abs + c.cmppd(vb.get(), va.get(), 1); + XmmFinalize(vb, rt); + XmmFinalize(va); + LOG_OPCODE(); } void DFA(u32 rt, u32 ra, u32 rb) { - WRAPPER_BEGIN(rt, ra, rb, zz); - CPU.GPR[rt]._d[0] = CPU.GPR[ra]._d[0] + CPU.GPR[rb]._d[0]; - CPU.GPR[rt]._d[1] = CPU.GPR[ra]._d[1] + CPU.GPR[rb]._d[1]; - WRAPPER_END(rt, ra, rb, 0); + const XmmLink& va = XmmGet(ra, rt); + if (ra == rb) + { + c.addpd(va.get(), va.get()); + } + else + { + if (const XmmLink* vb = XmmRead(rb)) + { + c.addpd(va.get(), vb->read()); + } + else + { + c.addpd(va.get(), cpu_xmm(GPR[rb])); + } + } + XmmFinalize(va, rt); + LOG_OPCODE(); } void DFS(u32 rt, u32 ra, u32 rb) { - WRAPPER_BEGIN(rt, ra, rb, zz); - CPU.GPR[rt]._d[0] = CPU.GPR[ra]._d[0] - CPU.GPR[rb]._d[0]; - CPU.GPR[rt]._d[1] = CPU.GPR[ra]._d[1] - CPU.GPR[rb]._d[1]; - WRAPPER_END(rt, ra, rb, 0); + if (ra == rb) + { + // zero (?) 
+ const XmmLink& v0 = XmmAlloc(rt); + c.subpd(v0.get(), v0.get()); + XmmFinalize(v0, rt); + } + else + { + const XmmLink& va = XmmGet(ra, rt); + if (const XmmLink* vb = XmmRead(rb)) + { + c.subpd(va.get(), vb->read()); + } + else + { + c.subpd(va.get(), cpu_xmm(GPR[rb])); + } + XmmFinalize(va, rt); + } + LOG_OPCODE(); } void DFM(u32 rt, u32 ra, u32 rb) { - WRAPPER_BEGIN(rt, ra, rb, zz); - CPU.GPR[rt]._d[0] = CPU.GPR[ra]._d[0] * CPU.GPR[rb]._d[0]; - CPU.GPR[rt]._d[1] = CPU.GPR[ra]._d[1] * CPU.GPR[rb]._d[1]; - WRAPPER_END(rt, ra, rb, 0); + if (ra == rb) + { + const XmmLink& va = XmmGet(ra, rt); + c.mulpd(va.get(), va.get()); + XmmFinalize(va, rt); + } + else + { + const XmmLink& va = XmmGet(ra, rt); + if (const XmmLink* vb = XmmRead(rb)) + { + c.mulpd(va.get(), vb->read()); + } + else + { + c.mulpd(va.get(), cpu_xmm(GPR[rb])); + } + XmmFinalize(va, rt); + } + LOG_OPCODE(); } void CLGTB(u32 rt, u32 ra, u32 rb) { - WRAPPER_BEGIN(rt, ra, rb, zz); - for (int b = 0; b < 16; b++) - CPU.GPR[rt]._u8[b] = CPU.GPR[ra]._u8[b] > CPU.GPR[rb]._u8[b] ? 
0xff : 0; - WRAPPER_END(rt, ra, rb, 0); + if (ra == rb) + { + // zero + const XmmLink& v0 = XmmAlloc(rt); + c.pxor(v0.get(), v0.get()); + XmmFinalize(v0, rt); + } + else + { + // compare if-greater-than + const XmmLink& va = XmmGet(ra, rt); + const XmmLink& vb = XmmGet(rb); + c.psubb(va.get(), XmmConst(_mm_set1_epi32(0x80808080))); + c.psubb(vb.get(), XmmConst(_mm_set1_epi32(0x80808080))); + c.pcmpgtb(va.get(), vb.get()); + XmmFinalize(va, rt); + XmmFinalize(vb); + } + LOG_OPCODE(); } void HLGT(u32 rt, u32 ra, u32 rb) { - WRAPPER_BEGIN(rt, ra, rb, zz); - if(CPU.GPR[ra]._u32[3] > CPU.GPR[rb]._u32[3]) CPU.Stop(); - WRAPPER_END(rt, ra, rb, 0); + c.mov(*addr, cpu_dword(GPR[ra]._u32[3])); + c.cmp(*addr, cpu_dword(GPR[rb]._u32[3])); + c.mov(*addr, 0); + c.seta(*addr); + c.neg(*addr); c.mov(*pos_var, (CPU.PC >> 2) + 1); + c.xor_(*pos_var, *addr); do_finalize = true; + LOG_OPCODE(); } void DFMA(u32 rt, u32 ra, u32 rb) { - WRAPPER_BEGIN(rt, ra, rb, zz); - CPU.GPR[rt]._d[0] += CPU.GPR[ra]._d[0] * CPU.GPR[rb]._d[0]; - CPU.GPR[rt]._d[1] += CPU.GPR[ra]._d[1] * CPU.GPR[rb]._d[1]; - WRAPPER_END(rt, ra, rb, 0); + const XmmLink& vr = XmmGet(rt, rt); + const XmmLink& va = XmmGet(ra); + c.mulpd(va.get(), cpu_xmm(GPR[rb])); + c.addpd(vr.get(), va.get()); + XmmFinalize(vr, rt); + XmmFinalize(va); + LOG_OPCODE(); } void DFMS(u32 rt, u32 ra, u32 rb) { - WRAPPER_BEGIN(rt, ra, rb, zz); - CPU.GPR[rt]._d[0] = CPU.GPR[ra]._d[0] * CPU.GPR[rb]._d[0] - CPU.GPR[rt]._d[0]; - CPU.GPR[rt]._d[1] = CPU.GPR[ra]._d[1] * CPU.GPR[rb]._d[1] - CPU.GPR[rt]._d[1]; - WRAPPER_END(rt, ra, rb, 0); + const XmmLink& vr = XmmGet(rt, rt); + const XmmLink& va = XmmGet(ra); + c.mulpd(va.get(), cpu_xmm(GPR[rb])); + c.xorpd(vr.get(), XmmConst(_mm_set_epi32(0x80000000, 0, 0x80000000, 0))); // neg + c.addpd(vr.get(), va.get()); + XmmFinalize(vr, rt); + XmmFinalize(va); + LOG_OPCODE(); } void DFNMS(u32 rt, u32 ra, u32 rb) { - WRAPPER_BEGIN(rt, ra, rb, zz); - CPU.GPR[rt]._d[0] -= CPU.GPR[ra]._d[0] * CPU.GPR[rb]._d[0]; - 
CPU.GPR[rt]._d[1] -= CPU.GPR[ra]._d[1] * CPU.GPR[rb]._d[1]; - WRAPPER_END(rt, ra, rb, 0); + const XmmLink& vr = XmmGet(rt, rt); + const XmmLink& va = XmmGet(ra); + c.mulpd(va.get(), cpu_xmm(GPR[rb])); + c.subpd(vr.get(), va.get()); + XmmFinalize(vr, rt); + XmmFinalize(va); + LOG_OPCODE(); } void DFNMA(u32 rt, u32 ra, u32 rb) { - WRAPPER_BEGIN(rt, ra, rb, zz); - CPU.GPR[rt]._d[0] = -(CPU.GPR[ra]._d[0] * CPU.GPR[rb]._d[0] + CPU.GPR[rt]._d[0]); - CPU.GPR[rt]._d[1] = -(CPU.GPR[ra]._d[1] * CPU.GPR[rb]._d[1] + CPU.GPR[rt]._d[1]); - WRAPPER_END(rt, ra, rb, 0); + const XmmLink& vr = XmmGet(rt, rt); + const XmmLink& va = XmmGet(ra); + c.mulpd(va.get(), cpu_xmm(GPR[rb])); + c.addpd(vr.get(), va.get()); + c.xorpd(vr.get(), XmmConst(_mm_set_epi32(0x80000000, 0, 0x80000000, 0))); // neg + XmmFinalize(vr, rt); + XmmFinalize(va); + LOG_OPCODE(); } void CEQ(u32 rt, u32 ra, u32 rb) { - WRAPPER_BEGIN(rt, ra, rb, zz); - for (int w = 0; w < 4; w++) - CPU.GPR[rt]._u32[w] = CPU.GPR[ra]._i32[w] == CPU.GPR[rb]._i32[w] ? 0xffffffff : 0; - WRAPPER_END(rt, ra, rb, 0); + if (ra == rb) + { + const XmmLink& v1 = XmmAlloc(rt); + c.pcmpeqd(v1.get(), v1.get()); + XmmFinalize(v1, rt); + } + else + { + const XmmLink& va = XmmGet(ra, rt); + if (const XmmLink* vb = XmmRead(rb)) + { + c.pcmpeqd(va.get(), vb->read()); + } + else + { + c.pcmpeqd(va.get(), cpu_xmm(GPR[rb])); + } + XmmFinalize(va, rt); + } + LOG_OPCODE(); } void MPYHHU(u32 rt, u32 ra, u32 rb) { - WRAPPER_BEGIN(rt, ra, rb, zz); - for (int w = 0; w < 4; w++) - CPU.GPR[rt]._u32[w] = CPU.GPR[ra]._u16[w*2+1] * CPU.GPR[rb]._u16[w*2+1]; - WRAPPER_END(rt, ra, rb, 0); + const XmmLink& va = XmmGet(ra, rt); + const XmmLink& vb = (ra == rb) ? 
XmmCopy(va) : XmmGet(rb); + c.psrld(va.get(), 16); + c.psrld(vb.get(), 16); + c.pmulld(va.get(), vb.get()); + XmmFinalize(va, rt); + XmmFinalize(vb); + LOG_OPCODE(); } void ADDX(u32 rt, u32 ra, u32 rb) { @@ -2017,17 +2263,31 @@ private: } void MPYHHA(u32 rt, u32 ra, u32 rb) { - WRAPPER_BEGIN(rt, ra, rb, zz); - for (int w = 0; w < 4; w++) - CPU.GPR[rt]._i32[w] += CPU.GPR[ra]._i16[w*2+1] * CPU.GPR[rb]._i16[w*2+1]; - WRAPPER_END(rt, ra, rb, 0); + const XmmLink& vt = XmmGet(rt, rt); + const XmmLink& va = XmmGet(ra); + const XmmLink& vb = (ra == rb) ? XmmCopy(va) : XmmGet(rb); + c.psrad(va.get(), 16); + c.psrad(vb.get(), 16); + c.pmulld(va.get(), vb.get()); + c.paddd(vt.get(), va.get()); + XmmFinalize(vt, rt); + XmmFinalize(va); + XmmFinalize(vb); + LOG_OPCODE(); } void MPYHHAU(u32 rt, u32 ra, u32 rb) { - WRAPPER_BEGIN(rt, ra, rb, zz); - for (int w = 0; w < 4; w++) - CPU.GPR[rt]._u32[w] += CPU.GPR[ra]._u16[w*2+1] * CPU.GPR[rb]._u16[w*2+1]; - WRAPPER_END(rt, ra, rb, 0); + const XmmLink& vt = XmmGet(rt, rt); + const XmmLink& va = XmmGet(ra); + const XmmLink& vb = (ra == rb) ? XmmCopy(va) : XmmGet(rb); + c.psrld(va.get(), 16); + c.psrld(vb.get(), 16); + c.pmulld(va.get(), vb.get()); + c.paddd(vt.get(), va.get()); + XmmFinalize(vt, rt); + XmmFinalize(va); + XmmFinalize(vb); + LOG_OPCODE(); } //Forced bits to 0, hence the shift: @@ -2115,34 +2375,53 @@ private: } void FCEQ(u32 rt, u32 ra, u32 rb) { - WRAPPER_BEGIN(rt, ra, rb, zz); - CPU.GPR[rt]._u32[0] = CPU.GPR[ra]._f[0] == CPU.GPR[rb]._f[0] ? 0xffffffff : 0; - CPU.GPR[rt]._u32[1] = CPU.GPR[ra]._f[1] == CPU.GPR[rb]._f[1] ? 0xffffffff : 0; - CPU.GPR[rt]._u32[2] = CPU.GPR[ra]._f[2] == CPU.GPR[rb]._f[2] ? 0xffffffff : 0; - CPU.GPR[rt]._u32[3] = CPU.GPR[ra]._f[3] == CPU.GPR[rb]._f[3] ? 
0xffffffff : 0; - WRAPPER_END(rt, ra, rb, 0); + // compare equal + const XmmLink& vb = XmmGet(rb, rt); + if (const XmmLink* va = XmmRead(ra)) + { + c.cmpps(vb.get(), va->read(), 0); + } + else + { + c.cmpps(vb.get(), cpu_xmm(GPR[ra]), 0); + } + XmmFinalize(vb, rt); + LOG_OPCODE(); } void DFCEQ(u32 rt, u32 ra, u32 rb) { - WRAPPER_BEGIN(rt, ra, rb, zz); - CPU.GPR[rt]._u64[0] = CPU.GPR[ra]._d[0] == CPU.GPR[rb]._d[0] ? 0xffffffffffffffff : 0; - CPU.GPR[rt]._u64[1] = CPU.GPR[ra]._d[1] == CPU.GPR[rb]._d[1] ? 0xffffffffffffffff : 0; - WRAPPER_END(rt, ra, rb, 0); + // compare equal + const XmmLink& vb = XmmGet(rb, rt); + if (const XmmLink* va = XmmRead(ra)) + { + c.cmppd(vb.get(), va->read(), 0); + } + else + { + c.cmppd(vb.get(), cpu_xmm(GPR[ra]), 0); + } + XmmFinalize(vb, rt); + LOG_OPCODE(); } void MPY(u32 rt, u32 ra, u32 rb) { - WRAPPER_BEGIN(rt, ra, rb, zz); - for (int w = 0; w < 4; w++) - CPU.GPR[rt]._i32[w] = CPU.GPR[ra]._i16[w*2] * CPU.GPR[rb]._i16[w*2]; - WRAPPER_END(rt, ra, rb, 0); + const XmmLink& va = XmmGet(ra, rt); + const XmmLink& vb = (ra == rb) ? XmmCopy(va) : XmmGet(rb); + c.pslld(va.get(), 16); + c.pslld(vb.get(), 16); + c.psrad(va.get(), 16); + c.psrad(vb.get(), 16); + c.pmulld(va.get(), vb.get()); + XmmFinalize(va, rt); + XmmFinalize(vb); + LOG_OPCODE(); } void MPYH(u32 rt, u32 ra, u32 rb) { const XmmLink& va = XmmGet(ra, rt); const XmmLink& vb = (ra == rb) ? XmmCopy(va) : XmmGet(rb); c.psrld(va.get(), 16); - c.pand(vb.get(), XmmConst(_mm_set1_epi32(0xffff))); - c.pmulld(va.get(), vb.get()); + c.pmullw(va.get(), vb.get()); c.pslld(va.get(), 16); XmmFinalize(va, rt); XmmFinalize(vb); @@ -2150,47 +2429,78 @@ private: } void MPYHH(u32 rt, u32 ra, u32 rb) { - WRAPPER_BEGIN(rt, ra, rb, zz); - for (int w = 0; w < 4; w++) - CPU.GPR[rt]._i32[w] = CPU.GPR[ra]._i16[w*2+1] * CPU.GPR[rb]._i16[w*2+1]; - WRAPPER_END(rt, ra, rb, 0); + const XmmLink& va = XmmGet(ra, rt); + const XmmLink& vb = (ra == rb) ? 
XmmCopy(va) : XmmGet(rb); + c.psrad(va.get(), 16); + c.psrad(vb.get(), 16); + c.pmulld(va.get(), vb.get()); + XmmFinalize(va, rt); + XmmFinalize(vb); + LOG_OPCODE(); } void MPYS(u32 rt, u32 ra, u32 rb) { - WRAPPER_BEGIN(rt, ra, rb, zz); - for (int w = 0; w < 4; w++) - CPU.GPR[rt]._i32[w] = (CPU.GPR[ra]._i16[w*2] * CPU.GPR[rb]._i16[w*2]) >> 16; - WRAPPER_END(rt, ra, rb, 0); + const XmmLink& va = XmmGet(ra, rt); + const XmmLink& vb = (ra == rb) ? XmmCopy(va) : XmmGet(rb); + c.pmulhw(va.get(), vb.get()); + c.pslld(va.get(), 16); + c.psrad(va.get(), 16); + XmmFinalize(va, rt); + XmmFinalize(vb); + LOG_OPCODE(); } void CEQH(u32 rt, u32 ra, u32 rb) { - WRAPPER_BEGIN(rt, ra, rb, zz); - for (int h = 0; h < 8; h++) - CPU.GPR[rt]._u16[h] = CPU.GPR[ra]._u16[h] == CPU.GPR[rb]._u16[h] ? 0xffff : 0; - WRAPPER_END(rt, ra, rb, 0); + if (ra == rb) + { + const XmmLink& v1 = XmmAlloc(rt); + c.pcmpeqw(v1.get(), v1.get()); + XmmFinalize(v1, rt); + } + else + { + const XmmLink& va = XmmGet(ra, rt); + if (const XmmLink* vb = XmmRead(rb)) + { + c.pcmpeqw(va.get(), vb->read()); + } + else + { + c.pcmpeqw(va.get(), cpu_xmm(GPR[rb])); + } + XmmFinalize(va, rt); + } + LOG_OPCODE(); } void FCMEQ(u32 rt, u32 ra, u32 rb) { - WRAPPER_BEGIN(rt, ra, rb, zz); - CPU.GPR[rt]._u32[0] = fabs(CPU.GPR[ra]._f[0]) == fabs(CPU.GPR[rb]._f[0]) ? 0xffffffff : 0; - CPU.GPR[rt]._u32[1] = fabs(CPU.GPR[ra]._f[1]) == fabs(CPU.GPR[rb]._f[1]) ? 0xffffffff : 0; - CPU.GPR[rt]._u32[2] = fabs(CPU.GPR[ra]._f[2]) == fabs(CPU.GPR[rb]._f[2]) ? 0xffffffff : 0; - CPU.GPR[rt]._u32[3] = fabs(CPU.GPR[ra]._f[3]) == fabs(CPU.GPR[rb]._f[3]) ? 
0xffffffff : 0; - WRAPPER_END(rt, ra, rb, 0); + const XmmLink& vb = XmmGet(rb, rt); + const XmmLink& va = XmmGet(ra); + c.andps(vb.get(), XmmConst(_mm_set1_epi32(0x7fffffff))); // abs + c.andps(va.get(), XmmConst(_mm_set1_epi32(0x7fffffff))); // abs + c.cmpps(vb.get(), va.get(), 0); // == + XmmFinalize(vb, rt); + XmmFinalize(va); + LOG_OPCODE(); } void DFCMEQ(u32 rt, u32 ra, u32 rb) { - WRAPPER_BEGIN(rt, ra, rb, zz); - CPU.GPR[rt]._u64[0] = fabs(CPU.GPR[ra]._d[0]) == fabs(CPU.GPR[rb]._d[0]) ? 0xffffffffffffffff : 0; - CPU.GPR[rt]._u64[1] = fabs(CPU.GPR[ra]._d[1]) == fabs(CPU.GPR[rb]._d[1]) ? 0xffffffffffffffff : 0; - WRAPPER_END(rt, ra, rb, 0); + const XmmLink& vb = XmmGet(rb, rt); + const XmmLink& va = XmmGet(ra); + c.andpd(vb.get(), XmmConst(_mm_set_epi32(0x7fffffff, 0xffffffff, 0x7fffffff, 0xffffffff))); // abs + c.andpd(va.get(), XmmConst(_mm_set_epi32(0x7fffffff, 0xffffffff, 0x7fffffff, 0xffffffff))); // abs + c.cmppd(vb.get(), va.get(), 0); // == + XmmFinalize(vb, rt); + XmmFinalize(va); + LOG_OPCODE(); } void MPYU(u32 rt, u32 ra, u32 rb) { const XmmLink& va = XmmGet(ra, rt); if (ra == rb) { - c.pand(va.get(), XmmConst(_mm_set1_epi32(0xffff))); + c.pslld(va.get(), 16); + c.psrld(va.get(), 16); c.pmulld(va.get(), va.get()); } else @@ -2207,10 +2517,26 @@ private: } void CEQB(u32 rt, u32 ra, u32 rb) { - WRAPPER_BEGIN(rt, ra, rb, zz); - for (int b = 0; b < 16; b++) - CPU.GPR[rt]._u8[b] = CPU.GPR[ra]._u8[b] == CPU.GPR[rb]._u8[b] ? 
0xff : 0; - WRAPPER_END(rt, ra, rb, 0); + if (ra == rb) + { + const XmmLink& v1 = XmmAlloc(rt); + c.pcmpeqb(v1.get(), v1.get()); + XmmFinalize(v1, rt); + } + else + { + const XmmLink& va = XmmGet(ra, rt); + if (const XmmLink* vb = XmmRead(rb)) + { + c.pcmpeqb(va.get(), vb->read()); + } + else + { + c.pcmpeqb(va.get(), cpu_xmm(GPR[rb])); + } + XmmFinalize(va, rt); + } + LOG_OPCODE(); } void FI(u32 rt, u32 ra, u32 rb) { @@ -2220,11 +2546,15 @@ private: } void HEQ(u32 rt, u32 ra, u32 rb) { - WRAPPER_BEGIN(rt, ra, rb, zz); - if(CPU.GPR[ra]._i32[3] == CPU.GPR[rb]._i32[3]) CPU.Stop(); - WRAPPER_END(rt, ra, rb, 0); + c.mov(*addr, cpu_dword(GPR[ra]._i32[3])); + c.cmp(*addr, cpu_dword(GPR[rb]._i32[3])); + c.mov(*addr, 0); + c.sete(*addr); + c.neg(*addr); c.mov(*pos_var, (CPU.PC >> 2) + 1); + c.xor_(*pos_var, *addr); do_finalize = true; + LOG_OPCODE(); } //0 - 9 @@ -2433,7 +2763,7 @@ private: { // zero const XmmLink& v0 = XmmAlloc(rt); - c.xorps(v0.get(), v0.get()); + c.pxor(v0.get(), v0.get()); XmmFinalize(v0, rt); } else @@ -2444,7 +2774,7 @@ private: { fsmbi_mask.m128i_i8[j] = ((i16 >> j) & 0x1) ? 
0xff : 0; } - c.movaps(vr.get(), XmmConst(fsmbi_mask)); + c.movdqa(vr.get(), XmmConst(fsmbi_mask)); XmmFinalize(vr, rt); } LOG_OPCODE(); @@ -2482,7 +2812,7 @@ private: const XmmLink& vr = XmmAlloc(rt); if (i16 == 0) { - c.xorps(vr.get(), vr.get()); + c.pxor(vr.get(), vr.get()); } else if (i16 == -1) { @@ -2490,7 +2820,7 @@ private: } else { - c.movaps(vr.get(), XmmConst(_mm_set1_epi32(i16))); + c.movdqa(vr.get(), XmmConst(_mm_set1_epi32(i16))); } XmmFinalize(vr, rt); LOG_OPCODE(); @@ -2500,21 +2830,28 @@ private: const XmmLink& vr = XmmAlloc(rt); if (i16 == 0) { - c.xorps(vr.get(), vr.get()); + c.pxor(vr.get(), vr.get()); } else { - c.movaps(vr.get(), XmmConst(_mm_set1_epi32(i16 << 16))); + c.movdqa(vr.get(), XmmConst(_mm_set1_epi32(i16 << 16))); } XmmFinalize(vr, rt); LOG_OPCODE(); } void ILH(u32 rt, s32 i16) { - WRAPPER_BEGIN(rt, i16, yy, zz); - for (int h = 0; h < 8; h++) - CPU.GPR[rt]._i16[h] = (s32)i16; - WRAPPER_END(rt, i16, 0, 0); + const XmmLink& vr = XmmAlloc(rt); + if (i16 == 0) + { + c.pxor(vr.get(), vr.get()); + } + else + { + c.movdqa(vr.get(), XmmConst(_mm_set1_epi16(i16))); + } + XmmFinalize(vr, rt); + LOG_OPCODE(); } void IOHL(u32 rt, s32 i16) { @@ -2525,7 +2862,7 @@ private: else { const XmmLink& vt = XmmGet(rt, rt); - c.orps(vt.get(), XmmConst(_mm_set1_epi32(i16 & 0xffff))); + c.por(vt.get(), XmmConst(_mm_set1_epi32(i16 & 0xffff))); XmmFinalize(vt, rt); } LOG_OPCODE(); @@ -2555,24 +2892,64 @@ private: else { const XmmLink& va = XmmGet(ra, rt); - c.orps(va.get(), XmmConst(_mm_set1_epi32(i10))); + c.por(va.get(), XmmConst(_mm_set1_epi32(i10))); XmmFinalize(va, rt); } LOG_OPCODE(); } void ORHI(u32 rt, u32 ra, s32 i10) { - WRAPPER_BEGIN(rt, ra, i10, zz); - for (int h = 0; h < 8; h++) - CPU.GPR[rt]._i16[h] = CPU.GPR[ra]._i16[h] | (s32)i10; - WRAPPER_END(rt, ra, i10, 0); + if (i10 == -1) + { + // fill with 1 + const XmmLink& v1 = XmmAlloc(rt); + c.pcmpeqd(v1.get(), v1.get()); + XmmFinalize(v1, rt); + } + else if (i10 == 0) + { + if (rt != ra) + { + // 
mov + const XmmLink& va = XmmGet(ra, rt); + XmmFinalize(va, rt); + } + // else nop + } + else + { + const XmmLink& va = XmmGet(ra, rt); + c.por(va.get(), XmmConst(_mm_set1_epi16(i10))); + XmmFinalize(va, rt); + } + LOG_OPCODE(); } void ORBI(u32 rt, u32 ra, s32 i10) { - WRAPPER_BEGIN(rt, ra, i10, zz); - for (int b = 0; b < 16; b++) - CPU.GPR[rt]._i8[b] = CPU.GPR[ra]._i8[b] | (s32)i10; - WRAPPER_END(rt, ra, i10, 0); + if (i10 == -1) + { + // fill with 1 + const XmmLink& v1 = XmmAlloc(rt); + c.pcmpeqd(v1.get(), v1.get()); + XmmFinalize(v1, rt); + } + else if (i10 == 0) + { + if (rt != ra) + { + // mov + const XmmLink& va = XmmGet(ra, rt); + XmmFinalize(va, rt); + } + // else nop + } + else + { + const XmmLink& va = XmmGet(ra, rt); + c.por(va.get(), XmmConst(_mm_set1_epi8(i10))); + XmmFinalize(va, rt); + } + LOG_OPCODE(); } void SFI(u32 rt, u32 ra, s32 i10) { @@ -2580,7 +2957,7 @@ private: { // zero const XmmLink& v0 = XmmAlloc(rt); - c.xorps(v0.get(), v0.get()); + c.pxor(v0.get(), v0.get()); c.psubd(v0.get(), cpu_xmm(GPR[ra])); XmmFinalize(v0, rt); } @@ -2603,10 +2980,30 @@ private: } void SFHI(u32 rt, u32 ra, s32 i10) { - WRAPPER_BEGIN(rt, ra, i10, zz); - for (int h = 0; h < 8; h++) - CPU.GPR[rt]._i16[h] = (s32)i10 - CPU.GPR[ra]._i16[h]; - WRAPPER_END(rt, ra, i10, 0); + if (i10 == 0) + { + // zero + const XmmLink& v0 = XmmAlloc(rt); + c.pxor(v0.get(), v0.get()); + c.psubw(v0.get(), cpu_xmm(GPR[ra])); + XmmFinalize(v0, rt); + } + else if (i10 == -1) + { + // fill with 1 + const XmmLink& v1 = XmmAlloc(rt); + c.pcmpeqw(v1.get(), v1.get()); + c.psubw(v1.get(), cpu_xmm(GPR[ra])); + XmmFinalize(v1, rt); + } + else + { + const XmmLink& vr = XmmAlloc(rt); + c.movdqa(vr.get(), XmmConst(_mm_set1_epi16(i10))); + c.psubw(vr.get(), cpu_xmm(GPR[ra])); + XmmFinalize(vr, rt); + } + LOG_OPCODE(); } void ANDI(u32 rt, u32 ra, s32 i10) { @@ -2614,7 +3011,7 @@ private: { // zero const XmmLink& v0 = XmmAlloc(rt); - c.xorps(v0.get(), v0.get()); + c.pxor(v0.get(), v0.get()); XmmFinalize(v0, 
rt); } else if (i10 == -1) @@ -2630,24 +3027,64 @@ private: else { const XmmLink& va = XmmGet(ra, rt); - c.andps(va.get(), XmmConst(_mm_set1_epi32(i10))); + c.pand(va.get(), XmmConst(_mm_set1_epi32(i10))); XmmFinalize(va, rt); } LOG_OPCODE(); } void ANDHI(u32 rt, u32 ra, s32 i10) { - WRAPPER_BEGIN(rt, ra, i10, zz); - for (int h = 0; h < 8; h++) - CPU.GPR[rt]._i16[h] = CPU.GPR[ra]._i16[h] & (s32)i10; - WRAPPER_END(rt, ra, i10, 0); + if (i10 == 0) + { + // zero + const XmmLink& v0 = XmmAlloc(rt); + c.pxor(v0.get(), v0.get()); + XmmFinalize(v0, rt); + } + else if (i10 == -1) + { + // mov + if (ra != rt) + { + const XmmLink& va = XmmGet(ra, rt); + XmmFinalize(va, rt); + } + // else nop + } + else + { + const XmmLink& va = XmmGet(ra, rt); + c.pand(va.get(), XmmConst(_mm_set1_epi16(i10))); + XmmFinalize(va, rt); + } + LOG_OPCODE(); } void ANDBI(u32 rt, u32 ra, s32 i10) { - WRAPPER_BEGIN(rt, ra, i10, zz); - for (int b = 0; b < 16; b++) - CPU.GPR[rt]._i8[b] = CPU.GPR[ra]._i8[b] & (s32)i10; - WRAPPER_END(rt, ra, i10, 0); + if (i10 == 0) + { + // zero + const XmmLink& v0 = XmmAlloc(rt); + c.pxor(v0.get(), v0.get()); + XmmFinalize(v0, rt); + } + else if (i10 == -1) + { + // mov + if (ra != rt) + { + const XmmLink& va = XmmGet(ra, rt); + XmmFinalize(va, rt); + } + // else nop + } + else + { + const XmmLink& va = XmmGet(ra, rt); + c.pand(va.get(), XmmConst(_mm_set1_epi8(i10))); + XmmFinalize(va, rt); + } + LOG_OPCODE(); } void AI(u32 rt, u32 ra, s32 i10) { @@ -2672,10 +3109,24 @@ private: } void AHI(u32 rt, u32 ra, s32 i10) { - WRAPPER_BEGIN(rt, ra, i10, zz); - for(u32 h = 0; h < 8; ++h) - CPU.GPR[rt]._i16[h] = CPU.GPR[ra]._i16[h] + (s32)i10; - WRAPPER_END(rt, ra, i10, 0); + if (i10 == 0) + { + if (rt != ra) + { + // mov + const XmmLink& va = XmmGet(ra, rt); + XmmFinalize(va, rt); + } + // else nop + } + else + { + // add + const XmmLink& va = XmmGet(ra, rt); + c.paddw(va.get(), XmmConst(_mm_set1_epi16(i10))); + XmmFinalize(va, rt); + } + LOG_OPCODE(); } void STQD(u32 rt, s32 
i10, u32 ra) // i10 is shifted left by 4 while decoding { @@ -2707,24 +3158,24 @@ private: } void XORI(u32 rt, u32 ra, s32 i10) { - WRAPPER_BEGIN(rt, ra, i10, zz); - for (int w = 0; w < 4; w++) - CPU.GPR[rt]._i32[w] = CPU.GPR[ra]._i32[w] ^ (s32)i10; - WRAPPER_END(rt, ra, i10, 0); + const XmmLink& va = XmmGet(ra); + c.pxor(va.get(), XmmConst(_mm_set1_epi32(i10))); + XmmFinalize(va, rt); + LOG_OPCODE(); } void XORHI(u32 rt, u32 ra, s32 i10) { - WRAPPER_BEGIN(rt, ra, i10, zz); - for (int h = 0; h < 8; h++) - CPU.GPR[rt]._i16[h] = CPU.GPR[ra]._i16[h] ^ (s32)i10; - WRAPPER_END(rt, ra, i10, 0); + const XmmLink& va = XmmGet(ra); + c.pxor(va.get(), XmmConst(_mm_set1_epi16(i10))); + XmmFinalize(va, rt); + LOG_OPCODE(); } void XORBI(u32 rt, u32 ra, s32 i10) { - WRAPPER_BEGIN(rt, ra, i10, zz); - for (int b = 0; b < 16; b++) - CPU.GPR[rt]._i8[b] = CPU.GPR[ra]._i8[b] ^ (s32)i10; - WRAPPER_END(rt, ra, i10, 0); + const XmmLink& va = XmmGet(ra); + c.pxor(va.get(), XmmConst(_mm_set1_epi8(i10))); + XmmFinalize(va, rt); + LOG_OPCODE(); } void CGTI(u32 rt, u32 ra, s32 i10) { @@ -2735,33 +3186,37 @@ private: } void CGTHI(u32 rt, u32 ra, s32 i10) { - WRAPPER_BEGIN(rt, ra, i10, zz); - for (int h = 0; h < 8; h++) - CPU.GPR[rt]._u16[h] = CPU.GPR[ra]._i16[h] > (s32)i10 ? 0xffff : 0; - WRAPPER_END(rt, ra, i10, 0); + const XmmLink& va = XmmGet(ra); + c.pcmpgtw(va.get(), XmmConst(_mm_set1_epi16(i10))); + XmmFinalize(va, rt); + LOG_OPCODE(); } void CGTBI(u32 rt, u32 ra, s32 i10) { - WRAPPER_BEGIN(rt, ra, i10, zz); - for (int b = 0; b < 16; b++) - CPU.GPR[rt]._u8[b] = CPU.GPR[ra]._i8[b] > (s8)(i10 & 0xff) ? 
0xff : 0; - WRAPPER_END(rt, ra, i10, 0); + const XmmLink& va = XmmGet(ra); + c.pcmpgtb(va.get(), XmmConst(_mm_set1_epi8(i10))); + XmmFinalize(va, rt); + LOG_OPCODE(); } void HGTI(u32 rt, u32 ra, s32 i10) { - WRAPPER_BEGIN(rt, ra, i10, zz); - if(CPU.GPR[ra]._i32[3] > (s32)i10) CPU.Stop(); - WRAPPER_END(rt, ra, i10, 0); + c.mov(*addr, cpu_dword(GPR[ra]._i32[3])); + c.cmp(*addr, i10); + c.mov(*addr, 0); + c.setg(*addr); + c.neg(*addr); c.mov(*pos_var, (CPU.PC >> 2) + 1); + c.xor_(*pos_var, *addr); do_finalize = true; + LOG_OPCODE(); } void CLGTI(u32 rt, u32 ra, s32 i10) { if (i10 == -1) { - // zero result + // zero const XmmLink& v0 = XmmAlloc(rt); - c.xorps(v0.get(), v0.get()); + c.pxor(v0.get(), v0.get()); XmmFinalize(v0, rt); } else @@ -2775,41 +3230,69 @@ private: } void CLGTHI(u32 rt, u32 ra, s32 i10) { - WRAPPER_BEGIN(rt, ra, i10, zz); - for(u32 i = 0; i < 8; ++i) + if (i10 == -1) { - CPU.GPR[rt]._u16[i] = (CPU.GPR[ra]._u16[i] > (u16)i10) ? 0xffff : 0x0000; + // zero + const XmmLink& v0 = XmmAlloc(rt); + c.pxor(v0.get(), v0.get()); + XmmFinalize(v0, rt); } - WRAPPER_END(rt, ra, i10, 0); + else + { + const XmmLink& va = XmmGet(ra); + c.psubw(va.get(), XmmConst(_mm_set1_epi16((u16)0x8000))); + c.pcmpgtw(va.get(), XmmConst(_mm_set1_epi16((u16)i10 - 0x8000))); + XmmFinalize(va, rt); + } + LOG_OPCODE(); } void CLGTBI(u32 rt, u32 ra, s32 i10) { - WRAPPER_BEGIN(rt, ra, i10, zz); - for (int b = 0; b < 16; b++) - CPU.GPR[rt]._u8[b] = CPU.GPR[ra]._u8[b] > (u8)(i10 & 0xff) ? 
0xff : 0; - WRAPPER_END(rt, ra, i10, 0); + if (i10 == -1) + { + // zero + const XmmLink& v0 = XmmAlloc(rt); + c.pxor(v0.get(), v0.get()); + XmmFinalize(v0, rt); + } + else + { + const XmmLink& va = XmmGet(ra); + c.psubb(va.get(), XmmConst(_mm_set1_epi8((s8)0x80))); + c.pcmpgtb(va.get(), XmmConst(_mm_set1_epi8((s8)i10 - 0x80))); + XmmFinalize(va, rt); + } + LOG_OPCODE(); } void HLGTI(u32 rt, u32 ra, s32 i10) { - WRAPPER_BEGIN(rt, ra, i10, zz); - if(CPU.GPR[ra]._u32[3] > (u32)i10) CPU.Stop(); - WRAPPER_END(rt, ra, i10, 0); + c.mov(*addr, cpu_dword(GPR[ra]._u32[3])); + c.cmp(*addr, i10); + c.mov(*addr, 0); + c.seta(*addr); + c.neg(*addr); c.mov(*pos_var, (CPU.PC >> 2) + 1); + c.xor_(*pos_var, *addr); do_finalize = true; + LOG_OPCODE(); } void MPYI(u32 rt, u32 ra, s32 i10) { - WRAPPER_BEGIN(rt, ra, i10, zz); - for (int w = 0; w < 4; w++) - CPU.GPR[rt]._i32[w] = CPU.GPR[ra]._i16[w*2] * (s32)i10; - WRAPPER_END(rt, ra, i10, 0); + const XmmLink& va = XmmGet(ra, rt); + c.pslld(va.get(), 16); + c.psrad(va.get(), 16); + c.pmulld(va.get(), XmmConst(_mm_set1_epi32(i10))); + XmmFinalize(va, rt); + LOG_OPCODE(); } void MPYUI(u32 rt, u32 ra, s32 i10) { - WRAPPER_BEGIN(rt, ra, i10, zz); - for (int w = 0; w < 4; w++) - CPU.GPR[rt]._u32[w] = CPU.GPR[ra]._u16[w*2] * (u16)(i10 & 0xffff); - WRAPPER_END(rt, ra, i10, 0); + const XmmLink& va = XmmGet(ra, rt); + c.pslld(va.get(), 16); + c.psrld(va.get(), 16); + c.pmulld(va.get(), XmmConst(_mm_set1_epi32(i10 & 0xffff))); + XmmFinalize(va, rt); + LOG_OPCODE(); } void CEQI(u32 rt, u32 ra, s32 i10) { @@ -2820,25 +3303,29 @@ private: } void CEQHI(u32 rt, u32 ra, s32 i10) { - WRAPPER_BEGIN(rt, ra, i10, zz); - for (int h = 0; h < 8; h++) - CPU.GPR[rt]._u16[h] = (CPU.GPR[ra]._i16[h] == (s16)(s32)i10) ? 
0xffff : 0; - WRAPPER_END(rt, ra, i10, 0); + const XmmLink& va = XmmGet(ra); + c.pcmpeqw(va.get(), XmmConst(_mm_set1_epi16((s16)i10))); + XmmFinalize(va, rt); + LOG_OPCODE(); } void CEQBI(u32 rt, u32 ra, s32 i10) { - WRAPPER_BEGIN(rt, ra, i10, zz); - for (int b = 0; b < 16; b++) - CPU.GPR[rt]._i8[b] = (CPU.GPR[ra]._i8[b] == (s8)(i10 & 0xff)) ? 0xff : 0; - WRAPPER_END(rt, ra, i10, 0); + const XmmLink& va = XmmGet(ra); + c.pcmpeqb(va.get(), XmmConst(_mm_set1_epi8((s8)i10))); + XmmFinalize(va, rt); + LOG_OPCODE(); } void HEQI(u32 rt, u32 ra, s32 i10) { - WRAPPER_BEGIN(rt, ra, i10, zz); - if(CPU.GPR[ra]._i32[3] == (s32)i10) CPU.Stop(); - WRAPPER_END(rt, ra, i10, 0); + c.mov(*addr, cpu_dword(GPR[ra]._u32[3])); + c.cmp(*addr, i10); + c.mov(*addr, 0); + c.sete(*addr); + c.neg(*addr); c.mov(*pos_var, (CPU.PC >> 2) + 1); + c.xor_(*pos_var, *addr); do_finalize = true; + LOG_OPCODE(); } @@ -2856,11 +3343,11 @@ private: const XmmLink& vr = XmmAlloc(rt); if (i18 == 0) { - c.xorps(vr.get(), vr.get()); + c.pxor(vr.get(), vr.get()); } else { - c.movaps(vr.get(), XmmConst(_mm_set1_epi32(i18 & 0x3ffff))); + c.movdqa(vr.get(), XmmConst(_mm_set1_epi32(i18 & 0x3ffff))); } XmmFinalize(vr, rt); LOG_OPCODE(); @@ -2871,9 +3358,9 @@ private: { const XmmLink& vb = XmmGet(rb); const XmmLink& vc = XmmGet(rc); - c.andps(vb.get(), vc.get()); - c.andnps(vc.get(), cpu_xmm(GPR[ra])); - c.orps(vb.get(), vc.get()); + c.pand(vb.get(), vc.get()); + c.pandn(vc.get(), cpu_xmm(GPR[ra])); + c.por(vb.get(), vc.get()); XmmFinalize(vb, rt); XmmFinalize(vc); LOG_OPCODE(); diff --git a/rpcs3/Emu/Cell/SPURecompilerCore.cpp b/rpcs3/Emu/Cell/SPURecompilerCore.cpp index f1a5801950..ae498fd2e4 100644 --- a/rpcs3/Emu/Cell/SPURecompilerCore.cpp +++ b/rpcs3/Emu/Cell/SPURecompilerCore.cpp @@ -214,8 +214,14 @@ u8 SPURecompilerCore::DecodeMemory(const u64 address) } } - u16 res = pos; - res = (u16)func(cpu, &Memory[m_offset], imm_table.data(), res); + u32 res = pos; + res = func(cpu, &Memory[m_offset], imm_table.data(), 
res); + + if (res > 0xffff) + { + CPU.Stop(); + res = ~res; + } if (did_compile) { From e1bbedd4bfbc7773538e51c9d46afcfad10f73be Mon Sep 17 00:00:00 2001 From: Nekotekina Date: Sun, 20 Apr 2014 02:53:42 +0400 Subject: [PATCH 13/14] Accuracy improved --- rpcs3/Emu/Cell/SPUInterpreter.h | 9 +- rpcs3/Emu/Cell/SPURecompiler.h | 145 ++++++++++----------------- rpcs3/Emu/Cell/SPURecompilerCore.cpp | 4 +- 3 files changed, 61 insertions(+), 97 deletions(-) diff --git a/rpcs3/Emu/Cell/SPUInterpreter.h b/rpcs3/Emu/Cell/SPUInterpreter.h index e93e1e1fa7..8522a34889 100644 --- a/rpcs3/Emu/Cell/SPUInterpreter.h +++ b/rpcs3/Emu/Cell/SPUInterpreter.h @@ -1034,9 +1034,13 @@ private: CPU.GPR[rt]._u32[i] = (CPU.GPR[ra]._u32[i] & 0x807fffff) | (exp << 23); - CPU.GPR[rt]._u32[i] = (u32)CPU.GPR[rt]._f[i]; //trunc + if (CPU.GPR[rt]._f[i] > 0x7fffffff) + CPU.GPR[rt]._u32[i] = 0x7fffffff; + else if (CPU.GPR[rt]._f[i] < -pow(2, 31)) + CPU.GPR[rt]._u32[i] = 0x80000000; + else + CPU.GPR[rt]._i32[i] = (s32)CPU.GPR[rt]._f[i]; //trunc } - //CPU.GPR[rt]._m128i = _mm_cvttps_epi32(CPU.GPR[rt]._m128); } void CFLTU(u32 rt, u32 ra, s32 i8) { @@ -1063,7 +1067,6 @@ private: } void CSFLT(u32 rt, u32 ra, s32 i8) { - //CPU.GPR[rt]._m128 = _mm_cvtepi32_ps(CPU.GPR[ra]._m128i); const u32 scale = 155 - (i8 & 0xff); //unsigned immediate for (int i = 0; i < 4; i++) { diff --git a/rpcs3/Emu/Cell/SPURecompiler.h b/rpcs3/Emu/Cell/SPURecompiler.h index 51c909b575..5d6e3ca906 100644 --- a/rpcs3/Emu/Cell/SPURecompiler.h +++ b/rpcs3/Emu/Cell/SPURecompiler.h @@ -605,29 +605,15 @@ private: } void SHL(u32 rt, u32 ra, u32 rb) { - WRAPPER_BEGIN(rt, ra, rb, zz); - CPU.GPR[rt]._u32[0] = (CPU.GPR[rb]._u32[0] & 0x3f) > 31 ? 0 : CPU.GPR[ra]._u32[0] << (CPU.GPR[rb]._u32[0] & 0x3f); - CPU.GPR[rt]._u32[1] = (CPU.GPR[rb]._u32[1] & 0x3f) > 31 ? 0 : CPU.GPR[ra]._u32[1] << (CPU.GPR[rb]._u32[1] & 0x3f); - CPU.GPR[rt]._u32[2] = (CPU.GPR[rb]._u32[2] & 0x3f) > 31 ? 
0 : CPU.GPR[ra]._u32[2] << (CPU.GPR[rb]._u32[2] & 0x3f); - CPU.GPR[rt]._u32[3] = (CPU.GPR[rb]._u32[3] & 0x3f) > 31 ? 0 : CPU.GPR[ra]._u32[3] << (CPU.GPR[rb]._u32[3] & 0x3f); - WRAPPER_END(rt, ra, rb, 0); - - // AVX2: masking with 0x3f + VPSLLVD may be better - /*XmmInvalidate(rt); + XmmInvalidate(rt); for (u32 i = 0; i < 4; i++) { - GpVar v0(c, kVarTypeUInt32); - c.mov(v0, cpu_dword(GPR[ra]._u32[i])); - GpVar shift(c, kVarTypeUInt32); - c.mov(shift, cpu_dword(GPR[rb]._u32[i])); - GpVar z(c); - c.xor_(z, z); - c.test(shift, 0x20); - c.cmovnz(v0, z); - c.shl(v0, shift); - c.mov(cpu_dword(GPR[rt]._u32[i]), v0); + c.mov(qw0->r32(), cpu_dword(GPR[ra]._u32[i])); + c.mov(*addr, cpu_dword(GPR[rb]._u32[i])); + c.shl(*qw0, *addr); + c.mov(cpu_dword(GPR[rt]._u32[i]), qw0->r32()); } - LOG_OPCODE();*/ + LOG_OPCODE(); } void ROTH(u32 rt, u32 ra, u32 rb) { @@ -2289,30 +2275,25 @@ private: XmmFinalize(vb); LOG_OPCODE(); } - //Forced bits to 0, hence the shift: - void FSCRRD(u32 rt) { - /*CPU.GPR[rt]._u128.lo = - CPU.FPSCR.Exception0 << 20 & - CPU.FPSCR.*/ UNIMPLEMENTED(); } void FESD(u32 rt, u32 ra) { - WRAPPER_BEGIN(rt, ra, yy, zz); - CPU.GPR[rt]._d[0] = (double)CPU.GPR[ra]._f[1]; - CPU.GPR[rt]._d[1] = (double)CPU.GPR[ra]._f[3]; - WRAPPER_END(rt, ra, 0, 0); + const XmmLink& va = XmmGet(ra, rt); + c.shufps(va.get(), va.get(), 0x8d); // _f[0] = _f[1]; _f[1] = _f[3]; + c.cvtps2pd(va.get(), va.get()); + XmmFinalize(va, rt); + LOG_OPCODE(); } void FRDS(u32 rt, u32 ra) { - WRAPPER_BEGIN(rt, ra, yy, zz); - CPU.GPR[rt]._f[1] = (float)CPU.GPR[ra]._d[0]; - CPU.GPR[rt]._u32[0] = 0x00000000; - CPU.GPR[rt]._f[3] = (float)CPU.GPR[ra]._d[1]; - CPU.GPR[rt]._u32[2] = 0x00000000; - WRAPPER_END(rt, ra, 0, 0); + const XmmLink& va = XmmGet(ra, rt); + c.cvtpd2ps(va.get(), va.get()); + c.shufps(va.get(), va.get(), 0x72); // _f[1] = _f[0]; _f[3] = _f[1]; _f[0] = _f[2] = 0; + XmmFinalize(va, rt); + LOG_OPCODE(); } void FSCRWR(u32 rt, u32 ra) { @@ -2565,46 +2546,29 @@ private: { c.mulps(va.get(), 
XmmConst(_mm_set1_ps(pow(2, 173 - (i8 & 0xff))))); // scale } + c.maxps(va.get(), XmmConst(_mm_set1_ps(-pow(2, 31)))); // saturate + c.minps(va.get(), XmmConst(_mm_set1_ps((float)0x7fffffff))); c.cvttps2dq(va.get(), va.get()); // convert to ints with truncation XmmFinalize(va, rt); LOG_OPCODE(); } void CFLTU(u32 rt, u32 ra, s32 i8) { - WRAPPER_BEGIN(rt, ra, i8, zz); - const u32 scale = 173 - (i8 & 0xff); //unsigned immediate - for (int i = 0; i < 4; i++) - { - u32 exp = ((CPU.GPR[ra]._u32[i] >> 23) & 0xff) + scale; - - if (exp > 255) - exp = 255; - - if (CPU.GPR[ra]._u32[i] & 0x80000000) //if negative, result = 0 - CPU.GPR[rt]._u32[i] = 0; - else - { - CPU.GPR[rt]._u32[i] = (CPU.GPR[ra]._u32[i] & 0x807fffff) | (exp << 23); - - if (CPU.GPR[rt]._f[i] > 0xffffffff) //if big, result = max - CPU.GPR[rt]._u32[i] = 0xffffffff; - else - CPU.GPR[rt]._u32[i] = floor(CPU.GPR[rt]._f[i]); - } - } - WRAPPER_END(rt, ra, i8, 0); - - /*XmmVar v0(c); - c.movaps(v0, cpu_xmm(GPR[ra])); + const XmmLink& va = XmmGet(ra, rt); if (i8 != 173) { - c.mulps(v0, XmmConst(_mm_set1_ps(pow(2, 173 - (i8 & 0xff))))); // scale + c.mulps(va.get(), XmmConst(_mm_set1_ps(pow(2, 173 - (i8 & 0xff))))); // scale } - // TODO: handle negative values and convert to unsigned value - // c.int3(); - c.cvtps2dq(v0, v0); // convert to signed ints - c.movdqa(cpu_xmm(GPR[rt]), v0); - LOG_OPCODE();*/ + c.maxps(va.get(), XmmConst(_mm_set1_ps(0.0f))); // saturate + c.minps(va.get(), XmmConst(_mm_set1_ps((float)0xffffffff))); + const XmmLink& v1 = XmmCopy(va); + c.cmpps(v1.get(), XmmConst(_mm_set1_ps(pow(2, 31))), 5); // generate mask of big values + c.andps(v1.get(), XmmConst(_mm_set1_ps(pow(2, 32)))); // generate correction component + c.subps(va.get(), v1.get()); // subtract correction component + c.cvttps2dq(va.get(), va.get()); // convert to ints with truncation + XmmFinalize(va, rt); + XmmFinalize(v1); + LOG_OPCODE(); } void CSFLT(u32 rt, u32 ra, s32 i8) { @@ -2619,31 +2583,19 @@ private: } void CUFLT(u32 rt, u32 
ra, s32 i8) { - WRAPPER_BEGIN(rt, ra, i8, zz); - const u32 scale = 155 - (i8 & 0xff); //unsigned immediate - for (int i = 0; i < 4; i++) - { - CPU.GPR[rt]._f[i] = (float)CPU.GPR[ra]._u32[i]; - u32 exp = ((CPU.GPR[rt]._u32[i] >> 23) & 0xff) - scale; - - if (exp > 255) //< 0 - exp = 0; - - CPU.GPR[rt]._u32[i] = (CPU.GPR[rt]._u32[i] & 0x807fffff) | (exp << 23); - } - WRAPPER_END(rt, ra, i8, 0); - - /*XmmVar v0(c); - c.movdqa(v0, cpu_xmm(GPR[ra])); - // TODO: convert from unsigned value - // c.int3(); - c.cvtdq2ps(v0, v0); // convert to floats as signed + const XmmLink& va = XmmGet(ra, rt); + const XmmLink& v1 = XmmCopy(va); + c.cvtdq2ps(va.get(), va.get()); // convert to floats + c.psrad(v1.get(), 32); // generate mask from sign bit + c.andps(v1.get(), XmmConst(_mm_set1_ps(pow(2, 32)))); // generate correction component + c.addps(va.get(), v1.get()); // add correction component if (i8 != 155) { - c.mulps(v0, XmmConst(_mm_set1_ps(pow(2, (i8 & 0xff) - 155)))); // scale + c.mulps(va.get(), XmmConst(_mm_set1_ps(pow(2, (i8 & 0xff) - 155)))); // scale } - c.movaps(cpu_xmm(GPR[rt]), v0); - LOG_OPCODE();*/ + XmmFinalize(va, rt); + XmmFinalize(v1); + LOG_OPCODE(); } //0 - 8 @@ -3438,10 +3390,17 @@ private: } void MPYA(u32 rt, u32 ra, u32 rb, u32 rc) { - WRAPPER_BEGIN(rt, ra, rb, rc); - for (int w = 0; w < 4; w++) - CPU.GPR[rt]._i32[w] = CPU.GPR[ra]._i16[w*2] * CPU.GPR[rb]._i16[w*2] + CPU.GPR[rc]._i32[w]; - WRAPPER_END(rt, ra, rb, rc); + const XmmLink& va = XmmGet(ra, rt); + const XmmLink& vb = XmmGet(rb); + c.pslld(va.get(), 16); + c.pslld(vb.get(), 16); + c.psrad(va.get(), 16); + c.psrad(vb.get(), 16); + c.pmulld(va.get(), vb.get()); + c.paddd(va.get(), cpu_xmm(GPR[rc])); + XmmFinalize(va, rt); + XmmFinalize(vb); + LOG_OPCODE(); } void FNMS(u32 rt, u32 ra, u32 rb, u32 rc) { @@ -3655,6 +3614,8 @@ private: do_finalize = true; Emu.Pause(); } + + }; #undef c \ No newline at end of file diff --git a/rpcs3/Emu/Cell/SPURecompilerCore.cpp b/rpcs3/Emu/Cell/SPURecompilerCore.cpp index 
ae498fd2e4..5a6cd5e880 100644 --- a/rpcs3/Emu/Cell/SPURecompilerCore.cpp +++ b/rpcs3/Emu/Cell/SPURecompilerCore.cpp @@ -160,14 +160,14 @@ u8 SPURecompilerCore::DecodeMemory(const u64 address) { // check data (hard way) bool is_valid = true; - for (u32 i = pos; i < (u32)(entry[pos].count + pos); i++) + /*for (u32 i = pos; i < (u32)(entry[pos].count + pos); i++) { if (entry[i].valid != ls[i]) { is_valid = false; break; } - } + }*/ // invalidate if necessary if (!is_valid) { From 525084e7ccb12b8441cad75b5c3e6f2e7231249f Mon Sep 17 00:00:00 2001 From: Nekotekina Date: Sun, 20 Apr 2014 23:36:53 +0400 Subject: [PATCH 14/14] Some stuff --- rpcs3/Emu/Cell/PPUInterpreter.h | 68 ++-- rpcs3/Emu/Cell/SPURecompiler.h | 491 +++++++++++++++++---------- rpcs3/Emu/Cell/SPURecompilerCore.cpp | 17 +- 3 files changed, 366 insertions(+), 210 deletions(-) diff --git a/rpcs3/Emu/Cell/PPUInterpreter.h b/rpcs3/Emu/Cell/PPUInterpreter.h index 6613a70109..c049a7e95a 100644 --- a/rpcs3/Emu/Cell/PPUInterpreter.h +++ b/rpcs3/Emu/Cell/PPUInterpreter.h @@ -181,7 +181,7 @@ private: CPU.VSCR.VSCR = CPU.VPR[vb]._u32[0]; CPU.VSCR.X = CPU.VSCR.Y = 0; } - void VADDCUW(u32 vd, u32 va, u32 vb) + void VADDCUW(u32 vd, u32 va, u32 vb) //nf { for (uint w = 0; w < 4; w++) { @@ -195,7 +195,7 @@ private: CPU.VPR[vd]._f[w] = CPU.VPR[va]._f[w] + CPU.VPR[vb]._f[w]; } } - void VADDSBS(u32 vd, u32 va, u32 vb) + void VADDSBS(u32 vd, u32 va, u32 vb) //nf { for(u32 b=0; b<16; ++b) { @@ -235,7 +235,7 @@ private: CPU.VPR[vd]._s16[h] = result; } } - void VADDSWS(u32 vd, u32 va, u32 vb) + void VADDSWS(u32 vd, u32 va, u32 vb) //nf { for (uint w = 0; w < 4; w++) { @@ -335,21 +335,21 @@ private: CPU.VPR[vd]._u32[w] = CPU.VPR[va]._u32[w] & (~CPU.VPR[vb]._u32[w]); } } - void VAVGSB(u32 vd, u32 va, u32 vb) + void VAVGSB(u32 vd, u32 va, u32 vb) //nf { for (uint b = 0; b < 16; b++) { CPU.VPR[vd]._s8[b] = (CPU.VPR[va]._s8[b] + CPU.VPR[vb]._s8[b] + 1) >> 1; } } - void VAVGSH(u32 vd, u32 va, u32 vb) + void VAVGSH(u32 vd, u32 va, u32 
vb) //nf { for (uint h = 0; h < 8; h++) { CPU.VPR[vd]._s16[h] = (CPU.VPR[va]._s16[h] + CPU.VPR[vb]._s16[h] + 1) >> 1; } } - void VAVGSW(u32 vd, u32 va, u32 vb) + void VAVGSW(u32 vd, u32 va, u32 vb) //nf { for (uint w = 0; w < 4; w++) { @@ -361,14 +361,14 @@ private: for (uint b = 0; b < 16; b++) CPU.VPR[vd]._u8[b] = (CPU.VPR[va]._u8[b] + CPU.VPR[vb]._u8[b] + 1) >> 1; } - void VAVGUH(u32 vd, u32 va, u32 vb) + void VAVGUH(u32 vd, u32 va, u32 vb) //nf { for (uint h = 0; h < 8; h++) { CPU.VPR[vd]._u16[h] = (CPU.VPR[va]._u16[h] + CPU.VPR[vb]._u16[h] + 1) >> 1; } } - void VAVGUW(u32 vd, u32 va, u32 vb) + void VAVGUW(u32 vd, u32 va, u32 vb) //nf { for (uint w = 0; w < 4; w++) { @@ -487,14 +487,14 @@ private: CPU.CR.cr6 = all_equal | none_equal; } - void VCMPEQUH(u32 vd, u32 va, u32 vb) + void VCMPEQUH(u32 vd, u32 va, u32 vb) //nf { for (uint h = 0; h < 8; h++) { CPU.VPR[vd]._u16[h] = CPU.VPR[va]._u16[h] == CPU.VPR[vb]._u16[h] ? 0xffff : 0; } } - void VCMPEQUH_(u32 vd, u32 va, u32 vb) + void VCMPEQUH_(u32 vd, u32 va, u32 vb) //nf { int all_equal = 0x8; int none_equal = 0x2; @@ -599,7 +599,7 @@ private: CPU.CR.cr6 = all_ge | none_ge; } - void VCMPGTSB(u32 vd, u32 va, u32 vb) + void VCMPGTSB(u32 vd, u32 va, u32 vb) //nf { for (uint b = 0; b < 16; b++) { @@ -833,7 +833,7 @@ private: CPU.VPR[vd]._f[w] = max(CPU.VPR[va]._f[w], CPU.VPR[vb]._f[w]); } } - void VMAXSB(u32 vd, u32 va, u32 vb) + void VMAXSB(u32 vd, u32 va, u32 vb) //nf { for (uint b = 0; b < 16; b++) CPU.VPR[vd]._s8[b] = max(CPU.VPR[va]._s8[b], CPU.VPR[vb]._s8[b]); @@ -918,7 +918,7 @@ private: CPU.VPR[vd]._f[w] = min(CPU.VPR[va]._f[w], CPU.VPR[vb]._f[w]); } } - void VMINSB(u32 vd, u32 va, u32 vb) + void VMINSB(u32 vd, u32 va, u32 vb) //nf { for (uint b = 0; b < 16; b++) { @@ -1021,7 +1021,7 @@ private: CPU.VPR[vd]._u32[3 - d*2 - 1] = CPU.VPR[vb]._u32[1 - d]; } } - void VMSUMMBM(u32 vd, u32 va, u32 vb, u32 vc) + void VMSUMMBM(u32 vd, u32 va, u32 vb, u32 vc) //nf { for (uint w = 0; w < 4; w++) { @@ -1036,7 +1036,7 @@ 
private: CPU.VPR[vd]._s32[w] = result; } } - void VMSUMSHM(u32 vd, u32 va, u32 vb, u32 vc) + void VMSUMSHM(u32 vd, u32 va, u32 vb, u32 vc) //nf { for (uint w = 0; w < 4; w++) { @@ -1051,7 +1051,7 @@ private: CPU.VPR[vd]._s32[w] = result; } } - void VMSUMSHS(u32 vd, u32 va, u32 vb, u32 vc) + void VMSUMSHS(u32 vd, u32 va, u32 vb, u32 vc) //nf { for (uint w = 0; w < 4; w++) { @@ -1096,7 +1096,7 @@ private: CPU.VPR[vd]._u32[w] = result; } } - void VMSUMUHM(u32 vd, u32 va, u32 vb, u32 vc) + void VMSUMUHM(u32 vd, u32 va, u32 vb, u32 vc) //nf { for (uint w = 0; w < 4; w++) { @@ -1111,7 +1111,7 @@ private: CPU.VPR[vd]._u32[w] = result; } } - void VMSUMUHS(u32 vd, u32 va, u32 vb, u32 vc) + void VMSUMUHS(u32 vd, u32 va, u32 vb, u32 vc) //nf { for (uint w = 0; w < 4; w++) { @@ -1136,7 +1136,7 @@ private: CPU.VPR[vd]._u32[w] = saturated; } } - void VMULESB(u32 vd, u32 va, u32 vb) + void VMULESB(u32 vd, u32 va, u32 vb) //nf { for (uint h = 0; h < 8; h++) { @@ -1164,7 +1164,7 @@ private: CPU.VPR[vd]._u32[w] = (u32)CPU.VPR[va]._u16[w*2+1] * (u32)CPU.VPR[vb]._u16[w*2+1]; } } - void VMULOSB(u32 vd, u32 va, u32 vb) + void VMULOSB(u32 vd, u32 va, u32 vb) //nf { for (uint h = 0; h < 8; h++) { @@ -1243,7 +1243,7 @@ private: CPU.VPR[vd]._u16[4 + (3 - h)] = (ab7 << 15) | (ab8 << 10) | (ab16 << 5) | ab24; } } - void VPKSHSS(u32 vd, u32 va, u32 vb) + void VPKSHSS(u32 vd, u32 va, u32 vb) //nf { for (uint b = 0; b < 8; b++) { @@ -1348,7 +1348,7 @@ private: CPU.VPR[vd]._s16[h] = result; } } - void VPKSWUS(u32 vd, u32 va, u32 vb) + void VPKSWUS(u32 vd, u32 va, u32 vb) //nf { for (uint h = 0; h < 4; h++) { @@ -1383,7 +1383,7 @@ private: CPU.VPR[vd]._u16[h] = result; } } - void VPKUHUM(u32 vd, u32 va, u32 vb) + void VPKUHUM(u32 vd, u32 va, u32 vb) //nf { for (uint b = 0; b < 8; b++) { @@ -1424,7 +1424,7 @@ private: CPU.VPR[vd]._u16[h ] = CPU.VPR[vb]._u16[h*2]; } } - void VPKUWUS(u32 vd, u32 va, u32 vb) + void VPKUWUS(u32 vd, u32 va, u32 vb) //nf { for (uint h = 0; h < 4; h++) { @@ -1486,7 
+1486,7 @@ private: CPU.VPR[vd]._f[w] = f; } } - void VRLB(u32 vd, u32 va, u32 vb) + void VRLB(u32 vd, u32 va, u32 vb) //nf { for (uint b = 0; b < 16; b++) { @@ -1495,7 +1495,7 @@ private: CPU.VPR[vd]._u8[b] = (CPU.VPR[va]._u8[b] << nRot) | (CPU.VPR[va]._u8[b] >> (8 - nRot)); } } - void VRLH(u32 vd, u32 va, u32 vb) + void VRLH(u32 vd, u32 va, u32 vb) //nf { for (uint h = 0; h < 8; h++) { @@ -1524,7 +1524,7 @@ private: CPU.VPR[vd]._u8[b] = (CPU.VPR[vb]._u8[b] & CPU.VPR[vc]._u8[b]) | (CPU.VPR[va]._u8[b] & (~CPU.VPR[vc]._u8[b])); } } - void VSL(u32 vd, u32 va, u32 vb) + void VSL(u32 vd, u32 va, u32 vb) //nf { u8 sh = CPU.VPR[vb]._u8[0] & 0x7; @@ -1648,7 +1648,7 @@ private: CPU.VPR[vd]._u32[w] = word; } } - void VSR(u32 vd, u32 va, u32 vb) + void VSR(u32 vd, u32 va, u32 vb) //nf { u8 sh = CPU.VPR[vb]._u8[0] & 0x7; u32 t = 1; @@ -1676,7 +1676,7 @@ private: CPU.VPR[vd]._u32[3] = 0xCDCDCDCD; } } - void VSRAB(u32 vd, u32 va, u32 vb) + void VSRAB(u32 vd, u32 va, u32 vb) //nf { for (uint b = 0; b < 16; b++) { @@ -1729,7 +1729,7 @@ private: CPU.VPR[vd]._u32[w] = CPU.VPR[va]._u32[w] >> (CPU.VPR[vb]._u8[w*4] & 0x1f); } } - void VSUBCUW(u32 vd, u32 va, u32 vb) + void VSUBCUW(u32 vd, u32 va, u32 vb) //nf { for (uint w = 0; w < 4; w++) { @@ -1743,7 +1743,7 @@ private: CPU.VPR[vd]._f[w] = CPU.VPR[va]._f[w] - CPU.VPR[vb]._f[w]; } } - void VSUBSBS(u32 vd, u32 va, u32 vb) + void VSUBSBS(u32 vd, u32 va, u32 vb) //nf { for (uint b = 0; b < 16; b++) { @@ -1832,7 +1832,7 @@ private: CPU.VPR[vd]._u16[h] = CPU.VPR[va]._u16[h] - CPU.VPR[vb]._u16[h]; } } - void VSUBUHS(u32 vd, u32 va, u32 vb) + void VSUBUHS(u32 vd, u32 va, u32 vb) //nf { for (uint h = 0; h < 8; h++) { @@ -1915,7 +1915,7 @@ private: CPU.VPR[vd]._s32[1] = 0; CPU.VPR[vd]._s32[3] = 0; } - void VSUM4SBS(u32 vd, u32 va, u32 vb) + void VSUM4SBS(u32 vd, u32 va, u32 vb) //nf { for (uint w = 0; w < 4; w++) { @@ -2019,7 +2019,7 @@ private: CPU.VPR[vd]._u8[(3 - w)*4 + 0] = CPU.VPR[vb]._u8[8 + w*2 + 1] & 0x1f; } } - void VUPKLSB(u32 vd, 
u32 vb) + void VUPKLSB(u32 vd, u32 vb) //nf { for (uint h = 0; h < 8; h++) { diff --git a/rpcs3/Emu/Cell/SPURecompiler.h b/rpcs3/Emu/Cell/SPURecompiler.h index 5d6e3ca906..4d07456c0c 100644 --- a/rpcs3/Emu/Cell/SPURecompiler.h +++ b/rpcs3/Emu/Cell/SPURecompiler.h @@ -14,6 +14,58 @@ using namespace asmjit::host; #define UNIMPLEMENTED() UNK(__FUNCTION__) +struct g_imm_table_struct +{ + u16 cntb_table[65536]; + + __m128i fsmb_table[65536]; + __m128i fsmh_table[256]; + __m128i fsm_table[16]; + + __m128i sldq_pshufb[32]; + __m128i srdq_pshufb[32]; + __m128i rldq_pshufb[16]; + + g_imm_table_struct() + { + static_assert(offsetof(g_imm_table_struct, cntb_table) == 0, "offsetof(cntb_table) != 0"); + for (u32 i = 0; i < sizeof(cntb_table) / sizeof(cntb_table[0]); i++) + { + u32 cnt_low = 0, cnt_high = 0; + for (u32 j = 0; j < 8; j++) + { + cnt_low += (i >> j) & 1; + cnt_high += (i >> (j + 8)) & 1; + } + cntb_table[i] = (cnt_high << 8) | cnt_low; + } + for (u32 i = 0; i < sizeof(fsm_table) / sizeof(fsm_table[0]); i++) + { + for (u32 j = 0; j < 4; j++) fsm_table[i].m128i_u32[j] = (i & (1 << j)) ? ~0 : 0; + } + for (u32 i = 0; i < sizeof(fsmh_table) / sizeof(fsmh_table[0]); i++) + { + for (u32 j = 0; j < 8; j++) fsmh_table[i].m128i_u16[j] = (i & (1 << j)) ? ~0 : 0; + } + for (u32 i = 0; i < sizeof(fsmb_table) / sizeof(fsmb_table[0]); i++) + { + for (u32 j = 0; j < 16; j++) fsmb_table[i].m128i_u8[j] = (i & (1 << j)) ? ~0 : 0; + } + for (u32 i = 0; i < sizeof(sldq_pshufb) / sizeof(sldq_pshufb[0]); i++) + { + for (u32 j = 0; j < 16; j++) sldq_pshufb[i].m128i_u8[j] = (u8)(j - i); + } + for (u32 i = 0; i < sizeof(srdq_pshufb) / sizeof(srdq_pshufb[0]); i++) + { + for (u32 j = 0; j < 16; j++) srdq_pshufb[i].m128i_u8[j] = (j + i > 15) ? 
0xff : (u8)(j + i); + } + for (u32 i = 0; i < sizeof(rldq_pshufb) / sizeof(rldq_pshufb[0]); i++) + { + for (u32 j = 0; j < 16; j++) rldq_pshufb[i].m128i_u8[j] = (u8)(j - i) & 0xf; + } + } +}; + class SPURecompiler; class SPURecompilerCore : public CPUDecoder @@ -57,6 +109,9 @@ public: #define cpu_word(x) word_ptr(*cpu_var, (sizeof((*(SPUThread*)nullptr).x) == 2) ? offsetof(SPUThread, x) : throw "sizeof("#x") != 2") #define cpu_byte(x) byte_ptr(*cpu_var, (sizeof((*(SPUThread*)nullptr).x) == 1) ? offsetof(SPUThread, x) : throw "sizeof("#x") != 1") +#define g_imm_xmm(x) oword_ptr(*g_imm_var, offsetof(g_imm_table_struct, x)) +#define g_imm2_xmm(x, y) oword_ptr(*g_imm_var, y, 0, offsetof(g_imm_table_struct, x)) + #define LOG_OPCODE(...) //ConLog.Write("Compiled "__FUNCTION__"(): "__VA_ARGS__) #define LOG3_OPCODE(...) //ConLog.Write("Linked "__FUNCTION__"(): "__VA_ARGS__) @@ -97,12 +152,14 @@ public: GpVar* cpu_var; GpVar* ls_var; GpVar* imm_var; - // (input) output: + GpVar* g_imm_var; + // output: GpVar* pos_var; // temporary: GpVar* addr; GpVar* qw0; GpVar* qw1; + GpVar* qw2; struct XmmLink { @@ -578,30 +635,41 @@ private: } void ROT(u32 rt, u32 ra, u32 rb) { - WRAPPER_BEGIN(rt, ra, rb, zz); - CPU.GPR[rt]._u32[0] = (CPU.GPR[ra]._u32[0] << (CPU.GPR[rb]._u32[0] & 0x1f)) | (CPU.GPR[ra]._u32[0] >> (32 - (CPU.GPR[rb]._u32[0] & 0x1f))); - CPU.GPR[rt]._u32[1] = (CPU.GPR[ra]._u32[1] << (CPU.GPR[rb]._u32[1] & 0x1f)) | (CPU.GPR[ra]._u32[1] >> (32 - (CPU.GPR[rb]._u32[1] & 0x1f))); - CPU.GPR[rt]._u32[2] = (CPU.GPR[ra]._u32[2] << (CPU.GPR[rb]._u32[2] & 0x1f)) | (CPU.GPR[ra]._u32[2] >> (32 - (CPU.GPR[rb]._u32[2] & 0x1f))); - CPU.GPR[rt]._u32[3] = (CPU.GPR[ra]._u32[3] << (CPU.GPR[rb]._u32[3] & 0x1f)) | (CPU.GPR[ra]._u32[3] >> (32 - (CPU.GPR[rb]._u32[3] & 0x1f))); - WRAPPER_END(rt, ra, rb, 0); + XmmInvalidate(rt); + for (u32 i = 0; i < 4; i++) + { + c.mov(qw0->r32(), cpu_dword(GPR[ra]._u32[i])); + c.mov(*addr, cpu_dword(GPR[rb]._u32[i])); + c.rol(qw0->r32(), *addr); + 
c.mov(cpu_dword(GPR[rt]._u32[i]), qw0->r32()); + } + LOG_OPCODE(); } void ROTM(u32 rt, u32 ra, u32 rb) { - WRAPPER_BEGIN(rt, ra, rb, zz); - CPU.GPR[rt]._u32[0] = ((0 - CPU.GPR[rb]._u32[0]) & 0x3f) < 32 ? CPU.GPR[ra]._u32[0] >> ((0 - CPU.GPR[rb]._u32[0]) & 0x3f) : 0; - CPU.GPR[rt]._u32[1] = ((0 - CPU.GPR[rb]._u32[1]) & 0x3f) < 32 ? CPU.GPR[ra]._u32[1] >> ((0 - CPU.GPR[rb]._u32[1]) & 0x3f) : 0; - CPU.GPR[rt]._u32[2] = ((0 - CPU.GPR[rb]._u32[2]) & 0x3f) < 32 ? CPU.GPR[ra]._u32[2] >> ((0 - CPU.GPR[rb]._u32[2]) & 0x3f) : 0; - CPU.GPR[rt]._u32[3] = ((0 - CPU.GPR[rb]._u32[3]) & 0x3f) < 32 ? CPU.GPR[ra]._u32[3] >> ((0 - CPU.GPR[rb]._u32[3]) & 0x3f) : 0; - WRAPPER_END(rt, ra, rb, 0); + XmmInvalidate(rt); + for (u32 i = 0; i < 4; i++) + { + c.mov(qw0->r32(), cpu_dword(GPR[ra]._u32[i])); + c.mov(*addr, cpu_dword(GPR[rb]._u32[i])); + c.neg(*addr); + c.shr(*qw0, *addr); + c.mov(cpu_dword(GPR[rt]._u32[i]), qw0->r32()); + } + LOG_OPCODE(); } void ROTMA(u32 rt, u32 ra, u32 rb) { - WRAPPER_BEGIN(rt, ra, rb, zz); - CPU.GPR[rt]._i32[0] = ((0 - CPU.GPR[rb]._u32[0]) & 0x3f) < 32 ? CPU.GPR[ra]._i32[0] >> ((0 - CPU.GPR[rb]._u32[0]) & 0x3f) : CPU.GPR[ra]._i32[0] >> 31; - CPU.GPR[rt]._i32[1] = ((0 - CPU.GPR[rb]._u32[1]) & 0x3f) < 32 ? CPU.GPR[ra]._i32[1] >> ((0 - CPU.GPR[rb]._u32[1]) & 0x3f) : CPU.GPR[ra]._i32[1] >> 31; - CPU.GPR[rt]._i32[2] = ((0 - CPU.GPR[rb]._u32[2]) & 0x3f) < 32 ? CPU.GPR[ra]._i32[2] >> ((0 - CPU.GPR[rb]._u32[2]) & 0x3f) : CPU.GPR[ra]._i32[2] >> 31; - CPU.GPR[rt]._i32[3] = ((0 - CPU.GPR[rb]._u32[3]) & 0x3f) < 32 ? 
CPU.GPR[ra]._i32[3] >> ((0 - CPU.GPR[rb]._u32[3]) & 0x3f) : CPU.GPR[ra]._i32[3] >> 31; - WRAPPER_END(rt, ra, rb, 0); + XmmInvalidate(rt); + for (u32 i = 0; i < 4; i++) + { + c.movsxd(*qw0, cpu_dword(GPR[ra]._u32[i])); + c.mov(*addr, cpu_dword(GPR[rb]._u32[i])); + c.neg(*addr); + c.sar(*qw0, *addr); + c.mov(cpu_dword(GPR[rt]._u32[i]), qw0->r32()); + } + LOG_OPCODE(); } void SHL(u32 rt, u32 ra, u32 rb) { @@ -617,31 +685,53 @@ private: } void ROTH(u32 rt, u32 ra, u32 rb) { - WRAPPER_BEGIN(rt, ra, rb, zz); - for (int h = 0; h < 8; h++) - CPU.GPR[rt]._u16[h] = (CPU.GPR[ra]._u16[h] << (CPU.GPR[rb]._u16[h] & 0xf)) | (CPU.GPR[ra]._u16[h] >> (16 - (CPU.GPR[rb]._u16[h] & 0xf))); - WRAPPER_END(rt, ra, rb, 0); + XmmInvalidate(rt); + for (u32 i = 0; i < 8; i++) + { + c.movzx(qw0->r32(), cpu_word(GPR[ra]._u16[i])); + c.movzx(*addr, cpu_word(GPR[rb]._u16[i])); + c.rol(qw0->r16(), *addr); + c.mov(cpu_word(GPR[rt]._u16[i]), qw0->r16()); + } + LOG_OPCODE(); } void ROTHM(u32 rt, u32 ra, u32 rb) { - WRAPPER_BEGIN(rt, ra, rb, zz); - for (int h = 0; h < 8; h++) - CPU.GPR[rt]._u16[h] = ((0 - CPU.GPR[rb]._u16[h]) & 0x1f) < 16 ? CPU.GPR[ra]._u16[h] >> ((0 - CPU.GPR[rb]._u16[h]) & 0x1f) : 0; - WRAPPER_END(rt, ra, rb, 0); + XmmInvalidate(rt); + for (u32 i = 0; i < 8; i++) + { + c.movzx(qw0->r32(), cpu_word(GPR[ra]._u16[i])); + c.movzx(*addr, cpu_word(GPR[rb]._u16[i])); + c.neg(*addr); + c.shr(qw0->r32(), *addr); + c.mov(cpu_word(GPR[rt]._u16[i]), qw0->r16()); + } + LOG_OPCODE(); } void ROTMAH(u32 rt, u32 ra, u32 rb) { - WRAPPER_BEGIN(rt, ra, rb, zz); - for (int h = 0; h < 8; h++) - CPU.GPR[rt]._i16[h] = ((0 - CPU.GPR[rb]._u16[h]) & 0x1f) < 16 ? 
CPU.GPR[ra]._i16[h] >> ((0 - CPU.GPR[rb]._u16[h]) & 0x1f) : CPU.GPR[ra]._i16[h] >> 15; - WRAPPER_END(rt, ra, rb, 0); + XmmInvalidate(rt); + for (u32 i = 0; i < 8; i++) + { + c.movsx(qw0->r32(), cpu_word(GPR[ra]._u16[i])); + c.movzx(*addr, cpu_word(GPR[rb]._u16[i])); + c.neg(*addr); + c.sar(qw0->r32(), *addr); + c.mov(cpu_word(GPR[rt]._u16[i]), qw0->r16()); + } + LOG_OPCODE(); } void SHLH(u32 rt, u32 ra, u32 rb) { - WRAPPER_BEGIN(rt, ra, rb, zz); - for (int h = 0; h < 8; h++) - CPU.GPR[rt]._u16[h] = (CPU.GPR[rb]._u16[h] & 0x1f) > 15 ? 0 : CPU.GPR[ra]._u16[h] << (CPU.GPR[rb]._u16[h] & 0x1f); - WRAPPER_END(rt, ra, rb, 0); + XmmInvalidate(rt); + for (u32 i = 0; i < 8; i++) + { + c.movzx(qw0->r32(), cpu_word(GPR[ra]._u16[i])); + c.movzx(*addr, cpu_word(GPR[rb]._u16[i])); + c.shl(qw0->r32(), *addr); + c.mov(cpu_word(GPR[rt]._u16[i]), qw0->r16()); + } + LOG_OPCODE(); } void ROTI(u32 rt, u32 ra, s32 i7) { @@ -1186,27 +1276,33 @@ private: } void FSM(u32 rt, u32 ra) { - WRAPPER_BEGIN(rt, ra, yy, zz); - const u32 pref = CPU.GPR[ra]._u32[3]; - for (int w = 0; w < 4; w++) - CPU.GPR[rt]._u32[w] = (pref & (1 << w)) ? ~0 : 0; - WRAPPER_END(rt, ra, 0, 0); + const XmmLink& vr = XmmAlloc(rt); + c.mov(*addr, cpu_dword(GPR[ra]._u32[3])); + c.and_(*addr, 0xf); + c.shl(*addr, 4); + c.movdqa(vr.get(), g_imm2_xmm(fsm_table[0], *addr)); + XmmFinalize(vr, rt); + LOG_OPCODE(); } void FSMH(u32 rt, u32 ra) { - WRAPPER_BEGIN(rt, ra, yy, zz); - const u32 pref = CPU.GPR[ra]._u32[3]; - for (int h = 0; h < 8; h++) - CPU.GPR[rt]._u16[h] = (pref & (1 << h)) ? ~0 : 0; - WRAPPER_END(rt, ra, 0, 0); + const XmmLink& vr = XmmAlloc(rt); + c.mov(*addr, cpu_dword(GPR[ra]._u32[3])); + c.and_(*addr, 0xff); + c.shl(*addr, 4); + c.movdqa(vr.get(), g_imm2_xmm(fsmh_table[0], *addr)); + XmmFinalize(vr, rt); + LOG_OPCODE(); } void FSMB(u32 rt, u32 ra) { - WRAPPER_BEGIN(rt, ra, yy, zz); - const u32 pref = CPU.GPR[ra]._u32[3]; - for (int b = 0; b < 16; b++) - CPU.GPR[rt]._u8[b] = (pref & (1 << b)) ? 
~0 : 0; - WRAPPER_END(rt, ra, 0, 0); + const XmmLink& vr = XmmAlloc(rt); + c.mov(*addr, cpu_dword(GPR[ra]._u32[3])); + c.and_(*addr, 0xffff); + c.shl(*addr, 4); + c.movdqa(vr.get(), g_imm2_xmm(fsmb_table[0], *addr)); + XmmFinalize(vr, rt); + LOG_OPCODE(); } void FREST(u32 rt, u32 ra) { @@ -1247,32 +1343,35 @@ private: } void ROTQBYBI(u32 rt, u32 ra, u32 rb) { - WRAPPER_BEGIN(rt, ra, rb, zz); - const int s = (CPU.GPR[rb]._u32[3] >> 3) & 0xf; - const SPU_GPR_hdr temp = CPU.GPR[ra]; - for (int b = 0; b < 16; b++) - CPU.GPR[rt]._u8[b] = temp._u8[(b - s) & 0xf]; - WRAPPER_END(rt, ra, rb, 0); + const XmmLink& va = XmmGet(ra, rt); + c.mov(*addr, cpu_dword(GPR[rb]._u32[3])); + c.and_(*addr, 0xf << 3); + c.shl(*addr, 1); + c.pshufb(va.get(), g_imm2_xmm(rldq_pshufb[0], *addr)); + XmmFinalize(va, rt); + LOG_OPCODE(); } void ROTQMBYBI(u32 rt, u32 ra, u32 rb) { - WRAPPER_BEGIN(rt, ra, rb, zz); - const int s = (0 - (CPU.GPR[rb]._u32[3] >> 3)) & 0x1f; - const SPU_GPR_hdr temp = CPU.GPR[ra]; - CPU.GPR[rt].Reset(); - for (int b = 0; b < 16 - s; b++) - CPU.GPR[rt]._u8[b] = temp._u8[(b + s) & 0xf]; - WRAPPER_END(rt, ra, rb, 0); + const XmmLink& va = XmmGet(ra, rt); + c.mov(*addr, cpu_dword(GPR[rb]._u32[3])); + c.shr(*addr, 3); + c.neg(*addr); + c.and_(*addr, 0x1f); + c.shl(*addr, 4); + c.pshufb(va.get(), g_imm2_xmm(srdq_pshufb[0], *addr)); + XmmFinalize(va, rt); + LOG_OPCODE(); } void SHLQBYBI(u32 rt, u32 ra, u32 rb) { - WRAPPER_BEGIN(rt, ra, rb, zz); - const int s = (CPU.GPR[rb]._u32[3] >> 3) & 0x1f; - const SPU_GPR_hdr temp = CPU.GPR[ra]; - CPU.GPR[rt].Reset(); - for (int b = s; b < 16; b++) - CPU.GPR[rt]._u8[b] = temp._u8[b - s]; - WRAPPER_END(rt, ra, rb, 0); + const XmmLink& va = XmmGet(ra, rt); + c.mov(*addr, cpu_dword(GPR[rb]._u32[3])); + c.and_(*addr, 0x1f << 3); + c.shl(*addr, 1); + c.pshufb(va.get(), g_imm2_xmm(sldq_pshufb[0], *addr)); + XmmFinalize(va, rt); + LOG_OPCODE(); } void CBX(u32 rt, u32 ra, u32 rb) { @@ -1361,73 +1460,89 @@ private: } void ROTQBI(u32 rt, u32 ra, 
u32 rb) { - WRAPPER_BEGIN(rt, ra, rb, zz); - const int t = CPU.GPR[rb]._u32[3] & 0x7; - const SPU_GPR_hdr temp = CPU.GPR[ra]; - CPU.GPR[rt]._u32[0] = (temp._u32[0] << t) | (temp._u32[3] >> (32 - t)); - CPU.GPR[rt]._u32[1] = (temp._u32[1] << t) | (temp._u32[0] >> (32 - t)); - CPU.GPR[rt]._u32[2] = (temp._u32[2] << t) | (temp._u32[1] >> (32 - t)); - CPU.GPR[rt]._u32[3] = (temp._u32[3] << t) | (temp._u32[2] >> (32 - t)); - WRAPPER_END(rt, ra, rb, 0); + XmmInvalidate(rt); + c.mov(*qw0, cpu_qword(GPR[ra]._u64[0])); + c.mov(*qw1, cpu_qword(GPR[ra]._u64[1])); + c.mov(*qw2, *qw0); + c.mov(*addr, cpu_dword(GPR[rb]._u32[3])); + c.and_(*addr, 7); + c.shld(*qw0, *qw1, *addr); + c.shld(*qw1, *qw2, *addr); + c.mov(cpu_qword(GPR[rt]._u64[0]), *qw0); + c.mov(cpu_qword(GPR[rt]._u64[1]), *qw1); + LOG_OPCODE(); } void ROTQMBI(u32 rt, u32 ra, u32 rb) { - WRAPPER_BEGIN(rt, ra, rb, zz); - const int t = (0 - CPU.GPR[rb]._u32[3]) & 0x7; - const SPU_GPR_hdr temp = CPU.GPR[ra]; - CPU.GPR[rt]._u32[0] = (temp._u32[0] >> t) | (temp._u32[1] << (32 - t)); - CPU.GPR[rt]._u32[1] = (temp._u32[1] >> t) | (temp._u32[2] << (32 - t)); - CPU.GPR[rt]._u32[2] = (temp._u32[2] >> t) | (temp._u32[3] << (32 - t)); - CPU.GPR[rt]._u32[3] = (temp._u32[3] >> t); - WRAPPER_END(rt, ra, rb, 0); + XmmInvalidate(rt); + c.mov(*qw0, cpu_qword(GPR[ra]._u64[0])); + c.mov(*qw1, cpu_qword(GPR[ra]._u64[1])); + c.mov(*addr, cpu_dword(GPR[rb]._u32[3])); + c.neg(*addr); + c.and_(*addr, 7); + c.shrd(*qw0, *qw1, *addr); + c.shr(*qw1, *addr); + c.mov(cpu_qword(GPR[rt]._u64[0]), *qw0); + c.mov(cpu_qword(GPR[rt]._u64[1]), *qw1); + LOG_OPCODE(); } void SHLQBI(u32 rt, u32 ra, u32 rb) { - WRAPPER_BEGIN(rt, ra, rb, zz); - const int t = CPU.GPR[rb]._u32[3] & 0x7; - const SPU_GPR_hdr temp = CPU.GPR[ra]; - CPU.GPR[rt]._u32[0] = (temp._u32[0] << t); - CPU.GPR[rt]._u32[1] = (temp._u32[1] << t) | (temp._u32[0] >> (32 - t)); - CPU.GPR[rt]._u32[2] = (temp._u32[2] << t) | (temp._u32[1] >> (32 - t)); - CPU.GPR[rt]._u32[3] = (temp._u32[3] << t) | 
(temp._u32[2] >> (32 - t)); - WRAPPER_END(rt, ra, rb, 0); + XmmInvalidate(rt); + c.mov(*qw0, cpu_qword(GPR[ra]._u64[0])); + c.mov(*qw1, cpu_qword(GPR[ra]._u64[1])); + c.mov(*addr, cpu_dword(GPR[rb]._u32[3])); + c.and_(*addr, 7); + c.shld(*qw1, *qw0, *addr); + c.shl(*qw0, *addr); + c.mov(cpu_qword(GPR[rt]._u64[0]), *qw0); + c.mov(cpu_qword(GPR[rt]._u64[1]), *qw1); + LOG_OPCODE(); } void ROTQBY(u32 rt, u32 ra, u32 rb) { - WRAPPER_BEGIN(rt, ra, rb, zz); - const int s = CPU.GPR[rb]._u32[3] & 0xf; - const SPU_GPR_hdr temp = CPU.GPR[ra]; - for (int b = 0; b < 16; ++b) - CPU.GPR[rt]._u8[b] = temp._u8[(b - s) & 0xf]; - WRAPPER_END(rt, ra, rb, 0); + const XmmLink& va = XmmGet(ra, rt); + c.mov(*addr, cpu_dword(GPR[rb]._u32[3])); + c.and_(*addr, 0xf); + c.shl(*addr, 4); + c.pshufb(va.get(), g_imm2_xmm(rldq_pshufb[0], *addr)); + XmmFinalize(va, rt); + LOG_OPCODE(); } void ROTQMBY(u32 rt, u32 ra, u32 rb) { - WRAPPER_BEGIN(rt, ra, rb, zz); - const int s = (0 - CPU.GPR[rb]._u32[3]) & 0x1f; - const SPU_GPR_hdr temp = CPU.GPR[ra]; - CPU.GPR[rt].Reset(); - for (int b = 0; b < 16 - s; b++) - CPU.GPR[rt]._u8[b] = temp._u8[(b + s) & 0xf]; - WRAPPER_END(rt, ra, rb, 0); + const XmmLink& va = XmmGet(ra, rt); + c.mov(*addr, cpu_dword(GPR[rb]._u32[3])); + c.neg(*addr); + c.and_(*addr, 0x1f); + c.shl(*addr, 4); + c.pshufb(va.get(), g_imm2_xmm(srdq_pshufb[0], *addr)); + XmmFinalize(va, rt); + LOG_OPCODE(); } void SHLQBY(u32 rt, u32 ra, u32 rb) { - WRAPPER_BEGIN(rt, ra, rb, zz); - const int s = CPU.GPR[rb]._u32[3] & 0x1f; - const SPU_GPR_hdr temp = CPU.GPR[ra]; - CPU.GPR[rt].Reset(); - for (int b = s; b < 16; b++) - CPU.GPR[rt]._u8[b] = temp._u8[b - s]; - WRAPPER_END(rt, ra, rb, 0); + const XmmLink& va = XmmGet(ra, rt); + c.mov(*addr, cpu_dword(GPR[rb]._u32[3])); + c.and_(*addr, 0x1f); + c.shl(*addr, 4); + c.pshufb(va.get(), g_imm2_xmm(sldq_pshufb[0], *addr)); + XmmFinalize(va, rt); + LOG_OPCODE(); } void ORX(u32 rt, u32 ra) { - WRAPPER_BEGIN(rt, ra, yy, zz); - CPU.GPR[rt]._u32[3] = 
CPU.GPR[ra]._u32[0] | CPU.GPR[ra]._u32[1] | CPU.GPR[ra]._u32[2] | CPU.GPR[ra]._u32[3]; - CPU.GPR[rt]._u32[2] = 0; - CPU.GPR[rt]._u64[0] = 0; - WRAPPER_END(rt, ra, 0, 0); + XmmInvalidate(rt); + c.mov(*addr, cpu_dword(GPR[ra]._u32[0])); + c.or_(*addr, cpu_dword(GPR[ra]._u32[1])); + c.or_(*addr, cpu_dword(GPR[ra]._u32[2])); + c.or_(*addr, cpu_dword(GPR[ra]._u32[3])); + c.mov(cpu_dword(GPR[rt]._u32[3]), *addr); + c.xor_(*addr, *addr); + c.mov(cpu_dword(GPR[rt]._u32[0]), *addr); + c.mov(cpu_dword(GPR[rt]._u32[1]), *addr); + c.mov(cpu_dword(GPR[rt]._u32[2]), *addr); + LOG_OPCODE(); } void CBD(u32 rt, u32 ra, s32 i7) { @@ -1488,36 +1603,37 @@ private: } void ROTQBII(u32 rt, u32 ra, s32 i7) { - WRAPPER_BEGIN(rt, ra, i7, zz); - const int s = i7 & 0x7; - const SPU_GPR_hdr temp = CPU.GPR[ra]; - CPU.GPR[rt]._u32[0] = (temp._u32[0] << s) | (temp._u32[3] >> (32 - s)); - CPU.GPR[rt]._u32[1] = (temp._u32[1] << s) | (temp._u32[0] >> (32 - s)); - CPU.GPR[rt]._u32[2] = (temp._u32[2] << s) | (temp._u32[1] >> (32 - s)); - CPU.GPR[rt]._u32[3] = (temp._u32[3] << s) | (temp._u32[2] >> (32 - s)); - WRAPPER_END(rt, ra, i7, 0); + XmmInvalidate(rt); + c.mov(*qw0, cpu_qword(GPR[ra]._u64[0])); + c.mov(*qw1, cpu_qword(GPR[ra]._u64[1])); + c.mov(*qw2, *qw0); + c.shld(*qw0, *qw1, i7 & 0x7); + c.shld(*qw1, *qw2, i7 & 0x7); + c.mov(cpu_qword(GPR[rt]._u64[0]), *qw0); + c.mov(cpu_qword(GPR[rt]._u64[1]), *qw1); + LOG_OPCODE(); } void ROTQMBII(u32 rt, u32 ra, s32 i7) { - WRAPPER_BEGIN(rt, ra, i7, zz); - const int s = (0 - (s32)i7) & 0x7; - const SPU_GPR_hdr temp = CPU.GPR[ra]; - CPU.GPR[rt]._u32[0] = (temp._u32[0] >> s) | (temp._u32[1] << (32 - s)); - CPU.GPR[rt]._u32[1] = (temp._u32[1] >> s) | (temp._u32[2] << (32 - s)); - CPU.GPR[rt]._u32[2] = (temp._u32[2] >> s) | (temp._u32[3] << (32 - s)); - CPU.GPR[rt]._u32[3] = (temp._u32[3] >> s); - WRAPPER_END(rt, ra, i7, 0); + XmmInvalidate(rt); + c.mov(*qw0, cpu_qword(GPR[ra]._u64[0])); + c.mov(*qw1, cpu_qword(GPR[ra]._u64[1])); + c.shrd(*qw0, *qw1, (0 - i7) 
& 0x7); + c.shr(*qw1, (0 - i7) & 0x7); + c.mov(cpu_qword(GPR[rt]._u64[0]), *qw0); + c.mov(cpu_qword(GPR[rt]._u64[1]), *qw1); + LOG_OPCODE(); } void SHLQBII(u32 rt, u32 ra, s32 i7) { - WRAPPER_BEGIN(rt, ra, i7, zz); - const int s = i7 & 0x7; - const SPU_GPR_hdr temp = CPU.GPR[ra]; - CPU.GPR[rt]._u32[0] = (temp._u32[0] << s); - CPU.GPR[rt]._u32[1] = (temp._u32[1] << s) | (temp._u32[0] >> (32 - s)); - CPU.GPR[rt]._u32[2] = (temp._u32[2] << s) | (temp._u32[1] >> (32 - s)); - CPU.GPR[rt]._u32[3] = (temp._u32[3] << s) | (temp._u32[2] >> (32 - s)); - WRAPPER_END(rt, ra, i7, 0); + XmmInvalidate(rt); + c.mov(*qw0, cpu_qword(GPR[ra]._u64[0])); + c.mov(*qw1, cpu_qword(GPR[ra]._u64[1])); + c.shld(*qw1, *qw0, i7 & 0x7); + c.shl(*qw0, i7 & 0x7); + c.mov(cpu_qword(GPR[rt]._u64[0]), *qw0); + c.mov(cpu_qword(GPR[rt]._u64[1]), *qw1); + LOG_OPCODE(); } void ROTQBYI(u32 rt, u32 ra, s32 i7) { @@ -1729,7 +1845,7 @@ private: } void SUMB(u32 rt, u32 ra, u32 rb) { - WRAPPER_BEGIN(rt, ra, rb, zz); + /*WRAPPER_BEGIN(rt, ra, rb, zz); const SPU_GPR_hdr _a = CPU.GPR[ra]; const SPU_GPR_hdr _b = CPU.GPR[rb]; for (int w = 0; w < 4; w++) @@ -1737,7 +1853,46 @@ private: CPU.GPR[rt]._u16[w*2] = _a._u8[w*4] + _a._u8[w*4 + 1] + _a._u8[w*4 + 2] + _a._u8[w*4 + 3]; CPU.GPR[rt]._u16[w*2 + 1] = _b._u8[w*4] + _b._u8[w*4 + 1] + _b._u8[w*4 + 2] + _b._u8[w*4 + 3]; } - WRAPPER_END(rt, ra, rb, 0); + WRAPPER_END(rt, ra, rb, 0);*/ + + const XmmLink& va = XmmGet(ra); + const XmmLink& vb = (ra == rb) ? 
XmmCopy(va) : XmmGet(rb); + const XmmLink& v1 = XmmCopy(vb, rt); + const XmmLink& v2 = XmmCopy(vb); + const XmmLink& vFF = XmmAlloc(); + c.movdqa(vFF.get(), XmmConst(_mm_set1_epi32(0xff))); + c.pand(v1.get(), vFF.get()); + c.psrld(v2.get(), 8); + c.pand(v2.get(), vFF.get()); + c.paddd(v1.get(), v2.get()); + c.movdqa(v2.get(), vb.get()); + c.psrld(v2.get(), 16); + c.pand(v2.get(), vFF.get()); + c.paddd(v1.get(), v2.get()); + c.movdqa(v2.get(), vb.get()); + c.psrld(v2.get(), 24); + c.paddd(v1.get(), v2.get()); + c.pslld(v1.get(), 16); + c.movdqa(v2.get(), va.get()); + c.pand(v2.get(), vFF.get()); + c.por(v1.get(), v2.get()); + c.movdqa(v2.get(), va.get()); + c.psrld(v2.get(), 8); + c.pand(v2.get(), vFF.get()); + c.paddd(v1.get(), v2.get()); + c.movdqa(v2.get(), va.get()); + c.psrld(v2.get(), 16); + c.pand(v2.get(), vFF.get()); + c.paddd(v1.get(), v2.get()); + c.movdqa(v2.get(), va.get()); + c.psrld(v2.get(), 24); + c.paddd(v1.get(), v2.get()); + XmmFinalize(vb); + XmmFinalize(va); + XmmFinalize(v1, rt); + XmmFinalize(v2); + XmmFinalize(vFF); + LOG_OPCODE(); } //HGT uses signed values. 
HLGT uses unsigned values void HGT(u32 rt, s32 ra, s32 rb) @@ -1754,18 +1909,16 @@ private: } void CLZ(u32 rt, u32 ra) { - WRAPPER_BEGIN(rt, ra, yy, zz); - for (int w = 0; w < 4; w++) + XmmInvalidate(rt); + for (u32 i = 0; i < 4; i++) { - int nPos; - - for (nPos = 0; nPos < 32; nPos++) - if (CPU.GPR[ra]._u32[w] & (1 << (31 - nPos))) - break; - - CPU.GPR[rt]._u32[w] = nPos; + c.bsr(*addr, cpu_dword(GPR[ra]._u32[i])); + c.cmovz(*addr, dword_ptr(*g_imm_var, offsetof(g_imm_table_struct, fsmb_table[0xffff]))); // load 0xffffffff + c.neg(*addr); + c.add(*addr, 31); + c.mov(cpu_dword(GPR[rt]._u32[i]), *addr); } - WRAPPER_END(rt, ra, 0, 0); + LOG_OPCODE(); } void XSWD(u32 rt, u32 ra) { @@ -1786,13 +1939,14 @@ private: } void CNTB(u32 rt, u32 ra) { - WRAPPER_BEGIN(rt, ra, yy, zz); - const SPU_GPR_hdr temp = CPU.GPR[ra]; - CPU.GPR[rt].Reset(); - for (int b = 0; b < 16; b++) - for (int i = 0; i < 8; i++) - CPU.GPR[rt]._u8[b] += (temp._u8[b] & (1 << i)) ? 1 : 0; - WRAPPER_END(rt, ra, 0, 0); + XmmInvalidate(rt); + for (u32 i = 0; i < 8; i++) + { + c.movzx(*addr, cpu_word(GPR[ra]._u16[i])); + c.movzx(*addr, word_ptr(*g_imm_var, *addr, 1, offsetof(g_imm_table_struct, cntb_table[0]))); + c.mov(cpu_word(GPR[rt]._u16[i]), addr->r16()); + } + LOG_OPCODE(); } void XSBH(u32 rt, u32 ra) { @@ -2228,14 +2382,14 @@ private: XmmFinalize(vt); LOG_OPCODE(); } - void CGX(u32 rt, u32 ra, u32 rb) + void CGX(u32 rt, u32 ra, u32 rb) //nf { WRAPPER_BEGIN(rt, ra, rb, zz); for (int w = 0; w < 4; w++) CPU.GPR[rt]._u32[w] = ((u64)CPU.GPR[ra]._u32[w] + (u64)CPU.GPR[rb]._u32[w] + (u64)(CPU.GPR[rt]._u32[w] & 1)) >> 32; WRAPPER_END(rt, ra, rb, 0); } - void BGX(u32 rt, u32 ra, u32 rb) + void BGX(u32 rt, u32 ra, u32 rb) //nf { WRAPPER_BEGIN(rt, ra, rb, zz); s64 nResult; @@ -2299,7 +2453,7 @@ private: { UNIMPLEMENTED(); } - void DFTSV(u32 rt, u32 ra, s32 i7) + void DFTSV(u32 rt, u32 ra, s32 i7) //nf { WRAPPER_BEGIN(rt, ra, i7, zz); const u64 DoubleExpMask = 0x7ff0000000000000; @@ -2721,12 +2875,7 @@ private: 
else { const XmmLink& vr = XmmAlloc(rt); - __m128i fsmbi_mask; - for (u32 j = 0; j < 16; j++) - { - fsmbi_mask.m128i_i8[j] = ((i16 >> j) & 0x1) ? 0xff : 0; - } - c.movdqa(vr.get(), XmmConst(fsmbi_mask)); + c.movdqa(vr.get(), g_imm_xmm(fsmb_table[i16 & 0xffff])); XmmFinalize(vr, rt); } LOG_OPCODE(); diff --git a/rpcs3/Emu/Cell/SPURecompilerCore.cpp b/rpcs3/Emu/Cell/SPURecompilerCore.cpp index 5a6cd5e880..56d7e65d38 100644 --- a/rpcs3/Emu/Cell/SPURecompilerCore.cpp +++ b/rpcs3/Emu/Cell/SPURecompilerCore.cpp @@ -4,6 +4,8 @@ #include "SPUInterpreter.h" #include "SPURecompiler.h" +static const g_imm_table_struct g_imm_table; + SPURecompilerCore::SPURecompilerCore(SPUThread& cpu) : m_enc(new SPURecompiler(cpu, *this)) , inter(new SPUInterpreter(cpu)) @@ -58,16 +60,21 @@ void SPURecompilerCore::Compile(u16 pos) compiler.alloc(imm_var); m_enc->imm_var = &imm_var; - GpVar pos_var(compiler, kVarTypeUInt32, "pos"); - compiler.setArg(3, pos_var); - m_enc->pos_var = &pos_var; + GpVar g_imm_var(compiler, kVarTypeIntPtr, "g_imm"); + compiler.setArg(3, g_imm_var); + compiler.alloc(g_imm_var); + m_enc->g_imm_var = &g_imm_var; + GpVar pos_var(compiler, kVarTypeUInt32, "pos"); + m_enc->pos_var = &pos_var; GpVar addr_var(compiler, kVarTypeUInt32, "addr"); m_enc->addr = &addr_var; GpVar qw0_var(compiler, kVarTypeUInt64, "qw0"); m_enc->qw0 = &qw0_var; GpVar qw1_var(compiler, kVarTypeUInt64, "qw1"); m_enc->qw1 = &qw1_var; + GpVar qw2_var(compiler, kVarTypeUInt64, "qw2"); + m_enc->qw2 = &qw2_var; for (u32 i = 0; i < 16; i++) { @@ -198,7 +205,7 @@ u8 SPURecompilerCore::DecodeMemory(const u64 address) return 0; } - typedef u32(*Func)(void* _cpu, void* _ls, const void* _imm, u32 _pos); + typedef u32(*Func)(const void* _cpu, const void* _ls, const void* _imm, const void* _g_imm); Func func = asmjit_cast(entry[pos].pointer); @@ -215,7 +222,7 @@ u8 SPURecompilerCore::DecodeMemory(const u64 address) } u32 res = pos; - res = func(cpu, &Memory[m_offset], imm_table.data(), res); + res = func(cpu, 
&Memory[m_offset], imm_table.data(), &g_imm_table); if (res > 0xffff) {