From 6ea50567b67903a1e097ac2b68c112e1d28cea99 Mon Sep 17 00:00:00 2001
From: S Gopal Rajagopal
Date: Sat, 29 Nov 2014 01:39:59 +0530
Subject: [PATCH] PPU: Implemented some instructions in the recompiler. Fixed some bugs in the interpreter.

---
 rpcs3/Emu/Cell/PPUInterpreter.h           | 172 +++++++++++++---
 rpcs3/Emu/Cell/PPULLVMRecompiler.cpp      | 137 +++++++++++++++--
 rpcs3/Emu/Cell/PPULLVMRecompiler.h        |   6 +-
 rpcs3/Emu/Cell/PPULLVMRecompilerTests.cpp |  11 +-
 4 files changed, 241 insertions(+), 85 deletions(-)

diff --git a/rpcs3/Emu/Cell/PPUInterpreter.h b/rpcs3/Emu/Cell/PPUInterpreter.h
index f96d634f23..76a9991a62 100644
--- a/rpcs3/Emu/Cell/PPUInterpreter.h
+++ b/rpcs3/Emu/Cell/PPUInterpreter.h
@@ -834,11 +834,11 @@ private:
     }
     void VCTSXS(u32 vd, u32 uimm5, u32 vb)
     {
-        int nScale = 1 << uimm5;
-
+        u32 nScale = 1 << uimm5;
+
         for (uint w = 0; w < 4; w++)
-        {
-            float result = CPU.VPR[vb]._f[w] * nScale;
+        {
+            double result = (double)CPU.VPR[vb]._f[w] * nScale;
 
             if (result > 0x7fffffff)
             {
@@ -856,12 +856,12 @@ private:
     }
     void VCTUXS(u32 vd, u32 uimm5, u32 vb)
     {
-        int nScale = 1 << uimm5;
+        u32 nScale = 1 << uimm5;
 
         for (uint w = 0; w < 4; w++)
         {
             // C rounding = Round towards 0
-            float result = CPU.VPR[vb]._f[w] * nScale;
+            double result = (double)CPU.VPR[vb]._f[w] * nScale;
 
             if (result > 0xffffffffu)
             {
@@ -1078,26 +1078,32 @@ private:
     }
     void VMRGLB(u32 vd, u32 va, u32 vb)
     {
+        u128 VA = CPU.VPR[va];
+        u128 VB = CPU.VPR[vb];
         for (uint h = 0; h < 8; h++)
         {
-            CPU.VPR[vd]._u8[15 - h*2] = CPU.VPR[va]._u8[7 - h];
-            CPU.VPR[vd]._u8[15 - h*2 - 1] = CPU.VPR[vb]._u8[7 - h];
+            CPU.VPR[vd]._u8[15 - h*2] = VA._u8[7 - h];
+            CPU.VPR[vd]._u8[15 - h*2 - 1] = VB._u8[7 - h];
         }
     }
     void VMRGLH(u32 vd, u32 va, u32 vb)
     {
+        u128 VA = CPU.VPR[va];
+        u128 VB = CPU.VPR[vb];
         for (uint w = 0; w < 4; w++)
         {
-            CPU.VPR[vd]._u16[7 - w*2] = CPU.VPR[va]._u16[3 - w];
-            CPU.VPR[vd]._u16[7 - w*2 - 1] = CPU.VPR[vb]._u16[3 - w];
+            CPU.VPR[vd]._u16[7 - w*2] = VA._u16[3 - w];
+            CPU.VPR[vd]._u16[7 - w*2 - 1] = VB._u16[3 - w];
         }
     }
     void VMRGLW(u32 vd, u32 va, u32 vb)
     {
+        u128 VA = CPU.VPR[va];
+        u128 VB = CPU.VPR[vb];
         for (uint d = 0; d < 2; d++)
         {
-            CPU.VPR[vd]._u32[3 - d*2] = CPU.VPR[va]._u32[1 - d];
-            CPU.VPR[vd]._u32[3 - d*2 - 1] = CPU.VPR[vb]._u32[1 - d];
+            CPU.VPR[vd]._u32[3 - d*2] = VA._u32[1 - d];
+            CPU.VPR[vd]._u32[3 - d*2 - 1] = VB._u32[1 - d];
         }
     }
     void VMSUMMBM(u32 vd, u32 va, u32 vb, u32 vc) //nf
@@ -1168,7 +1174,7 @@ private:
 
             for (uint b = 0; b < 4; b++)
             {
-                result += CPU.VPR[va]._u8[w*4 + b] * CPU.VPR[vb]._u8[w*4 + b];
+                result += (u32)CPU.VPR[va]._u8[w*4 + b] * (u32)CPU.VPR[vb]._u8[w*4 + b];
             }
 
             result += CPU.VPR[vc]._u32[w];
@@ -1183,7 +1189,7 @@ private:
 
             for (uint h = 0; h < 2; h++)
             {
-                result += CPU.VPR[va]._u16[w*2 + h] * CPU.VPR[vb]._u16[w*2 + h];
+                result += (u32)CPU.VPR[va]._u16[w*2 + h] * (u32)CPU.VPR[vb]._u16[w*2 + h];
             }
 
             result += CPU.VPR[vc]._u32[w];
@@ -1199,7 +1205,7 @@ private:
 
             for (uint h = 0; h < 2; h++)
             {
-                result += CPU.VPR[va]._u16[w*2 + h] * CPU.VPR[vb]._u16[w*2 + h];
+                result += (u64)CPU.VPR[va]._u16[w*2 + h] * (u64)CPU.VPR[vb]._u16[w*2 + h];
             }
 
             result += CPU.VPR[vc]._u32[w];
@@ -1307,16 +1313,18 @@ private:
     }
     void VPKPX(u32 vd, u32 va, u32 vb)
     {
+        u128 VA = CPU.VPR[va];
+        u128 VB = CPU.VPR[vb];
         for (uint h = 0; h < 4; h++)
         {
-            u16 bb7 = CPU.VPR[vb]._u8[15 - (h*4 + 0)] & 0x1;
-            u16 bb8 = CPU.VPR[vb]._u8[15 - (h*4 + 1)] >> 3;
-            u16 bb16 = CPU.VPR[vb]._u8[15 - (h*4 + 2)] >> 3;
-            u16 bb24 = CPU.VPR[vb]._u8[15 - (h*4 + 3)] >> 3;
-            u16 ab7 = CPU.VPR[va]._u8[15 - (h*4 + 0)] & 0x1;
-            u16 ab8 = CPU.VPR[va]._u8[15 - (h*4 + 1)] >> 3;
-            u16 ab16 = CPU.VPR[va]._u8[15 - (h*4 + 2)] >> 3;
-            u16 ab24 = CPU.VPR[va]._u8[15 - (h*4 + 3)] >> 3;
+            u16 bb7 = VB._u8[15 - (h*4 + 0)] & 0x1;
+            u16 bb8 = VB._u8[15 - (h*4 + 1)] >> 3;
+            u16 bb16 = VB._u8[15 - (h*4 + 2)] >> 3;
+            u16 bb24 = VB._u8[15 - (h*4 + 3)] >> 3;
+            u16 ab7 = VA._u8[15 - (h*4 + 0)] & 0x1;
+            u16 ab8 = VA._u8[15 - (h*4 + 1)] >> 3;
+            u16 ab16 = VA._u8[15 - (h*4 + 2)] >> 3;
+            u16 ab24 = VA._u8[15 - (h*4 + 3)] >> 3;
 
             CPU.VPR[vd]._u16[3 - h] = (bb7 << 15) | (bb8 << 10) | (bb16 << 5) | bb24;
             CPU.VPR[vd]._u16[4 + (3 - h)] = (ab7 << 15) | (ab8 << 10) | (ab16 << 5) | ab24;
@@ -1324,9 +1332,11 @@ private:
     }
     void VPKSHSS(u32 vd, u32 va, u32 vb) //nf
     {
+        u128 VA = CPU.VPR[va];
+        u128 VB = CPU.VPR[vb];
         for (uint b = 0; b < 8; b++)
         {
-            s16 result = CPU.VPR[va]._s16[b];
+            s16 result = VA._s16[b];
 
             if (result > INT8_MAX)
             {
@@ -1341,7 +1351,7 @@ private:
 
             CPU.VPR[vd]._s8[b+8] = (s8)result;
 
-            result = CPU.VPR[vb]._s16[b];
+            result = VB._s16[b];
 
             if (result > INT8_MAX)
             {
@@ -1359,9 +1369,11 @@ private:
     }
     void VPKSHUS(u32 vd, u32 va, u32 vb)
     {
+        u128 VA = CPU.VPR[va];
+        u128 VB = CPU.VPR[vb];
         for (uint b = 0; b < 8; b++)
         {
-            s16 result = CPU.VPR[va]._s16[b];
+            s16 result = VA._s16[b];
 
             if (result > UINT8_MAX)
             {
@@ -1376,7 +1388,7 @@ private:
 
             CPU.VPR[vd]._u8[b+8] = (u8)result;
 
-            result = CPU.VPR[vb]._s16[b];
+            result = VB._s16[b];
 
             if (result > UINT8_MAX)
             {
@@ -1394,9 +1406,11 @@ private:
     }
     void VPKSWSS(u32 vd, u32 va, u32 vb)
     {
+        u128 VA = CPU.VPR[va];
+        u128 VB = CPU.VPR[vb];
         for (uint h = 0; h < 4; h++)
         {
-            s32 result = CPU.VPR[va]._s32[h];
+            s32 result = VA._s32[h];
 
             if (result > INT16_MAX)
             {
@@ -1411,7 +1425,7 @@ private:
 
             CPU.VPR[vd]._s16[h+4] = result;
 
-            result = CPU.VPR[vb]._s32[h];
+            result = VB._s32[h];
 
             if (result > INT16_MAX)
             {
@@ -1429,9 +1443,11 @@ private:
     }
     void VPKSWUS(u32 vd, u32 va, u32 vb) //nf
     {
+        u128 VA = CPU.VPR[va];
+        u128 VB = CPU.VPR[vb];
         for (uint h = 0; h < 4; h++)
         {
-            s32 result = CPU.VPR[va]._s32[h];
+            s32 result = VA._s32[h];
 
             if (result > UINT16_MAX)
             {
@@ -1446,7 +1462,7 @@ private:
 
             CPU.VPR[vd]._u16[h+4] = result;
 
-            result = CPU.VPR[vb]._s32[h];
+            result = VB._s32[h];
 
             if (result > UINT16_MAX)
             {
@@ -1464,17 +1480,21 @@ private:
     }
     void VPKUHUM(u32 vd, u32 va, u32 vb) //nf
    {
+        u128 VA = CPU.VPR[va];
+        u128 VB = CPU.VPR[vb];
         for (uint b = 0; b < 8; b++)
         {
-            CPU.VPR[vd]._u8[b+8] = CPU.VPR[va]._u8[b*2];
-            CPU.VPR[vd]._u8[b ] = CPU.VPR[vb]._u8[b*2];
+            CPU.VPR[vd]._u8[b+8] = VA._u8[b*2];
+            CPU.VPR[vd]._u8[b ] = VB._u8[b*2];
         }
     }
     void VPKUHUS(u32 vd, u32 va, u32 vb)
     {
+        u128 VA = CPU.VPR[va];
+        u128 VB = CPU.VPR[vb];
         for (uint b = 0; b < 8; b++)
         {
-            u16 result = CPU.VPR[va]._u16[b];
+            u16 result = VA._u16[b];
 
             if (result > UINT8_MAX)
             {
@@ -1484,7 +1504,7 @@ private:
 
             CPU.VPR[vd]._u8[b+8] = (u8)result;
 
-            result = CPU.VPR[vb]._u16[b];
+            result = VB._u16[b];
 
             if (result > UINT8_MAX)
             {
@@ -1497,17 +1517,21 @@ private:
     }
     void VPKUWUM(u32 vd, u32 va, u32 vb)
     {
+        u128 VA = CPU.VPR[va];
+        u128 VB = CPU.VPR[vb];
         for (uint h = 0; h < 4; h++)
         {
-            CPU.VPR[vd]._u16[h+4] = CPU.VPR[va]._u16[h*2];
-            CPU.VPR[vd]._u16[h ] = CPU.VPR[vb]._u16[h*2];
+            CPU.VPR[vd]._u16[h+4] = VA._u16[h*2];
+            CPU.VPR[vd]._u16[h ] = VB._u16[h*2];
         }
     }
     void VPKUWUS(u32 vd, u32 va, u32 vb) //nf
     {
+        u128 VA = CPU.VPR[va];
+        u128 VB = CPU.VPR[vb];
         for (uint h = 0; h < 4; h++)
         {
-            u32 result = CPU.VPR[va]._u32[h];
+            u32 result = VA._u32[h];
 
             if (result > UINT16_MAX)
             {
@@ -1517,7 +1541,7 @@ private:
 
             CPU.VPR[vd]._u16[h+4] = result;
 
-            result = CPU.VPR[vb]._u32[h];
+            result = VB._u32[h];
 
             if (result > UINT16_MAX)
             {
@@ -1539,30 +1563,28 @@ private:
     {
         for (uint w = 0; w < 4; w++)
         {
-            CPU.VPR[vd]._f[w] = floor(CPU.VPR[vb]._f[w]);
+            CPU.VPR[vd]._f[w] = floorf(CPU.VPR[vb]._f[w]);
         }
     }
     void VRFIN(u32 vd, u32 vb)
     {
         for (uint w = 0; w < 4; w++)
         {
-            CPU.VPR[vd]._f[w] = floor(CPU.VPR[vb]._f[w] + 0.5f);
+            CPU.VPR[vd]._f[w] = nearbyintf(CPU.VPR[vb]._f[w]);
         }
     }
     void VRFIP(u32 vd, u32 vb)
     {
         for (uint w = 0; w < 4; w++)
         {
-            CPU.VPR[vd]._f[w] = ceil(CPU.VPR[vb]._f[w]);
+            CPU.VPR[vd]._f[w] = ceilf(CPU.VPR[vb]._f[w]);
         }
     }
     void VRFIZ(u32 vd, u32 vb)
     {
         for (uint w = 0; w < 4; w++)
         {
-            float f;
-            modff(CPU.VPR[vb]._f[w], &f);
-            CPU.VPR[vd]._f[w] = f;
+            CPU.VPR[vd]._f[w] = truncf(CPU.VPR[vb]._f[w]);
         }
     }
     void VRLB(u32 vd, u32 va, u32 vb) //nf
@@ -1605,12 +1627,13 @@ private:
     }
     void VSL(u32 vd, u32 va, u32 vb) //nf
     {
+        u128 VA = CPU.VPR[va];
         u8 sh = CPU.VPR[vb]._u8[0] & 0x7;
 
-        CPU.VPR[vd]._u8[0] = CPU.VPR[va]._u8[0] << sh;
+        CPU.VPR[vd]._u8[0] = VA._u8[0] << sh;
         for (uint b = 1; b < 16; b++)
         {
-            CPU.VPR[vd]._u8[b] = (CPU.VPR[va]._u8[b] << sh) | (CPU.VPR[va]._u8[b-1] >> (8 - sh));
+            CPU.VPR[vd]._u8[b] = (VA._u8[b] << sh) | (VA._u8[b-1] >> (8 - sh));
         }
     }
     void VSLB(u32 vd, u32 va, u32 vb)
@@ -1635,18 +1658,19 @@ private:
     {
         for (uint h = 0; h < 8; h++)
         {
-            CPU.VPR[vd]._u16[h] = CPU.VPR[va]._u16[h] << (CPU.VPR[vb]._u8[h*2] & 0xf);
+            CPU.VPR[vd]._u16[h] = CPU.VPR[va]._u16[h] << (CPU.VPR[vb]._u16[h] & 0xf);
         }
     }
     void VSLO(u32 vd, u32 va, u32 vb)
     {
+        u128 VA = CPU.VPR[va];
         u8 nShift = (CPU.VPR[vb]._u8[0] >> 3) & 0xf;
 
         CPU.VPR[vd].clear();
 
         for (u8 b = 0; b < 16 - nShift; b++)
         {
-            CPU.VPR[vd]._u8[15 - b] = CPU.VPR[va]._u8[15 - (b + nShift)];
+            CPU.VPR[vd]._u8[15 - b] = VA._u8[15 - (b + nShift)];
         }
     }
     void VSLW(u32 vd, u32 va, u32 vb)
@@ -1710,12 +1734,13 @@ private:
     }
     void VSR(u32 vd, u32 va, u32 vb) //nf
     {
+        u128 VA = CPU.VPR[va];
         u8 sh = CPU.VPR[vb]._u8[0] & 0x7;
 
-        CPU.VPR[vd]._u8[15] = CPU.VPR[va]._u8[15] >> sh;
+        CPU.VPR[vd]._u8[15] = VA._u8[15] >> sh;
         for (uint b = 14; ~b; b--)
         {
-            CPU.VPR[vd]._u8[b] = (CPU.VPR[va]._u8[b] >> sh) | (CPU.VPR[va]._u8[b+1] << (8 - sh));
+            CPU.VPR[vd]._u8[b] = (VA._u8[b] >> sh) | (VA._u8[b+1] << (8 - sh));
         }
     }
     void VSRAB(u32 vd, u32 va, u32 vb) //nf
@@ -1729,14 +1754,14 @@ private:
     {
         for (uint h = 0; h < 8; h++)
         {
-            CPU.VPR[vd]._s16[h] = CPU.VPR[va]._s16[h] >> (CPU.VPR[vb]._u8[h*2] & 0xf);
+            CPU.VPR[vd]._s16[h] = CPU.VPR[va]._s16[h] >> (CPU.VPR[vb]._u16[h] & 0xf);
         }
     }
     void VSRAW(u32 vd, u32 va, u32 vb)
     {
         for (uint w = 0; w < 4; w++)
         {
-            CPU.VPR[vd]._s32[w] = CPU.VPR[va]._s32[w] >> (CPU.VPR[vb]._u8[w*4] & 0x1f);
+            CPU.VPR[vd]._s32[w] = CPU.VPR[va]._s32[w] >> (CPU.VPR[vb]._u32[w] & 0x1f);
         }
     }
     void VSRB(u32 vd, u32 va, u32 vb)
@@ -1750,25 +1775,26 @@ private:
     {
         for (uint h = 0; h < 8; h++)
         {
-            CPU.VPR[vd]._u16[h] = CPU.VPR[va]._u16[h] >> (CPU.VPR[vb]._u8[h*2] & 0xf);
+            CPU.VPR[vd]._u16[h] = CPU.VPR[va]._u16[h] >> (CPU.VPR[vb]._u16[h] & 0xf);
         }
     }
     void VSRO(u32 vd, u32 va, u32 vb)
     {
+        u128 VA = CPU.VPR[va];
         u8 nShift = (CPU.VPR[vb]._u8[0] >> 3) & 0xf;
 
         CPU.VPR[vd].clear();
 
         for (u8 b = 0; b < 16 - nShift; b++)
         {
-            CPU.VPR[vd]._u8[b] = CPU.VPR[va]._u8[b + nShift];
+            CPU.VPR[vd]._u8[b] = VA._u8[b + nShift];
         }
     }
     void VSRW(u32 vd, u32 va, u32 vb)
     {
         for (uint w = 0; w < 4; w++)
         {
-            CPU.VPR[vd]._u32[w] = CPU.VPR[va]._u32[w] >> (CPU.VPR[vb]._u8[w*4] & 0x1f);
+            CPU.VPR[vd]._u32[w] = CPU.VPR[va]._u32[w] >> (CPU.VPR[vb]._u32[w] & 0x1f);
         }
     }
     void VSUBCUW(u32 vd, u32 va, u32 vb) //nf
@@ -2029,50 +2055,56 @@ private:
     }
     void VUPKHPX(u32 vd, u32 vb)
     {
+        u128 VB = CPU.VPR[vb];
         for (uint w = 0; w < 4; w++)
         {
-            CPU.VPR[vd]._s8[(3 - w)*4 + 3] = CPU.VPR[vb]._s8[w*2 + 0] >> 7; // signed shift sign extends
-            CPU.VPR[vd]._u8[(3 - w)*4 + 2] = (CPU.VPR[vb]._u8[w*2 + 0] >> 2) & 0x1f;
-            CPU.VPR[vd]._u8[(3 - w)*4 + 1] = ((CPU.VPR[vb]._u8[w*2 + 0] & 0x3) << 3) | ((CPU.VPR[vb]._u8[w*2 + 1] >> 5) & 0x7);
-            CPU.VPR[vd]._u8[(3 - w)*4 + 0] = CPU.VPR[vb]._u8[w*2 + 1] & 0x1f;
+            CPU.VPR[vd]._s8[w*4 + 3] = VB._s8[8 + w*2 + 1] >> 7; // signed shift sign extends
+            CPU.VPR[vd]._u8[w*4 + 2] = (VB._u8[8 + w*2 + 1] >> 2) & 0x1f;
+            CPU.VPR[vd]._u8[w*4 + 1] = ((VB._u8[8 + w*2 + 1] & 0x3) << 3) | ((VB._u8[8 + w*2 + 0] >> 5) & 0x7);
+            CPU.VPR[vd]._u8[w*4 + 0] = VB._u8[8 + w*2 + 0] & 0x1f;
         }
     }
     void VUPKHSB(u32 vd, u32 vb)
     {
+        u128 VB = CPU.VPR[vb];
         for (uint h = 0; h < 8; h++)
         {
-            CPU.VPR[vd]._s16[h] = CPU.VPR[vb]._s8[h];
+            CPU.VPR[vd]._s16[h] = VB._s8[8 + h];
         }
     }
     void VUPKHSH(u32 vd, u32 vb)
     {
+        u128 VB = CPU.VPR[vb];
         for (uint w = 0; w < 4; w++)
         {
-            CPU.VPR[vd]._s32[w] = CPU.VPR[vb]._s16[w];
+            CPU.VPR[vd]._s32[w] = VB._s16[4 + w];
         }
     }
     void VUPKLPX(u32 vd, u32 vb)
     {
+        u128 VB = CPU.VPR[vb];
         for (uint w = 0; w < 4; w++)
         {
-            CPU.VPR[vd]._s8[(3 - w)*4 + 3] = CPU.VPR[vb]._s8[8 + w*2 + 0] >> 7; // signed shift sign extends
-            CPU.VPR[vd]._u8[(3 - w)*4 + 2] = (CPU.VPR[vb]._u8[8 + w*2 + 0] >> 2) & 0x1f;
-            CPU.VPR[vd]._u8[(3 - w)*4 + 1] = ((CPU.VPR[vb]._u8[8 + w*2 + 0] & 0x3) << 3) | ((CPU.VPR[vb]._u8[8 + w*2 + 1] >> 5) & 0x7);
-            CPU.VPR[vd]._u8[(3 - w)*4 + 0] = CPU.VPR[vb]._u8[8 + w*2 + 1] & 0x1f;
+            CPU.VPR[vd]._s8[w*4 + 3] = VB._s8[w*2 + 1] >> 7; // signed shift sign extends
+            CPU.VPR[vd]._u8[w*4 + 2] = (VB._u8[w*2 + 1] >> 2) & 0x1f;
+            CPU.VPR[vd]._u8[w*4 + 1] = ((VB._u8[w*2 + 1] & 0x3) << 3) | ((VB._u8[w*2 + 0] >> 5) & 0x7);
+            CPU.VPR[vd]._u8[w*4 + 0] = VB._u8[w*2 + 0] & 0x1f;
         }
     }
     void VUPKLSB(u32 vd, u32 vb) //nf
     {
+        u128 VB = CPU.VPR[vb];
         for (uint h = 0; h < 8; h++)
         {
-            CPU.VPR[vd]._s16[h] = CPU.VPR[vb]._s8[8 + h];
+            CPU.VPR[vd]._s16[h] = VB._s8[h];
         }
     }
     void VUPKLSH(u32 vd, u32 vb)
     {
+        u128 VB = CPU.VPR[vb];
         for (uint w = 0; w < 4; w++)
         {
-            CPU.VPR[vd]._s32[w] = CPU.VPR[vb]._s16[4 + w];
+            CPU.VPR[vd]._s32[w] = VB._s16[w];
         }
     }
     void VXOR(u32 vd, u32 va, u32 vb)
@@ -2792,7 +2824,7 @@ private:
             return;
         }
         const u8 eb = (addr & 0xf) >> 1;
-        vm::write16((u32)addr, CPU.VPR[vs]._u16[7 - eb]);
+        vm::write16((u32)addr & 0xFFFFFFFE, CPU.VPR[vs]._u16[7 - eb]);
     }
     void STDUX(u32 rs, u32 ra, u32 rb)
     {
@@ -2828,7 +2860,7 @@ private:
             return;
         }
         const u8 eb = (addr & 0xf) >> 2;
-        vm::write32((u32)addr, CPU.VPR[vs]._u32[3 - eb]);
+        vm::write32((u32)addr & 0xFFFFFFFC, CPU.VPR[vs]._u32[3 - eb]);
     }
     void ADDZE(u32 rd, u32 ra, u32 oe, bool rc)
     {
diff --git a/rpcs3/Emu/Cell/PPULLVMRecompiler.cpp b/rpcs3/Emu/Cell/PPULLVMRecompiler.cpp
index 85e75b9a03..b88489bc5e 100644
--- a/rpcs3/Emu/Cell/PPULLVMRecompiler.cpp
+++ b/rpcs3/Emu/Cell/PPULLVMRecompiler.cpp
@@ -1016,7 +1016,26 @@ void Compiler::VMSUMSHM(u32 vd, u32 va, u32 vb, u32 vc) {
 }
 
 void Compiler::VMSUMSHS(u32 vd, u32 va, u32 vb, u32 vc) {
-    InterpreterCall("VMSUMSHS", &PPUInterpreter::VMSUMSHS, vd, va, vb, vc);
+    auto va_v8i16 = GetVrAsIntVec(va, 16);
+    auto vb_v8i16 = GetVrAsIntVec(vb, 16);
+    auto vc_v4i32 = GetVrAsIntVec(vc, 32);
+    auto res_v4i32 = (Value *)m_ir_builder->CreateCall2(Intrinsic::getDeclaration(m_module, Intrinsic::x86_sse2_pmadd_wd), va_v8i16, vb_v8i16);
+
+    auto tmp1_v4i32 = m_ir_builder->CreateLShr(vc_v4i32, 31);
+    tmp1_v4i32 = m_ir_builder->CreateAdd(tmp1_v4i32, m_ir_builder->CreateVectorSplat(4, m_ir_builder->getInt32(0x7FFFFFFF)));
+    auto tmp1_v16i8 = m_ir_builder->CreateBitCast(tmp1_v4i32, VectorType::get(m_ir_builder->getInt8Ty(), 16));
+    auto tmp2_v4i32 = m_ir_builder->CreateXor(vc_v4i32, res_v4i32);
+    tmp2_v4i32 = m_ir_builder->CreateNot(tmp2_v4i32);
+    auto sum_v4i32 = m_ir_builder->CreateAdd(vc_v4i32, res_v4i32);
+    auto sum_v16i8 = m_ir_builder->CreateBitCast(sum_v4i32, VectorType::get(m_ir_builder->getInt8Ty(), 16));
+    auto tmp3_v4i32 = m_ir_builder->CreateXor(vc_v4i32, sum_v4i32);
+    tmp3_v4i32 = m_ir_builder->CreateAnd(tmp2_v4i32, tmp3_v4i32);
+    tmp3_v4i32 = m_ir_builder->CreateAShr(tmp3_v4i32, 31);
+    auto tmp3_v16i8 = m_ir_builder->CreateBitCast(tmp3_v4i32, VectorType::get(m_ir_builder->getInt8Ty(), 16));
+    auto res_v16i8 = m_ir_builder->CreateCall3(Intrinsic::getDeclaration(m_module, Intrinsic::x86_sse41_pblendvb), sum_v16i8, tmp1_v16i8, tmp3_v16i8);
+    SetVr(vd, res_v16i8);
+
+    // TODO: Set VSCR.SAT
 }
 
 void Compiler::VMSUMUBM(u32 vd, u32 va, u32 vb, u32 vc) {
@@ -1074,7 +1093,31 @@ void Compiler::VMSUMUHM(u32 vd, u32 va, u32 vb, u32 vc) {
 }
 
 void Compiler::VMSUMUHS(u32 vd, u32 va, u32 vb, u32 vc) {
-    InterpreterCall("VMSUMUHS", &PPUInterpreter::VMSUMUHS, vd, va, vb, vc);
+    auto va_v8i16 = GetVrAsIntVec(va, 16);
+    auto vb_v8i16 = GetVrAsIntVec(vb, 16);
+    auto va_v8i32 = m_ir_builder->CreateZExt(va_v8i16, VectorType::get(m_ir_builder->getInt32Ty(), 8));
+    auto vb_v8i32 = m_ir_builder->CreateZExt(vb_v8i16, VectorType::get(m_ir_builder->getInt32Ty(), 8));
+    auto tmp_v8i32 = m_ir_builder->CreateMul(va_v8i32, vb_v8i32);
+
+    auto undef_v8i32 = UndefValue::get(VectorType::get(m_ir_builder->getInt32Ty(), 8));
+    u32 mask1_v4i32[4] = {0, 2, 4, 6};
+    auto tmp1_v4i32 = m_ir_builder->CreateShuffleVector(tmp_v8i32, undef_v8i32, ConstantDataVector::get(m_ir_builder->getContext(), mask1_v4i32));
+    u32 mask2_v4i32[4] = {1, 3, 5, 7};
+    auto tmp2_v4i32 = m_ir_builder->CreateShuffleVector(tmp_v8i32, undef_v8i32, ConstantDataVector::get(m_ir_builder->getContext(), mask2_v4i32));
+
+    auto vc_v4i32 = GetVrAsIntVec(vc, 32);
+    auto res_v4i32 = m_ir_builder->CreateAdd(tmp1_v4i32, tmp2_v4i32);
+    auto cmp_v4i1 = m_ir_builder->CreateICmpULT(res_v4i32, tmp1_v4i32);
+    auto cmp_v4i32 = m_ir_builder->CreateSExt(cmp_v4i1, VectorType::get(m_ir_builder->getInt32Ty(), 4));
+    res_v4i32 = m_ir_builder->CreateOr(res_v4i32, cmp_v4i32);
+    res_v4i32 = m_ir_builder->CreateAdd(res_v4i32, vc_v4i32);
+    cmp_v4i1 = m_ir_builder->CreateICmpULT(res_v4i32, vc_v4i32);
+    cmp_v4i32 = m_ir_builder->CreateSExt(cmp_v4i1, VectorType::get(m_ir_builder->getInt32Ty(), 4));
+    res_v4i32 = m_ir_builder->CreateOr(res_v4i32, cmp_v4i32);
+
+    SetVr(vd, res_v4i32);
+
+    // TODO: Set VSCR.SAT
 }
 
 void Compiler::VMULESB(u32 vd, u32 va, u32 vb) {
@@ -1204,7 +1247,37 @@ void Compiler::VPERM(u32 vd, u32 va, u32 vb, u32 vc) {
 }
 
 void Compiler::VPKPX(u32 vd, u32 va, u32 vb) {
-    InterpreterCall("VPKPX", &PPUInterpreter::VPKPX, vd, va, vb);
+    auto va_v4i32 = GetVrAsIntVec(va, 32);
+    auto vb_v4i32 = GetVrAsIntVec(vb, 32);
+
+    auto tmpa_v4i32 = m_ir_builder->CreateShl(va_v4i32, m_ir_builder->CreateVectorSplat(4, m_ir_builder->getInt32(7)));
+    tmpa_v4i32 = m_ir_builder->CreateAnd(tmpa_v4i32, m_ir_builder->CreateVectorSplat(4, m_ir_builder->getInt32(0xFC000000)));
+    va_v4i32 = m_ir_builder->CreateShl(va_v4i32, m_ir_builder->CreateVectorSplat(4, m_ir_builder->getInt32(10)));
+    va_v4i32 = m_ir_builder->CreateAnd(va_v4i32, m_ir_builder->CreateVectorSplat(4, m_ir_builder->getInt32(~0xFC000000)));
+    tmpa_v4i32 = m_ir_builder->CreateOr(tmpa_v4i32, va_v4i32);
+    tmpa_v4i32 = m_ir_builder->CreateAnd(tmpa_v4i32, m_ir_builder->CreateVectorSplat(4, m_ir_builder->getInt32(0xFFE00000)));
+    va_v4i32 = m_ir_builder->CreateShl(va_v4i32, m_ir_builder->CreateVectorSplat(4, m_ir_builder->getInt32(3)));
+    va_v4i32 = m_ir_builder->CreateAnd(va_v4i32, m_ir_builder->CreateVectorSplat(4, m_ir_builder->getInt32(~0xFFE00000)));
+    tmpa_v4i32 = m_ir_builder->CreateOr(tmpa_v4i32, va_v4i32);
+    auto tmpa_v8i16 = m_ir_builder->CreateBitCast(tmpa_v4i32, VectorType::get(m_ir_builder->getInt16Ty(), 8));
+
+    auto tmpb_v4i32 = m_ir_builder->CreateShl(vb_v4i32, m_ir_builder->CreateVectorSplat(4, m_ir_builder->getInt32(7)));
+    tmpb_v4i32 = m_ir_builder->CreateAnd(tmpb_v4i32, m_ir_builder->CreateVectorSplat(4, m_ir_builder->getInt32(0xFC000000)));
+    vb_v4i32 = m_ir_builder->CreateShl(vb_v4i32, m_ir_builder->CreateVectorSplat(4, m_ir_builder->getInt32(10)));
+    vb_v4i32 = m_ir_builder->CreateAnd(vb_v4i32, m_ir_builder->CreateVectorSplat(4, m_ir_builder->getInt32(~0xFC000000)));
+    tmpb_v4i32 = m_ir_builder->CreateOr(tmpb_v4i32, vb_v4i32);
+    tmpb_v4i32 = m_ir_builder->CreateAnd(tmpb_v4i32, m_ir_builder->CreateVectorSplat(4, m_ir_builder->getInt32(0xFFE00000)));
+    vb_v4i32 = m_ir_builder->CreateShl(vb_v4i32, m_ir_builder->CreateVectorSplat(4, m_ir_builder->getInt32(3)));
+    vb_v4i32 = m_ir_builder->CreateAnd(vb_v4i32, m_ir_builder->CreateVectorSplat(4, m_ir_builder->getInt32(~0xFFE00000)));
+    tmpb_v4i32 = m_ir_builder->CreateOr(tmpb_v4i32, vb_v4i32);
+    auto tmpb_v8i16 = m_ir_builder->CreateBitCast(tmpb_v4i32, VectorType::get(m_ir_builder->getInt16Ty(), 8));
+
+    u32 mask_v8i32[8] = {1, 3, 5, 7, 9, 11, 13, 15};
+    auto res_v8i16 = m_ir_builder->CreateShuffleVector(tmpb_v8i16, tmpa_v8i16, ConstantDataVector::get(m_ir_builder->getContext(), mask_v8i32));
+
+    SetVr(vd, res_v8i16);
+
+    // TODO: Implement with pext on CPUs with BMI
 }
 
 void Compiler::VPKSHSS(u32 vd, u32 va, u32 vb) {
@@ -1669,27 +1742,69 @@ void Compiler::VSUM4UBS(u32 vd, u32 va, u32 vb) {
 }
 
 void Compiler::VUPKHPX(u32 vd, u32 vb) {
-    InterpreterCall("VUPKHPX", &PPUInterpreter::VUPKHPX, vd, vb);
+    auto vb_v8i16 = GetVrAsIntVec(vb, 16);
+    u32 mask_v8i32[8] = { 4, 4, 5, 5, 6, 6, 7, 7 };
+    vb_v8i16 = m_ir_builder->CreateShuffleVector(vb_v8i16, UndefValue::get(VectorType::get(m_ir_builder->getInt16Ty(), 8)), ConstantDataVector::get(m_ir_builder->getContext(), mask_v8i32));
+
+    auto vb_v4i32 = m_ir_builder->CreateBitCast(vb_v8i16, VectorType::get(m_ir_builder->getInt32Ty(), 4));
+    vb_v4i32 = m_ir_builder->CreateAShr(vb_v4i32, m_ir_builder->CreateVectorSplat(4, m_ir_builder->getInt32(10)));
+    auto tmp1_v4i32 = m_ir_builder->CreateLShr(vb_v4i32, m_ir_builder->CreateVectorSplat(4, m_ir_builder->getInt32(3)));
+    tmp1_v4i32 = m_ir_builder->CreateAnd(tmp1_v4i32, m_ir_builder->CreateVectorSplat(4, m_ir_builder->getInt32(0x00001F00)));
+    auto tmp2_v4i32 = m_ir_builder->CreateLShr(vb_v4i32, m_ir_builder->CreateVectorSplat(4, m_ir_builder->getInt32(6)));
+    tmp2_v4i32 = m_ir_builder->CreateAnd(tmp2_v4i32, m_ir_builder->CreateVectorSplat(4, m_ir_builder->getInt32(0x0000001F)));
+    auto res_v4i32 = m_ir_builder->CreateAnd(vb_v4i32, m_ir_builder->CreateVectorSplat(4, m_ir_builder->getInt32(0xFF1F0000)));
+    res_v4i32 = m_ir_builder->CreateOr(res_v4i32, tmp1_v4i32);
+    res_v4i32 = m_ir_builder->CreateOr(res_v4i32, tmp2_v4i32);
+    SetVr(vd, res_v4i32);
 }
 
 void Compiler::VUPKHSB(u32 vd, u32 vb) {
-    InterpreterCall("VUPKHSB", &PPUInterpreter::VUPKHSB, vd, vb);
+    auto vb_v16i8 = GetVrAsIntVec(vb, 8);
+    u32 mask_v8i32[8] = { 8, 9, 10, 11, 12, 13, 14, 15 };
+    auto vb_v8i8 = m_ir_builder->CreateShuffleVector(vb_v16i8, UndefValue::get(VectorType::get(m_ir_builder->getInt8Ty(), 16)), ConstantDataVector::get(m_ir_builder->getContext(), mask_v8i32));
+    auto res_v8i16 = m_ir_builder->CreateSExt(vb_v8i8, VectorType::get(m_ir_builder->getInt16Ty(), 8));
+    SetVr(vd, res_v8i16);
 }
 
 void Compiler::VUPKHSH(u32 vd, u32 vb) {
-    InterpreterCall("VUPKHSH", &PPUInterpreter::VUPKHSH, vd, vb);
+    auto vb_v8i16 = GetVrAsIntVec(vb, 16);
+    u32 mask_v4i32[4] = { 4, 5, 6, 7 };
+    auto vb_v4i16 = m_ir_builder->CreateShuffleVector(vb_v8i16, UndefValue::get(VectorType::get(m_ir_builder->getInt16Ty(), 8)), ConstantDataVector::get(m_ir_builder->getContext(), mask_v4i32));
+    auto res_v4i32 = m_ir_builder->CreateSExt(vb_v4i16, VectorType::get(m_ir_builder->getInt32Ty(), 4));
+    SetVr(vd, res_v4i32);
 }
 
 void Compiler::VUPKLPX(u32 vd, u32 vb) {
-    InterpreterCall("VUPKLPX", &PPUInterpreter::VUPKLPX, vd, vb);
+    auto vb_v8i16 = GetVrAsIntVec(vb, 16);
+    u32 mask_v8i32[8] = { 0, 0, 1, 1, 2, 2, 3, 3 };
+    vb_v8i16 = m_ir_builder->CreateShuffleVector(vb_v8i16, UndefValue::get(VectorType::get(m_ir_builder->getInt16Ty(), 8)), ConstantDataVector::get(m_ir_builder->getContext(), mask_v8i32));
+
+    auto vb_v4i32 = m_ir_builder->CreateBitCast(vb_v8i16, VectorType::get(m_ir_builder->getInt32Ty(), 4));
+    vb_v4i32 = m_ir_builder->CreateAShr(vb_v4i32, m_ir_builder->CreateVectorSplat(4, m_ir_builder->getInt32(10)));
+    auto tmp1_v4i32 = m_ir_builder->CreateLShr(vb_v4i32, m_ir_builder->CreateVectorSplat(4, m_ir_builder->getInt32(3)));
+    tmp1_v4i32 = m_ir_builder->CreateAnd(tmp1_v4i32, m_ir_builder->CreateVectorSplat(4, m_ir_builder->getInt32(0x00001F00)));
+    auto tmp2_v4i32 = m_ir_builder->CreateLShr(vb_v4i32, m_ir_builder->CreateVectorSplat(4, m_ir_builder->getInt32(6)));
+    tmp2_v4i32 = m_ir_builder->CreateAnd(tmp2_v4i32, m_ir_builder->CreateVectorSplat(4, m_ir_builder->getInt32(0x0000001F)));
+    auto res_v4i32 = m_ir_builder->CreateAnd(vb_v4i32, m_ir_builder->CreateVectorSplat(4, m_ir_builder->getInt32(0xFF1F0000)));
+    res_v4i32 = m_ir_builder->CreateOr(res_v4i32, tmp1_v4i32);
+    res_v4i32 = m_ir_builder->CreateOr(res_v4i32, tmp2_v4i32);
+    SetVr(vd, res_v4i32);
 }
 
 void Compiler::VUPKLSB(u32 vd, u32 vb) {
-    InterpreterCall("VUPKLSB", &PPUInterpreter::VUPKLSB, vd, vb);
+    auto vb_v16i8 = GetVrAsIntVec(vb, 8);
+    u32 mask_v8i32[8] = { 0, 1, 2, 3, 4, 5, 6, 7 };
+    auto vb_v8i8 = m_ir_builder->CreateShuffleVector(vb_v16i8, UndefValue::get(VectorType::get(m_ir_builder->getInt8Ty(), 16)), ConstantDataVector::get(m_ir_builder->getContext(), mask_v8i32));
+    auto res_v8i16 = m_ir_builder->CreateSExt(vb_v8i8, VectorType::get(m_ir_builder->getInt16Ty(), 8));
+    SetVr(vd, res_v8i16);
 }
 
 void Compiler::VUPKLSH(u32 vd, u32 vb) {
-    InterpreterCall("VUPKLSH", &PPUInterpreter::VUPKLSH, vd, vb);
+    auto vb_v8i16 = GetVrAsIntVec(vb, 16);
+    u32 mask_v4i32[4] = { 0, 1, 2, 3 };
+    auto vb_v4i16 = m_ir_builder->CreateShuffleVector(vb_v8i16, UndefValue::get(VectorType::get(m_ir_builder->getInt16Ty(), 8)), ConstantDataVector::get(m_ir_builder->getContext(), mask_v4i32));
+    auto res_v4i32 = m_ir_builder->CreateSExt(vb_v4i16, VectorType::get(m_ir_builder->getInt32Ty(), 4));
+    SetVr(vd, res_v4i32);
 }
 
 void Compiler::VXOR(u32 vd, u32 va, u32 vb) {
@@ -5250,9 +5365,9 @@ std::shared_ptr<RecompilationEngine> RecompilationEngine::s_the_instance = nullptr;
 
 RecompilationEngine::RecompilationEngine()
     : ThreadBase("PPU Recompilation Engine")
+    , m_log(nullptr)
     , m_next_ordinal(0)
-    , m_compiler(*this, ExecutionEngine::ExecuteFunction, ExecutionEngine::ExecuteTillReturn)
-    , m_log(nullptr) {
+    , m_compiler(*this, ExecutionEngine::ExecuteFunction, ExecutionEngine::ExecuteTillReturn) {
     m_compiler.RunAllTests();
 }
 
diff --git a/rpcs3/Emu/Cell/PPULLVMRecompiler.h b/rpcs3/Emu/Cell/PPULLVMRecompiler.h
index cc52a24858..8249da1c97 100644
--- a/rpcs3/Emu/Cell/PPULLVMRecompiler.h
+++ b/rpcs3/Emu/Cell/PPULLVMRecompiler.h
@@ -1022,6 +1022,9 @@ namespace ppu_recompiler_llvm {
             };
         };
 
+        /// Log
+        llvm::raw_fd_ostream * m_log;
+
         /// Lock for accessing m_pending_execution_traces. TODO: Eliminate this and use a lock-free queue.
         std::mutex m_pending_execution_traces_lock;
 
@@ -1047,9 +1050,6 @@ namespace ppu_recompiler_llvm {
         /// PPU Compiler
         Compiler m_compiler;
 
-        /// Log
-        llvm::raw_fd_ostream * m_log;
-
         /// Executable lookup table
         Executable m_executable_lookup[10000]; // TODO: Adjust size
 
diff --git a/rpcs3/Emu/Cell/PPULLVMRecompilerTests.cpp b/rpcs3/Emu/Cell/PPULLVMRecompilerTests.cpp
index b7638eeeca..f4c3319428 100644
--- a/rpcs3/Emu/Cell/PPULLVMRecompilerTests.cpp
+++ b/rpcs3/Emu/Cell/PPULLVMRecompilerTests.cpp
@@ -432,9 +432,10 @@ void Compiler::RunAllTests() {
     VERIFY_INSTRUCTION_AGAINST_INTERPRETER_USING_RANDOM_INPUT(VMRGLW, 0, 5, 0, 1, 2);
     VERIFY_INSTRUCTION_AGAINST_INTERPRETER_USING_RANDOM_INPUT(VMSUMMBM, 0, 5, 0, 1, 2, 3);
     VERIFY_INSTRUCTION_AGAINST_INTERPRETER_USING_RANDOM_INPUT(VMSUMSHM, 0, 5, 0, 1, 2, 3);
+    VERIFY_INSTRUCTION_AGAINST_INTERPRETER_USING_RANDOM_INPUT(VMSUMSHS, 0, 5, 0, 1, 2, 3);
     VERIFY_INSTRUCTION_AGAINST_INTERPRETER_USING_RANDOM_INPUT(VMSUMUBM, 0, 5, 0, 1, 2, 3);
     VERIFY_INSTRUCTION_AGAINST_INTERPRETER_USING_RANDOM_INPUT(VMSUMUHM, 0, 5, 0, 1, 2, 3);
-    VERIFY_INSTRUCTION_AGAINST_INTERPRETER_USING_RANDOM_INPUT(VNMSUBFP, 0, 5, 0, 1, 2, 3);
+    VERIFY_INSTRUCTION_AGAINST_INTERPRETER_USING_RANDOM_INPUT(VMSUMUHS, 0, 5, 0, 1, 2, 3);
     VERIFY_INSTRUCTION_AGAINST_INTERPRETER_USING_RANDOM_INPUT(VMULESB, 0, 5, 0, 1, 2);
     VERIFY_INSTRUCTION_AGAINST_INTERPRETER_USING_RANDOM_INPUT(VMULESH, 0, 5, 0, 1, 2);
     VERIFY_INSTRUCTION_AGAINST_INTERPRETER_USING_RANDOM_INPUT(VMULEUB, 0, 5, 0, 1, 2);
@@ -443,9 +444,11 @@ void Compiler::RunAllTests() {
     VERIFY_INSTRUCTION_AGAINST_INTERPRETER_USING_RANDOM_INPUT(VMULOSH, 0, 5, 0, 1, 2);
     VERIFY_INSTRUCTION_AGAINST_INTERPRETER_USING_RANDOM_INPUT(VMULOUB, 0, 5, 0, 1, 2);
     VERIFY_INSTRUCTION_AGAINST_INTERPRETER_USING_RANDOM_INPUT(VMULOUH, 0, 5, 0, 1, 2);
+    VERIFY_INSTRUCTION_AGAINST_INTERPRETER_USING_RANDOM_INPUT(VNMSUBFP, 0, 5, 0, 1, 2, 3);
     VERIFY_INSTRUCTION_AGAINST_INTERPRETER_USING_RANDOM_INPUT(VNOR, 0, 5, 0, 1, 2);
     VERIFY_INSTRUCTION_AGAINST_INTERPRETER_USING_RANDOM_INPUT(VOR, 0, 5, 0, 1, 2);
     VERIFY_INSTRUCTION_AGAINST_INTERPRETER_USING_RANDOM_INPUT(VPERM, 0, 5, 0, 1, 2, 3);
+    VERIFY_INSTRUCTION_AGAINST_INTERPRETER_USING_RANDOM_INPUT(VPKPX, 0, 5, 0, 1, 2);
     VERIFY_INSTRUCTION_AGAINST_INTERPRETER_USING_RANDOM_INPUT(VPKSHSS, 0, 5, 0, 1, 2);
     VERIFY_INSTRUCTION_AGAINST_INTERPRETER_USING_RANDOM_INPUT(VPKSHUS, 0, 5, 0, 1, 2);
     VERIFY_INSTRUCTION_AGAINST_INTERPRETER_USING_RANDOM_INPUT(VPKSWSS, 0, 5, 0, 1, 2);
@@ -494,6 +497,12 @@ void Compiler::RunAllTests() {
     VERIFY_INSTRUCTION_AGAINST_INTERPRETER_USING_RANDOM_INPUT(VSUBUHS, 0, 5, 0, 1, 2);
     VERIFY_INSTRUCTION_AGAINST_INTERPRETER_USING_RANDOM_INPUT(VSUBUWM, 0, 5, 0, 1, 2);
     VERIFY_INSTRUCTION_AGAINST_INTERPRETER_USING_RANDOM_INPUT(VSUBUWS, 0, 5, 0, 1, 2);
+    VERIFY_INSTRUCTION_AGAINST_INTERPRETER_USING_RANDOM_INPUT(VUPKHPX, 0, 5, 0, 1);
+    VERIFY_INSTRUCTION_AGAINST_INTERPRETER_USING_RANDOM_INPUT(VUPKHSB, 0, 5, 0, 1);
+    VERIFY_INSTRUCTION_AGAINST_INTERPRETER_USING_RANDOM_INPUT(VUPKHSH, 0, 5, 0, 1);
+    VERIFY_INSTRUCTION_AGAINST_INTERPRETER_USING_RANDOM_INPUT(VUPKLPX, 0, 5, 0, 1);
+    VERIFY_INSTRUCTION_AGAINST_INTERPRETER_USING_RANDOM_INPUT(VUPKLSB, 0, 5, 0, 1);
+    VERIFY_INSTRUCTION_AGAINST_INTERPRETER_USING_RANDOM_INPUT(VUPKLSH, 0, 5, 0, 1);
     VERIFY_INSTRUCTION_AGAINST_INTERPRETER_USING_RANDOM_INPUT(VXOR, 0, 5, 0, 1, 2);
 
     // TODO: Rest of the vector instructions