PPU: Implemented some instructions in the recompiler. Fixed some bugs in the interpreter.

This commit is contained in:
S Gopal Rajagopal 2014-11-29 01:39:59 +05:30
parent 205e1d88b3
commit 6ea50567b6
4 changed files with 241 additions and 85 deletions

View File

@ -834,11 +834,11 @@ private:
}
void VCTSXS(u32 vd, u32 uimm5, u32 vb)
{
int nScale = 1 << uimm5;
u32 nScale = 1 << uimm5;
for (uint w = 0; w < 4; w++)
{
float result = CPU.VPR[vb]._f[w] * nScale;
double result = (double)CPU.VPR[vb]._f[w] * nScale;
if (result > 0x7fffffff)
{
@ -856,12 +856,12 @@ private:
}
void VCTUXS(u32 vd, u32 uimm5, u32 vb)
{
int nScale = 1 << uimm5;
u32 nScale = 1 << uimm5;
for (uint w = 0; w < 4; w++)
{
// C rounding = Round towards 0
float result = CPU.VPR[vb]._f[w] * nScale;
double result = (double)CPU.VPR[vb]._f[w] * nScale;
if (result > 0xffffffffu)
{
@ -1078,26 +1078,32 @@ private:
}
// VMRGLB: interleaves eight bytes of VPR[va] with eight bytes of VPR[vb] into VPR[vd].
// For h = 0..7, source byte (7 - h) of va lands in destination byte (15 - 2h) and
// source byte (7 - h) of vb lands in destination byte (14 - 2h).
// NOTE(review): each store is listed in two forms back to back (reading CPU.VPR[...] directly,
// then reading the local copies) — likely the before/after lines of a diff; confirm which is live.
void VMRGLB(u32 vd, u32 va, u32 vb)
{
// Copies of both sources taken before any store, so the VA/VB form still sees the
// original values when vd names the same register as va or vb.
u128 VA = CPU.VPR[va];
u128 VB = CPU.VPR[vb];
for (uint h = 0; h < 8; h++)
{
CPU.VPR[vd]._u8[15 - h*2] = CPU.VPR[va]._u8[7 - h];
CPU.VPR[vd]._u8[15 - h*2 - 1] = CPU.VPR[vb]._u8[7 - h];
CPU.VPR[vd]._u8[15 - h*2] = VA._u8[7 - h];
CPU.VPR[vd]._u8[15 - h*2 - 1] = VB._u8[7 - h];
}
}
// VMRGLH: interleaves four halfwords of VPR[va] with four halfwords of VPR[vb] into VPR[vd].
// For w = 0..3, source halfword (3 - w) of va lands in destination halfword (7 - 2w) and
// source halfword (3 - w) of vb lands in destination halfword (6 - 2w).
// NOTE(review): each store is listed in two forms back to back (reading CPU.VPR[...] directly,
// then reading the local copies) — likely the before/after lines of a diff; confirm which is live.
void VMRGLH(u32 vd, u32 va, u32 vb)
{
// Copies taken before any store so the VA/VB form is unaffected when vd aliases va or vb.
u128 VA = CPU.VPR[va];
u128 VB = CPU.VPR[vb];
for (uint w = 0; w < 4; w++)
{
CPU.VPR[vd]._u16[7 - w*2] = CPU.VPR[va]._u16[3 - w];
CPU.VPR[vd]._u16[7 - w*2 - 1] = CPU.VPR[vb]._u16[3 - w];
CPU.VPR[vd]._u16[7 - w*2] = VA._u16[3 - w];
CPU.VPR[vd]._u16[7 - w*2 - 1] = VB._u16[3 - w];
}
}
// VMRGLW: interleaves two words of VPR[va] with two words of VPR[vb] into VPR[vd].
// For d = 0..1, source word (1 - d) of va lands in destination word (3 - 2d) and
// source word (1 - d) of vb lands in destination word (2 - 2d).
// NOTE(review): each store is listed in two forms back to back (reading CPU.VPR[...] directly,
// then reading the local copies) — likely the before/after lines of a diff; confirm which is live.
void VMRGLW(u32 vd, u32 va, u32 vb)
{
// Copies taken before any store so the VA/VB form is unaffected when vd aliases va or vb.
u128 VA = CPU.VPR[va];
u128 VB = CPU.VPR[vb];
for (uint d = 0; d < 2; d++)
{
CPU.VPR[vd]._u32[3 - d*2] = CPU.VPR[va]._u32[1 - d];
CPU.VPR[vd]._u32[3 - d*2 - 1] = CPU.VPR[vb]._u32[1 - d];
CPU.VPR[vd]._u32[3 - d*2] = VA._u32[1 - d];
CPU.VPR[vd]._u32[3 - d*2 - 1] = VB._u32[1 - d];
}
}
void VMSUMMBM(u32 vd, u32 va, u32 vb, u32 vc) //nf
@ -1168,7 +1174,7 @@ private:
for (uint b = 0; b < 4; b++)
{
result += CPU.VPR[va]._u8[w*4 + b] * CPU.VPR[vb]._u8[w*4 + b];
result += (u32)CPU.VPR[va]._u8[w*4 + b] * (u32)CPU.VPR[vb]._u8[w*4 + b];
}
result += CPU.VPR[vc]._u32[w];
@ -1183,7 +1189,7 @@ private:
for (uint h = 0; h < 2; h++)
{
result += CPU.VPR[va]._u16[w*2 + h] * CPU.VPR[vb]._u16[w*2 + h];
result += (u32)CPU.VPR[va]._u16[w*2 + h] * (u32)CPU.VPR[vb]._u16[w*2 + h];
}
result += CPU.VPR[vc]._u32[w];
@ -1199,7 +1205,7 @@ private:
for (uint h = 0; h < 2; h++)
{
result += CPU.VPR[va]._u16[w*2 + h] * CPU.VPR[vb]._u16[w*2 + h];
result += (u64)CPU.VPR[va]._u16[w*2 + h] * (u64)CPU.VPR[vb]._u16[w*2 + h];
}
result += CPU.VPR[vc]._u32[w];
@ -1307,16 +1313,18 @@ private:
}
void VPKPX(u32 vd, u32 va, u32 vb)
{
u128 VA = CPU.VPR[va];
u128 VB = CPU.VPR[vb];
for (uint h = 0; h < 4; h++)
{
u16 bb7 = CPU.VPR[vb]._u8[15 - (h*4 + 0)] & 0x1;
u16 bb8 = CPU.VPR[vb]._u8[15 - (h*4 + 1)] >> 3;
u16 bb16 = CPU.VPR[vb]._u8[15 - (h*4 + 2)] >> 3;
u16 bb24 = CPU.VPR[vb]._u8[15 - (h*4 + 3)] >> 3;
u16 ab7 = CPU.VPR[va]._u8[15 - (h*4 + 0)] & 0x1;
u16 ab8 = CPU.VPR[va]._u8[15 - (h*4 + 1)] >> 3;
u16 ab16 = CPU.VPR[va]._u8[15 - (h*4 + 2)] >> 3;
u16 ab24 = CPU.VPR[va]._u8[15 - (h*4 + 3)] >> 3;
u16 bb7 = VB._u8[15 - (h*4 + 0)] & 0x1;
u16 bb8 = VB._u8[15 - (h*4 + 1)] >> 3;
u16 bb16 = VB._u8[15 - (h*4 + 2)] >> 3;
u16 bb24 = VB._u8[15 - (h*4 + 3)] >> 3;
u16 ab7 = VA._u8[15 - (h*4 + 0)] & 0x1;
u16 ab8 = VA._u8[15 - (h*4 + 1)] >> 3;
u16 ab16 = VA._u8[15 - (h*4 + 2)] >> 3;
u16 ab24 = VA._u8[15 - (h*4 + 3)] >> 3;
CPU.VPR[vd]._u16[3 - h] = (bb7 << 15) | (bb8 << 10) | (bb16 << 5) | bb24;
CPU.VPR[vd]._u16[4 + (3 - h)] = (ab7 << 15) | (ab8 << 10) | (ab16 << 5) | ab24;
@ -1324,9 +1332,11 @@ private:
}
void VPKSHSS(u32 vd, u32 va, u32 vb) //nf
{
u128 VA = CPU.VPR[va];
u128 VB = CPU.VPR[vb];
for (uint b = 0; b < 8; b++)
{
s16 result = CPU.VPR[va]._s16[b];
s16 result = VA._s16[b];
if (result > INT8_MAX)
{
@ -1341,7 +1351,7 @@ private:
CPU.VPR[vd]._s8[b+8] = (s8)result;
result = CPU.VPR[vb]._s16[b];
result = VB._s16[b];
if (result > INT8_MAX)
{
@ -1359,9 +1369,11 @@ private:
}
void VPKSHUS(u32 vd, u32 va, u32 vb)
{
u128 VA = CPU.VPR[va];
u128 VB = CPU.VPR[vb];
for (uint b = 0; b < 8; b++)
{
s16 result = CPU.VPR[va]._s16[b];
s16 result = VA._s16[b];
if (result > UINT8_MAX)
{
@ -1376,7 +1388,7 @@ private:
CPU.VPR[vd]._u8[b+8] = (u8)result;
result = CPU.VPR[vb]._s16[b];
result = VB._s16[b];
if (result > UINT8_MAX)
{
@ -1394,9 +1406,11 @@ private:
}
void VPKSWSS(u32 vd, u32 va, u32 vb)
{
u128 VA = CPU.VPR[va];
u128 VB = CPU.VPR[vb];
for (uint h = 0; h < 4; h++)
{
s32 result = CPU.VPR[va]._s32[h];
s32 result = VA._s32[h];
if (result > INT16_MAX)
{
@ -1411,7 +1425,7 @@ private:
CPU.VPR[vd]._s16[h+4] = result;
result = CPU.VPR[vb]._s32[h];
result = VB._s32[h];
if (result > INT16_MAX)
{
@ -1429,9 +1443,11 @@ private:
}
void VPKSWUS(u32 vd, u32 va, u32 vb) //nf
{
u128 VA = CPU.VPR[va];
u128 VB = CPU.VPR[vb];
for (uint h = 0; h < 4; h++)
{
s32 result = CPU.VPR[va]._s32[h];
s32 result = VA._s32[h];
if (result > UINT16_MAX)
{
@ -1446,7 +1462,7 @@ private:
CPU.VPR[vd]._u16[h+4] = result;
result = CPU.VPR[vb]._s32[h];
result = VB._s32[h];
if (result > UINT16_MAX)
{
@ -1464,17 +1480,21 @@ private:
}
// VPKUHUM: packs every second byte of VPR[va] and VPR[vb] into VPR[vd] without saturation.
// For b = 0..7, byte (2b) of va goes to destination byte (b + 8) and byte (2b) of vb goes to
// destination byte b.
// NOTE(review): each store is listed in two forms back to back (reading CPU.VPR[...] directly,
// then reading the local copies) — likely the before/after lines of a diff; confirm which is live.
void VPKUHUM(u32 vd, u32 va, u32 vb) //nf
{
// Copies taken before any store so the VA/VB form is unaffected when vd aliases va or vb.
u128 VA = CPU.VPR[va];
u128 VB = CPU.VPR[vb];
for (uint b = 0; b < 8; b++)
{
CPU.VPR[vd]._u8[b+8] = CPU.VPR[va]._u8[b*2];
CPU.VPR[vd]._u8[b ] = CPU.VPR[vb]._u8[b*2];
CPU.VPR[vd]._u8[b+8] = VA._u8[b*2];
CPU.VPR[vd]._u8[b ] = VB._u8[b*2];
}
}
void VPKUHUS(u32 vd, u32 va, u32 vb)
{
u128 VA = CPU.VPR[va];
u128 VB = CPU.VPR[vb];
for (uint b = 0; b < 8; b++)
{
u16 result = CPU.VPR[va]._u16[b];
u16 result = VA._u16[b];
if (result > UINT8_MAX)
{
@ -1484,7 +1504,7 @@ private:
CPU.VPR[vd]._u8[b+8] = (u8)result;
result = CPU.VPR[vb]._u16[b];
result = VB._u16[b];
if (result > UINT8_MAX)
{
@ -1497,17 +1517,21 @@ private:
}
// VPKUWUM: packs every second halfword of VPR[va] and VPR[vb] into VPR[vd] without saturation.
// For h = 0..3, halfword (2h) of va goes to destination halfword (h + 4) and halfword (2h) of vb
// goes to destination halfword h.
// NOTE(review): each store is listed in two forms back to back (reading CPU.VPR[...] directly,
// then reading the local copies) — likely the before/after lines of a diff; confirm which is live.
void VPKUWUM(u32 vd, u32 va, u32 vb)
{
// Copies taken before any store so the VA/VB form is unaffected when vd aliases va or vb.
u128 VA = CPU.VPR[va];
u128 VB = CPU.VPR[vb];
for (uint h = 0; h < 4; h++)
{
CPU.VPR[vd]._u16[h+4] = CPU.VPR[va]._u16[h*2];
CPU.VPR[vd]._u16[h ] = CPU.VPR[vb]._u16[h*2];
CPU.VPR[vd]._u16[h+4] = VA._u16[h*2];
CPU.VPR[vd]._u16[h ] = VB._u16[h*2];
}
}
void VPKUWUS(u32 vd, u32 va, u32 vb) //nf
{
u128 VA = CPU.VPR[va];
u128 VB = CPU.VPR[vb];
for (uint h = 0; h < 4; h++)
{
u32 result = CPU.VPR[va]._u32[h];
u32 result = VA._u32[h];
if (result > UINT16_MAX)
{
@ -1517,7 +1541,7 @@ private:
CPU.VPR[vd]._u16[h+4] = result;
result = CPU.VPR[vb]._u32[h];
result = VB._u32[h];
if (result > UINT16_MAX)
{
@ -1539,30 +1563,28 @@ private:
{
for (uint w = 0; w < 4; w++)
{
CPU.VPR[vd]._f[w] = floor(CPU.VPR[vb]._f[w]);
CPU.VPR[vd]._f[w] = floorf(CPU.VPR[vb]._f[w]);
}
}
// VRFIN: replaces each of the four floats in VPR[vb] with an integral value and stores it in VPR[vd].
// Two forms of the store are listed: floor(x + 0.5f) and nearbyintf(x). They are not equivalent:
// nearbyintf follows the current floating-point rounding mode, so under round-to-nearest-even an
// exact .5 tie goes to the even neighbour, whereas floor(x + 0.5f) always moves ties upward.
// NOTE(review): the two stores are likely the before/after lines of a diff — confirm which is live.
void VRFIN(u32 vd, u32 vb)
{
for (uint w = 0; w < 4; w++)
{
CPU.VPR[vd]._f[w] = floor(CPU.VPR[vb]._f[w] + 0.5f);
CPU.VPR[vd]._f[w] = nearbyintf(CPU.VPR[vb]._f[w]);
}
}
// VRFIP: rounds each of the four floats in VPR[vb] up to an integral value and stores it in VPR[vd].
// Two forms of the store are listed: ceil(...) and the float-specific ceilf(...).
// NOTE(review): the two stores are likely the before/after lines of a diff — confirm which is live.
void VRFIP(u32 vd, u32 vb)
{
for (uint w = 0; w < 4; w++)
{
CPU.VPR[vd]._f[w] = ceil(CPU.VPR[vb]._f[w]);
CPU.VPR[vd]._f[w] = ceilf(CPU.VPR[vb]._f[w]);
}
}
// VRFIZ: truncates each of the four floats in VPR[vb] toward zero and stores it in VPR[vd].
// Two forms are listed: taking the integral part via modff into a temporary, and a direct truncf call.
// NOTE(review): the two forms are likely the before/after lines of a diff — confirm which is live.
void VRFIZ(u32 vd, u32 vb)
{
for (uint w = 0; w < 4; w++)
{
// modff writes the integral part of its first argument through the pointer; the fractional
// return value is discarded here.
float f;
modff(CPU.VPR[vb]._f[w], &f);
CPU.VPR[vd]._f[w] = f;
CPU.VPR[vd]._f[w] = truncf(CPU.VPR[vb]._f[w]);
}
}
void VRLB(u32 vd, u32 va, u32 vb) //nf
@ -1605,12 +1627,13 @@ private:
}
// VSL: shifts the 16 bytes of VPR[va] left by 0..7 bits as one chain and stores the result in VPR[vd].
// The bit count is the low three bits of byte 0 of VPR[vb]. Byte 0 is shifted on its own; every
// other byte b also receives the bits shifted out of byte (b - 1).
// NOTE(review): each store is listed in two forms back to back (reading CPU.VPR[va] directly,
// then reading the local copy VA) — likely the before/after lines of a diff; confirm which is live.
void VSL(u32 vd, u32 va, u32 vb) //nf
{
// Copy taken before any store: in the CPU.VPR[va] form, byte (b - 1) has already been
// overwritten when vd and va are the same register; the VA form does not have that problem.
u128 VA = CPU.VPR[va];
u8 sh = CPU.VPR[vb]._u8[0] & 0x7;
CPU.VPR[vd]._u8[0] = CPU.VPR[va]._u8[0] << sh;
CPU.VPR[vd]._u8[0] = VA._u8[0] << sh;
for (uint b = 1; b < 16; b++)
{
CPU.VPR[vd]._u8[b] = (CPU.VPR[va]._u8[b] << sh) | (CPU.VPR[va]._u8[b-1] >> (8 - sh));
CPU.VPR[vd]._u8[b] = (VA._u8[b] << sh) | (VA._u8[b-1] >> (8 - sh));
}
}
void VSLB(u32 vd, u32 va, u32 vb)
@ -1635,18 +1658,19 @@ private:
{
for (uint h = 0; h < 8; h++)
{
CPU.VPR[vd]._u16[h] = CPU.VPR[va]._u16[h] << (CPU.VPR[vb]._u8[h*2] & 0xf);
CPU.VPR[vd]._u16[h] = CPU.VPR[va]._u16[h] << (CPU.VPR[vb]._u16[h] & 0xf);
}
}
// VSLO: moves the bytes of VPR[va] by a whole number of byte positions into VPR[vd].
// The byte count nShift (0..15) is bits 3..6 of byte 0 of VPR[vb]. For b = 0..(15 - nShift),
// destination byte (15 - b) receives source byte (15 - (b + nShift)); remaining bytes keep
// whatever clear() left there (presumably zero — confirm against u128::clear).
// NOTE(review): the store is listed in two forms back to back (reading CPU.VPR[va] directly,
// then reading the local copy VA) — likely the before/after lines of a diff; confirm which is live.
void VSLO(u32 vd, u32 va, u32 vb)
{
// Copy taken before vd is cleared: if vd and va are the same register, the CPU.VPR[va]
// form would read the already-cleared register.
u128 VA = CPU.VPR[va];
u8 nShift = (CPU.VPR[vb]._u8[0] >> 3) & 0xf;
CPU.VPR[vd].clear();
for (u8 b = 0; b < 16 - nShift; b++)
{
CPU.VPR[vd]._u8[15 - b] = CPU.VPR[va]._u8[15 - (b + nShift)];
CPU.VPR[vd]._u8[15 - b] = VA._u8[15 - (b + nShift)];
}
}
void VSLW(u32 vd, u32 va, u32 vb)
@ -1710,12 +1734,13 @@ private:
}
// VSR: shifts the 16 bytes of VPR[va] right by 0..7 bits as one chain and stores the result in VPR[vd].
// The bit count is the low three bits of byte 0 of VPR[vb]. Byte 15 is shifted on its own; every
// other byte b also receives the bits shifted out of byte (b + 1).
// NOTE(review): each store is listed in two forms back to back (reading CPU.VPR[va] directly,
// then reading the local copy VA) — likely the before/after lines of a diff; confirm which is live.
void VSR(u32 vd, u32 va, u32 vb) //nf
{
// Copy taken before any store: in the CPU.VPR[va] form, byte (b + 1) has already been
// overwritten when vd and va are the same register; the VA form does not have that problem.
u128 VA = CPU.VPR[va];
u8 sh = CPU.VPR[vb]._u8[0] & 0x7;
CPU.VPR[vd]._u8[15] = CPU.VPR[va]._u8[15] >> sh;
CPU.VPR[vd]._u8[15] = VA._u8[15] >> sh;
// Counts b down from 14 to 0; ~b becomes 0 (loop exit) once the unsigned b wraps past 0.
for (uint b = 14; ~b; b--)
{
CPU.VPR[vd]._u8[b] = (CPU.VPR[va]._u8[b] >> sh) | (CPU.VPR[va]._u8[b+1] << (8 - sh));
CPU.VPR[vd]._u8[b] = (VA._u8[b] >> sh) | (VA._u8[b+1] << (8 - sh));
}
}
void VSRAB(u32 vd, u32 va, u32 vb) //nf
@ -1729,14 +1754,14 @@ private:
{
for (uint h = 0; h < 8; h++)
{
CPU.VPR[vd]._s16[h] = CPU.VPR[va]._s16[h] >> (CPU.VPR[vb]._u8[h*2] & 0xf);
CPU.VPR[vd]._s16[h] = CPU.VPR[va]._s16[h] >> (CPU.VPR[vb]._u16[h] & 0xf);
}
}
// VSRAW: right-shifts each signed 32-bit word of VPR[va] by a per-word count from VPR[vb]
// and stores the result in VPR[vd]. The count is masked to 0..31.
// Two forms of the store are listed: one takes the count from byte (w*4) of vb, the other from
// the whole word w of vb; after the 0x1f mask both use the low five bits of the value they read.
// NOTE(review): the two stores are likely the before/after lines of a diff — confirm which is live.
void VSRAW(u32 vd, u32 va, u32 vb)
{
for (uint w = 0; w < 4; w++)
{
CPU.VPR[vd]._s32[w] = CPU.VPR[va]._s32[w] >> (CPU.VPR[vb]._u8[w*4] & 0x1f);
CPU.VPR[vd]._s32[w] = CPU.VPR[va]._s32[w] >> (CPU.VPR[vb]._u32[w] & 0x1f);
}
}
void VSRB(u32 vd, u32 va, u32 vb)
@ -1750,25 +1775,26 @@ private:
{
for (uint h = 0; h < 8; h++)
{
CPU.VPR[vd]._u16[h] = CPU.VPR[va]._u16[h] >> (CPU.VPR[vb]._u8[h*2] & 0xf);
CPU.VPR[vd]._u16[h] = CPU.VPR[va]._u16[h] >> (CPU.VPR[vb]._u16[h] & 0xf);
}
}
// VSRO: moves the bytes of VPR[va] by a whole number of byte positions into VPR[vd].
// The byte count nShift (0..15) is bits 3..6 of byte 0 of VPR[vb]. For b = 0..(15 - nShift),
// destination byte b receives source byte (b + nShift); remaining bytes keep whatever clear()
// left there (presumably zero — confirm against u128::clear).
// NOTE(review): the store is listed in two forms back to back (reading CPU.VPR[va] directly,
// then reading the local copy VA) — likely the before/after lines of a diff; confirm which is live.
void VSRO(u32 vd, u32 va, u32 vb)
{
// Copy taken before vd is cleared: if vd and va are the same register, the CPU.VPR[va]
// form would read the already-cleared register.
u128 VA = CPU.VPR[va];
u8 nShift = (CPU.VPR[vb]._u8[0] >> 3) & 0xf;
CPU.VPR[vd].clear();
for (u8 b = 0; b < 16 - nShift; b++)
{
CPU.VPR[vd]._u8[b] = CPU.VPR[va]._u8[b + nShift];
CPU.VPR[vd]._u8[b] = VA._u8[b + nShift];
}
}
// VSRW: right-shifts each unsigned 32-bit word of VPR[va] by a per-word count from VPR[vb]
// and stores the result in VPR[vd]. The count is masked to 0..31.
// Two forms of the store are listed: one takes the count from byte (w*4) of vb, the other from
// the whole word w of vb; after the 0x1f mask both use the low five bits of the value they read.
// NOTE(review): the two stores are likely the before/after lines of a diff — confirm which is live.
void VSRW(u32 vd, u32 va, u32 vb)
{
for (uint w = 0; w < 4; w++)
{
CPU.VPR[vd]._u32[w] = CPU.VPR[va]._u32[w] >> (CPU.VPR[vb]._u8[w*4] & 0x1f);
CPU.VPR[vd]._u32[w] = CPU.VPR[va]._u32[w] >> (CPU.VPR[vb]._u32[w] & 0x1f);
}
}
void VSUBCUW(u32 vd, u32 va, u32 vb) //nf
@ -2029,50 +2055,56 @@ private:
}
// VUPKHPX: expands four 16-bit pixels of VPR[vb] into four 32-bit words of VPR[vd].
// Each 16-bit pixel is split into a 1-bit field and three 5-bit fields; the 1-bit field is
// sign-extended to fill one destination byte and each 5-bit field occupies the low bits of its own byte.
// Two groups of four stores are listed:
//  - the first writes word (3 - w) from source bytes (w*2 + 0) and (w*2 + 1) of CPU.VPR[vb];
//  - the second writes word w from bytes (8 + w*2 + 1) and (8 + w*2 + 0) of the copy VB.
// NOTE(review): the two groups are likely the before/after lines of a diff — confirm which is live.
void VUPKHPX(u32 vd, u32 vb)
{
// Copy taken before any store so the VB form is unaffected when vd and vb are the same register.
u128 VB = CPU.VPR[vb];
for (uint w = 0; w < 4; w++)
{
CPU.VPR[vd]._s8[(3 - w)*4 + 3] = CPU.VPR[vb]._s8[w*2 + 0] >> 7; // signed shift sign extends
CPU.VPR[vd]._u8[(3 - w)*4 + 2] = (CPU.VPR[vb]._u8[w*2 + 0] >> 2) & 0x1f;
CPU.VPR[vd]._u8[(3 - w)*4 + 1] = ((CPU.VPR[vb]._u8[w*2 + 0] & 0x3) << 3) | ((CPU.VPR[vb]._u8[w*2 + 1] >> 5) & 0x7);
CPU.VPR[vd]._u8[(3 - w)*4 + 0] = CPU.VPR[vb]._u8[w*2 + 1] & 0x1f;
CPU.VPR[vd]._s8[w*4 + 3] = VB._s8[8 + w*2 + 1] >> 7; // signed shift sign extends
CPU.VPR[vd]._u8[w*4 + 2] = (VB._u8[8 + w*2 + 1] >> 2) & 0x1f;
CPU.VPR[vd]._u8[w*4 + 1] = ((VB._u8[8 + w*2 + 1] & 0x3) << 3) | ((VB._u8[8 + w*2 + 0] >> 5) & 0x7);
CPU.VPR[vd]._u8[w*4 + 0] = VB._u8[8 + w*2 + 0] & 0x1f;
}
}
// VUPKHSB: sign-extends eight signed bytes of VPR[vb] into the eight signed halfwords of VPR[vd].
// Two forms of the store are listed: one reads byte h of CPU.VPR[vb], the other reads byte (8 + h)
// of the copy VB, i.e. they take opposite halves of the source register.
// NOTE(review): the two stores are likely the before/after lines of a diff — confirm which is live.
void VUPKHSB(u32 vd, u32 vb)
{
// Copy taken before any store so the VB form is unaffected when vd and vb are the same register.
u128 VB = CPU.VPR[vb];
for (uint h = 0; h < 8; h++)
{
CPU.VPR[vd]._s16[h] = CPU.VPR[vb]._s8[h];
CPU.VPR[vd]._s16[h] = VB._s8[8 + h];
}
}
// VUPKHSH: sign-extends four signed halfwords of VPR[vb] into the four signed words of VPR[vd].
// Two forms of the store are listed: one reads halfword w of CPU.VPR[vb], the other reads
// halfword (4 + w) of the copy VB, i.e. they take opposite halves of the source register.
// NOTE(review): the two stores are likely the before/after lines of a diff — confirm which is live.
void VUPKHSH(u32 vd, u32 vb)
{
// Copy taken before any store so the VB form is unaffected when vd and vb are the same register.
u128 VB = CPU.VPR[vb];
for (uint w = 0; w < 4; w++)
{
CPU.VPR[vd]._s32[w] = CPU.VPR[vb]._s16[w];
CPU.VPR[vd]._s32[w] = VB._s16[4 + w];
}
}
// VUPKLPX: expands four 16-bit pixels of VPR[vb] into four 32-bit words of VPR[vd].
// Each 16-bit pixel is split into a 1-bit field and three 5-bit fields; the 1-bit field is
// sign-extended to fill one destination byte and each 5-bit field occupies the low bits of its own byte.
// Two groups of four stores are listed:
//  - the first writes word (3 - w) from source bytes (8 + w*2 + 0) and (8 + w*2 + 1) of CPU.VPR[vb];
//  - the second writes word w from bytes (w*2 + 1) and (w*2 + 0) of the copy VB.
// NOTE(review): the two groups are likely the before/after lines of a diff — confirm which is live.
void VUPKLPX(u32 vd, u32 vb)
{
// Copy taken before any store so the VB form is unaffected when vd and vb are the same register.
u128 VB = CPU.VPR[vb];
for (uint w = 0; w < 4; w++)
{
CPU.VPR[vd]._s8[(3 - w)*4 + 3] = CPU.VPR[vb]._s8[8 + w*2 + 0] >> 7; // signed shift sign extends
CPU.VPR[vd]._u8[(3 - w)*4 + 2] = (CPU.VPR[vb]._u8[8 + w*2 + 0] >> 2) & 0x1f;
CPU.VPR[vd]._u8[(3 - w)*4 + 1] = ((CPU.VPR[vb]._u8[8 + w*2 + 0] & 0x3) << 3) | ((CPU.VPR[vb]._u8[8 + w*2 + 1] >> 5) & 0x7);
CPU.VPR[vd]._u8[(3 - w)*4 + 0] = CPU.VPR[vb]._u8[8 + w*2 + 1] & 0x1f;
CPU.VPR[vd]._s8[w*4 + 3] = VB._s8[w*2 + 1] >> 7; // signed shift sign extends
CPU.VPR[vd]._u8[w*4 + 2] = (VB._u8[w*2 + 1] >> 2) & 0x1f;
CPU.VPR[vd]._u8[w*4 + 1] = ((VB._u8[w*2 + 1] & 0x3) << 3) | ((VB._u8[w*2 + 0] >> 5) & 0x7);
CPU.VPR[vd]._u8[w*4 + 0] = VB._u8[w*2 + 0] & 0x1f;
}
}
// VUPKLSB: sign-extends eight signed bytes of VPR[vb] into the eight signed halfwords of VPR[vd].
// Two forms of the store are listed: one reads byte (8 + h) of CPU.VPR[vb], the other reads byte h
// of the copy VB, i.e. they take opposite halves of the source register.
// NOTE(review): the two stores are likely the before/after lines of a diff — confirm which is live.
void VUPKLSB(u32 vd, u32 vb) //nf
{
// Copy taken before any store so the VB form is unaffected when vd and vb are the same register.
u128 VB = CPU.VPR[vb];
for (uint h = 0; h < 8; h++)
{
CPU.VPR[vd]._s16[h] = CPU.VPR[vb]._s8[8 + h];
CPU.VPR[vd]._s16[h] = VB._s8[h];
}
}
// VUPKLSH: sign-extends four signed halfwords of VPR[vb] into the four signed words of VPR[vd].
// Two forms of the store are listed: one reads halfword (4 + w) of CPU.VPR[vb], the other reads
// halfword w of the copy VB, i.e. they take opposite halves of the source register.
// NOTE(review): the two stores are likely the before/after lines of a diff — confirm which is live.
void VUPKLSH(u32 vd, u32 vb)
{
// Copy taken before any store so the VB form is unaffected when vd and vb are the same register.
u128 VB = CPU.VPR[vb];
for (uint w = 0; w < 4; w++)
{
CPU.VPR[vd]._s32[w] = CPU.VPR[vb]._s16[4 + w];
CPU.VPR[vd]._s32[w] = VB._s16[w];
}
}
void VXOR(u32 vd, u32 va, u32 vb)
@ -2792,7 +2824,7 @@ private:
return;
}
const u8 eb = (addr & 0xf) >> 1;
vm::write16((u32)addr, CPU.VPR[vs]._u16[7 - eb]);
vm::write16((u32)addr & 0xFFFFFFFE, CPU.VPR[vs]._u16[7 - eb]);
}
void STDUX(u32 rs, u32 ra, u32 rb)
{
@ -2828,7 +2860,7 @@ private:
return;
}
const u8 eb = (addr & 0xf) >> 2;
vm::write32((u32)addr, CPU.VPR[vs]._u32[3 - eb]);
vm::write32((u32)addr & 0xFFFFFFFC, CPU.VPR[vs]._u32[3 - eb]);
}
void ADDZE(u32 rd, u32 ra, u32 oe, bool rc)
{

View File

@ -1016,7 +1016,26 @@ void Compiler::VMSUMSHM(u32 vd, u32 va, u32 vb, u32 vc) {
}
// Emits code for VMSUMSHS: signed 16-bit multiply-sum of va and vb, added to the words of vc
// with signed saturation, written to vd.
// NOTE(review): the listing contains both an InterpreterCall fallback and an inline IR sequence —
// likely the before/after lines of a diff; confirm which is live.
void Compiler::VMSUMSHS(u32 vd, u32 va, u32 vb, u32 vc) {
InterpreterCall("VMSUMSHS", &PPUInterpreter::VMSUMSHS, vd, va, vb, vc);
auto va_v8i16 = GetVrAsIntVec(va, 16);
auto vb_v8i16 = GetVrAsIntVec(vb, 16);
auto vc_v4i32 = GetVrAsIntVec(vc, 32);
// pmadd_wd: multiplies the 16-bit lanes and adds adjacent products into four 32-bit lanes.
auto res_v4i32 = (Value *)m_ir_builder->CreateCall2(Intrinsic::getDeclaration(m_module, Intrinsic::x86_sse2_pmadd_wd), va_v8i16, vb_v8i16);
// tmp1 = (vc >> 31 logical) + 0x7FFFFFFF: 0x7FFFFFFF where vc is non-negative,
// 0x80000000 where vc is negative — the value substituted on overflow.
auto tmp1_v4i32 = m_ir_builder->CreateLShr(vc_v4i32, 31);
tmp1_v4i32 = m_ir_builder->CreateAdd(tmp1_v4i32, m_ir_builder->CreateVectorSplat(4, m_ir_builder->getInt32(0x7FFFFFFF)));
auto tmp1_v16i8 = m_ir_builder->CreateBitCast(tmp1_v4i32, VectorType::get(m_ir_builder->getInt8Ty(), 16));
// tmp2 has its sign bit set where vc and the multiply-sum have the same sign.
auto tmp2_v4i32 = m_ir_builder->CreateXor(vc_v4i32, res_v4i32);
tmp2_v4i32 = m_ir_builder->CreateNot(tmp2_v4i32);
auto sum_v4i32 = m_ir_builder->CreateAdd(vc_v4i32, res_v4i32);
auto sum_v16i8 = m_ir_builder->CreateBitCast(sum_v4i32, VectorType::get(m_ir_builder->getInt8Ty(), 16));
// tmp3 becomes an all-ones lane where the operands share a sign but the sum's sign differs
// from vc's, i.e. where the signed addition overflowed.
auto tmp3_v4i32 = m_ir_builder->CreateXor(vc_v4i32, sum_v4i32);
tmp3_v4i32 = m_ir_builder->CreateAnd(tmp2_v4i32, tmp3_v4i32);
tmp3_v4i32 = m_ir_builder->CreateAShr(tmp3_v4i32, 31);
auto tmp3_v16i8 = m_ir_builder->CreateBitCast(tmp3_v4i32, VectorType::get(m_ir_builder->getInt8Ty(), 16));
// Byte-wise blend using tmp3 as the mask; presumably yields tmp1 in overflowed lanes and
// the plain sum elsewhere — confirm operand order against the pblendvb intrinsic.
auto res_v16i8 = m_ir_builder->CreateCall3(Intrinsic::getDeclaration(m_module, Intrinsic::x86_sse41_pblendvb), sum_v16i8, tmp1_v16i8, tmp3_v16i8);
SetVr(vd, res_v16i8);
// TODO: Set VSCR.SAT
}
void Compiler::VMSUMUBM(u32 vd, u32 va, u32 vb, u32 vc) {
@ -1074,7 +1093,31 @@ void Compiler::VMSUMUHM(u32 vd, u32 va, u32 vb, u32 vc) {
}
// Emits code for VMSUMUHS: unsigned 16-bit multiply-sum of va and vb, added to the words of vc
// with unsigned saturation, written to vd.
// NOTE(review): the listing contains both an InterpreterCall fallback and an inline IR sequence —
// likely the before/after lines of a diff; confirm which is live.
void Compiler::VMSUMUHS(u32 vd, u32 va, u32 vb, u32 vc) {
InterpreterCall("VMSUMUHS", &PPUInterpreter::VMSUMUHS, vd, va, vb, vc);
auto va_v8i16 = GetVrAsIntVec(va, 16);
auto vb_v8i16 = GetVrAsIntVec(vb, 16);
// Widen both operands to eight 32-bit lanes so each 16x16 product fits.
auto va_v8i32 = m_ir_builder->CreateZExt(va_v8i16, VectorType::get(m_ir_builder->getInt32Ty(), 8));
auto vb_v8i32 = m_ir_builder->CreateZExt(vb_v8i16, VectorType::get(m_ir_builder->getInt32Ty(), 8));
auto tmp_v8i32 = m_ir_builder->CreateMul(va_v8i32, vb_v8i32);
auto undef_v8i32 = UndefValue::get(VectorType::get(m_ir_builder->getInt32Ty(), 8));
// Split the eight products into the even-indexed and odd-indexed lanes.
u32 mask1_v4i32[4] = {0, 2, 4, 6};
auto tmp1_v4i32 = m_ir_builder->CreateShuffleVector(tmp_v8i32, undef_v8i32, ConstantDataVector::get(m_ir_builder->getContext(), mask1_v4i32));
u32 mask2_v4i32[4] = {1, 3, 5, 7};
auto tmp2_v4i32 = m_ir_builder->CreateShuffleVector(tmp_v8i32, undef_v8i32, ConstantDataVector::get(m_ir_builder->getContext(), mask2_v4i32));
auto vc_v4i32 = GetVrAsIntVec(vc, 32);
// First add: a sum smaller than one of its operands means unsigned wrap-around; the
// sign-extended compare result is all-ones in those lanes, and OR-ing it in saturates them.
auto res_v4i32 = m_ir_builder->CreateAdd(tmp1_v4i32, tmp2_v4i32);
auto cmp_v4i1 = m_ir_builder->CreateICmpULT(res_v4i32, tmp1_v4i32);
auto cmp_v4i32 = m_ir_builder->CreateSExt(cmp_v4i1, VectorType::get(m_ir_builder->getInt32Ty(), 4));
res_v4i32 = m_ir_builder->CreateOr(res_v4i32, cmp_v4i32);
// Second add (accumulator vc) with the same wrap-around detection and saturation.
res_v4i32 = m_ir_builder->CreateAdd(res_v4i32, vc_v4i32);
cmp_v4i1 = m_ir_builder->CreateICmpULT(res_v4i32, vc_v4i32);
cmp_v4i32 = m_ir_builder->CreateSExt(cmp_v4i1, VectorType::get(m_ir_builder->getInt32Ty(), 4));
res_v4i32 = m_ir_builder->CreateOr(res_v4i32, cmp_v4i32);
SetVr(vd, res_v4i32);
// TODO: Set VSCR.SAT
}
void Compiler::VMULESB(u32 vd, u32 va, u32 vb) {
@ -1204,7 +1247,37 @@ void Compiler::VPERM(u32 vd, u32 va, u32 vb, u32 vc) {
}
// Emits code for VPKPX: packs each 32-bit word of va and vb into a 16-bit pixel and writes the
// eight resulting halfwords to vd.
// The same shift/mask/or sequence is applied to va and to vb; it assembles the packed fields in
// the upper halfword of every word, and a final shuffle gathers those halfwords.
// NOTE(review): the listing contains both an InterpreterCall fallback and an inline IR sequence —
// likely the before/after lines of a diff; confirm which is live.
void Compiler::VPKPX(u32 vd, u32 va, u32 vb) {
InterpreterCall("VPKPX", &PPUInterpreter::VPKPX, vd, va, vb);
auto va_v4i32 = GetVrAsIntVec(va, 32);
auto vb_v4i32 = GetVrAsIntVec(vb, 32);
// va: tmpa accumulates the packed result; va_v4i32 is shifted further left in place at each
// step (by 10, then by 3 more) to line up the next field.
auto tmpa_v4i32 = m_ir_builder->CreateShl(va_v4i32, m_ir_builder->CreateVectorSplat(4, m_ir_builder->getInt32(7)));
tmpa_v4i32 = m_ir_builder->CreateAnd(tmpa_v4i32, m_ir_builder->CreateVectorSplat(4, m_ir_builder->getInt32(0xFC000000)));
va_v4i32 = m_ir_builder->CreateShl(va_v4i32, m_ir_builder->CreateVectorSplat(4, m_ir_builder->getInt32(10)));
va_v4i32 = m_ir_builder->CreateAnd(va_v4i32, m_ir_builder->CreateVectorSplat(4, m_ir_builder->getInt32(~0xFC000000)));
tmpa_v4i32 = m_ir_builder->CreateOr(tmpa_v4i32, va_v4i32);
tmpa_v4i32 = m_ir_builder->CreateAnd(tmpa_v4i32, m_ir_builder->CreateVectorSplat(4, m_ir_builder->getInt32(0xFFE00000)));
va_v4i32 = m_ir_builder->CreateShl(va_v4i32, m_ir_builder->CreateVectorSplat(4, m_ir_builder->getInt32(3)));
va_v4i32 = m_ir_builder->CreateAnd(va_v4i32, m_ir_builder->CreateVectorSplat(4, m_ir_builder->getInt32(~0xFFE00000)));
tmpa_v4i32 = m_ir_builder->CreateOr(tmpa_v4i32, va_v4i32);
auto tmpa_v8i16 = m_ir_builder->CreateBitCast(tmpa_v4i32, VectorType::get(m_ir_builder->getInt16Ty(), 8));
// vb: identical sequence to the one applied to va above.
auto tmpb_v4i32 = m_ir_builder->CreateShl(vb_v4i32, m_ir_builder->CreateVectorSplat(4, m_ir_builder->getInt32(7)));
tmpb_v4i32 = m_ir_builder->CreateAnd(tmpb_v4i32, m_ir_builder->CreateVectorSplat(4, m_ir_builder->getInt32(0xFC000000)));
vb_v4i32 = m_ir_builder->CreateShl(vb_v4i32, m_ir_builder->CreateVectorSplat(4, m_ir_builder->getInt32(10)));
vb_v4i32 = m_ir_builder->CreateAnd(vb_v4i32, m_ir_builder->CreateVectorSplat(4, m_ir_builder->getInt32(~0xFC000000)));
tmpb_v4i32 = m_ir_builder->CreateOr(tmpb_v4i32, vb_v4i32);
tmpb_v4i32 = m_ir_builder->CreateAnd(tmpb_v4i32, m_ir_builder->CreateVectorSplat(4, m_ir_builder->getInt32(0xFFE00000)));
vb_v4i32 = m_ir_builder->CreateShl(vb_v4i32, m_ir_builder->CreateVectorSplat(4, m_ir_builder->getInt32(3)));
vb_v4i32 = m_ir_builder->CreateAnd(vb_v4i32, m_ir_builder->CreateVectorSplat(4, m_ir_builder->getInt32(~0xFFE00000)));
tmpb_v4i32 = m_ir_builder->CreateOr(tmpb_v4i32, vb_v4i32);
auto tmpb_v8i16 = m_ir_builder->CreateBitCast(tmpb_v4i32, VectorType::get(m_ir_builder->getInt16Ty(), 8));
// Picks the odd-indexed halfwords: indices 1,3,5,7 come from tmpb and 9,11,13,15 from tmpa,
// so the vb results fill halfwords 0..3 of vd and the va results fill halfwords 4..7.
u32 mask_v8i32[8] = {1, 3, 5, 7, 9, 11, 13, 15};
auto res_v8i16 = m_ir_builder->CreateShuffleVector(tmpb_v8i16, tmpa_v8i16, ConstantDataVector::get(m_ir_builder->getContext(), mask_v8i32));
SetVr(vd, res_v8i16);
// TODO: Implement with pext on CPUs with BMI
}
void Compiler::VPKSHSS(u32 vd, u32 va, u32 vb) {
@ -1669,27 +1742,69 @@ void Compiler::VSUM4UBS(u32 vd, u32 va, u32 vb) {
}
// Emits code for VUPKHPX: expands halfwords 4..7 of vb (16-bit pixels) into four 32-bit words in vd.
// NOTE(review): the listing contains both an InterpreterCall fallback and an inline IR sequence —
// likely the before/after lines of a diff; confirm which is live.
void Compiler::VUPKHPX(u32 vd, u32 vb) {
InterpreterCall("VUPKHPX", &PPUInterpreter::VUPKHPX, vd, vb);
auto vb_v8i16 = GetVrAsIntVec(vb, 16);
// Duplicate each of halfwords 4..7 so every 32-bit lane holds two copies of one pixel.
u32 mask_v8i32[8] = { 4, 4, 5, 5, 6, 6, 7, 7 };
vb_v8i16 = m_ir_builder->CreateShuffleVector(vb_v8i16, UndefValue::get(VectorType::get(m_ir_builder->getInt16Ty(), 8)), ConstantDataVector::get(m_ir_builder->getContext(), mask_v8i32));
auto vb_v4i32 = m_ir_builder->CreateBitCast(vb_v8i16, VectorType::get(m_ir_builder->getInt32Ty(), 4));
// Arithmetic shift propagates the pixel's top bit into the upper bits of the word.
vb_v4i32 = m_ir_builder->CreateAShr(vb_v4i32, m_ir_builder->CreateVectorSplat(4, m_ir_builder->getInt32(10)));
// tmp1 and tmp2 isolate two 5-bit fields into bits 8..12 and bits 0..4 respectively.
auto tmp1_v4i32 = m_ir_builder->CreateLShr(vb_v4i32, m_ir_builder->CreateVectorSplat(4, m_ir_builder->getInt32(3)));
tmp1_v4i32 = m_ir_builder->CreateAnd(tmp1_v4i32, m_ir_builder->CreateVectorSplat(4, m_ir_builder->getInt32(0x00001F00)));
auto tmp2_v4i32 = m_ir_builder->CreateLShr(vb_v4i32, m_ir_builder->CreateVectorSplat(4, m_ir_builder->getInt32(6)));
tmp2_v4i32 = m_ir_builder->CreateAnd(tmp2_v4i32, m_ir_builder->CreateVectorSplat(4, m_ir_builder->getInt32(0x0000001F)));
// Keep the top byte and the 5-bit field in bits 16..20, then merge in the other two fields.
auto res_v4i32 = m_ir_builder->CreateAnd(vb_v4i32, m_ir_builder->CreateVectorSplat(4, m_ir_builder->getInt32(0xFF1F0000)));
res_v4i32 = m_ir_builder->CreateOr(res_v4i32, tmp1_v4i32);
res_v4i32 = m_ir_builder->CreateOr(res_v4i32, tmp2_v4i32);
SetVr(vd, res_v4i32);
}
// Emits code for VUPKHSB: sign-extends bytes 8..15 of vb into eight 16-bit lanes of vd.
// NOTE(review): the listing contains both an InterpreterCall fallback and an inline IR sequence —
// likely the before/after lines of a diff; confirm which is live.
void Compiler::VUPKHSB(u32 vd, u32 vb) {
InterpreterCall("VUPKHSB", &PPUInterpreter::VUPKHSB, vd, vb);
auto vb_v16i8 = GetVrAsIntVec(vb, 8);
// Shuffle extracts the eight selected bytes into an 8-lane vector before widening.
u32 mask_v8i32[8] = { 8, 9, 10, 11, 12, 13, 14, 15 };
auto vb_v8i8 = m_ir_builder->CreateShuffleVector(vb_v16i8, UndefValue::get(VectorType::get(m_ir_builder->getInt8Ty(), 16)), ConstantDataVector::get(m_ir_builder->getContext(), mask_v8i32));
auto res_v8i16 = m_ir_builder->CreateSExt(vb_v8i8, VectorType::get(m_ir_builder->getInt16Ty(), 8));
SetVr(vd, res_v8i16);
}
// Emits code for VUPKHSH: sign-extends halfwords 4..7 of vb into four 32-bit lanes of vd.
// NOTE(review): the listing contains both an InterpreterCall fallback and an inline IR sequence —
// likely the before/after lines of a diff; confirm which is live.
void Compiler::VUPKHSH(u32 vd, u32 vb) {
InterpreterCall("VUPKHSH", &PPUInterpreter::VUPKHSH, vd, vb);
auto vb_v8i16 = GetVrAsIntVec(vb, 16);
// Shuffle extracts the four selected halfwords into a 4-lane vector before widening.
u32 mask_v4i32[4] = { 4, 5, 6, 7 };
auto vb_v4i16 = m_ir_builder->CreateShuffleVector(vb_v8i16, UndefValue::get(VectorType::get(m_ir_builder->getInt16Ty(), 8)), ConstantDataVector::get(m_ir_builder->getContext(), mask_v4i32));
auto res_v4i32 = m_ir_builder->CreateSExt(vb_v4i16, VectorType::get(m_ir_builder->getInt32Ty(), 4));
SetVr(vd, res_v4i32);
}
// Emits code for VUPKLPX: expands halfwords 0..3 of vb (16-bit pixels) into four 32-bit words in vd.
// NOTE(review): the listing contains both an InterpreterCall fallback and an inline IR sequence —
// likely the before/after lines of a diff; confirm which is live.
void Compiler::VUPKLPX(u32 vd, u32 vb) {
InterpreterCall("VUPKLPX", &PPUInterpreter::VUPKLPX, vd, vb);
auto vb_v8i16 = GetVrAsIntVec(vb, 16);
// Duplicate each of halfwords 0..3 so every 32-bit lane holds two copies of one pixel.
u32 mask_v8i32[8] = { 0, 0, 1, 1, 2, 2, 3, 3 };
vb_v8i16 = m_ir_builder->CreateShuffleVector(vb_v8i16, UndefValue::get(VectorType::get(m_ir_builder->getInt16Ty(), 8)), ConstantDataVector::get(m_ir_builder->getContext(), mask_v8i32));
auto vb_v4i32 = m_ir_builder->CreateBitCast(vb_v8i16, VectorType::get(m_ir_builder->getInt32Ty(), 4));
// Arithmetic shift propagates the pixel's top bit into the upper bits of the word.
vb_v4i32 = m_ir_builder->CreateAShr(vb_v4i32, m_ir_builder->CreateVectorSplat(4, m_ir_builder->getInt32(10)));
// tmp1 and tmp2 isolate two 5-bit fields into bits 8..12 and bits 0..4 respectively.
auto tmp1_v4i32 = m_ir_builder->CreateLShr(vb_v4i32, m_ir_builder->CreateVectorSplat(4, m_ir_builder->getInt32(3)));
tmp1_v4i32 = m_ir_builder->CreateAnd(tmp1_v4i32, m_ir_builder->CreateVectorSplat(4, m_ir_builder->getInt32(0x00001F00)));
auto tmp2_v4i32 = m_ir_builder->CreateLShr(vb_v4i32, m_ir_builder->CreateVectorSplat(4, m_ir_builder->getInt32(6)));
tmp2_v4i32 = m_ir_builder->CreateAnd(tmp2_v4i32, m_ir_builder->CreateVectorSplat(4, m_ir_builder->getInt32(0x0000001F)));
// Keep the top byte and the 5-bit field in bits 16..20, then merge in the other two fields.
auto res_v4i32 = m_ir_builder->CreateAnd(vb_v4i32, m_ir_builder->CreateVectorSplat(4, m_ir_builder->getInt32(0xFF1F0000)));
res_v4i32 = m_ir_builder->CreateOr(res_v4i32, tmp1_v4i32);
res_v4i32 = m_ir_builder->CreateOr(res_v4i32, tmp2_v4i32);
SetVr(vd, res_v4i32);
}
// Emits code for VUPKLSB: sign-extends bytes 0..7 of vb into eight 16-bit lanes of vd.
// NOTE(review): the listing contains both an InterpreterCall fallback and an inline IR sequence —
// likely the before/after lines of a diff; confirm which is live.
void Compiler::VUPKLSB(u32 vd, u32 vb) {
InterpreterCall("VUPKLSB", &PPUInterpreter::VUPKLSB, vd, vb);
auto vb_v16i8 = GetVrAsIntVec(vb, 8);
// Shuffle extracts the eight selected bytes into an 8-lane vector before widening.
u32 mask_v8i32[8] = { 0, 1, 2, 3, 4, 5, 6, 7 };
auto vb_v8i8 = m_ir_builder->CreateShuffleVector(vb_v16i8, UndefValue::get(VectorType::get(m_ir_builder->getInt8Ty(), 16)), ConstantDataVector::get(m_ir_builder->getContext(), mask_v8i32));
auto res_v8i16 = m_ir_builder->CreateSExt(vb_v8i8, VectorType::get(m_ir_builder->getInt16Ty(), 8));
SetVr(vd, res_v8i16);
}
// Emits code for VUPKLSH: sign-extends halfwords 0..3 of vb into four 32-bit lanes of vd.
// NOTE(review): the listing contains both an InterpreterCall fallback and an inline IR sequence —
// likely the before/after lines of a diff; confirm which is live.
void Compiler::VUPKLSH(u32 vd, u32 vb) {
InterpreterCall("VUPKLSH", &PPUInterpreter::VUPKLSH, vd, vb);
auto vb_v8i16 = GetVrAsIntVec(vb, 16);
// Shuffle extracts the four selected halfwords into a 4-lane vector before widening.
u32 mask_v4i32[4] = { 0, 1, 2, 3 };
auto vb_v4i16 = m_ir_builder->CreateShuffleVector(vb_v8i16, UndefValue::get(VectorType::get(m_ir_builder->getInt16Ty(), 8)), ConstantDataVector::get(m_ir_builder->getContext(), mask_v4i32));
auto res_v4i32 = m_ir_builder->CreateSExt(vb_v4i16, VectorType::get(m_ir_builder->getInt32Ty(), 4));
SetVr(vd, res_v4i32);
}
void Compiler::VXOR(u32 vd, u32 va, u32 vb) {
@ -5250,9 +5365,9 @@ std::shared_ptr<RecompilationEngine> RecompilationEngine::s_the_instance = nullp
// Constructs the recompilation engine: passes "PPU Recompilation Engine" to ThreadBase, sets m_log
// to nullptr and m_next_ordinal to 0, builds m_compiler with the two ExecutionEngine callbacks,
// and then runs the compiler's self-tests from the constructor body.
// NOTE(review): the initializer list is shown in two orderings (m_log first vs. m_log last) —
// likely the before/after lines of a diff. Members are initialised in declaration order regardless
// of how the list is written, so the written order should match the class declaration.
RecompilationEngine::RecompilationEngine()
: ThreadBase("PPU Recompilation Engine")
, m_log(nullptr)
, m_next_ordinal(0)
, m_compiler(*this, ExecutionEngine::ExecuteFunction, ExecutionEngine::ExecuteTillReturn)
, m_log(nullptr) {
, m_compiler(*this, ExecutionEngine::ExecuteFunction, ExecutionEngine::ExecuteTillReturn) {
m_compiler.RunAllTests();
}

View File

@ -1022,6 +1022,9 @@ namespace ppu_recompiler_llvm {
};
};
/// Log
llvm::raw_fd_ostream * m_log;
/// Lock for accessing m_pending_execution_traces. TODO: Eliminate this and use a lock-free queue.
std::mutex m_pending_execution_traces_lock;
@ -1047,9 +1050,6 @@ namespace ppu_recompiler_llvm {
/// PPU Compiler
Compiler m_compiler;
/// Log
llvm::raw_fd_ostream * m_log;
/// Executable lookup table
Executable m_executable_lookup[10000]; // TODO: Adjust size

View File

@ -432,9 +432,10 @@ void Compiler::RunAllTests() {
VERIFY_INSTRUCTION_AGAINST_INTERPRETER_USING_RANDOM_INPUT(VMRGLW, 0, 5, 0, 1, 2);
VERIFY_INSTRUCTION_AGAINST_INTERPRETER_USING_RANDOM_INPUT(VMSUMMBM, 0, 5, 0, 1, 2, 3);
VERIFY_INSTRUCTION_AGAINST_INTERPRETER_USING_RANDOM_INPUT(VMSUMSHM, 0, 5, 0, 1, 2, 3);
VERIFY_INSTRUCTION_AGAINST_INTERPRETER_USING_RANDOM_INPUT(VMSUMSHS, 0, 5, 0, 1, 2, 3);
VERIFY_INSTRUCTION_AGAINST_INTERPRETER_USING_RANDOM_INPUT(VMSUMUBM, 0, 5, 0, 1, 2, 3);
VERIFY_INSTRUCTION_AGAINST_INTERPRETER_USING_RANDOM_INPUT(VMSUMUHM, 0, 5, 0, 1, 2, 3);
VERIFY_INSTRUCTION_AGAINST_INTERPRETER_USING_RANDOM_INPUT(VNMSUBFP, 0, 5, 0, 1, 2, 3);
VERIFY_INSTRUCTION_AGAINST_INTERPRETER_USING_RANDOM_INPUT(VMSUMUHS, 0, 5, 0, 1, 2, 3);
VERIFY_INSTRUCTION_AGAINST_INTERPRETER_USING_RANDOM_INPUT(VMULESB, 0, 5, 0, 1, 2);
VERIFY_INSTRUCTION_AGAINST_INTERPRETER_USING_RANDOM_INPUT(VMULESH, 0, 5, 0, 1, 2);
VERIFY_INSTRUCTION_AGAINST_INTERPRETER_USING_RANDOM_INPUT(VMULEUB, 0, 5, 0, 1, 2);
@ -443,9 +444,11 @@ void Compiler::RunAllTests() {
VERIFY_INSTRUCTION_AGAINST_INTERPRETER_USING_RANDOM_INPUT(VMULOSH, 0, 5, 0, 1, 2);
VERIFY_INSTRUCTION_AGAINST_INTERPRETER_USING_RANDOM_INPUT(VMULOUB, 0, 5, 0, 1, 2);
VERIFY_INSTRUCTION_AGAINST_INTERPRETER_USING_RANDOM_INPUT(VMULOUH, 0, 5, 0, 1, 2);
VERIFY_INSTRUCTION_AGAINST_INTERPRETER_USING_RANDOM_INPUT(VNMSUBFP, 0, 5, 0, 1, 2, 3);
VERIFY_INSTRUCTION_AGAINST_INTERPRETER_USING_RANDOM_INPUT(VNOR, 0, 5, 0, 1, 2);
VERIFY_INSTRUCTION_AGAINST_INTERPRETER_USING_RANDOM_INPUT(VOR, 0, 5, 0, 1, 2);
VERIFY_INSTRUCTION_AGAINST_INTERPRETER_USING_RANDOM_INPUT(VPERM, 0, 5, 0, 1, 2, 3);
VERIFY_INSTRUCTION_AGAINST_INTERPRETER_USING_RANDOM_INPUT(VPKPX, 0, 5, 0, 1, 2);
VERIFY_INSTRUCTION_AGAINST_INTERPRETER_USING_RANDOM_INPUT(VPKSHSS, 0, 5, 0, 1, 2);
VERIFY_INSTRUCTION_AGAINST_INTERPRETER_USING_RANDOM_INPUT(VPKSHUS, 0, 5, 0, 1, 2);
VERIFY_INSTRUCTION_AGAINST_INTERPRETER_USING_RANDOM_INPUT(VPKSWSS, 0, 5, 0, 1, 2);
@ -494,6 +497,12 @@ void Compiler::RunAllTests() {
VERIFY_INSTRUCTION_AGAINST_INTERPRETER_USING_RANDOM_INPUT(VSUBUHS, 0, 5, 0, 1, 2);
VERIFY_INSTRUCTION_AGAINST_INTERPRETER_USING_RANDOM_INPUT(VSUBUWM, 0, 5, 0, 1, 2);
VERIFY_INSTRUCTION_AGAINST_INTERPRETER_USING_RANDOM_INPUT(VSUBUWS, 0, 5, 0, 1, 2);
VERIFY_INSTRUCTION_AGAINST_INTERPRETER_USING_RANDOM_INPUT(VUPKHPX, 0, 5, 0, 1);
VERIFY_INSTRUCTION_AGAINST_INTERPRETER_USING_RANDOM_INPUT(VUPKHSB, 0, 5, 0, 1);
VERIFY_INSTRUCTION_AGAINST_INTERPRETER_USING_RANDOM_INPUT(VUPKHSH, 0, 5, 0, 1);
VERIFY_INSTRUCTION_AGAINST_INTERPRETER_USING_RANDOM_INPUT(VUPKLPX, 0, 5, 0, 1);
VERIFY_INSTRUCTION_AGAINST_INTERPRETER_USING_RANDOM_INPUT(VUPKLSB, 0, 5, 0, 1);
VERIFY_INSTRUCTION_AGAINST_INTERPRETER_USING_RANDOM_INPUT(VUPKLSH, 0, 5, 0, 1);
VERIFY_INSTRUCTION_AGAINST_INTERPRETER_USING_RANDOM_INPUT(VXOR, 0, 5, 0, 1, 2);
// TODO: Rest of the vector instructions