Merge branch 'master' into d3d12
This commit is contained in:
commit
7648e45661
src/xenia/cpu
|
@ -666,7 +666,19 @@ static const vec128_t xmm_consts[] = {
|
||||||
vec128i(0x3FFu, 0x3FFu << 10, 0x3FFu << 20, 0x3u << 30),
|
vec128i(0x3FFu, 0x3FFu << 10, 0x3FFu << 20, 0x3u << 30),
|
||||||
/* XMMPackUINT_2101010_Shift */ vec128i(0, 10, 20, 30),
|
/* XMMPackUINT_2101010_Shift */ vec128i(0, 10, 20, 30),
|
||||||
/* XMMUnpackUINT_2101010_Overflow */ vec128i(0x403FFE00u),
|
/* XMMUnpackUINT_2101010_Overflow */ vec128i(0x403FFE00u),
|
||||||
/* XMMUnpackOverflowNaN */ vec128i(0x7FC00000u),
|
/* XMMPackULONG_4202020_MinUnpacked */
|
||||||
|
vec128i(0x40380001u, 0x40380001u, 0x40380001u, 0x40400000u),
|
||||||
|
/* XMMPackULONG_4202020_MaxUnpacked */
|
||||||
|
vec128i(0x4047FFFFu, 0x4047FFFFu, 0x4047FFFFu, 0x4040000Fu),
|
||||||
|
/* XMMPackULONG_4202020_MaskUnpacked */
|
||||||
|
vec128i(0xFFFFFu, 0xFFFFFu, 0xFFFFFu, 0xFu),
|
||||||
|
/* XMMPackULONG_4202020_PermuteXZ */
|
||||||
|
vec128i(0xFFFFFFFFu, 0xFFFFFFFFu, 0x0A0908FFu, 0xFF020100u),
|
||||||
|
/* XMMPackULONG_4202020_PermuteYW */
|
||||||
|
vec128i(0xFFFFFFFFu, 0xFFFFFFFFu, 0x0CFFFF06u, 0x0504FFFFu),
|
||||||
|
/* XMMUnpackULONG_4202020_Permute */
|
||||||
|
vec128i(0xFF0E0D0Cu, 0xFF0B0A09u, 0xFF080F0Eu, 0xFFFFFF0Bu),
|
||||||
|
/* XMMUnpackULONG_4202020_Overflow */ vec128i(0x40380000u),
|
||||||
/* XMMOneOver255 */ vec128f(1.0f / 255.0f),
|
/* XMMOneOver255 */ vec128f(1.0f / 255.0f),
|
||||||
/* XMMMaskEvenPI16 */
|
/* XMMMaskEvenPI16 */
|
||||||
vec128i(0x0000FFFFu, 0x0000FFFFu, 0x0000FFFFu, 0x0000FFFFu),
|
vec128i(0x0000FFFFu, 0x0000FFFFu, 0x0000FFFFu, 0x0000FFFFu),
|
||||||
|
@ -696,6 +708,7 @@ static const vec128_t xmm_consts[] = {
|
||||||
/* XMMIntMax */ vec128i(INT_MAX),
|
/* XMMIntMax */ vec128i(INT_MAX),
|
||||||
/* XMMIntMaxPD */ vec128d(INT_MAX),
|
/* XMMIntMaxPD */ vec128d(INT_MAX),
|
||||||
/* XMMPosIntMinPS */ vec128f((float)0x80000000u),
|
/* XMMPosIntMinPS */ vec128f((float)0x80000000u),
|
||||||
|
/* XMMQNaN */ vec128i(0x7FC00000u),
|
||||||
};
|
};
|
||||||
|
|
||||||
// First location to try and place constants.
|
// First location to try and place constants.
|
||||||
|
|
|
@ -85,7 +85,13 @@ enum XmmConst {
|
||||||
XMMPackUINT_2101010_MaskPacked,
|
XMMPackUINT_2101010_MaskPacked,
|
||||||
XMMPackUINT_2101010_Shift,
|
XMMPackUINT_2101010_Shift,
|
||||||
XMMUnpackUINT_2101010_Overflow,
|
XMMUnpackUINT_2101010_Overflow,
|
||||||
XMMUnpackOverflowNaN,
|
XMMPackULONG_4202020_MinUnpacked,
|
||||||
|
XMMPackULONG_4202020_MaxUnpacked,
|
||||||
|
XMMPackULONG_4202020_MaskUnpacked,
|
||||||
|
XMMPackULONG_4202020_PermuteXZ,
|
||||||
|
XMMPackULONG_4202020_PermuteYW,
|
||||||
|
XMMUnpackULONG_4202020_Permute,
|
||||||
|
XMMUnpackULONG_4202020_Overflow,
|
||||||
XMMOneOver255,
|
XMMOneOver255,
|
||||||
XMMMaskEvenPI16,
|
XMMMaskEvenPI16,
|
||||||
XMMShiftMaskEvenPI16,
|
XMMShiftMaskEvenPI16,
|
||||||
|
@ -105,6 +111,7 @@ enum XmmConst {
|
||||||
XMMIntMax,
|
XMMIntMax,
|
||||||
XMMIntMaxPD,
|
XMMIntMaxPD,
|
||||||
XMMPosIntMinPS,
|
XMMPosIntMinPS,
|
||||||
|
XMMQNaN,
|
||||||
};
|
};
|
||||||
|
|
||||||
// Unfortunately due to the design of xbyak we have to pass this to the ctor.
|
// Unfortunately due to the design of xbyak we have to pass this to the ctor.
|
||||||
|
|
|
@ -1822,6 +1822,9 @@ struct PACK : Sequence<PACK, I<OPCODE_PACK, V128Op, V128Op, V128Op>> {
|
||||||
case PACK_TYPE_UINT_2101010:
|
case PACK_TYPE_UINT_2101010:
|
||||||
EmitUINT_2101010(e, i);
|
EmitUINT_2101010(e, i);
|
||||||
break;
|
break;
|
||||||
|
case PACK_TYPE_ULONG_4202020:
|
||||||
|
EmitULONG_4202020(e, i);
|
||||||
|
break;
|
||||||
case PACK_TYPE_8_IN_16:
|
case PACK_TYPE_8_IN_16:
|
||||||
Emit8_IN_16(e, i, i.instr->flags);
|
Emit8_IN_16(e, i, i.instr->flags);
|
||||||
break;
|
break;
|
||||||
|
@ -2002,6 +2005,32 @@ struct PACK : Sequence<PACK, I<OPCODE_PACK, V128Op, V128Op, V128Op>> {
|
||||||
e.vshufps(e.xmm0, i.dest, i.dest, _MM_SHUFFLE(1, 0, 3, 2));
|
e.vshufps(e.xmm0, i.dest, i.dest, _MM_SHUFFLE(1, 0, 3, 2));
|
||||||
e.vorps(i.dest, e.xmm0);
|
e.vorps(i.dest, e.xmm0);
|
||||||
}
|
}
|
||||||
|
static void EmitULONG_4202020(X64Emitter& e, const EmitArgType& i) {
|
||||||
|
// XYZ are 20 bits, signed and saturated.
|
||||||
|
// W is 4 bits, unsigned and saturated.
|
||||||
|
Xmm src;
|
||||||
|
if (i.src1.is_constant) {
|
||||||
|
src = i.dest;
|
||||||
|
e.LoadConstantXmm(src, i.src1.constant());
|
||||||
|
} else {
|
||||||
|
src = i.src1;
|
||||||
|
}
|
||||||
|
// Saturate.
|
||||||
|
e.vmaxps(i.dest, src, e.GetXmmConstPtr(XMMPackULONG_4202020_MinUnpacked));
|
||||||
|
e.vminps(i.dest, i.dest,
|
||||||
|
e.GetXmmConstPtr(XMMPackULONG_4202020_MaxUnpacked));
|
||||||
|
// Remove the unneeded bits of the floats (so excess nibbles will also be
|
||||||
|
// cleared).
|
||||||
|
e.vpand(i.dest, e.GetXmmConstPtr(XMMPackULONG_4202020_MaskUnpacked));
|
||||||
|
// Store Y and W shifted left by 4 so vpshufb can be used with them.
|
||||||
|
e.vpslld(e.xmm0, i.dest, 4);
|
||||||
|
// Place XZ where they're supposed to be.
|
||||||
|
e.vpshufb(i.dest, i.dest, e.GetXmmConstPtr(XMMPackULONG_4202020_PermuteXZ));
|
||||||
|
// Place YW.
|
||||||
|
e.vpshufb(e.xmm0, e.xmm0, e.GetXmmConstPtr(XMMPackULONG_4202020_PermuteYW));
|
||||||
|
// Merge XZ and YW.
|
||||||
|
e.vorps(i.dest, e.xmm0);
|
||||||
|
}
|
||||||
static __m128i EmulatePack8_IN_16_UN_UN_SAT(void*, __m128i src1,
|
static __m128i EmulatePack8_IN_16_UN_UN_SAT(void*, __m128i src1,
|
||||||
__m128i src2) {
|
__m128i src2) {
|
||||||
alignas(16) uint16_t a[8];
|
alignas(16) uint16_t a[8];
|
||||||
|
@ -2214,6 +2243,9 @@ struct UNPACK : Sequence<UNPACK, I<OPCODE_UNPACK, V128Op, V128Op>> {
|
||||||
case PACK_TYPE_UINT_2101010:
|
case PACK_TYPE_UINT_2101010:
|
||||||
EmitUINT_2101010(e, i);
|
EmitUINT_2101010(e, i);
|
||||||
break;
|
break;
|
||||||
|
case PACK_TYPE_ULONG_4202020:
|
||||||
|
EmitULONG_4202020(e, i);
|
||||||
|
break;
|
||||||
case PACK_TYPE_8_IN_16:
|
case PACK_TYPE_8_IN_16:
|
||||||
Emit8_IN_16(e, i, i.instr->flags);
|
Emit8_IN_16(e, i, i.instr->flags);
|
||||||
break;
|
break;
|
||||||
|
@ -2367,7 +2399,7 @@ struct UNPACK : Sequence<UNPACK, I<OPCODE_UNPACK, V128Op, V128Op>> {
|
||||||
e.vpaddd(i.dest, e.GetXmmConstPtr(XMM3301));
|
e.vpaddd(i.dest, e.GetXmmConstPtr(XMM3301));
|
||||||
// Return quiet NaNs in case of negative overflow.
|
// Return quiet NaNs in case of negative overflow.
|
||||||
e.vcmpeqps(e.xmm0, i.dest, e.GetXmmConstPtr(XMMUnpackSHORT_Overflow));
|
e.vcmpeqps(e.xmm0, i.dest, e.GetXmmConstPtr(XMMUnpackSHORT_Overflow));
|
||||||
e.vblendvps(i.dest, i.dest, e.GetXmmConstPtr(XMMUnpackOverflowNaN), e.xmm0);
|
e.vblendvps(i.dest, i.dest, e.GetXmmConstPtr(XMMQNaN), e.xmm0);
|
||||||
}
|
}
|
||||||
static void EmitSHORT_4(X64Emitter& e, const EmitArgType& i) {
|
static void EmitSHORT_4(X64Emitter& e, const EmitArgType& i) {
|
||||||
// (VD.x) = 3.0 + (VB.x>>16)*2^-22
|
// (VD.x) = 3.0 + (VB.x>>16)*2^-22
|
||||||
|
@ -2396,7 +2428,7 @@ struct UNPACK : Sequence<UNPACK, I<OPCODE_UNPACK, V128Op, V128Op>> {
|
||||||
e.vpaddd(i.dest, e.GetXmmConstPtr(XMM3333));
|
e.vpaddd(i.dest, e.GetXmmConstPtr(XMM3333));
|
||||||
// Return quiet NaNs in case of negative overflow.
|
// Return quiet NaNs in case of negative overflow.
|
||||||
e.vcmpeqps(e.xmm0, i.dest, e.GetXmmConstPtr(XMMUnpackSHORT_Overflow));
|
e.vcmpeqps(e.xmm0, i.dest, e.GetXmmConstPtr(XMMUnpackSHORT_Overflow));
|
||||||
e.vblendvps(i.dest, i.dest, e.GetXmmConstPtr(XMMUnpackOverflowNaN), e.xmm0);
|
e.vblendvps(i.dest, i.dest, e.GetXmmConstPtr(XMMQNaN), e.xmm0);
|
||||||
}
|
}
|
||||||
static void EmitUINT_2101010(X64Emitter& e, const EmitArgType& i) {
|
static void EmitUINT_2101010(X64Emitter& e, const EmitArgType& i) {
|
||||||
Xmm src;
|
Xmm src;
|
||||||
|
@ -2437,10 +2469,41 @@ struct UNPACK : Sequence<UNPACK, I<OPCODE_UNPACK, V128Op, V128Op>> {
|
||||||
// Return quiet NaNs in case of negative overflow.
|
// Return quiet NaNs in case of negative overflow.
|
||||||
e.vcmpeqps(e.xmm0, i.dest,
|
e.vcmpeqps(e.xmm0, i.dest,
|
||||||
e.GetXmmConstPtr(XMMUnpackUINT_2101010_Overflow));
|
e.GetXmmConstPtr(XMMUnpackUINT_2101010_Overflow));
|
||||||
e.vblendvps(i.dest, i.dest, e.GetXmmConstPtr(XMMUnpackOverflowNaN), e.xmm0);
|
e.vblendvps(i.dest, i.dest, e.GetXmmConstPtr(XMMQNaN), e.xmm0);
|
||||||
// To convert XYZ to -1 to 1, games multiply by 0x46004020 & sub 0x46C06030.
|
// To convert XYZ to -1 to 1, games multiply by 0x46004020 & sub 0x46C06030.
|
||||||
// For W to 0 to 1, they multiply by and subtract 0x4A2AAAAB.
|
// For W to 0 to 1, they multiply by and subtract 0x4A2AAAAB.
|
||||||
}
|
}
|
||||||
|
static void EmitULONG_4202020(X64Emitter& e, const EmitArgType& i) {
|
||||||
|
Xmm src;
|
||||||
|
if (i.src1.is_constant) {
|
||||||
|
if (i.src1.value->IsConstantZero()) {
|
||||||
|
e.vmovdqa(i.dest, e.GetXmmConstPtr(XMM3331));
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
src = i.dest;
|
||||||
|
e.LoadConstantXmm(src, i.src1.constant());
|
||||||
|
} else {
|
||||||
|
src = i.src1;
|
||||||
|
}
|
||||||
|
// Extract pairs of nibbles to XZYW. XZ will have excess 4 upper bits, YW
|
||||||
|
// will have excess 4 lower bits.
|
||||||
|
e.vpshufb(i.dest, src, e.GetXmmConstPtr(XMMUnpackULONG_4202020_Permute));
|
||||||
|
// Drop the excess nibble of YW.
|
||||||
|
e.vpsrld(e.xmm0, i.dest, 4);
|
||||||
|
// Merge XZ and YW now both starting at offset 0.
|
||||||
|
e.vshufps(i.dest, i.dest, e.xmm0, _MM_SHUFFLE(3, 2, 1, 0));
|
||||||
|
// Reorder as XYZW.
|
||||||
|
e.vshufps(i.dest, i.dest, _MM_SHUFFLE(3, 1, 2, 0));
|
||||||
|
// Drop the excess upper nibble in XZ and sign-extend XYZ.
|
||||||
|
e.vpslld(i.dest, 12);
|
||||||
|
e.vpsrad(i.dest, 12);
|
||||||
|
// Add 3,3,3,1.
|
||||||
|
e.vpaddd(i.dest, e.GetXmmConstPtr(XMM3331));
|
||||||
|
// Return quiet NaNs in case of negative overflow.
|
||||||
|
e.vcmpeqps(e.xmm0, i.dest,
|
||||||
|
e.GetXmmConstPtr(XMMUnpackULONG_4202020_Overflow));
|
||||||
|
e.vblendvps(i.dest, i.dest, e.GetXmmConstPtr(XMMQNaN), e.xmm0);
|
||||||
|
}
|
||||||
static void Emit8_IN_16(X64Emitter& e, const EmitArgType& i, uint32_t flags) {
|
static void Emit8_IN_16(X64Emitter& e, const EmitArgType& i, uint32_t flags) {
|
||||||
assert_false(IsPackOutSaturate(flags));
|
assert_false(IsPackOutSaturate(flags));
|
||||||
Xmm src;
|
Xmm src;
|
||||||
|
|
|
@ -81,10 +81,11 @@ enum PackType : uint16_t {
|
||||||
PACK_TYPE_FLOAT16_4 = 3,
|
PACK_TYPE_FLOAT16_4 = 3,
|
||||||
PACK_TYPE_SHORT_2 = 4,
|
PACK_TYPE_SHORT_2 = 4,
|
||||||
PACK_TYPE_UINT_2101010 = 5,
|
PACK_TYPE_UINT_2101010 = 5,
|
||||||
|
PACK_TYPE_ULONG_4202020 = 6,
|
||||||
|
|
||||||
// Types which use the bitmasks below for configuration:
|
// Types which use the bitmasks below for configuration:
|
||||||
PACK_TYPE_8_IN_16 = 6,
|
PACK_TYPE_8_IN_16 = 7,
|
||||||
PACK_TYPE_16_IN_32 = 7,
|
PACK_TYPE_16_IN_32 = 8,
|
||||||
|
|
||||||
PACK_TYPE_MODE = 0x000F, // just to get the mode
|
PACK_TYPE_MODE = 0x000F, // just to get the mode
|
||||||
// Unpack to low or high parts.
|
// Unpack to low or high parts.
|
||||||
|
|
|
@ -2052,18 +2052,22 @@ int InstrEmit_vpkd3d128(PPCHIRBuilder& f, const InstrData& i) {
|
||||||
case 1: // VPACK_NORMSHORT2
|
case 1: // VPACK_NORMSHORT2
|
||||||
v = f.Pack(v, PACK_TYPE_SHORT_2);
|
v = f.Pack(v, PACK_TYPE_SHORT_2);
|
||||||
break;
|
break;
|
||||||
case 2: // VPACK_... 2_10_10_10 w_z_y_x
|
case 2: // VPACK_NORMPACKED32 2_10_10_10 w_z_y_x
|
||||||
v = f.Pack(v, PACK_TYPE_UINT_2101010);
|
v = f.Pack(v, PACK_TYPE_UINT_2101010);
|
||||||
break;
|
break;
|
||||||
case 3: // VPACK_... 2 FLOAT16s DXGI_FORMAT_R16G16_FLOAT
|
case 3: // VPACK_FLOAT16_2 DXGI_FORMAT_R16G16_FLOAT
|
||||||
v = f.Pack(v, PACK_TYPE_FLOAT16_2);
|
v = f.Pack(v, PACK_TYPE_FLOAT16_2);
|
||||||
break;
|
break;
|
||||||
case 4: // VPACK_NORMSHORT4
|
case 4: // VPACK_NORMSHORT4
|
||||||
v = f.Pack(v, PACK_TYPE_SHORT_4);
|
v = f.Pack(v, PACK_TYPE_SHORT_4);
|
||||||
break;
|
break;
|
||||||
case 5: // VPACK_... 4 FLOAT16s DXGI_FORMAT_R16G16B16A16_FLOAT
|
case 5: // VPACK_FLOAT16_4 DXGI_FORMAT_R16G16B16A16_FLOAT
|
||||||
v = f.Pack(v, PACK_TYPE_FLOAT16_4);
|
v = f.Pack(v, PACK_TYPE_FLOAT16_4);
|
||||||
break;
|
break;
|
||||||
|
case 6: // VPACK_NORMPACKED64 4_20_20_20 w_z_y_x
|
||||||
|
// Used in 2K games like NBA 2K9, pretty rarely in general.
|
||||||
|
v = f.Pack(v, PACK_TYPE_ULONG_4202020);
|
||||||
|
break;
|
||||||
default:
|
default:
|
||||||
assert_unhandled_case(type);
|
assert_unhandled_case(type);
|
||||||
return 1;
|
return 1;
|
||||||
|
@ -2156,18 +2160,21 @@ int InstrEmit_vupkd3d128(PPCHIRBuilder& f, const InstrData& i) {
|
||||||
case 1: // VPACK_NORMSHORT2
|
case 1: // VPACK_NORMSHORT2
|
||||||
v = f.Unpack(v, PACK_TYPE_SHORT_2);
|
v = f.Unpack(v, PACK_TYPE_SHORT_2);
|
||||||
break;
|
break;
|
||||||
case 2: // VPACK_... 2_10_10_10 w_z_y_x
|
case 2: // VPACK_NORMPACKED32 2_10_10_10 w_z_y_x
|
||||||
v = f.Unpack(v, PACK_TYPE_UINT_2101010);
|
v = f.Unpack(v, PACK_TYPE_UINT_2101010);
|
||||||
break;
|
break;
|
||||||
case 3: // VPACK_... 2 FLOAT16s DXGI_FORMAT_R16G16_FLOAT
|
case 3: // VPACK_FLOAT16_2 DXGI_FORMAT_R16G16_FLOAT
|
||||||
v = f.Unpack(v, PACK_TYPE_FLOAT16_2);
|
v = f.Unpack(v, PACK_TYPE_FLOAT16_2);
|
||||||
break;
|
break;
|
||||||
case 4: // VPACK_NORMSHORT4
|
case 4: // VPACK_NORMSHORT4
|
||||||
v = f.Unpack(v, PACK_TYPE_SHORT_4);
|
v = f.Unpack(v, PACK_TYPE_SHORT_4);
|
||||||
break;
|
break;
|
||||||
case 5: // VPACK_... 4 FLOAT16s DXGI_FORMAT_R16G16B16A16_FLOAT
|
case 5: // VPACK_FLOAT16_4 DXGI_FORMAT_R16G16B16A16_FLOAT
|
||||||
v = f.Unpack(v, PACK_TYPE_FLOAT16_4);
|
v = f.Unpack(v, PACK_TYPE_FLOAT16_4);
|
||||||
break;
|
break;
|
||||||
|
case 6: // VPACK_NORMPACKED64 4_20_20_20 w_z_y_x
|
||||||
|
v = f.Unpack(v, PACK_TYPE_ULONG_4202020);
|
||||||
|
break;
|
||||||
default:
|
default:
|
||||||
assert_unhandled_case(type);
|
assert_unhandled_case(type);
|
||||||
return 1;
|
return 1;
|
||||||
|
|
Loading…
Reference in New Issue