Merge pull request #1166 from Triang3l/altivec_vpkd3d

[CPU] v(u)pkd3d128: Support UINT_2101010 and don't saturate D3DCOLOR
This commit is contained in:
Rick Gibbed 2018-05-31 07:15:35 -05:00 committed by GitHub
commit 20bcd3f3c6
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 115 additions and 85 deletions

View File

@ -626,7 +626,6 @@ static const vec128_t xmm_consts[] = {
vec128i(0x01000302u, 0x05040706u, 0x09080B0Au, 0x0D0C0F0Eu), vec128i(0x01000302u, 0x05040706u, 0x09080B0Au, 0x0D0C0F0Eu),
/* XMMPermuteControl15 */ vec128b(15), /* XMMPermuteControl15 */ vec128b(15),
/* XMMPermuteByteMask */ vec128b(0x1F), /* XMMPermuteByteMask */ vec128b(0x1F),
/* XMMPackD3DCOLORSat */ vec128i(0x404000FFu),
/* XMMPackD3DCOLOR */ /* XMMPackD3DCOLOR */
vec128i(0xFFFFFFFFu, 0xFFFFFFFFu, 0xFFFFFFFFu, 0x0C000408u), vec128i(0xFFFFFFFFu, 0xFFFFFFFFu, 0xFFFFFFFFu, 0x0C000408u),
/* XMMUnpackD3DCOLOR */ /* XMMUnpackD3DCOLOR */
@ -647,8 +646,19 @@ static const vec128_t xmm_consts[] = {
vec128i(0xFFFFFFFFu, 0xFFFFFFFFu, 0x01000504u, 0x09080D0Cu), vec128i(0xFFFFFFFFu, 0xFFFFFFFFu, 0x01000504u, 0x09080D0Cu),
/* XMMUnpackSHORT_2 */ /* XMMUnpackSHORT_2 */
vec128i(0xFFFF0F0Eu, 0xFFFF0D0Cu, 0xFFFFFFFFu, 0xFFFFFFFFu), vec128i(0xFFFF0F0Eu, 0xFFFF0D0Cu, 0xFFFFFFFFu, 0xFFFFFFFFu),
/* XMMUnpackSHORT_2_Min */
vec128i(0x403F8001u, 0x403F8001u, 0x00000000u, 0x00000000u),
/* XMMUnpackSHORT_4 */ /* XMMUnpackSHORT_4 */
vec128i(0xFFFF0B0Au, 0xFFFF0908u, 0xFFFF0F0Eu, 0xFFFF0D0Cu), vec128i(0xFFFF0B0Au, 0xFFFF0908u, 0xFFFF0F0Eu, 0xFFFF0D0Cu),
/* XMMPackUINT_2101010_MinUnpacked */
vec128i(0x403FFE01u, 0x403FFE01u, 0x403FFE01u, 0x40400000u),
/* XMMPackUINT_2101010_MaxUnpacked */
vec128i(0x404001FFu, 0x404001FFu, 0x404001FFu, 0x40400003u),
/* XMMPackUINT_2101010_MaskUnpacked */
vec128i(0x3FFu, 0x3FFu, 0x3FFu, 0x3u),
/* XMMPackUINT_2101010_MaskPacked */
vec128i(0x3FFu, 0x3FFu << 10, 0x3FFu << 20, 0x3u << 30),
/* XMMPackUINT_2101010_Shift */ vec128i(0, 10, 20, 30),
/* XMMOneOver255 */ vec128f(1.0f / 255.0f), /* XMMOneOver255 */ vec128f(1.0f / 255.0f),
/* XMMMaskEvenPI16 */ /* XMMMaskEvenPI16 */
vec128i(0x0000FFFFu, 0x0000FFFFu, 0x0000FFFFu, 0x0000FFFFu), vec128i(0x0000FFFFu, 0x0000FFFFu, 0x0000FFFFu, 0x0000FFFFu),

View File

@ -64,7 +64,6 @@ enum XmmConst {
XMMByteOrderMask, XMMByteOrderMask,
XMMPermuteControl15, XMMPermuteControl15,
XMMPermuteByteMask, XMMPermuteByteMask,
XMMPackD3DCOLORSat,
XMMPackD3DCOLOR, XMMPackD3DCOLOR,
XMMUnpackD3DCOLOR, XMMUnpackD3DCOLOR,
XMMPackFLOAT16_2, XMMPackFLOAT16_2,
@ -76,7 +75,13 @@ enum XmmConst {
XMMPackSHORT_2, XMMPackSHORT_2,
XMMPackSHORT_4, XMMPackSHORT_4,
XMMUnpackSHORT_2, XMMUnpackSHORT_2,
XMMUnpackSHORT_2_Min,
XMMUnpackSHORT_4, XMMUnpackSHORT_4,
XMMPackUINT_2101010_MinUnpacked,
XMMPackUINT_2101010_MaxUnpacked,
XMMPackUINT_2101010_MaskUnpacked,
XMMPackUINT_2101010_MaskPacked,
XMMPackUINT_2101010_Shift,
XMMOneOver255, XMMOneOver255,
XMMMaskEvenPI16, XMMMaskEvenPI16,
XMMShiftMaskEvenPI16, XMMShiftMaskEvenPI16,

View File

@ -6964,20 +6964,24 @@ struct PACK : Sequence<PACK, I<OPCODE_PACK, V128Op, V128Op, V128Op>> {
} }
static void EmitD3DCOLOR(X64Emitter& e, const EmitArgType& i) { static void EmitD3DCOLOR(X64Emitter& e, const EmitArgType& i) {
assert_true(i.src2.value->IsConstantZero()); assert_true(i.src2.value->IsConstantZero());
// Saturate to [3,3....] so that only values between 3...[00] and 3...[FF] // No saturation done here.
// are valid. // Unpacking D3DCOLOR gives (1.0f | bits), or from 3F800000 to 3F8000FF.
if (i.src1.is_constant) { // However, you can pack 3.0f + (value / (float) (1 << 22)), which creates
e.LoadConstantXmm(i.dest, i.src1.constant()); // a number between 40400000 and 404000FF:
e.vminps(i.dest, i.dest, e.GetXmmConstPtr(XMMPackD3DCOLORSat)); // https://github.com/ValveSoftware/source-sdk-2013/blob/master/sp/src/public/pixelwriter.h#L648
} else { // With saturation, you will get 0 when re-packing after unpacking.
e.vminps(i.dest, i.src1, e.GetXmmConstPtr(XMMPackD3DCOLORSat)); // The above code also has to perform clamping explicitly.
}
e.vmaxps(i.dest, i.dest, e.GetXmmConstPtr(XMM3333));
// Extract bytes. // Extract bytes.
// RGBA (XYZW) -> ARGB (WXYZ) // RGBA (XYZW) -> ARGB (WXYZ)
// w = ((src1.uw & 0xFF) << 24) | ((src1.ux & 0xFF) << 16) | // w = ((src1.uw & 0xFF) << 24) | ((src1.ux & 0xFF) << 16) |
// ((src1.uy & 0xFF) << 8) | (src1.uz & 0xFF) // ((src1.uy & 0xFF) << 8) | (src1.uz & 0xFF)
e.vpshufb(i.dest, i.dest, e.GetXmmConstPtr(XMMPackD3DCOLOR)); if (i.src1.is_constant) {
e.LoadConstantXmm(i.dest, i.src1.constant());
e.vpshufb(i.dest, i.dest, e.GetXmmConstPtr(XMMPackD3DCOLOR));
} else {
e.vpshufb(i.dest, i.src1, e.GetXmmConstPtr(XMMPackD3DCOLOR));
}
} }
static __m128i EmulateFLOAT16_2(void*, __m128 src1) { static __m128i EmulateFLOAT16_2(void*, __m128 src1) {
alignas(16) float a[4]; alignas(16) float a[4];
@ -7050,59 +7054,35 @@ struct PACK : Sequence<PACK, I<OPCODE_PACK, V128Op, V128Op, V128Op>> {
// Pack. // Pack.
e.vpshufb(i.dest, i.dest, e.GetXmmConstPtr(XMMPackSHORT_4)); e.vpshufb(i.dest, i.dest, e.GetXmmConstPtr(XMMPackSHORT_4));
} }
static __m128i EmulatePackUINT_2101010(void*, __m128i src1) {
// https://www.opengl.org/registry/specs/ARB/vertex_type_2_10_10_10_rev.txt
union {
alignas(16) int32_t a_i[4];
alignas(16) uint32_t a_u[4];
alignas(16) float a_f[4];
};
alignas(16) uint32_t b[4];
alignas(16) uint32_t c[4];
_mm_store_si128(reinterpret_cast<__m128i*>(a_u), src1);
// XYZ are 10 bits, signed and saturated.
for (int i = 0; i < 3; ++i) {
static const int32_t kMinValueXYZ = 0x403FFE01; // 3 - 1FF / (1 << 22)
static const int32_t kMaxValueXYZ = 0x404001FF; // 3 + 1FF / (1 << 22)
uint32_t exponent = (a_u[i] >> 23) & 0xFF;
uint32_t fractional = a_u[i] & 0x007FFFFF;
if ((exponent == 0xFF) && fractional) {
b[i] = 0x200;
} else if (a_i[i] > kMaxValueXYZ) {
b[i] = 0x1FF; // INT_MAX
} else if (a_i[i] < kMinValueXYZ) {
b[i] = 0x201; // -INT_MAX
} else {
b[i] = a_u[i] & 0x3FF;
}
}
// W is 2 bits, unsigned and saturated.
static const int32_t kMinValueW = 0x40400000; // 3
static const int32_t kMaxValueW = 0x40400003; // 3 + 3 / (1 << 22)
uint32_t w_exponent = (a_u[3] >> 23) & 0xFF;
uint32_t w_fractional = a_u[3] & 0x007FFFFF;
if ((w_exponent == 0xFF) && w_fractional) {
b[3] = 0x0;
} else if (a_i[3] > kMaxValueW) {
b[3] = 0x3;
} else if (a_i[3] < kMinValueW) {
b[3] = 0x0;
} else {
b[3] = a_u[3] & 0x3;
}
// Combine in 2101010 WZYX.
c[0] = c[1] = c[2] = 0;
c[3] = ((b[3] & 0x3) << 30) | ((b[2] & 0x3FF) << 20) |
((b[1] & 0x3FF) << 10) | ((b[0] & 0x3FF));
return _mm_load_si128(reinterpret_cast<__m128i*>(c));
}
static void EmitUINT_2101010(X64Emitter& e, const EmitArgType& i) { static void EmitUINT_2101010(X64Emitter& e, const EmitArgType& i) {
assert_true(i.src2.value->IsConstantZero()); // https://www.opengl.org/registry/specs/ARB/vertex_type_2_10_10_10_rev.txt
// dest = [(b2(src1.w), b10(src1.z), b10(src1.y), b10(src1.x)), 0, 0, 0] // XYZ are 10 bits, signed and saturated.
// TODO(benvanik): optimized version. // W is 2 bits, unsigned and saturated.
e.lea(e.r8, e.StashXmm(0, i.src1));
e.CallNativeSafe(reinterpret_cast<void*>(EmulatePackUINT_2101010)); // Saturate.
e.vmovaps(i.dest, e.xmm0); e.vmaxps(i.dest, i.src1, e.GetXmmConstPtr(XMMPackUINT_2101010_MinUnpacked));
e.vminps(i.dest, i.dest, e.GetXmmConstPtr(XMMPackUINT_2101010_MaxUnpacked));
// Remove the unneeded bits of the floats.
e.vpand(i.dest, e.GetXmmConstPtr(XMMPackUINT_2101010_MaskUnpacked));
if (e.IsFeatureEnabled(kX64EmitAVX2)) {
// Shift the components up.
e.vpsllvd(i.dest, i.dest, e.GetXmmConstPtr(XMMPackUINT_2101010_Shift));
} else {
// Duplicate all the components into bits 10-19.
e.vpslld(e.xmm0, i.dest, 10);
e.vpor(i.dest, e.xmm0);
// Duplicate all the components into bits 20-39
// (so alpha will be in 30-31).
e.vpslld(e.xmm0, i.dest, 20);
e.vpor(i.dest, e.xmm0);
// Leave only the needed components.
e.vpand(i.dest, e.GetXmmConstPtr(XMMPackUINT_2101010_MaskPacked));
}
// Combine the components.
e.vshufps(e.xmm0, i.dest, i.dest, _MM_SHUFFLE(2, 3, 0, 1));
e.vorps(i.dest, e.xmm0);
e.vshufps(e.xmm0, i.dest, i.dest, _MM_SHUFFLE(1, 0, 3, 2));
e.vorps(i.dest, e.xmm0);
} }
static __m128i EmulatePack8_IN_16_UN_UN_SAT(void*, __m128i src1, static __m128i EmulatePack8_IN_16_UN_UN_SAT(void*, __m128i src1,
__m128i src2) { __m128i src2) {
@ -7329,20 +7309,23 @@ struct UNPACK : Sequence<UNPACK, I<OPCODE_UNPACK, V128Op, V128Op>> {
} }
static void EmitD3DCOLOR(X64Emitter& e, const EmitArgType& i) { static void EmitD3DCOLOR(X64Emitter& e, const EmitArgType& i) {
// ARGB (WXYZ) -> RGBA (XYZW) // ARGB (WXYZ) -> RGBA (XYZW)
// XMLoadColor Xmm src;
if (i.src1.is_constant) { if (i.src1.is_constant) {
if (i.src1.value->IsConstantZero()) { if (i.src1.value->IsConstantZero()) {
e.vmovaps(i.dest, e.GetXmmConstPtr(XMMOne)); e.vmovaps(i.dest, e.GetXmmConstPtr(XMMOne));
return; return;
} else {
assert_always();
} }
src = e.xmm0;
e.LoadConstantXmm(src, i.src1.constant());
} else {
src = i.src1;
} }
// src = ZZYYXXWW // src = ZZYYXXWW
// Unpack to 000000ZZ,000000YY,000000XX,000000WW // Unpack to 000000ZZ,000000YY,000000XX,000000WW
e.vpshufb(i.dest, i.src1, e.GetXmmConstPtr(XMMUnpackD3DCOLOR)); e.vpshufb(i.dest, i.src1, e.GetXmmConstPtr(XMMUnpackD3DCOLOR));
// Add 1.0f to each. // Add 1.0f to each.
e.vpor(i.dest, e.GetXmmConstPtr(XMMOne)); e.vpor(i.dest, e.GetXmmConstPtr(XMMOne));
// To convert to 0 to 1, games multiply by 0x47008081 and add 0xC7008081.
} }
static __m128 EmulateFLOAT16_2(void*, __m128i src1) { static __m128 EmulateFLOAT16_2(void*, __m128i src1) {
alignas(16) uint16_t a[8]; alignas(16) uint16_t a[8];
@ -7426,64 +7409,96 @@ struct UNPACK : Sequence<UNPACK, I<OPCODE_UNPACK, V128Op, V128Op>> {
// (VD.x) = 3.0 + (VB.x>>16)*2^-22 // (VD.x) = 3.0 + (VB.x>>16)*2^-22
// (VD.y) = 3.0 + (VB.x)*2^-22 // (VD.y) = 3.0 + (VB.x)*2^-22
// (VD.z) = 0.0 // (VD.z) = 0.0
// (VD.w) = 1.0 // (VD.w) = 1.0 (games splat W after unpacking to get vectors of 1.0f)
// XMLoadShortN2 plus 3,3,0,3 (for some reason)
// src is (xx,xx,xx,VALUE) // src is (xx,xx,xx,VALUE)
// (VALUE,VALUE,VALUE,VALUE)
Xmm src; Xmm src;
if (i.src1.is_constant) { if (i.src1.is_constant) {
if (i.src1.value->IsConstantZero()) { if (i.src1.value->IsConstantZero()) {
e.vmovdqa(i.dest, e.GetXmmConstPtr(XMM3301)); e.vmovdqa(i.dest, e.GetXmmConstPtr(XMM3301));
return; return;
} else {
// TODO(benvanik): check other common constants/perform shuffle/or here.
src = e.xmm0;
e.LoadConstantXmm(src, i.src1.constant());
} }
// TODO(benvanik): check other common constants/perform shuffle/or here.
src = e.xmm0;
e.LoadConstantXmm(src, i.src1.constant());
} else { } else {
src = i.src1; src = i.src1;
} }
// Shuffle bytes. // Shuffle bytes.
e.vpshufb(i.dest, src, e.GetXmmConstPtr(XMMUnpackSHORT_2)); e.vpshufb(i.dest, src, e.GetXmmConstPtr(XMMUnpackSHORT_2));
// Sign extend words. // If negative, make smaller than 3 - sign extend before adding.
e.vpslld(i.dest, 16); e.vpslld(i.dest, 16);
e.vpsrad(i.dest, 16); e.vpsrad(i.dest, 16);
// Add 3,3,0,1. // Add 3,3,0,1.
e.vpaddd(i.dest, e.GetXmmConstPtr(XMM3301)); e.vpaddd(i.dest, e.GetXmmConstPtr(XMM3301));
// Clamp the absolute value to the maximum positive value.
e.vmaxps(i.dest, i.dest, e.GetXmmConstPtr(XMMUnpackSHORT_2_Min));
} }
static void EmitSHORT_4(X64Emitter& e, const EmitArgType& i) { static void EmitSHORT_4(X64Emitter& e, const EmitArgType& i) {
// (VD.x) = 3.0 + (VB.x>>16)*2^-22 // (VD.x) = 3.0 + (VB.x>>16)*2^-22
// (VD.y) = 3.0 + (VB.x)*2^-22 // (VD.y) = 3.0 + (VB.x)*2^-22
// (VD.z) = 3.0 + (VB.y>>16)*2^-22 // (VD.z) = 3.0 + (VB.y>>16)*2^-22
// (VD.w) = 3.0 + (VB.y)*2^-22 // (VD.w) = 3.0 + (VB.y)*2^-22
// XMLoadShortN4 plus 3,3,3,3 (for some reason)
// src is (xx,xx,VALUE,VALUE) // src is (xx,xx,VALUE,VALUE)
// (VALUE,VALUE,VALUE,VALUE)
Xmm src; Xmm src;
if (i.src1.is_constant) { if (i.src1.is_constant) {
if (i.src1.value->IsConstantZero()) { if (i.src1.value->IsConstantZero()) {
e.vmovdqa(i.dest, e.GetXmmConstPtr(XMM3333)); e.vmovdqa(i.dest, e.GetXmmConstPtr(XMM3333));
return; return;
} else {
// TODO(benvanik): check other common constants/perform shuffle/or here.
src = e.xmm0;
e.LoadConstantXmm(src, i.src1.constant());
} }
// TODO(benvanik): check other common constants/perform shuffle/or here.
src = e.xmm0;
e.LoadConstantXmm(src, i.src1.constant());
} else { } else {
src = i.src1; src = i.src1;
} }
// Shuffle bytes. // Shuffle bytes.
e.vpshufb(i.dest, src, e.GetXmmConstPtr(XMMUnpackSHORT_4)); e.vpshufb(i.dest, src, e.GetXmmConstPtr(XMMUnpackSHORT_4));
// Sign extend words. // If negative, make smaller than 3 - sign extend before adding.
e.vpslld(i.dest, 16); e.vpslld(i.dest, 16);
e.vpsrad(i.dest, 16); e.vpsrad(i.dest, 16);
// Add 3,3,3,3. // Add 3,3,3,3.
e.vpaddd(i.dest, e.GetXmmConstPtr(XMM3333)); e.vpaddd(i.dest, e.GetXmmConstPtr(XMM3333));
// Clamp the absolute value to the maximum positive value.
e.vmaxps(i.dest, i.dest, e.GetXmmConstPtr(XMMPackSHORT_Min));
} }
static void EmitUINT_2101010(X64Emitter& e, const EmitArgType& i) { static void EmitUINT_2101010(X64Emitter& e, const EmitArgType& i) {
assert_always("not implemented"); Xmm src;
if (i.src1.is_constant) {
if (i.src1.value->IsConstantZero()) {
e.vmovdqa(i.dest, e.GetXmmConstPtr(XMM3333));
return;
}
src = e.xmm0;
e.LoadConstantXmm(src, i.src1.constant());
} else {
src = i.src1;
}
// Splat W.
e.vshufps(i.dest, src, src, _MM_SHUFFLE(3, 3, 3, 3));
// Keep only the needed components.
// Red in 0-9 now, green in 10-19, blue in 20-29, alpha in 30-31.
e.vpand(i.dest, e.GetXmmConstPtr(XMMPackUINT_2101010_MaskPacked));
if (e.IsFeatureEnabled(kX64EmitAVX2)) {
// Shift the components down.
e.vpsrlvd(i.dest, i.dest, e.GetXmmConstPtr(XMMPackUINT_2101010_Shift));
} else {
// Duplicate green in 0-9 and alpha in 20-21.
e.vpsrld(e.xmm0, i.dest, 10);
e.vpor(i.dest, e.xmm0);
// Duplicate blue in 0-9 and alpha in 0-1.
e.vpsrld(e.xmm0, i.dest, 20);
e.vpor(i.dest, e.xmm0);
// Remove higher duplicate components.
e.vpand(i.dest, e.GetXmmConstPtr(XMMPackUINT_2101010_MaskUnpacked));
}
// If negative, make smaller than 3 - sign extend XYZ before adding.
e.vpslld(i.dest, 22);
e.vpsrad(i.dest, 22);
// Add 3,3,3,3.
e.vpaddd(i.dest, e.GetXmmConstPtr(XMM3333));
// Clamp the absolute values of XYZ to the maximum positive value.
e.vmaxps(i.dest, i.dest, e.GetXmmConstPtr(XMMPackUINT_2101010_MinUnpacked));
// To convert XYZ to -1 to 1, games multiply by 0x46004020 & add 0xC6C06030.
} }
static void Emit8_IN_16(X64Emitter& e, const EmitArgType& i, uint32_t flags) { static void Emit8_IN_16(X64Emitter& e, const EmitArgType& i, uint32_t flags) {
assert_false(IsPackOutSaturate(flags)); assert_false(IsPackOutSaturate(flags));