Merge pull request #1166 from Triang3l/altivec_vpkd3d

[CPU] v(u)pkd3d128: Support UINT_2101010 and don't saturate D3DCOLOR
This commit is contained in:
Rick Gibbed 2018-05-31 07:15:35 -05:00 committed by GitHub
commit 20bcd3f3c6
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 115 additions and 85 deletions

View File

@ -626,7 +626,6 @@ static const vec128_t xmm_consts[] = {
vec128i(0x01000302u, 0x05040706u, 0x09080B0Au, 0x0D0C0F0Eu),
/* XMMPermuteControl15 */ vec128b(15),
/* XMMPermuteByteMask */ vec128b(0x1F),
/* XMMPackD3DCOLORSat */ vec128i(0x404000FFu),
/* XMMPackD3DCOLOR */
vec128i(0xFFFFFFFFu, 0xFFFFFFFFu, 0xFFFFFFFFu, 0x0C000408u),
/* XMMUnpackD3DCOLOR */
@ -647,8 +646,19 @@ static const vec128_t xmm_consts[] = {
vec128i(0xFFFFFFFFu, 0xFFFFFFFFu, 0x01000504u, 0x09080D0Cu),
/* XMMUnpackSHORT_2 */
vec128i(0xFFFF0F0Eu, 0xFFFF0D0Cu, 0xFFFFFFFFu, 0xFFFFFFFFu),
/* XMMUnpackSHORT_2_Min */
vec128i(0x403F8001u, 0x403F8001u, 0x00000000u, 0x00000000u),
/* XMMUnpackSHORT_4 */
vec128i(0xFFFF0B0Au, 0xFFFF0908u, 0xFFFF0F0Eu, 0xFFFF0D0Cu),
/* XMMPackUINT_2101010_MinUnpacked */
vec128i(0x403FFE01u, 0x403FFE01u, 0x403FFE01u, 0x40400000u),
/* XMMPackUINT_2101010_MaxUnpacked */
vec128i(0x404001FFu, 0x404001FFu, 0x404001FFu, 0x40400003u),
/* XMMPackUINT_2101010_MaskUnpacked */
vec128i(0x3FFu, 0x3FFu, 0x3FFu, 0x3u),
/* XMMPackUINT_2101010_MaskPacked */
vec128i(0x3FFu, 0x3FFu << 10, 0x3FFu << 20, 0x3u << 30),
/* XMMPackUINT_2101010_Shift */ vec128i(0, 10, 20, 30),
/* XMMOneOver255 */ vec128f(1.0f / 255.0f),
/* XMMMaskEvenPI16 */
vec128i(0x0000FFFFu, 0x0000FFFFu, 0x0000FFFFu, 0x0000FFFFu),

View File

@ -64,7 +64,6 @@ enum XmmConst {
XMMByteOrderMask,
XMMPermuteControl15,
XMMPermuteByteMask,
XMMPackD3DCOLORSat,
XMMPackD3DCOLOR,
XMMUnpackD3DCOLOR,
XMMPackFLOAT16_2,
@ -76,7 +75,13 @@ enum XmmConst {
XMMPackSHORT_2,
XMMPackSHORT_4,
XMMUnpackSHORT_2,
XMMUnpackSHORT_2_Min,
XMMUnpackSHORT_4,
XMMPackUINT_2101010_MinUnpacked,
XMMPackUINT_2101010_MaxUnpacked,
XMMPackUINT_2101010_MaskUnpacked,
XMMPackUINT_2101010_MaskPacked,
XMMPackUINT_2101010_Shift,
XMMOneOver255,
XMMMaskEvenPI16,
XMMShiftMaskEvenPI16,

View File

@ -6964,20 +6964,24 @@ struct PACK : Sequence<PACK, I<OPCODE_PACK, V128Op, V128Op, V128Op>> {
}
// Emits the D3DCOLOR pack: selects bytes of src1 with the XMMPackD3DCOLOR
// shuffle control and writes the result to dest. src2 must be constant zero.
static void EmitD3DCOLOR(X64Emitter& e, const EmitArgType& i) {
  assert_true(i.src2.value->IsConstantZero());
  // No saturation done here.
  // Unpacking D3DCOLOR gives (1.0f | bits), or from 3F800000 to 3F8000FF.
  // However, you can pack 3.0f + (value / (float) (1 << 22)), which creates
  // a number between 40400000 and 404000FF:
  // https://github.com/ValveSoftware/source-sdk-2013/blob/master/sp/src/public/pixelwriter.h#L648
  // With saturation, you will get 0 when re-packing after unpacking.
  // The above code also has to perform clamping explicitly.
  //
  // The previous min/max clamp (and the extra shuffle of dest that followed
  // it) is intentionally gone: it contradicted the rule above and its result
  // was overwritten by the shuffle below anyway.
  //
  // Extract bytes.
  // RGBA (XYZW) -> ARGB (WXYZ)
  // w = ((src1.uw & 0xFF) << 24) | ((src1.ux & 0xFF) << 16) |
  //     ((src1.uy & 0xFF) << 8) | (src1.uz & 0xFF)
  if (i.src1.is_constant) {
    // A constant operand has no register; materialize it in dest first.
    e.LoadConstantXmm(i.dest, i.src1.constant());
    e.vpshufb(i.dest, i.dest, e.GetXmmConstPtr(XMMPackD3DCOLOR));
  } else {
    e.vpshufb(i.dest, i.src1, e.GetXmmConstPtr(XMMPackD3DCOLOR));
  }
}
static __m128i EmulateFLOAT16_2(void*, __m128 src1) {
alignas(16) float a[4];
@ -7050,59 +7054,35 @@ struct PACK : Sequence<PACK, I<OPCODE_PACK, V128Op, V128Op, V128Op>> {
// Pack.
e.vpshufb(i.dest, i.dest, e.GetXmmConstPtr(XMMPackSHORT_4));
}
// Scalar implementation of the 2:10:10:10 pack.
// Lanes 0..2 (X, Y, Z) of src1 become signed 10-bit fields and lane 3 (W) an
// unsigned 2-bit field; the combined dword is returned in lane 3 with lanes
// 0..2 zeroed. The first (context) argument is unused.
static __m128i EmulatePackUINT_2101010(void*, __m128i src1) {
  // https://www.opengl.org/registry/specs/ARB/vertex_type_2_10_10_10_rev.txt
  static const int32_t kXyzLowest = 0x403FFE01;   // 3 - 1FF / (1 << 22)
  static const int32_t kXyzHighest = 0x404001FF;  // 3 + 1FF / (1 << 22)
  static const int32_t kWLowest = 0x40400000;     // 3
  static const int32_t kWHighest = 0x40400003;    // 3 + 3 / (1 << 22)

  // The same 128 bits viewed as signed (for range compares on the raw float
  // encoding) and unsigned (for bit extraction).
  union {
    alignas(16) int32_t as_signed[4];
    alignas(16) uint32_t as_bits[4];
  } lanes;
  _mm_store_si128(reinterpret_cast<__m128i*>(lanes.as_bits), src1);

  uint32_t packed = 0;

  // XYZ are 10 bits, signed and saturated.
  for (int lane = 0; lane < 3; ++lane) {
    const uint32_t bits = lanes.as_bits[lane];
    const int32_t value = lanes.as_signed[lane];
    const bool is_nan =
        (((bits >> 23) & 0xFF) == 0xFF) && ((bits & 0x007FFFFF) != 0);
    uint32_t field;
    if (is_nan) {
      field = 0x200;
    } else if (value > kXyzHighest) {
      field = 0x1FF;  // INT_MAX
    } else if (value < kXyzLowest) {
      field = 0x201;  // -INT_MAX
    } else {
      field = bits & 0x3FF;
    }
    packed |= (field & 0x3FF) << (10 * lane);
  }

  // W is 2 bits, unsigned and saturated.
  {
    const uint32_t bits = lanes.as_bits[3];
    const int32_t value = lanes.as_signed[3];
    const bool is_nan =
        (((bits >> 23) & 0xFF) == 0xFF) && ((bits & 0x007FFFFF) != 0);
    uint32_t field;
    if (is_nan || value < kWLowest) {
      field = 0x0;
    } else if (value > kWHighest) {
      field = 0x3;
    } else {
      field = bits & 0x3;
    }
    packed |= (field & 0x3) << 30;
  }

  // Combine in 2101010 WZYX: result goes in the last lane, the rest are zero.
  alignas(16) uint32_t result[4] = {0, 0, 0, packed};
  return _mm_load_si128(reinterpret_cast<__m128i*>(result));
}
// Emits the inline 2:10:10:10 pack of src1 into dest.
// Each lane is clamped to its per-lane range, masked to 10 (XYZ) or 2 (W)
// bits, shifted to its position in the packed dword, and the four lanes are
// OR-ed together. The final OR-reduction leaves the packed dword in every
// lane of dest. src2 must be constant zero. Uses xmm0 as scratch.
static void EmitUINT_2101010(X64Emitter& e, const EmitArgType& i) {
  assert_true(i.src2.value->IsConstantZero());
  // https://www.opengl.org/registry/specs/ARB/vertex_type_2_10_10_10_rev.txt
  // XYZ are 10 bits, signed and saturated.
  // W is 2 bits, unsigned and saturated.
  //
  // The old stash + CallNativeSafe(EmulatePackUINT_2101010) sequence is not
  // emitted anymore: its result was immediately overwritten by the code below
  // and copying it into dest could clobber src1 when both share a register.
  Xmm src;
  if (i.src1.is_constant) {
    // A constant operand has no register; materialize it in dest, matching
    // how the other emitters in this file handle constant inputs.
    src = i.dest;
    e.LoadConstantXmm(src, i.src1.constant());
  } else {
    src = i.src1;
  }
  // Saturate.
  e.vmaxps(i.dest, src, e.GetXmmConstPtr(XMMPackUINT_2101010_MinUnpacked));
  e.vminps(i.dest, i.dest, e.GetXmmConstPtr(XMMPackUINT_2101010_MaxUnpacked));
  // Remove the unneeded bits of the floats.
  e.vpand(i.dest, e.GetXmmConstPtr(XMMPackUINT_2101010_MaskUnpacked));
  if (e.IsFeatureEnabled(kX64EmitAVX2)) {
    // Shift the components up (per-lane variable shift by 0, 10, 20, 30).
    e.vpsllvd(i.dest, i.dest, e.GetXmmConstPtr(XMMPackUINT_2101010_Shift));
  } else {
    // Without per-lane shifts, replicate every component at each field
    // position and then keep only the right field in each lane.
    // Duplicate all the components into bits 10-19.
    e.vpslld(e.xmm0, i.dest, 10);
    e.vpor(i.dest, e.xmm0);
    // Duplicate all the components into bits 20-39
    // (so alpha will be in 30-31).
    e.vpslld(e.xmm0, i.dest, 20);
    e.vpor(i.dest, e.xmm0);
    // Leave only the needed components.
    e.vpand(i.dest, e.GetXmmConstPtr(XMMPackUINT_2101010_MaskPacked));
  }
  // Combine the components: OR neighbouring lanes, then OR the two halves.
  e.vshufps(e.xmm0, i.dest, i.dest, _MM_SHUFFLE(2, 3, 0, 1));
  e.vorps(i.dest, e.xmm0);
  e.vshufps(e.xmm0, i.dest, i.dest, _MM_SHUFFLE(1, 0, 3, 2));
  e.vorps(i.dest, e.xmm0);
}
static __m128i EmulatePack8_IN_16_UN_UN_SAT(void*, __m128i src1,
__m128i src2) {
@ -7329,20 +7309,23 @@ struct UNPACK : Sequence<UNPACK, I<OPCODE_UNPACK, V128Op, V128Op>> {
}
// Emits the D3DCOLOR unpack: spreads the bytes of the packed color across the
// lanes of dest with the XMMUnpackD3DCOLOR shuffle control and ORs in the bit
// pattern of 1.0f, so each lane becomes (1.0f | byte).
static void EmitD3DCOLOR(X64Emitter& e, const EmitArgType& i) {
  // ARGB (WXYZ) -> RGBA (XYZW)
  // XMLoadColor
  Xmm src;
  if (i.src1.is_constant) {
    if (i.src1.value->IsConstantZero()) {
      // All bytes are zero, so every lane is exactly 1.0f.
      e.vmovaps(i.dest, e.GetXmmConstPtr(XMMOne));
      return;
    }
    // Any other constant is materialized in the scratch register; no assert
    // is needed now that this case is handled.
    src = e.xmm0;
    e.LoadConstantXmm(src, i.src1.constant());
  } else {
    src = i.src1;
  }
  // src = ZZYYXXWW
  // Unpack to 000000ZZ,000000YY,000000XX,000000WW
  // Must read `src`, not i.src1: for a constant operand only `src` holds the
  // value in a register.
  e.vpshufb(i.dest, src, e.GetXmmConstPtr(XMMUnpackD3DCOLOR));
  // Add 1.0f to each.
  e.vpor(i.dest, e.GetXmmConstPtr(XMMOne));
  // To convert to 0 to 1, games multiply by 0x47008081 and add 0xC7008081.
}
static __m128 EmulateFLOAT16_2(void*, __m128i src1) {
alignas(16) uint16_t a[8];
@ -7426,64 +7409,96 @@ struct UNPACK : Sequence<UNPACK, I<OPCODE_UNPACK, V128Op, V128Op>> {
// (VD.x) = 3.0 + (VB.x>>16)*2^-22
// (VD.y) = 3.0 + (VB.x)*2^-22
// (VD.z) = 0.0
// (VD.w) = 1.0
// XMLoadShortN2 plus 3,3,0,3 (for some reason)
// (VD.w) = 1.0 (games splat W after unpacking to get vectors of 1.0f)
// src is (xx,xx,xx,VALUE)
// (VALUE,VALUE,VALUE,VALUE)
Xmm src;
if (i.src1.is_constant) {
if (i.src1.value->IsConstantZero()) {
e.vmovdqa(i.dest, e.GetXmmConstPtr(XMM3301));
return;
} else {
// TODO(benvanik): check other common constants/perform shuffle/or here.
src = e.xmm0;
e.LoadConstantXmm(src, i.src1.constant());
}
// TODO(benvanik): check other common constants/perform shuffle/or here.
src = e.xmm0;
e.LoadConstantXmm(src, i.src1.constant());
} else {
src = i.src1;
}
// Shuffle bytes.
e.vpshufb(i.dest, src, e.GetXmmConstPtr(XMMUnpackSHORT_2));
// Sign extend words.
// If negative, make smaller than 3 - sign extend before adding.
e.vpslld(i.dest, 16);
e.vpsrad(i.dest, 16);
// Add 3,3,0,1.
e.vpaddd(i.dest, e.GetXmmConstPtr(XMM3301));
// Clamp the absolute value to the maximum positive value.
e.vmaxps(i.dest, i.dest, e.GetXmmConstPtr(XMMUnpackSHORT_2_Min));
}
// Emits the SHORT_4 unpack: four signed 16-bit values from src1 are
// sign-extended and added to the bit pattern of 3.0f, one per lane of dest.
static void EmitSHORT_4(X64Emitter& e, const EmitArgType& i) {
  // (VD.x) = 3.0 + (VB.x>>16)*2^-22
  // (VD.y) = 3.0 + (VB.x)*2^-22
  // (VD.z) = 3.0 + (VB.y>>16)*2^-22
  // (VD.w) = 3.0 + (VB.y)*2^-22
  // XMLoadShortN4 plus 3,3,3,3
  // src is (xx,xx,VALUE,VALUE)
  Xmm src;
  if (i.src1.is_constant) {
    if (i.src1.value->IsConstantZero()) {
      // All shorts are zero, so every lane is exactly 3.0f.
      e.vmovdqa(i.dest, e.GetXmmConstPtr(XMM3333));
      return;
    }
    // TODO(benvanik): check other common constants/perform shuffle/or here.
    // Load the constant exactly once into the scratch register.
    src = e.xmm0;
    e.LoadConstantXmm(src, i.src1.constant());
  } else {
    src = i.src1;
  }
  // Shuffle bytes.
  e.vpshufb(i.dest, src, e.GetXmmConstPtr(XMMUnpackSHORT_4));
  // Sign extend words.
  // If negative, make smaller than 3 - sign extend before adding.
  e.vpslld(i.dest, 16);
  e.vpsrad(i.dest, 16);
  // Add 3,3,3,3 (integer add on the float bit patterns).
  e.vpaddd(i.dest, e.GetXmmConstPtr(XMM3333));
  // Clamp from below so the most negative short cannot exceed the largest
  // positive one in magnitude.
  // NOTE(review): XMMPackSHORT_Min is defined outside this view - presumably
  // the same lower bound as XMMUnpackSHORT_2_Min; confirm.
  e.vmaxps(i.dest, i.dest, e.GetXmmConstPtr(XMMPackSHORT_Min));
}
// Emits the 2:10:10:10 unpack: the packed dword in lane W of src1 is split
// into its 10/10/10/2-bit fields, one per lane, each sign-extended from 10
// bits and added to the bit pattern of 3.0f. Uses xmm0 as scratch.
static void EmitUINT_2101010(X64Emitter& e, const EmitArgType& i) {
  // This path is implemented, so the former "not implemented" assert is gone.
  Xmm src;
  if (i.src1.is_constant) {
    if (i.src1.value->IsConstantZero()) {
      // All fields are zero, so every lane is exactly 3.0f.
      e.vmovdqa(i.dest, e.GetXmmConstPtr(XMM3333));
      return;
    }
    src = e.xmm0;
    e.LoadConstantXmm(src, i.src1.constant());
  } else {
    src = i.src1;
  }
  // Splat W.
  e.vshufps(i.dest, src, src, _MM_SHUFFLE(3, 3, 3, 3));
  // Keep only the needed components.
  // Red in 0-9 now, green in 10-19, blue in 20-29, alpha in 30-31.
  e.vpand(i.dest, e.GetXmmConstPtr(XMMPackUINT_2101010_MaskPacked));
  if (e.IsFeatureEnabled(kX64EmitAVX2)) {
    // Shift the components down (per-lane variable shift by 0, 10, 20, 30).
    e.vpsrlvd(i.dest, i.dest, e.GetXmmConstPtr(XMMPackUINT_2101010_Shift));
  } else {
    // Without per-lane shifts, smear each field down to bit 0 and mask off
    // the higher copies afterwards.
    // Duplicate green in 0-9 and alpha in 20-21.
    e.vpsrld(e.xmm0, i.dest, 10);
    e.vpor(i.dest, e.xmm0);
    // Duplicate blue in 0-9 and alpha in 0-1.
    e.vpsrld(e.xmm0, i.dest, 20);
    e.vpor(i.dest, e.xmm0);
    // Remove higher duplicate components.
    e.vpand(i.dest, e.GetXmmConstPtr(XMMPackUINT_2101010_MaskUnpacked));
  }
  // If negative, make smaller than 3 - sign extend XYZ before adding.
  // (W is at most 2 bits wide after masking, so its bit 9 is clear and the
  // sign extension leaves it unchanged.)
  e.vpslld(i.dest, 22);
  e.vpsrad(i.dest, 22);
  // Add 3,3,3,3 (integer add on the float bit patterns).
  e.vpaddd(i.dest, e.GetXmmConstPtr(XMM3333));
  // Clamp the absolute values of XYZ to the maximum positive value.
  e.vmaxps(i.dest, i.dest, e.GetXmmConstPtr(XMMPackUINT_2101010_MinUnpacked));
  // To convert XYZ to -1 to 1, games multiply by 0x46004020 & add 0xC6C06030.
}
static void Emit8_IN_16(X64Emitter& e, const EmitArgType& i, uint32_t flags) {
assert_false(IsPackOutSaturate(flags));