diff --git a/src/xenia/cpu/backend/x64/x64_emitter.cc b/src/xenia/cpu/backend/x64/x64_emitter.cc index a2a85e7db..6479da0e3 100644 --- a/src/xenia/cpu/backend/x64/x64_emitter.cc +++ b/src/xenia/cpu/backend/x64/x64_emitter.cc @@ -626,7 +626,6 @@ static const vec128_t xmm_consts[] = { vec128i(0x01000302u, 0x05040706u, 0x09080B0Au, 0x0D0C0F0Eu), /* XMMPermuteControl15 */ vec128b(15), /* XMMPermuteByteMask */ vec128b(0x1F), - /* XMMPackD3DCOLORSat */ vec128i(0x404000FFu), /* XMMPackD3DCOLOR */ vec128i(0xFFFFFFFFu, 0xFFFFFFFFu, 0xFFFFFFFFu, 0x0C000408u), /* XMMUnpackD3DCOLOR */ @@ -647,8 +646,19 @@ static const vec128_t xmm_consts[] = { vec128i(0xFFFFFFFFu, 0xFFFFFFFFu, 0x01000504u, 0x09080D0Cu), /* XMMUnpackSHORT_2 */ vec128i(0xFFFF0F0Eu, 0xFFFF0D0Cu, 0xFFFFFFFFu, 0xFFFFFFFFu), + /* XMMUnpackSHORT_2_Min */ + vec128i(0x403F8001u, 0x403F8001u, 0x00000000u, 0x00000000u), /* XMMUnpackSHORT_4 */ vec128i(0xFFFF0B0Au, 0xFFFF0908u, 0xFFFF0F0Eu, 0xFFFF0D0Cu), + /* XMMPackUINT_2101010_MinUnpacked */ + vec128i(0x403FFE01u, 0x403FFE01u, 0x403FFE01u, 0x40400000u), + /* XMMPackUINT_2101010_MaxUnpacked */ + vec128i(0x404001FFu, 0x404001FFu, 0x404001FFu, 0x40400003u), + /* XMMPackUINT_2101010_MaskUnpacked */ + vec128i(0x3FFu, 0x3FFu, 0x3FFu, 0x3u), + /* XMMPackUINT_2101010_MaskPacked */ + vec128i(0x3FFu, 0x3FFu << 10, 0x3FFu << 20, 0x3u << 30), + /* XMMPackUINT_2101010_Shift */ vec128i(0, 10, 20, 30), /* XMMOneOver255 */ vec128f(1.0f / 255.0f), /* XMMMaskEvenPI16 */ vec128i(0x0000FFFFu, 0x0000FFFFu, 0x0000FFFFu, 0x0000FFFFu), diff --git a/src/xenia/cpu/backend/x64/x64_emitter.h b/src/xenia/cpu/backend/x64/x64_emitter.h index a499b66c2..039d67509 100644 --- a/src/xenia/cpu/backend/x64/x64_emitter.h +++ b/src/xenia/cpu/backend/x64/x64_emitter.h @@ -64,7 +64,6 @@ enum XmmConst { XMMByteOrderMask, XMMPermuteControl15, XMMPermuteByteMask, - XMMPackD3DCOLORSat, XMMPackD3DCOLOR, XMMUnpackD3DCOLOR, XMMPackFLOAT16_2, @@ -76,7 +75,13 @@ enum XmmConst { XMMPackSHORT_2, XMMPackSHORT_4, XMMUnpackSHORT_2, + XMMUnpackSHORT_2_Min, XMMUnpackSHORT_4, + XMMPackUINT_2101010_MinUnpacked, + XMMPackUINT_2101010_MaxUnpacked, + XMMPackUINT_2101010_MaskUnpacked, + XMMPackUINT_2101010_MaskPacked, + XMMPackUINT_2101010_Shift, XMMOneOver255, XMMMaskEvenPI16, XMMShiftMaskEvenPI16, diff --git a/src/xenia/cpu/backend/x64/x64_sequences.cc b/src/xenia/cpu/backend/x64/x64_sequences.cc index f2dfb89f0..84b91ea10 100644 --- a/src/xenia/cpu/backend/x64/x64_sequences.cc +++ b/src/xenia/cpu/backend/x64/x64_sequences.cc @@ -6964,20 +6964,24 @@ struct PACK : Sequence> { } static void EmitD3DCOLOR(X64Emitter& e, const EmitArgType& i) { assert_true(i.src2.value->IsConstantZero()); - // Saturate to [3,3....] so that only values between 3...[00] and 3...[FF] - // are valid. - if (i.src1.is_constant) { - e.LoadConstantXmm(i.dest, i.src1.constant()); - e.vminps(i.dest, i.dest, e.GetXmmConstPtr(XMMPackD3DCOLORSat)); - } else { - e.vminps(i.dest, i.src1, e.GetXmmConstPtr(XMMPackD3DCOLORSat)); - } - e.vmaxps(i.dest, i.dest, e.GetXmmConstPtr(XMM3333)); + // No saturation done here. + // Unpacking D3DCOLOR gives (1.0f | bits), or from 3F800000 to 3F8000FF. + // However, you can pack 3.0f + (value / (float) (1 << 22)), which creates + // a number between 40400000 and 404000FF: + // https://github.com/ValveSoftware/source-sdk-2013/blob/master/sp/src/public/pixelwriter.h#L648 + // With saturation, you will get 0 when re-packing after unpacking. + // The above code also has to perform clamping explicitly. + // Extract bytes. // RGBA (XYZW) -> ARGB (WXYZ) // w = ((src1.uw & 0xFF) << 24) | ((src1.ux & 0xFF) << 16) | // ((src1.uy & 0xFF) << 8) | (src1.uz & 0xFF) - e.vpshufb(i.dest, i.dest, e.GetXmmConstPtr(XMMPackD3DCOLOR)); + if (i.src1.is_constant) { + e.LoadConstantXmm(i.dest, i.src1.constant()); + e.vpshufb(i.dest, i.dest, e.GetXmmConstPtr(XMMPackD3DCOLOR)); + } else { + e.vpshufb(i.dest, i.src1, e.GetXmmConstPtr(XMMPackD3DCOLOR)); + } } static __m128i EmulateFLOAT16_2(void*, __m128 src1) { alignas(16) float a[4]; @@ -7050,59 +7054,35 @@ struct PACK : Sequence> { // Pack. e.vpshufb(i.dest, i.dest, e.GetXmmConstPtr(XMMPackSHORT_4)); } - static __m128i EmulatePackUINT_2101010(void*, __m128i src1) { - // https://www.opengl.org/registry/specs/ARB/vertex_type_2_10_10_10_rev.txt - union { - alignas(16) int32_t a_i[4]; - alignas(16) uint32_t a_u[4]; - alignas(16) float a_f[4]; - }; - alignas(16) uint32_t b[4]; - alignas(16) uint32_t c[4]; - _mm_store_si128(reinterpret_cast<__m128i*>(a_u), src1); - // XYZ are 10 bits, signed and saturated. - for (int i = 0; i < 3; ++i) { - static const int32_t kMinValueXYZ = 0x403FFE01; // 3 - 1FF / (1 << 22) - static const int32_t kMaxValueXYZ = 0x404001FF; // 3 + 1FF / (1 << 22) - uint32_t exponent = (a_u[i] >> 23) & 0xFF; - uint32_t fractional = a_u[i] & 0x007FFFFF; - if ((exponent == 0xFF) && fractional) { - b[i] = 0x200; - } else if (a_i[i] > kMaxValueXYZ) { - b[i] = 0x1FF; // INT_MAX - } else if (a_i[i] < kMinValueXYZ) { - b[i] = 0x201; // -INT_MAX - } else { - b[i] = a_u[i] & 0x3FF; - } - } - // W is 2 bits, unsigned and saturated. - static const int32_t kMinValueW = 0x40400000; // 3 - static const int32_t kMaxValueW = 0x40400003; // 3 + 3 / (1 << 22) - uint32_t w_exponent = (a_u[3] >> 23) & 0xFF; - uint32_t w_fractional = a_u[3] & 0x007FFFFF; - if ((w_exponent == 0xFF) && w_fractional) { - b[3] = 0x0; - } else if (a_i[3] > kMaxValueW) { - b[3] = 0x3; - } else if (a_i[3] < kMinValueW) { - b[3] = 0x0; - } else { - b[3] = a_u[3] & 0x3; - } - // Combine in 2101010 WZYX. - c[0] = c[1] = c[2] = 0; - c[3] = ((b[3] & 0x3) << 30) | ((b[2] & 0x3FF) << 20) | - ((b[1] & 0x3FF) << 10) | ((b[0] & 0x3FF)); - return _mm_load_si128(reinterpret_cast<__m128i*>(c)); - } static void EmitUINT_2101010(X64Emitter& e, const EmitArgType& i) { - assert_true(i.src2.value->IsConstantZero()); - // dest = [(b2(src1.w), b10(src1.z), b10(src1.y), b10(src1.x)), 0, 0, 0] - // TODO(benvanik): optimized version. - e.lea(e.r8, e.StashXmm(0, i.src1)); - e.CallNativeSafe(reinterpret_cast(EmulatePackUINT_2101010)); - e.vmovaps(i.dest, e.xmm0); + // https://www.opengl.org/registry/specs/ARB/vertex_type_2_10_10_10_rev.txt + // XYZ are 10 bits, signed and saturated. + // W is 2 bits, unsigned and saturated. + + // Saturate. + e.vmaxps(i.dest, i.src1, e.GetXmmConstPtr(XMMPackUINT_2101010_MinUnpacked)); + e.vminps(i.dest, i.dest, e.GetXmmConstPtr(XMMPackUINT_2101010_MaxUnpacked)); + // Remove the unneeded bits of the floats. + e.vpand(i.dest, e.GetXmmConstPtr(XMMPackUINT_2101010_MaskUnpacked)); + if (e.IsFeatureEnabled(kX64EmitAVX2)) { + // Shift the components up. + e.vpsllvd(i.dest, i.dest, e.GetXmmConstPtr(XMMPackUINT_2101010_Shift)); + } else { + // Duplicate all the components into bits 10-19. + e.vpslld(e.xmm0, i.dest, 10); + e.vpor(i.dest, e.xmm0); + // Duplicate all the components into bits 20-39 + // (so alpha will be in 30-31). + e.vpslld(e.xmm0, i.dest, 20); + e.vpor(i.dest, e.xmm0); + // Leave only the needed components. + e.vpand(i.dest, e.GetXmmConstPtr(XMMPackUINT_2101010_MaskPacked)); + } + // Combine the components. + e.vshufps(e.xmm0, i.dest, i.dest, _MM_SHUFFLE(2, 3, 0, 1)); + e.vorps(i.dest, e.xmm0); + e.vshufps(e.xmm0, i.dest, i.dest, _MM_SHUFFLE(1, 0, 3, 2)); + e.vorps(i.dest, e.xmm0); } static __m128i EmulatePack8_IN_16_UN_UN_SAT(void*, __m128i src1, __m128i src2) { @@ -7329,20 +7309,23 @@ struct UNPACK : Sequence> { } static void EmitD3DCOLOR(X64Emitter& e, const EmitArgType& i) { // ARGB (WXYZ) -> RGBA (XYZW) - // XMLoadColor + Xmm src; if (i.src1.is_constant) { if (i.src1.value->IsConstantZero()) { e.vmovaps(i.dest, e.GetXmmConstPtr(XMMOne)); return; - } else { - assert_always(); } + src = e.xmm0; + e.LoadConstantXmm(src, i.src1.constant()); + } else { + src = i.src1; } // src = ZZYYXXWW // Unpack to 000000ZZ,000000YY,000000XX,000000WW e.vpshufb(i.dest, i.src1, e.GetXmmConstPtr(XMMUnpackD3DCOLOR)); // Add 1.0f to each. e.vpor(i.dest, e.GetXmmConstPtr(XMMOne)); + // To convert to 0 to 1, games multiply by 0x47008081 and add 0xC7008081. } static __m128 EmulateFLOAT16_2(void*, __m128i src1) { alignas(16) uint16_t a[8]; @@ -7426,64 +7409,96 @@ struct UNPACK : Sequence> { // (VD.x) = 3.0 + (VB.x>>16)*2^-22 // (VD.y) = 3.0 + (VB.x)*2^-22 // (VD.z) = 0.0 - // (VD.w) = 1.0 - - // XMLoadShortN2 plus 3,3,0,3 (for some reason) + // (VD.w) = 1.0 (games splat W after unpacking to get vectors of 1.0f) // src is (xx,xx,xx,VALUE) - // (VALUE,VALUE,VALUE,VALUE) Xmm src; if (i.src1.is_constant) { if (i.src1.value->IsConstantZero()) { e.vmovdqa(i.dest, e.GetXmmConstPtr(XMM3301)); return; - } else { - // TODO(benvanik): check other common constants/perform shuffle/or here. - src = e.xmm0; - e.LoadConstantXmm(src, i.src1.constant()); } + // TODO(benvanik): check other common constants/perform shuffle/or here. + src = e.xmm0; + e.LoadConstantXmm(src, i.src1.constant()); } else { src = i.src1; } // Shuffle bytes. e.vpshufb(i.dest, src, e.GetXmmConstPtr(XMMUnpackSHORT_2)); - // Sign extend words. + // If negative, make smaller than 3 - sign extend before adding. e.vpslld(i.dest, 16); e.vpsrad(i.dest, 16); // Add 3,3,0,1. e.vpaddd(i.dest, e.GetXmmConstPtr(XMM3301)); + // Clamp the absolute value to the maximum positive value. + e.vmaxps(i.dest, i.dest, e.GetXmmConstPtr(XMMUnpackSHORT_2_Min)); } static void EmitSHORT_4(X64Emitter& e, const EmitArgType& i) { // (VD.x) = 3.0 + (VB.x>>16)*2^-22 // (VD.y) = 3.0 + (VB.x)*2^-22 // (VD.z) = 3.0 + (VB.y>>16)*2^-22 // (VD.w) = 3.0 + (VB.y)*2^-22 - - // XMLoadShortN4 plus 3,3,3,3 (for some reason) // src is (xx,xx,VALUE,VALUE) - // (VALUE,VALUE,VALUE,VALUE) Xmm src; if (i.src1.is_constant) { if (i.src1.value->IsConstantZero()) { e.vmovdqa(i.dest, e.GetXmmConstPtr(XMM3333)); return; - } else { - // TODO(benvanik): check other common constants/perform shuffle/or here. - src = e.xmm0; - e.LoadConstantXmm(src, i.src1.constant()); } + // TODO(benvanik): check other common constants/perform shuffle/or here. + src = e.xmm0; + e.LoadConstantXmm(src, i.src1.constant()); } else { src = i.src1; } // Shuffle bytes. e.vpshufb(i.dest, src, e.GetXmmConstPtr(XMMUnpackSHORT_4)); - // Sign extend words. + // If negative, make smaller than 3 - sign extend before adding. e.vpslld(i.dest, 16); e.vpsrad(i.dest, 16); // Add 3,3,3,3. e.vpaddd(i.dest, e.GetXmmConstPtr(XMM3333)); + // Clamp the absolute value to the maximum positive value. + e.vmaxps(i.dest, i.dest, e.GetXmmConstPtr(XMMPackSHORT_Min)); } static void EmitUINT_2101010(X64Emitter& e, const EmitArgType& i) { - assert_always("not implemented"); + Xmm src; + if (i.src1.is_constant) { + if (i.src1.value->IsConstantZero()) { + e.vmovdqa(i.dest, e.GetXmmConstPtr(XMM3333)); + return; + } + src = e.xmm0; + e.LoadConstantXmm(src, i.src1.constant()); + } else { + src = i.src1; + } + // Splat W. + e.vshufps(i.dest, src, src, _MM_SHUFFLE(3, 3, 3, 3)); + // Keep only the needed components. + // Red in 0-9 now, green in 10-19, blue in 20-29, alpha in 30-31. + e.vpand(i.dest, e.GetXmmConstPtr(XMMPackUINT_2101010_MaskPacked)); + if (e.IsFeatureEnabled(kX64EmitAVX2)) { + // Shift the components down. + e.vpsrlvd(i.dest, i.dest, e.GetXmmConstPtr(XMMPackUINT_2101010_Shift)); + } else { + // Duplicate green in 0-9 and alpha in 20-21. + e.vpsrld(e.xmm0, i.dest, 10); + e.vpor(i.dest, e.xmm0); + // Duplicate blue in 0-9 and alpha in 0-1. + e.vpsrld(e.xmm0, i.dest, 20); + e.vpor(i.dest, e.xmm0); + // Remove higher duplicate components. + e.vpand(i.dest, e.GetXmmConstPtr(XMMPackUINT_2101010_MaskUnpacked)); + } + // If negative, make smaller than 3 - sign extend XYZ before adding. + e.vpslld(i.dest, 22); + e.vpsrad(i.dest, 22); + // Add 3,3,3,3. + e.vpaddd(i.dest, e.GetXmmConstPtr(XMM3333)); + // Clamp the absolute values of XYZ to the maximum positive value. + e.vmaxps(i.dest, i.dest, e.GetXmmConstPtr(XMMPackUINT_2101010_MinUnpacked)); + // To convert XYZ to -1 to 1, games multiply by 0x46004020 & add 0xC6C06030. } static void Emit8_IN_16(X64Emitter& e, const EmitArgType& i, uint32_t flags) { assert_false(IsPackOutSaturate(flags));