From 8c00aea5ffa143557c5275a78a2b08aa1ee9e1a1 Mon Sep 17 00:00:00 2001
From: Triang3l
Date: Sat, 20 Jul 2019 22:04:37 +0300
Subject: [PATCH] [CPU] vpkd3d VPACK_NORMPACKED64

---
 src/xenia/cpu/backend/x64/x64_emitter.cc    | 15 ++++-
 src/xenia/cpu/backend/x64/x64_emitter.h     |  9 ++-
 src/xenia/cpu/backend/x64/x64_seq_vector.cc | 69 ++++++++++++++++++++-
 src/xenia/cpu/hir/opcodes.h                 |  5 +-
 src/xenia/cpu/ppc/ppc_emit_altivec.cc       | 19 ++++--
 5 files changed, 104 insertions(+), 13 deletions(-)

diff --git a/src/xenia/cpu/backend/x64/x64_emitter.cc b/src/xenia/cpu/backend/x64/x64_emitter.cc
index 9bbea6df7..ef2c8bd57 100644
--- a/src/xenia/cpu/backend/x64/x64_emitter.cc
+++ b/src/xenia/cpu/backend/x64/x64_emitter.cc
@@ -666,7 +666,19 @@ static const vec128_t xmm_consts[] = {
     vec128i(0x3FFu, 0x3FFu << 10, 0x3FFu << 20, 0x3u << 30),
     /* XMMPackUINT_2101010_Shift */ vec128i(0, 10, 20, 30),
     /* XMMUnpackUINT_2101010_Overflow */ vec128i(0x403FFE00u),
-    /* XMMUnpackOverflowNaN */ vec128i(0x7FC00000u),
+    /* XMMPackULONG_4202020_MinUnpacked */
+    vec128i(0x40380001u, 0x40380001u, 0x40380001u, 0x40400000u),
+    /* XMMPackULONG_4202020_MaxUnpacked */
+    vec128i(0x4047FFFFu, 0x4047FFFFu, 0x4047FFFFu, 0x4040000Fu),
+    /* XMMPackULONG_4202020_MaskUnpacked */
+    vec128i(0xFFFFFu, 0xFFFFFu, 0xFFFFFu, 0xFu),
+    /* XMMPackULONG_4202020_PermuteXZ */
+    vec128i(0xFFFFFFFFu, 0xFFFFFFFFu, 0x0A0908FFu, 0xFF020100u),
+    /* XMMPackULONG_4202020_PermuteYW */
+    vec128i(0xFFFFFFFFu, 0xFFFFFFFFu, 0x0CFFFF06u, 0x0504FFFFu),
+    /* XMMUnpackULONG_4202020_Permute */
+    vec128i(0xFF0E0D0Cu, 0xFF0B0A09u, 0xFF080F0Eu, 0xFFFFFF0Bu),
+    /* XMMUnpackULONG_4202020_Overflow */ vec128i(0x40380000u),
     /* XMMOneOver255 */ vec128f(1.0f / 255.0f),
     /* XMMMaskEvenPI16 */
     vec128i(0x0000FFFFu, 0x0000FFFFu, 0x0000FFFFu, 0x0000FFFFu),
@@ -696,6 +708,7 @@ static const vec128_t xmm_consts[] = {
     /* XMMIntMax */ vec128i(INT_MAX),
     /* XMMIntMaxPD */ vec128d(INT_MAX),
     /* XMMPosIntMinPS */ vec128f((float)0x80000000u),
+    /* XMMQNaN */ vec128i(0x7FC00000u),
 };
 
 // First location to try and place constants.
diff --git a/src/xenia/cpu/backend/x64/x64_emitter.h b/src/xenia/cpu/backend/x64/x64_emitter.h
index a35c2d2b0..59c904b99 100644
--- a/src/xenia/cpu/backend/x64/x64_emitter.h
+++ b/src/xenia/cpu/backend/x64/x64_emitter.h
@@ -85,7 +85,13 @@ enum XmmConst {
   XMMPackUINT_2101010_MaskPacked,
   XMMPackUINT_2101010_Shift,
   XMMUnpackUINT_2101010_Overflow,
-  XMMUnpackOverflowNaN,
+  XMMPackULONG_4202020_MinUnpacked,
+  XMMPackULONG_4202020_MaxUnpacked,
+  XMMPackULONG_4202020_MaskUnpacked,
+  XMMPackULONG_4202020_PermuteXZ,
+  XMMPackULONG_4202020_PermuteYW,
+  XMMUnpackULONG_4202020_Permute,
+  XMMUnpackULONG_4202020_Overflow,
   XMMOneOver255,
   XMMMaskEvenPI16,
   XMMShiftMaskEvenPI16,
@@ -105,6 +111,7 @@ enum XmmConst {
   XMMIntMax,
   XMMIntMaxPD,
   XMMPosIntMinPS,
+  XMMQNaN,
 };
 
 // Unfortunately due to the design of xbyak we have to pass this to the ctor.
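For reference, the new pack constants encode the usual vpkd3d biased-float
trick: a signed 20-bit field n is carried as the float 3.0f + n * 2^-22,
whose bit pattern is exactly 0x40400000 + n while no exponent carry occurs.
Clamping to the float range [0x40380001, 0x4047FFFF] therefore saturates n
to [-0x7FFFF, 0x7FFFF] (0x40380000 itself is reserved as the unpack
overflow marker), after which the low 20 bits of the float already hold the
two's-complement field. A minimal scalar sketch of one XYZ lane, with
PackField20 as an illustrative name only, not part of the patch:

#include <cstdint>
#include <cstring>

// Packs one signed 20-bit lane the way the vmaxps/vminps/vpand sequence does.
uint32_t PackField20(float f) {
  // XYZ lanes of XMMPackULONG_4202020_MinUnpacked/MaxUnpacked as floats.
  const uint32_t kMinBits = 0x40380001u, kMaxBits = 0x4047FFFFu;
  float min_f, max_f;
  std::memcpy(&min_f, &kMinBits, sizeof(min_f));
  std::memcpy(&max_f, &kMaxBits, sizeof(max_f));
  // vmaxps/vminps return the second (constant) operand when a lane is NaN,
  // so NaN inputs saturate to the minimum.
  f = (f >= min_f) ? f : min_f;
  f = (f <= max_f) ? f : max_f;
  uint32_t bits;
  std::memcpy(&bits, &f, sizeof(bits));
  // The low 20 bits are the field value; this is what the vpand with
  // XMMPackULONG_4202020_MaskUnpacked keeps.
  return bits & 0xFFFFFu;
}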
diff --git a/src/xenia/cpu/backend/x64/x64_seq_vector.cc b/src/xenia/cpu/backend/x64/x64_seq_vector.cc
index 0d1a0de33..dc9aa7186 100644
--- a/src/xenia/cpu/backend/x64/x64_seq_vector.cc
+++ b/src/xenia/cpu/backend/x64/x64_seq_vector.cc
@@ -1822,6 +1822,9 @@ struct PACK : Sequence<PACK, I<OPCODE_PACK, V128Op, V128Op, V128Op>> {
       case PACK_TYPE_UINT_2101010:
         EmitUINT_2101010(e, i);
         break;
+      case PACK_TYPE_ULONG_4202020:
+        EmitULONG_4202020(e, i);
+        break;
       case PACK_TYPE_8_IN_16:
         Emit8_IN_16(e, i, i.instr->flags);
         break;
@@ -2002,6 +2005,32 @@ struct PACK : Sequence<PACK, I<OPCODE_PACK, V128Op, V128Op, V128Op>> {
     e.vshufps(e.xmm0, i.dest, i.dest, _MM_SHUFFLE(1, 0, 3, 2));
     e.vorps(i.dest, e.xmm0);
   }
+  static void EmitULONG_4202020(X64Emitter& e, const EmitArgType& i) {
+    // XYZ are 20 bits, signed and saturated.
+    // W is 4 bits, unsigned and saturated.
+    Xmm src;
+    if (i.src1.is_constant) {
+      src = i.dest;
+      e.LoadConstantXmm(src, i.src1.constant());
+    } else {
+      src = i.src1;
+    }
+    // Saturate.
+    e.vmaxps(i.dest, src, e.GetXmmConstPtr(XMMPackULONG_4202020_MinUnpacked));
+    e.vminps(i.dest, i.dest,
+             e.GetXmmConstPtr(XMMPackULONG_4202020_MaxUnpacked));
+    // Remove the unneeded bits of the floats (so excess nibbles will also be
+    // cleared).
+    e.vpand(i.dest, e.GetXmmConstPtr(XMMPackULONG_4202020_MaskUnpacked));
+    // Store Y and W shifted left by 4 so vpshufb can be used with them.
+    e.vpslld(e.xmm0, i.dest, 4);
+    // Place XZ where they're supposed to be.
+    e.vpshufb(i.dest, i.dest, e.GetXmmConstPtr(XMMPackULONG_4202020_PermuteXZ));
+    // Place YW.
+    e.vpshufb(e.xmm0, e.xmm0, e.GetXmmConstPtr(XMMPackULONG_4202020_PermuteYW));
+    // Merge XZ and YW.
+    e.vorps(i.dest, e.xmm0);
+  }
   static __m128i EmulatePack8_IN_16_UN_UN_SAT(void*, __m128i src1,
                                               __m128i src2) {
     alignas(16) uint16_t a[8];
@@ -2214,6 +2243,9 @@ struct UNPACK : Sequence<UNPACK, I<OPCODE_UNPACK, V128Op, V128Op>> {
       case PACK_TYPE_UINT_2101010:
         EmitUINT_2101010(e, i);
         break;
+      case PACK_TYPE_ULONG_4202020:
+        EmitULONG_4202020(e, i);
+        break;
       case PACK_TYPE_8_IN_16:
         Emit8_IN_16(e, i, i.instr->flags);
         break;
@@ -2367,7 +2399,7 @@ struct UNPACK : Sequence<UNPACK, I<OPCODE_UNPACK, V128Op, V128Op>> {
     e.vpaddd(i.dest, e.GetXmmConstPtr(XMM3301));
     // Return quiet NaNs in case of negative overflow.
     e.vcmpeqps(e.xmm0, i.dest, e.GetXmmConstPtr(XMMUnpackSHORT_Overflow));
-    e.vblendvps(i.dest, i.dest, e.GetXmmConstPtr(XMMUnpackOverflowNaN), e.xmm0);
+    e.vblendvps(i.dest, i.dest, e.GetXmmConstPtr(XMMQNaN), e.xmm0);
   }
   static void EmitSHORT_4(X64Emitter& e, const EmitArgType& i) {
     // (VD.x) = 3.0 + (VB.x>>16)*2^-22
@@ -2396,7 +2428,7 @@ struct UNPACK : Sequence<UNPACK, I<OPCODE_UNPACK, V128Op, V128Op>> {
     e.vpaddd(i.dest, e.GetXmmConstPtr(XMM3333));
     // Return quiet NaNs in case of negative overflow.
     e.vcmpeqps(e.xmm0, i.dest, e.GetXmmConstPtr(XMMUnpackSHORT_Overflow));
-    e.vblendvps(i.dest, i.dest, e.GetXmmConstPtr(XMMUnpackOverflowNaN), e.xmm0);
+    e.vblendvps(i.dest, i.dest, e.GetXmmConstPtr(XMMQNaN), e.xmm0);
   }
   static void EmitUINT_2101010(X64Emitter& e, const EmitArgType& i) {
     Xmm src;
@@ -2437,10 +2469,41 @@ struct UNPACK : Sequence<UNPACK, I<OPCODE_UNPACK, V128Op, V128Op>> {
     // Return quiet NaNs in case of negative overflow.
     e.vcmpeqps(e.xmm0, i.dest,
                e.GetXmmConstPtr(XMMUnpackUINT_2101010_Overflow));
-    e.vblendvps(i.dest, i.dest, e.GetXmmConstPtr(XMMUnpackOverflowNaN), e.xmm0);
+    e.vblendvps(i.dest, i.dest, e.GetXmmConstPtr(XMMQNaN), e.xmm0);
     // To convert XYZ to -1 to 1, games multiply by 0x46004020 & sub 0x46C06030.
     // For W to 0 to 1, they multiply by and subtract 0x4A2AAAAB.
   }
+  static void EmitULONG_4202020(X64Emitter& e, const EmitArgType& i) {
+    Xmm src;
+    if (i.src1.is_constant) {
+      if (i.src1.value->IsConstantZero()) {
+        e.vmovdqa(i.dest, e.GetXmmConstPtr(XMM3331));
+        return;
+      }
+      src = i.dest;
+      e.LoadConstantXmm(src, i.src1.constant());
+    } else {
+      src = i.src1;
+    }
+    // Extract pairs of nibbles to XZYW. XZ will have excess 4 upper bits, YW
+    // will have excess 4 lower bits.
+    e.vpshufb(i.dest, src, e.GetXmmConstPtr(XMMUnpackULONG_4202020_Permute));
+    // Drop the excess nibble of YW.
+    e.vpsrld(e.xmm0, i.dest, 4);
+    // Merge XZ and YW now both starting at offset 0.
+    e.vshufps(i.dest, i.dest, e.xmm0, _MM_SHUFFLE(3, 2, 1, 0));
+    // Reorder as XYZW.
+    e.vshufps(i.dest, i.dest, _MM_SHUFFLE(3, 1, 2, 0));
+    // Drop the excess upper nibble in XZ and sign-extend XYZ.
+    e.vpslld(i.dest, 12);
+    e.vpsrad(i.dest, 12);
+    // Add 3,3,3,1.
+    e.vpaddd(i.dest, e.GetXmmConstPtr(XMM3331));
+    // Return quiet NaNs in case of negative overflow.
+    e.vcmpeqps(e.xmm0, i.dest,
+               e.GetXmmConstPtr(XMMUnpackULONG_4202020_Overflow));
+    e.vblendvps(i.dest, i.dest, e.GetXmmConstPtr(XMMQNaN), e.xmm0);
+  }
   static void Emit8_IN_16(X64Emitter& e, const EmitArgType& i, uint32_t flags) {
     assert_false(IsPackOutSaturate(flags));
     Xmm src;
diff --git a/src/xenia/cpu/hir/opcodes.h b/src/xenia/cpu/hir/opcodes.h
index ce232fd1d..6afa28555 100644
--- a/src/xenia/cpu/hir/opcodes.h
+++ b/src/xenia/cpu/hir/opcodes.h
@@ -81,10 +81,11 @@ enum PackType : uint16_t {
   PACK_TYPE_FLOAT16_4 = 3,
   PACK_TYPE_SHORT_2 = 4,
   PACK_TYPE_UINT_2101010 = 5,
+  PACK_TYPE_ULONG_4202020 = 6,
 
   // Types which use the bitmasks below for configuration:
-  PACK_TYPE_8_IN_16 = 6,
-  PACK_TYPE_16_IN_32 = 7,
+  PACK_TYPE_8_IN_16 = 7,
+  PACK_TYPE_16_IN_32 = 8,
   PACK_TYPE_MODE = 0x000F,  // just to get the mode
 
   // Unpack to low or high parts.
diff --git a/src/xenia/cpu/ppc/ppc_emit_altivec.cc b/src/xenia/cpu/ppc/ppc_emit_altivec.cc
index b5b60af6d..08ea1b2fa 100644
--- a/src/xenia/cpu/ppc/ppc_emit_altivec.cc
+++ b/src/xenia/cpu/ppc/ppc_emit_altivec.cc
@@ -2052,18 +2052,22 @@ int InstrEmit_vpkd3d128(PPCHIRBuilder& f, const InstrData& i) {
     case 1:  // VPACK_NORMSHORT2
       v = f.Pack(v, PACK_TYPE_SHORT_2);
       break;
-    case 2:  // VPACK_... 2_10_10_10 w_z_y_x
+    case 2:  // VPACK_NORMPACKED32 2_10_10_10 w_z_y_x
       v = f.Pack(v, PACK_TYPE_UINT_2101010);
       break;
-    case 3:  // VPACK_... 2 FLOAT16s DXGI_FORMAT_R16G16_FLOAT
+    case 3:  // VPACK_FLOAT16_2 DXGI_FORMAT_R16G16_FLOAT
       v = f.Pack(v, PACK_TYPE_FLOAT16_2);
       break;
     case 4:  // VPACK_NORMSHORT4
       v = f.Pack(v, PACK_TYPE_SHORT_4);
       break;
-    case 5:  // VPACK_... 4 FLOAT16s DXGI_FORMAT_R16G16B16A16_FLOAT
+    case 5:  // VPACK_FLOAT16_4 DXGI_FORMAT_R16G16B16A16_FLOAT
       v = f.Pack(v, PACK_TYPE_FLOAT16_4);
       break;
+    case 6:  // VPACK_NORMPACKED64 4_20_20_20 w_z_y_x
+      // Used in 2K games like NBA 2K9, pretty rarely in general.
+      v = f.Pack(v, PACK_TYPE_ULONG_4202020);
+      break;
     default:
       assert_unhandled_case(type);
       return 1;
@@ -2156,18 +2160,21 @@ int InstrEmit_vupkd3d128(PPCHIRBuilder& f, const InstrData& i) {
     case 1:  // VPACK_NORMSHORT2
       v = f.Unpack(v, PACK_TYPE_SHORT_2);
       break;
-    case 2:  // VPACK_... 2_10_10_10 w_z_y_x
+    case 2:  // VPACK_NORMPACKED32 2_10_10_10 w_z_y_x
       v = f.Unpack(v, PACK_TYPE_UINT_2101010);
       break;
-    case 3:  // VPACK_... 2 FLOAT16s DXGI_FORMAT_R16G16_FLOAT
+    case 3:  // VPACK_FLOAT16_2 DXGI_FORMAT_R16G16_FLOAT
       v = f.Unpack(v, PACK_TYPE_FLOAT16_2);
       break;
     case 4:  // VPACK_NORMSHORT4
       v = f.Unpack(v, PACK_TYPE_SHORT_4);
       break;
-    case 5:  // VPACK_... 4 FLOAT16s DXGI_FORMAT_R16G16B16A16_FLOAT
+    case 5:  // VPACK_FLOAT16_4 DXGI_FORMAT_R16G16B16A16_FLOAT
       v = f.Unpack(v, PACK_TYPE_FLOAT16_4);
       break;
+    case 6:  // VPACK_NORMPACKED64 4_20_20_20 w_z_y_x
+      v = f.Unpack(v, PACK_TYPE_ULONG_4202020);
+      break;
     default:
       assert_unhandled_case(type);
      return 1;
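For reference, a scalar sketch of what the PACK_TYPE_ULONG_4202020 unpack
above computes, assuming the 4_20_20_20 w_z_y_x field layout named in the
vpkd3d128 comments and ignoring guest byte order; UnpackNormPacked64 is an
illustrative name only, not part of the patch:

#include <cstdint>
#include <cstring>

// packed holds w:4 | z:20 | y:20 | x:20, from bit 63 down to bit 0.
void UnpackNormPacked64(uint64_t packed, float out[4]) {
  const uint32_t fields[4] = {
      uint32_t(packed & 0xFFFFFu),          // x
      uint32_t((packed >> 20) & 0xFFFFFu),  // y
      uint32_t((packed >> 40) & 0xFFFFFu),  // z
      uint32_t(packed >> 60),               // w
  };
  // Biases added to the float bit patterns, as vpaddd with XMM3331 does:
  // 3.0f for XYZ, 1.0f for W.
  const uint32_t bias[4] = {0x40400000u, 0x40400000u, 0x40400000u,
                            0x3F800000u};
  for (int c = 0; c < 4; ++c) {
    // Sign-extend the 20-bit X/Y/Z fields; the 4-bit W stays unsigned.
    int32_t n = c < 3 ? int32_t(fields[c] << 12) >> 12 : int32_t(fields[c]);
    uint32_t bits = bias[c] + uint32_t(n);  // integer add on float bits
    // n == -0x80000 wraps to 0x40380000 (XMMUnpackULONG_4202020_Overflow),
    // which the emitter replaces with a quiet NaN via vblendvps.
    if (bits == 0x40380000u) {
      bits = 0x7FC00000u;  // XMMQNaN
    }
    std::memcpy(&out[c], &bits, sizeof(float));
  }
}

So X/Y/Z unpack to 3.0f + n * 2^-22 with n in [-0x7FFFF, 0x7FFFF] (the most
negative encoding becomes a quiet NaN), and W unpacks to 1.0f + w * 2^-23.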