diff --git a/src/xenia/cpu/backend/x64/x64_emitter.cc b/src/xenia/cpu/backend/x64/x64_emitter.cc index 683e4a471..0e3cbe31c 100644 --- a/src/xenia/cpu/backend/x64/x64_emitter.cc +++ b/src/xenia/cpu/backend/x64/x64_emitter.cc @@ -639,12 +639,16 @@ static const vec128_t xmm_consts[] = { 0x01000302u), /* XMMUnpackFLOAT16_4 */ vec128i(0x09080B0Au, 0x0D0C0F0Eu, 0xFFFFFFFFu, 0xFFFFFFFFu), - /* XMMPackSHORT_2Min */ vec128i(0x403F8001u), - /* XMMPackSHORT_2Max */ vec128i(0x40407FFFu), + /* XMMPackSHORT_Min */ vec128i(0x403F8001u), + /* XMMPackSHORT_Max */ vec128i(0x40407FFFu), /* XMMPackSHORT_2 */ vec128i(0xFFFFFFFFu, 0xFFFFFFFFu, 0xFFFFFFFFu, 0x01000504u), + /* XMMPackSHORT_4 */ vec128i(0xFFFFFFFFu, 0xFFFFFFFFu, 0x01000504u, + 0x09080D0Cu), /* XMMUnpackSHORT_2 */ vec128i(0xFFFF0F0Eu, 0xFFFF0D0Cu, 0xFFFFFFFFu, 0xFFFFFFFFu), + /* XMMUnpackSHORT_4 */ vec128i(0xFFFF0B0Au, 0xFFFF0908u, 0xFFFF0F0Eu, + 0xFFFF0D0Cu), /* XMMOneOver255 */ vec128f(1.0f / 255.0f), /* XMMMaskEvenPI16 */ vec128i(0x0000FFFFu, 0x0000FFFFu, 0x0000FFFFu, 0x0000FFFFu), diff --git a/src/xenia/cpu/backend/x64/x64_emitter.h b/src/xenia/cpu/backend/x64/x64_emitter.h index 8f952105b..9c87bab0a 100644 --- a/src/xenia/cpu/backend/x64/x64_emitter.h +++ b/src/xenia/cpu/backend/x64/x64_emitter.h @@ -70,10 +70,12 @@ enum XmmConst { XMMUnpackFLOAT16_2, XMMPackFLOAT16_4, XMMUnpackFLOAT16_4, - XMMPackSHORT_2Min, - XMMPackSHORT_2Max, + XMMPackSHORT_Min, + XMMPackSHORT_Max, XMMPackSHORT_2, + XMMPackSHORT_4, XMMUnpackSHORT_2, + XMMUnpackSHORT_4, XMMOneOver255, XMMMaskEvenPI16, XMMShiftMaskEvenPI16, diff --git a/src/xenia/cpu/backend/x64/x64_sequences.cc b/src/xenia/cpu/backend/x64/x64_sequences.cc index 9c30fe964..a57a51422 100644 --- a/src/xenia/cpu/backend/x64/x64_sequences.cc +++ b/src/xenia/cpu/backend/x64/x64_sequences.cc @@ -6881,6 +6881,9 @@ struct PACK : Sequence> { case PACK_TYPE_SHORT_2: EmitSHORT_2(e, i); break; + case PACK_TYPE_SHORT_4: + EmitSHORT_4(e, i); + break; case PACK_TYPE_UINT_2101010: 
EmitUINT_2101010(e, i); break; @@ -6970,11 +6973,19 @@ struct PACK : Sequence> { static void EmitSHORT_2(X64Emitter& e, const EmitArgType& i) { assert_true(i.src2.value->IsConstantZero()); // Saturate. - e.vmaxps(i.dest, i.src1, e.GetXmmConstPtr(XMMPackSHORT_2Min)); - e.vminps(i.dest, i.dest, e.GetXmmConstPtr(XMMPackSHORT_2Max)); + e.vmaxps(i.dest, i.src1, e.GetXmmConstPtr(XMMPackSHORT_Min)); + e.vminps(i.dest, i.dest, e.GetXmmConstPtr(XMMPackSHORT_Max)); // Pack. e.vpshufb(i.dest, i.dest, e.GetXmmConstPtr(XMMPackSHORT_2)); } + static void EmitSHORT_4(X64Emitter& e, const EmitArgType& i) { + assert_true(i.src2.value->IsConstantZero()); + // Saturate. + e.vmaxps(i.dest, i.src1, e.GetXmmConstPtr(XMMPackSHORT_Min)); + e.vminps(i.dest, i.dest, e.GetXmmConstPtr(XMMPackSHORT_Max)); + // Pack. + e.vpshufb(i.dest, i.dest, e.GetXmmConstPtr(XMMPackSHORT_4)); + } static __m128i EmulatePackUINT_2101010(void*, __m128i src1) { // https://www.opengl.org/registry/specs/ARB/vertex_type_2_10_10_10_rev.txt union { @@ -7229,15 +7240,15 @@ struct UNPACK : Sequence> { case PACK_TYPE_FLOAT16_2: EmitFLOAT16_2(e, i); break; - case PACK_TYPE_FLOAT16_3: - EmitFLOAT16_3(e, i); - break; case PACK_TYPE_FLOAT16_4: EmitFLOAT16_4(e, i); break; case PACK_TYPE_SHORT_2: EmitSHORT_2(e, i); break; + case PACK_TYPE_SHORT_4: + EmitSHORT_4(e, i); + break; case PACK_TYPE_UINT_2101010: EmitUINT_2101010(e, i); break; @@ -7323,27 +7334,6 @@ struct UNPACK : Sequence> { e.vmovaps(i.dest, e.xmm0); } } - // FIXME: This has not been verified on a real 360, but from context the - // return values are used in floating point math. - static __m128 EmulateFLOAT16_3(void*, __m128i src1) { - alignas(16) uint16_t a[8]; - alignas(16) float b[4]; - _mm_store_si128(reinterpret_cast<__m128i*>(a), src1); - - for (int i = 0; i < 3; i++) { - b[i] = half_float::detail::half2float(a[VEC128_W(5 + i)]); - } - - // FIXME: Correct? 
- b[3] = 1.0f; - - return _mm_load_ps(b); - } - static void EmitFLOAT16_3(X64Emitter& e, const EmitArgType& i) { - e.lea(e.r8, e.StashXmm(0, i.src1)); - e.CallNativeSafe(reinterpret_cast(EmulateFLOAT16_3)); - e.vmovaps(i.dest, e.xmm0); - } static __m128 EmulateFLOAT16_4(void*, __m128i src1) { alignas(16) uint16_t a[8]; alignas(16) float b[4]; @@ -7398,6 +7388,36 @@ struct UNPACK : Sequence> { // Add 3,3,0,1. e.vpaddd(i.dest, e.GetXmmConstPtr(XMM3301)); } + static void EmitSHORT_4(X64Emitter& e, const EmitArgType& i) { + // (VD.x) = 3.0 + (VB.x>>16)*2^-22 + // (VD.y) = 3.0 + (VB.x)*2^-22 + // (VD.z) = 3.0 + (VB.y>>16)*2^-22 + // (VD.w) = 3.0 + (VB.y)*2^-22 + + // XMLoadShortN4 plus 3,3,3,3 (for some reason) + // src is (xx,xx,VALUE,VALUE) + // (VALUE,VALUE,VALUE,VALUE) + Xmm src; + if (i.src1.is_constant) { + if (i.src1.value->IsConstantZero()) { + e.vmovdqa(i.dest, e.GetXmmConstPtr(XMM3333)); + return; + } else { + // TODO(benvanik): check other common constants/perform shuffle/or here. + src = e.xmm0; + e.LoadConstantXmm(src, i.src1.constant()); + } + } else { + src = i.src1; + } + // Shuffle bytes. + e.vpshufb(i.dest, src, e.GetXmmConstPtr(XMMUnpackSHORT_4)); + // Sign extend words. + e.vpslld(i.dest, 16); + e.vpsrad(i.dest, 16); + // Add 3,3,3,3. + e.vpaddd(i.dest, e.GetXmmConstPtr(XMM3333)); + } static void EmitUINT_2101010(X64Emitter& e, const EmitArgType& i) { assert_always("not implemented"); } diff --git a/src/xenia/cpu/hir/opcodes.h b/src/xenia/cpu/hir/opcodes.h index 6ae58f8f4..d2e9344db 100644 --- a/src/xenia/cpu/hir/opcodes.h +++ b/src/xenia/cpu/hir/opcodes.h @@ -77,7 +77,7 @@ enum PackType : uint16_t { // Special types: PACK_TYPE_D3DCOLOR = 0, PACK_TYPE_FLOAT16_2 = 1, - PACK_TYPE_FLOAT16_3 = 2, // FIXME: Not verified, but looks correct. 
+ PACK_TYPE_SHORT_4 = 2, PACK_TYPE_FLOAT16_4 = 3, PACK_TYPE_SHORT_2 = 4, PACK_TYPE_UINT_2101010 = 5, diff --git a/src/xenia/cpu/ppc/ppc_emit_altivec.cc b/src/xenia/cpu/ppc/ppc_emit_altivec.cc index d825f188a..c71b2ae5c 100644 --- a/src/xenia/cpu/ppc/ppc_emit_altivec.cc +++ b/src/xenia/cpu/ppc/ppc_emit_altivec.cc @@ -2058,6 +2058,9 @@ int InstrEmit_vpkd3d128(PPCHIRBuilder& f, const InstrData& i) { case 3: // VPACK_... 2 FLOAT16s DXGI_FORMAT_R16G16_FLOAT v = f.Pack(v, PACK_TYPE_FLOAT16_2); break; + case 4: // VPACK_NORMSHORT4 + v = f.Pack(v, PACK_TYPE_SHORT_4); + break; case 5: // VPACK_... 4 FLOAT16s DXGI_FORMAT_R16G16B16A16_FLOAT v = f.Pack(v, PACK_TYPE_FLOAT16_4); break; @@ -2158,8 +2161,8 @@ int InstrEmit_vupkd3d128(PPCHIRBuilder& f, const InstrData& i) { case 3: // VPACK_... 2 FLOAT16s DXGI_FORMAT_R16G16_FLOAT v = f.Unpack(v, PACK_TYPE_FLOAT16_2); break; - case 4: - v = f.Unpack(v, PACK_TYPE_FLOAT16_3); + case 4: // VPACK_NORMSHORT4 + v = f.Unpack(v, PACK_TYPE_SHORT_4); break; case 5: // VPACK_... 4 FLOAT16s DXGI_FORMAT_R16G16B16A16_FLOAT v = f.Unpack(v, PACK_TYPE_FLOAT16_4); diff --git a/src/xenia/cpu/ppc/testing/instr_vpkd3d128.s b/src/xenia/cpu/ppc/testing/instr_vpkd3d128.s index 9268b2a8b..c366b3ab2 100644 --- a/src/xenia/cpu/ppc/testing/instr_vpkd3d128.s +++ b/src/xenia/cpu/ppc/testing/instr_vpkd3d128.s @@ -2,7 +2,9 @@ # type: # 0 = PACK_TYPE_D3DCOLOR # 1 = PACK_TYPE_SHORT_2 +# 2 = PACK_TYPE_2_10_10_10 # 3 = PACK_TYPE_FLOAT16_2 +# 4 = PACK_TYPE_SHORT_4
# 5 = PACK_TYPE_FLOAT16_4 # mask: # must not be zero @@ -177,6 +179,15 @@ test_vpkd3d128_short2_2: #_ REGISTER_OUT v3 [4040FFFE, 403FF333, 42A23EC8, 403DB757] #_ REGISTER_OUT v4 [CDCDCDCD, CDCDCDCD, CDCDCDCD, 7FFFF333] +test_vpkd3d128_short4_0: + # v3 = 3.0 + [-32767, -8, 127, 0] * 2^-22 + #_ REGISTER_IN v3 [403F8001, 403FFFF8, 4040007F, 40400000] + #_ REGISTER_IN v4 [CDCDCDCD, CDCDCDCD, CDCDCDCD, CDCDCDCD] + # vpkd3d128 v4, v3, 4, 2, 0 + .long 0x18921E10 + blr + #_ REGISTER_OUT v3 [403F8001, 403FFFF8, 4040007F, 40400000] + #_ REGISTER_OUT v4 [CDCDCDCD, CDCDCDCD, 8001FFF8, 007F0000] test_vpkd3d128_uint_2101010_0: #_ REGISTER_IN v3 [B8FF8000, B8FF8000, C04001FF, 4E9A5A5A] @@ -221,7 +232,6 @@ test_vpkd3d128_float16_2_0: #_ REGISTER_OUT v3 [3F000000, BF000000, 00000000, 00000000] #_ REGISTER_OUT v4 [CDCDCDCD, CDCDCDCD, CDCDCDCD, 3800B800] - test_vpkd3d128_float16_4_invalid_0: #_ REGISTER_IN v3 [3FC00000, BFC00000, 3FC00000, BFC00000] #_ REGISTER_IN v4 [CDCDCDCD, CDCDCDCD, CDCDCDCD, CDCDCDCD] diff --git a/src/xenia/cpu/ppc/testing/instr_vupkd3d128.s b/src/xenia/cpu/ppc/testing/instr_vupkd3d128.s index 31800b3a2..cca99605c 100644 --- a/src/xenia/cpu/ppc/testing/instr_vupkd3d128.s +++ b/src/xenia/cpu/ppc/testing/instr_vupkd3d128.s @@ -33,6 +33,13 @@ test_vupkd3d128_short2_2: blr #_ REGISTER_OUT v3 [40407FFF, 403FF333, 00000000, 3f800000] +test_vupkd3d128_short4_0: + #_ REGISTER_IN v3 [CDCDCDCD, CDCDCDCD, 7FFFFFFF, 007FFFF8] + # vupkd3d128 v3, v3, 4 + .long 0x18701FF0 + blr + #_ REGISTER_OUT v3 [40407FFF, 403FFFFF, 4040007F, 403FFFF8] + test_vupkd3d128_float16_2_0: #_ REGISTER_IN v3 [CDCDCDCD, CDCDCDCD, CDCDCDCD, 3800B800] # vupkd3d128 v3, v3, 3