From c89cc6a2298f40d46e305c31b96628c01f152630 Mon Sep 17 00:00:00 2001 From: Triang3l Date: Mon, 11 Jun 2018 21:31:30 +0300 Subject: [PATCH] [CPU] vupkd3d: NaN on negative overflow and tests --- src/xenia/cpu/backend/x64/x64_emitter.cc | 3 +++ src/xenia/cpu/backend/x64/x64_emitter.h | 3 +++ src/xenia/cpu/backend/x64/x64_sequences.cc | 10 ++++++++ src/xenia/cpu/ppc/testing/instr_vupkd3d128.s | 25 ++++++++++++++++++++ 4 files changed, 41 insertions(+) diff --git a/src/xenia/cpu/backend/x64/x64_emitter.cc b/src/xenia/cpu/backend/x64/x64_emitter.cc index 77361aa54..5642e11d7 100644 --- a/src/xenia/cpu/backend/x64/x64_emitter.cc +++ b/src/xenia/cpu/backend/x64/x64_emitter.cc @@ -650,6 +650,7 @@ static const vec128_t xmm_consts[] = { vec128i(0xFFFF0F0Eu, 0xFFFF0D0Cu, 0xFFFFFFFFu, 0xFFFFFFFFu), /* XMMUnpackSHORT_4 */ vec128i(0xFFFF0B0Au, 0xFFFF0908u, 0xFFFF0F0Eu, 0xFFFF0D0Cu), + /* XMMUnpackSHORT_Overflow */ vec128i(0x403F8000u), /* XMMPackUINT_2101010_MinUnpacked */ vec128i(0x403FFE01u, 0x403FFE01u, 0x403FFE01u, 0x40400000u), /* XMMPackUINT_2101010_MaxUnpacked */ @@ -659,6 +660,8 @@ static const vec128_t xmm_consts[] = { /* XMMPackUINT_2101010_MaskPacked */ vec128i(0x3FFu, 0x3FFu << 10, 0x3FFu << 20, 0x3u << 30), /* XMMPackUINT_2101010_Shift */ vec128i(0, 10, 20, 30), + /* XMMUnpackUINT_2101010_Overflow */ vec128i(0x403FFE00u), + /* XMMUnpackOverflowNaN */ vec128i(0x7FC00000u), /* XMMOneOver255 */ vec128f(1.0f / 255.0f), /* XMMMaskEvenPI16 */ vec128i(0x0000FFFFu, 0x0000FFFFu, 0x0000FFFFu, 0x0000FFFFu), diff --git a/src/xenia/cpu/backend/x64/x64_emitter.h b/src/xenia/cpu/backend/x64/x64_emitter.h index 479fcc865..33ce2c0a2 100644 --- a/src/xenia/cpu/backend/x64/x64_emitter.h +++ b/src/xenia/cpu/backend/x64/x64_emitter.h @@ -78,11 +78,14 @@ enum XmmConst { XMMPackSHORT_4, XMMUnpackSHORT_2, XMMUnpackSHORT_4, + XMMUnpackSHORT_Overflow, XMMPackUINT_2101010_MinUnpacked, XMMPackUINT_2101010_MaxUnpacked, XMMPackUINT_2101010_MaskUnpacked, XMMPackUINT_2101010_MaskPacked, XMMPackUINT_2101010_Shift, + XMMUnpackUINT_2101010_Overflow, + XMMUnpackOverflowNaN, XMMOneOver255, XMMMaskEvenPI16, XMMShiftMaskEvenPI16, diff --git a/src/xenia/cpu/backend/x64/x64_sequences.cc b/src/xenia/cpu/backend/x64/x64_sequences.cc index d0eb88e11..d9cb1cc55 100644 --- a/src/xenia/cpu/backend/x64/x64_sequences.cc +++ b/src/xenia/cpu/backend/x64/x64_sequences.cc @@ -7426,6 +7426,9 @@ struct UNPACK : Sequence> { e.vpsrad(i.dest, 16); // Add 3,3,0,1. e.vpaddd(i.dest, e.GetXmmConstPtr(XMM3301)); + // Return quiet NaNs in case of negative overflow. + e.vcmpeqps(e.xmm0, i.dest, e.GetXmmConstPtr(XMMUnpackSHORT_Overflow)); + e.vblendvps(i.dest, i.dest, e.GetXmmConstPtr(XMMUnpackOverflowNaN), e.xmm0); } static void EmitSHORT_4(X64Emitter& e, const EmitArgType& i) { // (VD.x) = 3.0 + (VB.x>>16)*2^-22 @@ -7452,6 +7455,9 @@ struct UNPACK : Sequence> { e.vpsrad(i.dest, 16); // Add 3,3,3,3. e.vpaddd(i.dest, e.GetXmmConstPtr(XMM3333)); + // Return quiet NaNs in case of negative overflow. + e.vcmpeqps(e.xmm0, i.dest, e.GetXmmConstPtr(XMMUnpackSHORT_Overflow)); + e.vblendvps(i.dest, i.dest, e.GetXmmConstPtr(XMMUnpackOverflowNaN), e.xmm0); } static void EmitUINT_2101010(X64Emitter& e, const EmitArgType& i) { Xmm src; @@ -7489,6 +7495,10 @@ struct UNPACK : Sequence> { e.vpsrad(i.dest, 22); // Add 3,3,3,1. e.vpaddd(i.dest, e.GetXmmConstPtr(XMM3331)); + // Return quiet NaNs in case of negative overflow. + e.vcmpeqps(e.xmm0, i.dest, + e.GetXmmConstPtr(XMMUnpackUINT_2101010_Overflow)); + e.vblendvps(i.dest, i.dest, e.GetXmmConstPtr(XMMUnpackOverflowNaN), e.xmm0); // To convert XYZ to -1 to 1, games multiply by 0x46004020 & sub 0x46C06030. // For W to 0 to 1, they multiply by and subtract 0x4A2AAAAB. } diff --git a/src/xenia/cpu/ppc/testing/instr_vupkd3d128.s b/src/xenia/cpu/ppc/testing/instr_vupkd3d128.s index cca99605c..b16b299d4 100644 --- a/src/xenia/cpu/ppc/testing/instr_vupkd3d128.s +++ b/src/xenia/cpu/ppc/testing/instr_vupkd3d128.s @@ -32,6 +32,12 @@ test_vupkd3d128_short2_2: .long 0x18641FF0 blr #_ REGISTER_OUT v3 [40407FFF, 403FF333, 00000000, 3f800000] +test_vupkd3d128_short2_3: + #_ REGISTER_IN v3 [CDCDCDCD, CDCDCDCD, CDCDCDCD, 00008000] + # vupkd3d128 v3, v3, 1 + .long 0x18641FF0 + blr + #_ REGISTER_OUT v3 [40400000, 7FC00000, 00000000, 3f800000] test_vupkd3d128_short4_0: #_ REGISTER_IN v3 [CDCDCDCD, CDCDCDCD, 7FFFFFFF, 007FFFF8] @@ -53,3 +59,22 @@ test_vupkd3d128_float16_4_0: .long 0x18741FF0 blr #_ REGISTER_OUT v3 [3F000000, bf002000, 3f004000, bf006000] + +test_vupkd3d128_uint_2101010_0: + #_ REGISTER_IN v3 [CDCDCDCD, CDCDCDCD, CDCDCDCD, 400001FF] + # vupkd3d128 v3, v3, 2 + .long 0x18681FF0 + blr + #_ REGISTER_OUT v3 [404001FF, 40400000, 40400000, 3F800001] +test_vupkd3d128_uint_2101010_1: + #_ REGISTER_IN v3 [CDCDCDCD, CDCDCDCD, CDCDCDCD, 40000201] + # vupkd3d128 v3, v3, 2 + .long 0x18681FF0 + blr + #_ REGISTER_OUT v3 [403FFE01, 40400000, 40400000, 3F800001] +test_vupkd3d128_uint_2101010_2: + #_ REGISTER_IN v3 [CDCDCDCD, CDCDCDCD, CDCDCDCD, 40000200] + # vupkd3d128 v3, v3, 2 + .long 0x18681FF0 + blr + #_ REGISTER_OUT v3 [7FC00000, 40400000, 40400000, 3F800001]