PPC: Support v[u]pkd3d128 VPACK_NORMSHORT4

This commit is contained in:
Dr. Chat 2017-05-12 17:55:52 -05:00
parent 82efbd7bc5
commit d3ed53c43e
7 changed files with 80 additions and 34 deletions

View File

@ -639,12 +639,16 @@ static const vec128_t xmm_consts[] = {
0x01000302u), 0x01000302u),
/* XMMUnpackFLOAT16_4 */ vec128i(0x09080B0Au, 0x0D0C0F0Eu, 0xFFFFFFFFu, /* XMMUnpackFLOAT16_4 */ vec128i(0x09080B0Au, 0x0D0C0F0Eu, 0xFFFFFFFFu,
0xFFFFFFFFu), 0xFFFFFFFFu),
/* XMMPackSHORT_2Min */ vec128i(0x403F8001u), /* XMMPackSHORT_Min */ vec128i(0x403F8001u),
/* XMMPackSHORT_2Max */ vec128i(0x40407FFFu), /* XMMPackSHORT_Max */ vec128i(0x40407FFFu),
/* XMMPackSHORT_2 */ vec128i(0xFFFFFFFFu, 0xFFFFFFFFu, 0xFFFFFFFFu, /* XMMPackSHORT_2 */ vec128i(0xFFFFFFFFu, 0xFFFFFFFFu, 0xFFFFFFFFu,
0x01000504u), 0x01000504u),
/* XMMPackSHORT_4 */ vec128i(0xFFFFFFFFu, 0xFFFFFFFFu, 0x01000504u,
0x09080D0Cu),
/* XMMUnpackSHORT_2 */ vec128i(0xFFFF0F0Eu, 0xFFFF0D0Cu, 0xFFFFFFFFu, /* XMMUnpackSHORT_2 */ vec128i(0xFFFF0F0Eu, 0xFFFF0D0Cu, 0xFFFFFFFFu,
0xFFFFFFFFu), 0xFFFFFFFFu),
/* XMMUnpackSHORT_4 */ vec128i(0xFFFF0B0Au, 0xFFFF0908u, 0xFFFF0F0Eu,
0xFFFF0D0Cu),
/* XMMOneOver255 */ vec128f(1.0f / 255.0f), /* XMMOneOver255 */ vec128f(1.0f / 255.0f),
/* XMMMaskEvenPI16 */ vec128i(0x0000FFFFu, 0x0000FFFFu, 0x0000FFFFu, /* XMMMaskEvenPI16 */ vec128i(0x0000FFFFu, 0x0000FFFFu, 0x0000FFFFu,
0x0000FFFFu), 0x0000FFFFu),

View File

@ -70,10 +70,12 @@ enum XmmConst {
XMMUnpackFLOAT16_2, XMMUnpackFLOAT16_2,
XMMPackFLOAT16_4, XMMPackFLOAT16_4,
XMMUnpackFLOAT16_4, XMMUnpackFLOAT16_4,
XMMPackSHORT_2Min, XMMPackSHORT_Min,
XMMPackSHORT_2Max, XMMPackSHORT_Max,
XMMPackSHORT_2, XMMPackSHORT_2,
XMMPackSHORT_4,
XMMUnpackSHORT_2, XMMUnpackSHORT_2,
XMMUnpackSHORT_4,
XMMOneOver255, XMMOneOver255,
XMMMaskEvenPI16, XMMMaskEvenPI16,
XMMShiftMaskEvenPI16, XMMShiftMaskEvenPI16,

View File

@ -6881,6 +6881,9 @@ struct PACK : Sequence<PACK, I<OPCODE_PACK, V128Op, V128Op, V128Op>> {
case PACK_TYPE_SHORT_2: case PACK_TYPE_SHORT_2:
EmitSHORT_2(e, i); EmitSHORT_2(e, i);
break; break;
case PACK_TYPE_SHORT_4:
EmitSHORT_4(e, i);
break;
case PACK_TYPE_UINT_2101010: case PACK_TYPE_UINT_2101010:
EmitUINT_2101010(e, i); EmitUINT_2101010(e, i);
break; break;
@ -6970,11 +6973,19 @@ struct PACK : Sequence<PACK, I<OPCODE_PACK, V128Op, V128Op, V128Op>> {
static void EmitSHORT_2(X64Emitter& e, const EmitArgType& i) { static void EmitSHORT_2(X64Emitter& e, const EmitArgType& i) {
assert_true(i.src2.value->IsConstantZero()); assert_true(i.src2.value->IsConstantZero());
// Saturate. // Saturate.
e.vmaxps(i.dest, i.src1, e.GetXmmConstPtr(XMMPackSHORT_2Min)); e.vmaxps(i.dest, i.src1, e.GetXmmConstPtr(XMMPackSHORT_Min));
e.vminps(i.dest, i.dest, e.GetXmmConstPtr(XMMPackSHORT_2Max)); e.vminps(i.dest, i.dest, e.GetXmmConstPtr(XMMPackSHORT_Max));
// Pack. // Pack.
e.vpshufb(i.dest, i.dest, e.GetXmmConstPtr(XMMPackSHORT_2)); e.vpshufb(i.dest, i.dest, e.GetXmmConstPtr(XMMPackSHORT_2));
} }
// Packs the four float lanes of src1 into four 16-bit values.
// Each lane is first clamped to the bit patterns 0x403F8001..0x40407FFF
// (XMMPackSHORT_Min/Max, i.e. 3.0 -/+ 32767*2^-22), after which the low 16 bits
// of every lane hold the short to keep. The XMMPackSHORT_4 byte shuffle then
// gathers those low halves into the last two 32-bit lanes of dest; the first
// two lanes are zeroed by the 0xFF mask bytes.
// src2 is required to be a constant zero.
// NOTE(review): src1 is used directly as a register operand here, while the
// UNPACK counterpart special-cases constant sources - confirm a constant src1
// cannot reach this sequence.
static void EmitSHORT_4(X64Emitter& e, const EmitArgType& i) {
assert_true(i.src2.value->IsConstantZero());
// Saturate.
e.vmaxps(i.dest, i.src1, e.GetXmmConstPtr(XMMPackSHORT_Min));
e.vminps(i.dest, i.dest, e.GetXmmConstPtr(XMMPackSHORT_Max));
// Pack.
e.vpshufb(i.dest, i.dest, e.GetXmmConstPtr(XMMPackSHORT_4));
}
static __m128i EmulatePackUINT_2101010(void*, __m128i src1) { static __m128i EmulatePackUINT_2101010(void*, __m128i src1) {
// https://www.opengl.org/registry/specs/ARB/vertex_type_2_10_10_10_rev.txt // https://www.opengl.org/registry/specs/ARB/vertex_type_2_10_10_10_rev.txt
union { union {
@ -7229,15 +7240,15 @@ struct UNPACK : Sequence<UNPACK, I<OPCODE_UNPACK, V128Op, V128Op>> {
case PACK_TYPE_FLOAT16_2: case PACK_TYPE_FLOAT16_2:
EmitFLOAT16_2(e, i); EmitFLOAT16_2(e, i);
break; break;
case PACK_TYPE_FLOAT16_3:
EmitFLOAT16_3(e, i);
break;
case PACK_TYPE_FLOAT16_4: case PACK_TYPE_FLOAT16_4:
EmitFLOAT16_4(e, i); EmitFLOAT16_4(e, i);
break; break;
case PACK_TYPE_SHORT_2: case PACK_TYPE_SHORT_2:
EmitSHORT_2(e, i); EmitSHORT_2(e, i);
break; break;
case PACK_TYPE_SHORT_4:
EmitSHORT_4(e, i);
break;
case PACK_TYPE_UINT_2101010: case PACK_TYPE_UINT_2101010:
EmitUINT_2101010(e, i); EmitUINT_2101010(e, i);
break; break;
@ -7323,27 +7334,6 @@ struct UNPACK : Sequence<UNPACK, I<OPCODE_UNPACK, V128Op, V128Op>> {
e.vmovaps(i.dest, e.xmm0); e.vmovaps(i.dest, e.xmm0);
} }
} }
// FIXME: Behavior has not been verified on a real 360; judging from context,
// callers consume the returned lanes in floating point math.
// Converts the three half-precision values found at 16-bit slots
// VEC128_W(5), VEC128_W(6) and VEC128_W(7) of src1 into floats for the first
// three output lanes and forces the fourth lane to 1.0f.
static __m128 EmulateFLOAT16_3(void*, __m128i src1) {
  // Spill the packed input so individual 16-bit slots can be indexed.
  alignas(16) uint16_t halves[8];
  _mm_store_si128(reinterpret_cast<__m128i*>(halves), src1);

  alignas(16) float lanes[4] = {
      half_float::detail::half2float(halves[VEC128_W(5)]),
      half_float::detail::half2float(halves[VEC128_W(6)]),
      half_float::detail::half2float(halves[VEC128_W(7)]),
      // FIXME: Correct?
      1.0f,
  };
  return _mm_load_ps(lanes);
}
// Emits a native call to EmulateFLOAT16_3 for src1 and moves the result into
// dest.
static void EmitFLOAT16_3(X64Emitter& e, const EmitArgType& i) {
// Pass the stashed copy of src1 to the helper via r8.
// NOTE(review): presumably StashXmm spills the vector and yields its address - confirm.
e.lea(e.r8, e.StashXmm(0, i.src1));
e.CallNativeSafe(reinterpret_cast<void*>(EmulateFLOAT16_3));
// The helper's result is read back from xmm0.
e.vmovaps(i.dest, e.xmm0);
}
static __m128 EmulateFLOAT16_4(void*, __m128i src1) { static __m128 EmulateFLOAT16_4(void*, __m128i src1) {
alignas(16) uint16_t a[8]; alignas(16) uint16_t a[8];
alignas(16) float b[4]; alignas(16) float b[4];
@ -7398,6 +7388,36 @@ struct UNPACK : Sequence<UNPACK, I<OPCODE_UNPACK, V128Op, V128Op>> {
// Add 3,3,0,1. // Add 3,3,0,1.
e.vpaddd(i.dest, e.GetXmmConstPtr(XMM3301)); e.vpaddd(i.dest, e.GetXmmConstPtr(XMM3301));
} }
// Unpacks four signed 16-bit values stored in the last two 32-bit lanes of
// src1 into one 32-bit lane each, then adds the XMM3333 constant to every lane.
static void EmitSHORT_4(X64Emitter& e, const EmitArgType& i) {
// Lane mapping as implemented by the XMMUnpackSHORT_4 shuffle (the packed
// shorts are read from the z and w lanes of the source):
// (VD.x) = 3.0 + (VB.z>>16)*2^-22
// (VD.y) = 3.0 + (VB.z)*2^-22
// (VD.z) = 3.0 + (VB.w>>16)*2^-22
// (VD.w) = 3.0 + (VB.w)*2^-22
// XMLoadShortN4 plus 3,3,3,3 (for some reason)
// src is (xx,xx,VALUE,VALUE)
// dest is (VALUE,VALUE,VALUE,VALUE)
Xmm src;
if (i.src1.is_constant) {
if (i.src1.value->IsConstantZero()) {
// Every short is zero, so the result is just the 3,3,3,3 constant.
e.vmovdqa(i.dest, e.GetXmmConstPtr(XMM3333));
return;
} else {
// TODO(benvanik): check other common constants/perform shuffle/or here.
// Materialize the constant in xmm0 so it can feed the shuffle below.
src = e.xmm0;
e.LoadConstantXmm(src, i.src1.constant());
}
} else {
src = i.src1;
}
// Shuffle bytes.
e.vpshufb(i.dest, src, e.GetXmmConstPtr(XMMUnpackSHORT_4));
// Sign extend words.
// (shift each 16-bit value to the top of its lane, then arithmetic-shift back)
e.vpslld(i.dest, 16);
e.vpsrad(i.dest, 16);
// Add 3,3,3,3.
e.vpaddd(i.dest, e.GetXmmConstPtr(XMM3333));
}
static void EmitUINT_2101010(X64Emitter& e, const EmitArgType& i) { static void EmitUINT_2101010(X64Emitter& e, const EmitArgType& i) {
assert_always("not implemented"); assert_always("not implemented");
} }

View File

@ -77,7 +77,7 @@ enum PackType : uint16_t {
// Special types: // Special types:
PACK_TYPE_D3DCOLOR = 0, PACK_TYPE_D3DCOLOR = 0,
PACK_TYPE_FLOAT16_2 = 1, PACK_TYPE_FLOAT16_2 = 1,
PACK_TYPE_FLOAT16_3 = 2, // FIXME: Not verified, but looks correct. PACK_TYPE_SHORT_4 = 2,
PACK_TYPE_FLOAT16_4 = 3, PACK_TYPE_FLOAT16_4 = 3,
PACK_TYPE_SHORT_2 = 4, PACK_TYPE_SHORT_2 = 4,
PACK_TYPE_UINT_2101010 = 5, PACK_TYPE_UINT_2101010 = 5,

View File

@ -2058,6 +2058,9 @@ int InstrEmit_vpkd3d128(PPCHIRBuilder& f, const InstrData& i) {
case 3: // VPACK_... 2 FLOAT16s DXGI_FORMAT_R16G16_FLOAT case 3: // VPACK_... 2 FLOAT16s DXGI_FORMAT_R16G16_FLOAT
v = f.Pack(v, PACK_TYPE_FLOAT16_2); v = f.Pack(v, PACK_TYPE_FLOAT16_2);
break; break;
case 4: // VPACK_NORMSHORT4
v = f.Pack(v, PACK_TYPE_SHORT_4);
break;
case 5: // VPACK_... 4 FLOAT16s DXGI_FORMAT_R16G16B16A16_FLOAT case 5: // VPACK_... 4 FLOAT16s DXGI_FORMAT_R16G16B16A16_FLOAT
v = f.Pack(v, PACK_TYPE_FLOAT16_4); v = f.Pack(v, PACK_TYPE_FLOAT16_4);
break; break;
@ -2158,8 +2161,8 @@ int InstrEmit_vupkd3d128(PPCHIRBuilder& f, const InstrData& i) {
case 3: // VPACK_... 2 FLOAT16s DXGI_FORMAT_R16G16_FLOAT case 3: // VPACK_... 2 FLOAT16s DXGI_FORMAT_R16G16_FLOAT
v = f.Unpack(v, PACK_TYPE_FLOAT16_2); v = f.Unpack(v, PACK_TYPE_FLOAT16_2);
break; break;
case 4: case 4: // VPACK_NORMSHORT4
v = f.Unpack(v, PACK_TYPE_FLOAT16_3); v = f.Unpack(v, PACK_TYPE_SHORT_4);
break; break;
case 5: // VPACK_... 4 FLOAT16s DXGI_FORMAT_R16G16B16A16_FLOAT case 5: // VPACK_... 4 FLOAT16s DXGI_FORMAT_R16G16B16A16_FLOAT
v = f.Unpack(v, PACK_TYPE_FLOAT16_4); v = f.Unpack(v, PACK_TYPE_FLOAT16_4);

View File

@ -2,7 +2,9 @@
# type: # type:
# 0 = PACK_TYPE_D3DCOLOR # 0 = PACK_TYPE_D3DCOLOR
# 1 = PACK_TYPE_SHORT_2 # 1 = PACK_TYPE_SHORT_2
# 2 = PACK_TYPE_2_10_10_10
# 3 = PACK_TYPE_FLOAT16_2 # 3 = PACK_TYPE_FLOAT16_2
# 4 = PACK_TYPE_SHORT_4 (VPACK_NORMSHORT4)
# 5 = PACK_TYPE_FLOAT16_4 # 5 = PACK_TYPE_FLOAT16_4
# mask: # mask:
# must not be zero # must not be zero
@ -177,6 +179,15 @@ test_vpkd3d128_short2_2:
#_ REGISTER_OUT v3 [4040FFFE, 403FF333, 42A23EC8, 403DB757] #_ REGISTER_OUT v3 [4040FFFE, 403FF333, 42A23EC8, 403DB757]
#_ REGISTER_OUT v4 [CDCDCDCD, CDCDCDCD, CDCDCDCD, 7FFFF333] #_ REGISTER_OUT v4 [CDCDCDCD, CDCDCDCD, CDCDCDCD, 7FFFF333]
test_vpkd3d128_short4_0:
# v3 = 3.0 + n * 2^-22 with n = [-32767, -8, 127, 0]
# (low 16 bits of each lane: 8001, FFF8, 007F, 0000)
#_ REGISTER_IN v3 [403F8001, 403FFFF8, 4040007F, 40400000]
#_ REGISTER_IN v4 [CDCDCDCD, CDCDCDCD, CDCDCDCD, CDCDCDCD]
# vpkd3d128 v4, v3, 4, 2, 0
.long 0x18921E10
blr
#_ REGISTER_OUT v3 [403F8001, 403FFFF8, 4040007F, 40400000]
# The four shorts land in the last two words of v4; the first two words keep their input value.
#_ REGISTER_OUT v4 [CDCDCDCD, CDCDCDCD, 8001FFF8, 007F0000]
test_vpkd3d128_uint_2101010_0: test_vpkd3d128_uint_2101010_0:
#_ REGISTER_IN v3 [B8FF8000, B8FF8000, C04001FF, 4E9A5A5A] #_ REGISTER_IN v3 [B8FF8000, B8FF8000, C04001FF, 4E9A5A5A]
@ -221,7 +232,6 @@ test_vpkd3d128_float16_2_0:
#_ REGISTER_OUT v3 [3F000000, BF000000, 00000000, 00000000] #_ REGISTER_OUT v3 [3F000000, BF000000, 00000000, 00000000]
#_ REGISTER_OUT v4 [CDCDCDCD, CDCDCDCD, CDCDCDCD, 3800B800] #_ REGISTER_OUT v4 [CDCDCDCD, CDCDCDCD, CDCDCDCD, 3800B800]
test_vpkd3d128_float16_4_invalid_0: test_vpkd3d128_float16_4_invalid_0:
#_ REGISTER_IN v3 [3FC00000, BFC00000, 3FC00000, BFC00000] #_ REGISTER_IN v3 [3FC00000, BFC00000, 3FC00000, BFC00000]
#_ REGISTER_IN v4 [CDCDCDCD, CDCDCDCD, CDCDCDCD, CDCDCDCD] #_ REGISTER_IN v4 [CDCDCDCD, CDCDCDCD, CDCDCDCD, CDCDCDCD]

View File

@ -33,6 +33,13 @@ test_vupkd3d128_short2_2:
blr blr
#_ REGISTER_OUT v3 [40407FFF, 403FF333, 00000000, 3f800000] #_ REGISTER_OUT v3 [40407FFF, 403FF333, 00000000, 3f800000]
test_vupkd3d128_short4_0:
# The last two words of v3 hold the packed shorts [7FFF, FFFF, 007F, FFF8];
# the first two words are filler that must not affect the result.
#_ REGISTER_IN v3 [CDCDCDCD, CDCDCDCD, 7FFFFFFF, 007FFFF8]
# vupkd3d128 v3, v3, 4
.long 0x18701FF0
blr
# Each output word is the 3.0 bit pattern (40400000) plus the sign-extended short.
#_ REGISTER_OUT v3 [40407FFF, 403FFFFF, 4040007F, 403FFFF8]
test_vupkd3d128_float16_2_0: test_vupkd3d128_float16_2_0:
#_ REGISTER_IN v3 [CDCDCDCD, CDCDCDCD, CDCDCDCD, 3800B800] #_ REGISTER_IN v3 [CDCDCDCD, CDCDCDCD, CDCDCDCD, 3800B800]
# vupkd3d128 v3, v3, 3 # vupkd3d128 v3, v3, 3