PPC: Support v[u]pkd3d128 VPACK_NORMSHORT4

This commit is contained in:
Dr. Chat 2017-05-12 17:55:52 -05:00
parent 82efbd7bc5
commit d3ed53c43e
7 changed files with 80 additions and 34 deletions

View File

@ -639,12 +639,16 @@ static const vec128_t xmm_consts[] = {
0x01000302u),
/* XMMUnpackFLOAT16_4 */ vec128i(0x09080B0Au, 0x0D0C0F0Eu, 0xFFFFFFFFu,
0xFFFFFFFFu),
/* XMMPackSHORT_2Min */ vec128i(0x403F8001u),
/* XMMPackSHORT_2Max */ vec128i(0x40407FFFu),
/* XMMPackSHORT_Min */ vec128i(0x403F8001u),
/* XMMPackSHORT_Max */ vec128i(0x40407FFFu),
/* XMMPackSHORT_2 */ vec128i(0xFFFFFFFFu, 0xFFFFFFFFu, 0xFFFFFFFFu,
0x01000504u),
/* XMMPackSHORT_4 */ vec128i(0xFFFFFFFFu, 0xFFFFFFFFu, 0x01000504u,
0x09080D0Cu),
/* XMMUnpackSHORT_2 */ vec128i(0xFFFF0F0Eu, 0xFFFF0D0Cu, 0xFFFFFFFFu,
0xFFFFFFFFu),
/* XMMUnpackSHORT_4 */ vec128i(0xFFFF0B0Au, 0xFFFF0908u, 0xFFFF0F0Eu,
0xFFFF0D0Cu),
/* XMMOneOver255 */ vec128f(1.0f / 255.0f),
/* XMMMaskEvenPI16 */ vec128i(0x0000FFFFu, 0x0000FFFFu, 0x0000FFFFu,
0x0000FFFFu),

View File

@ -70,10 +70,12 @@ enum XmmConst {
XMMUnpackFLOAT16_2,
XMMPackFLOAT16_4,
XMMUnpackFLOAT16_4,
XMMPackSHORT_2Min,
XMMPackSHORT_2Max,
XMMPackSHORT_Min,
XMMPackSHORT_Max,
XMMPackSHORT_2,
XMMPackSHORT_4,
XMMUnpackSHORT_2,
XMMUnpackSHORT_4,
XMMOneOver255,
XMMMaskEvenPI16,
XMMShiftMaskEvenPI16,

View File

@ -6881,6 +6881,9 @@ struct PACK : Sequence<PACK, I<OPCODE_PACK, V128Op, V128Op, V128Op>> {
case PACK_TYPE_SHORT_2:
EmitSHORT_2(e, i);
break;
case PACK_TYPE_SHORT_4:
EmitSHORT_4(e, i);
break;
case PACK_TYPE_UINT_2101010:
EmitUINT_2101010(e, i);
break;
@ -6970,11 +6973,19 @@ struct PACK : Sequence<PACK, I<OPCODE_PACK, V128Op, V128Op, V128Op>> {
// Emits the x64 sequence that packs src1 for PACK_TYPE_SHORT_2: each float
// lane is clamped between the pack min/max constants, then a byte shuffle
// gathers the 16-bit results according to the XMMPackSHORT_2 selector.
// src2 is unused and must be a constant zero.
static void EmitSHORT_2(X64Emitter& e, const EmitArgType& i) {
assert_true(i.src2.value->IsConstantZero());
// Saturate.
// NOTE(review): the clamp pair is listed twice here, once with the
// XMMPackSHORT_2Min/_2Max names and once with XMMPackSHORT_Min/_Max. Both
// name pairs carry the same values in the constant table (0x403F8001 and
// 0x40407FFF), and the second vmaxps reads i.src1 again, so the second pair
// alone determines i.dest.
e.vmaxps(i.dest, i.src1, e.GetXmmConstPtr(XMMPackSHORT_2Min));
e.vminps(i.dest, i.dest, e.GetXmmConstPtr(XMMPackSHORT_2Max));
e.vmaxps(i.dest, i.src1, e.GetXmmConstPtr(XMMPackSHORT_Min));
e.vminps(i.dest, i.dest, e.GetXmmConstPtr(XMMPackSHORT_Max));
// Pack.
// Selector bytes of 0xFF in XMMPackSHORT_2 zero the corresponding output
// bytes; only the last 32-bit lane receives packed data.
e.vpshufb(i.dest, i.dest, e.GetXmmConstPtr(XMMPackSHORT_2));
}
// Emits the x64 sequence that packs src1 for PACK_TYPE_SHORT_4: each float
// lane is clamped between XMMPackSHORT_Min and XMMPackSHORT_Max, then a byte
// shuffle gathers the low 16 bits of every lane according to the
// XMMPackSHORT_4 selector (0xFF selector bytes zero the output byte).
// src2 is unused and must be a constant zero.
static void EmitSHORT_4(X64Emitter& e, const EmitArgType& i) {
  assert_true(i.src2.value->IsConstantZero());
  // A constant operand is not resident in a register, so it has to be
  // materialized before it can be used as an instruction source. This uses
  // the same scratch register (xmm0) as the unpack-side EmitSHORT_4.
  Xmm src;
  if (i.src1.is_constant) {
    src = e.xmm0;
    e.LoadConstantXmm(src, i.src1.constant());
  } else {
    src = i.src1;
  }
  // Saturate.
  e.vmaxps(i.dest, src, e.GetXmmConstPtr(XMMPackSHORT_Min));
  e.vminps(i.dest, i.dest, e.GetXmmConstPtr(XMMPackSHORT_Max));
  // Pack.
  e.vpshufb(i.dest, i.dest, e.GetXmmConstPtr(XMMPackSHORT_4));
}
static __m128i EmulatePackUINT_2101010(void*, __m128i src1) {
// https://www.opengl.org/registry/specs/ARB/vertex_type_2_10_10_10_rev.txt
union {
@ -7229,15 +7240,15 @@ struct UNPACK : Sequence<UNPACK, I<OPCODE_UNPACK, V128Op, V128Op>> {
case PACK_TYPE_FLOAT16_2:
EmitFLOAT16_2(e, i);
break;
case PACK_TYPE_FLOAT16_3:
EmitFLOAT16_3(e, i);
break;
case PACK_TYPE_FLOAT16_4:
EmitFLOAT16_4(e, i);
break;
case PACK_TYPE_SHORT_2:
EmitSHORT_2(e, i);
break;
case PACK_TYPE_SHORT_4:
EmitSHORT_4(e, i);
break;
case PACK_TYPE_UINT_2101010:
EmitUINT_2101010(e, i);
break;
@ -7323,27 +7334,6 @@ struct UNPACK : Sequence<UNPACK, I<OPCODE_UNPACK, V128Op, V128Op>> {
e.vmovaps(i.dest, e.xmm0);
}
}
// Native helper for the FLOAT16_3 unpack: widens three 16-bit half floats
// taken from src1 (word slots VEC128_W(5) through VEC128_W(7)) into the first
// three float lanes of the result and sets the fourth lane to 1.0f.
// FIXME: This has not been verified on a real 360, but from context the
// return values are used in floating point math.
static __m128 EmulateFLOAT16_3(void*, __m128i src1) {
  // Spill the vector so the individual 16-bit words can be indexed.
  alignas(16) uint16_t halves[8];
  _mm_store_si128(reinterpret_cast<__m128i*>(halves), src1);

  alignas(16) float unpacked[4];
  // FIXME: Correct?
  unpacked[3] = 1.0f;
  int lane = 0;
  while (lane < 3) {
    unpacked[lane] = half_float::detail::half2float(halves[VEC128_W(5 + lane)]);
    ++lane;
  }
  return _mm_load_ps(unpacked);
}
// Emits a call to the EmulateFLOAT16_3 native helper: src1 is stashed to
// memory, the stash address is passed in r8, and the helper's result is
// copied from xmm0 into dest.
static void EmitFLOAT16_3(X64Emitter& e, const EmitArgType& i) {
  // Stash the source first; its address is the helper's vector argument.
  const auto stashed_src = e.StashXmm(0, i.src1);
  e.lea(e.r8, stashed_src);
  void* const helper = reinterpret_cast<void*>(EmulateFLOAT16_3);
  e.CallNativeSafe(helper);
  // The helper's __m128 return value arrives in xmm0.
  e.vmovaps(i.dest, e.xmm0);
}
static __m128 EmulateFLOAT16_4(void*, __m128i src1) {
alignas(16) uint16_t a[8];
alignas(16) float b[4];
@ -7398,6 +7388,36 @@ struct UNPACK : Sequence<UNPACK, I<OPCODE_UNPACK, V128Op, V128Op>> {
// Add 3,3,0,1.
e.vpaddd(i.dest, e.GetXmmConstPtr(XMM3301));
}
// Emits the x64 sequence that unpacks four signed 16-bit values for
// PACK_TYPE_SHORT_4:
//   (VD.x) = 3.0 + (VB.x>>16)*2^-22
//   (VD.y) = 3.0 + (VB.x)*2^-22
//   (VD.z) = 3.0 + (VB.y>>16)*2^-22
//   (VD.w) = 3.0 + (VB.y)*2^-22
// i.e. XMLoadShortN4 plus 3,3,3,3 (for some reason).
// The source is laid out as (xx,xx,VALUE,VALUE) and the result as
// (VALUE,VALUE,VALUE,VALUE).
static void EmitSHORT_4(X64Emitter& e, const EmitArgType& i) {
  const bool src_is_constant = i.src1.is_constant;

  // An all-zero constant unpacks to the bias alone; load it directly.
  if (src_is_constant && i.src1.value->IsConstantZero()) {
    e.vmovdqa(i.dest, e.GetXmmConstPtr(XMM3333));
    return;
  }

  Xmm packed;
  if (src_is_constant) {
    // TODO(benvanik): check other common constants/perform shuffle/or here.
    packed = e.xmm0;
    e.LoadConstantXmm(packed, i.src1.constant());
  } else {
    packed = i.src1;
  }

  // Move each 16-bit value into the low half of its own 32-bit lane.
  e.vpshufb(i.dest, packed, e.GetXmmConstPtr(XMMUnpackSHORT_4));
  // Sign extend the words: shift up to the top of the lane, then
  // arithmetic-shift back down.
  e.vpslld(i.dest, 16);
  e.vpsrad(i.dest, 16);
  // Integer-add the 3,3,3,3 constant.
  e.vpaddd(i.dest, e.GetXmmConstPtr(XMM3333));
}
// Placeholder for the PACK_TYPE_UINT_2101010 unpack: emits no instructions
// and only raises the "not implemented" assertion.
static void EmitUINT_2101010(X64Emitter& e, const EmitArgType& i) {
assert_always("not implemented");
}

View File

@ -77,7 +77,7 @@ enum PackType : uint16_t {
// Special types:
PACK_TYPE_D3DCOLOR = 0,
PACK_TYPE_FLOAT16_2 = 1,
PACK_TYPE_FLOAT16_3 = 2, // FIXME: Not verified, but looks correct.
PACK_TYPE_SHORT_4 = 2,
PACK_TYPE_FLOAT16_4 = 3,
PACK_TYPE_SHORT_2 = 4,
PACK_TYPE_UINT_2101010 = 5,

View File

@ -2058,6 +2058,9 @@ int InstrEmit_vpkd3d128(PPCHIRBuilder& f, const InstrData& i) {
case 3: // VPACK_... 2 FLOAT16s DXGI_FORMAT_R16G16_FLOAT
v = f.Pack(v, PACK_TYPE_FLOAT16_2);
break;
case 4: // VPACK_NORMSHORT4
v = f.Pack(v, PACK_TYPE_SHORT_4);
break;
case 5: // VPACK_... 4 FLOAT16s DXGI_FORMAT_R16G16B16A16_FLOAT
v = f.Pack(v, PACK_TYPE_FLOAT16_4);
break;
@ -2158,8 +2161,8 @@ int InstrEmit_vupkd3d128(PPCHIRBuilder& f, const InstrData& i) {
case 3: // VPACK_... 2 FLOAT16s DXGI_FORMAT_R16G16_FLOAT
v = f.Unpack(v, PACK_TYPE_FLOAT16_2);
break;
case 4:
v = f.Unpack(v, PACK_TYPE_FLOAT16_3);
case 4: // VPACK_NORMSHORT4
v = f.Unpack(v, PACK_TYPE_SHORT_4);
break;
case 5: // VPACK_... 4 FLOAT16s DXGI_FORMAT_R16G16B16A16_FLOAT
v = f.Unpack(v, PACK_TYPE_FLOAT16_4);

View File

@ -2,7 +2,9 @@
# type:
# 0 = PACK_TYPE_D3DCOLOR
# 1 = PACK_TYPE_SHORT_2
# 2 = PACK_TYPE_2_10_10_10
# 3 = PACK_TYPE_FLOAT16_2
# 4 = PACK_TYPE_SHORT_4
# 5 = PACK_TYPE_FLOAT16_4
# mask:
# must not be zero
@ -177,6 +179,15 @@ test_vpkd3d128_short2_2:
#_ REGISTER_OUT v3 [4040FFFE, 403FF333, 42A23EC8, 403DB757]
#_ REGISTER_OUT v4 [CDCDCDCD, CDCDCDCD, CDCDCDCD, 7FFFF333]
test_vpkd3d128_short4_0:
# v3 = 3.0 + s * 2^-22 for s = [-32767, -8, 127, 0]
#_ REGISTER_IN v3 [403F8001, 403FFFF8, 4040007F, 40400000]
#_ REGISTER_IN v4 [CDCDCDCD, CDCDCDCD, CDCDCDCD, CDCDCDCD]
# vpkd3d128 v4, v3, 4, 2, 0
.long 0x18921E10
blr
#_ REGISTER_OUT v3 [403F8001, 403FFFF8, 4040007F, 40400000]
#_ REGISTER_OUT v4 [CDCDCDCD, CDCDCDCD, 8001FFF8, 007F0000]
test_vpkd3d128_uint_2101010_0:
#_ REGISTER_IN v3 [B8FF8000, B8FF8000, C04001FF, 4E9A5A5A]
@ -221,7 +232,6 @@ test_vpkd3d128_float16_2_0:
#_ REGISTER_OUT v3 [3F000000, BF000000, 00000000, 00000000]
#_ REGISTER_OUT v4 [CDCDCDCD, CDCDCDCD, CDCDCDCD, 3800B800]
test_vpkd3d128_float16_4_invalid_0:
#_ REGISTER_IN v3 [3FC00000, BFC00000, 3FC00000, BFC00000]
#_ REGISTER_IN v4 [CDCDCDCD, CDCDCDCD, CDCDCDCD, CDCDCDCD]

View File

@ -33,6 +33,13 @@ test_vupkd3d128_short2_2:
blr
#_ REGISTER_OUT v3 [40407FFF, 403FF333, 00000000, 3f800000]
test_vupkd3d128_short4_0:
#_ REGISTER_IN v3 [CDCDCDCD, CDCDCDCD, 7FFFFFFF, 007FFFF8]
# vupkd3d128 v3, v3, 4
.long 0x18701FF0
blr
#_ REGISTER_OUT v3 [40407FFF, 403FFFFF, 4040007F, 403FFFF8]
test_vupkd3d128_float16_2_0:
#_ REGISTER_IN v3 [CDCDCDCD, CDCDCDCD, CDCDCDCD, 3800B800]
# vupkd3d128 v3, v3, 3