[CPU] Fix vpkd3d half4 component order

This commit is contained in:
Triang3l 2019-01-24 17:45:41 +03:00
parent c3fcb47efe
commit 4ae9266f13
3 changed files with 8 additions and 7 deletions

View File

@ -642,7 +642,7 @@ static const vec128_t xmm_consts[] = {
/* XMMUnpackFLOAT16_2 */
vec128i(0x0D0C0F0Eu, 0xFFFFFFFFu, 0xFFFFFFFFu, 0xFFFFFFFFu),
/* XMMPackFLOAT16_4 */
vec128i(0xFFFFFFFFu, 0xFFFFFFFFu, 0x05040706u, 0x01000302u),
vec128i(0xFFFFFFFFu, 0xFFFFFFFFu, 0x01000302u, 0x05040706u),
/* XMMUnpackFLOAT16_4 */
vec128i(0x09080B0Au, 0x0D0C0F0Eu, 0xFFFFFFFFu, 0xFFFFFFFFu),
/* XMMPackSHORT_Min */ vec128i(0x403F8001u),

View File

@ -1902,14 +1902,15 @@ struct PACK : Sequence<PACK, I<OPCODE_PACK, V128Op, V128Op, V128Op>> {
std::memset(b, 0, sizeof(b));
for (int i = 0; i < 4; i++) {
b[7 - i] = half_float::detail::float2half<std::round_toward_zero>(a[i]);
b[7 - (i ^ 2)] =
half_float::detail::float2half<std::round_toward_zero>(a[i]);
}
return _mm_load_si128(reinterpret_cast<__m128i*>(b));
}
static void EmitFLOAT16_4(X64Emitter& e, const EmitArgType& i) {
assert_true(i.src2.value->IsConstantZero());
// dest = [(src1.x | src1.y), (src1.z | src1.w), 0, 0]
// dest = [(src1.z | src1.w), (src1.x | src1.y), 0, 0]
Xmm src;
if (e.IsFeatureEnabled(kX64EmitF16C)) {
@ -1921,7 +1922,7 @@ struct PACK : Sequence<PACK, I<OPCODE_PACK, V128Op, V128Op, V128Op>> {
}
// 0|0|0|0|W|Z|Y|X
e.vcvtps2ph(i.dest, src, 0b00000011);
// Shuffle to X|Y|Z|W|0|0|0|0
// Shuffle to Z|W|X|Y|0|0|0|0
e.vpshufb(i.dest, i.dest, e.GetXmmConstPtr(XMMPackFLOAT16_4));
} else {
if (i.src1.is_constant) {

View File

@ -249,10 +249,10 @@ test_vpkd3d128_float16_4_invalid_0:
#_ REGISTER_OUT v3 [3FC00000, BFC00000, 3FC00000, BFC00000]
#_ REGISTER_OUT v4 [CDCDCDCD, CDCDCDCD, 3E00BE00, 3E00BE00]
test_vpkd3d128_float16_4_0:
#_ REGISTER_IN v3 [3F000000, BF000000, 3F000000, BF000000]
#_ REGISTER_IN v3 [3F000000, BF000000, 3F800000, BF800000]
#_ REGISTER_IN v4 [CDCDCDCD, CDCDCDCD, CDCDCDCD, CDCDCDCD]
# vpkd3d128 v4, v3, 5, 2, 0
.long 0x18961E10
blr
#_ REGISTER_OUT v3 [3F000000, BF000000, 3F000000, BF000000]
#_ REGISTER_OUT v4 [CDCDCDCD, CDCDCDCD, 3800B800, 3800B800]
#_ REGISTER_OUT v3 [3F000000, BF000000, 3F800000, BF800000]
#_ REGISTER_OUT v4 [CDCDCDCD, CDCDCDCD, 3800B800, 3C00BC00]