Saturating unsigned VECTOR_ADD.

This commit is contained in:
Ben Vanik 2014-05-27 14:27:07 -07:00
parent 8619a15ee3
commit 5436cde0fc
3 changed files with 37 additions and 1 deletions

View File

@ -444,6 +444,7 @@ Address X64Emitter::GetXmmConstPtr(XmmConst id) {
/* XMMOneOver255 */ vec128f(1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f),
/* XMMShiftMaskPS */ vec128i(0x0000001Fu, 0x0000001Fu, 0x0000001Fu, 0x0000001Fu),
/* XMMShiftByteMask */ vec128i(0x000000FFu, 0x000000FFu, 0x000000FFu, 0x000000FFu),
/* XMMUnsignedDwordMax */ vec128i(0xFFFFFFFFu, 0x00000000u, 0xFFFFFFFFu, 0x00000000u),
};
// TODO(benvanik): cache base pointer somewhere? stack? It'd be nice to
// prevent this move.

View File

@ -52,6 +52,7 @@ enum XmmConst {
XMMOneOver255 = 13,
XMMShiftMaskPS = 14,
XMMShiftByteMask = 15,
XMMUnsignedDwordMax = 16,
};
// Unfortunately due to the design of xbyak we have to pass this to the ctor.

View File

@ -2566,7 +2566,41 @@ EMITTER(VECTOR_ADD, MATCH(I<OPCODE_VECTOR_ADD, V128<>, V128<>, V128<>>)) {
}
break;
case INT32_TYPE:
  // 32-bit integer lanes: plain wrapping add, or unsigned saturating add.
  // Signed saturation is not implemented and asserts.
  if (saturate) {
    if (is_unsigned) {
      // xmm0/xmm1/xmm2 are used as scratch below, so the sources must not
      // live in any of them.
      XEASSERT(src1 != e.xmm0 && src1 != e.xmm1 && src1 != e.xmm2);
      XEASSERT(src2 != e.xmm0 && src2 != e.xmm1 && src2 != e.xmm2);
      // Clamp each dword sum to 0xFFFFFFFF.
      // There is no vpaddusd, so each pair of dwords is widened to qwords,
      // added, and compared against XMMUnsignedDwordMax to detect overflow.
      // Lane layout: | A | B | C | D |
      // NOTE(review): this emits a raw 0xCC byte (int3) ahead of the
      // sequence - looks like a debug trap for this path; confirm whether
      // it should remain.
      e.db(0xCC);
      // | B | D | - low dword of each qword, zero-extended to 64 bits.
      e.vpsllq(e.xmm0, src1, 32);
      e.vpsllq(e.xmm1, src2, 32);
      e.vpsrlq(e.xmm0, 32);
      e.vpsrlq(e.xmm1, 32);
      e.vpaddq(e.xmm0, e.xmm1);
      // Qword becomes all ones where the 64-bit sum exceeds the constant
      // (presumably 0x00000000FFFFFFFF per qword - verify against the
      // XMMUnsignedDwordMax table entry).
      e.vpcmpgtq(e.xmm0, e.GetXmmConstPtr(XMMUnsignedDwordMax));
      // Keep the mask bits in the low dword only.
      e.vpsllq(e.xmm0, 32);
      e.vpsrlq(e.xmm0, 32);
      // | A | C | - high dword of each qword, shifted down to 64 bits.
      e.vpsrlq(e.xmm1, src1, 32);
      e.vpsrlq(e.xmm2, src2, 32);
      e.vpaddq(e.xmm1, e.xmm2);
      e.vpcmpgtq(e.xmm1, e.GetXmmConstPtr(XMMUnsignedDwordMax));
      // Move the mask bits back into the high dword.
      e.vpsllq(e.xmm1, 32);
      // xmm0 = combined mask: every dword that overflowed is all ones.
      e.vpor(e.xmm0, e.xmm1);
      e.vpaddd(dest, src1, src2);
      // dest.f[n] = xmm0.f[n] ? xmm0.f[n] : dest.f[n];
      // The combined mask in xmm0 must drive the blend; xmm1 only carries
      // the A/C lanes, which would leave B/D unsaturated. An all-ones mask
      // lane doubles as the saturated value 0xFFFFFFFF.
      e.vblendvps(dest, dest, e.xmm0, e.xmm0);
    } else {
      // Signed saturating add is not implemented.
      XEASSERTALWAYS();
    }
  } else {
    // Non-saturating: per-dword wrapping add.
    e.vpaddd(dest, src1, src2);
  }
  break;
case FLOAT32_TYPE:
e.vaddps(dest, src1, src2);