diff --git a/src/alloy/backend/x64/x64_emitter.cc b/src/alloy/backend/x64/x64_emitter.cc index 6e94a660a..684fcaa86 100644 --- a/src/alloy/backend/x64/x64_emitter.cc +++ b/src/alloy/backend/x64/x64_emitter.cc @@ -444,6 +444,7 @@ Address X64Emitter::GetXmmConstPtr(XmmConst id) { /* XMMOneOver255 */ vec128f(1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f), /* XMMShiftMaskPS */ vec128i(0x0000001Fu, 0x0000001Fu, 0x0000001Fu, 0x0000001Fu), /* XMMShiftByteMask */ vec128i(0x000000FFu, 0x000000FFu, 0x000000FFu, 0x000000FFu), + /* XMMUnsignedDwordMax */ vec128i(0xFFFFFFFFu, 0x00000000u, 0xFFFFFFFFu, 0x00000000u), }; // TODO(benvanik): cache base pointer somewhere? stack? It'd be nice to // prevent this move. diff --git a/src/alloy/backend/x64/x64_emitter.h b/src/alloy/backend/x64/x64_emitter.h index d67348e18..3ac92be3f 100644 --- a/src/alloy/backend/x64/x64_emitter.h +++ b/src/alloy/backend/x64/x64_emitter.h @@ -52,6 +52,7 @@ enum XmmConst { XMMOneOver255 = 13, XMMShiftMaskPS = 14, XMMShiftByteMask = 15, + XMMUnsignedDwordMax = 16, }; // Unfortunately due to the design of xbyak we have to pass this to the ctor. diff --git a/src/alloy/backend/x64/x64_sequences.cc b/src/alloy/backend/x64/x64_sequences.cc index 6e6d85da9..9ece83e67 100644 --- a/src/alloy/backend/x64/x64_sequences.cc +++ b/src/alloy/backend/x64/x64_sequences.cc @@ -2566,7 +2566,41 @@ EMITTER(VECTOR_ADD, MATCH(I, V128<>, V128<>>)) { } break; case INT32_TYPE: - XEASSERTALWAYS(); + if (saturate) { + if (is_unsigned) { + // We reuse all these temps... + XEASSERT(src1 != e.xmm0 && src1 != e.xmm1 && src1 != e.xmm2); + XEASSERT(src2 != e.xmm0 && src2 != e.xmm1 && src2 != e.xmm2); + // Clamp to 0xFFFFFFFF. + // Wish there was a vpaddusd... + // | A | B | C | D | + // | B | D | + e.db(0xCC); + e.vpsllq(e.xmm0, src1, 32); + e.vpsllq(e.xmm1, src2, 32); + e.vpsrlq(e.xmm0, 32); + e.vpsrlq(e.xmm1, 32); + e.vpaddq(e.xmm0, e.xmm1); + e.vpcmpgtq(e.xmm0, e.GetXmmConstPtr(XMMUnsignedDwordMax)); + e.vpsllq(e.xmm0, 32); + e.vpsrlq(e.xmm0, 32); + // | A | C | + e.vpsrlq(e.xmm1, src1, 32); + e.vpsrlq(e.xmm2, src2, 32); + e.vpaddq(e.xmm1, e.xmm2); + e.vpcmpgtq(e.xmm1, e.GetXmmConstPtr(XMMUnsignedDwordMax)); + e.vpsllq(e.xmm1, 32); + // xmm0 = mask for with saturated dwords == 111... + e.vpor(e.xmm0, e.xmm1); + e.vpaddd(dest, src1, src2); + // dest.f[n] = xmm1.f[n] ? xmm1.f[n] : dest.f[n]; + e.vblendvps(dest, dest, e.xmm1, e.xmm1); + } else { + XEASSERTALWAYS(); + } + } else { + e.vpaddd(dest, src1, src2); + } break; case FLOAT32_TYPE: e.vaddps(dest, src1, src2);