diff --git a/src/xenia/cpu/backend/x64/x64_emitter.cc b/src/xenia/cpu/backend/x64/x64_emitter.cc index 1a7f9f78e..a2a85e7db 100644 --- a/src/xenia/cpu/backend/x64/x64_emitter.cc +++ b/src/xenia/cpu/backend/x64/x64_emitter.cc @@ -677,6 +677,7 @@ static const vec128_t xmm_consts[] = { /* XMMIntMin */ vec128i(INT_MIN), /* XMMIntMax */ vec128i(INT_MAX), /* XMMIntMaxPD */ vec128d(INT_MAX), + /* XMMPosIntMinPS */ vec128f((float)0x80000000u), }; // First location to try and place constants. diff --git a/src/xenia/cpu/backend/x64/x64_emitter.h b/src/xenia/cpu/backend/x64/x64_emitter.h index 0d74ce948..a499b66c2 100644 --- a/src/xenia/cpu/backend/x64/x64_emitter.h +++ b/src/xenia/cpu/backend/x64/x64_emitter.h @@ -95,6 +95,7 @@ enum XmmConst { XMMIntMin, XMMIntMax, XMMIntMaxPD, + XMMPosIntMinPS, }; // Unfortunately due to the design of xbyak we have to pass this to the ctor. diff --git a/src/xenia/cpu/backend/x64/x64_sequences.cc b/src/xenia/cpu/backend/x64/x64_sequences.cc index d510fb46e..cd27eba12 100644 --- a/src/xenia/cpu/backend/x64/x64_sequences.cc +++ b/src/xenia/cpu/backend/x64/x64_sequences.cc @@ -1601,18 +1601,38 @@ struct VECTOR_CONVERT_F2I : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - Xmm src1 = i.src1; + if (i.instr->flags & ARITHMETIC_UNSIGNED) { + // clamp to min 0 + e.vmaxps(e.xmm0, i.src1, e.GetXmmConstPtr(XMMZero)); - // Copy src1 if necessary. - bool copy_src1 = !!(i.instr->flags & ARITHMETIC_SATURATE); - if (copy_src1 && i.dest == i.src1) { - e.vmovdqa(e.xmm1, i.src1); - src1 = e.xmm1; - } + // xmm1 = mask of values >= (unsigned)INT_MIN + e.vcmpgeps(e.xmm1, e.xmm0, e.GetXmmConstPtr(XMMPosIntMinPS)); + + // scale any values >= (unsigned)INT_MIN back to [0, ...] 
+ e.vsubps(e.xmm2, e.xmm0, e.GetXmmConstPtr(XMMPosIntMinPS)); + e.vandps(e.xmm2, e.xmm1, e.xmm2); // 0 if < (unsigned)INT_MIN + e.vandnps(e.xmm0, e.xmm1, e.xmm0); // 0 if >= (unsigned)INT_MIN + + // xmm0 = [0, INT_MAX] + // this may still contain values > INT_MAX (if src has vals > UINT_MAX) + e.vorps(e.xmm0, e.xmm0, e.xmm2); + e.vcvttps2dq(i.dest, e.xmm0); + + // xmm0 = mask of values that need saturation + e.vpcmpeqd(e.xmm0, i.dest, e.GetXmmConstPtr(XMMIntMin)); + + // scale values back above [INT_MIN, UINT_MAX] + e.vpand(e.xmm1, e.xmm1, e.GetXmmConstPtr(XMMIntMin)); + e.vpaddd(i.dest, i.dest, e.xmm1); + + // saturate values > UINT_MAX + e.vpor(i.dest, i.dest, e.xmm0); + } else { + Xmm src1 = e.xmm2; + e.vmovdqa(src1, i.src1); // Duplicate src1. + + e.vcvttps2dq(i.dest, i.src1); - e.vcvttps2dq(i.dest, i.src1); - if (i.instr->flags & ARITHMETIC_SATURATE && - !(i.instr->flags & ARITHMETIC_UNSIGNED)) { // if dest is indeterminate and i.src1 >= 0 (i.e. !(i.src1 & 0x80000000)) // i.dest = 0x7FFFFFFF e.vpcmpeqd(e.xmm0, i.dest, e.GetXmmConstPtr(XMMIntMin)); @@ -1621,8 +1641,6 @@ struct VECTOR_CONVERT_F2I // (high bit of xmm0 = is ind. && i.src1 >= 0) e.vblendvps(i.dest, i.dest, e.GetXmmConstPtr(XMMIntMax), e.xmm0); } - - // TODO(DrChat): Unsigned saturation! 
} }; EMITTER_OPCODE_TABLE(OPCODE_VECTOR_CONVERT_F2I, VECTOR_CONVERT_F2I); diff --git a/src/xenia/cpu/ppc/ppc_emit_altivec.cc b/src/xenia/cpu/ppc/ppc_emit_altivec.cc index c749203a9..577d04e62 100644 --- a/src/xenia/cpu/ppc/ppc_emit_altivec.cc +++ b/src/xenia/cpu/ppc/ppc_emit_altivec.cc @@ -554,7 +554,7 @@ int InstrEmit_vctsxs_(PPCHIRBuilder& f, uint32_t vd, uint32_t vb, float fuimm = static_cast(std::exp2(uimm)); Value* v = f.Mul(f.LoadVR(vb), f.Splat(f.LoadConstantFloat32(fuimm), VEC128_TYPE)); - v = f.VectorConvertF2I(v, ARITHMETIC_SATURATE); + v = f.VectorConvertF2I(v); f.StoreSAT(f.DidSaturate(v)); f.StoreVR(vd, v); return 0; @@ -572,7 +572,7 @@ int InstrEmit_vctuxs_(PPCHIRBuilder& f, uint32_t vd, uint32_t vb, float fuimm = static_cast(std::exp2(uimm)); Value* v = f.Mul(f.LoadVR(vb), f.Splat(f.LoadConstantFloat32(fuimm), VEC128_TYPE)); - v = f.VectorConvertF2I(v, ARITHMETIC_UNSIGNED | ARITHMETIC_SATURATE); + v = f.VectorConvertF2I(v, ARITHMETIC_UNSIGNED); f.StoreSAT(f.DidSaturate(v)); f.StoreVR(vd, v); return 0; diff --git a/src/xenia/cpu/ppc/testing/instr_vctuxs.s b/src/xenia/cpu/ppc/testing/instr_vctuxs.s new file mode 100644 index 000000000..cc6b10a84 --- /dev/null +++ b/src/xenia/cpu/ppc/testing/instr_vctuxs.s @@ -0,0 +1,79 @@ +# 0 * 2^31 +test_vctuxs_1: + #_ REGISTER_IN v0 [00000000, 00000000, 00000000, 00000000] + vctuxs v3, v0, 31 + blr + #_ REGISTER_OUT v0 [00000000, 00000000, 00000000, 00000000] + #_ REGISTER_OUT v3 [00000000, 00000000, 00000000, 00000000] + +# -0 * 2^31 +test_vctuxs_2: + #_ REGISTER_IN v0 [80000000, 80000000, 80000000, 80000000] + vctuxs v3, v0, 31 + blr + #_ REGISTER_OUT v0 [80000000, 80000000, 80000000, 80000000] + #_ REGISTER_OUT v3 [00000000, 00000000, 00000000, 00000000] + +# smallest positive subnormal * 2^31 +test_vctuxs_3: + #_ REGISTER_IN v0 [00000001, 00000001, 00000001, 00000001] + vctuxs v3, v0, 31 + blr + #_ REGISTER_OUT v0 [00000001, 00000001, 00000001, 00000001] + #_ REGISTER_OUT v3 [00000000, 00000000, 00000000, 
00000000] + +# largest subnormal * 2^31 +test_vctuxs_4: + #_ REGISTER_IN v0 [007FFFFF, 007FFFFF, 007FFFFF, 007FFFFF] + vctuxs v3, v0, 31 + blr + #_ REGISTER_OUT v0 [007FFFFF, 007FFFFF, 007FFFFF, 007FFFFF] + #_ REGISTER_OUT v3 [00000000, 00000000, 00000000, 00000000] + +# +1 * 2^0 +test_vctuxs_5: + #_ REGISTER_IN v0 [3F800000, 3F800000, 3F800000, 3F800000] + vctuxs v3, v0, 0 + blr + #_ REGISTER_OUT v0 [3F800000, 3F800000, 3F800000, 3F800000] + #_ REGISTER_OUT v3 [00000001, 00000001, 00000001, 00000001] + +# -1 * 2^0 +test_vctuxs_6: + #_ REGISTER_IN v0 [BF800000, BF800000, BF800000, BF800000] + vctuxs v3, v0, 0 + blr + #_ REGISTER_OUT v0 [BF800000, BF800000, BF800000, BF800000] + #_ REGISTER_OUT v3 [00000000, 00000000, 00000000, 00000000] + +# 2^31 * 2^0 +test_vctuxs_7: + #_ REGISTER_IN v0 [4F000000, 4F000000, 4F000000, 4F000000] + vctuxs v3, v0, 0 + blr + #_ REGISTER_OUT v0 [4F000000, 4F000000, 4F000000, 4F000000] + #_ REGISTER_OUT v3 [80000000, 80000000, 80000000, 80000000] + +# 2^32 * 2^0 +test_vctuxs_8: + #_ REGISTER_IN v0 [4F800000, 4F800000, 4F800000, 4F800000] + vctuxs v3, v0, 0 + blr + #_ REGISTER_OUT v0 [4F800000, 4F800000, 4F800000, 4F800000] + #_ REGISTER_OUT v3 [FFFFFFFF, FFFFFFFF, FFFFFFFF, FFFFFFFF] + +# +infinity * 2^0 +test_vctuxs_9: + #_ REGISTER_IN v0 [7F800000, 7F800000, 7F800000, 7F800000] + vctuxs v3, v0, 0 + blr + #_ REGISTER_OUT v0 [7F800000, 7F800000, 7F800000, 7F800000] + #_ REGISTER_OUT v3 [FFFFFFFF, FFFFFFFF, FFFFFFFF, FFFFFFFF] + +# -infinity * 2^0 +test_vctuxs_10: + #_ REGISTER_IN v0 [FF800000, FF800000, FF800000, FF800000] + vctuxs v3, v0, 0 + blr + #_ REGISTER_OUT v0 [FF800000, FF800000, FF800000, FF800000] + #_ REGISTER_OUT v3 [00000000, 00000000, 00000000, 00000000]