diff --git a/src/xenia/base/debug_visualizers.natvis b/src/xenia/base/debug_visualizers.natvis index 17e481c08..b5077dfc6 100644 --- a/src/xenia/base/debug_visualizers.natvis +++ b/src/xenia/base/debug_visualizers.natvis @@ -2,6 +2,31 @@ + + + {(((value & 0xFF00000000000000) >> 56) | + ((value & 0x00FF000000000000) >> 40) | + ((value & 0x0000FF0000000000) >> 24) | + ((value & 0x000000FF00000000) >> 8 ) | + ((value & 0x00000000FF000000) << 8 ) | + ((value & 0x0000000000FF0000) << 24) | + ((value & 0x000000000000FF00) << 40) | + ((value & 0x00000000000000FF) << 56))} + + + + + {(((value & 0xFF00000000000000) >> 56) | + ((value & 0x00FF000000000000) >> 40) | + ((value & 0x0000FF0000000000) >> 24) | + ((value & 0x000000FF00000000) >> 8 ) | + ((value & 0x00000000FF000000) << 8 ) | + ((value & 0x0000000000FF0000) << 24) | + ((value & 0x000000000000FF00) << 40) | + ((value & 0x00000000000000FF) << 56))} + + + {(((value & 0xFF000000) >> 24) | diff --git a/src/xenia/base/vec128.h b/src/xenia/base/vec128.h index 0d5e985eb..139227cc5 100644 --- a/src/xenia/base/vec128.h +++ b/src/xenia/base/vec128.h @@ -105,12 +105,54 @@ typedef struct alignas(16) vec128_s { }; }; + vec128_s() = default; + vec128_s(const vec128_s& other) { + high = other.high; + low = other.low; + } + + vec128_s& operator=(const vec128_s& b) { + high = b.high; + low = b.low; + return *this; + } + bool operator==(const vec128_s& b) const { return low == b.low && high == b.high; } bool operator!=(const vec128_s& b) const { return low != b.low || high != b.high; } + vec128_s operator^(const vec128_s& b) const { + vec128_s a = *this; + a.high ^= b.high; + a.low ^= b.low; + return a; + }; + vec128_s& operator^=(const vec128_s& b) { + *this = *this ^ b; + return *this; + }; + vec128_s operator&(const vec128_s& b) const { + vec128_s a = *this; + a.high &= b.high; + a.low &= b.low; + return a; + }; + vec128_s& operator&=(const vec128_s& b) { + *this = *this & b; + return *this; + }; + vec128_s operator|(const vec128_s& b) const { + vec128_s a = *this; + a.high |= b.high; + a.low |= b.low; + return a; + }; + vec128_s& operator|=(const vec128_s& b) { + *this = *this | b; + return *this; + }; } vec128_t; static inline vec128_t vec128i(uint32_t src) { diff --git a/src/xenia/cpu/backend/x64/x64_code_cache.h b/src/xenia/cpu/backend/x64/x64_code_cache.h index 5795f85d7..8fef0273e 100644 --- a/src/xenia/cpu/backend/x64/x64_code_cache.h +++ b/src/xenia/cpu/backend/x64/x64_code_cache.h @@ -70,7 +70,7 @@ class X64CodeCache : public CodeCache { // This is picked to be high enough to cover whatever we can reasonably // expect. If we hit issues with this it probably means some corner case // in analysis triggering. - static const size_t kMaximumFunctionCount = 30000; + static const size_t kMaximumFunctionCount = 50000; struct UnwindReservation { size_t data_size = 0; diff --git a/src/xenia/cpu/backend/x64/x64_sequences.cc b/src/xenia/cpu/backend/x64/x64_sequences.cc index 2aa290952..60afde294 100644 --- a/src/xenia/cpu/backend/x64/x64_sequences.cc +++ b/src/xenia/cpu/backend/x64/x64_sequences.cc @@ -572,6 +572,7 @@ struct Sequence { e.LoadConstantXmm(e.xmm0, i.src1.constant()); fn(e, i.dest, e.xmm0, i.src2); } else if (i.src2.is_constant) { + assert_true(!i.src1.is_constant); e.LoadConstantXmm(e.xmm0, i.src2.constant()); fn(e, i.dest, i.src1, e.xmm0); } else { @@ -2715,26 +2716,46 @@ struct SELECT_F32 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { // TODO(benvanik): find a shorter sequence. - // xmm0 = src1 != 0 ? 1111... : 0000.... 
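// Reference sketch (not part of the patch): the new natvis DisplayString entries above
// render big-endian 64-bit values by swapping bytes with masks and shifts. A host-side
// C++ equivalent of that exact expression, useful for sanity-checking it, might look
// like this (ByteSwap64 is an illustrative name, not an API from the tree):
#include <cstdint>

constexpr uint64_t ByteSwap64(uint64_t value) {
  return ((value & 0xFF00000000000000ull) >> 56) |
         ((value & 0x00FF000000000000ull) >> 40) |
         ((value & 0x0000FF0000000000ull) >> 24) |
         ((value & 0x000000FF00000000ull) >> 8) |
         ((value & 0x00000000FF000000ull) << 8) |
         ((value & 0x0000000000FF0000ull) << 24) |
         ((value & 0x000000000000FF00ull) << 40) |
         ((value & 0x00000000000000FFull) << 56);
}

static_assert(ByteSwap64(0x0123456789ABCDEFull) == 0xEFCDAB8967452301ull,
              "byte order is reversed");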
+ // dest = src1 != 0 ? src2 : src3 e.movzx(e.eax, i.src1); e.vmovd(e.xmm1, e.eax); e.vxorps(e.xmm0, e.xmm0); - e.vcmpneqss(e.xmm0, e.xmm1); - e.vpand(e.xmm1, e.xmm0, i.src2); - e.vpandn(i.dest, e.xmm0, i.src3); + e.vpcmpeqd(e.xmm0, e.xmm1); + + Xmm src2 = i.src2.is_constant ? e.xmm2 : i.src2; + if (i.src2.is_constant) { + e.LoadConstantXmm(e.xmm2, i.src2.constant()); + } + e.vpandn(e.xmm1, e.xmm0, src2); + + Xmm src3 = i.src3.is_constant ? e.xmm2 : i.src3; + if (i.src3.is_constant) { + e.LoadConstantXmm(e.xmm2, i.src3.constant()); + } + e.vpand(i.dest, e.xmm0, src3); e.vpor(i.dest, e.xmm1); } }; struct SELECT_F64 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - // xmm0 = src1 != 0 ? 1111... : 0000.... + // dest = src1 != 0 ? src2 : src3 e.movzx(e.eax, i.src1); e.vmovd(e.xmm1, e.eax); - e.vxorpd(e.xmm0, e.xmm0); - e.vcmpneqsd(e.xmm0, e.xmm1); - e.vpand(e.xmm1, e.xmm0, i.src2); - e.vpandn(i.dest, e.xmm0, i.src3); + e.vpxor(e.xmm0, e.xmm0); + e.vpcmpeqq(e.xmm0, e.xmm1); + + Xmm src2 = i.src2.is_constant ? e.xmm2 : i.src2; + if (i.src2.is_constant) { + e.LoadConstantXmm(e.xmm2, i.src2.constant()); + } + e.vpandn(e.xmm1, e.xmm0, src2); + + Xmm src3 = i.src3.is_constant ? e.xmm2 : i.src3; + if (i.src3.is_constant) { + e.LoadConstantXmm(e.xmm2, i.src3.constant()); + } + e.vpand(i.dest, e.xmm0, src3); e.vpor(i.dest, e.xmm1); } }; @@ -2742,14 +2763,24 @@ struct SELECT_V128_I8 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { // TODO(benvanik): find a shorter sequence. - // xmm0 = src1 != 0 ? 1111... : 0000.... + // dest = src1 != 0 ? src2 : src3 e.movzx(e.eax, i.src1); e.vmovd(e.xmm1, e.eax); e.vpbroadcastd(e.xmm1, e.xmm1); e.vxorps(e.xmm0, e.xmm0); - e.vcmpneqps(e.xmm0, e.xmm1); - e.vpand(e.xmm1, e.xmm0, i.src2); - e.vpandn(i.dest, e.xmm0, i.src3); + e.vpcmpeqd(e.xmm0, e.xmm1); + + Xmm src2 = i.src2.is_constant ? e.xmm2 : i.src2; + if (i.src2.is_constant) { + e.LoadConstantXmm(e.xmm2, i.src2.constant()); + } + e.vpandn(e.xmm1, e.xmm0, src2); + + Xmm src3 = i.src3.is_constant ? e.xmm2 : i.src3; + if (i.src3.is_constant) { + e.LoadConstantXmm(e.xmm2, i.src3.constant()); + } + e.vpand(i.dest, e.xmm0, src3); e.vpor(i.dest, e.xmm1); } }; @@ -2757,26 +2788,24 @@ struct SELECT_V128_V128 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - // TODO(benvanik): could be made shorter when consts involved. + Xmm src1 = i.src1.is_constant ? e.xmm1 : i.src1; + if (i.src1.is_constant) { + e.LoadConstantXmm(e.xmm1, i.src1.constant()); + } + + Xmm src2 = i.src2.is_constant ? e.xmm0 : i.src2; if (i.src2.is_constant) { - if (i.src2.value->IsConstantZero()) { - e.vpxor(e.xmm1, e.xmm1); - } else { - assert_always(); - } - } else { - e.vpandn(e.xmm1, i.src1, i.src2); + e.LoadConstantXmm(e.xmm0, i.src2.constant()); } + e.vpandn(e.xmm0, src1, src2); + + Xmm src3 = i.src3.is_constant ? 
i.dest : i.src3; if (i.src3.is_constant) { - if (i.src3.value->IsConstantZero()) { - e.vpxor(i.dest, i.dest); - } else { - assert_always(); - } - } else { - e.vpand(i.dest, i.src1, i.src3); + e.LoadConstantXmm(i.dest, i.src3.constant()); } - e.vpor(i.dest, e.xmm1); + e.vpand(i.dest, src1, src3); + + e.vpor(i.dest, i.dest, e.xmm0); } }; EMITTER_OPCODE_TABLE(OPCODE_SELECT, SELECT_I8, SELECT_I16, SELECT_I32, @@ -2926,14 +2955,20 @@ struct COMPARE_EQ_I64 struct COMPARE_EQ_F32 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - e.vcomiss(i.src1, i.src2); + EmitCommutativeBinaryXmmOp( + e, i, [&i](X64Emitter& e, I8Op dest, const Xmm& src1, const Xmm& src2) { + e.vcomiss(src1, src2); + }); e.sete(i.dest); } }; struct COMPARE_EQ_F64 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - e.vcomisd(i.src1, i.src2); + EmitCommutativeBinaryXmmOp( + e, i, [&i](X64Emitter& e, I8Op dest, const Xmm& src1, const Xmm& src2) { + e.vcomisd(src1, src2); + }); e.sete(i.dest); } }; @@ -3210,6 +3245,9 @@ struct VECTOR_COMPARE_UGT_V128 case FLOAT32_TYPE: sign_addr = e.GetXmmConstPtr(XMMSignMaskF32); break; + default: + assert_always(); + break; } if (i.src1.is_constant) { // TODO(benvanik): make this constant. @@ -3418,43 +3456,9 @@ EMITTER_OPCODE_TABLE(OPCODE_ADD_CARRY, ADD_CARRY_I8, ADD_CARRY_I16, // ============================================================================ struct VECTOR_ADD : Sequence> { - static __m128i EmulateVectorAddUnsignedSatI32(void*, __m128i src1, - __m128i src2) { - alignas(16) uint32_t a[4]; - alignas(16) uint32_t b[4]; - _mm_store_si128(reinterpret_cast<__m128i*>(a), src1); - _mm_store_si128(reinterpret_cast<__m128i*>(b), src2); - for (size_t i = 0; i < 4; ++i) { - uint64_t v = (uint64_t)a[i] + (uint64_t)b[i]; - if (v > 0xFFFFFFFF) { - a[i] = 0xFFFFFFFF; - } else { - a[i] = (uint32_t)v; - } - } - return _mm_load_si128(reinterpret_cast<__m128i*>(a)); - } - static __m128i EmulateVectorAddSignedSatI32(void*, __m128i src1, - __m128i src2) { - alignas(16) int32_t a[4]; - alignas(16) int32_t b[4]; - _mm_store_si128(reinterpret_cast<__m128i*>(a), src1); - _mm_store_si128(reinterpret_cast<__m128i*>(b), src2); - for (size_t i = 0; i < 4; ++i) { - int64_t v = (int64_t)a[i] + (int64_t)b[i]; - if (v > 0x7FFFFFFF) { - a[i] = 0x7FFFFFFF; - } else if (v < -0x80000000ll) { - a[i] = 0x80000000; - } else { - a[i] = (uint32_t)v; - } - } - return _mm_load_si128(reinterpret_cast<__m128i*>(a)); - } static void Emit(X64Emitter& e, const EmitArgType& i) { EmitCommutativeBinaryXmmOp(e, i, [&i](X64Emitter& e, const Xmm& dest, - const Xmm& src1, const Xmm& src2) { + Xmm src1, Xmm src2) { const TypeName part_type = static_cast(i.instr->flags & 0xFF); const uint32_t arithmetic_flags = i.instr->flags >> 8; bool is_unsigned = !!(arithmetic_flags & ARITHMETIC_UNSIGNED); @@ -3487,71 +3491,56 @@ struct VECTOR_ADD case INT32_TYPE: if (saturate) { if (is_unsigned) { - // TODO(benvanik): broken with UINT32MAX+1 - //// We reuse all these temps... - // assert_true(src1 != e.xmm0 && src1 != e.xmm1 && src1 != - // e.xmm2); - // assert_true(src2 != e.xmm0 && src2 != e.xmm1 && src2 != - // e.xmm2); - //// Clamp to 0xFFFFFFFF. - //// Wish there was a vpaddusd... 
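// Reference sketch (not part of the patch): the reworked SELECT_F32/SELECT_F64/
// SELECT_V128_I8 sequences above build an all-ones mask where src1 == 0
// (vpcmpeqd/vpcmpeqq against zero) and blend with pandn/pand/por; the V128-by-V128
// form uses src1 directly as a per-bit mask. A scalar model of one 32-bit lane of the
// scalar-conditioned forms (SelectLane is an illustrative name):
#include <cstdint>

uint32_t SelectLane(uint32_t src1, uint32_t src2, uint32_t src3) {
  // Mask is all ones when the condition is zero, matching vpcmpeqd against zero.
  uint32_t mask = (src1 == 0) ? 0xFFFFFFFFu : 0u;
  // vpandn keeps src2 where the condition was nonzero, vpand keeps src3 where it was
  // zero, vpor merges: dest = src1 != 0 ? src2 : src3.
  return (~mask & src2) | (mask & src3);
}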
- //// | A | B | C | D | - //// | B | D | - // e.vpsllq(e.xmm0, src1, 32); - // e.vpsllq(e.xmm1, src2, 32); - // e.vpsrlq(e.xmm0, 32); - // e.vpsrlq(e.xmm1, 32); - // e.vpaddq(e.xmm0, e.xmm1); - // e.vpcmpgtq(e.xmm0, e.GetXmmConstPtr(XMMUnsignedDwordMax)); - // e.vpsllq(e.xmm0, 32); - // e.vpsrlq(e.xmm0, 32); - //// | A | C | - // e.vpsrlq(e.xmm1, src1, 32); - // e.vpsrlq(e.xmm2, src2, 32); - // e.vpaddq(e.xmm1, e.xmm2); - // e.vpcmpgtq(e.xmm1, e.GetXmmConstPtr(XMMUnsignedDwordMax)); - // e.vpsllq(e.xmm1, 32); - //// xmm0 = mask for with saturated dwords == 111... - // e.vpor(e.xmm0, e.xmm1); - // e.vpaddd(dest, src1, src2); - //// dest.f[n] = xmm1.f[n] ? xmm1.f[n] : dest.f[n]; - // e.vblendvps(dest, dest, e.xmm1, e.xmm1); - if (i.src2.is_constant) { - e.LoadConstantXmm(e.xmm0, i.src2.constant()); - e.lea(e.r9, e.StashXmm(1, e.xmm0)); - } else { - e.lea(e.r9, e.StashXmm(1, i.src2)); - } - e.lea(e.r8, e.StashXmm(0, i.src1)); - e.CallNativeSafe( - reinterpret_cast(EmulateVectorAddUnsignedSatI32)); - e.vmovaps(i.dest, e.xmm0); + // xmm0 is the only temp register that can be used by src1/src2. + e.vpaddd(e.xmm1, src1, src2); + + // If result is smaller than either of the inputs, we've + // overflowed (only need to check one input) + // if (src1 > res) then overflowed + // http://locklessinc.com/articles/sat_arithmetic/ + e.vpxor(e.xmm2, src1, e.GetXmmConstPtr(XMMSignMaskI32)); + e.vpxor(e.xmm0, e.xmm1, e.GetXmmConstPtr(XMMSignMaskI32)); + e.vpcmpgtd(e.xmm0, e.xmm2, e.xmm0); + e.vpor(dest, e.xmm1, e.xmm0); } else { - // https://software.intel.com/en-us/forums/topic/285219 - // TODO(benvanik): this is broken with INTMAX+1. - // We reuse all these temps... - // assert_true(src1 != e.xmm0 && src1 != e.xmm1 && src1 != - // e.xmm2); - // assert_true(src2 != e.xmm0 && src2 != e.xmm1 && src2 != - // e.xmm2); - // e.vpaddd(e.xmm0, src1, src2); // res - // e.vpand(e.xmm1, src1, src2); // sign_and - // e.vpandn(e.xmm2, e.xmm0, e.xmm1); // min_sat_mask - // e.vblendvps(dest, e.xmm0, e.GetXmmConstPtr(XMMSignMaskPS), - // e.xmm2); - // e.vpor(e.xmm1, src1, src2); // sign_or - // e.vpandn(e.xmm1, e.xmm0); // max_sat_mask - // e.vblendvps(dest, e.GetXmmConstPtr(XMMAbsMaskPS), e.xmm1); - if (i.src2.is_constant) { - e.LoadConstantXmm(e.xmm0, i.src2.constant()); - e.lea(e.r9, e.StashXmm(1, e.xmm0)); - } else { - e.lea(e.r9, e.StashXmm(1, i.src2)); + // Preserve the sources. + if (dest == src1) { + e.vmovdqa(e.xmm2, src1); + src1 = e.xmm2; } - e.lea(e.r8, e.StashXmm(0, i.src1)); - e.CallNativeSafe( - reinterpret_cast(EmulateVectorAddSignedSatI32)); - e.vmovaps(i.dest, e.xmm0); + if (dest == src2) { + e.vmovdqa(e.xmm1, src2); + src2 = e.xmm1; + } + + // xmm0 is the only temp register that can be used by src1/src2. + e.vpaddd(dest, src1, src2); + + // Overflow results if two inputs are the same sign and the result + // isn't the same sign. + // if ((s32b)(~(src1 ^ src2) & (src1 ^ res)) < 0) then overflowed + // http://locklessinc.com/articles/sat_arithmetic/ + e.vpxor(e.xmm1, src1, src2); + + // Move src1 to xmm0 in-case it was the same register as the dest. + // This kills src2 if it's a constant. + if (src1 != e.xmm0) { + e.vmovdqa(e.xmm0, src1); + src1 = e.xmm0; + } + + e.vpxor(e.xmm2, src1, dest); + e.vpandn(e.xmm1, e.xmm1, e.xmm2); + + // High bit of xmm1 is now set if overflowed. 
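// Reference sketch (not part of the patch): a scalar model of the unsigned
// saturating-add path emitted above (vpaddd, then a sign-biased vpcmpgtd to detect
// wraparound, then vpor to force saturated lanes to all ones). AddUnsignedSat32 is an
// illustrative name.
#include <cstdint>

uint32_t AddUnsignedSat32(uint32_t a, uint32_t b) {
  uint32_t res = a + b;                              // vpaddd, wraps on overflow
  // If the wrapped result is smaller than an input, the add overflowed.
  uint32_t overflow = (a > res) ? 0xFFFFFFFFu : 0u;  // sign-biased vpcmpgtd mask
  return res | overflow;                             // vpor: clamp to UINT32_MAX
}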
+ + // Set any negative overflowed elements of src1 to INT_MIN + e.vpand(e.xmm2, src1, e.xmm1); + e.vblendvps(dest, dest, e.GetXmmConstPtr(XMMSignMaskI32), e.xmm2); + + // Set any positive overflowed elements of src1 to INT_MAX + e.vpandn(e.xmm2, src1, e.xmm1); + e.vblendvps(dest, dest, e.GetXmmConstPtr(XMMAbsMaskPS), e.xmm2); } } else { e.vpaddd(dest, src1, src2); @@ -3630,22 +3619,9 @@ EMITTER_OPCODE_TABLE(OPCODE_SUB, SUB_I8, SUB_I16, SUB_I32, SUB_I64, SUB_F32, // ============================================================================ struct VECTOR_SUB : Sequence> { - static __m128i EmulateVectorSubSignedSatI32(void*, __m128i src1, - __m128i src2) { - alignas(16) int32_t src1v[4]; - alignas(16) int32_t src2v[4]; - alignas(16) int32_t value[4]; - _mm_store_si128(reinterpret_cast<__m128i*>(src1v), src1); - _mm_store_si128(reinterpret_cast<__m128i*>(src2v), src2); - for (size_t i = 0; i < 4; ++i) { - auto t = int64_t(src1v[i]) - int64_t(src2v[i]); - value[i] = t < INT_MIN ? INT_MIN : (t > INT_MAX ? INT_MAX : int32_t(t)); - } - return _mm_load_si128(reinterpret_cast<__m128i*>(value)); - } static void Emit(X64Emitter& e, const EmitArgType& i) { EmitCommutativeBinaryXmmOp(e, i, [&i](X64Emitter& e, const Xmm& dest, - const Xmm& src1, const Xmm& src2) { + Xmm src1, Xmm src2) { const TypeName part_type = static_cast(i.instr->flags & 0xFF); const uint32_t arithmetic_flags = i.instr->flags >> 8; bool is_unsigned = !!(arithmetic_flags & ARITHMETIC_UNSIGNED); @@ -3678,13 +3654,57 @@ struct VECTOR_SUB case INT32_TYPE: if (saturate) { if (is_unsigned) { - assert_always(); + // xmm0 is the only temp register that can be used by src1/src2. + e.vpsubd(e.xmm1, src1, src2); + + // If result is greater than either of the inputs, we've + // underflowed (only need to check one input) + // if (res > src1) then underflowed + // http://locklessinc.com/articles/sat_arithmetic/ + e.vpxor(e.xmm2, src1, e.GetXmmConstPtr(XMMSignMaskI32)); + e.vpxor(e.xmm0, e.xmm1, e.GetXmmConstPtr(XMMSignMaskI32)); + e.vpcmpgtd(e.xmm0, e.xmm0, e.xmm2); + e.vpandn(dest, e.xmm0, e.xmm1); } else { - e.lea(e.r8, e.StashXmm(0, i.src1)); - e.lea(e.r9, e.StashXmm(1, i.src2)); - e.CallNativeSafe( - reinterpret_cast(EmulateVectorSubSignedSatI32)); - e.vmovaps(i.dest, e.xmm0); + // Preserve the sources. + if (dest == src1) { + e.vmovdqa(e.xmm2, src1); + src1 = e.xmm2; + } + if (dest == src2) { + e.vmovdqa(e.xmm1, src2); + src2 = e.xmm1; + } + + // xmm0 is the only temp register that can be used by src1/src2. + e.vpsubd(dest, src1, src2); + + // We can only overflow if the signs of the operands are opposite. + // If signs are opposite and result sign isn't the same as src1's + // sign, we've overflowed. + // if ((s32b)((src1 ^ src2) & (src1 ^ res)) < 0) then overflowed + // http://locklessinc.com/articles/sat_arithmetic/ + e.vpxor(e.xmm1, src1, src2); + + // Move src1 to xmm0 in-case it's the same register as the dest. + // This kills src2 if it's a constant. + if (src1 != e.xmm0) { + e.vmovdqa(e.xmm0, src1); + src1 = e.xmm0; + } + + e.vpxor(e.xmm2, src1, dest); + e.vpand(e.xmm1, e.xmm1, e.xmm2); + + // High bit of xmm1 is now set if overflowed. 
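// Reference sketch (not part of the patch): a scalar model of the signed
// saturating-add path above. Overflow occurs only when both operands share a sign and
// the result's sign differs; overflowed lanes are then clamped to INT_MIN or INT_MAX
// based on src1's sign, mirroring the two vblendvps steps. AddSignedSat32 is an
// illustrative name.
#include <cstdint>
#include <limits>

int32_t AddSignedSat32(int32_t a, int32_t b) {
  uint32_t ua = uint32_t(a), ub = uint32_t(b);
  uint32_t res = ua + ub;                                  // vpaddd
  bool overflowed = ((~(ua ^ ub) & (ua ^ res)) >> 31) != 0;
  if (!overflowed) {
    return int32_t(res);
  }
  return a < 0 ? std::numeric_limits<int32_t>::min()       // negative overflow
               : std::numeric_limits<int32_t>::max();      // positive overflow
}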
+ + // Set any negative overflowed elements of src1 to INT_MIN + e.vpand(e.xmm2, src1, e.xmm1); + e.vblendvps(dest, dest, e.GetXmmConstPtr(XMMSignMaskI32), e.xmm2); + + // Set any positive overflowed elements of src1 to INT_MAX + e.vpandn(e.xmm2, src1, e.xmm1); + e.vblendvps(dest, dest, e.GetXmmConstPtr(XMMAbsMaskPS), e.xmm2); } } else { e.vpsubd(dest, src1, src2); @@ -4361,68 +4381,113 @@ EMITTER_OPCODE_TABLE(OPCODE_DIV, DIV_I8, DIV_I16, DIV_I32, DIV_I64, DIV_F32, // ============================================================================ // d = 1 * 2 + 3 // $0 = $1x$0 + $2 -// TODO(benvanik): use other forms (132/213/etc) to avoid register shuffling. -// dest could be src2 or src3 - need to ensure it's not before overwriting dest -// perhaps use other 132/213/etc -// Forms: +// Forms of vfmadd/vfmsub: // - 132 -> $1 = $1 * $3 + $2 // - 213 -> $1 = $2 * $1 + $3 // - 231 -> $1 = $2 * $3 + $1 struct MUL_ADD_F32 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { + // Calculate the multiply part if it's constant. + // TODO: Do this in the constant propagation pass. + if (i.src1.is_constant && i.src2.is_constant) { + float mul = i.src1.constant() * i.src2.constant(); + + e.LoadConstantXmm(e.xmm0, mul); + e.vaddss(i.dest, e.xmm0, i.src3); + return; + } + // FMA extension if (e.IsFeatureEnabled(kX64EmitFMA)) { - if (i.dest == i.src1) { - e.vfmadd213ss(i.dest, i.src2, i.src3); - } else if (i.dest == i.src2) { - e.vfmadd213ss(i.dest, i.src1, i.src3); - } else if (i.dest == i.src3) { - e.vfmadd231ss(i.dest, i.src1, i.src2); - } else { - // Dest not equal to anything - e.vmovss(i.dest, i.src1); - e.vfmadd213ss(i.dest, i.src2, i.src3); - } + EmitCommutativeBinaryXmmOp(e, i, [&i](X64Emitter& e, const Xmm& dest, + const Xmm& src1, const Xmm& src2) { + if (i.dest == src1) { + e.vfmadd213ss(i.dest, src2, i.src3); + } else if (i.dest == src2) { + e.vfmadd213ss(i.dest, src1, i.src3); + } else if (i.dest == i.src3) { + e.vfmadd231ss(i.dest, src1, src2); + } else { + // Dest not equal to anything + e.vmovss(i.dest, src1); + e.vfmadd213ss(i.dest, src2, i.src3); + } + }); } else { - // If i.dest == i.src3, back up i.src3 so we don't overwrite it. - Xmm src3 = i.src3; - if (i.dest == i.src3) { - e.vmovss(e.xmm0, i.src3); - src3 = e.xmm0; + Xmm src3; + if (i.src3.is_constant) { + e.LoadConstantXmm(e.xmm1, i.src3.constant()); + src3 = e.xmm1; + } else { + // If i.dest == i.src3, back up i.src3 so we don't overwrite it. + src3 = i.src3; + if (i.dest == i.src3) { + e.vmovss(e.xmm1, i.src3); + src3 = e.xmm1; + } } - e.vmulss(i.dest, i.src1, i.src2); // $0 = $1 * $2 - e.vaddss(i.dest, i.dest, src3); // $0 = $1 + $2 + // Multiply operation is commutative. + EmitCommutativeBinaryXmmOp( + e, i, [&i](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { + e.vmulss(dest, src1, src2); // $0 = $1 * $2 + }); + + e.vaddss(i.dest, i.dest, src3); // $0 = $1 + $2 } } }; struct MUL_ADD_F64 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { + // Calculate the multiply part if it's constant. + // TODO: Do this in the constant propagation pass. 
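// Reference sketch (not part of the patch): a scalar model of the signed
// saturating-subtract path above. For subtraction, overflow is only possible when the
// operands have opposite signs, and it shows up as the result disagreeing with src1's
// sign. SubSignedSat32 is an illustrative name.
#include <cstdint>
#include <limits>

int32_t SubSignedSat32(int32_t a, int32_t b) {
  uint32_t ua = uint32_t(a), ub = uint32_t(b);
  uint32_t res = ua - ub;                                  // vpsubd
  bool overflowed = (((ua ^ ub) & (ua ^ res)) >> 31) != 0;
  if (!overflowed) {
    return int32_t(res);
  }
  return a < 0 ? std::numeric_limits<int32_t>::min()
               : std::numeric_limits<int32_t>::max();
}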
+ if (i.src1.is_constant && i.src2.is_constant) { + double mul = i.src1.constant() * i.src2.constant(); + + e.LoadConstantXmm(e.xmm0, mul); + e.vaddsd(i.dest, e.xmm0, i.src3); + return; + } + // FMA extension if (e.IsFeatureEnabled(kX64EmitFMA)) { - if (i.dest == i.src1) { - e.vfmadd213sd(i.dest, i.src2, i.src3); - } else if (i.dest == i.src2) { - e.vfmadd213sd(i.dest, i.src1, i.src3); - } else if (i.dest == i.src3) { - e.vfmadd231sd(i.dest, i.src1, i.src2); - } else { - // Dest not equal to anything - e.vmovsd(i.dest, i.src1); - e.vfmadd213sd(i.dest, i.src2, i.src3); - } + EmitCommutativeBinaryXmmOp(e, i, [&i](X64Emitter& e, const Xmm& dest, + const Xmm& src1, const Xmm& src2) { + if (i.dest == src1) { + e.vfmadd213sd(i.dest, src2, i.src3); + } else if (i.dest == src2) { + e.vfmadd213sd(i.dest, src1, i.src3); + } else if (i.dest == i.src3) { + e.vfmadd231sd(i.dest, src1, src2); + } else { + // Dest not equal to anything + e.vmovsd(i.dest, src1); + e.vfmadd213sd(i.dest, src2, i.src3); + } + }); } else { - // If i.dest == i.src3, back up i.src3 so we don't overwrite it. - Xmm src3 = i.src3; - if (i.dest == i.src3) { - e.vmovsd(e.xmm0, i.src3); - src3 = e.xmm0; + Xmm src3; + if (i.src3.is_constant) { + e.LoadConstantXmm(e.xmm1, i.src3.constant()); + src3 = e.xmm1; + } else { + // If i.dest == i.src3, back up i.src3 so we don't overwrite it. + src3 = i.src3; + if (i.dest == i.src3) { + e.vmovsd(e.xmm1, i.src3); + src3 = e.xmm1; + } } - e.vmulsd(i.dest, i.src1, i.src2); // $0 = $1 * $2 - e.vaddsd(i.dest, i.dest, src3); // $0 = $1 + $2 + // Multiply operation is commutative. + EmitCommutativeBinaryXmmOp( + e, i, [&i](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { + e.vmulsd(dest, src1, src2); // $0 = $1 * $2 + }); + + e.vaddsd(i.dest, i.dest, src3); // $0 = $1 + $2 } } }; @@ -4430,37 +4495,58 @@ struct MUL_ADD_V128 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { + // Calculate the multiply part if it's constant. + // TODO: Do this in the constant propagation pass. + if (i.src1.is_constant && i.src2.is_constant) { + vec128_t mul; + for (int n = 0; n < 4; n++) { + mul.f32[n] = i.src1.constant().f32[n] * i.src2.constant().f32[n]; + } + + e.LoadConstantXmm(e.xmm0, mul); + e.vaddps(i.dest, e.xmm0, i.src3); + return; + } + // TODO(benvanik): the vfmadd sequence produces slightly different results // than vmul+vadd and it'd be nice to know why. Until we know, it's // disabled so tests pass. if (false && e.IsFeatureEnabled(kX64EmitFMA)) { - if (i.dest == i.src1) { - e.vfmadd213ps(i.dest, i.src2, i.src3); - } else if (i.dest == i.src2) { - e.vfmadd213ps(i.dest, i.src1, i.src3); - } else if (i.dest == i.src3) { - e.vfmadd231ps(i.dest, i.src1, i.src2); - } else { - // Dest not equal to anything - e.vmovdqa(i.dest, i.src1); - e.vfmadd213ps(i.dest, i.src2, i.src3); - } + EmitCommutativeBinaryXmmOp(e, i, [&i](X64Emitter& e, const Xmm& dest, + const Xmm& src1, const Xmm& src2) { + if (i.dest == src1) { + e.vfmadd213ps(i.dest, src2, i.src3); + } else if (i.dest == src2) { + e.vfmadd213ps(i.dest, src1, i.src3); + } else if (i.dest == i.src3) { + e.vfmadd231ps(i.dest, src1, src2); + } else { + // Dest not equal to anything + e.vmovdqa(i.dest, src1); + e.vfmadd213ps(i.dest, src2, i.src3); + } + }); } else { Xmm src3; if (i.src3.is_constant) { - e.LoadConstantXmm(e.xmm0, i.src3.constant()); - src3 = e.xmm0; + e.LoadConstantXmm(e.xmm1, i.src3.constant()); + src3 = e.xmm1; } else { // If i.dest == i.src3, back up i.src3 so we don't overwrite it. 
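// Reference sketch (not part of the patch): what the three vfmadd/vfmsub encodings
// used above compute, per the 132/213/231 forms listed at the top of this section.
// All of them evaluate a multiply-add; the suffix only fixes which operand slot is
// multiplied with which, which is why the emitter picks the form whose first slot is
// already the destination register. Function names are illustrative.
float MulAdd132(float op1, float op2, float op3) { return op1 * op3 + op2; }
float MulAdd213(float op1, float op2, float op3) { return op2 * op1 + op3; }
float MulAdd231(float op1, float op2, float op3) { return op2 * op3 + op1; }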
src3 = i.src3; if (i.dest == i.src3) { - e.vmovdqa(e.xmm0, i.src3); - src3 = e.xmm0; + e.vmovdqa(e.xmm1, i.src3); + src3 = e.xmm1; } } - e.vmulps(i.dest, i.src1, i.src2); // $0 = $1 * $2 - e.vaddps(i.dest, i.dest, src3); // $0 = $1 + $2 + // Multiply operation is commutative. + EmitCommutativeBinaryXmmOp( + e, i, [&i](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { + e.vmulps(dest, src1, src2); // $0 = $1 * $2 + }); + + e.vaddps(i.dest, i.dest, src3); // $0 = $1 + $2 } } }; @@ -4481,58 +4567,106 @@ EMITTER_OPCODE_TABLE(OPCODE_MUL_ADD, MUL_ADD_F32, MUL_ADD_F64, MUL_ADD_V128); struct MUL_SUB_F32 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { + // Calculate the multiply part if it's constant. + // TODO: Do this in the constant propagation pass. + if (i.src1.is_constant && i.src2.is_constant) { + float mul = i.src1.constant() * i.src2.constant(); + + e.LoadConstantXmm(e.xmm0, mul); + e.vsubss(i.dest, e.xmm0, i.src3); + return; + } + // FMA extension if (e.IsFeatureEnabled(kX64EmitFMA)) { - if (i.dest == i.src1) { - e.vfmsub213ss(i.dest, i.src2, i.src3); - } else if (i.dest == i.src2) { - e.vfmsub213ss(i.dest, i.src1, i.src3); - } else if (i.dest == i.src3) { - e.vfmsub231ss(i.dest, i.src1, i.src2); - } else { - // Dest not equal to anything - e.vmovss(i.dest, i.src1); - e.vfmsub213ss(i.dest, i.src2, i.src3); - } + EmitCommutativeBinaryXmmOp(e, i, [&i](X64Emitter& e, const Xmm& dest, + const Xmm& src1, const Xmm& src2) { + if (i.dest == src1) { + e.vfmsub213ss(i.dest, src2, i.src3); + } else if (i.dest == src2) { + e.vfmsub213ss(i.dest, src1, i.src3); + } else if (i.dest == i.src3) { + e.vfmsub231ss(i.dest, src1, src2); + } else { + // Dest not equal to anything + e.vmovss(i.dest, src1); + e.vfmsub213ss(i.dest, src2, i.src3); + } + }); } else { - // If i.dest == i.src3, back up i.src3 so we don't overwrite it. - Xmm src3 = i.src3; - if (i.dest == i.src3) { - e.vmovss(e.xmm0, i.src3); - src3 = e.xmm0; + Xmm src3; + if (i.src3.is_constant) { + e.LoadConstantXmm(e.xmm1, i.src3.constant()); + src3 = e.xmm1; + } else { + // If i.dest == i.src3, back up i.src3 so we don't overwrite it. + src3 = i.src3; + if (i.dest == i.src3) { + e.vmovss(e.xmm1, i.src3); + src3 = e.xmm1; + } } - e.vmulss(i.dest, i.src1, i.src2); // $0 = $1 * $2 - e.vsubss(i.dest, i.dest, src3); // $0 = $1 - $2 + // Multiply operation is commutative. + EmitCommutativeBinaryXmmOp( + e, i, [&i](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { + e.vmulss(dest, src1, src2); // $0 = $1 * $2 + }); + + e.vsubss(i.dest, i.dest, src3); // $0 = $1 - $2 } } }; struct MUL_SUB_F64 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { + // Calculate the multiply part if it's constant. + // TODO: Do this in the constant propagation pass. 
+ if (i.src1.is_constant && i.src2.is_constant) { + double mul = i.src1.constant() * i.src2.constant(); + + e.LoadConstantXmm(e.xmm0, mul); + e.vsubsd(i.dest, e.xmm0, i.src3); + return; + } + // FMA extension if (e.IsFeatureEnabled(kX64EmitFMA)) { - if (i.dest == i.src1) { - e.vfmsub213sd(i.dest, i.src2, i.src3); - } else if (i.dest == i.src2) { - e.vfmsub213sd(i.dest, i.src1, i.src3); - } else if (i.dest == i.src3) { - e.vfmsub231sd(i.dest, i.src1, i.src2); - } else { - // Dest not equal to anything - e.vmovsd(i.dest, i.src1); - e.vfmsub213sd(i.dest, i.src2, i.src3); - } + EmitCommutativeBinaryXmmOp(e, i, [&i](X64Emitter& e, const Xmm& dest, + const Xmm& src1, const Xmm& src2) { + if (i.dest == src1) { + e.vfmsub213sd(i.dest, src2, i.src3); + } else if (i.dest == src2) { + e.vfmsub213sd(i.dest, src1, i.src3); + } else if (i.dest == i.src3) { + e.vfmsub231sd(i.dest, src1, src2); + } else { + // Dest not equal to anything + e.vmovsd(i.dest, src1); + e.vfmsub213sd(i.dest, src2, i.src3); + } + }); } else { - // If i.dest == i.src3, back up i.src3 so we don't overwrite it. - Xmm src3 = i.src3; - if (i.dest == i.src3) { - e.vmovsd(e.xmm0, i.src3); - src3 = e.xmm0; + Xmm src3; + if (i.src3.is_constant) { + e.LoadConstantXmm(e.xmm1, i.src3.constant()); + src3 = e.xmm1; + } else { + // If i.dest == i.src3, back up i.src3 so we don't overwrite it. + src3 = i.src3; + if (i.dest == i.src3) { + e.vmovsd(e.xmm1, i.src3); + src3 = e.xmm1; + } } - e.vmulsd(i.dest, i.src1, i.src2); // $0 = $1 * $2 - e.vsubsd(i.dest, i.dest, src3); // $0 = $1 - $2 + // Multiply operation is commutative. + EmitCommutativeBinaryXmmOp( + e, i, [&i](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { + e.vmulsd(dest, src1, src2); // $0 = $1 * $2 + }); + + e.vsubsd(i.dest, i.dest, src3); // $0 = $1 - $2 } } }; @@ -4540,50 +4674,56 @@ struct MUL_SUB_V128 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { + // Calculate the multiply part if it's constant. + // TODO: Do this in the constant propagation pass. + if (i.src1.is_constant && i.src2.is_constant) { + vec128_t mul; + for (int n = 0; n < 4; n++) { + mul.f32[n] = i.src1.constant().f32[n] * i.src2.constant().f32[n]; + } + + e.LoadConstantXmm(e.xmm0, mul); + e.vsubps(i.dest, e.xmm0, i.src3); + return; + } + // FMA extension if (e.IsFeatureEnabled(kX64EmitFMA)) { - if (i.dest == i.src1) { - if (i.src3.is_constant) { - e.LoadConstantXmm(e.xmm0, i.src3.constant()); - e.vfmsub213ps(i.dest, i.src2, e.xmm0); + EmitCommutativeBinaryXmmOp(e, i, [&i](X64Emitter& e, const Xmm& dest, + const Xmm& src1, const Xmm& src2) { + if (i.dest == src1) { + e.vfmsub213ps(i.dest, src2, i.src3); + } else if (i.dest == src2) { + e.vfmsub213ps(i.dest, src1, i.src3); + } else if (i.dest == i.src3) { + e.vfmsub231ps(i.dest, src1, src2); } else { - e.vfmsub213ps(i.dest, i.src2, i.src3); + // Dest not equal to anything + e.vmovdqa(i.dest, src1); + e.vfmsub213ps(i.dest, src2, i.src3); } - } else if (i.dest == i.src2) { - if (i.src3.is_constant) { - e.LoadConstantXmm(e.xmm0, i.src3.constant()); - e.vfmsub213ps(i.dest, i.src1, e.xmm0); - } else { - e.vfmsub213ps(i.dest, i.src1, i.src3); - } - } else if (i.dest == i.src3) { - e.vfmsub231ps(i.dest, i.src1, i.src2); - } else { - // Dest not equal to anything. 
- e.vmovdqa(i.dest, i.src1); - if (i.src3.is_constant) { - e.LoadConstantXmm(e.xmm0, i.src3.constant()); - e.vfmsub213ps(i.dest, i.src2, e.xmm0); - } else { - e.vfmsub213ps(i.dest, i.src2, i.src3); - } - } + }); } else { Xmm src3; if (i.src3.is_constant) { - e.LoadConstantXmm(e.xmm0, i.src3.constant()); - src3 = e.xmm0; + e.LoadConstantXmm(e.xmm1, i.src3.constant()); + src3 = e.xmm1; } else { // If i.dest == i.src3, back up i.src3 so we don't overwrite it. src3 = i.src3; if (i.dest == i.src3) { - e.vmovdqa(e.xmm0, i.src3); - src3 = e.xmm0; + e.vmovdqa(e.xmm1, i.src3); + src3 = e.xmm1; } } - e.vmulps(i.dest, i.src1, i.src2); // $0 = $1 * $2 - e.vsubps(i.dest, i.dest, src3); // $0 = $1 - $2 + // Multiply operation is commutative. + EmitCommutativeBinaryXmmOp( + e, i, [&i](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { + e.vmulps(dest, src1, src2); // $0 = $1 * $2 + }); + + e.vsubps(i.dest, i.dest, src3); // $0 = $1 - $2 } } }; @@ -5274,7 +5414,28 @@ struct VECTOR_SHL_V128 return; } } + + // Shift 8 words in src1 by amount specified in src2. + Xbyak::Label emu, end; + + // Only bother with this check if shift amt isn't constant. + if (!i.src2.is_constant) { + // See if the shift is equal first for a shortcut. + e.vpshuflw(e.xmm0, i.src2, 0b00000000); + e.vpshufd(e.xmm0, e.xmm0, 0b00000000); + e.vptest(e.xmm0, i.src2); + e.jnc(emu); + + // Equal. Shift using vpsllw. + e.mov(e.rax, 0xF); + e.vmovq(e.xmm1, e.rax); + e.vpand(e.xmm0, e.xmm0, e.xmm1); + e.vpsllw(i.dest, i.src1, e.xmm0); + e.jmp(end); + } + // TODO(benvanik): native version (with shift magic). + e.L(emu); if (i.src2.is_constant) { e.LoadConstantXmm(e.xmm0, i.src2.constant()); e.lea(e.r9, e.StashXmm(1, e.xmm0)); @@ -5284,6 +5445,8 @@ struct VECTOR_SHL_V128 e.lea(e.r8, e.StashXmm(0, i.src1)); e.CallNativeSafe(reinterpret_cast(EmulateVectorShlI16)); e.vmovaps(i.dest, e.xmm0); + + e.L(end); } static __m128i EmulateVectorShlI32(void*, __m128i src1, __m128i src2) { alignas(16) uint32_t value[4]; @@ -5296,28 +5459,32 @@ struct VECTOR_SHL_V128 return _mm_load_si128(reinterpret_cast<__m128i*>(value)); } static void EmitInt32(X64Emitter& e, const EmitArgType& i) { + if (i.src2.is_constant) { + const auto& shamt = i.src2.constant(); + bool all_same = true; + for (size_t n = 0; n < 4 - n; ++n) { + if (shamt.u32[n] != shamt.u32[n + 1]) { + all_same = false; + break; + } + } + if (all_same) { + // Every count is the same, so we can use vpslld. + e.vpslld(i.dest, i.src1, shamt.u8[0] & 0x1F); + return; + } + } + if (e.IsFeatureEnabled(kX64EmitAVX2)) { if (i.src2.is_constant) { const auto& shamt = i.src2.constant(); - bool all_same = true; - for (size_t n = 0; n < 4 - n; ++n) { - if (shamt.u32[n] != shamt.u32[n + 1]) { - all_same = false; - break; - } - } - if (all_same) { - // Every count is the same, so we can use vpslld. - e.vpslld(i.dest, i.src1, shamt.u8[0] & 0x1F); - } else { - // Counts differ, so pre-mask and load constant. - vec128_t masked = i.src2.constant(); - for (size_t n = 0; n < 4; ++n) { - masked.u32[n] &= 0x1F; - } - e.LoadConstantXmm(e.xmm0, masked); - e.vpsllvd(i.dest, i.src1, e.xmm0); + // Counts differ, so pre-mask and load constant. + vec128_t masked = i.src2.constant(); + for (size_t n = 0; n < 4; ++n) { + masked.u32[n] &= 0x1F; } + e.LoadConstantXmm(e.xmm0, masked); + e.vpsllvd(i.dest, i.src1, e.xmm0); } else { // Fully variable shift. 
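// Reference sketch (not part of the patch): the intent of the vpshuflw/vpshufd +
// vptest shortcut added above for the non-AVX2 vector shifts. When every lane of the
// shift-amount vector is identical, one hardware shift by that count handles the
// whole vector; otherwise the code falls back to the per-lane emulation helper.
// ShlU32x4 is an illustrative name.
#include <cstdint>

void ShlU32x4(uint32_t dst[4], const uint32_t src[4], const uint32_t amt[4]) {
  bool all_same = amt[0] == amt[1] && amt[0] == amt[2] && amt[0] == amt[3];
  if (all_same) {
    uint32_t count = amt[0] & 0x1F;                    // single vpslld covers all lanes
    for (int n = 0; n < 4; ++n) dst[n] = src[n] << count;
  } else {
    for (int n = 0; n < 4; ++n) dst[n] = src[n] << (amt[n] & 0x1F);  // emulated path
  }
}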
// src shift mask may have values >31, and x86 sets to zero when @@ -5326,7 +5493,26 @@ struct VECTOR_SHL_V128 e.vpsllvd(i.dest, i.src1, e.xmm0); } } else { + // Shift 4 words in src1 by amount specified in src2. + Xbyak::Label emu, end; + + // See if the shift is equal first for a shortcut. + // Only bother with this check if shift amt isn't constant. + if (!i.src2.is_constant) { + e.vpshufd(e.xmm0, i.src2, 0b00000000); + e.vptest(e.xmm0, i.src2); + e.jnc(emu); + + // Equal. Shift using vpsrad. + e.mov(e.rax, 0x1F); + e.vmovq(e.xmm1, e.rax); + e.vpand(e.xmm0, e.xmm0, e.xmm1); + e.vpslld(i.dest, i.src1, e.xmm0); + e.jmp(end); + } + // TODO(benvanik): native version (with shift magic). + e.L(emu); if (i.src2.is_constant) { e.LoadConstantXmm(e.xmm0, i.src2.constant()); e.lea(e.r9, e.StashXmm(1, e.xmm0)); @@ -5336,6 +5522,8 @@ struct VECTOR_SHL_V128 e.lea(e.r8, e.StashXmm(0, i.src1)); e.CallNativeSafe(reinterpret_cast(EmulateVectorShlI32)); e.vmovaps(i.dest, e.xmm0); + + e.L(end); } } }; @@ -5410,7 +5598,28 @@ struct VECTOR_SHR_V128 return; } } + + // Shift 8 words in src1 by amount specified in src2. + Xbyak::Label emu, end; + + // See if the shift is equal first for a shortcut. + // Only bother with this check if shift amt isn't constant. + if (!i.src2.is_constant) { + e.vpshuflw(e.xmm0, i.src2, 0b00000000); + e.vpshufd(e.xmm0, e.xmm0, 0b00000000); + e.vptest(e.xmm0, i.src2); + e.jnc(emu); + + // Equal. Shift using vpsrlw. + e.mov(e.rax, 0xF); + e.vmovq(e.xmm1, e.rax); + e.vpand(e.xmm0, e.xmm0, e.xmm1); + e.vpsrlw(i.dest, i.src1, e.xmm0); + e.jmp(end); + } + // TODO(benvanik): native version (with shift magic). + e.L(emu); if (i.src2.is_constant) { e.LoadConstantXmm(e.xmm0, i.src2.constant()); e.lea(e.r9, e.StashXmm(1, e.xmm0)); @@ -5420,6 +5629,8 @@ struct VECTOR_SHR_V128 e.lea(e.r8, e.StashXmm(0, i.src1)); e.CallNativeSafe(reinterpret_cast(EmulateVectorShrI16)); e.vmovaps(i.dest, e.xmm0); + + e.L(end); } static __m128i EmulateVectorShrI32(void*, __m128i src1, __m128i src2) { alignas(16) uint32_t value[4]; @@ -5442,7 +5653,7 @@ struct VECTOR_SHR_V128 } } if (all_same) { - // Every count is the same, so we can use vpslld. + // Every count is the same, so we can use vpsrld. e.vpsrld(i.dest, i.src1, shamt.u8[0] & 0x1F); return; } else { @@ -5457,28 +5668,47 @@ struct VECTOR_SHR_V128 return; } } - } else { - if (e.IsFeatureEnabled(kX64EmitAVX2)) { - // Fully variable shift. - // src shift mask may have values >31, and x86 sets to zero when - // that happens so we mask. - e.vandps(e.xmm0, i.src2, e.GetXmmConstPtr(XMMShiftMaskPS)); - e.vpsrlvd(i.dest, i.src1, e.xmm0); - return; - } } - // We've reached here if we don't have AVX2 and it's a variable shift. - // TODO(benvanik): native version. - if (i.src2.is_constant) { - e.LoadConstantXmm(e.xmm0, i.src2.constant()); - e.lea(e.r9, e.StashXmm(1, e.xmm0)); + if (e.IsFeatureEnabled(kX64EmitAVX2)) { + // Fully variable shift. + // src shift mask may have values >31, and x86 sets to zero when + // that happens so we mask. + e.vandps(e.xmm0, i.src2, e.GetXmmConstPtr(XMMShiftMaskPS)); + e.vpsrlvd(i.dest, i.src1, e.xmm0); } else { - e.lea(e.r9, e.StashXmm(1, i.src2)); + // Shift 4 words in src1 by amount specified in src2. + Xbyak::Label emu, end; + + // See if the shift is equal first for a shortcut. + // Only bother with this check if shift amt isn't constant. + if (!i.src2.is_constant) { + e.vpshufd(e.xmm0, i.src2, 0b00000000); + e.vptest(e.xmm0, i.src2); + e.jnc(emu); + + // Equal. Shift using vpsrld. 
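// Reference sketch (not part of the patch): why the AVX2 paths above mask the shift
// counts with XMMShiftMaskPS before vpsllvd/vpsrlvd. The hardware zeroes a lane whose
// count exceeds 31, while the guest is expected to take the count modulo 32 (an
// assumption based on the comments above). Function names are illustrative.
#include <cstdint>

uint32_t ShlLaneMasked(uint32_t v, uint32_t count) {
  return v << (count & 0x1F);             // masked count: modulo-32 semantics
}

uint32_t ShlLaneAvx2Raw(uint32_t v, uint32_t count) {
  return count > 31 ? 0u : (v << count);  // unmasked vpsllvd behavior
}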
+ e.mov(e.rax, 0x1F); + e.vmovq(e.xmm1, e.rax); + e.vpand(e.xmm0, e.xmm0, e.xmm1); + e.vpsrld(i.dest, i.src1, e.xmm0); + e.jmp(end); + } + + // TODO(benvanik): native version. + e.L(emu); + if (i.src2.is_constant) { + e.LoadConstantXmm(e.xmm0, i.src2.constant()); + e.lea(e.r9, e.StashXmm(1, e.xmm0)); + } else { + e.lea(e.r9, e.StashXmm(1, i.src2)); + } + e.lea(e.r8, e.StashXmm(0, i.src1)); + e.CallNativeSafe(reinterpret_cast(EmulateVectorShrI32)); + e.vmovaps(i.dest, e.xmm0); + + e.L(end); } - e.lea(e.r8, e.StashXmm(0, i.src1)); - e.CallNativeSafe(reinterpret_cast(EmulateVectorShrI32)); - e.vmovaps(i.dest, e.xmm0); } }; EMITTER_OPCODE_TABLE(OPCODE_VECTOR_SHR, VECTOR_SHR_V128); @@ -5498,6 +5728,20 @@ struct VECTOR_SHA_V128 } return _mm_load_si128(reinterpret_cast<__m128i*>(value)); } + + static void EmitInt8(X64Emitter& e, const EmitArgType& i) { + // TODO(benvanik): native version (with shift magic). + if (i.src2.is_constant) { + e.LoadConstantXmm(e.xmm0, i.src2.constant()); + e.lea(e.r9, e.StashXmm(1, e.xmm0)); + } else { + e.lea(e.r9, e.StashXmm(1, i.src2)); + } + e.lea(e.r8, e.StashXmm(0, i.src1)); + e.CallNativeSafe(reinterpret_cast(EmulateVectorShaI8)); + e.vmovaps(i.dest, e.xmm0); + } + static __m128i EmulateVectorShaI16(void*, __m128i src1, __m128i src2) { alignas(16) int16_t value[8]; alignas(16) int16_t shamt[8]; @@ -5508,6 +5752,58 @@ struct VECTOR_SHA_V128 } return _mm_load_si128(reinterpret_cast<__m128i*>(value)); } + + static void EmitInt16(X64Emitter& e, const EmitArgType& i) { + if (i.src2.is_constant) { + const auto& shamt = i.src2.constant(); + bool all_same = true; + for (size_t n = 0; n < 8 - n; ++n) { + if (shamt.u16[n] != shamt.u16[n + 1]) { + all_same = false; + break; + } + } + if (all_same) { + // Every count is the same, so we can use vpsraw. + e.vpsraw(i.dest, i.src1, shamt.u16[0] & 0xF); + return; + } + } + + // Shift 8 words in src1 by amount specified in src2. + Xbyak::Label emu, end; + + // See if the shift is equal first for a shortcut. + // Only bother with this check if shift amt isn't constant. + if (!i.src2.is_constant) { + e.vpshuflw(e.xmm0, i.src2, 0b00000000); + e.vpshufd(e.xmm0, e.xmm0, 0b00000000); + e.vptest(e.xmm0, i.src2); + e.jnc(emu); + + // Equal. Shift using vpsraw. + e.mov(e.rax, 0xF); + e.vmovq(e.xmm1, e.rax); + e.vpand(e.xmm0, e.xmm0, e.xmm1); + e.vpsraw(i.dest, i.src1, e.xmm0); + e.jmp(end); + } + + // TODO(benvanik): native version (with shift magic). + e.L(emu); + if (i.src2.is_constant) { + e.LoadConstantXmm(e.xmm0, i.src2.constant()); + e.lea(e.r9, e.StashXmm(1, e.xmm0)); + } else { + e.lea(e.r9, e.StashXmm(1, i.src2)); + } + e.lea(e.r8, e.StashXmm(0, i.src1)); + e.CallNativeSafe(reinterpret_cast(EmulateVectorShaI16)); + e.vmovaps(i.dest, e.xmm0); + + e.L(end); + } + static __m128i EmulateVectorShaI32(void*, __m128i src1, __m128i src2) { alignas(16) int32_t value[4]; alignas(16) int32_t shamt[4]; @@ -5518,55 +5814,79 @@ struct VECTOR_SHA_V128 } return _mm_load_si128(reinterpret_cast<__m128i*>(value)); } + + static void EmitInt32(X64Emitter& e, const EmitArgType& i) { + if (i.src2.is_constant) { + const auto& shamt = i.src2.constant(); + bool all_same = true; + for (size_t n = 0; n < 4 - n; ++n) { + if (shamt.u32[n] != shamt.u32[n + 1]) { + all_same = false; + break; + } + } + if (all_same) { + // Every count is the same, so we can use vpsrad. 
+ e.vpsrad(i.dest, i.src1, shamt.u32[0] & 0x1F); + return; + } + } + + if (e.IsFeatureEnabled(kX64EmitAVX2)) { + // src shift mask may have values >31, and x86 sets to zero when + // that happens so we mask. + if (i.src2.is_constant) { + e.LoadConstantXmm(e.xmm0, i.src2.constant()); + e.vandps(e.xmm0, e.GetXmmConstPtr(XMMShiftMaskPS)); + } else { + e.vandps(e.xmm0, i.src2, e.GetXmmConstPtr(XMMShiftMaskPS)); + } + e.vpsravd(i.dest, i.src1, e.xmm0); + } else { + // Shift 4 words in src1 by amount specified in src2. + Xbyak::Label emu, end; + + // See if the shift is equal first for a shortcut. + // Only bother with this check if shift amt isn't constant. + if (!i.src2.is_constant) { + e.vpshufd(e.xmm0, i.src2, 0b00000000); + e.vptest(e.xmm0, i.src2); + e.jnc(emu); + + // Equal. Shift using vpsrad. + e.mov(e.rax, 0x1F); + e.vmovq(e.xmm1, e.rax); + e.vpand(e.xmm0, e.xmm0, e.xmm1); + e.vpsrad(i.dest, i.src1, e.xmm0); + e.jmp(end); + } + + // TODO(benvanik): native version. + e.L(emu); + if (i.src2.is_constant) { + e.LoadConstantXmm(e.xmm0, i.src2.constant()); + e.lea(e.r9, e.StashXmm(1, e.xmm0)); + } else { + e.lea(e.r9, e.StashXmm(1, i.src2)); + } + e.lea(e.r8, e.StashXmm(0, i.src1)); + e.CallNativeSafe(reinterpret_cast(EmulateVectorShaI32)); + e.vmovaps(i.dest, e.xmm0); + + e.L(end); + } + } + static void Emit(X64Emitter& e, const EmitArgType& i) { switch (i.instr->flags) { case INT8_TYPE: - // TODO(benvanik): native version (with shift magic). - if (i.src2.is_constant) { - e.LoadConstantXmm(e.xmm0, i.src2.constant()); - e.lea(e.r9, e.StashXmm(1, e.xmm0)); - } else { - e.lea(e.r9, e.StashXmm(1, i.src2)); - } - e.lea(e.r8, e.StashXmm(0, i.src1)); - e.CallNativeSafe(reinterpret_cast(EmulateVectorShaI8)); - e.vmovaps(i.dest, e.xmm0); + EmitInt8(e, i); break; case INT16_TYPE: - // TODO(benvanik): native version (with shift magic). - if (i.src2.is_constant) { - e.LoadConstantXmm(e.xmm0, i.src2.constant()); - e.lea(e.r9, e.StashXmm(1, e.xmm0)); - } else { - e.lea(e.r9, e.StashXmm(1, i.src2)); - } - e.lea(e.r8, e.StashXmm(0, i.src1)); - e.CallNativeSafe(reinterpret_cast(EmulateVectorShaI16)); - e.vmovaps(i.dest, e.xmm0); + EmitInt16(e, i); break; case INT32_TYPE: - if (e.IsFeatureEnabled(kX64EmitAVX2)) { - // src shift mask may have values >31, and x86 sets to zero when - // that happens so we mask. - if (i.src2.is_constant) { - e.LoadConstantXmm(e.xmm0, i.src2.constant()); - e.vandps(e.xmm0, e.GetXmmConstPtr(XMMShiftMaskPS)); - } else { - e.vandps(e.xmm0, i.src2, e.GetXmmConstPtr(XMMShiftMaskPS)); - } - e.vpsravd(i.dest, i.src1, e.xmm0); - } else { - // TODO(benvanik): native version. - if (i.src2.is_constant) { - e.LoadConstantXmm(e.xmm0, i.src2.constant()); - e.lea(e.r9, e.StashXmm(1, e.xmm0)); - } else { - e.lea(e.r9, e.StashXmm(1, i.src2)); - } - e.lea(e.r8, e.StashXmm(0, i.src1)); - e.CallNativeSafe(reinterpret_cast(EmulateVectorShaI32)); - e.vmovaps(i.dest, e.xmm0); - } + EmitInt32(e, i); break; default: assert_always(); @@ -5677,14 +5997,24 @@ struct VECTOR_ROTATE_LEFT_V128 case INT8_TYPE: // TODO(benvanik): native version (with shift magic). e.lea(e.r8, e.StashXmm(0, i.src1)); - e.lea(e.r9, e.StashXmm(1, i.src2)); + if (i.src2.is_constant) { + e.LoadConstantXmm(e.xmm0, i.src2.constant()); + e.lea(e.r9, e.StashXmm(1, e.xmm0)); + } else { + e.lea(e.r9, e.StashXmm(1, i.src2)); + } e.CallNativeSafe(reinterpret_cast(EmulateVectorRotateLeftI8)); e.vmovaps(i.dest, e.xmm0); break; case INT16_TYPE: // TODO(benvanik): native version (with shift magic). 
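// Reference sketch (not part of the patch): scalar semantics of a per-lane rotate as
// the EmulateVectorRotateLeftI8/I16/I32 helpers invoked above are expected to
// implement for one 32-bit lane (assumption: the count is taken modulo the lane
// width). RotateLeft32 is an illustrative name.
#include <cstdint>

uint32_t RotateLeft32(uint32_t v, uint32_t count) {
  count &= 31;
  if (count == 0) {
    return v;  // avoid an undefined shift by 32 in the fallback expression
  }
  return (v << count) | (v >> (32 - count));
}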
e.lea(e.r8, e.StashXmm(0, i.src1)); - e.lea(e.r9, e.StashXmm(1, i.src2)); + if (i.src2.is_constant) { + e.LoadConstantXmm(e.xmm0, i.src2.constant()); + e.lea(e.r9, e.StashXmm(1, e.xmm0)); + } else { + e.lea(e.r9, e.StashXmm(1, i.src2)); + } e.CallNativeSafe(reinterpret_cast(EmulateVectorRotateLeftI16)); e.vmovaps(i.dest, e.xmm0); break; @@ -5706,7 +6036,12 @@ struct VECTOR_ROTATE_LEFT_V128 } else { // TODO(benvanik): non-AVX2 native version. e.lea(e.r8, e.StashXmm(0, i.src1)); - e.lea(e.r9, e.StashXmm(1, i.src2)); + if (i.src2.is_constant) { + e.LoadConstantXmm(e.xmm0, i.src2.constant()); + e.lea(e.r9, e.StashXmm(1, e.xmm0)); + } else { + e.lea(e.r9, e.StashXmm(1, i.src2)); + } e.CallNativeSafe(reinterpret_cast(EmulateVectorRotateLeftI32)); e.vmovaps(i.dest, e.xmm0); } @@ -6264,6 +6599,8 @@ struct PERMUTE_V128 static void EmitByInt8(X64Emitter& e, const EmitArgType& i) { // TODO(benvanik): find out how to do this with only one temp register! // Permute bytes between src2 and src3. + // src1 is an array of indices corresponding to positions within src2 and + // src3. if (i.src3.value->IsConstantZero()) { // Permuting with src2/zero, so just shuffle/mask. if (i.src2.value->IsConstantZero()) { @@ -6324,43 +6661,42 @@ struct PERMUTE_V128 } } - static __m128i EmulateByInt16(void*, __m128i control, __m128i src1, - __m128i src2) { - alignas(16) uint16_t c[8]; - alignas(16) uint16_t a[8]; - alignas(16) uint16_t b[8]; - _mm_store_si128(reinterpret_cast<__m128i*>(c), control); - _mm_store_si128(reinterpret_cast<__m128i*>(a), src1); - _mm_store_si128(reinterpret_cast<__m128i*>(b), src2); - for (size_t i = 0; i < 8; ++i) { - uint16_t si = (c[i] & 0xF) ^ 0x1; - c[i] = si >= 8 ? b[si - 8] : a[si]; - } - return _mm_load_si128(reinterpret_cast<__m128i*>(c)); - } static void EmitByInt16(X64Emitter& e, const EmitArgType& i) { - // TODO(benvanik): replace with proper version. + // src1 is an array of indices corresponding to positions within src2 and + // src3. assert_true(i.src1.is_constant); - if (i.src1.is_constant) { - e.LoadConstantXmm(e.xmm0, i.src1.constant()); - e.lea(e.r8, e.StashXmm(0, e.xmm0)); - } else { - e.lea(e.r8, e.StashXmm(0, i.src1)); + vec128_t perm = (i.src1.constant() & vec128s(0xF)) ^ vec128s(0x1); + vec128_t perm_ctrl = vec128b(0); + for (int i = 0; i < 8; i++) { + perm_ctrl.i16[i] = perm.i16[i] > 7 ? 
-1 : 0; + + auto v = uint8_t(perm.u16[i]); + perm.u8[i * 2] = v * 2; + perm.u8[i * 2 + 1] = v * 2 + 1; } + e.LoadConstantXmm(e.xmm0, perm); + if (i.src2.is_constant) { - e.LoadConstantXmm(e.xmm0, i.src2.constant()); - e.lea(e.r9, e.StashXmm(1, e.xmm0)); + e.LoadConstantXmm(e.xmm1, i.src2.constant()); } else { - e.lea(e.r9, e.StashXmm(1, i.src2)); + e.vmovdqa(e.xmm1, i.src2); } if (i.src3.is_constant) { - e.LoadConstantXmm(e.xmm0, i.src3.constant()); - e.lea(e.r10, e.StashXmm(2, e.xmm0)); + e.LoadConstantXmm(e.xmm2, i.src3.constant()); } else { - e.lea(e.r10, e.StashXmm(2, i.src3)); + e.vmovdqa(e.xmm2, i.src3); } - e.CallNativeSafe(reinterpret_cast(EmulateByInt16)); - e.vmovaps(i.dest, e.xmm0); + + e.vpshufb(e.xmm1, e.xmm1, e.xmm0); + e.vpshufb(e.xmm2, e.xmm2, e.xmm0); + + uint8_t mask = 0; + for (int i = 0; i < 8; i++) { + if (perm_ctrl.i16[i] == 0) { + mask |= 1 << (7 - i); + } + } + e.vpblendw(i.dest, e.xmm1, e.xmm2, mask); } static void EmitByInt32(X64Emitter& e, const EmitArgType& i) { @@ -6646,7 +6982,12 @@ struct PACK : Sequence> { if (IsPackOutSaturate(flags)) { // signed -> unsigned + saturate // PACKUSWB / SaturateSignedWordToUnsignedByte - e.vpackuswb(i.dest, i.src1, i.src2); + Xbyak::Xmm src2 = i.src2.is_constant ? e.xmm0 : i.src2; + if (i.src2.is_constant) { + e.LoadConstantXmm(e.xmm0, i.src2.constant()); + } + + e.vpackuswb(i.dest, i.src1, src2); e.vpshufb(i.dest, i.dest, e.GetXmmConstPtr(XMMByteOrderMask)); } else { // signed -> unsigned @@ -6665,19 +7006,6 @@ struct PACK : Sequence> { } } } - static __m128i EmulatePack16_IN_32_UN_UN_SAT(void*, __m128i src1, - __m128i src2) { - alignas(16) uint32_t a[4]; - alignas(16) uint32_t b[4]; - alignas(16) uint16_t c[8]; - _mm_store_si128(reinterpret_cast<__m128i*>(a), src1); - _mm_store_si128(reinterpret_cast<__m128i*>(b), src2); - for (int i = 0; i < 4; ++i) { - c[i] = uint16_t(std::min(65535u, a[i])); - c[i + 4] = uint16_t(std::min(65535u, b[i])); - } - return _mm_load_si128(reinterpret_cast<__m128i*>(c)); - } static void Emit16_IN_32(X64Emitter& e, const EmitArgType& i, uint32_t flags) { // TODO(benvanik): handle src2 (or src1) being constant zero @@ -6685,18 +7013,28 @@ struct PACK : Sequence> { if (IsPackOutUnsigned(flags)) { if (IsPackOutSaturate(flags)) { // unsigned -> unsigned + saturate - Xmm src2; - if (i.src2.is_constant) { - e.LoadConstantXmm(e.xmm0, i.src2.constant()); - e.lea(e.r9, e.StashXmm(1, e.xmm0)); - } else { - e.lea(e.r9, e.StashXmm(1, i.src2)); - } - e.lea(e.r8, e.StashXmm(0, i.src1)); - e.CallNativeSafe( - reinterpret_cast(EmulatePack16_IN_32_UN_UN_SAT)); - e.vmovaps(i.dest, e.xmm0); - e.vpshufb(i.dest, i.dest, e.GetXmmConstPtr(XMMByteOrderMask)); + // Construct a saturation mask + e.mov(e.eax, ~0xFFFFu); + e.vmovd(e.xmm0, e.eax); + e.vpshufd(e.xmm0, e.xmm0, 0b00000000); + + e.vandps(e.xmm1, e.xmm0, i.src1); // src1 & 0xFFFF0000 + e.vpcmpeqd(e.xmm1, e.xmm1, e.GetXmmConstPtr(XMMZero)); + e.vpxor(e.xmm1, e.xmm1, e.GetXmmConstPtr(XMMFFFF)); + e.vpor(e.xmm1, e.xmm1, i.src1); // Saturate src1 + e.vpshuflw(e.xmm1, e.xmm1, 0b00100010); + e.vpshufhw(e.xmm1, e.xmm1, 0b00100010); + e.vpshufd(e.xmm1, e.xmm1, 0b00001000); + + e.vandps(e.xmm0, e.xmm0, i.src2); // src2 & 0xFFFF0000 + e.vpcmpeqd(e.xmm0, e.xmm0, e.GetXmmConstPtr(XMMZero)); + e.vpxor(e.xmm0, e.xmm0, e.GetXmmConstPtr(XMMFFFF)); + e.vpor(i.dest, e.xmm0, i.src2); // Saturate src2 + e.vpshuflw(i.dest, i.dest, 0b00100010); + e.vpshufhw(i.dest, i.dest, 0b00100010); + e.vpshufd(i.dest, i.dest, 0b10000000); + + e.vpblendw(i.dest, i.dest, e.xmm1, 0b00001111); } else { 
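// Reference sketch (not part of the patch): the scalar semantics that the new
// vpshufb/vpblendw sequence above encodes, mirroring the removed EmulateByInt16
// helper: each 16-bit control entry selects a word from src2 (indices 0-7) or src3
// (indices 8-15), with the low index bit flipped to account for word order.
// PermuteWords is an illustrative name.
#include <cstdint>

void PermuteWords(uint16_t dst[8], const uint16_t control[8],
                  const uint16_t src2[8], const uint16_t src3[8]) {
  for (int n = 0; n < 8; ++n) {
    uint16_t si = (control[n] & 0xF) ^ 0x1;
    dst[n] = si >= 8 ? src3[si - 8] : src2[si];
  }
}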
// unsigned -> unsigned e.vmovaps(e.xmm0, i.src1); @@ -6771,6 +7109,9 @@ struct UNPACK : Sequence> { case PACK_TYPE_FLOAT16_2: EmitFLOAT16_2(e, i); break; + case PACK_TYPE_FLOAT16_3: + EmitFLOAT16_3(e, i); + break; case PACK_TYPE_FLOAT16_4: EmitFLOAT16_4(e, i); break; @@ -6814,7 +7155,7 @@ struct UNPACK : Sequence> { _mm_store_si128(reinterpret_cast<__m128i*>(a), src1); for (int i = 0; i < 2; i++) { - b[i] = half_float::detail::half2float(a[7 - i]); + b[i] = half_float::detail::half2float(a[VEC128_W(6 + i)]); } // Constants, or something @@ -6862,23 +7203,34 @@ struct UNPACK : Sequence> { e.vmovaps(i.dest, e.xmm0); } } + // FIXME: This has not been verified on a real 360, but from context the + // return values are used in floating point math. + static __m128 EmulateFLOAT16_3(void*, __m128i src1) { + alignas(16) uint16_t a[8]; + alignas(16) float b[4]; + _mm_store_si128(reinterpret_cast<__m128i*>(a), src1); + + for (int i = 0; i < 3; i++) { + b[i] = half_float::detail::half2float(a[VEC128_W(5 + i)]); + } + + // FIXME: Correct? + b[3] = 1.0f; + + return _mm_load_ps(b); + } + static void EmitFLOAT16_3(X64Emitter& e, const EmitArgType& i) { + e.lea(e.r8, e.StashXmm(0, i.src1)); + e.CallNativeSafe(reinterpret_cast(EmulateFLOAT16_3)); + e.vmovaps(i.dest, e.xmm0); + } static __m128 EmulateFLOAT16_4(void*, __m128i src1) { alignas(16) uint16_t a[8]; alignas(16) float b[4]; _mm_store_si128(reinterpret_cast<__m128i*>(a), src1); - // The floats come in swapped for some reason. Swap them back. - for (int i = 0; i < 2; i++) { - uint16_t& n1 = a[7 - (i * 2)]; - uint16_t& n2 = a[6 - (i * 2)]; - - uint16_t tmp = n1; - n1 = n2; - n2 = tmp; - } - for (int i = 0; i < 4; i++) { - b[3 - i] = half_float::detail::half2float(a[7 - i]); + b[i] = half_float::detail::half2float(a[VEC128_W(4 + i)]); } return _mm_load_ps(b); @@ -7086,6 +7438,38 @@ EMITTER_OPCODE_TABLE(OPCODE_ATOMIC_EXCHANGE, ATOMIC_EXCHANGE_I8, ATOMIC_EXCHANGE_I16, ATOMIC_EXCHANGE_I32, ATOMIC_EXCHANGE_I64); +// ============================================================================ +// OPCODE_ATOMIC_COMPARE_EXCHANGE +// ============================================================================ +struct ATOMIC_COMPARE_EXCHANGE_I32 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.mov(e.eax, i.src2); + e.mov(e.ecx, i.src1.reg().cvt32()); + e.lock(); + e.cmpxchg(e.dword[e.rdx + e.rcx], i.src3); + e.sete(i.dest); + + e.ReloadECX(); + } +}; +struct ATOMIC_COMPARE_EXCHANGE_I64 + : Sequence> { + static void Emit(X64Emitter& e, const EmitArgType& i) { + e.mov(e.rax, i.src2); + e.mov(e.ecx, i.src1.reg().cvt32()); + e.lock(); + e.cmpxchg(e.qword[e.rdx + e.rcx], i.src3); + e.sete(i.dest); + + e.ReloadECX(); + } +}; +EMITTER_OPCODE_TABLE(OPCODE_ATOMIC_COMPARE_EXCHANGE, + ATOMIC_COMPARE_EXCHANGE_I32, ATOMIC_COMPARE_EXCHANGE_I64); + void RegisterSequences() { Register_OPCODE_COMMENT(); Register_OPCODE_NOP(); @@ -7201,6 +7585,7 @@ void RegisterSequences() { Register_OPCODE_PACK(); Register_OPCODE_UNPACK(); Register_OPCODE_ATOMIC_EXCHANGE(); + Register_OPCODE_ATOMIC_COMPARE_EXCHANGE(); } bool SelectSequence(X64Emitter* e, const Instr* i, const Instr** new_tail) { diff --git a/src/xenia/cpu/compiler/passes/constant_propagation_pass.cc b/src/xenia/cpu/compiler/passes/constant_propagation_pass.cc index c8a5ef632..7be733142 100644 --- a/src/xenia/cpu/compiler/passes/constant_propagation_pass.cc +++ b/src/xenia/cpu/compiler/passes/constant_propagation_pass.cc @@ -161,6 +161,13 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { 
i->Remove(); } break; + case OPCODE_ROUND: + if (i->src1.value->IsConstant()) { + v->set_from(i->src1.value); + v->Round(RoundMode(i->flags)); + i->Remove(); + } + break; case OPCODE_ZERO_EXTEND: if (i->src1.value->IsConstant()) { TypeName target_type = v->type; @@ -188,6 +195,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { case OPCODE_LOAD: if (i->src1.value->IsConstant()) { + assert_false(i->flags & LOAD_STORE_BYTE_SWAP); auto memory = processor_->memory(); auto address = i->src1.value->constant.i32; auto mmio_range = @@ -253,12 +261,23 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { case OPCODE_SELECT: if (i->src1.value->IsConstant()) { - if (i->src1.value->IsConstantTrue()) { - v->set_from(i->src2.value); + if (i->src1.value->type != VEC128_TYPE) { + if (i->src1.value->IsConstantTrue()) { + v->set_from(i->src2.value); + i->Remove(); + } else if (i->src1.value->IsConstantFalse()) { + v->set_from(i->src3.value); + i->Remove(); + } else if (i->src2.value->IsConstant() && + i->src3.value->IsConstant()) { + // TODO: Select + // v->set_from(i->src2.value); + // v->Select(i->src3.value, i->src1.value); + // i->Remove(); + } } else { - v->set_from(i->src3.value); + // TODO: vec128 select } - i->Remove(); } break; case OPCODE_IS_TRUE: @@ -355,7 +374,7 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { break; case OPCODE_DID_SATURATE: - assert_true(!i->src1.value->IsConstant()); + // assert_true(!i->src1.value->IsConstant()); break; case OPCODE_ADD: @@ -413,8 +432,33 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { i->Remove(); } break; - // case OPCODE_MUL_ADD: - // case OPCODE_MUL_SUB + case OPCODE_MUL_ADD: + if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) { + // Multiply part is constant. + if (i->src3.value->IsConstant()) { + v->set_from(i->src1.value); + Value::MulAdd(v, i->src1.value, i->src2.value, i->src3.value); + i->Remove(); + } + } + break; + case OPCODE_MUL_SUB: + if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) { + // Multiply part is constant. + if (i->src3.value->IsConstant()) { + v->set_from(i->src1.value); + Value::MulSub(v, i->src1.value, i->src2.value, i->src3.value); + i->Remove(); + } + } + break; + case OPCODE_MAX: + if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) { + v->set_from(i->src1.value); + v->Max(i->src2.value); + i->Remove(); + } + break; case OPCODE_NEG: if (i->src1.value->IsConstant()) { v->set_from(i->src1.value); @@ -484,7 +528,6 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { i->Remove(); } break; - // TODO(benvanik): VECTOR_SHL case OPCODE_SHR: if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) { v->set_from(i->src1.value); @@ -515,13 +558,80 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder) { } break; // TODO(benvanik): INSERT/EXTRACT - // TODO(benvanik): SPLAT/PERMUTE/SWIZZLE - case OPCODE_SPLAT: - if (i->src1.value->IsConstant()) { - // Quite a few of these, from building vec128s. 
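// Reference sketch (not part of the patch): the scalar effect of the new
// OPCODE_MUL_ADD / OPCODE_MUL_SUB constant folding above when all three operands are
// known, assuming float operands. Function names are illustrative of what
// Value::MulAdd / Value::MulSub are expected to compute.
float FoldMulAdd(float src1, float src2, float src3) {
  return src1 * src2 + src3;
}
float FoldMulSub(float src1, float src2, float src3) {
  return src1 * src2 - src3;
}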
+ // TODO(benvanik): PERMUTE/SWIZZLE + case OPCODE_EXTRACT: + if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) { + v->set_zero(v->type); + v->Extract(i->src1.value, i->src2.value); + i->Remove(); + } + break; + case OPCODE_SPLAT: + if (i->src1.value->IsConstant()) { + v->set_zero(v->type); + v->Splat(i->src1.value); + i->Remove(); + } + break; + case OPCODE_VECTOR_COMPARE_EQ: + if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) { + v->set_from(i->src1.value); + v->VectorCompareEQ(i->src2.value, hir::TypeName(i->flags)); + i->Remove(); + } + break; + case OPCODE_VECTOR_COMPARE_SGT: + if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) { + v->set_from(i->src1.value); + v->VectorCompareSGT(i->src2.value, hir::TypeName(i->flags)); + i->Remove(); + } + break; + case OPCODE_VECTOR_CONVERT_F2I: + if (i->src1.value->IsConstant()) { + v->set_zero(VEC128_TYPE); + v->VectorConvertF2I(i->src1.value); + i->Remove(); + } + break; + case OPCODE_VECTOR_CONVERT_I2F: + if (i->src1.value->IsConstant()) { + v->set_zero(VEC128_TYPE); + v->VectorConvertI2F(i->src1.value); + i->Remove(); + } + break; + case OPCODE_VECTOR_SHL: + if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) { + v->set_from(i->src1.value); + v->VectorShl(i->src2.value, hir::TypeName(i->flags)); + i->Remove(); + } + break; + case OPCODE_VECTOR_SHR: + if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) { + v->set_from(i->src1.value); + v->VectorShr(i->src2.value, hir::TypeName(i->flags)); + i->Remove(); + } + break; + case OPCODE_VECTOR_ROTATE_LEFT: + if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) { + v->set_from(i->src1.value); + v->VectorRol(i->src2.value, hir::TypeName(i->flags)); + i->Remove(); + } + break; + case OPCODE_VECTOR_SUB: + if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) { + v->set_from(i->src1.value); + uint32_t arith_flags = i->flags >> 8; + v->VectorSub(i->src2.value, hir::TypeName(i->flags & 0xFF), + !!(arith_flags & ARITHMETIC_UNSIGNED), + !!(arith_flags & ARITHMETIC_SATURATE)); + i->Remove(); } break; - default: // Ignored. break; diff --git a/src/xenia/cpu/cpu_flags.cc b/src/xenia/cpu/cpu_flags.cc index 0d1748c1a..6c37e7edc 100644 --- a/src/xenia/cpu/cpu_flags.cc +++ b/src/xenia/cpu/cpu_flags.cc @@ -28,6 +28,10 @@ DEFINE_bool(trace_function_references, false, DEFINE_bool(trace_function_data, false, "Generate tracing for function result data."); +DEFINE_bool( + disable_global_lock, false, + "Disables global lock usage in guest code. 
Does not affect host code."); + DEFINE_bool(validate_hir, false, "Perform validation checks on the HIR during compilation."); diff --git a/src/xenia/cpu/cpu_flags.h b/src/xenia/cpu/cpu_flags.h index 578429b74..17b88ff08 100644 --- a/src/xenia/cpu/cpu_flags.h +++ b/src/xenia/cpu/cpu_flags.h @@ -23,6 +23,8 @@ DECLARE_bool(trace_function_coverage); DECLARE_bool(trace_function_references); DECLARE_bool(trace_function_data); +DECLARE_bool(disable_global_lock); + DECLARE_bool(validate_hir); DECLARE_uint64(break_on_instruction); diff --git a/src/xenia/cpu/hir/hir_builder.cc b/src/xenia/cpu/hir/hir_builder.cc index a27f0b86a..db278cd81 100644 --- a/src/xenia/cpu/hir/hir_builder.cc +++ b/src/xenia/cpu/hir/hir_builder.cc @@ -2074,6 +2074,17 @@ Value* HIRBuilder::AtomicExchange(Value* address, Value* new_value) { return i->dest; } +Value* HIRBuilder::AtomicCompareExchange(Value* address, Value* old_value, + Value* new_value) { + ASSERT_ADDRESS_TYPE(address); + Instr* i = AppendInstr(OPCODE_ATOMIC_COMPARE_EXCHANGE_info, 0, + AllocValue(INT8_TYPE)); + i->set_src1(address); + i->set_src2(old_value); + i->set_src3(new_value); + return i->dest; +} + } // namespace hir } // namespace cpu } // namespace xe diff --git a/src/xenia/cpu/hir/hir_builder.h b/src/xenia/cpu/hir/hir_builder.h index d6e0e8ccc..44a528f53 100644 --- a/src/xenia/cpu/hir/hir_builder.h +++ b/src/xenia/cpu/hir/hir_builder.h @@ -236,6 +236,8 @@ class HIRBuilder { Value* Unpack(Value* value, uint32_t pack_flags = 0); Value* AtomicExchange(Value* address, Value* new_value); + Value* AtomicCompareExchange(Value* address, Value* old_value, + Value* new_value); Value* AtomicAdd(Value* address, Value* value); Value* AtomicSub(Value* address, Value* value); diff --git a/src/xenia/cpu/hir/opcodes.h b/src/xenia/cpu/hir/opcodes.h index 84bf2b320..8e440c73e 100644 --- a/src/xenia/cpu/hir/opcodes.h +++ b/src/xenia/cpu/hir/opcodes.h @@ -76,13 +76,14 @@ enum PackType : uint16_t { // Special types: PACK_TYPE_D3DCOLOR = 0, PACK_TYPE_FLOAT16_2 = 1, - PACK_TYPE_FLOAT16_4 = 2, - PACK_TYPE_SHORT_2 = 3, - PACK_TYPE_UINT_2101010 = 4, + PACK_TYPE_FLOAT16_3 = 2, // FIXME: Not verified, but looks correct. + PACK_TYPE_FLOAT16_4 = 3, + PACK_TYPE_SHORT_2 = 4, + PACK_TYPE_UINT_2101010 = 5, // Types which use the bitmasks below for configuration: - PACK_TYPE_8_IN_16 = 5, - PACK_TYPE_16_IN_32 = 6, + PACK_TYPE_8_IN_16 = 6, + PACK_TYPE_16_IN_32 = 7, PACK_TYPE_MODE = 0x000F, // just to get the mode @@ -220,6 +221,7 @@ enum Opcode { OPCODE_PACK, OPCODE_UNPACK, OPCODE_ATOMIC_EXCHANGE, + OPCODE_ATOMIC_COMPARE_EXCHANGE, __OPCODE_MAX_VALUE, // Keep at end. 
}; diff --git a/src/xenia/cpu/hir/opcodes.inl b/src/xenia/cpu/hir/opcodes.inl index c5deb7ff8..a2968e238 100644 --- a/src/xenia/cpu/hir/opcodes.inl +++ b/src/xenia/cpu/hir/opcodes.inl @@ -631,3 +631,9 @@ DEFINE_OPCODE( "atomic_exchange", OPCODE_SIG_V_V_V, OPCODE_FLAG_VOLATILE) + +DEFINE_OPCODE( + OPCODE_ATOMIC_COMPARE_EXCHANGE, + "atomic_compare_exchange", + OPCODE_SIG_V_V_V_V, + OPCODE_FLAG_VOLATILE) diff --git a/src/xenia/cpu/hir/value.cc b/src/xenia/cpu/hir/value.cc index 03ef79a2a..4d30de853 100644 --- a/src/xenia/cpu/hir/value.cc +++ b/src/xenia/cpu/hir/value.cc @@ -46,13 +46,13 @@ uint32_t Value::AsUint32() { assert_true(IsConstant()); switch (type) { case INT8_TYPE: - return constant.i8; + return constant.u8; case INT16_TYPE: - return constant.i16; + return constant.u16; case INT32_TYPE: - return constant.i32; + return constant.u32; case INT64_TYPE: - return (uint32_t)constant.i64; + return (uint32_t)constant.u64; default: assert_unhandled_case(type); return 0; @@ -63,13 +63,13 @@ uint64_t Value::AsUint64() { assert_true(IsConstant()); switch (type) { case INT8_TYPE: - return constant.i8; + return constant.u8; case INT16_TYPE: - return constant.i16; + return constant.u16; case INT32_TYPE: - return constant.i32; + return constant.u32; case INT64_TYPE: - return constant.i64; + return constant.u64; default: assert_unhandled_case(type); return 0; @@ -85,15 +85,15 @@ void Value::ZeroExtend(TypeName target_type) { switch (type) { case INT8_TYPE: type = target_type; - constant.i64 = constant.i64 & 0xFF; + constant.u64 = constant.u8; return; case INT16_TYPE: type = target_type; - constant.i64 = constant.i64 & 0xFFFF; + constant.u64 = constant.u16; return; case INT32_TYPE: type = target_type; - constant.i64 = constant.i64 & 0xFFFFFFFF; + constant.u64 = constant.u32; return; default: assert_unhandled_case(type); @@ -210,12 +210,30 @@ void Value::Convert(TypeName target_type, RoundMode round_mode) { assert_unhandled_case(target_type); return; } + case INT64_TYPE: + switch (target_type) { + case FLOAT64_TYPE: + type = target_type; + constant.f64 = (double)constant.i64; + return; + default: + assert_unhandled_case(target_type); + return; + } case FLOAT64_TYPE: switch (target_type) { case FLOAT32_TYPE: type = target_type; constant.f32 = (float)constant.f64; return; + case INT32_TYPE: + type = target_type; + constant.i32 = (int32_t)constant.f64; + return; + case INT64_TYPE: + type = target_type; + constant.i64 = (int64_t)constant.f64; + return; default: assert_unhandled_case(target_type); return; @@ -227,8 +245,28 @@ void Value::Convert(TypeName target_type, RoundMode round_mode) { } void Value::Round(RoundMode round_mode) { - // TODO(benvanik): big matrix. 
-  assert_always();
+  switch (type) {
+    case FLOAT32_TYPE:
+      switch (round_mode) {
+        case ROUND_TO_NEAREST:
+          constant.f32 = std::round(constant.f32);
+          return;
+      }
+      return;
+    case FLOAT64_TYPE:
+      return;
+    case VEC128_TYPE:
+      for (int i = 0; i < 4; i++) {
+        switch (round_mode) {
+          case ROUND_TO_NEAREST:
+            constant.v128.f32[i] = std::round(constant.v128.f32[i]);
+            break;
+        }
+      }
+      return;
+    default:
+      assert_unhandled_case(type);
+  }
 }
 
 bool Value::Add(Value* other) {
@@ -325,6 +363,11 @@ void Value::Mul(Value* other) {
     case FLOAT64_TYPE:
       constant.f64 *= other->constant.f64;
       break;
+    case VEC128_TYPE:
+      for (int i = 0; i < 4; i++) {
+        constant.v128.f32[i] *= other->constant.v128.f32[i];
+      }
+      break;
     default:
       assert_unhandled_case(type);
       break;
   }
@@ -406,6 +449,32 @@ void Value::Div(Value* other, bool is_unsigned) {
     case FLOAT64_TYPE:
       constant.f64 /= other->constant.f64;
       break;
+    case VEC128_TYPE:
+      for (int i = 0; i < 4; i++) {
+        constant.v128.f32[i] /= other->constant.v128.f32[i];
+      }
+      break;
+    default:
+      assert_unhandled_case(type);
+      break;
+  }
+}
+
+void Value::Max(Value* other) {
+  assert_true(type == other->type);
+  switch (type) {
+    case FLOAT32_TYPE:
+      constant.f32 = std::max(constant.f32, other->constant.f32);
+      break;
+    case FLOAT64_TYPE:
+      constant.f64 = std::max(constant.f64, other->constant.f64);
+      break;
+    case VEC128_TYPE:
+      for (int i = 0; i < 4; i++) {
+        constant.v128.f32[i] =
+            std::max(constant.v128.f32[i], other->constant.v128.f32[i]);
+      }
+      break;
     default:
       assert_unhandled_case(type);
       break;
@@ -413,13 +482,49 @@
 }
 
 void Value::MulAdd(Value* dest, Value* value1, Value* value2, Value* value3) {
-  // TODO(benvanik): big matrix.
-  assert_always();
+  switch (dest->type) {
+    case VEC128_TYPE:
+      for (int i = 0; i < 4; i++) {
+        dest->constant.v128.f32[i] =
+            (value1->constant.v128.f32[i] * value2->constant.v128.f32[i]) +
+            value3->constant.v128.f32[i];
+      }
+      break;
+    case FLOAT32_TYPE:
+      dest->constant.f32 =
+          (value1->constant.f32 * value2->constant.f32) + value3->constant.f32;
+      break;
+    case FLOAT64_TYPE:
+      dest->constant.f64 =
+          (value1->constant.f64 * value2->constant.f64) + value3->constant.f64;
+      break;
+    default:
+      assert_unhandled_case(dest->type);
+      break;
+  }
 }
 
 void Value::MulSub(Value* dest, Value* value1, Value* value2, Value* value3) {
-  // TODO(benvanik): big matrix.
- assert_always(); + switch (dest->type) { + case VEC128_TYPE: + for (int i = 0; i < 4; i++) { + dest->constant.v128.f32[i] = + (value1->constant.v128.f32[i] * value2->constant.v128.f32[i]) - + value3->constant.v128.f32[i]; + } + break; + case FLOAT32_TYPE: + dest->constant.f32 = + (value1->constant.f32 * value2->constant.f32) - value3->constant.f32; + break; + case FLOAT64_TYPE: + dest->constant.f64 = + (value1->constant.f64 * value2->constant.f64) - value3->constant.f64; + break; + default: + assert_unhandled_case(dest->type); + break; + } } void Value::Neg() { @@ -527,6 +632,9 @@ void Value::And(Value* other) { case INT64_TYPE: constant.i64 &= other->constant.i64; break; + case VEC128_TYPE: + constant.v128 &= other->constant.v128; + break; default: assert_unhandled_case(type); break; @@ -548,6 +656,9 @@ void Value::Or(Value* other) { case INT64_TYPE: constant.i64 |= other->constant.i64; break; + case VEC128_TYPE: + constant.v128 |= other->constant.v128; + break; default: assert_unhandled_case(type); break; @@ -569,6 +680,9 @@ void Value::Xor(Value* other) { case INT64_TYPE: constant.i64 ^= other->constant.i64; break; + case VEC128_TYPE: + constant.v128 ^= other->constant.v128; + break; default: assert_unhandled_case(type); break; @@ -603,16 +717,16 @@ void Value::Shl(Value* other) { assert_true(other->type == INT8_TYPE); switch (type) { case INT8_TYPE: - constant.i8 <<= other->constant.i8; + constant.u8 <<= other->constant.u8; break; case INT16_TYPE: - constant.i16 <<= other->constant.i8; + constant.u16 <<= other->constant.u8; break; case INT32_TYPE: - constant.i32 <<= other->constant.i8; + constant.u32 <<= other->constant.u8; break; case INT64_TYPE: - constant.i64 <<= other->constant.i8; + constant.u64 <<= other->constant.u8; break; default: assert_unhandled_case(type); @@ -624,16 +738,16 @@ void Value::Shr(Value* other) { assert_true(other->type == INT8_TYPE); switch (type) { case INT8_TYPE: - constant.i8 = (uint8_t)constant.i8 >> other->constant.i8; + constant.u8 = constant.u8 >> other->constant.u8; break; case INT16_TYPE: - constant.i16 = (uint16_t)constant.i16 >> other->constant.i8; + constant.u16 = constant.u16 >> other->constant.u8; break; case INT32_TYPE: - constant.i32 = (uint32_t)constant.i32 >> other->constant.i8; + constant.u32 = constant.u32 >> other->constant.u8; break; case INT64_TYPE: - constant.i64 = (uint64_t)constant.i64 >> other->constant.i8; + constant.u64 = constant.u64 >> other->constant.u8; break; default: assert_unhandled_case(type); @@ -645,16 +759,16 @@ void Value::Sha(Value* other) { assert_true(other->type == INT8_TYPE); switch (type) { case INT8_TYPE: - constant.i8 = constant.i8 >> other->constant.i8; + constant.i8 = constant.i8 >> other->constant.u8; break; case INT16_TYPE: - constant.i16 = constant.i16 >> other->constant.i8; + constant.i16 = constant.i16 >> other->constant.u8; break; case INT32_TYPE: - constant.i32 = constant.i32 >> other->constant.i8; + constant.i32 = constant.i32 >> other->constant.u8; break; case INT64_TYPE: - constant.i64 = constant.i64 >> other->constant.i8; + constant.i64 = constant.i64 >> other->constant.u8; break; default: assert_unhandled_case(type); @@ -662,6 +776,246 @@ void Value::Sha(Value* other) { } } +void Value::Extract(Value* vec, Value* index) { + assert_true(vec->type == VEC128_TYPE); + switch (type) { + case INT8_TYPE: + constant.u8 = vec->constant.v128.u8[index->constant.u8]; + break; + case INT16_TYPE: + constant.u16 = vec->constant.v128.u16[index->constant.u16]; + break; + case INT32_TYPE: + constant.u32 = 
vec->constant.v128.u32[index->constant.u32]; + break; + case INT64_TYPE: + constant.u64 = vec->constant.v128.u64[index->constant.u64]; + break; + } +} + +void Value::Select(Value* other, Value* ctrl) { + // TODO + assert_always(); +} + +void Value::Splat(Value* other) { + assert_true(type == VEC128_TYPE); + switch (other->type) { + case INT8_TYPE: + for (int i = 0; i < 16; i++) { + constant.v128.i8[i] = other->constant.i8; + } + break; + case INT16_TYPE: + for (int i = 0; i < 8; i++) { + constant.v128.i16[i] = other->constant.i16; + } + break; + case INT32_TYPE: + case FLOAT32_TYPE: + for (int i = 0; i < 4; i++) { + constant.v128.i32[i] = other->constant.i32; + } + break; + case INT64_TYPE: + case FLOAT64_TYPE: + for (int i = 0; i < 2; i++) { + constant.v128.i64[i] = other->constant.i64; + } + break; + default: + assert_unhandled_case(other->type); + break; + } +} + +void Value::VectorCompareEQ(Value* other, TypeName type) { + assert_true(this->type == VEC128_TYPE && other->type == VEC128_TYPE); + switch (type) { + case INT8_TYPE: + for (int i = 0; i < 16; i++) { + constant.v128.u8[i] = + constant.v128.u8[i] == other->constant.v128.u8[i] ? -1 : 0; + } + break; + case INT16_TYPE: + for (int i = 0; i < 8; i++) { + constant.v128.u16[i] = + constant.v128.u16[i] == other->constant.v128.u16[i] ? -1 : 0; + } + break; + case INT32_TYPE: + case FLOAT32_TYPE: + for (int i = 0; i < 4; i++) { + constant.v128.u32[i] = + constant.v128.u32[i] == other->constant.v128.u32[i] ? -1 : 0; + } + break; + case INT64_TYPE: + case FLOAT64_TYPE: + for (int i = 0; i < 2; i++) { + constant.v128.u64[i] = + constant.v128.u64[i] == other->constant.v128.u64[i] ? -1 : 0; + } + break; + default: + assert_unhandled_case(type); + break; + } +} + +void Value::VectorCompareSGT(Value* other, TypeName type) { + assert_true(this->type == VEC128_TYPE && other->type == VEC128_TYPE); + switch (type) { + case INT8_TYPE: + for (int i = 0; i < 16; i++) { + constant.v128.u8[i] = + constant.v128.i8[i] > other->constant.v128.i8[i] ? -1 : 0; + } + break; + case INT16_TYPE: + for (int i = 0; i < 8; i++) { + constant.v128.u16[i] = + constant.v128.i16[i] > other->constant.v128.i16[i] ? -1 : 0; + } + break; + case INT32_TYPE: + for (int i = 0; i < 4; i++) { + constant.v128.u32[i] = + constant.v128.i32[i] > other->constant.v128.i32[i] ? -1 : 0; + } + break; + case FLOAT32_TYPE: + for (int i = 0; i < 4; i++) { + constant.v128.u32[i] = + constant.v128.f32[i] > other->constant.v128.f32[i] ? -1 : 0; + } + break; + case INT64_TYPE: + for (int i = 0; i < 2; i++) { + constant.v128.u64[i] = + constant.v128.i64[i] > other->constant.v128.i64[i] ? 
-1 : 0; + } + break; + default: + assert_unhandled_case(type); + break; + } +} + +void Value::VectorConvertI2F(Value* other) { + assert_true(type == VEC128_TYPE); + for (int i = 0; i < 4; i++) { + constant.v128.f32[i] = (float)other->constant.v128.i32[i]; + } +} + +void Value::VectorConvertF2I(Value* other) { + assert_true(type == VEC128_TYPE); + for (int i = 0; i < 4; i++) { + constant.v128.i32[i] = (int32_t)other->constant.v128.f32[i]; + } +} + +void Value::VectorShl(Value* other, TypeName type) { + assert_true(this->type == VEC128_TYPE && other->type == VEC128_TYPE); + switch (type) { + case INT8_TYPE: + for (int i = 0; i < 16; i++) { + constant.v128.u8[i] <<= other->constant.v128.u8[i] & 0x7; + } + break; + case INT16_TYPE: + for (int i = 0; i < 8; i++) { + constant.v128.u16[i] <<= other->constant.v128.u16[i] & 0xF; + } + break; + case INT32_TYPE: + for (int i = 0; i < 4; i++) { + constant.v128.u32[i] <<= other->constant.v128.u32[i] & 0x1F; + } + break; + default: + assert_unhandled_case(type); + break; + } +} + +void Value::VectorShr(Value* other, TypeName type) { + assert_true(this->type == VEC128_TYPE && other->type == VEC128_TYPE); + switch (type) { + case INT8_TYPE: + for (int i = 0; i < 16; i++) { + constant.v128.u8[i] >>= other->constant.v128.u8[i] & 0x7; + } + break; + case INT16_TYPE: + for (int i = 0; i < 8; i++) { + constant.v128.u16[i] >>= other->constant.v128.u16[i] & 0xF; + } + break; + case INT32_TYPE: + for (int i = 0; i < 4; i++) { + constant.v128.u32[i] >>= other->constant.v128.u32[i] & 0x1F; + } + break; + default: + assert_unhandled_case(type); + break; + } +} + +void Value::VectorRol(Value* other, TypeName type) { + assert_true(this->type == VEC128_TYPE && other->type == VEC128_TYPE); + switch (type) { + case INT8_TYPE: + for (int i = 0; i < 16; i++) { + constant.v128.u8[i] = xe::rotate_left(constant.v128.u8[i], + other->constant.v128.i8[i] & 0x7); + } + break; + case INT16_TYPE: + for (int i = 0; i < 8; i++) { + constant.v128.u16[i] = xe::rotate_left( + constant.v128.u16[i], other->constant.v128.u16[i] & 0xF); + } + break; + case INT32_TYPE: + for (int i = 0; i < 4; i++) { + constant.v128.u32[i] = xe::rotate_left( + constant.v128.u32[i], other->constant.v128.u32[i] & 0x1F); + } + break; + default: + assert_unhandled_case(type); + break; + } +} + +void Value::VectorSub(Value* other, TypeName type, bool is_unsigned, + bool saturate) { + assert_true(this->type == VEC128_TYPE && other->type == VEC128_TYPE); + switch (type) { + case INT32_TYPE: + for (int i = 0; i < 4; i++) { + if (is_unsigned) { + if (saturate) { + assert_always(); + } else { + constant.v128.u32[i] -= other->constant.v128.u32[i]; + } + } else { + if (saturate) { + assert_always(); + } else { + constant.v128.i32[i] -= other->constant.v128.i32[i]; + } + } + } + } +} + void Value::ByteSwap() { switch (type) { case INT8_TYPE: diff --git a/src/xenia/cpu/hir/value.h b/src/xenia/cpu/hir/value.h index c078983bb..d797f27d7 100644 --- a/src/xenia/cpu/hir/value.h +++ b/src/xenia/cpu/hir/value.h @@ -77,9 +77,13 @@ class Value { } Use; typedef union { int8_t i8; + uint8_t u8; int16_t i16; + uint16_t u16; int32_t i32; + uint32_t u32; int64_t i64; + uint64_t u64; float f32; double f64; vec128_t v128; @@ -190,6 +194,8 @@ class Value { return !!constant.f32; case FLOAT64_TYPE: return !!constant.f64; + case VEC128_TYPE: + return constant.v128.low || constant.v128.high; default: assert_unhandled_case(type); return false; @@ -199,9 +205,6 @@ class Value { } } bool IsConstantFalse() const { - if (type == VEC128_TYPE) { - 
assert_always(); - } if (flags & VALUE_IS_CONSTANT) { switch (type) { case INT8_TYPE: @@ -216,6 +219,8 @@ class Value { return !constant.f32; case FLOAT64_TYPE: return !constant.f64; + case VEC128_TYPE: + return !(constant.v128.low || constant.v128.high); default: assert_unhandled_case(type); return false; @@ -475,6 +480,7 @@ class Value { void Mul(Value* other); void MulHi(Value* other, bool is_unsigned); void Div(Value* other, bool is_unsigned); + void Max(Value* other); static void MulAdd(Value* dest, Value* value1, Value* value2, Value* value3); static void MulSub(Value* dest, Value* value1, Value* value2, Value* value3); void Neg(); @@ -488,6 +494,17 @@ class Value { void Shl(Value* other); void Shr(Value* other); void Sha(Value* other); + void Extract(Value* vec, Value* index); + void Select(Value* other, Value* ctrl); + void Splat(Value* other); + void VectorCompareEQ(Value* other, TypeName type); + void VectorCompareSGT(Value* other, TypeName type); + void VectorConvertI2F(Value* other); + void VectorConvertF2I(Value* other); + void VectorShl(Value* other, TypeName type); + void VectorShr(Value* other, TypeName type); + void VectorRol(Value* other, TypeName type); + void VectorSub(Value* other, TypeName type, bool is_unsigned, bool saturate); void ByteSwap(); void CountLeadingZeros(const Value* other); bool Compare(Opcode opcode, Value* other); diff --git a/src/xenia/cpu/ppc/ppc_context.h b/src/xenia/cpu/ppc/ppc_context.h index b37f4cda4..9c96daa6b 100644 --- a/src/xenia/cpu/ppc/ppc_context.h +++ b/src/xenia/cpu/ppc/ppc_context.h @@ -423,8 +423,8 @@ typedef struct PPCContext_s { uint8_t* physical_membase; - // Keep the struct padded out to 64b total. - uint8_t _padding[8]; + // Value of last reserved load + uint64_t reserved_val; static std::string GetRegisterName(PPCRegister reg); std::string GetStringFromValue(PPCRegister reg) const; diff --git a/src/xenia/cpu/ppc/ppc_emit_altivec.cc b/src/xenia/cpu/ppc/ppc_emit_altivec.cc index c0c067d31..f2fc1330f 100644 --- a/src/xenia/cpu/ppc/ppc_emit_altivec.cc +++ b/src/xenia/cpu/ppc/ppc_emit_altivec.cc @@ -2149,6 +2149,9 @@ int InstrEmit_vupkd3d128(PPCHIRBuilder& f, const InstrData& i) { case 3: // VPACK_... 2 FLOAT16s DXGI_FORMAT_R16G16_FLOAT v = f.Unpack(v, PACK_TYPE_FLOAT16_2); break; + case 4: + v = f.Unpack(v, PACK_TYPE_FLOAT16_3); + break; case 5: // VPACK_... 4 FLOAT16s DXGI_FORMAT_R16G16B16A16_FLOAT v = f.Unpack(v, PACK_TYPE_FLOAT16_4); break; diff --git a/src/xenia/cpu/ppc/ppc_emit_control.cc b/src/xenia/cpu/ppc/ppc_emit_control.cc index a44644193..0bd9cbd2e 100644 --- a/src/xenia/cpu/ppc/ppc_emit_control.cc +++ b/src/xenia/cpu/ppc/ppc_emit_control.cc @@ -10,6 +10,7 @@ #include "xenia/cpu/ppc/ppc_emit-private.h" #include "xenia/base/assert.h" +#include "xenia/cpu/cpu_flags.h" #include "xenia/cpu/ppc/ppc_context.h" #include "xenia/cpu/ppc/ppc_frontend.h" #include "xenia/cpu/ppc/ppc_hir_builder.h" @@ -725,10 +726,14 @@ int InstrEmit_mtmsr(PPCHIRBuilder& f, const InstrData& i) { f.ZeroExtend(f.ZeroExtend(f.LoadGPR(i.X.RT), INT64_TYPE), INT64_TYPE)); if (i.X.RT == 13) { // iff storing from r13 we are taking a lock (disable interrupts). - f.CallExtern(f.builtins()->enter_global_lock); + if (!FLAGS_disable_global_lock) { + f.CallExtern(f.builtins()->enter_global_lock); + } } else { // Otherwise we are restoring interrupts (probably). 
- f.CallExtern(f.builtins()->leave_global_lock); + if (!FLAGS_disable_global_lock) { + f.CallExtern(f.builtins()->leave_global_lock); + } } return 0; } else { @@ -746,10 +751,14 @@ int InstrEmit_mtmsrd(PPCHIRBuilder& f, const InstrData& i) { f.ZeroExtend(f.LoadGPR(i.X.RT), INT64_TYPE)); if (i.X.RT == 13) { // iff storing from r13 we are taking a lock (disable interrupts). - f.CallExtern(f.builtins()->enter_global_lock); + if (!FLAGS_disable_global_lock) { + f.CallExtern(f.builtins()->enter_global_lock); + } } else { // Otherwise we are restoring interrupts (probably). - f.CallExtern(f.builtins()->leave_global_lock); + if (!FLAGS_disable_global_lock) { + f.CallExtern(f.builtins()->leave_global_lock); + } } return 0; } else { diff --git a/src/xenia/cpu/ppc/ppc_emit_memory.cc b/src/xenia/cpu/ppc/ppc_emit_memory.cc index 8749deb9a..e9294048e 100644 --- a/src/xenia/cpu/ppc/ppc_emit_memory.cc +++ b/src/xenia/cpu/ppc/ppc_emit_memory.cc @@ -658,6 +658,7 @@ int InstrEmit_ldarx(PPCHIRBuilder& f, const InstrData& i) { Value* ea = CalculateEA_0(f, i.X.RA, i.X.RB); Value* rt = f.ByteSwap(f.Load(ea, INT64_TYPE)); + f.StoreReserved(rt); f.StoreGPR(i.X.RT, rt); return 0; } @@ -682,6 +683,7 @@ int InstrEmit_lwarx(PPCHIRBuilder& f, const InstrData& i) { Value* ea = CalculateEA_0(f, i.X.RA, i.X.RB); Value* rt = f.ZeroExtend(f.ByteSwap(f.Load(ea, INT32_TYPE)), INT64_TYPE); + f.StoreReserved(rt); f.StoreGPR(i.X.RT, rt); return 0; } @@ -700,11 +702,15 @@ int InstrEmit_stdcx(PPCHIRBuilder& f, const InstrData& i) { // NOTE: we assume we are within a global lock. // As we have been exclusively executing this entire time, we assume that no // one else could have possibly touched the memory and must always succeed. + // We use atomic compare exchange here to support reserved load/store without + // being under the global lock (flag disable_global_lock - see mtmsr/mtmsrd). + // This will always succeed if under the global lock, however. Value* ea = CalculateEA_0(f, i.X.RA, i.X.RB); Value* rt = f.ByteSwap(f.LoadGPR(i.X.RT)); - f.Store(ea, rt); - f.StoreContext(offsetof(PPCContext, cr0.cr0_eq), f.LoadConstantInt8(1)); + Value* res = f.ByteSwap(f.LoadReserved()); + Value* v = f.AtomicCompareExchange(ea, res, rt); + f.StoreContext(offsetof(PPCContext, cr0.cr0_eq), v); f.StoreContext(offsetof(PPCContext, cr0.cr0_lt), f.LoadZeroInt8()); f.StoreContext(offsetof(PPCContext, cr0.cr0_gt), f.LoadZeroInt8()); @@ -729,11 +735,15 @@ int InstrEmit_stwcx(PPCHIRBuilder& f, const InstrData& i) { // NOTE: we assume we are within a global lock. // As we have been exclusively executing this entire time, we assume that no // one else could have possibly touched the memory and must always succeed. + // We use atomic compare exchange here to support reserved load/store without + // being under the global lock (flag disable_global_lock - see mtmsr/mtmsrd). + // This will always succeed if under the global lock, however. 
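
For context on the comment above: the reserved load records the value observed by lwarx/ldarx, and the conditional store succeeds only if memory still holds that value, which maps directly onto a host compare-exchange. A rough host-side sketch of that shape, using std::atomic and hypothetical helper names rather than Xenia's HIR builders (byte swapping of guest memory is ignored here):

#include <atomic>
#include <cstdint>

// Host-side model of the lwarx/stwcx. pair as emitted above: the "reservation"
// is just the value seen by the reserved load, and the conditional store is a
// single compare-exchange against it. Names here are illustrative only.
struct ReservationModel {
  uint32_t reserved_value = 0;

  uint32_t ReservedLoad(const std::atomic<uint32_t>& mem) {
    reserved_value = mem.load(std::memory_order_acquire);
    return reserved_value;
  }

  // Returns true (cr0.eq = 1) if the store-conditional succeeded.
  bool ConditionalStore(std::atomic<uint32_t>& mem, uint32_t new_value) {
    uint32_t expected = reserved_value;
    return mem.compare_exchange_strong(expected, new_value,
                                       std::memory_order_acq_rel);
  }
};
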
Value* ea = CalculateEA_0(f, i.X.RA, i.X.RB); Value* rt = f.ByteSwap(f.Truncate(f.LoadGPR(i.X.RT), INT32_TYPE)); - f.Store(ea, rt); - f.StoreContext(offsetof(PPCContext, cr0.cr0_eq), f.LoadConstantInt8(1)); + Value* res = f.ByteSwap(f.Truncate(f.LoadReserved(), INT32_TYPE)); + Value* v = f.AtomicCompareExchange(ea, res, rt); + f.StoreContext(offsetof(PPCContext, cr0.cr0_eq), v); f.StoreContext(offsetof(PPCContext, cr0.cr0_lt), f.LoadZeroInt8()); f.StoreContext(offsetof(PPCContext, cr0.cr0_gt), f.LoadZeroInt8()); diff --git a/src/xenia/cpu/ppc/ppc_hir_builder.cc b/src/xenia/cpu/ppc/ppc_hir_builder.cc index bce5d9d16..e18118d7b 100644 --- a/src/xenia/cpu/ppc/ppc_hir_builder.cc +++ b/src/xenia/cpu/ppc/ppc_hir_builder.cc @@ -511,6 +511,15 @@ void PPCHIRBuilder::StoreVR(uint32_t reg, Value* value) { trace_reg.value = value; } +void PPCHIRBuilder::StoreReserved(Value* val) { + assert_true(val->type == INT64_TYPE); + StoreContext(offsetof(PPCContext, reserved_val), val); +} + +Value* PPCHIRBuilder::LoadReserved() { + return LoadContext(offsetof(PPCContext, reserved_val), INT64_TYPE); +} + } // namespace ppc } // namespace cpu } // namespace xe diff --git a/src/xenia/cpu/ppc/ppc_hir_builder.h b/src/xenia/cpu/ppc/ppc_hir_builder.h index ca9830799..8b88ae35d 100644 --- a/src/xenia/cpu/ppc/ppc_hir_builder.h +++ b/src/xenia/cpu/ppc/ppc_hir_builder.h @@ -78,6 +78,9 @@ class PPCHIRBuilder : public hir::HIRBuilder { Value* LoadVR(uint32_t reg); void StoreVR(uint32_t reg, Value* value); + void StoreReserved(Value* val); + Value* LoadReserved(); + private: void AnnotateLabel(uint32_t address, Label* label); diff --git a/src/xenia/cpu/xex_module.cc b/src/xenia/cpu/xex_module.cc index ea54bd4fc..4d875a626 100644 --- a/src/xenia/cpu/xex_module.cc +++ b/src/xenia/cpu/xex_module.cc @@ -286,9 +286,6 @@ bool XexModule::Load(const std::string& name, const std::string& path, } // Setup memory protection. - // TODO: This introduces a load of constants into the JIT, and Xenia isn't - // quite set-up to handle constants yet... - /* auto sec_header = xex_security_info(); auto heap = memory()->LookupHeap(sec_header->load_address); auto page_size = heap->page_size(); @@ -311,7 +308,6 @@ bool XexModule::Load(const std::string& name, const std::string& path, page += desc.size; } - */ return true; } diff --git a/src/xenia/gpu/glsl_shader_translator.cc b/src/xenia/gpu/glsl_shader_translator.cc index 397dd3a63..3a891316d 100644 --- a/src/xenia/gpu/glsl_shader_translator.cc +++ b/src/xenia/gpu/glsl_shader_translator.cc @@ -535,6 +535,14 @@ void GlslShaderTranslator::ProcessVertexFetchInstruction( EmitSource("// "); instr.Disassemble(&source_); + if (instr.operands[0].storage_index != 0) { + // Unimplemented for now. + EmitUnimplementedTranslationError(); + EmitSourceDepth("pv.xyzw = vec4(0.0, 0.0, 0.0, 0.0);\n"); + EmitStoreVectorResult(instr.result); + return; + } + if (instr.is_predicated) { EmitSourceDepth("if (%cp0) {\n", instr.predicate_condition ? ' ' : '!'); Indent(); diff --git a/src/xenia/kernel/xboxkrnl/xboxkrnl_threading.cc b/src/xenia/kernel/xboxkrnl/xboxkrnl_threading.cc index e9db2fc31..2d7b935bb 100644 --- a/src/xenia/kernel/xboxkrnl/xboxkrnl_threading.cc +++ b/src/xenia/kernel/xboxkrnl/xboxkrnl_threading.cc @@ -1251,22 +1251,20 @@ pointer_result_t InterlockedPushEntrySList( assert_not_null(plist_ptr); assert_not_null(entry); - // Hold a global lock during this method. Once in the lock we assume we have - // exclusive access to the structure. 
-  auto global_lock = xe::global_critical_region::AcquireDirect();
-
   alignas(8) X_SLIST_HEADER old_hdr = *plist_ptr;
   alignas(8) X_SLIST_HEADER new_hdr = {0};
-  new_hdr.depth = old_hdr.depth + 1;
-  new_hdr.sequence = old_hdr.sequence + 1;
+  uint32_t old_head = 0;
+  do {
+    old_hdr = *plist_ptr;
+    new_hdr.depth = old_hdr.depth + 1;
+    new_hdr.sequence = old_hdr.sequence + 1;
 
-  uint32_t old_head = old_hdr.next.next;
-  entry->next = old_hdr.next.next;
-  new_hdr.next.next = entry.guest_address();
-
-  *reinterpret_cast<uint64_t*>(plist_ptr.host_address()) =
-      *reinterpret_cast<uint64_t*>(&new_hdr);
-  xe::threading::SyncMemory();
+    old_head = old_hdr.next.next;
+    entry->next = old_hdr.next.next;
+    new_hdr.next.next = entry.guest_address();
+  } while (
+      !xe::atomic_cas(*(uint64_t*)(&old_hdr), *(uint64_t*)(&new_hdr),
+                      reinterpret_cast<uint64_t*>(plist_ptr.host_address())));
 
   return old_head;
 }
@@ -1276,28 +1274,24 @@ DECLARE_XBOXKRNL_EXPORT(InterlockedPushEntrySList,
 
 pointer_result_t InterlockedPopEntrySList(pointer_t<X_SLIST_HEADER> plist_ptr) {
   assert_not_null(plist_ptr);
 
-  // Hold a global lock during this method. Once in the lock we assume we have
-  // exclusive access to the structure.
-  auto global_lock = xe::global_critical_region::AcquireDirect();
-
   uint32_t popped = 0;
-
-  alignas(8) X_SLIST_HEADER old_hdr = *plist_ptr;
+  alignas(8) X_SLIST_HEADER old_hdr = {0};
   alignas(8) X_SLIST_HEADER new_hdr = {0};
-  auto next = kernel_memory()->TranslateVirtual<X_SINGLE_LIST_ENTRY*>(
-      old_hdr.next.next);
-  if (!old_hdr.next.next) {
-    return 0;
-  }
-  popped = old_hdr.next.next;
+  do {
+    old_hdr = *plist_ptr;
+    auto next = kernel_memory()->TranslateVirtual<X_SINGLE_LIST_ENTRY*>(
+        old_hdr.next.next);
+    if (!old_hdr.next.next) {
+      return 0;
+    }
+    popped = old_hdr.next.next;
 
-  new_hdr.depth = old_hdr.depth - 1;
-  new_hdr.next.next = next->next;
-  new_hdr.sequence = old_hdr.sequence;
-
-  *reinterpret_cast<uint64_t*>(plist_ptr.host_address()) =
-      *reinterpret_cast<uint64_t*>(&new_hdr);
-  xe::threading::SyncMemory();
+    new_hdr.depth = old_hdr.depth - 1;
+    new_hdr.next.next = next->next;
+    new_hdr.sequence = old_hdr.sequence;
+  } while (
+      !xe::atomic_cas(*(uint64_t*)(&old_hdr), *(uint64_t*)(&new_hdr),
+                      reinterpret_cast<uint64_t*>(plist_ptr.host_address())));
 
   return popped;
 }
@@ -1307,20 +1301,18 @@ DECLARE_XBOXKRNL_EXPORT(InterlockedPopEntrySList,
 
 pointer_result_t InterlockedFlushSList(pointer_t<X_SLIST_HEADER> plist_ptr) {
   assert_not_null(plist_ptr);
 
-  // Hold a global lock during this method. Once in the lock we assume we have
-  // exclusive access to the structure.
-  auto global_lock = xe::global_critical_region::AcquireDirect();
-
   alignas(8) X_SLIST_HEADER old_hdr = *plist_ptr;
   alignas(8) X_SLIST_HEADER new_hdr = {0};
-  uint32_t first = old_hdr.next.next;
-  new_hdr.next.next = 0;
-  new_hdr.depth = 0;
-  new_hdr.sequence = 0;
-
-  *reinterpret_cast<uint64_t*>(plist_ptr.host_address()) =
-      *reinterpret_cast<uint64_t*>(&new_hdr);
-  xe::threading::SyncMemory();
+  uint32_t first = 0;
+  do {
+    old_hdr = *plist_ptr;
+    first = old_hdr.next.next;
+    new_hdr.next.next = 0;
+    new_hdr.depth = 0;
+    new_hdr.sequence = 0;
+  } while (
+      !xe::atomic_cas(*(uint64_t*)(&old_hdr), *(uint64_t*)(&new_hdr),
+                      reinterpret_cast<uint64_t*>(plist_ptr.host_address())));
 
   return first;
 }
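
The SLIST rework above follows the usual lock-free header-swap shape: snapshot the 64-bit header, build the replacement, and retry the compare-exchange until it publishes. Below is a standalone sketch of that retry loop under a simplified, assumed header layout; it is illustrative C++ only, not Xenia's X_SLIST_HEADER or its atomic_cas helper.

#include <atomic>
#include <cstdint>
#include <cstring>

// Simplified 32-bit-guest-style list header packed into 64 bits so the whole
// header can be swapped with a single CAS (illustrative layout only).
struct SListHeader {
  uint32_t next;  // guest address of first entry (0 == empty)
  uint16_t depth;
  uint16_t sequence;
};
static_assert(sizeof(SListHeader) == 8, "header must fit in one CAS");

inline uint64_t Pack(const SListHeader& h) {
  uint64_t bits;
  std::memcpy(&bits, &h, sizeof(bits));
  return bits;
}

inline SListHeader Unpack(uint64_t bits) {
  SListHeader h;
  std::memcpy(&h, &bits, sizeof(h));
  return h;
}

// Pushes one entry and returns the previous head, mirroring the retry loop
// used by the push/pop/flush exports above.
inline uint32_t PushEntry(std::atomic<uint64_t>& header_bits,
                          uint32_t entry_address,
                          uint32_t* entry_next_field) {
  uint64_t old_bits = header_bits.load(std::memory_order_acquire);
  for (;;) {
    SListHeader old_hdr = Unpack(old_bits);
    SListHeader new_hdr = old_hdr;
    new_hdr.depth = static_cast<uint16_t>(old_hdr.depth + 1);
    new_hdr.sequence = static_cast<uint16_t>(old_hdr.sequence + 1);
    *entry_next_field = old_hdr.next;  // link the new entry to the old head
    new_hdr.next = entry_address;
    if (header_bits.compare_exchange_weak(old_bits, Pack(new_hdr),
                                          std::memory_order_acq_rel)) {
      return old_hdr.next;
    }
    // On failure old_bits now holds the refreshed header; retry.
  }
}
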