From cb127ae9ba51be9007d1c3ad2b54196bc5711bb2 Mon Sep 17 00:00:00 2001
From: "Dr. Chat"
Date: Tue, 5 May 2015 15:34:13 -0500
Subject: [PATCH] Emulate some vector arithmetic opcodes (until we get a native implementation)

---
 src/xenia/cpu/backend/x64/x64_sequences.cc | 109 ++++++++++++++-------
 1 file changed, 75 insertions(+), 34 deletions(-)

diff --git a/src/xenia/cpu/backend/x64/x64_sequences.cc b/src/xenia/cpu/backend/x64/x64_sequences.cc
index c79528ef3..90104cf1d 100644
--- a/src/xenia/cpu/backend/x64/x64_sequences.cc
+++ b/src/xenia/cpu/backend/x64/x64_sequences.cc
@@ -4726,34 +4726,57 @@ EMITTER(VECTOR_SHL_V128, MATCH(I<OPCODE_VECTOR_SHL, V128<>, V128<>, V128<>>)) {
     e.CallNativeSafe(reinterpret_cast<void*>(EmulateVectorShlI16));
     e.vmovaps(i.dest, e.xmm0);
   }
+  static __m128i EmulateVectorShlI32(void*, __m128i src1, __m128i src2) {
+    alignas(16) uint32_t value[4];
+    alignas(16) uint32_t shamt[4];
+    _mm_store_si128(reinterpret_cast<__m128i*>(value), src1);
+    _mm_store_si128(reinterpret_cast<__m128i*>(shamt), src2);
+    for (size_t i = 0; i < 4; ++i) {
+      value[i] = value[i] << (shamt[i] & 0x1F);
+    }
+    return _mm_load_si128(reinterpret_cast<__m128i*>(value));
+  }
   static void EmitInt32(X64Emitter& e, const EmitArgType& i) {
-    if (i.src2.is_constant) {
-      const auto& shamt = i.src2.constant();
-      bool all_same = true;
-      for (size_t n = 0; n < 4 - n; ++n) {
-        if (shamt.u32[n] != shamt.u32[n + 1]) {
-          all_same = false;
-          break;
+    if (e.cpu()->has(Xbyak::util::Cpu::tAVX2)) {
+      if (i.src2.is_constant) {
+        const auto& shamt = i.src2.constant();
+        bool all_same = true;
+        for (size_t n = 0; n < 4 - n; ++n) {
+          if (shamt.u32[n] != shamt.u32[n + 1]) {
+            all_same = false;
+            break;
+          }
+        }
+        if (all_same) {
+          // Every count is the same, so we can use vpslld.
+          e.vpslld(i.dest, i.src1, shamt.u8[0] & 0x1F);
+        } else {
+          // Counts differ, so pre-mask and load constant.
+          vec128_t masked = i.src2.constant();
+          for (size_t n = 0; n < 4; ++n) {
+            masked.u32[n] &= 0x1F;
+          }
+          e.LoadConstantXmm(e.xmm0, masked);
+          e.vpsllvd(i.dest, i.src1, e.xmm0);
         }
-      }
-      if (all_same) {
-        // Every count is the same, so we can use vpslld.
-        e.vpslld(i.dest, i.src1, shamt.u8[0] & 0x1F);
       } else {
-        // Counts differ, so pre-mask and load constant.
-        vec128_t masked = i.src2.constant();
-        for (size_t n = 0; n < 4; ++n) {
-          masked.u32[n] &= 0x1F;
-        }
-        e.LoadConstantXmm(e.xmm0, masked);
+        // Fully variable shift.
+        // src shift mask may have values >31, and x86 sets to zero when
+        // that happens so we mask.
+        e.vandps(e.xmm0, i.src2, e.GetXmmConstPtr(XMMShiftMaskPS));
         e.vpsllvd(i.dest, i.src1, e.xmm0);
       }
     } else {
-      // Fully variable shift.
-      // src shift mask may have values >31, and x86 sets to zero when
-      // that happens so we mask.
-      e.vandps(e.xmm0, i.src2, e.GetXmmConstPtr(XMMShiftMaskPS));
-      e.vpsllvd(i.dest, i.src1, e.xmm0);
+      // TODO(benvanik): native version (with shift magic).
+      if (i.src2.is_constant) {
+        e.LoadConstantXmm(e.xmm0, i.src2.constant());
+        e.lea(e.r9, e.StashXmm(1, e.xmm0));
+      } else {
+        e.lea(e.r9, e.StashXmm(1, i.src2));
+      }
+      e.lea(e.r8, e.StashXmm(0, i.src1));
+      e.CallNativeSafe(reinterpret_cast<void*>(EmulateVectorShlI32));
+      e.vmovaps(i.dest, e.xmm0);
     }
   }
 };
@@ -5058,6 +5081,16 @@ EMITTER(VECTOR_ROTATE_LEFT_V128, MATCH(I<OPCODE_VECTOR_ROTATE_LEFT, V128<>, V128
     }
     return _mm_load_si128(reinterpret_cast<__m128i*>(value));
   }
+  static __m128i EmulateVectorRotateLeftI32(void*, __m128i src1, __m128i src2) {
+    alignas(16) uint32_t value[4];
+    alignas(16) uint32_t shamt[4];
+    _mm_store_si128(reinterpret_cast<__m128i*>(value), src1);
+    _mm_store_si128(reinterpret_cast<__m128i*>(shamt), src2);
+    for (size_t i = 0; i < 4; ++i) {
+      value[i] = xe::rotate_left<uint32_t>(value[i], shamt[i] & 0x1F);
+    }
+    return _mm_load_si128(reinterpret_cast<__m128i*>(value));
+  }
   static void Emit(X64Emitter& e, const EmitArgType& i) {
     switch (i.instr->flags) {
       case INT8_TYPE:
@@ -5075,19 +5108,27 @@ EMITTER(VECTOR_ROTATE_LEFT_V128, MATCH(I<OPCODE_VECTOR_ROTATE_LEFT, V128<>, V128
         e.vmovaps(i.dest, e.xmm0);
         break;
       case INT32_TYPE: {
-        Xmm temp = i.dest;
-        if (i.dest == i.src1 || i.dest == i.src2) {
-          temp = e.xmm2;
+        if (e.cpu()->has(Xbyak::util::Cpu::tAVX2)) {
+          Xmm temp = i.dest;
+          if (i.dest == i.src1 || i.dest == i.src2) {
+            temp = e.xmm2;
+          }
+          // Shift left (to get high bits):
+          e.vpand(e.xmm0, i.src2, e.GetXmmConstPtr(XMMShiftMaskPS));
+          e.vpsllvd(e.xmm1, i.src1, e.xmm0);
+          // Shift right (to get low bits):
+          e.vmovaps(temp, e.GetXmmConstPtr(XMMPI32));
+          e.vpsubd(temp, e.xmm0);
+          e.vpsrlvd(i.dest, i.src1, temp);
+          // Merge:
+          e.vpor(i.dest, e.xmm1);
+        } else {
+          // TODO: Non-AVX2 native version
+          e.lea(e.r8, e.StashXmm(0, i.src1));
+          e.lea(e.r9, e.StashXmm(1, i.src2));
+          e.CallNativeSafe(reinterpret_cast<void*>(EmulateVectorRotateLeftI32));
+          e.vmovaps(i.dest, e.xmm0);
         }
-        // Shift left (to get high bits):
-        e.vpand(e.xmm0, i.src2, e.GetXmmConstPtr(XMMShiftMaskPS));
-        e.vpsllvd(e.xmm1, i.src1, e.xmm0);
-        // Shift right (to get low bits):
-        e.vmovaps(temp, e.GetXmmConstPtr(XMMPI32));
-        e.vpsubd(temp, e.xmm0);
-        e.vpsrlvd(i.dest, i.src1, temp);
-        // Merge:
-        e.vpor(i.dest, e.xmm1);
         break;
       }
       default:
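
Note (editorial, not part of the patch): below is a minimal standalone sketch of the per-lane fallback logic that EmulateVectorShlI32 / EmulateVectorRotateLeftI32 implement, plus, when built with AVX2, a comparison against the shift-left / shift-right-by-(32 - count) / OR decomposition the AVX2 INT32_TYPE rotate path uses. The helper names (emulate_shl_i32, emulate_rotl_i32) are illustrative only, and _mm_set1_epi32(32) stands in for what XMMPI32 is assumed to hold (packed 32s); only the intrinsics and the low-five-bit masking come from the patch itself.

// sketch.cc - per-lane uint32 shift-left and rotate-left fallbacks (illustrative).
// Build: g++ -O2 sketch.cc          (scalar fallback only)
//        g++ -O2 -mavx2 sketch.cc   (also runs the AVX2 comparison)
#include <cstdint>
#include <cstdio>
#include <emmintrin.h>  // SSE2: _mm_load/_mm_store_si128, _mm_and_si128, ...
#ifdef __AVX2__
#include <immintrin.h>  // AVX2: _mm_sllv_epi32, _mm_srlv_epi32
#endif

// Scalar fallback: spill both vectors, shift each lane by (count & 31), reload.
static __m128i emulate_shl_i32(__m128i src1, __m128i src2) {
  alignas(16) uint32_t value[4];
  alignas(16) uint32_t shamt[4];
  _mm_store_si128(reinterpret_cast<__m128i*>(value), src1);
  _mm_store_si128(reinterpret_cast<__m128i*>(shamt), src2);
  for (size_t i = 0; i < 4; ++i) {
    value[i] <<= (shamt[i] & 0x1F);
  }
  return _mm_load_si128(reinterpret_cast<__m128i*>(value));
}

// Scalar rotate-left fallback; (32 - s) & 31 avoids the undefined shift by 32.
static __m128i emulate_rotl_i32(__m128i src1, __m128i src2) {
  alignas(16) uint32_t value[4];
  alignas(16) uint32_t shamt[4];
  _mm_store_si128(reinterpret_cast<__m128i*>(value), src1);
  _mm_store_si128(reinterpret_cast<__m128i*>(shamt), src2);
  for (size_t i = 0; i < 4; ++i) {
    uint32_t s = shamt[i] & 0x1F;
    value[i] = (value[i] << s) | (value[i] >> ((32 - s) & 0x1F));
  }
  return _mm_load_si128(reinterpret_cast<__m128i*>(value));
}

int main() {
  alignas(16) uint32_t a[4] = {0x80000001u, 0x12345678u, 0xFFFFFFFFu, 7u};
  alignas(16) uint32_t s[4] = {1u, 36u, 31u, 0u};  // 36 masks down to 4
  __m128i va = _mm_load_si128(reinterpret_cast<__m128i*>(a));
  __m128i vs = _mm_load_si128(reinterpret_cast<__m128i*>(s));

  alignas(16) uint32_t shl[4];
  alignas(16) uint32_t rot[4];
  _mm_store_si128(reinterpret_cast<__m128i*>(shl), emulate_shl_i32(va, vs));
  _mm_store_si128(reinterpret_cast<__m128i*>(rot), emulate_rotl_i32(va, vs));
  for (int i = 0; i < 4; ++i) {
    std::printf("lane %d: shl=%08X rotl=%08X\n", i, shl[i], rot[i]);
  }

#ifdef __AVX2__
  // AVX2 decomposition used by the INT32_TYPE rotate path: mask the counts,
  // shift left for the high bits, shift right by (32 - count) for the low
  // bits (a count of 32 yields zero with the variable-shift instructions),
  // then OR the halves together.
  __m128i ms = _mm_and_si128(vs, _mm_set1_epi32(0x1F));
  __m128i hi = _mm_sllv_epi32(va, ms);
  __m128i lo = _mm_srlv_epi32(va, _mm_sub_epi32(_mm_set1_epi32(32), ms));
  alignas(16) uint32_t r2[4];
  _mm_store_si128(reinterpret_cast<__m128i*>(r2), _mm_or_si128(hi, lo));
  for (int i = 0; i < 4; ++i) {
    std::printf("lane %d: avx2 rotl=%08X (%s)\n", i, r2[i],
                r2[i] == rot[i] ? "match" : "MISMATCH");
  }
#endif
  return 0;
}

The masking mirrors the emitter's behaviour: the patch notes that the source shift counts may exceed 31 while x86's variable shifts zero the result for counts of 32 or more, so both the AVX2 path (via the XMMShiftMaskPS AND) and the emulated helpers clamp each count to its low five bits before shifting.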