Emulated vector shr

This commit is contained in:
Dr. Chat 2015-05-05 15:58:51 -05:00
parent abf97ea44d
commit 26d8858684
1 changed files with 40 additions and 11 deletions

View File

@ -4864,6 +4864,16 @@ EMITTER(VECTOR_SHR_V128, MATCH(I<OPCODE_VECTOR_SHR, V128<>, V128<>, V128<>>)) {
e.CallNativeSafe(reinterpret_cast<void*>(EmulateVectorShrI16)); e.CallNativeSafe(reinterpret_cast<void*>(EmulateVectorShrI16));
e.vmovaps(i.dest, e.xmm0); e.vmovaps(i.dest, e.xmm0);
} }
// Software fallback for a per-lane logical right shift of four 32-bit lanes.
//
// Each 32-bit lane of src1 is shifted right (zero-filling) by the amount held
// in the corresponding 32-bit lane of src2. Only the low five bits of every
// shift amount are honoured, so amounts are effectively taken modulo 32.
//
// The first (unnamed) parameter is unused here; presumably it is the context
// pointer supplied by the native-call thunk -- confirm against CallNativeSafe.
//
// The scratch buffers must be exactly 16 bytes (4 x uint32_t): the SSE
// store/load below move a full 128-bit register, and the lane width has to
// match the I32 element size for the shift to be applied per 32-bit element.
static __m128i EmulateVectorShrI32(void*, __m128i src1, __m128i src2) {
  alignas(16) uint32_t value[4];
  alignas(16) uint32_t shamt[4];
  _mm_store_si128(reinterpret_cast<__m128i*>(value), src1);
  _mm_store_si128(reinterpret_cast<__m128i*>(shamt), src2);
  for (size_t i = 0; i < 4; ++i) {
    // Unsigned operand => logical (zero-filling) shift.
    value[i] = value[i] >> (shamt[i] & 0x1F);
  }
  return _mm_load_si128(reinterpret_cast<__m128i*>(value));
}
static void EmitInt32(X64Emitter& e, const EmitArgType& i) { static void EmitInt32(X64Emitter& e, const EmitArgType& i) {
if (i.src2.is_constant) { if (i.src2.is_constant) {
const auto& shamt = i.src2.constant(); const auto& shamt = i.src2.constant();
@ -4877,7 +4887,9 @@ EMITTER(VECTOR_SHR_V128, MATCH(I<OPCODE_VECTOR_SHR, V128<>, V128<>, V128<>>)) {
if (all_same) { if (all_same) {
// Every count is the same, so we can use vpsrld. // Every count is the same, so we can use vpsrld.
e.vpsrld(i.dest, i.src1, shamt.u8[0] & 0x1F); e.vpsrld(i.dest, i.src1, shamt.u8[0] & 0x1F);
return;
} else { } else {
if (e.cpu()->has(Xbyak::util::Cpu::tAVX2)) {
// Counts differ, so pre-mask and load constant. // Counts differ, so pre-mask and load constant.
vec128_t masked = i.src2.constant(); vec128_t masked = i.src2.constant();
for (size_t n = 0; n < 4; ++n) { for (size_t n = 0; n < 4; ++n) {
@ -4885,15 +4897,32 @@ EMITTER(VECTOR_SHR_V128, MATCH(I<OPCODE_VECTOR_SHR, V128<>, V128<>, V128<>>)) {
} }
e.LoadConstantXmm(e.xmm0, masked); e.LoadConstantXmm(e.xmm0, masked);
e.vpsrlvd(i.dest, i.src1, e.xmm0); e.vpsrlvd(i.dest, i.src1, e.xmm0);
return;
}
} }
} else { } else {
if (e.cpu()->has(Xbyak::util::Cpu::tAVX2)) {
// Fully variable shift. // Fully variable shift.
// src shift mask may have values >31, and x86 sets to zero when // src shift mask may have values >31, and x86 sets to zero when
// that happens so we mask. // that happens so we mask.
e.vandps(e.xmm0, i.src2, e.GetXmmConstPtr(XMMShiftMaskPS)); e.vandps(e.xmm0, i.src2, e.GetXmmConstPtr(XMMShiftMaskPS));
e.vpsrlvd(i.dest, i.src1, e.xmm0); e.vpsrlvd(i.dest, i.src1, e.xmm0);
return;
} }
} }
// We've reached here if we don't have AVX2 and it's a variable shift
// TODO: native version
if (i.src2.is_constant) {
e.LoadConstantXmm(e.xmm0, i.src2.constant());
e.lea(e.r9, e.StashXmm(1, e.xmm0));
} else {
e.lea(e.r9, e.StashXmm(1, i.src2));
}
e.lea(e.r8, e.StashXmm(0, i.src1));
e.CallNativeSafe(reinterpret_cast<void*>(EmulateVectorShrI32));
e.vmovaps(i.dest, e.xmm0);
}
}; };
EMITTER_OPCODE_TABLE( EMITTER_OPCODE_TABLE(
OPCODE_VECTOR_SHR, OPCODE_VECTOR_SHR,