OPCODE_VECTOR_SHA and SPLAT_I16 for non-AVX2 CPUs

This commit is contained in:
Dr. Chat 2015-05-05 14:34:24 -05:00
parent 3249f84700
commit cff09a4509
1 changed files with 41 additions and 9 deletions

View File

@ -4901,6 +4901,16 @@ EMITTER(VECTOR_SHA_V128, MATCH(I<OPCODE_VECTOR_SHA, V128<>, V128<>, V128<>>)) {
} }
return _mm_load_si128(reinterpret_cast<__m128i*>(value)); return _mm_load_si128(reinterpret_cast<__m128i*>(value));
} }
// Software fallback that right-shifts each of the four signed 32-bit lanes of
// src1 by the count held in the matching lane of src2.
//
// The first parameter is unnamed and unused here (presumably a context
// pointer supplied by the native-call thunk -- confirm against the caller).
// Only the low 5 bits of each count are honoured, so every shift amount is
// in the range 0..31. Returns the four shifted lanes repacked into a vector.
static __m128i EmulateVectorShaI32(void*, __m128i src1, __m128i src2) {
  // Spill both vectors to 16-byte aligned scratch so lanes can be indexed.
  alignas(16) int32_t lanes[4];
  alignas(16) int32_t counts[4];
  _mm_store_si128(reinterpret_cast<__m128i*>(lanes), src1);
  _mm_store_si128(reinterpret_cast<__m128i*>(counts), src2);

  for (size_t lane = 0; lane != 4; lane++) {
    // Mask first: a count of 32 or more would otherwise be an invalid shift.
    const int32_t count = counts[lane] & 0x1F;
    lanes[lane] >>= count;
  }

  return _mm_load_si128(reinterpret_cast<__m128i*>(lanes));
}
static void Emit(X64Emitter& e, const EmitArgType& i) { static void Emit(X64Emitter& e, const EmitArgType& i) {
switch (i.instr->flags) { switch (i.instr->flags) {
case INT8_TYPE: case INT8_TYPE:
@ -4928,15 +4938,29 @@ EMITTER(VECTOR_SHA_V128, MATCH(I<OPCODE_VECTOR_SHA, V128<>, V128<>, V128<>>)) {
e.vmovaps(i.dest, e.xmm0); e.vmovaps(i.dest, e.xmm0);
break; break;
case INT32_TYPE: case INT32_TYPE:
// src shift mask may have values >31, and x86 sets to zero when if (e.cpu()->has(Xbyak::util::Cpu::tAVX2)) {
// that happens so we mask. // src shift mask may have values >31, and x86 sets to zero when
if (i.src2.is_constant) { // that happens so we mask.
e.LoadConstantXmm(e.xmm0, i.src2.constant()); if (i.src2.is_constant) {
e.vandps(e.xmm0, e.GetXmmConstPtr(XMMShiftMaskPS)); e.LoadConstantXmm(e.xmm0, i.src2.constant());
e.vandps(e.xmm0, e.GetXmmConstPtr(XMMShiftMaskPS));
} else {
e.vandps(e.xmm0, i.src2, e.GetXmmConstPtr(XMMShiftMaskPS));
}
e.vpsravd(i.dest, i.src1, e.xmm0);
} else { } else {
e.vandps(e.xmm0, i.src2, e.GetXmmConstPtr(XMMShiftMaskPS)); // Emulated for now...
// TODO: Native version
if (i.src2.is_constant) {
e.LoadConstantXmm(e.xmm0, i.src2.constant());
e.lea(e.r9, e.StashXmm(1, e.xmm0));
} else {
e.lea(e.r9, e.StashXmm(1, i.src2));
}
e.lea(e.r8, e.StashXmm(0, i.src1));
e.CallNativeSafe(reinterpret_cast<void*>(EmulateVectorShaI32));
e.vmovaps(i.dest, e.xmm0);
} }
e.vpsravd(i.dest, i.src1, e.xmm0);
break; break;
default: default:
assert_always(); assert_always();
@ -5475,8 +5499,16 @@ EMITTER(SPLAT_I16, MATCH(I<OPCODE_SPLAT, V128<>, I16<>>)) {
e.vpbroadcastw(i.dest, e.xmm0); e.vpbroadcastw(i.dest, e.xmm0);
} }
} else { } else {
// TODO if (i.src1.is_constant) {
e.DebugBreak(); e.mov(e.eax, i.src1.constant());
e.movd(e.xmm0, e.eax);
} else {
e.movd(e.xmm0, i.src1.reg().cvt32());
}
// Credits: VC++ compiler (i love you so much)
e.punpcklwd(e.xmm0, e.xmm0); // unpack low word data
e.pshufd(i.dest, e.xmm0, 0);
} }
} }
}; };