Emulate some vector arithmetic opcodes (until we get a native implementation)
parent 87811bbf2b
commit cb127ae9ba
@@ -4726,7 +4726,18 @@ EMITTER(VECTOR_SHL_V128, MATCH(I<OPCODE_VECTOR_SHL, V128<>, V128<>, V128<>>)) {
     e.CallNativeSafe(reinterpret_cast<void*>(EmulateVectorShlI16));
     e.vmovaps(i.dest, e.xmm0);
   }
+  static __m128i EmulateVectorShlI32(void*, __m128i src1, __m128i src2) {
+    alignas(16) uint32_t value[4];
+    alignas(16) uint32_t shamt[4];
+    _mm_store_si128(reinterpret_cast<__m128i*>(value), src1);
+    _mm_store_si128(reinterpret_cast<__m128i*>(shamt), src2);
+    for (size_t i = 0; i < 4; ++i) {
+      value[i] = value[i] << (shamt[i] & 0x1F);
+    }
+    return _mm_load_si128(reinterpret_cast<__m128i*>(value));
+  }
   static void EmitInt32(X64Emitter& e, const EmitArgType& i) {
+    if (e.cpu()->has(Xbyak::util::Cpu::tAVX2)) {
     if (i.src2.is_constant) {
       const auto& shamt = i.src2.constant();
       bool all_same = true;
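The helper added above is the scalar fallback for OPCODE_VECTOR_SHL on 32-bit lanes: store both operands to aligned scratch arrays, shift each lane left by the low 5 bits of the matching lane in src2, and load the result back. A minimal standalone check of those semantics (the main() harness and test values are illustrative, not part of the commit):

#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <emmintrin.h>  // SSE2

// Same body as the EmulateVectorShlI32 added in this commit.
static __m128i EmulateVectorShlI32(void*, __m128i src1, __m128i src2) {
  alignas(16) uint32_t value[4];
  alignas(16) uint32_t shamt[4];
  _mm_store_si128(reinterpret_cast<__m128i*>(value), src1);
  _mm_store_si128(reinterpret_cast<__m128i*>(shamt), src2);
  for (size_t i = 0; i < 4; ++i) {
    value[i] = value[i] << (shamt[i] & 0x1F);  // shift count modulo 32
  }
  return _mm_load_si128(reinterpret_cast<__m128i*>(value));
}

int main() {
  __m128i v = _mm_setr_epi32(1, 2, 3, 4);
  __m128i s = _mm_setr_epi32(0, 1, 33, 31);  // 33 & 0x1F == 1
  alignas(16) uint32_t out[4];
  _mm_store_si128(reinterpret_cast<__m128i*>(out),
                  EmulateVectorShlI32(nullptr, v, s));
  std::printf("%u %u %u %u\n", out[0], out[1], out[2], out[3]);
  // Prints: 1 4 6 0 (4 << 31 overflows the 32-bit lane to zero).
}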
@@ -4755,6 +4766,18 @@ EMITTER(VECTOR_SHL_V128, MATCH(I<OPCODE_VECTOR_SHL, V128<>, V128<>, V128<>>)) {
       e.vandps(e.xmm0, i.src2, e.GetXmmConstPtr(XMMShiftMaskPS));
       e.vpsllvd(i.dest, i.src1, e.xmm0);
     }
+    } else {
+      // TODO(benvanik): native version (with shift magic).
+      if (i.src2.is_constant) {
+        e.LoadConstantXmm(e.xmm0, i.src2.constant());
+        e.lea(e.r9, e.StashXmm(1, e.xmm0));
+      } else {
+        e.lea(e.r9, e.StashXmm(1, i.src2));
+      }
+      e.lea(e.r8, e.StashXmm(0, i.src1));
+      e.CallNativeSafe(reinterpret_cast<void*>(EmulateVectorShlI32));
+      e.vmovaps(i.dest, e.xmm0);
+    }
   }
 };
 EMITTER_OPCODE_TABLE(
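Two details in this hunk are worth noting. On the AVX2 path, the shift counts are ANDed with XMMShiftMaskPS before vpsllvd, because vpsllvd zeroes any lane whose count exceeds 31, while the guest semantics (matched by the scalar helper) take the count modulo 32. On the fallback path, both operands are spilled to stash slots whose addresses are loaded into r8 and r9, CallNativeSafe invokes the helper, and the result comes back in xmm0 and is copied to the destination. A hedged intrinsics sketch of the fast path's masking (ShlVarI32_AVX2 is an illustrative name, and it is assumed here that XMMShiftMaskPS holds 0x0000001F in every lane):

#include <immintrin.h>  // AVX2; compile with -mavx2 or equivalent

// Mirrors the vandps + vpsllvd pair the emitter generates on the AVX2 path.
static __m128i ShlVarI32_AVX2(__m128i src1, __m128i src2) {
  const __m128i mask = _mm_set1_epi32(0x1F);  // assumed XMMShiftMaskPS value
  __m128i shamt = _mm_and_si128(src2, mask);  // count modulo 32, per lane
  return _mm_sllv_epi32(src1, shamt);         // vpsllvd
}

Without the mask, a per-lane count of 33 would zero that lane under vpsllvd instead of shifting by 1, diverging from EmulateVectorShlI32.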
@@ -5058,6 +5081,16 @@ EMITTER(VECTOR_ROTATE_LEFT_V128, MATCH(I<OPCODE_VECTOR_ROTATE_LEFT, V128<>, V128<>, V128<>>)) {
     }
     return _mm_load_si128(reinterpret_cast<__m128i*>(value));
   }
+  static __m128i EmulateVectorRotateLeftI32(void*, __m128i src1, __m128i src2) {
+    alignas(16) uint32_t value[4];
+    alignas(16) uint32_t shamt[4];
+    _mm_store_si128(reinterpret_cast<__m128i*>(value), src1);
+    _mm_store_si128(reinterpret_cast<__m128i*>(shamt), src2);
+    for (size_t i = 0; i < 4; ++i) {
+      value[i] = xe::rotate_left<uint32_t>(value[i], shamt[i] & 0x1F);
+    }
+    return _mm_load_si128(reinterpret_cast<__m128i*>(value));
+  }
   static void Emit(X64Emitter& e, const EmitArgType& i) {
     switch (i.instr->flags) {
       case INT8_TYPE:
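EmulateVectorRotateLeftI32 follows the same store/loop/load pattern as the shift helper, delegating the per-lane work to xe::rotate_left. A plausible portable stand-in with the usual rotate semantics (a sketch under that assumption, not the actual definition from xenia's headers):

#include <cstdint>

// Assumed semantics of xe::rotate_left<T>: rotate value left by shift bits.
// Taking both shift counts modulo the type width keeps every shift strictly
// below the width, so shift == 0 never hits the undefined value >> 32 case.
template <typename T>
T rotate_left(T value, uint8_t shift) {
  constexpr unsigned bits = sizeof(T) * 8;
  const unsigned s = shift % bits;
  return static_cast<T>((value << s) | (value >> ((bits - s) % bits)));
}

For example, rotate_left<uint32_t>(0x80000001u, 1) yields 0x00000003: the high bit wraps around to bit 0. In the emulation loop the count is already masked with & 0x1F, so shamt[i] is always in [0, 31].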
@@ -5075,6 +5108,7 @@ EMITTER(VECTOR_ROTATE_LEFT_V128, MATCH(I<OPCODE_VECTOR_ROTATE_LEFT, V128<>, V128<>, V128<>>)) {
         e.vmovaps(i.dest, e.xmm0);
         break;
       case INT32_TYPE: {
+        if (e.cpu()->has(Xbyak::util::Cpu::tAVX2)) {
         Xmm temp = i.dest;
         if (i.dest == i.src1 || i.dest == i.src2) {
           temp = e.xmm2;
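The single added line gates the existing vpsllvd/vpsrlvd sequence behind an AVX2 check; the surrounding context shows the emitter reusing the destination register as scratch for intermediate values, falling back to e.xmm2 whenever the destination aliases src1 or src2, since writing intermediates into a register that still holds a live input would corrupt it. A scalar analogue of that aliasing guard (illustrative only; the function and names below are not from the source):

#include <cstdint>

// When out may alias in, build the partial results in a temporary (the
// emitter's e.xmm2) so the second read of *in still sees the original value.
static void Rotl32(uint32_t* out, const uint32_t* in, unsigned n) {
  uint32_t temp = *in << (n & 31);        // partial result goes to scratch
  temp |= *in >> ((32 - (n & 31)) % 32);  // *in is still intact here
  *out = temp;
}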
@@ -5088,6 +5122,13 @@ EMITTER(VECTOR_ROTATE_LEFT_V128, MATCH(I<OPCODE_VECTOR_ROTATE_LEFT, V128<>, V128<>, V128<>>)) {
         e.vpsrlvd(i.dest, i.src1, temp);
         // Merge:
         e.vpor(i.dest, e.xmm1);
+        } else {
+          // TODO: Non-AVX2 native version
+          e.lea(e.r8, e.StashXmm(0, i.src1));
+          e.lea(e.r9, e.StashXmm(1, i.src2));
+          e.CallNativeSafe(reinterpret_cast<void*>(EmulateVectorRotateLeftI32));
+          e.vmovaps(i.dest, e.xmm0);
+        }
         break;
       }
       default:
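For reference, the AVX2 fast path visible in the context lines composes the rotate from two variable shifts and an OR, per lane: rotl(x, n) = (x << n) | (x >> (32 - n)). A hedged intrinsics sketch of that composition (RotlVarI32_AVX2 is an illustrative name; constant handling and register allocation are simplified relative to the emitter):

#include <immintrin.h>  // AVX2; compile with -mavx2 or equivalent

// Per-lane rotate-left via vpsllvd + vpsrlvd + vpor. Counts are masked to
// [0, 31] first. Lanes with n == 0 still come out right: the right-shift
// count becomes 32, which vpsrlvd turns into a zero lane, so the OR leaves
// x << 0 == x unchanged.
static __m128i RotlVarI32_AVX2(__m128i x, __m128i n) {
  n = _mm_and_si128(n, _mm_set1_epi32(0x1F));
  __m128i lo = _mm_sllv_epi32(x, n);  // vpsllvd: bits shifted up
  __m128i hi = _mm_srlv_epi32(x, _mm_sub_epi32(_mm_set1_epi32(32), n));  // vpsrlvd: wrapped bits
  return _mm_or_si128(lo, hi);        // vpor: merge
}

Without AVX2 there is no per-lane variable shift, so the new else branch stashes both operands and calls EmulateVectorRotateLeftI32, the same stash-and-call convention as the 32-bit shift fallback above.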