I don't like it, but fixing permute by int16.
This commit is contained in:
parent
acc1286b72
commit
b2e03fa628
|
@ -41,7 +41,7 @@ using alloy::runtime::ThreadState;
|
||||||
static const size_t MAX_CODE_SIZE = 1 * 1024 * 1024;
|
static const size_t MAX_CODE_SIZE = 1 * 1024 * 1024;
|
||||||
|
|
||||||
static const size_t STASH_OFFSET = 32;
|
static const size_t STASH_OFFSET = 32;
|
||||||
static const size_t STASH_OFFSET_HIGH = 32 + 16;
|
static const size_t STASH_OFFSET_HIGH = 32 + 32;
|
||||||
|
|
||||||
// If we are running with tracing on we have to store the EFLAGS in the stack,
|
// If we are running with tracing on we have to store the EFLAGS in the stack,
|
||||||
// otherwise our calls out to C to print will clear it before DID_CARRY/etc
|
// otherwise our calls out to C to print will clear it before DID_CARRY/etc
|
||||||
|
|
|
@ -4958,6 +4958,7 @@ EMITTER_OPCODE_TABLE(
|
||||||
// ============================================================================
|
// ============================================================================
|
||||||
EMITTER(PERMUTE_I32, MATCH(I<OPCODE_PERMUTE, V128<>, I32<>, V128<>, V128<>>)) {
|
EMITTER(PERMUTE_I32, MATCH(I<OPCODE_PERMUTE, V128<>, I32<>, V128<>, V128<>>)) {
|
||||||
static void Emit(X64Emitter& e, const EmitArgType& i) {
|
static void Emit(X64Emitter& e, const EmitArgType& i) {
|
||||||
|
assert_true(i.instr->flags == INT32_TYPE);
|
||||||
// Permute words between src2 and src3.
|
// Permute words between src2 and src3.
|
||||||
// TODO(benvanik): check src3 for zero. if 0, we can use pshufb.
|
// TODO(benvanik): check src3 for zero. if 0, we can use pshufb.
|
||||||
if (i.src1.is_constant) {
|
if (i.src1.is_constant) {
|
||||||
|
@ -5006,8 +5007,7 @@ EMITTER(PERMUTE_I32, MATCH(I<OPCODE_PERMUTE, V128<>, I32<>, V128<>, V128<>>)) {
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
EMITTER(PERMUTE_V128, MATCH(I<OPCODE_PERMUTE, V128<>, V128<>, V128<>, V128<>>)) {
|
EMITTER(PERMUTE_V128, MATCH(I<OPCODE_PERMUTE, V128<>, V128<>, V128<>, V128<>>)) {
|
||||||
static void Emit(X64Emitter& e, const EmitArgType& i) {
|
static void EmitByInt8(X64Emitter& e, const EmitArgType& i) {
|
||||||
assert_true(i.instr->flags == INT8_TYPE);
|
|
||||||
// TODO(benvanik): find out how to do this with only one temp register!
|
// TODO(benvanik): find out how to do this with only one temp register!
|
||||||
// Permute bytes between src2 and src3.
|
// Permute bytes between src2 and src3.
|
||||||
if (i.src3.value->IsConstantZero()) {
|
if (i.src3.value->IsConstantZero()) {
|
||||||
|
@ -5068,6 +5068,65 @@ EMITTER(PERMUTE_V128, MATCH(I<OPCODE_PERMUTE, V128<>, V128<>, V128<>, V128<>>))
|
||||||
e.vpblendvb(i.dest, src2_shuf, src3_shuf, i.dest);
|
e.vpblendvb(i.dest, src2_shuf, src3_shuf, i.dest);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Native helper that performs a 16-bit-lane permute in plain C++.
//
// Each of the eight 16-bit lanes of `control` is a selector: only its low four
// bits are used, giving an index 0..15 into the sixteen halfwords formed by
// `src1` (indices 0..7) followed by `src2` (indices 8..15). The index is XORed
// with 1 before the lookup, which swaps the even/odd element of each adjacent
// pair (presumably to compensate for guest/host halfword ordering - confirm).
//
// The unnamed first parameter is ignored. Returns the vector of selected
// halfwords, one per control lane.
static __m128i EmulateByInt16(void*, __m128i control, __m128i src1, __m128i src2) {
  alignas(16) uint16_t selectors[8];
  alignas(16) uint16_t first[8];
  alignas(16) uint16_t second[8];
  _mm_store_si128(reinterpret_cast<__m128i*>(selectors), control);
  _mm_store_si128(reinterpret_cast<__m128i*>(first), src1);
  _mm_store_si128(reinterpret_cast<__m128i*>(second), src2);

  // Each selector is overwritten with the halfword it picks; a lane only ever
  // reads its own selector, so rewriting in place is safe.
  for (uint16_t& lane : selectors) {
    const uint16_t index = static_cast<uint16_t>((lane & 0xF) ^ 0x1);
    if (index < 8) {
      lane = first[index];
    } else {
      lane = second[index - 8];
    }
  }

  return _mm_load_si128(reinterpret_cast<__m128i*>(selectors));
}
|
||||||
|
// Emits PERMUTE for 16-bit lanes by calling out to the EmulateByInt16 helper
// instead of generating inline vector code.
//
// Each of the three sources is written to its own stash slot (0, 1, 2) and the
// slot's address is loaded into r8, r9 and r10 respectively. Constant sources
// are first materialized in xmm0 and stashed from there. After the native call
// xmm0 is copied into the destination (presumably it carries the helper's
// return value - confirm against CallNativeSafe).
static void EmitByInt16(X64Emitter& e, const EmitArgType& i) {
  // TODO(benvanik): replace with proper version.
  // NOTE(review): this assert demands a constant control vector, yet the
  // non-constant path is still emitted below - confirm which is intended.
  assert_true(i.src1.is_constant);

  // src1 (control) -> stash slot 0, address in r8.
  if (!i.src1.is_constant) {
    e.lea(e.r8, e.StashXmm(0, i.src1));
  } else {
    e.LoadConstantXmm(e.xmm0, i.src1.constant());
    e.lea(e.r8, e.StashXmm(0, e.xmm0));
  }

  // src2 -> stash slot 1, address in r9.
  if (!i.src2.is_constant) {
    e.lea(e.r9, e.StashXmm(1, i.src2));
  } else {
    e.LoadConstantXmm(e.xmm0, i.src2.constant());
    e.lea(e.r9, e.StashXmm(1, e.xmm0));
  }

  // src3 -> stash slot 2, address in r10.
  if (!i.src3.is_constant) {
    e.lea(e.r10, e.StashXmm(2, i.src3));
  } else {
    e.LoadConstantXmm(e.xmm0, i.src3.constant());
    e.lea(e.r10, e.StashXmm(2, e.xmm0));
  }

  e.CallNativeSafe(reinterpret_cast<void*>(EmulateByInt16));
  e.vmovaps(i.dest, e.xmm0);
}
|
||||||
|
|
||||||
|
// Placeholder for PERMUTE with 32-bit lanes and a V128 control vector.
// Nothing is emitted: the body only trips assert_always(), so reaching this
// path signals an unimplemented case.
static void EmitByInt32(X64Emitter& e, const EmitArgType& i) {
|
||||||
|
assert_always();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Entry point for PERMUTE with a V128 control vector. Dispatches to the
// lane-width-specific emitter selected by the type constant held in
// i.instr->flags; any other value is reported through assert_unhandled_case
// and nothing is emitted.
static void Emit(X64Emitter& e, const EmitArgType& i) {
  const auto part_type = i.instr->flags;
  if (part_type == INT8_TYPE) {
    EmitByInt8(e, i);
  } else if (part_type == INT16_TYPE) {
    EmitByInt16(e, i);
  } else if (part_type == INT32_TYPE) {
    EmitByInt32(e, i);
  } else {
    assert_unhandled_case(i.instr->flags);
  }
}
|
||||||
};
|
};
|
||||||
EMITTER_OPCODE_TABLE(
|
EMITTER_OPCODE_TABLE(
|
||||||
OPCODE_PERMUTE,
|
OPCODE_PERMUTE,
|
||||||
|
|
|
@ -118,6 +118,7 @@ GuestToHostThunk X64ThunkEmitter::EmitGuestToHostThunk() {
|
||||||
mov(rax, rdx);
|
mov(rax, rdx);
|
||||||
mov(rdx, r8);
|
mov(rdx, r8);
|
||||||
mov(r8, r9);
|
mov(r8, r9);
|
||||||
|
mov(r9, r10);
|
||||||
call(rax);
|
call(rax);
|
||||||
|
|
||||||
mov(rbx, qword[rsp + 48]);
|
mov(rbx, qword[rsp + 48]);
|
||||||
|
|
|
@ -96,14 +96,14 @@ namespace x64 {
|
||||||
* | |
|
* | |
|
||||||
* | |
|
* | |
|
||||||
* +------------------+
|
* +------------------+
|
||||||
* | scratch, 32b | rsp + 32
|
* | scratch, 48b | rsp + 32
|
||||||
* | |
|
* | |
|
||||||
* +------------------+
|
* +------------------+
|
||||||
* | rcx / context | rsp + 64
|
* | rcx / context | rsp + 80
|
||||||
* +------------------+
|
* +------------------+
|
||||||
* | guest ret addr | rsp + 72
|
* | guest ret addr | rsp + 88
|
||||||
* +------------------+
|
* +------------------+
|
||||||
* | call ret addr | rsp + 80
|
* | call ret addr | rsp + 96
|
||||||
* +------------------+
|
* +------------------+
|
||||||
* ... locals ...
|
* ... locals ...
|
||||||
* +------------------+
|
* +------------------+
|
||||||
|
@ -116,10 +116,10 @@ class StackLayout {
|
||||||
public:
|
public:
|
||||||
const static size_t THUNK_STACK_SIZE = 120;
|
const static size_t THUNK_STACK_SIZE = 120;
|
||||||
|
|
||||||
const static size_t GUEST_STACK_SIZE = 88;
|
const static size_t GUEST_STACK_SIZE = 104;
|
||||||
const static size_t GUEST_RCX_HOME = 64;
|
const static size_t GUEST_RCX_HOME = 80;
|
||||||
const static size_t GUEST_RET_ADDR = 72;
|
const static size_t GUEST_RET_ADDR = 88;
|
||||||
const static size_t GUEST_CALL_RET_ADDR = 80;
|
const static size_t GUEST_CALL_RET_ADDR = 96;
|
||||||
};
|
};
|
||||||
|
|
||||||
class X64ThunkEmitter : public X64Emitter {
|
class X64ThunkEmitter : public X64Emitter {
|
||||||
|
|
|
@ -1852,6 +1852,7 @@ Value* HIRBuilder::Splat(Value* value, TypeName target_type) {
|
||||||
Value* HIRBuilder::Permute(Value* control, Value* value1, Value* value2,
|
Value* HIRBuilder::Permute(Value* control, Value* value1, Value* value2,
|
||||||
TypeName part_type) {
|
TypeName part_type) {
|
||||||
ASSERT_TYPES_EQUAL(value1, value2);
|
ASSERT_TYPES_EQUAL(value1, value2);
|
||||||
|
assert_true(part_type >= INT8_TYPE && part_type <= INT32_TYPE);
|
||||||
|
|
||||||
// TODO(benvanik): could do some of this as constants.
|
// TODO(benvanik): could do some of this as constants.
|
||||||
|
|
||||||
|
|
|
@ -81,7 +81,7 @@ TEST_CASE("PERMUTE_V128_BY_INT32_CONSTANT", "[instr]") {
|
||||||
TEST_CASE("PERMUTE_V128_BY_V128", "[instr]") {
|
TEST_CASE("PERMUTE_V128_BY_V128", "[instr]") {
|
||||||
TestFunction test([](hir::HIRBuilder& b) {
|
TestFunction test([](hir::HIRBuilder& b) {
|
||||||
StoreVR(b, 3,
|
StoreVR(b, 3,
|
||||||
b.Permute(LoadVR(b, 3), LoadVR(b, 4), LoadVR(b, 5), VEC128_TYPE));
|
b.Permute(LoadVR(b, 3), LoadVR(b, 4), LoadVR(b, 5), INT8_TYPE));
|
||||||
b.Return();
|
b.Return();
|
||||||
});
|
});
|
||||||
test.Run([](PPCContext* ctx) {
|
test.Run([](PPCContext* ctx) {
|
||||||
|
|
Loading…
Reference in New Issue