I don't like it, but fixing permute by int16.
This commit is contained in:
parent
acc1286b72
commit
b2e03fa628
|
@ -41,7 +41,7 @@ using alloy::runtime::ThreadState;
|
|||
static const size_t MAX_CODE_SIZE = 1 * 1024 * 1024;
|
||||
|
||||
static const size_t STASH_OFFSET = 32;
|
||||
static const size_t STASH_OFFSET_HIGH = 32 + 16;
|
||||
static const size_t STASH_OFFSET_HIGH = 32 + 32;
|
||||
|
||||
// If we are running with tracing on we have to store the EFLAGS in the stack,
|
||||
// otherwise our calls out to C to print will clear it before DID_CARRY/etc
|
||||
|
|
|
@ -4958,6 +4958,7 @@ EMITTER_OPCODE_TABLE(
|
|||
// ============================================================================
|
||||
EMITTER(PERMUTE_I32, MATCH(I<OPCODE_PERMUTE, V128<>, I32<>, V128<>, V128<>>)) {
|
||||
static void Emit(X64Emitter& e, const EmitArgType& i) {
|
||||
assert_true(i.instr->flags == INT32_TYPE);
|
||||
// Permute words between src2 and src3.
|
||||
// TODO(benvanik): check src3 for zero. if 0, we can use pshufb.
|
||||
if (i.src1.is_constant) {
|
||||
|
@ -5006,8 +5007,7 @@ EMITTER(PERMUTE_I32, MATCH(I<OPCODE_PERMUTE, V128<>, I32<>, V128<>, V128<>>)) {
|
|||
}
|
||||
};
|
||||
EMITTER(PERMUTE_V128, MATCH(I<OPCODE_PERMUTE, V128<>, V128<>, V128<>, V128<>>)) {
|
||||
static void Emit(X64Emitter& e, const EmitArgType& i) {
|
||||
assert_true(i.instr->flags == INT8_TYPE);
|
||||
static void EmitByInt8(X64Emitter& e, const EmitArgType& i) {
|
||||
// TODO(benvanik): find out how to do this with only one temp register!
|
||||
// Permute bytes between src2 and src3.
|
||||
if (i.src3.value->IsConstantZero()) {
|
||||
|
@ -5068,6 +5068,65 @@ EMITTER(PERMUTE_V128, MATCH(I<OPCODE_PERMUTE, V128<>, V128<>, V128<>, V128<>>))
|
|||
e.vpblendvb(i.dest, src2_shuf, src3_shuf, i.dest);
|
||||
}
|
||||
}
|
||||
|
||||
// Scalar fallback for a permute over eight 16-bit lanes.
//
// For each output lane, the low four bits of the matching control lane pick
// one of sixteen source halfwords: selectors 0-7 read from src1, selectors
// 8-15 read from src2. The selector is XORed with 1 before use, so each
// adjacent pair of source lanes is addressed in swapped order (presumably to
// account for guest/host element ordering -- TODO confirm).
//
// The leading void* argument is unused.
static __m128i EmulateByInt16(void*, __m128i control, __m128i src1, __m128i src2) {
  alignas(16) uint16_t selectors[8];
  alignas(16) uint16_t low_lanes[8];
  alignas(16) uint16_t high_lanes[8];
  alignas(16) uint16_t permuted[8];

  // Spill the vectors to memory so lanes can be indexed at runtime.
  _mm_store_si128(reinterpret_cast<__m128i*>(selectors), control);
  _mm_store_si128(reinterpret_cast<__m128i*>(low_lanes), src1);
  _mm_store_si128(reinterpret_cast<__m128i*>(high_lanes), src2);

  for (size_t lane = 0; lane != 8; ++lane) {
    // Only bits 0-3 of the control lane matter; bit 0 is flipped.
    const uint16_t pick = (selectors[lane] & 0xF) ^ 0x1;
    if (pick < 8) {
      permuted[lane] = low_lanes[pick];
    } else {
      permuted[lane] = high_lanes[pick - 8];
    }
  }

  return _mm_load_si128(reinterpret_cast<const __m128i*>(permuted));
}
|
||||
// Emits a permute with 16-bit parts by calling out to EmulateByInt16 instead
// of generating inline vector code.
//
// Each of the three operands is written to its own stash slot (0, 1, 2) and
// the slot address is placed in r8, r9 and r10 respectively. Constant
// operands are first materialized in xmm0. After the native call the result
// is copied from xmm0 into the destination register.
static void EmitByInt16(X64Emitter& e, const EmitArgType& i) {
  // TODO(benvanik): replace with proper version.
  // NOTE(review): this assert demands a constant src1, yet the non-constant
  // path is still handled just below -- confirm which one is intended.
  assert_true(i.src1.is_constant);

  // src1 (control) -> stash slot 0, address in r8.
  if (!i.src1.is_constant) {
    e.lea(e.r8, e.StashXmm(0, i.src1));
  } else {
    e.LoadConstantXmm(e.xmm0, i.src1.constant());
    e.lea(e.r8, e.StashXmm(0, e.xmm0));
  }

  // src2 -> stash slot 1, address in r9.
  if (!i.src2.is_constant) {
    e.lea(e.r9, e.StashXmm(1, i.src2));
  } else {
    e.LoadConstantXmm(e.xmm0, i.src2.constant());
    e.lea(e.r9, e.StashXmm(1, e.xmm0));
  }

  // src3 -> stash slot 2, address in r10.
  if (!i.src3.is_constant) {
    e.lea(e.r10, e.StashXmm(2, i.src3));
  } else {
    e.LoadConstantXmm(e.xmm0, i.src3.constant());
    e.lea(e.r10, e.StashXmm(2, e.xmm0));
  }

  // Hand off to the scalar helper; its result is picked up from xmm0.
  e.CallNativeSafe(reinterpret_cast<void*>(EmulateByInt16));
  e.vmovaps(i.dest, e.xmm0);
}
|
||||
|
||||
// Placeholder for the 32-bit-part permute: no code is emitted, the function
// only invokes assert_always().
static void EmitByInt32(X64Emitter& e, const EmitArgType& i) {
  assert_always();
}
|
||||
|
||||
// Entry point for the V128-controlled permute: picks the emitter that matches
// the part type stored in the instruction's flags (INT8, INT16 or INT32).
// Any other value is reported through assert_unhandled_case.
static void Emit(X64Emitter& e, const EmitArgType& i) {
  if (i.instr->flags == INT8_TYPE) {
    EmitByInt8(e, i);
    return;
  }
  if (i.instr->flags == INT16_TYPE) {
    EmitByInt16(e, i);
    return;
  }
  if (i.instr->flags == INT32_TYPE) {
    EmitByInt32(e, i);
    return;
  }
  // Unknown part type.
  assert_unhandled_case(i.instr->flags);
}
|
||||
};
|
||||
EMITTER_OPCODE_TABLE(
|
||||
OPCODE_PERMUTE,
|
||||
|
|
|
@ -118,6 +118,7 @@ GuestToHostThunk X64ThunkEmitter::EmitGuestToHostThunk() {
|
|||
mov(rax, rdx);
|
||||
mov(rdx, r8);
|
||||
mov(r8, r9);
|
||||
mov(r9, r10);
|
||||
call(rax);
|
||||
|
||||
mov(rbx, qword[rsp + 48]);
|
||||
|
|
|
@ -96,14 +96,14 @@ namespace x64 {
|
|||
* | |
|
||||
* | |
|
||||
* +------------------+
|
||||
* | scratch, 32b | rsp + 32
|
||||
* | scratch, 48b | rsp + 32
|
||||
* | |
|
||||
* +------------------+
|
||||
* | rcx / context | rsp + 64
|
||||
* | rcx / context | rsp + 80
|
||||
* +------------------+
|
||||
* | guest ret addr | rsp + 72
|
||||
* | guest ret addr | rsp + 88
|
||||
* +------------------+
|
||||
* | call ret addr | rsp + 80
|
||||
* | call ret addr | rsp + 96
|
||||
* +------------------+
|
||||
* ... locals ...
|
||||
* +------------------+
|
||||
|
@ -116,10 +116,10 @@ class StackLayout {
|
|||
public:
|
||||
const static size_t THUNK_STACK_SIZE = 120;
|
||||
|
||||
const static size_t GUEST_STACK_SIZE = 88;
|
||||
const static size_t GUEST_RCX_HOME = 64;
|
||||
const static size_t GUEST_RET_ADDR = 72;
|
||||
const static size_t GUEST_CALL_RET_ADDR = 80;
|
||||
const static size_t GUEST_STACK_SIZE = 104;
|
||||
const static size_t GUEST_RCX_HOME = 80;
|
||||
const static size_t GUEST_RET_ADDR = 88;
|
||||
const static size_t GUEST_CALL_RET_ADDR = 96;
|
||||
};
|
||||
|
||||
class X64ThunkEmitter : public X64Emitter {
|
||||
|
|
|
@ -1852,6 +1852,7 @@ Value* HIRBuilder::Splat(Value* value, TypeName target_type) {
|
|||
Value* HIRBuilder::Permute(Value* control, Value* value1, Value* value2,
|
||||
TypeName part_type) {
|
||||
ASSERT_TYPES_EQUAL(value1, value2);
|
||||
assert_true(part_type >= INT8_TYPE && part_type <= INT32_TYPE);
|
||||
|
||||
// TODO(benvanik): could do some of this as constants.
|
||||
|
||||
|
|
|
@ -81,7 +81,7 @@ TEST_CASE("PERMUTE_V128_BY_INT32_CONSTANT", "[instr]") {
|
|||
TEST_CASE("PERMUTE_V128_BY_V128", "[instr]") {
|
||||
TestFunction test([](hir::HIRBuilder& b) {
|
||||
StoreVR(b, 3,
|
||||
b.Permute(LoadVR(b, 3), LoadVR(b, 4), LoadVR(b, 5), VEC128_TYPE));
|
||||
b.Permute(LoadVR(b, 3), LoadVR(b, 4), LoadVR(b, 5), INT8_TYPE));
|
||||
b.Return();
|
||||
});
|
||||
test.Run([](PPCContext* ctx) {
|
||||
|
|
Loading…
Reference in New Issue