diff --git a/src/alloy/backend/x64/x64_emitter.cc b/src/alloy/backend/x64/x64_emitter.cc
index 00723703c..79d904ea9 100644
--- a/src/alloy/backend/x64/x64_emitter.cc
+++ b/src/alloy/backend/x64/x64_emitter.cc
@@ -41,7 +41,7 @@ using alloy::runtime::ThreadState;
 static const size_t MAX_CODE_SIZE = 1 * 1024 * 1024;
 
 static const size_t STASH_OFFSET = 32;
-static const size_t STASH_OFFSET_HIGH = 32 + 16;
+static const size_t STASH_OFFSET_HIGH = 32 + 32;  // NOTE(review): raised by 16 from 32 + 16; presumably tracks the scratch area growing 32b -> 48b in x64_thunk_emitter.h -- confirm.
 
 // If we are running with tracing on we have to store the EFLAGS in the stack,
 // otherwise our calls out to C to print will clear it before DID_CARRY/etc
diff --git a/src/alloy/backend/x64/x64_sequences.cc b/src/alloy/backend/x64/x64_sequences.cc
index e2e10d4f6..fe422d8f0 100644
--- a/src/alloy/backend/x64/x64_sequences.cc
+++ b/src/alloy/backend/x64/x64_sequences.cc
@@ -4958,6 +4958,7 @@ EMITTER_OPCODE_TABLE(
 // ============================================================================
 EMITTER(PERMUTE_I32, MATCH(I<OPCODE_PERMUTE, V128<>, I32<>, V128<>, V128<>>)) {
   static void Emit(X64Emitter& e, const EmitArgType& i) {
+    assert_true(i.instr->flags == INT32_TYPE);  // The I32-control form only handles INT32_TYPE-flagged permutes.
     // Permute words between src2 and src3.
     // TODO(benvanik): check src3 for zero. if 0, we can use pshufb.
     if (i.src1.is_constant) {
@@ -5006,8 +5007,7 @@ EMITTER(PERMUTE_I32, MATCH(I<OPCODE_PERMUTE, V128<>, I32<>, V128<>, V128<>>)) {
   }
 };
 EMITTER(PERMUTE_V128, MATCH(I<OPCODE_PERMUTE, V128<>, V128<>, V128<>, V128<>>)) {
-  static void Emit(X64Emitter& e, const EmitArgType& i) {
-    assert_true(i.instr->flags == INT8_TYPE);
+  static void EmitByInt8(X64Emitter& e, const EmitArgType& i) {  // Byte-lane path, renamed from Emit; the flags check now lives in the Emit dispatcher of this emitter.
     // TODO(benvanik): find out how to do this with only one temp register!
     // Permute bytes between src2 and src3.
if (i.src3.value->IsConstantZero()) {
@@ -5068,6 +5068,82 @@ EMITTER(PERMUTE_V128, MATCH(I<OPCODE_PERMUTE, V128<>, V128<>, V128<>, V128<>>))
       e.vpblendvb(i.dest, src2_shuf, src3_shuf, i.dest);
     }
   }
+
+  // Scalar fallback for permuting 16-bit lanes; EmitByInt16 passes this
+  // function to CallNativeSafe. For each of the 8 lanes, the low 4 bits of
+  // the control lane, with bit 0 flipped, form an index: 0-7 pick that lane
+  // of src1, 8-15 pick lane (index - 8) of src2. The void* argument is
+  // unused.
+  static __m128i EmulateByInt16(void*, __m128i control, __m128i src1, __m128i src2) {
+    alignas(16) uint16_t c[8];
+    alignas(16) uint16_t a[8];
+    alignas(16) uint16_t b[8];
+    _mm_store_si128(reinterpret_cast<__m128i*>(c), control);
+    _mm_store_si128(reinterpret_cast<__m128i*>(a), src1);
+    _mm_store_si128(reinterpret_cast<__m128i*>(b), src2);
+    for (size_t i = 0; i < 8; ++i) {
+      // c doubles as the output: c[i] is read once, then overwritten.
+      uint16_t si = (c[i] & 0xF) ^ 0x1;
+      c[i] = si >= 8 ? b[si - 8] : a[si];
+    }
+    return _mm_load_si128(reinterpret_cast<__m128i*>(c));
+  }
+  // Emits an INT16_TYPE permute as a native call to EmulateByInt16. Each of
+  // src1/src2/src3 is stashed with StashXmm (slots 0/1/2; constants go
+  // through xmm0 first) and the resulting address is loaded into r8/r9/r10.
+  // After the call, xmm0 is copied into i.dest.
+  static void EmitByInt16(X64Emitter& e, const EmitArgType& i) {
+    // TODO(benvanik): replace with proper version.
+    // NOTE(review): with this assert in place, the non-constant src1 branch
+    // below is presumably only reachable if assert_true is compiled out.
+    assert_true(i.src1.is_constant);
+    if (i.src1.is_constant) {
+      e.LoadConstantXmm(e.xmm0, i.src1.constant());
+      e.lea(e.r8, e.StashXmm(0, e.xmm0));
+    } else {
+      e.lea(e.r8, e.StashXmm(0, i.src1));
+    }
+    if (i.src2.is_constant) {
+      e.LoadConstantXmm(e.xmm0, i.src2.constant());
+      e.lea(e.r9, e.StashXmm(1, e.xmm0));
+    } else {
+      e.lea(e.r9, e.StashXmm(1, i.src2));
+    }
+    if (i.src3.is_constant) {
+      e.LoadConstantXmm(e.xmm0, i.src3.constant());
+      e.lea(e.r10, e.StashXmm(2, e.xmm0));
+    } else {
+      e.lea(e.r10, e.StashXmm(2, i.src3));
+    }
+    // NOTE(review): void* is assumed to be the parameter type of
+    // CallNativeSafe -- confirm against X64Emitter.
+    e.CallNativeSafe(reinterpret_cast<void*>(EmulateByInt16));
+    e.vmovaps(i.dest, e.xmm0);
+  }
+
+  // Placeholder: INT32_TYPE permutes with a V128 control always assert.
+  static void EmitByInt32(X64Emitter& e, const EmitArgType& i) {
+    assert_always();
+  }
+
+  // Entry point: selects the implementation from i.instr->flags
+  // (INT8_TYPE, INT16_TYPE or INT32_TYPE); any other value asserts.
+  static void Emit(X64Emitter& e, const EmitArgType& i) {
+    switch (i.instr->flags) {
+      case INT8_TYPE:
+        EmitByInt8(e, i);
+        break;
+      case INT16_TYPE:
+        EmitByInt16(e, i);
+        break;
+      case INT32_TYPE:
+        EmitByInt32(e, i);
+        break;
+      default:
+        assert_unhandled_case(i.instr->flags);
+        return;
+    }
+  }
 };
 EMITTER_OPCODE_TABLE(
     OPCODE_PERMUTE,
diff --git a/src/alloy/backend/x64/x64_thunk_emitter.cc b/src/alloy/backend/x64/x64_thunk_emitter.cc
index 049377417..999766662
100644
--- a/src/alloy/backend/x64/x64_thunk_emitter.cc
+++ b/src/alloy/backend/x64/x64_thunk_emitter.cc
@@ -118,6 +118,7 @@ GuestToHostThunk X64ThunkEmitter::EmitGuestToHostThunk() {
   mov(rax, rdx);
   mov(rdx, r8);
   mov(r8, r9);
+  mov(r9, r10);  // Continues the register shift above so the value in r10 reaches the callee in r9; presumably the third stash pointer set up by PERMUTE's EmitByInt16 -- confirm.
   call(rax);
 
   mov(rbx, qword[rsp + 48]);
diff --git a/src/alloy/backend/x64/x64_thunk_emitter.h b/src/alloy/backend/x64/x64_thunk_emitter.h
index a07b6f0c8..b8b5c612d 100644
--- a/src/alloy/backend/x64/x64_thunk_emitter.h
+++ b/src/alloy/backend/x64/x64_thunk_emitter.h
@@ -96,14 +96,14 @@ namespace x64 {
  * |                  |
  * |                  |
  * +------------------+
- * | scratch, 32b     | rsp + 32
+ * | scratch, 48b     | rsp + 32
  * |                  |
  * +------------------+
- * | rcx / context    | rsp + 64
+ * | rcx / context    | rsp + 80
  * +------------------+
- * | guest ret addr   | rsp + 72
+ * | guest ret addr   | rsp + 88
  * +------------------+
- * | call ret addr    | rsp + 80
+ * | call ret addr    | rsp + 96
  * +------------------+
  * ... locals ...
  * +------------------+
@@ -116,10 +116,10 @@ class StackLayout {
  public:
   const static size_t THUNK_STACK_SIZE = 120;
 
-  const static size_t GUEST_STACK_SIZE = 88;
-  const static size_t GUEST_RCX_HOME = 64;
-  const static size_t GUEST_RET_ADDR = 72;
-  const static size_t GUEST_CALL_RET_ADDR = 80;
+  const static size_t GUEST_STACK_SIZE = 104;  // 16 more than the old 88, matching the scratch area growing from 32b to 48b in the layout comment.
+  const static size_t GUEST_RCX_HOME = 80;  // "rcx / context" slot: rsp + 80 in the layout comment.
+  const static size_t GUEST_RET_ADDR = 88;  // "guest ret addr" slot: rsp + 88 in the layout comment.
+  const static size_t GUEST_CALL_RET_ADDR = 96;  // "call ret addr" slot: rsp + 96 in the layout comment.
 };
 
 class X64ThunkEmitter : public X64Emitter {
diff --git a/src/alloy/hir/hir_builder.cc b/src/alloy/hir/hir_builder.cc
index 62dbe16cf..c981d6915 100644
--- a/src/alloy/hir/hir_builder.cc
+++ b/src/alloy/hir/hir_builder.cc
@@ -1852,6 +1852,7 @@ Value* HIRBuilder::Splat(Value* value, TypeName target_type) {
 Value* HIRBuilder::Permute(Value* control, Value* value1, Value* value2,
                            TypeName part_type) {
   ASSERT_TYPES_EQUAL(value1, value2);
+  assert_true(part_type >= INT8_TYPE && part_type <= INT32_TYPE);  // part_type must lie within [INT8_TYPE, INT32_TYPE].
 
   // TODO(benvanik): could do some of this as constants.
diff --git a/src/alloy/test/test_permute.cc b/src/alloy/test/test_permute.cc
index cacb8635b..8b2f7b6a8 100644
--- a/src/alloy/test/test_permute.cc
+++ b/src/alloy/test/test_permute.cc
@@ -81,7 +81,7 @@ TEST_CASE("PERMUTE_V128_BY_INT32_CONSTANT", "[instr]") {
 TEST_CASE("PERMUTE_V128_BY_V128", "[instr]") {
   TestFunction test([](hir::HIRBuilder& b) {
     StoreVR(b, 3,
-            b.Permute(LoadVR(b, 3), LoadVR(b, 4), LoadVR(b, 5), VEC128_TYPE));
+            b.Permute(LoadVR(b, 3), LoadVR(b, 4), LoadVR(b, 5), INT8_TYPE));  // Byte-lane permute; HIRBuilder::Permute now asserts part_type is within [INT8_TYPE, INT32_TYPE], which VEC128_TYPE presumably is not -- confirm.
     b.Return();
   });
   test.Run([](PPCContext* ctx) {