From 8cddfcbf19fcb5830a32370f3cc30e6e283cd222 Mon Sep 17 00:00:00 2001
From: Ben Vanik
Date: Tue, 28 Jan 2014 22:06:45 -0800
Subject: [PATCH] More SSE work.

---
 src/alloy/backend/ivm/ivm_intcode.cc               |  44 +++----
 .../x64/lowering/lowering_sequences.cc             | 118 ++++++++++--------
 src/alloy/backend/x64/lowering/tracers.cc          |  84 +++++--------
 src/alloy/backend/x64/lowering/tracers.h           |  16 +--
 src/alloy/backend/x64/x64_emitter.cc               |   4 +-
 5 files changed, 132 insertions(+), 134 deletions(-)
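Note on the VECTOR_COMPARER change below: the macro gains a dest_value
parameter so the F32 comparers can write their all-ones result masks through
the integer view (i4) of the destination lane. Writing 0xFFFFFFFF through the
float view would convert the value numerically instead of setting the raw lane
bits. A minimal standalone illustration (plain C++, not part of the patch; the
vec_lane union here is hypothetical, using the same union-punning idiom the
tracer code already relies on):

    #include <cstdint>
    #include <cstdio>
    union vec_lane { float f4; uint32_t i4; };  // one 32-bit lane, two views
    int main() {
      vec_lane a, b;
      a.f4 = 0xFFFFFFFF;  // numeric conversion: 4294967296.0f, bits 0x4F800000
      b.i4 = 0xFFFFFFFF;  // raw store: the all-ones mask a vector compare needs
      printf("%08X vs %08X\n", a.i4, b.i4);  // prints 4F800000 vs FFFFFFFF
      return 0;
    }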
diff --git a/src/alloy/backend/ivm/ivm_intcode.cc b/src/alloy/backend/ivm/ivm_intcode.cc
index 756db9203..f7e0cdbde 100644
--- a/src/alloy/backend/ivm/ivm_intcode.cc
+++ b/src/alloy/backend/ivm/ivm_intcode.cc
@@ -2093,19 +2093,19 @@ int Translate_DID_SATURATE(TranslationContext& ctx, Instr* i) {
   return DispatchToC(ctx, i, IntCode_DID_SATURATE);
 }
 
-#define VECTOR_COMPARER(type, value, count, op) \
+#define VECTOR_COMPARER(type, value, dest_value, count, op) \
   const vec128_t& src1 = ics.rf[i->src1_reg].v128; \
   const vec128_t& src2 = ics.rf[i->src2_reg].v128; \
   vec128_t& dest = ics.rf[i->dest_reg].v128; \
   for (int n = 0; n < count; n++) { \
-    dest.value[n] = ((type)src1.value[n] op (type)src2.value[n]) ? (type)0xFFFFFFFF : 0; \
+    dest.dest_value[n] = ((type)src1.value[n] op (type)src2.value[n]) ? 0xFFFFFFFF : 0; \
   } \
   return IA_NEXT;
 
-uint32_t IntCode_VECTOR_COMPARE_EQ_I8(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(uint8_t, b16, 16, ==) };
-uint32_t IntCode_VECTOR_COMPARE_EQ_I16(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(uint16_t, s8, 8, ==) };
-uint32_t IntCode_VECTOR_COMPARE_EQ_I32(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(uint32_t, i4, 4, ==) };
-uint32_t IntCode_VECTOR_COMPARE_EQ_F32(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(float, f4, 4, ==) };
+uint32_t IntCode_VECTOR_COMPARE_EQ_I8(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(uint8_t, b16, b16, 16, ==) };
+uint32_t IntCode_VECTOR_COMPARE_EQ_I16(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(uint16_t, s8, s8, 8, ==) };
+uint32_t IntCode_VECTOR_COMPARE_EQ_I32(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(uint32_t, i4, i4, 4, ==) };
+uint32_t IntCode_VECTOR_COMPARE_EQ_F32(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(float, f4, i4, 4, ==) };
 int Translate_VECTOR_COMPARE_EQ(TranslationContext& ctx, Instr* i) {
   static IntCodeFn fns[] = {
     IntCode_VECTOR_COMPARE_EQ_I8,
@@ -2119,10 +2119,10 @@ int Translate_VECTOR_COMPARE_EQ(TranslationContext& ctx, Instr* i) {
   return DispatchToC(ctx, i, fns[i->flags]);
 }
 
-uint32_t IntCode_VECTOR_COMPARE_SGT_I8(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(int8_t, b16, 16, >) };
-uint32_t IntCode_VECTOR_COMPARE_SGT_I16(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(int16_t, s8, 8, >) };
-uint32_t IntCode_VECTOR_COMPARE_SGT_I32(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(int32_t, i4, 4, >) };
-uint32_t IntCode_VECTOR_COMPARE_SGT_F32(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(float, f4, 4, >) };
+uint32_t IntCode_VECTOR_COMPARE_SGT_I8(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(int8_t, b16, b16, 16, >) };
+uint32_t IntCode_VECTOR_COMPARE_SGT_I16(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(int16_t, s8, s8, 8, >) };
+uint32_t IntCode_VECTOR_COMPARE_SGT_I32(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(int32_t, i4, i4, 4, >) };
+uint32_t IntCode_VECTOR_COMPARE_SGT_F32(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(float, f4, i4, 4, >) };
 int Translate_VECTOR_COMPARE_SGT(TranslationContext& ctx, Instr* i) {
   static IntCodeFn fns[] = {
     IntCode_VECTOR_COMPARE_SGT_I8,
@@ -2136,10 +2136,10 @@ int Translate_VECTOR_COMPARE_SGT(TranslationContext& ctx, Instr* i) {
   return DispatchToC(ctx, i, fns[i->flags]);
 }
 
-uint32_t IntCode_VECTOR_COMPARE_SGE_I8(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(int8_t, b16, 16, >=) };
-uint32_t IntCode_VECTOR_COMPARE_SGE_I16(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(int16_t, s8, 8, >=) };
-uint32_t IntCode_VECTOR_COMPARE_SGE_I32(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(int32_t, i4, 4, >=) };
-uint32_t IntCode_VECTOR_COMPARE_SGE_F32(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(float, f4, 4, >=) };
+uint32_t IntCode_VECTOR_COMPARE_SGE_I8(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(int8_t, b16, b16, 16, >=) };
+uint32_t IntCode_VECTOR_COMPARE_SGE_I16(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(int16_t, s8, s8, 8, >=) };
+uint32_t IntCode_VECTOR_COMPARE_SGE_I32(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(int32_t, i4, i4, 4, >=) };
+uint32_t IntCode_VECTOR_COMPARE_SGE_F32(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(float, f4, i4, 4, >=) };
 int Translate_VECTOR_COMPARE_SGE(TranslationContext& ctx, Instr* i) {
   static IntCodeFn fns[] = {
     IntCode_VECTOR_COMPARE_SGE_I8,
@@ -2153,10 +2153,10 @@ int Translate_VECTOR_COMPARE_SGE(TranslationContext& ctx, Instr* i) {
   return DispatchToC(ctx, i, fns[i->flags]);
 }
 
-uint32_t IntCode_VECTOR_COMPARE_UGT_I8(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(uint8_t, b16, 16, >) };
-uint32_t IntCode_VECTOR_COMPARE_UGT_I16(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(uint16_t, s8, 8, >) };
-uint32_t IntCode_VECTOR_COMPARE_UGT_I32(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(uint32_t, i4, 4, >) };
-uint32_t IntCode_VECTOR_COMPARE_UGT_F32(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(float, f4, 4, >) };
+uint32_t IntCode_VECTOR_COMPARE_UGT_I8(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(uint8_t, b16, b16, 16, >) };
+uint32_t IntCode_VECTOR_COMPARE_UGT_I16(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(uint16_t, s8, s8, 8, >) };
+uint32_t IntCode_VECTOR_COMPARE_UGT_I32(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(uint32_t, i4, i4, 4, >) };
+uint32_t IntCode_VECTOR_COMPARE_UGT_F32(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(float, f4, i4, 4, >) };
 int Translate_VECTOR_COMPARE_UGT(TranslationContext& ctx, Instr* i) {
   static IntCodeFn fns[] = {
     IntCode_VECTOR_COMPARE_UGT_I8,
@@ -2170,10 +2170,10 @@ int Translate_VECTOR_COMPARE_UGT(TranslationContext& ctx, Instr* i) {
   return DispatchToC(ctx, i, fns[i->flags]);
 }
 
-uint32_t IntCode_VECTOR_COMPARE_UGE_I8(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(uint8_t, b16, 16, >=) };
-uint32_t IntCode_VECTOR_COMPARE_UGE_I16(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(uint16_t, s8, 8, >=) };
-uint32_t IntCode_VECTOR_COMPARE_UGE_I32(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(uint32_t, i4, 4, >=) };
-uint32_t IntCode_VECTOR_COMPARE_UGE_F32(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(float, f4, 4, >=) };
+uint32_t IntCode_VECTOR_COMPARE_UGE_I8(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(uint8_t, b16, b16, 16, >=) };
+uint32_t IntCode_VECTOR_COMPARE_UGE_I16(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(uint16_t, s8, s8, 8, >=) };
+uint32_t IntCode_VECTOR_COMPARE_UGE_I32(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(uint32_t, i4, i4, 4, >=) };
+uint32_t IntCode_VECTOR_COMPARE_UGE_F32(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(float, f4, i4, 4, >=) };
 int Translate_VECTOR_COMPARE_UGE(TranslationContext& ctx, Instr* i) {
   static IntCodeFn fns[] = {
     IntCode_VECTOR_COMPARE_UGE_I8,
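About the cvtss2si -> cvttss2si (and cvtsd2si -> cvttsd2si) swaps in the next
file: the cvtt* forms always truncate toward zero, matching C-style cast
semantics, while the non-t forms round using the current MXCSR mode
(round-to-nearest by default). The TODO comments in these sequences already
ask for "cvtt* (trunc?)", so this appears to be the intended behavior. A quick
standalone sketch of the observable difference (illustrative only):

    #include <cmath>
    #include <cstdio>
    int main() {
      float f = 2.7f;
      // (int)f truncates, like cvttss2si; std::lrintf() uses the current
      // rounding mode, like cvtss2si (round-to-nearest by default).
      printf("trunc=%d nearest=%ld\n", (int)f, std::lrintf(f));  // trunc=2 nearest=3
      return 0;
    }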
diff --git a/src/alloy/backend/x64/lowering/lowering_sequences.cc b/src/alloy/backend/x64/lowering/lowering_sequences.cc
index 461ef62d5..f22d34b7d 100644
--- a/src/alloy/backend/x64/lowering/lowering_sequences.cc
+++ b/src/alloy/backend/x64/lowering/lowering_sequences.cc
@@ -17,6 +17,9 @@
 #include
 #include
 
+// TODO(benvanik): reimplement packing functions
+#include <DirectXPackedVector.h>
+
 using namespace alloy;
 using namespace alloy::backend::x64;
 using namespace alloy::backend::x64::lowering;
@@ -87,6 +90,14 @@ void Dummy() {
   //
 }
 
+void Unpack_FLOAT16_2(void* raw_context, __m128& v) {
+  uint32_t src = v.m128_i32[3];
+  v.m128_f32[0] = DirectX::PackedVector::XMConvertHalfToFloat((uint16_t)src);
+  v.m128_f32[1] = DirectX::PackedVector::XMConvertHalfToFloat((uint16_t)(src >> 16));
+  v.m128_f32[2] = 0.0f;
+  v.m128_f32[3] = 1.0f;
+}
+
 uint64_t LoadClock(void* raw_context) {
   LARGE_INTEGER counter;
   uint64_t time = 0;
@@ -378,7 +389,7 @@ table->AddSequence(OPCODE_CAST, [](X64Emitter& e, Instr*& i) {
     Xmm src;
     e.BeginOp(i->dest, dest, REG_DEST,
               i->src1.value, src, 0);
-    e.pextrd(dest, src, 0);
+    e.vmovd(dest, src);
     e.EndOp(dest, src);
   } else {
     UNIMPLEMENTED_SEQ();
@@ -389,7 +400,7 @@ table->AddSequence(OPCODE_CAST, [](X64Emitter& e, Instr*& i) {
     Xmm src;
     e.BeginOp(i->dest, dest, REG_DEST,
              i->src1.value, src, 0);
-    e.pextrq(dest, src, 0);
+    e.vmovq(dest, src);
     e.EndOp(dest, src);
   } else {
    UNIMPLEMENTED_SEQ();
@@ -400,7 +411,7 @@ table->AddSequence(OPCODE_CAST, [](X64Emitter& e, Instr*& i) {
     Reg32 src;
     e.BeginOp(i->dest, dest, REG_DEST,
               i->src1.value, src, 0);
-    e.pinsrd(dest, src, 0);
+    e.vmovd(dest, src);
     e.EndOp(dest, src);
   } else {
     UNIMPLEMENTED_SEQ();
@@ -411,7 +422,7 @@ table->AddSequence(OPCODE_CAST, [](X64Emitter& e, Instr*& i) {
     Reg64 src;
     e.BeginOp(i->dest, dest, REG_DEST,
               i->src1.value, src, 0);
-    e.pinsrq(dest, src, 0);
+    e.vmovq(dest, src);
     e.EndOp(dest, src);
   } else {
     UNIMPLEMENTED_SEQ();
@@ -582,7 +593,7 @@ table->AddSequence(OPCODE_CONVERT, [](X64Emitter& e, Instr*& i) {
     e.BeginOp(i->dest, dest, REG_DEST,
               i->src1.value, src, 0);
     // TODO(benvanik): additional checks for saturation/etc? cvtt* (trunc?)
-    e.cvtss2si(dest, src);
+    e.cvttss2si(dest, src);
     e.EndOp(dest, src);
   } else if (i->Match(SIG_TYPE_I32, SIG_TYPE_F64)) {
     Reg32 dest;
@@ -591,7 +602,7 @@ table->AddSequence(OPCODE_CONVERT, [](X64Emitter& e, Instr*& i) {
               i->src1.value, src, 0);
     // TODO(benvanik): additional checks for saturation/etc? cvtt* (trunc?)
     e.cvtsd2ss(e.xmm0, src);
-    e.cvtss2si(dest, e.xmm0);
+    e.cvttss2si(dest, e.xmm0);
     e.EndOp(dest, src);
   } else if (i->Match(SIG_TYPE_I64, SIG_TYPE_F64)) {
     Reg64 dest;
@@ -599,7 +610,7 @@ table->AddSequence(OPCODE_CONVERT, [](X64Emitter& e, Instr*& i) {
     e.BeginOp(i->dest, dest, REG_DEST,
               i->src1.value, src, 0);
     // TODO(benvanik): additional checks for saturation/etc? cvtt* (trunc?)
-    e.cvtsd2si(dest, src);
+    e.cvttsd2si(dest, src);
     e.EndOp(dest, src);
   } else if (i->Match(SIG_TYPE_F32, SIG_TYPE_I32)) {
     Xmm dest;
@@ -764,10 +775,11 @@ table->AddSequence(OPCODE_LOAD_CLOCK, [](X64Emitter& e, Instr*& i) {
 // --------------------------------------------------------------------------
 
 table->AddSequence(OPCODE_LOAD_CONTEXT, [](X64Emitter& e, Instr*& i) {
+  auto addr = e.rcx + i->src1.offset;
   if (i->Match(SIG_TYPE_I8, SIG_TYPE_IGNORE)) {
     Reg8 dest;
     e.BeginOp(i->dest, dest, REG_DEST);
-    e.mov(dest, e.byte[e.rcx + i->src1.offset]);
+    e.mov(dest, e.byte[addr]);
     e.EndOp(dest);
 #if DTRACE
     e.mov(e.rdx, i->src1.offset);
@@ -777,7 +789,7 @@
   } else if (i->Match(SIG_TYPE_I16, SIG_TYPE_IGNORE)) {
     Reg16 dest;
     e.BeginOp(i->dest, dest, REG_DEST);
-    e.mov(dest, e.word[e.rcx + i->src1.offset]);
+    e.mov(dest, e.word[addr]);
     e.EndOp(dest);
 #if DTRACE
     e.mov(e.rdx, i->src1.offset);
@@ -787,7 +799,7 @@
   } else if (i->Match(SIG_TYPE_I32, SIG_TYPE_IGNORE)) {
     Reg32 dest;
     e.BeginOp(i->dest, dest, REG_DEST);
-    e.mov(dest, e.dword[e.rcx + i->src1.offset]);
+    e.mov(dest, e.dword[addr]);
     e.EndOp(dest);
 #if DTRACE
     e.mov(e.rdx, i->src1.offset);
@@ -797,7 +809,7 @@
   } else if (i->Match(SIG_TYPE_I64, SIG_TYPE_IGNORE)) {
     Reg64 dest;
     e.BeginOp(i->dest, dest, REG_DEST);
-    e.mov(dest, e.qword[e.rcx + i->src1.offset]);
+    e.mov(dest, e.qword[addr]);
     e.EndOp(dest);
 #if DTRACE
     e.mov(e.rdx, i->src1.offset);
@@ -807,28 +819,28 @@
   } else if (i->Match(SIG_TYPE_F32, SIG_TYPE_IGNORE)) {
     Xmm dest;
     e.BeginOp(i->dest, dest, REG_DEST);
-    e.movss(dest, e.dword[e.rcx + i->src1.offset]);
+    e.movss(dest, e.dword[addr]);
     e.EndOp(dest);
 #if DTRACE
     e.mov(e.rdx, i->src1.offset);
-    e.movaps(e.xmm0, dest);
+    e.lea(e.r8, Stash(e, dest));
     CallNative(e, TraceContextLoadF32);
 #endif  // DTRACE
   } else if (i->Match(SIG_TYPE_F64, SIG_TYPE_IGNORE)) {
     Xmm dest;
     e.BeginOp(i->dest, dest, REG_DEST);
-    e.movsd(dest, e.qword[e.rcx + i->src1.offset]);
+    e.movsd(dest, e.qword[addr]);
     e.EndOp(dest);
 #if DTRACE
     e.mov(e.rdx, i->src1.offset);
-    e.movaps(e.xmm0, dest);
+    e.lea(e.r8, Stash(e, dest));
     CallNative(e, TraceContextLoadF64);
 #endif  // DTRACE
   } else if (i->Match(SIG_TYPE_V128, SIG_TYPE_IGNORE)) {
     Xmm dest;
     e.BeginOp(i->dest, dest, REG_DEST);
     // NOTE: we always know we are aligned.
-    e.movaps(dest, e.ptr[e.rcx + i->src1.offset]);
+    e.movaps(dest, e.ptr[addr]);
     e.EndOp(dest);
 #if DTRACE
     e.mov(e.rdx, i->src1.offset);
@@ -843,10 +855,11 @@ table->AddSequence(OPCODE_LOAD_CONTEXT, [](X64Emitter& e, Instr*& i) {
 });
 
 table->AddSequence(OPCODE_STORE_CONTEXT, [](X64Emitter& e, Instr*& i) {
+  auto addr = e.rcx + i->src1.offset;
   if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_I8)) {
     Reg8 src;
     e.BeginOp(i->src2.value, src, 0);
-    e.mov(e.byte[e.rcx + i->src1.offset], src);
+    e.mov(e.byte[addr], src);
     e.EndOp(src);
 #if DTRACE
     e.mov(e.rdx, i->src1.offset);
@@ -854,7 +867,7 @@ table->AddSequence(OPCODE_STORE_CONTEXT, [](X64Emitter& e, Instr*& i) {
     CallNative(e, TraceContextStoreI8);
 #endif  // DTRACE
   } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_I8C)) {
-    e.mov(e.byte[e.rcx + i->src1.offset], i->src2.value->constant.i8);
+    e.mov(e.byte[addr], i->src2.value->constant.i8);
 #if DTRACE
     e.mov(e.rdx, i->src1.offset);
     e.mov(e.r8b, i->src2.value->constant.i8);
@@ -863,7 +876,7 @@
   } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_I16)) {
     Reg16 src;
     e.BeginOp(i->src2.value, src, 0);
-    e.mov(e.word[e.rcx + i->src1.offset], src);
+    e.mov(e.word[addr], src);
     e.EndOp(src);
 #if DTRACE
     e.mov(e.rdx, i->src1.offset);
@@ -871,7 +884,7 @@ table->AddSequence(OPCODE_STORE_CONTEXT, [](X64Emitter& e, Instr*& i) {
     CallNative(e, TraceContextStoreI16);
 #endif  // DTRACE
   } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_I16C)) {
-    e.mov(e.word[e.rcx + i->src1.offset], i->src2.value->constant.i16);
+    e.mov(e.word[addr], i->src2.value->constant.i16);
 #if DTRACE
     e.mov(e.rdx, i->src1.offset);
     e.mov(e.r8w, i->src2.value->constant.i16);
@@ -880,7 +893,7 @@
   } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_I32)) {
     Reg32 src;
     e.BeginOp(i->src2.value, src, 0);
-    e.mov(e.dword[e.rcx + i->src1.offset], src);
+    e.mov(e.dword[addr], src);
     e.EndOp(src);
 #if DTRACE
     e.mov(e.rdx, i->src1.offset);
@@ -888,7 +901,7 @@ table->AddSequence(OPCODE_STORE_CONTEXT, [](X64Emitter& e, Instr*& i) {
     CallNative(e, TraceContextStoreI32);
 #endif  // DTRACE
   } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_I32C)) {
-    e.mov(e.dword[e.rcx + i->src1.offset], i->src2.value->constant.i32);
+    e.mov(e.dword[addr], i->src2.value->constant.i32);
 #if DTRACE
     e.mov(e.rdx, i->src1.offset);
     e.mov(e.r8d, i->src2.value->constant.i32);
@@ -897,7 +910,7 @@
   } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_I64)) {
     Reg64 src;
     e.BeginOp(i->src2.value, src, 0);
-    e.mov(e.qword[e.rcx + i->src1.offset], src);
+    e.mov(e.qword[addr], src);
     e.EndOp(src);
 #if DTRACE
     e.mov(e.rdx, i->src1.offset);
@@ -905,7 +918,7 @@ table->AddSequence(OPCODE_STORE_CONTEXT, [](X64Emitter& e, Instr*& i) {
     CallNative(e, TraceContextStoreI64);
 #endif  // DTRACE
   } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_I64C)) {
-    MovMem64(e, e.rcx + i->src1.offset, i->src2.value->constant.i64);
+    MovMem64(e, addr, i->src2.value->constant.i64);
 #if DTRACE
     e.mov(e.rdx, i->src1.offset);
     e.mov(e.r8, i->src2.value->constant.i64);
@@ -914,42 +927,46 @@ table->AddSequence(OPCODE_STORE_CONTEXT, [](X64Emitter& e, Instr*& i) {
   } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_F32)) {
     Xmm src;
     e.BeginOp(i->src2.value, src, 0);
-    e.movss(e.dword[e.rcx + i->src1.offset], src);
+    e.movss(e.dword[addr], src);
     e.EndOp(src);
 #if DTRACE
     e.mov(e.rdx, i->src1.offset);
-    e.movss(e.xmm0, src);
+    e.lea(e.r8, Stash(e, src));
     CallNative(e, TraceContextStoreF32);
 #endif  // DTRACE
   } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_F32C)) {
-    e.mov(e.dword[e.rcx + i->src1.offset], i->src2.value->constant.i32);
+    e.mov(e.dword[addr], i->src2.value->constant.i32);
 #if DTRACE
     e.mov(e.rdx, i->src1.offset);
-    e.movss(e.xmm0, e.dword[e.rcx + i->src1.offset]);
+    e.mov(e.eax, i->src2.value->constant.i32);
+    e.vmovd(e.xmm0, e.eax);
+    e.lea(e.r8, Stash(e, e.xmm0));
     CallNative(e, TraceContextStoreF32);
 #endif  // DTRACE
   } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_F64)) {
     Xmm src;
     e.BeginOp(i->src2.value, src, 0);
-    e.movsd(e.qword[e.rcx + i->src1.offset], src);
+    e.movsd(e.qword[addr], src);
     e.EndOp(src);
 #if DTRACE
     e.mov(e.rdx, i->src1.offset);
-    e.movsd(e.xmm0, src);
+    e.lea(e.r8, Stash(e, src));
     CallNative(e, TraceContextStoreF64);
 #endif  // DTRACE
   } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_F64C)) {
-    MovMem64(e, e.rcx + i->src1.offset, i->src2.value->constant.i64);
+    MovMem64(e, addr, i->src2.value->constant.i64);
 #if DTRACE
     e.mov(e.rdx, i->src1.offset);
-    e.movsd(e.xmm0, e.qword[e.rcx + i->src1.offset]);
+    e.mov(e.rax, i->src2.value->constant.i64);
+    e.vmovq(e.xmm0, e.rax);
+    e.lea(e.r8, Stash(e, e.xmm0));
     CallNative(e, TraceContextStoreF64);
 #endif  // DTRACE
   } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_V128)) {
     Xmm src;
     e.BeginOp(i->src2.value, src, 0);
     // NOTE: we always know we are aligned.
-    e.movaps(e.ptr[e.rcx + i->src1.offset], src);
+    e.movaps(e.ptr[addr], src);
     e.EndOp(src);
 #if DTRACE
     e.mov(e.rdx, i->src1.offset);
@@ -959,11 +976,11 @@ table->AddSequence(OPCODE_STORE_CONTEXT, [](X64Emitter& e, Instr*& i) {
   } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_V128C)) {
     // TODO(benvanik): check zero
     // TODO(benvanik): correct order?
-    MovMem64(e, e.rcx + i->src1.offset, i->src2.value->constant.v128.low);
-    MovMem64(e, e.rcx + i->src1.offset + 8, i->src2.value->constant.v128.high);
+    MovMem64(e, addr, i->src2.value->constant.v128.low);
+    MovMem64(e, addr + 8, i->src2.value->constant.v128.high);
 #if DTRACE
     e.mov(e.rdx, i->src1.offset);
-    e.lea(e.r8, e.ptr[e.rcx + i->src1.offset]);
+    e.lea(e.r8, e.ptr[addr]);
     CallNative(e, TraceContextStoreV128);
 #endif  // DTRACE
   } else {
@@ -1062,7 +1079,7 @@ table->AddSequence(OPCODE_LOAD, [](X64Emitter& e, Instr*& i) {
     e.EndOp(dest);
 #if DTRACE
     e.lea(e.rdx, e.ptr[addr]);
-    e.movss(e.xmm0, dest);
+    e.lea(e.r8, Stash(e, dest));
     CallNative(e, TraceMemoryLoadF32);
 #endif  // DTRACE
   } else if (i->Match(SIG_TYPE_F64, SIG_TYPE_IGNORE)) {
@@ -1072,7 +1089,7 @@ table->AddSequence(OPCODE_LOAD, [](X64Emitter& e, Instr*& i) {
     e.EndOp(dest);
 #if DTRACE
     e.lea(e.rdx, e.ptr[addr]);
-    e.movsd(e.xmm0, dest);
+    e.lea(e.r8, Stash(e, dest));
     CallNative(e, TraceMemoryLoadF64);
 #endif  // DTRACE
   } else if (i->Match(SIG_TYPE_V128, SIG_TYPE_IGNORE)) {
@@ -1224,14 +1241,16 @@ table->AddSequence(OPCODE_STORE, [](X64Emitter& e, Instr*& i) {
     e.EndOp(src);
 #if DTRACE
     e.lea(e.rdx, e.ptr[addr]);
-    e.movss(e.xmm0, src);
+    e.lea(e.r8, Stash(e, src));
     CallNative(e, TraceMemoryStoreF32);
 #endif  // DTRACE
   } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_F32C)) {
     e.mov(e.dword[addr], i->src2.value->constant.i32);
 #if DTRACE
     e.lea(e.rdx, e.ptr[addr]);
-    e.movss(e.xmm0, e.ptr[addr]);
+    e.mov(e.eax, i->src2.value->constant.i32);
+    e.vmovd(e.xmm0, e.eax);
+    e.lea(e.r8, Stash(e, e.xmm0));
     CallNative(e, TraceMemoryStoreF32);
 #endif  // DTRACE
   } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_F64)) {
@@ -1241,7 +1260,7 @@ table->AddSequence(OPCODE_STORE, [](X64Emitter& e, Instr*& i) {
     e.EndOp(src);
 #if DTRACE
     e.lea(e.rdx, e.ptr[addr]);
-    e.movsd(e.xmm0, src);
+    e.lea(e.r8, Stash(e, src));
     CallNative(e, TraceMemoryStoreF64);
 #endif  // DTRACE
   } else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_F64C)) {
@@ -2160,7 +2179,6 @@ table->AddSequence(OPCODE_VECTOR_SHL, [](X64Emitter& e, Instr*& i) {
     XmmBinaryOp(e, i, i->flags, [](X64Emitter& e, Instr& i, const Xmm& dest_src, const Xmm& src) {
       // src shift mask may have values >31, and x86 sets to zero when
       // that happens so we mask.
-      e.db(0xCC);
       e.mov(e.eax, 0x1F);
       e.vmovd(e.xmm0, e.eax);
       e.vpbroadcastd(e.xmm0, e.xmm0);
@@ -2637,16 +2655,14 @@ table->AddSequence(OPCODE_UNPACK, [](X64Emitter& e, Instr*& i) {
     XmmUnaryOp(e, i, 0, [](X64Emitter& e, Instr& i, const Xmm& dest, const Xmm& src) {
       // sx = src.iw >> 16;
       // sy = src.iw & 0xFFFF;
-      // dest = { 3.0 + (sx / float(1 << 22)),
-      //          3.0 + (sy / float(1 << 22)),
+      // dest = { XMConvertHalfToFloat(sx),
+      //          XMConvertHalfToFloat(sy),
       //          0.0,
-      //          1.0); --- or 3.0?
-      // So:
-      //   xmm = {0,0,0,packed}
-      //   xmm <<= 1w {0,0,packed,0}
-      //   xmm = VCVTPH2PS(xmm) {sx,sy,0,0}
-      //   xmm /=
-      UNIMPLEMENTED_SEQ();
+      //          1.0 };
+      auto addr = Stash(e, src);
+      e.lea(e.rdx, addr);
+      CallNative(e, Unpack_FLOAT16_2);
+      e.movaps(dest, addr);
     });
   } else if (i->flags == PACK_TYPE_FLOAT16_4) {
     // Could be shared with FLOAT16_2.
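The FLOAT16_2 unpack above now stashes the source register, calls the native
Unpack_FLOAT16_2 helper (which leans on DirectXMath's XMConvertHalfToFloat),
and reloads the result, replacing the previously unimplemented in-line sketch.
For reference, the core of a half-to-float conversion looks roughly like this
(hand-written sketch, not the DirectXMath implementation; denormal/inf/NaN
handling omitted):

    #include <cstdint>
    #include <cstring>
    // Convert an IEEE binary16 value to binary32 (normal and zero inputs only).
    float HalfToFloat(uint16_t h) {
      uint32_t sign = (uint32_t)(h & 0x8000) << 16;
      if ((h & 0x7FFF) == 0) {  // +/- zero passes straight through
        float f;
        std::memcpy(&f, &sign, sizeof(f));
        return f;
      }
      uint32_t exp = (h >> 10) & 0x1F;              // 5-bit exponent, bias 15
      uint32_t mant = (uint32_t)(h & 0x3FF) << 13;  // 10-bit -> 23-bit mantissa
      uint32_t bits = sign | ((exp + 112) << 23) | mant;  // rebias 15 -> 127
      float f;
      std::memcpy(&f, &bits, sizeof(f));
      return f;
    }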
diff --git a/src/alloy/backend/x64/lowering/tracers.cc b/src/alloy/backend/x64/lowering/tracers.cc
index f53d70dc1..0d7975847 100644
--- a/src/alloy/backend/x64/lowering/tracers.cc
+++ b/src/alloy/backend/x64/lowering/tracers.cc
@@ -57,23 +57,18 @@ void TraceContextLoadI64(void* raw_context, uint64_t offset, uint64_t value) {
   auto thread_state = *((ThreadState**)raw_context);
   DPRINT("%lld (%llX) = ctx i64 +%d\n", (int64_t)value, value, offset);
 }
-void TraceContextLoadF32(void* raw_context, uint64_t offset, float value) {
+void TraceContextLoadF32(void* raw_context, uint64_t offset, __m128 value) {
   auto thread_state = *((ThreadState**)raw_context);
-  union {
-    float f;
-    uint32_t u;
-  } x;
-  x.f = value;
-  DPRINT("%e (%X) = ctx f32 +%d\n", x.f, x.u, offset);
+  DPRINT("%e (%X) = ctx f32 +%d\n", value.m128_f32[0], value.m128_i32[0], offset);
 }
-void TraceContextLoadF64(void* raw_context, uint64_t offset, double value) {
+void TraceContextLoadF64(void* raw_context, uint64_t offset, __m128 value) {
   auto thread_state = *((ThreadState**)raw_context);
   union {
-    double f;
-    uint64_t u;
-  } x;
-  x.f = value;
-  DPRINT("%lle (%llX) = ctx f64 +%d\n", x.f, x.u, offset);
+    double d;
+    uint64_t x;
+  } f;
+  f.x = value.m128_i64[0];
+  DPRINT("%lle (%llX) = ctx f64 +%d\n", f.d, value.m128_i64[0], offset);
 }
 void TraceContextLoadV128(void* raw_context, uint64_t offset, __m128 value) {
   auto thread_state = *((ThreadState**)raw_context);
@@ -99,23 +94,18 @@ void TraceContextStoreI64(void* raw_context, uint64_t offset, uint64_t value) {
   auto thread_state = *((ThreadState**)raw_context);
   DPRINT("ctx i64 +%d = %lld (%llX)\n", offset, (int64_t)value, value);
 }
-void TraceContextStoreF32(void* raw_context, uint64_t offset, float value) {
+void TraceContextStoreF32(void* raw_context, uint64_t offset, __m128 value) {
   auto thread_state = *((ThreadState**)raw_context);
-  union {
-    float f;
-    uint32_t u;
-  } x;
-  x.f = value;
-  DPRINT("ctx f32 +%d = %e (%.X)\n", offset, x.f, x.u);
+  DPRINT("ctx f32 +%d = %e (%X)\n", offset, value.m128_f32[0], value.m128_i32[0]);
 }
-void TraceContextStoreF64(void* raw_context, uint64_t offset, double value) {
+void TraceContextStoreF64(void* raw_context, uint64_t offset, __m128 value) {
   auto thread_state = *((ThreadState**)raw_context);
   union {
-    double f;
-    uint64_t u;
-  } x;
-  x.f = value;
-  DPRINT("ctx f64 +%d = %lle (%.llX)\n", offset, x.f, x.u);
+    double d;
+    uint64_t x;
+  } f;
+  f.x = value.m128_i64[0];
+  DPRINT("ctx f64 +%d = %lle (%llX)\n", offset, f.d, value.m128_i64[0]);
 }
 void TraceContextStoreV128(void* raw_context, uint64_t offset, __m128 value) {
   auto thread_state = *((ThreadState**)raw_context);
@@ -140,23 +130,18 @@ void TraceMemoryLoadI64(void* raw_context, uint64_t address, uint64_t value) {
   auto thread_state = *((ThreadState**)raw_context);
   DPRINT("%lld (%llX) = load.i64 %.8X\n", (int64_t)value, value, address);
 }
-void TraceMemoryLoadF32(void* raw_context, uint64_t address, float value) {
+void TraceMemoryLoadF32(void* raw_context, uint64_t address, __m128 value) {
   auto thread_state = *((ThreadState**)raw_context);
-  union {
-    float f;
-    uint32_t u;
-  } x;
-  x.f = value;
-  DPRINT("%e (%X) = load.f32 %.8X\n", x.f, x.u, address);
+  DPRINT("%e (%X) = load.f32 %.8X\n", value.m128_f32[0], value.m128_i32[0], address);
 }
-void TraceMemoryLoadF64(void* raw_context, uint64_t address, double value) {
+void TraceMemoryLoadF64(void* raw_context, uint64_t address, __m128 value) {
   auto thread_state = *((ThreadState**)raw_context);
   union {
-    double f;
-    uint64_t u;
-  } x;
-  x.f = value;
-  DPRINT("%lle (%llX) = load.f64 %.8X\n", x.f, x.u, address);
+    double d;
+    uint64_t x;
+  } f;
+  f.x = value.m128_i64[0];
+  DPRINT("%lle (%llX) = load.f64 %.8X\n", f.d, value.m128_i64[0], address);
 }
 void TraceMemoryLoadV128(void* raw_context, uint64_t address, __m128 value) {
   auto thread_state = *((ThreadState**)raw_context);
@@ -182,23 +167,18 @@ void TraceMemoryStoreI64(void* raw_context, uint64_t address, uint64_t value) {
   auto thread_state = *((ThreadState**)raw_context);
   DPRINT("store.i64 %.8X = %lld (%llX)\n", address, (int64_t)value, value);
 }
-void TraceMemoryStoreF32(void* raw_context, uint64_t address, float value) {
+void TraceMemoryStoreF32(void* raw_context, uint64_t address, __m128 value) {
   auto thread_state = *((ThreadState**)raw_context);
-  union {
-    float f;
-    uint32_t u;
-  } x;
-  x.f = value;
-  DPRINT("store.f32 %.8X = %e (%X)\n", address, x.f, x.u);
+  DPRINT("store.f32 %.8X = %e (%X)\n", address, value.m128_f32[0], value.m128_i32[0]);
 }
-void TraceMemoryStoreF64(void* raw_context, uint64_t address, double value) {
+void TraceMemoryStoreF64(void* raw_context, uint64_t address, __m128 value) {
   auto thread_state = *((ThreadState**)raw_context);
   union {
-    double f;
-    uint64_t u;
-  } x;
-  x.f = value;
-  DPRINT("store.f64 %.8X = %lle (%llX)\n", address, x.f, x.u);
+    double d;
+    uint64_t x;
+  } f;
+  f.x = value.m128_i64[0];
+  DPRINT("store.f64 %.8X = %lle (%llX)\n", address, f.d, value.m128_i64[0]);
 }
 void TraceMemoryStoreV128(void* raw_context, uint64_t address, __m128 value) {
   auto thread_state = *((ThreadState**)raw_context);
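The float/double-to-__m128 signature changes in tracers.cc and tracers.h line
up with the Microsoft x64 calling convention: a __m128 argument is passed by
hidden reference, so the emitter can spill the register with Stash() and lea
its address into r8 (the third argument slot, after rcx = context and
rdx = offset/address) instead of shuffling values through xmm0. A plain C++
caller gets the same ABI treatment automatically; a minimal sketch (names are
illustrative, not from the patch):

    #include <cstdio>
    #include <xmmintrin.h>
    // On MSVC x64, 'value' actually arrives as a pointer in the third
    // argument register (r8); the callee reads the lanes through it.
    static void DumpLane0(void* /*ctx*/, unsigned long long offset, __m128 value) {
      printf("+%llu = %e\n", offset, _mm_cvtss_f32(value));  // lane 0
    }
    int main() {
      DumpLane0(nullptr, 16, _mm_set_ps(1.0f, 0.0f, 0.5f, 3.25f));  // prints 3.25
      return 0;
    }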
diff --git a/src/alloy/backend/x64/lowering/tracers.h b/src/alloy/backend/x64/lowering/tracers.h
index eccc87de9..9afd58448 100644
--- a/src/alloy/backend/x64/lowering/tracers.h
+++ b/src/alloy/backend/x64/lowering/tracers.h
@@ -25,32 +25,32 @@ void TraceContextLoadI8(void* raw_context, uint64_t offset, uint8_t value);
 void TraceContextLoadI16(void* raw_context, uint64_t offset, uint16_t value);
 void TraceContextLoadI32(void* raw_context, uint64_t offset, uint32_t value);
 void TraceContextLoadI64(void* raw_context, uint64_t offset, uint64_t value);
-void TraceContextLoadF32(void* raw_context, uint64_t offset, float value);
-void TraceContextLoadF64(void* raw_context, uint64_t offset, double value);
+void TraceContextLoadF32(void* raw_context, uint64_t offset, __m128 value);
+void TraceContextLoadF64(void* raw_context, uint64_t offset, __m128 value);
 void TraceContextLoadV128(void* raw_context, uint64_t offset, __m128 value);
 
 void TraceContextStoreI8(void* raw_context, uint64_t offset, uint8_t value);
 void TraceContextStoreI16(void* raw_context, uint64_t offset, uint16_t value);
 void TraceContextStoreI32(void* raw_context, uint64_t offset, uint32_t value);
 void TraceContextStoreI64(void* raw_context, uint64_t offset, uint64_t value);
-void TraceContextStoreF32(void* raw_context, uint64_t offset, float value);
-void TraceContextStoreF64(void* raw_context, uint64_t offset, double value);
+void TraceContextStoreF32(void* raw_context, uint64_t offset, __m128 value);
+void TraceContextStoreF64(void* raw_context, uint64_t offset, __m128 value);
 void TraceContextStoreV128(void* raw_context, uint64_t offset, __m128 value);
 
 void TraceMemoryLoadI8(void* raw_context, uint64_t address, uint8_t value);
 void TraceMemoryLoadI16(void* raw_context, uint64_t address, uint16_t value);
 void TraceMemoryLoadI32(void* raw_context, uint64_t address, uint32_t value);
 void TraceMemoryLoadI64(void* raw_context, uint64_t address, uint64_t value);
-void TraceMemoryLoadF32(void* raw_context, uint64_t address, float value);
-void TraceMemoryLoadF64(void* raw_context, uint64_t address, double value);
+void TraceMemoryLoadF32(void* raw_context, uint64_t address, __m128 value);
+void TraceMemoryLoadF64(void* raw_context, uint64_t address, __m128 value);
 void TraceMemoryLoadV128(void* raw_context, uint64_t address, __m128 value);
 
 void TraceMemoryStoreI8(void* raw_context, uint64_t address, uint8_t value);
 void TraceMemoryStoreI16(void* raw_context, uint64_t address, uint16_t value);
 void TraceMemoryStoreI32(void* raw_context, uint64_t address, uint32_t value);
 void TraceMemoryStoreI64(void* raw_context, uint64_t address, uint64_t value);
-void TraceMemoryStoreF32(void* raw_context, uint64_t address, float value);
-void TraceMemoryStoreF64(void* raw_context, uint64_t address, double value);
+void TraceMemoryStoreF32(void* raw_context, uint64_t address, __m128 value);
+void TraceMemoryStoreF64(void* raw_context, uint64_t address, __m128 value);
 void TraceMemoryStoreV128(void* raw_context, uint64_t address, __m128 value);
 
 }  // namespace lowering
diff --git a/src/alloy/backend/x64/x64_emitter.cc b/src/alloy/backend/x64/x64_emitter.cc
index 1e20c84f4..59fcfb36a 100644
--- a/src/alloy/backend/x64/x64_emitter.cc
+++ b/src/alloy/backend/x64/x64_emitter.cc
@@ -116,7 +116,9 @@ int X64Emitter::Emit(HIRBuilder* builder) {
       GetRegBit(r11) |
       GetRegBit(xmm1) |
       GetRegBit(xmm2) |
-      GetRegBit(xmm3);
+      GetRegBit(xmm3) |
+      GetRegBit(xmm4) |
+      GetRegBit(xmm5);
 
   // Function prolog.
   // Must be 16b aligned.
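Lastly, the emitter change above adds xmm4 and xmm5 to the reserved-register
mask alongside xmm1-xmm3: the allocator skips any register whose bit is set,
presumably so the new tracing and unpack sequences have more scratch XMM
registers that lowered code is never holding live values in. The bitmask
pattern, generically (illustrative sketch, not the project's exact helpers):

    #include <cstdint>
    // Each register gets one bit; the allocator masks reserved ones out.
    constexpr uint32_t RegBit(int index) { return 1u << index; }
    constexpr uint32_t kReservedXmm = RegBit(1) | RegBit(2) | RegBit(3) |
                                      RegBit(4) | RegBit(5);
    static_assert((kReservedXmm & RegBit(4)) != 0, "xmm4 is reserved");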