More SSE work.

Ben Vanik 2014-01-28 22:06:45 -08:00
parent b2e9086932
commit 8cddfcbf19
5 changed files with 132 additions and 134 deletions

View File

@@ -2093,19 +2093,19 @@ int Translate_DID_SATURATE(TranslationContext& ctx, Instr* i) {
return DispatchToC(ctx, i, IntCode_DID_SATURATE);
}
#define VECTOR_COMPARER(type, value, count, op) \
#define VECTOR_COMPARER(type, value, dest_value, count, op) \
const vec128_t& src1 = ics.rf[i->src1_reg].v128; \
const vec128_t& src2 = ics.rf[i->src2_reg].v128; \
vec128_t& dest = ics.rf[i->dest_reg].v128; \
for (int n = 0; n < count; n++) { \
dest.value[n] = ((type)src1.value[n] op (type)src2.value[n]) ? (type)0xFFFFFFFF : 0; \
dest.dest_value[n] = ((type)src1.value[n] op (type)src2.value[n]) ? 0xFFFFFFFF : 0; \
} \
return IA_NEXT;
uint32_t IntCode_VECTOR_COMPARE_EQ_I8(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(uint8_t, b16, 16, ==) };
uint32_t IntCode_VECTOR_COMPARE_EQ_I16(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(uint16_t, s8, 8, ==) };
uint32_t IntCode_VECTOR_COMPARE_EQ_I32(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(uint32_t, i4, 4, ==) };
uint32_t IntCode_VECTOR_COMPARE_EQ_F32(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(float, f4, 4, ==) };
uint32_t IntCode_VECTOR_COMPARE_EQ_I8(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(uint8_t, b16, b16, 16, ==) };
uint32_t IntCode_VECTOR_COMPARE_EQ_I16(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(uint16_t, s8, s8, 8, ==) };
uint32_t IntCode_VECTOR_COMPARE_EQ_I32(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(uint32_t, i4, i4, 4, ==) };
uint32_t IntCode_VECTOR_COMPARE_EQ_F32(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(float, f4, i4, 4, ==) };
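The extra dest_value parameter lets the float comparers write their all-ones result masks through the integer view of the destination (i4) instead of the float view (f4): assigning 0xFFFFFFFF through a float lane would store the converted value 4294967295.0f rather than setting every bit. A minimal sketch of the difference, using a standalone union shaped like vec128_t (illustration only, not code from this file):

#include <cstdint>
union vec128_like {
  float f4[4];
  uint32_t i4[4];
};
void set_lane_true(vec128_like& v, int n) {
  // v.f4[n] = (float)0xFFFFFFFF;  // wrong: stores the float 4.2949673e9f
  v.i4[n] = 0xFFFFFFFF;            // right: lane becomes all-ones bits,
}                                  // matching what cmpeqps would produce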
int Translate_VECTOR_COMPARE_EQ(TranslationContext& ctx, Instr* i) {
static IntCodeFn fns[] = {
IntCode_VECTOR_COMPARE_EQ_I8,
@@ -2119,10 +2119,10 @@ int Translate_VECTOR_COMPARE_EQ(TranslationContext& ctx, Instr* i) {
return DispatchToC(ctx, i, fns[i->flags]);
}
uint32_t IntCode_VECTOR_COMPARE_SGT_I8(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(int8_t, b16, 16, >) };
uint32_t IntCode_VECTOR_COMPARE_SGT_I16(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(int16_t, s8, 8, >) };
uint32_t IntCode_VECTOR_COMPARE_SGT_I32(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(int32_t, i4, 4, >) };
uint32_t IntCode_VECTOR_COMPARE_SGT_F32(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(float, f4, 4, >) };
uint32_t IntCode_VECTOR_COMPARE_SGT_I8(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(int8_t, b16, b16, 16, >) };
uint32_t IntCode_VECTOR_COMPARE_SGT_I16(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(int16_t, s8, s8, 8, >) };
uint32_t IntCode_VECTOR_COMPARE_SGT_I32(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(int32_t, i4, i4, 4, >) };
uint32_t IntCode_VECTOR_COMPARE_SGT_F32(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(float, f4, i4, 4, >) };
int Translate_VECTOR_COMPARE_SGT(TranslationContext& ctx, Instr* i) {
static IntCodeFn fns[] = {
IntCode_VECTOR_COMPARE_SGT_I8,
@@ -2136,10 +2136,10 @@ int Translate_VECTOR_COMPARE_SGT(TranslationContext& ctx, Instr* i) {
return DispatchToC(ctx, i, fns[i->flags]);
}
uint32_t IntCode_VECTOR_COMPARE_SGE_I8(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(int8_t, b16, 16, >=) };
uint32_t IntCode_VECTOR_COMPARE_SGE_I16(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(int16_t, s8, 8, >=) };
uint32_t IntCode_VECTOR_COMPARE_SGE_I32(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(int32_t, i4, 4, >=) };
uint32_t IntCode_VECTOR_COMPARE_SGE_F32(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(float, f4, 4, >=) };
uint32_t IntCode_VECTOR_COMPARE_SGE_I8(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(int8_t, b16, b16, 16, >=) };
uint32_t IntCode_VECTOR_COMPARE_SGE_I16(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(int16_t, s8, s8, 8, >=) };
uint32_t IntCode_VECTOR_COMPARE_SGE_I32(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(int32_t, i4, i4, 4, >=) };
uint32_t IntCode_VECTOR_COMPARE_SGE_F32(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(float, f4, i4, 4, >=) };
int Translate_VECTOR_COMPARE_SGE(TranslationContext& ctx, Instr* i) {
static IntCodeFn fns[] = {
IntCode_VECTOR_COMPARE_SGE_I8,
@@ -2153,10 +2153,10 @@ int Translate_VECTOR_COMPARE_SGE(TranslationContext& ctx, Instr* i) {
return DispatchToC(ctx, i, fns[i->flags]);
}
uint32_t IntCode_VECTOR_COMPARE_UGT_I8(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(uint8_t, b16, 16, >) };
uint32_t IntCode_VECTOR_COMPARE_UGT_I16(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(uint16_t, s8, 8, >) };
uint32_t IntCode_VECTOR_COMPARE_UGT_I32(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(uint32_t, i4, 4, >) };
uint32_t IntCode_VECTOR_COMPARE_UGT_F32(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(float, f4, 4, >) };
uint32_t IntCode_VECTOR_COMPARE_UGT_I8(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(uint8_t, b16, b16, 16, >) };
uint32_t IntCode_VECTOR_COMPARE_UGT_I16(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(uint16_t, s8, s8, 8, >) };
uint32_t IntCode_VECTOR_COMPARE_UGT_I32(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(uint32_t, i4, i4, 4, >) };
uint32_t IntCode_VECTOR_COMPARE_UGT_F32(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(float, f4, i4, 4, >) };
int Translate_VECTOR_COMPARE_UGT(TranslationContext& ctx, Instr* i) {
static IntCodeFn fns[] = {
IntCode_VECTOR_COMPARE_UGT_I8,
@@ -2170,10 +2170,10 @@ int Translate_VECTOR_COMPARE_UGT(TranslationContext& ctx, Instr* i) {
return DispatchToC(ctx, i, fns[i->flags]);
}
uint32_t IntCode_VECTOR_COMPARE_UGE_I8(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(uint8_t, b16, 16, >=) };
uint32_t IntCode_VECTOR_COMPARE_UGE_I16(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(uint16_t, s8, 8, >=) };
uint32_t IntCode_VECTOR_COMPARE_UGE_I32(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(uint32_t, i4, 4, >=) };
uint32_t IntCode_VECTOR_COMPARE_UGE_F32(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(float, f4, 4, >=) };
uint32_t IntCode_VECTOR_COMPARE_UGE_I8(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(uint8_t, b16, b16, 16, >=) };
uint32_t IntCode_VECTOR_COMPARE_UGE_I16(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(uint16_t, s8, s8, 8, >=) };
uint32_t IntCode_VECTOR_COMPARE_UGE_I32(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(uint32_t, i4, i4, 4, >=) };
uint32_t IntCode_VECTOR_COMPARE_UGE_F32(IntCodeState& ics, const IntCode* i) { VECTOR_COMPARER(float, f4, i4, 4, >=) };
int Translate_VECTOR_COMPARE_UGE(TranslationContext& ctx, Instr* i) {
static IntCodeFn fns[] = {
IntCode_VECTOR_COMPARE_UGE_I8,

View File

@@ -17,6 +17,9 @@
#include <alloy/runtime/runtime.h>
#include <alloy/runtime/thread_state.h>
// TODO(benvanik): reimplement packing functions
#include <DirectXPackedVector.h>
using namespace alloy;
using namespace alloy::backend::x64;
using namespace alloy::backend::x64::lowering;
@@ -87,6 +90,14 @@ void Dummy() {
//
}
void Unpack_FLOAT16_2(void* raw_context, __m128& v) {
uint32_t src = v.m128_i32[3];
v.m128_f32[0] = DirectX::PackedVector::XMConvertHalfToFloat((uint16_t)src);
v.m128_f32[1] = DirectX::PackedVector::XMConvertHalfToFloat((uint16_t)(src >> 16));
v.m128_f32[2] = 0.0f;
v.m128_f32[3] = 1.0f;
}
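A worked example of what this helper produces, with an input value assumed for illustration: lane 3 of the stashed vector carries two packed IEEE half floats, the low 16 bits become X, the high 16 bits become Y, and Z/W are forced to 0.0 and 1.0.

// Sketch (MSVC __m128 members): 0x3C00 is 1.0h, 0x4000 is 2.0h.
__m128 v;
v.m128_i32[3] = 0x40003C00;    // high word = 2.0h, low word = 1.0h
Unpack_FLOAT16_2(nullptr, v);  // context pointer is unused by the helper
// v now holds {1.0f, 2.0f, 0.0f, 1.0f}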
uint64_t LoadClock(void* raw_context) {
LARGE_INTEGER counter;
uint64_t time = 0;
@@ -378,7 +389,7 @@ table->AddSequence(OPCODE_CAST, [](X64Emitter& e, Instr*& i) {
Xmm src;
e.BeginOp(i->dest, dest, REG_DEST,
i->src1.value, src, 0);
e.pextrd(dest, src, 0);
e.vmovd(dest, src);
e.EndOp(dest, src);
} else {
UNIMPLEMENTED_SEQ();
@@ -389,7 +400,7 @@ table->AddSequence(OPCODE_CAST, [](X64Emitter& e, Instr*& i) {
Xmm src;
e.BeginOp(i->dest, dest, REG_DEST,
i->src1.value, src, 0);
e.pextrq(dest, src, 0);
e.vmovq(dest, src);
e.EndOp(dest, src);
} else {
UNIMPLEMENTED_SEQ();
@@ -400,7 +411,7 @@ table->AddSequence(OPCODE_CAST, [](X64Emitter& e, Instr*& i) {
Reg32 src;
e.BeginOp(i->dest, dest, REG_DEST,
i->src1.value, src, 0);
e.pinsrd(dest, src, 0);
e.vmovd(dest, src);
e.EndOp(dest, src);
} else {
UNIMPLEMENTED_SEQ();
@@ -411,7 +422,7 @@ table->AddSequence(OPCODE_CAST, [](X64Emitter& e, Instr*& i) {
Reg64 src;
e.BeginOp(i->dest, dest, REG_DEST,
i->src1.value, src, 0);
e.pinsrq(dest, src, 0);
e.vmovq(dest, src);
e.EndOp(dest, src);
} else {
UNIMPLEMENTED_SEQ();
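These casts now use plain GPR<->XMM moves instead of extract/insert at index 0: vmovd/vmovq zero-extend into the destination register and are shorter encodings, whereas pinsrd/pinsrq merge into whatever stale bits the destination XMM already held. The same moves expressed as intrinsics (a sketch, not code from this file):

#include <emmintrin.h>
#include <cstdint>
uint32_t xmm_low32(__m128i v)   { return (uint32_t)_mm_cvtsi128_si32(v); } // vmovd r32, xmm
uint64_t xmm_low64(__m128i v)   { return (uint64_t)_mm_cvtsi128_si64(v); } // vmovq r64, xmm
__m128i  gpr_to_xmm(uint32_t x) { return _mm_cvtsi32_si128((int)x); }      // vmovd xmm, r32; upper lanes zeroed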
@@ -582,7 +593,7 @@ table->AddSequence(OPCODE_CONVERT, [](X64Emitter& e, Instr*& i) {
e.BeginOp(i->dest, dest, REG_DEST,
i->src1.value, src, 0);
// TODO(benvanik): additional checks for saturation/etc? cvtt* (trunc?)
e.cvtss2si(dest, src);
e.cvttss2si(dest, src);
e.EndOp(dest, src);
} else if (i->Match(SIG_TYPE_I32, SIG_TYPE_F64)) {
Reg32 dest;
@@ -591,7 +602,7 @@ table->AddSequence(OPCODE_CONVERT, [](X64Emitter& e, Instr*& i) {
i->src1.value, src, 0);
// TODO(benvanik): additional checks for saturation/etc? cvtt* (trunc?)
e.cvtsd2ss(e.xmm0, src);
e.cvtss2si(dest, e.xmm0);
e.cvttss2si(dest, e.xmm0);
e.EndOp(dest, src);
} else if (i->Match(SIG_TYPE_I64, SIG_TYPE_F64)) {
Reg64 dest;
@@ -599,7 +610,7 @@ table->AddSequence(OPCODE_CONVERT, [](X64Emitter& e, Instr*& i) {
e.BeginOp(i->dest, dest, REG_DEST,
i->src1.value, src, 0);
// TODO(benvanik): additional checks for saturation/etc? cvtt* (trunc?)
e.cvtsd2si(dest, src);
e.cvttsd2si(dest, src);
e.EndOp(dest, src);
} else if (i->Match(SIG_TYPE_F32, SIG_TYPE_I32)) {
Xmm dest;
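Switching cvtss2si/cvtsd2si to their cvtt* forms changes float-to-int conversion from the current MXCSR rounding mode (round-to-nearest by default) to truncation toward zero, the usual C-style conversion; the TODO about saturation checks still stands. The difference, shown with intrinsics (a sketch):

#include <immintrin.h>
#include <cstdio>
int main() {
  __m128 v = _mm_set_ss(2.9f);
  printf("%d\n", _mm_cvtss_si32(v));   // cvtss2si: rounds to nearest -> 3
  printf("%d\n", _mm_cvttss_si32(v));  // cvttss2si: truncates -> 2
  return 0;
}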
@@ -764,10 +775,11 @@ table->AddSequence(OPCODE_LOAD_CLOCK, [](X64Emitter& e, Instr*& i) {
// --------------------------------------------------------------------------
table->AddSequence(OPCODE_LOAD_CONTEXT, [](X64Emitter& e, Instr*& i) {
auto addr = e.rcx + i->src1.offset;
if (i->Match(SIG_TYPE_I8, SIG_TYPE_IGNORE)) {
Reg8 dest;
e.BeginOp(i->dest, dest, REG_DEST);
e.mov(dest, e.byte[e.rcx + i->src1.offset]);
e.mov(dest, e.byte[addr]);
e.EndOp(dest);
#if DTRACE
e.mov(e.rdx, i->src1.offset);
@@ -777,7 +789,7 @@ table->AddSequence(OPCODE_LOAD_CONTEXT, [](X64Emitter& e, Instr*& i) {
} else if (i->Match(SIG_TYPE_I16, SIG_TYPE_IGNORE)) {
Reg16 dest;
e.BeginOp(i->dest, dest, REG_DEST);
e.mov(dest, e.word[e.rcx + i->src1.offset]);
e.mov(dest, e.word[addr]);
e.EndOp(dest);
#if DTRACE
e.mov(e.rdx, i->src1.offset);
@@ -787,7 +799,7 @@ table->AddSequence(OPCODE_LOAD_CONTEXT, [](X64Emitter& e, Instr*& i) {
} else if (i->Match(SIG_TYPE_I32, SIG_TYPE_IGNORE)) {
Reg32 dest;
e.BeginOp(i->dest, dest, REG_DEST);
e.mov(dest, e.dword[e.rcx + i->src1.offset]);
e.mov(dest, e.dword[addr]);
e.EndOp(dest);
#if DTRACE
e.mov(e.rdx, i->src1.offset);
@@ -797,7 +809,7 @@ table->AddSequence(OPCODE_LOAD_CONTEXT, [](X64Emitter& e, Instr*& i) {
} else if (i->Match(SIG_TYPE_I64, SIG_TYPE_IGNORE)) {
Reg64 dest;
e.BeginOp(i->dest, dest, REG_DEST);
e.mov(dest, e.qword[e.rcx + i->src1.offset]);
e.mov(dest, e.qword[addr]);
e.EndOp(dest);
#if DTRACE
e.mov(e.rdx, i->src1.offset);
@@ -807,28 +819,28 @@ table->AddSequence(OPCODE_LOAD_CONTEXT, [](X64Emitter& e, Instr*& i) {
} else if (i->Match(SIG_TYPE_F32, SIG_TYPE_IGNORE)) {
Xmm dest;
e.BeginOp(i->dest, dest, REG_DEST);
e.movss(dest, e.dword[e.rcx + i->src1.offset]);
e.movss(dest, e.dword[addr]);
e.EndOp(dest);
#if DTRACE
e.mov(e.rdx, i->src1.offset);
e.movaps(e.xmm0, dest);
e.lea(e.r8, Stash(e, dest));
CallNative(e, TraceContextLoadF32);
#endif // DTRACE
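The float/double trace calls now spill the XMM value to memory with Stash (the existing scratch-slot helper in this file) and pass its address in r8, instead of copying the scalar into xmm0. Under the Win64 convention the third argument travels in r8 or xmm2, never xmm0, so the old movss into xmm0 likely never reached the callee intact; and since MSVC passes __m128 arguments by address, a by-value __m128 parameter on the tracer side pairs exactly with an address in r8. A sketch of the emitted pattern under that reading:

// Callee prototype (as declared in the updated tracers header):
void TraceContextLoadF32(void* raw_context, uint64_t offset, __m128 value);
// Emitted call, in pseudo-assembly:
//   mov    rdx, offset          ; arg 2: context offset
//   movaps [stash], xmm_src     ; Stash(): spill the 128-bit value
//   lea    r8, [stash]          ; arg 3: __m128 is passed by address
//   call   TraceContextLoadF32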
} else if (i->Match(SIG_TYPE_F64, SIG_TYPE_IGNORE)) {
Xmm dest;
e.BeginOp(i->dest, dest, REG_DEST);
e.movsd(dest, e.qword[e.rcx + i->src1.offset]);
e.movsd(dest, e.qword[addr]);
e.EndOp(dest);
#if DTRACE
e.mov(e.rdx, i->src1.offset);
e.movaps(e.xmm0, dest);
e.lea(e.r8, Stash(e, dest));
CallNative(e, TraceContextLoadF64);
#endif // DTRACE
} else if (i->Match(SIG_TYPE_V128, SIG_TYPE_IGNORE)) {
Xmm dest;
e.BeginOp(i->dest, dest, REG_DEST);
// NOTE: we always know we are aligned.
e.movaps(dest, e.ptr[e.rcx + i->src1.offset]);
e.movaps(dest, e.ptr[addr]);
e.EndOp(dest);
#if DTRACE
e.mov(e.rdx, i->src1.offset);
@@ -843,10 +855,11 @@ table->AddSequence(OPCODE_LOAD_CONTEXT, [](X64Emitter& e, Instr*& i) {
});
table->AddSequence(OPCODE_STORE_CONTEXT, [](X64Emitter& e, Instr*& i) {
auto addr = e.rcx + i->src1.offset;
if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_I8)) {
Reg8 src;
e.BeginOp(i->src2.value, src, 0);
e.mov(e.byte[e.rcx + i->src1.offset], src);
e.mov(e.byte[addr], src);
e.EndOp(src);
#if DTRACE
e.mov(e.rdx, i->src1.offset);
@@ -854,7 +867,7 @@ table->AddSequence(OPCODE_STORE_CONTEXT, [](X64Emitter& e, Instr*& i) {
CallNative(e, TraceContextStoreI8);
#endif // DTRACE
} else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_I8C)) {
e.mov(e.byte[e.rcx + i->src1.offset], i->src2.value->constant.i8);
e.mov(e.byte[addr], i->src2.value->constant.i8);
#if DTRACE
e.mov(e.rdx, i->src1.offset);
e.mov(e.r8b, i->src2.value->constant.i8);
@@ -863,7 +876,7 @@ table->AddSequence(OPCODE_STORE_CONTEXT, [](X64Emitter& e, Instr*& i) {
} else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_I16)) {
Reg16 src;
e.BeginOp(i->src2.value, src, 0);
e.mov(e.word[e.rcx + i->src1.offset], src);
e.mov(e.word[addr], src);
e.EndOp(src);
#if DTRACE
e.mov(e.rdx, i->src1.offset);
@@ -871,7 +884,7 @@ table->AddSequence(OPCODE_STORE_CONTEXT, [](X64Emitter& e, Instr*& i) {
CallNative(e, TraceContextStoreI16);
#endif // DTRACE
} else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_I16C)) {
e.mov(e.word[e.rcx + i->src1.offset], i->src2.value->constant.i16);
e.mov(e.word[addr], i->src2.value->constant.i16);
#if DTRACE
e.mov(e.rdx, i->src1.offset);
e.mov(e.r8w, i->src2.value->constant.i16);
@@ -880,7 +893,7 @@ table->AddSequence(OPCODE_STORE_CONTEXT, [](X64Emitter& e, Instr*& i) {
} else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_I32)) {
Reg32 src;
e.BeginOp(i->src2.value, src, 0);
e.mov(e.dword[e.rcx + i->src1.offset], src);
e.mov(e.dword[addr], src);
e.EndOp(src);
#if DTRACE
e.mov(e.rdx, i->src1.offset);
@@ -888,7 +901,7 @@ table->AddSequence(OPCODE_STORE_CONTEXT, [](X64Emitter& e, Instr*& i) {
CallNative(e, TraceContextStoreI32);
#endif // DTRACE
} else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_I32C)) {
e.mov(e.dword[e.rcx + i->src1.offset], i->src2.value->constant.i32);
e.mov(e.dword[addr], i->src2.value->constant.i32);
#if DTRACE
e.mov(e.rdx, i->src1.offset);
e.mov(e.r8d, i->src2.value->constant.i32);
@@ -897,7 +910,7 @@ table->AddSequence(OPCODE_STORE_CONTEXT, [](X64Emitter& e, Instr*& i) {
} else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_I64)) {
Reg64 src;
e.BeginOp(i->src2.value, src, 0);
e.mov(e.qword[e.rcx + i->src1.offset], src);
e.mov(e.qword[addr], src);
e.EndOp(src);
#if DTRACE
e.mov(e.rdx, i->src1.offset);
@@ -905,7 +918,7 @@ table->AddSequence(OPCODE_STORE_CONTEXT, [](X64Emitter& e, Instr*& i) {
CallNative(e, TraceContextStoreI64);
#endif // DTRACE
} else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_I64C)) {
MovMem64(e, e.rcx + i->src1.offset, i->src2.value->constant.i64);
MovMem64(e, addr, i->src2.value->constant.i64);
#if DTRACE
e.mov(e.rdx, i->src1.offset);
e.mov(e.r8, i->src2.value->constant.i64);
@@ -914,42 +927,46 @@ table->AddSequence(OPCODE_STORE_CONTEXT, [](X64Emitter& e, Instr*& i) {
} else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_F32)) {
Xmm src;
e.BeginOp(i->src2.value, src, 0);
e.movss(e.dword[e.rcx + i->src1.offset], src);
e.movss(e.dword[addr], src);
e.EndOp(src);
#if DTRACE
e.mov(e.rdx, i->src1.offset);
e.movss(e.xmm0, src);
e.lea(e.r8, Stash(e, src));
CallNative(e, TraceContextStoreF32);
#endif // DTRACE
} else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_F32C)) {
e.mov(e.dword[e.rcx + i->src1.offset], i->src2.value->constant.i32);
e.mov(e.dword[addr], i->src2.value->constant.i32);
#if DTRACE
e.mov(e.rdx, i->src1.offset);
e.movss(e.xmm0, e.dword[e.rcx + i->src1.offset]);
e.mov(e.eax, i->src2.value->constant.i32);
e.vmovd(e.xmm0, e.eax);
e.lea(e.r8, Stash(e, e.xmm0));
CallNative(e, TraceContextStoreF32);
#endif // DTRACE
} else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_F64)) {
Xmm src;
e.BeginOp(i->src2.value, src, 0);
e.movsd(e.qword[e.rcx + i->src1.offset], src);
e.movsd(e.qword[addr], src);
e.EndOp(src);
#if DTRACE
e.mov(e.rdx, i->src1.offset);
e.movsd(e.xmm0, src);
e.lea(e.r8, Stash(e, src));
CallNative(e, TraceContextStoreF64);
#endif // DTRACE
} else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_F64C)) {
MovMem64(e, e.rcx + i->src1.offset, i->src2.value->constant.i64);
MovMem64(e, addr, i->src2.value->constant.i64);
#if DTRACE
e.mov(e.rdx, i->src1.offset);
e.movsd(e.xmm0, e.qword[e.rcx + i->src1.offset]);
e.mov(e.rax, i->src2.value->constant.i64);
e.vmovq(e.xmm0, e.rax);
e.lea(e.r8, Stash(e, e.xmm0));
CallNative(e, TraceContextStoreF64);
#endif // DTRACE
} else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_V128)) {
Xmm src;
e.BeginOp(i->src2.value, src, 0);
// NOTE: we always know we are aligned.
e.movaps(e.ptr[e.rcx + i->src1.offset], src);
e.movaps(e.ptr[addr], src);
e.EndOp(src);
#if DTRACE
e.mov(e.rdx, i->src1.offset);
@@ -959,11 +976,11 @@ table->AddSequence(OPCODE_STORE_CONTEXT, [](X64Emitter& e, Instr*& i) {
} else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_V128C)) {
// TODO(benvanik): check zero
// TODO(benvanik): correct order?
MovMem64(e, e.rcx + i->src1.offset, i->src2.value->constant.v128.low);
MovMem64(e, e.rcx + i->src1.offset + 8, i->src2.value->constant.v128.high);
MovMem64(e, addr, i->src2.value->constant.v128.low);
MovMem64(e, addr + 8, i->src2.value->constant.v128.high);
#if DTRACE
e.mov(e.rdx, i->src1.offset);
e.lea(e.r8, e.ptr[e.rcx + i->src1.offset]);
e.lea(e.r8, e.ptr[addr]);
CallNative(e, TraceContextStoreV128);
#endif // DTRACE
} else {
@@ -1062,7 +1079,7 @@ table->AddSequence(OPCODE_LOAD, [](X64Emitter& e, Instr*& i) {
e.EndOp(dest);
#if DTRACE
e.lea(e.rdx, e.ptr[addr]);
e.movss(e.xmm0, dest);
e.lea(e.r8, Stash(e, dest));
CallNative(e, TraceMemoryLoadF32);
#endif // DTRACE
} else if (i->Match(SIG_TYPE_F64, SIG_TYPE_IGNORE)) {
@@ -1072,7 +1089,7 @@ table->AddSequence(OPCODE_LOAD, [](X64Emitter& e, Instr*& i) {
e.EndOp(dest);
#if DTRACE
e.lea(e.rdx, e.ptr[addr]);
e.movsd(e.xmm0, dest);
e.lea(e.r8, Stash(e, dest));
CallNative(e, TraceMemoryLoadF64);
#endif // DTRACE
} else if (i->Match(SIG_TYPE_V128, SIG_TYPE_IGNORE)) {
@@ -1224,14 +1241,16 @@ table->AddSequence(OPCODE_STORE, [](X64Emitter& e, Instr*& i) {
e.EndOp(src);
#if DTRACE
e.lea(e.rdx, e.ptr[addr]);
e.movss(e.xmm0, src);
e.lea(e.r8, Stash(e, src));
CallNative(e, TraceMemoryStoreF32);
#endif // DTRACE
} else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_F32C)) {
e.mov(e.dword[addr], i->src2.value->constant.i32);
#if DTRACE
e.lea(e.rdx, e.ptr[addr]);
e.movss(e.xmm0, e.ptr[addr]);
e.mov(e.eax, i->src2.value->constant.i32);
e.vmovd(e.xmm0, e.eax);
e.lea(e.r8, Stash(e, e.xmm0));
CallNative(e, TraceMemoryStoreF32);
#endif // DTRACE
} else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_F64)) {
@@ -1241,7 +1260,7 @@ table->AddSequence(OPCODE_STORE, [](X64Emitter& e, Instr*& i) {
e.EndOp(src);
#if DTRACE
e.lea(e.rdx, e.ptr[addr]);
e.movsd(e.xmm0, src);
e.lea(e.r8, Stash(e, src));
CallNative(e, TraceMemoryStoreF64);
#endif // DTRACE
} else if (i->Match(SIG_TYPE_X, SIG_TYPE_IGNORE, SIG_TYPE_F64C)) {
@@ -2160,7 +2179,6 @@ table->AddSequence(OPCODE_VECTOR_SHL, [](X64Emitter& e, Instr*& i) {
XmmBinaryOp(e, i, i->flags, [](X64Emitter& e, Instr& i, const Xmm& dest_src, const Xmm& src) {
// src shift mask may have values >31, and x86 sets to zero when
// that happens so we mask.
e.db(0xCC);
e.mov(e.eax, 0x1F);
e.vmovd(e.xmm0, e.eax);
e.vpbroadcastd(e.xmm0, e.xmm0);
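The deleted e.db(0xCC) emitted an int3 breakpoint byte into the instruction stream (a leftover debugging trap), so removing it lets this sequence actually run. The surviving lines build a 0x1F mask in every dword lane (mov eax, 0x1F; vmovd; vpbroadcastd) to AND against the per-lane shift counts: as the comment above notes, guest shift values may exceed 31, and x86 variable vector shifts yield zero in that case, so masking restores the mod-32 behavior. Equivalent intrinsics (a sketch):

#include <immintrin.h>
__m128i mask_shift_counts(__m128i counts) {
  __m128i m = _mm_set1_epi32(0x1F);  // what mov/vmovd/vpbroadcastd build
  return _mm_and_si128(counts, m);   // counts & 31 in each lane
}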
@@ -2637,16 +2655,14 @@ table->AddSequence(OPCODE_UNPACK, [](X64Emitter& e, Instr*& i) {
XmmUnaryOp(e, i, 0, [](X64Emitter& e, Instr& i, const Xmm& dest, const Xmm& src) {
// sx = src.iw >> 16;
// sy = src.iw & 0xFFFF;
// dest = { 3.0 + (sx / float(1 << 22)),
// 3.0 + (sy / float(1 << 22)),
// dest = { XMConvertHalfToFloat(sx),
// XMConvertHalfToFloat(sy),
// 0.0,
// 1.0); --- or 3.0?
// So:
// xmm = {0,0,0,packed}
// xmm <<= 1w {0,0,packed,0}
// xmm = VCVTPH2PS(xmm) {sx,sy,0,0}
// xmm /=
UNIMPLEMENTED_SEQ();
// 1.0 };
auto addr = Stash(e, src);
e.lea(e.rdx, addr);
CallNative(e, Unpack_FLOAT16_2);
e.movaps(dest, addr);
});
} else if (i->flags == PACK_TYPE_FLOAT16_4) {
// Could be shared with FLOAT16_2.
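The FLOAT16_2 unpack now round-trips through the native Unpack_FLOAT16_2 helper via Stash/CallNative rather than the inline VCVTPH2PS route the old comment sketched. If F16C support were assumed, the inline version could look roughly like this (a sketch under that assumption, not what this commit emits):

#include <immintrin.h>  // _mm_cvtph_ps requires F16C hardware
__m128 unpack_float16_2(uint32_t packed) {
  __m128 xy = _mm_cvtph_ps(_mm_cvtsi32_si128(packed));       // {x, y, 0, 0}
  return _mm_add_ps(xy, _mm_set_ps(1.0f, 0.0f, 0.0f, 0.0f)); // w = 1.0; lanes 2-3 were +0.0
}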

View File

@@ -57,23 +57,18 @@ void TraceContextLoadI64(void* raw_context, uint64_t offset, uint64_t value) {
auto thread_state = *((ThreadState**)raw_context);
DPRINT("%lld (%llX) = ctx i64 +%d\n", (int64_t)value, value, offset);
}
void TraceContextLoadF32(void* raw_context, uint64_t offset, float value) {
void TraceContextLoadF32(void* raw_context, uint64_t offset, __m128 value) {
auto thread_state = *((ThreadState**)raw_context);
union {
float f;
uint32_t u;
} x;
x.f = value;
DPRINT("%e (%X) = ctx f32 +%d\n", x.f, x.u, offset);
DPRINT("%e (%X) = ctx f32 +%d\n", value.m128_f32[0], value.m128_i32[0], offset);
}
void TraceContextLoadF64(void* raw_context, uint64_t offset, double value) {
void TraceContextLoadF64(void* raw_context, uint64_t offset, __m128 value) {
auto thread_state = *((ThreadState**)raw_context);
union {
double f;
uint64_t u;
} x;
x.f = value;
DPRINT("%lle (%llX) = ctx f64 +%d\n", x.f, x.u, offset);
double d;
uint64_t x;
} f;
f.x = value.m128_i64[0];
DPRINT("%lle (%llX) = ctx f64 +%d\n", f.d, value.m128_i64[0], offset);
}
void TraceContextLoadV128(void* raw_context, uint64_t offset, __m128 value) {
auto thread_state = *((ThreadState**)raw_context);
@@ -99,23 +94,18 @@ void TraceContextStoreI64(void* raw_context, uint64_t offset, uint64_t value) {
auto thread_state = *((ThreadState**)raw_context);
DPRINT("ctx i64 +%d = %lld (%llX)\n", offset, (int64_t)value, value);
}
void TraceContextStoreF32(void* raw_context, uint64_t offset, float value) {
void TraceContextStoreF32(void* raw_context, uint64_t offset, __m128 value) {
auto thread_state = *((ThreadState**)raw_context);
union {
float f;
uint32_t u;
} x;
x.f = value;
DPRINT("ctx f32 +%d = %e (%.X)\n", offset, x.f, x.u);
DPRINT("ctx f32 +%d = %e (%X)\n", offset, value.m128_i32[0], value.m128_f32[0]);
}
void TraceContextStoreF64(void* raw_context, uint64_t offset, double value) {
void TraceContextStoreF64(void* raw_context, uint64_t offset, __m128 value) {
auto thread_state = *((ThreadState**)raw_context);
union {
double f;
uint64_t u;
} x;
x.f = value;
DPRINT("ctx f64 +%d = %lle (%.llX)\n", offset, x.f, x.u);
double d;
uint64_t x;
} f;
f.x = value.m128_i64[0];
DPRINT("ctx f64 +%d = %lle (%llX)\n", offset, value.m128_i64[0], f.d);
}
void TraceContextStoreV128(void* raw_context, uint64_t offset, __m128 value) {
auto thread_state = *((ThreadState**)raw_context);
@@ -140,23 +130,18 @@ void TraceMemoryLoadI64(void* raw_context, uint64_t address, uint64_t value) {
auto thread_state = *((ThreadState**)raw_context);
DPRINT("%lld (%llX) = load.i64 %.8X\n", (int64_t)value, value, address);
}
void TraceMemoryLoadF32(void* raw_context, uint64_t address, float value) {
void TraceMemoryLoadF32(void* raw_context, uint64_t address, __m128 value) {
auto thread_state = *((ThreadState**)raw_context);
union {
float f;
uint32_t u;
} x;
x.f = value;
DPRINT("%e (%X) = load.f32 %.8X\n", x.f, x.u, address);
DPRINT("%e (%X) = load.f32 %.8X\n", value.m128_f32[0], value.m128_i32[0], address);
}
void TraceMemoryLoadF64(void* raw_context, uint64_t address, double value) {
void TraceMemoryLoadF64(void* raw_context, uint64_t address, __m128 value) {
auto thread_state = *((ThreadState**)raw_context);
union {
double f;
uint64_t u;
} x;
x.f = value;
DPRINT("%lle (%llX) = load.f64 %.8X\n", x.f, x.u, address);
double d;
uint64_t x;
} f;
f.x = value.m128_i64[0];
DPRINT("%lle (%llX) = load.f64 %.8X\n", f.d, value.m128_i64[0], address);
}
void TraceMemoryLoadV128(void* raw_context, uint64_t address, __m128 value) {
auto thread_state = *((ThreadState**)raw_context);
@@ -182,23 +167,18 @@ void TraceMemoryStoreI64(void* raw_context, uint64_t address, uint64_t value) {
auto thread_state = *((ThreadState**)raw_context);
DPRINT("store.i64 %.8X = %lld (%llX)\n", address, (int64_t)value, value);
}
void TraceMemoryStoreF32(void* raw_context, uint64_t address, float value) {
void TraceMemoryStoreF32(void* raw_context, uint64_t address, __m128 value) {
auto thread_state = *((ThreadState**)raw_context);
union {
float f;
uint32_t u;
} x;
x.f = value;
DPRINT("store.f32 %.8X = %e (%X)\n", address, x.f, x.u);
DPRINT("store.f32 %.8X = %e (%X)\n", address, value.m128_f32[0], value.m128_i32[0]);
}
void TraceMemoryStoreF64(void* raw_context, uint64_t address, double value) {
void TraceMemoryStoreF64(void* raw_context, uint64_t address, __m128 value) {
auto thread_state = *((ThreadState**)raw_context);
union {
double f;
uint64_t u;
} x;
x.f = value;
DPRINT("store.f64 %.8X = %lle (%llX)\n", address, x.f, x.u);
double d;
uint64_t x;
} f;
f.x = value.m128_i64[0];
DPRINT("store.f64 %.8X = %lle (%llX)\n", address, f.d, value.m128_i64[0]);
}
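Each f64 tracer prints both the double and its raw bit pattern; with the value now arriving as an __m128, lane 0's bits are copied into a union and re-read as a double (type punning that MSVC permits). The idiom in standalone form (a sketch):

#include <cstdint>
#include <cstdio>
void print_f64_bits(uint64_t lane_bits) {
  union { double d; uint64_t x; } f;
  f.x = lane_bits;                                      // write the raw lane bits
  printf("%e (%llX)\n", f.d, (unsigned long long)f.x);  // double + hex bits
}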
void TraceMemoryStoreV128(void* raw_context, uint64_t address, __m128 value) {
auto thread_state = *((ThreadState**)raw_context);

View File

@@ -25,32 +25,32 @@ void TraceContextLoadI8(void* raw_context, uint64_t offset, uint8_t value);
void TraceContextLoadI16(void* raw_context, uint64_t offset, uint16_t value);
void TraceContextLoadI32(void* raw_context, uint64_t offset, uint32_t value);
void TraceContextLoadI64(void* raw_context, uint64_t offset, uint64_t value);
void TraceContextLoadF32(void* raw_context, uint64_t offset, float value);
void TraceContextLoadF64(void* raw_context, uint64_t offset, double value);
void TraceContextLoadF32(void* raw_context, uint64_t offset, __m128 value);
void TraceContextLoadF64(void* raw_context, uint64_t offset, __m128 value);
void TraceContextLoadV128(void* raw_context, uint64_t offset, __m128 value);
void TraceContextStoreI8(void* raw_context, uint64_t offset, uint8_t value);
void TraceContextStoreI16(void* raw_context, uint64_t offset, uint16_t value);
void TraceContextStoreI32(void* raw_context, uint64_t offset, uint32_t value);
void TraceContextStoreI64(void* raw_context, uint64_t offset, uint64_t value);
void TraceContextStoreF32(void* raw_context, uint64_t offset, float value);
void TraceContextStoreF64(void* raw_context, uint64_t offset, double value);
void TraceContextStoreF32(void* raw_context, uint64_t offset, __m128 value);
void TraceContextStoreF64(void* raw_context, uint64_t offset, __m128 value);
void TraceContextStoreV128(void* raw_context, uint64_t offset, __m128 value);
void TraceMemoryLoadI8(void* raw_context, uint64_t address, uint8_t value);
void TraceMemoryLoadI16(void* raw_context, uint64_t address, uint16_t value);
void TraceMemoryLoadI32(void* raw_context, uint64_t address, uint32_t value);
void TraceMemoryLoadI64(void* raw_context, uint64_t address, uint64_t value);
void TraceMemoryLoadF32(void* raw_context, uint64_t address, float value);
void TraceMemoryLoadF64(void* raw_context, uint64_t address, double value);
void TraceMemoryLoadF32(void* raw_context, uint64_t address, __m128 value);
void TraceMemoryLoadF64(void* raw_context, uint64_t address, __m128 value);
void TraceMemoryLoadV128(void* raw_context, uint64_t address, __m128 value);
void TraceMemoryStoreI8(void* raw_context, uint64_t address, uint8_t value);
void TraceMemoryStoreI16(void* raw_context, uint64_t address, uint16_t value);
void TraceMemoryStoreI32(void* raw_context, uint64_t address, uint32_t value);
void TraceMemoryStoreI64(void* raw_context, uint64_t address, uint64_t value);
void TraceMemoryStoreF32(void* raw_context, uint64_t address, float value);
void TraceMemoryStoreF64(void* raw_context, uint64_t address, double value);
void TraceMemoryStoreF32(void* raw_context, uint64_t address, __m128 value);
void TraceMemoryStoreF64(void* raw_context, uint64_t address, __m128 value);
void TraceMemoryStoreV128(void* raw_context, uint64_t address, __m128 value);
} // namespace lowering

View File

@@ -116,7 +116,9 @@ int X64Emitter::Emit(HIRBuilder* builder) {
GetRegBit(r11) |
GetRegBit(xmm1) |
GetRegBit(xmm2) |
GetRegBit(xmm3);
GetRegBit(xmm3) |
GetRegBit(xmm4) |
GetRegBit(xmm5);
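xmm4 and xmm5 join the reserved set, presumably so lowering sequences gain two more scratch XMM registers beyond xmm1-xmm3; a reserved bit simply tells the register allocator never to hand that register out. A toy sketch of the bookkeeping (bit indices hypothetical):

#include <cstdint>
uint32_t reserved_mask = 0;
void reserve(int reg_bit)        { reserved_mask |= 1u << reg_bit; }
bool is_allocatable(int reg_bit) { return ((reserved_mask >> reg_bit) & 1) == 0; }
// reserve(kXmm4Bit); reserve(kXmm5Bit);  // kXmm*Bit: hypothetical indices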
// Function prolog.
// Must be 16b aligned.