From f74aafeb8a010297397ac89a9e63f99645f5db26 Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Fri, 29 Aug 2014 20:39:26 -0700 Subject: [PATCH] Swapping around vec128 to match AVX order. Was really hoping all this would fix some bugs, but no luck :( --- src/alloy/backend/ivm/ivm_intcode.cc | 693 +++++++++-------- src/alloy/backend/x64/x64_emitter.cc | 40 +- src/alloy/backend/x64/x64_emitter.h | 11 +- src/alloy/backend/x64/x64_sequences.cc | 760 +++++++++---------- src/alloy/frontend/ppc/ppc_emit_altivec.cc | 58 +- src/alloy/frontend/ppc/ppc_emit_alu.cc | 7 +- src/alloy/hir/opcodes.h | 7 +- src/alloy/hir/value.cc | 2 +- src/alloy/vec128.h | 152 ++-- src/poly/math.h | 5 + tools/alloy-sandbox/alloy-sandbox.gypi | 1 + tools/alloy-test/alloy-test.gypi | 29 +- tools/alloy-test/test_extract.cc | 141 ++++ tools/alloy-test/test_insert.cc | 83 ++ tools/alloy-test/test_load_vector_shl_shr.cc | 78 ++ tools/alloy-test/test_pack.cc | 111 +++ tools/alloy-test/test_permute.cc | 139 ++++ tools/alloy-test/test_sha.cc | 211 +++++ tools/alloy-test/test_shl.cc | 211 +++++ tools/alloy-test/test_shr.cc | 211 +++++ tools/alloy-test/test_swizzle.cc | 46 ++ tools/alloy-test/test_unpack.cc | 162 ++++ tools/alloy-test/test_vector_rotate_left.cc | 71 ++ tools/alloy-test/test_vector_sha.cc | 145 ++++ tools/alloy-test/test_vector_shl.cc | 145 ++++ tools/alloy-test/test_vector_shr.cc | 145 ++++ tools/alloy-test/util.h | 2 +- 27 files changed, 2845 insertions(+), 821 deletions(-) create mode 100644 tools/alloy-test/test_extract.cc create mode 100644 tools/alloy-test/test_insert.cc create mode 100644 tools/alloy-test/test_load_vector_shl_shr.cc create mode 100644 tools/alloy-test/test_pack.cc create mode 100644 tools/alloy-test/test_permute.cc create mode 100644 tools/alloy-test/test_sha.cc create mode 100644 tools/alloy-test/test_shl.cc create mode 100644 tools/alloy-test/test_shr.cc create mode 100644 tools/alloy-test/test_swizzle.cc create mode 100644 tools/alloy-test/test_unpack.cc create mode 100644 tools/alloy-test/test_vector_rotate_left.cc create mode 100644 tools/alloy-test/test_vector_sha.cc create mode 100644 tools/alloy-test/test_vector_shl.cc create mode 100644 tools/alloy-test/test_vector_shr.cc diff --git a/src/alloy/backend/ivm/ivm_intcode.cc b/src/alloy/backend/ivm/ivm_intcode.cc index 10dcea2de..141d6790f 100644 --- a/src/alloy/backend/ivm/ivm_intcode.cc +++ b/src/alloy/backend/ivm/ivm_intcode.cc @@ -44,29 +44,13 @@ using alloy::runtime::FunctionInfo; #define DPRINT(...) (void()) #define DFLUSH() (void()) -//#define IPRINT if (ics.thread_state->thread_id() == 1) printf +//#define IPRINT \ +// if (ics.thread_state->thread_id() == 1) printf //#define IFLUSH() fflush(stdout) -//#define DPRINT if (ics.thread_state->thread_id() == 1) printf +//#define DPRINT \ +// if (ics.thread_state->thread_id() == 1) printf //#define DFLUSH() fflush(stdout) -#if XE_CPU_BIGENDIAN -#define VECB16(v, n) (v.b16[n]) -#define VECS8(v, n) (v.s8[n]) -#define VECI4(v, n) (v.i4[n]) -#define VECF4(v, n) (v.f4[n]) -#else -static const uint8_t __vector_b16_table[16] = { - 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12, -}; -static const uint8_t __vector_s8_table[8] = { - 1, 0, 3, 2, 5, 4, 7, 6, -}; -#define VECB16(v, n) (v.b16[__vector_b16_table[(n)]]) -#define VECS8(v, n) (v.s8[__vector_s8_table[(n)]]) -#define VECI4(v, n) (v.i4[(n)]) -#define VECF4(v, n) (v.f4[(n)]) -#endif - uint32_t IntCode_INT_LOAD_CONSTANT(IntCodeState& ics, const IntCode* i) { // TODO(benvanik): optimize on type to avoid 16b copy per load. 
ics.rf[i->dest_reg].v128 = i->constant.v128; @@ -1011,7 +995,7 @@ uint32_t IntCode_ROUND_V128_ZERO(IntCodeState& ics, const IntCode* i) { const vec128_t& src1 = ics.rf[i->src1_reg].v128; vec128_t& dest = ics.rf[i->dest_reg].v128; for (size_t n = 0; n < 4; n++) { - dest.f4[n] = truncf(src1.f4[n]); + dest.f32[n] = truncf(src1.f32[n]); } return IA_NEXT; } @@ -1019,7 +1003,7 @@ uint32_t IntCode_ROUND_V128_NEAREST(IntCodeState& ics, const IntCode* i) { const vec128_t& src1 = ics.rf[i->src1_reg].v128; vec128_t& dest = ics.rf[i->dest_reg].v128; for (size_t n = 0; n < 4; n++) { - dest.f4[n] = roundf(src1.f4[n]); + dest.f32[n] = roundf(src1.f32[n]); } return IA_NEXT; } @@ -1027,8 +1011,8 @@ uint32_t IntCode_ROUND_V128_MINUS_INFINITY(IntCodeState& ics, const IntCode* i) { const vec128_t& src1 = ics.rf[i->src1_reg].v128; vec128_t& dest = ics.rf[i->dest_reg].v128; - for (size_t n = 0; n < 4; n++) { - dest.f4[n] = floorf(src1.f4[n]); + for (int n = 0; n < 4; ++n) { + dest.f32[n] = floorf(src1.f32[n]); } return IA_NEXT; } @@ -1036,8 +1020,8 @@ uint32_t IntCode_ROUND_V128_POSITIVE_INFINTIY(IntCodeState& ics, const IntCode* i) { const vec128_t& src1 = ics.rf[i->src1_reg].v128; vec128_t& dest = ics.rf[i->dest_reg].v128; - for (size_t n = 0; n < 4; n++) { - dest.f4[n] = ceilf(src1.f4[n]); + for (int n = 0; n < 4; ++n) { + dest.f32[n] = ceilf(src1.f32[n]); } return IA_NEXT; } @@ -1061,19 +1045,17 @@ int Translate_ROUND(TranslationContext& ctx, Instr* i) { uint32_t IntCode_VECTOR_CONVERT_I2F_S(IntCodeState& ics, const IntCode* i) { const vec128_t& src1 = ics.rf[i->src1_reg].v128; vec128_t& dest = ics.rf[i->dest_reg].v128; - VECF4(dest, 0) = (float)(int32_t)VECI4(src1, 0); - VECF4(dest, 1) = (float)(int32_t)VECI4(src1, 1); - VECF4(dest, 2) = (float)(int32_t)VECI4(src1, 2); - VECF4(dest, 3) = (float)(int32_t)VECI4(src1, 3); + for (int n = 0; n < 4; ++n) { + dest.f32[n] = (float)(int32_t)src1.u32[n]; + } return IA_NEXT; } uint32_t IntCode_VECTOR_CONVERT_I2F_U(IntCodeState& ics, const IntCode* i) { const vec128_t& src1 = ics.rf[i->src1_reg].v128; vec128_t& dest = ics.rf[i->dest_reg].v128; - VECF4(dest, 0) = (float)(uint32_t)VECI4(src1, 0); - VECF4(dest, 1) = (float)(uint32_t)VECI4(src1, 1); - VECF4(dest, 2) = (float)(uint32_t)VECI4(src1, 2); - VECF4(dest, 3) = (float)(uint32_t)VECI4(src1, 3); + for (int n = 0; n < 4; ++n) { + dest.f32[n] = (float)(uint32_t)src1.u32[n]; + } return IA_NEXT; } int Translate_VECTOR_CONVERT_I2F(TranslationContext& ctx, Instr* i) { @@ -1088,15 +1070,13 @@ uint32_t IntCode_VECTOR_CONVERT_F2I(IntCodeState& ics, const IntCode* i) { const vec128_t& src1 = ics.rf[i->src1_reg].v128; vec128_t& dest = ics.rf[i->dest_reg].v128; if (i->flags & ARITHMETIC_UNSIGNED) { - VECI4(dest, 0) = (uint32_t)VECF4(src1, 0); - VECI4(dest, 1) = (uint32_t)VECF4(src1, 1); - VECI4(dest, 2) = (uint32_t)VECF4(src1, 2); - VECI4(dest, 3) = (uint32_t)VECF4(src1, 3); + for (int n = 0; n < 4; ++n) { + dest.u32[n] = (uint32_t)src1.f32[n]; + } } else { - VECI4(dest, 0) = (int32_t)VECF4(src1, 0); - VECI4(dest, 1) = (int32_t)VECF4(src1, 1); - VECI4(dest, 2) = (int32_t)VECF4(src1, 2); - VECI4(dest, 3) = (int32_t)VECF4(src1, 3); + for (int n = 0; n < 4; ++n) { + dest.u32[n] = (int32_t)src1.f32[n]; + } } return IA_NEXT; } @@ -1105,28 +1085,28 @@ uint32_t IntCode_VECTOR_CONVERT_F2I_SAT(IntCodeState& ics, const IntCode* i) { vec128_t& dest = ics.rf[i->dest_reg].v128; if (i->flags & ARITHMETIC_UNSIGNED) { for (int n = 0; n < 4; n++) { - float src = src1.f4[n]; + float src = src1.f32[n]; if (src < 0) { - VECI4(dest, n) = 0; + 
dest.u32[n] = 0; ics.did_saturate = 1; } else if (src > UINT_MAX) { - VECI4(dest, n) = UINT_MAX; + dest.u32[n] = UINT_MAX; ics.did_saturate = 1; } else { - VECI4(dest, n) = (uint32_t)src; + dest.u32[n] = (uint32_t)src; } } } else { for (int n = 0; n < 4; n++) { - float src = src1.f4[n]; + float src = src1.f32[n]; if (src < INT_MIN) { - VECI4(dest, n) = INT_MIN; + dest.u32[n] = INT_MIN; ics.did_saturate = 1; } else if (src > INT_MAX) { - VECI4(dest, n) = INT_MAX; + dest.u32[n] = INT_MAX; ics.did_saturate = 1; } else { - VECI4(dest, n) = (int32_t)src; + dest.u32[n] = (int32_t)src; } } } @@ -1140,51 +1120,46 @@ int Translate_VECTOR_CONVERT_F2I(TranslationContext& ctx, Instr* i) { } } -static uint8_t __lvsl_table[17][16] = { - {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, - {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}, - {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}, - {3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18}, - {4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19}, - {5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20}, - {6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21}, - {7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22}, - {8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}, - {9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24}, - {10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25}, - {11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26}, - {12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27}, - {13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28}, - {14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29}, - {15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30}, - {16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}, +static const vec128_t lvsl_table[16] = { + vec128b(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15), + vec128b(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16), + vec128b(2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17), + vec128b(3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18), + vec128b(4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19), + vec128b(5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20), + vec128b(6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21), + vec128b(7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22), + vec128b(8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23), + vec128b(9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24), + vec128b(10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25), + vec128b(11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26), + vec128b(12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27), + vec128b(13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28), + vec128b(14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29), + vec128b(15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30), }; -static uint8_t __lvsr_table[17][16] = { - {16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}, - {15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30}, - {14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29}, - {13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28}, - {12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27}, - {11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26}, - {10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25}, - {9, 10, 11, 
12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24}, - {8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23}, - {7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22}, - {6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21}, - {5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20}, - {4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19}, - {3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18}, - {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}, - {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}, - {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, +static const vec128_t lvsr_table[16] = { + vec128b(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31), + vec128b(15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30), + vec128b(14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29), + vec128b(13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28), + vec128b(12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27), + vec128b(11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26), + vec128b(10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25), + vec128b(9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24), + vec128b(8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23), + vec128b(7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22), + vec128b(6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21), + vec128b(5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20), + vec128b(4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19), + vec128b(3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18), + vec128b(2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17), + vec128b(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16), }; uint32_t IntCode_LOAD_VECTOR_SHL(IntCodeState& ics, const IntCode* i) { - int8_t sh = std::min(16, ics.rf[i->src1_reg].i8); - vec128_t& dest = ics.rf[i->dest_reg].v128; - for (int n = 0; n < 16; n++) { - VECB16(dest, n) = __lvsl_table[sh][n]; - } + int8_t sh = ics.rf[i->src1_reg].i8 & 0xF; + ics.rf[i->dest_reg].v128 = lvsl_table[sh]; return IA_NEXT; } int Translate_LOAD_VECTOR_SHL(TranslationContext& ctx, Instr* i) { @@ -1192,11 +1167,8 @@ int Translate_LOAD_VECTOR_SHL(TranslationContext& ctx, Instr* i) { } uint32_t IntCode_LOAD_VECTOR_SHR(IntCodeState& ics, const IntCode* i) { - int8_t sh = std::min(16, ics.rf[i->src1_reg].i8); - vec128_t& dest = ics.rf[i->dest_reg].v128; - for (int n = 0; n < 16; n++) { - VECB16(dest, n) = __lvsr_table[sh][n]; - } + int8_t sh = ics.rf[i->src1_reg].i8 & 0xF; + ics.rf[i->dest_reg].v128 = lvsr_table[sh]; return IA_NEXT; } int Translate_LOAD_VECTOR_SHR(TranslationContext& ctx, Instr* i) { @@ -1331,10 +1303,10 @@ uint32_t IntCode_LOAD_CONTEXT_V128(IntCodeState& ics, const IntCode* i) { ics.rf[i->dest_reg].v128 = *((vec128_t*)(ics.context + ics.rf[i->src1_reg].u64)); DPRINT("[%e, %e, %e, %e] [%.8X, %.8X, %.8X, %.8X] = ctx v128 +%d\n", - VECF4(ics.rf[i->dest_reg].v128, 0), VECF4(ics.rf[i->dest_reg].v128, 1), - VECF4(ics.rf[i->dest_reg].v128, 2), VECF4(ics.rf[i->dest_reg].v128, 3), - VECI4(ics.rf[i->dest_reg].v128, 0), VECI4(ics.rf[i->dest_reg].v128, 1), - VECI4(ics.rf[i->dest_reg].v128, 2), VECI4(ics.rf[i->dest_reg].v128, 3), + ics.rf[i->dest_reg].v128.x, ics.rf[i->dest_reg].v128.y, + ics.rf[i->dest_reg].v128.z, ics.rf[i->dest_reg].v128.w, + ics.rf[i->dest_reg].v128.ux, ics.rf[i->dest_reg].v128.uy, + ics.rf[i->dest_reg].v128.uz, ics.rf[i->dest_reg].v128.uw, ics.rf[i->src1_reg].u64); 
return IA_NEXT; } @@ -1391,11 +1363,11 @@ uint32_t IntCode_STORE_CONTEXT_V128(IntCodeState& ics, const IntCode* i) { *((vec128_t*)(ics.context + ics.rf[i->src1_reg].u64)) = ics.rf[i->src2_reg].v128; DPRINT("ctx v128 +%d = [%e, %e, %e, %e] [%.8X, %.8X, %.8X, %.8X]\n", - ics.rf[i->src1_reg].u64, VECF4(ics.rf[i->src2_reg].v128, 0), - VECF4(ics.rf[i->src2_reg].v128, 1), VECF4(ics.rf[i->src2_reg].v128, 2), - VECF4(ics.rf[i->src2_reg].v128, 3), VECI4(ics.rf[i->src2_reg].v128, 0), - VECI4(ics.rf[i->src2_reg].v128, 1), VECI4(ics.rf[i->src2_reg].v128, 2), - VECI4(ics.rf[i->src2_reg].v128, 3)); + ics.rf[i->src1_reg].u64, ics.rf[i->src2_reg].v128.x, + ics.rf[i->src2_reg].v128.y, ics.rf[i->src2_reg].v128.z, + ics.rf[i->src2_reg].v128.w, ics.rf[i->src2_reg].v128.ux, + ics.rf[i->src2_reg].v128.uy, ics.rf[i->src2_reg].v128.uz, + ics.rf[i->src2_reg].v128.uw); return IA_NEXT; } int Translate_STORE_CONTEXT(TranslationContext& ctx, Instr* i) { @@ -1480,12 +1452,10 @@ uint32_t IntCode_LOAD_V128(IntCodeState& ics, const IntCode* i) { uint32_t address = ics.rf[i->src1_reg].u32; vec128_t& dest = ics.rf[i->dest_reg].v128; for (int n = 0; n < 4; n++) { - VECI4(dest, n) = *((uint32_t*)(ics.membase + address + n * 4)); + dest.u32[n] = *((uint32_t*)(ics.membase + address + n * 4)); } - DPRINT("[%e, %e, %e, %e] [%.8X, %.8X, %.8X, %.8X] = load.v128 %.8X\n", - VECF4(dest, 0), VECF4(dest, 1), VECF4(dest, 2), VECF4(dest, 3), - VECI4(dest, 0), VECI4(dest, 1), VECI4(dest, 2), VECI4(dest, 3), - address); + DPRINT("[%e, %e, %e, %e] [%.8X, %.8X, %.8X, %.8X] = load.v128 %.8X\n", dest.x, + dest.y, dest.z, dest.w, dest.ux, dest.uy, dest.uz, dest.uw, address); DFLUSH(); return IA_NEXT; } @@ -1579,11 +1549,10 @@ uint32_t IntCode_STORE_F64(IntCodeState& ics, const IntCode* i) { uint32_t IntCode_STORE_V128(IntCodeState& ics, const IntCode* i) { uint32_t address = ics.rf[i->src1_reg].u32; DPRINT("store.v128 %.8X = [%e, %e, %e, %e] [%.8X, %.8X, %.8X, %.8X]\n", - address, VECF4(ics.rf[i->src2_reg].v128, 0), - VECF4(ics.rf[i->src2_reg].v128, 1), VECF4(ics.rf[i->src2_reg].v128, 2), - VECF4(ics.rf[i->src2_reg].v128, 3), VECI4(ics.rf[i->src2_reg].v128, 0), - VECI4(ics.rf[i->src2_reg].v128, 1), VECI4(ics.rf[i->src2_reg].v128, 2), - VECI4(ics.rf[i->src2_reg].v128, 3)); + address, ics.rf[i->src2_reg].v128.x, ics.rf[i->src2_reg].v128.y, + ics.rf[i->src2_reg].v128.z, ics.rf[i->src2_reg].v128.w, + ics.rf[i->src2_reg].v128.ux, ics.rf[i->src2_reg].v128.uy, + ics.rf[i->src2_reg].v128.uz, ics.rf[i->src2_reg].v128.uw); DFLUSH(); *((vec128_t*)(ics.membase + address)) = ics.rf[i->src2_reg].v128; MarkPageDirty(ics, address); @@ -1644,7 +1613,7 @@ uint32_t IntCode_MAX_V128_V128(IntCodeState& ics, const IntCode* i) { const vec128_t& src2 = ics.rf[i->src2_reg].v128; vec128_t& dest = ics.rf[i->dest_reg].v128; for (int n = 0; n < 4; n++) { - dest.f4[n] = std::max(src1.f4[n], src2.f4[n]); + dest.f32[n] = std::max(src1.f32[n], src2.f32[n]); } return IA_NEXT; } @@ -1662,7 +1631,7 @@ uint32_t IntCode_VECTOR_MAX_I8_UNSIGNED(IntCodeState& ics, const IntCode* i) { const vec128_t& src2 = ics.rf[i->src2_reg].v128; vec128_t& dest = ics.rf[i->dest_reg].v128; for (int n = 0; n < 16; n++) { - dest.b16[n] = std::max(src1.b16[n], src2.b16[n]); + dest.u8[n] = std::max(src1.u8[n], src2.u8[n]); } return IA_NEXT; } @@ -1671,7 +1640,7 @@ uint32_t IntCode_VECTOR_MAX_I16_UNSIGNED(IntCodeState& ics, const IntCode* i) { const vec128_t& src2 = ics.rf[i->src2_reg].v128; vec128_t& dest = ics.rf[i->dest_reg].v128; for (int n = 0; n < 8; n++) { - dest.s8[n] = std::max(src1.s8[n], 
src2.s8[n]); + dest.u16[n] = std::max(src1.u16[n], src2.u16[n]); } return IA_NEXT; } @@ -1680,7 +1649,7 @@ uint32_t IntCode_VECTOR_MAX_I32_UNSIGNED(IntCodeState& ics, const IntCode* i) { const vec128_t& src2 = ics.rf[i->src2_reg].v128; vec128_t& dest = ics.rf[i->dest_reg].v128; for (int n = 0; n < 4; n++) { - dest.i4[n] = std::max(src1.i4[n], src2.i4[n]); + dest.u32[n] = std::max(src1.u32[n], src2.u32[n]); } return IA_NEXT; } @@ -1689,7 +1658,7 @@ uint32_t IntCode_VECTOR_MAX_I8_SIGNED(IntCodeState& ics, const IntCode* i) { const vec128_t& src2 = ics.rf[i->src2_reg].v128; vec128_t& dest = ics.rf[i->dest_reg].v128; for (int n = 0; n < 16; n++) { - dest.b16[n] = std::max((int8_t)src1.b16[n], (int8_t)src2.b16[n]); + dest.u8[n] = std::max((int8_t)src1.u8[n], (int8_t)src2.u8[n]); } return IA_NEXT; } @@ -1698,7 +1667,7 @@ uint32_t IntCode_VECTOR_MAX_I16_SIGNED(IntCodeState& ics, const IntCode* i) { const vec128_t& src2 = ics.rf[i->src2_reg].v128; vec128_t& dest = ics.rf[i->dest_reg].v128; for (int n = 0; n < 8; n++) { - dest.s8[n] = std::max((int16_t)src1.s8[n], (int16_t)src2.s8[n]); + dest.u16[n] = std::max((int16_t)src1.u16[n], (int16_t)src2.u16[n]); } return IA_NEXT; } @@ -1707,7 +1676,7 @@ uint32_t IntCode_VECTOR_MAX_I32_SIGNED(IntCodeState& ics, const IntCode* i) { const vec128_t& src2 = ics.rf[i->src2_reg].v128; vec128_t& dest = ics.rf[i->dest_reg].v128; for (int n = 0; n < 4; n++) { - dest.i4[n] = std::max((int32_t)src1.i4[n], (int32_t)src2.i4[n]); + dest.u32[n] = std::max((int32_t)src1.u32[n], (int32_t)src2.u32[n]); } return IA_NEXT; } @@ -1767,7 +1736,7 @@ uint32_t IntCode_MIN_V128_V128(IntCodeState& ics, const IntCode* i) { const vec128_t& src2 = ics.rf[i->src2_reg].v128; vec128_t& dest = ics.rf[i->dest_reg].v128; for (int n = 0; n < 4; n++) { - dest.f4[n] = std::min(src1.f4[n], src2.f4[n]); + dest.f32[n] = std::min(src1.f32[n], src2.f32[n]); } return IA_NEXT; } @@ -1785,7 +1754,7 @@ uint32_t IntCode_VECTOR_MIN_I8_UNSIGNED(IntCodeState& ics, const IntCode* i) { const vec128_t& src2 = ics.rf[i->src2_reg].v128; vec128_t& dest = ics.rf[i->dest_reg].v128; for (int n = 0; n < 16; n++) { - dest.b16[n] = std::min(src1.b16[n], src2.b16[n]); + dest.u8[n] = std::min(src1.u8[n], src2.u8[n]); } return IA_NEXT; } @@ -1794,7 +1763,7 @@ uint32_t IntCode_VECTOR_MIN_I16_UNSIGNED(IntCodeState& ics, const IntCode* i) { const vec128_t& src2 = ics.rf[i->src2_reg].v128; vec128_t& dest = ics.rf[i->dest_reg].v128; for (int n = 0; n < 8; n++) { - dest.s8[n] = std::min(src1.s8[n], src2.s8[n]); + dest.u16[n] = std::min(src1.u16[n], src2.u16[n]); } return IA_NEXT; } @@ -1803,7 +1772,7 @@ uint32_t IntCode_VECTOR_MIN_I32_UNSIGNED(IntCodeState& ics, const IntCode* i) { const vec128_t& src2 = ics.rf[i->src2_reg].v128; vec128_t& dest = ics.rf[i->dest_reg].v128; for (int n = 0; n < 4; n++) { - dest.i4[n] = std::min(src1.i4[n], src2.i4[n]); + dest.u32[n] = std::min(src1.u32[n], src2.u32[n]); } return IA_NEXT; } @@ -1812,7 +1781,7 @@ uint32_t IntCode_VECTOR_MIN_I8_SIGNED(IntCodeState& ics, const IntCode* i) { const vec128_t& src2 = ics.rf[i->src2_reg].v128; vec128_t& dest = ics.rf[i->dest_reg].v128; for (int n = 0; n < 16; n++) { - dest.b16[n] = std::min((int8_t)src1.b16[n], (int8_t)src2.b16[n]); + dest.u8[n] = std::min((int8_t)src1.u8[n], (int8_t)src2.u8[n]); } return IA_NEXT; } @@ -1821,7 +1790,7 @@ uint32_t IntCode_VECTOR_MIN_I16_SIGNED(IntCodeState& ics, const IntCode* i) { const vec128_t& src2 = ics.rf[i->src2_reg].v128; vec128_t& dest = ics.rf[i->dest_reg].v128; for (int n = 0; n < 8; n++) { - dest.s8[n] = 
std::min((int16_t)src1.s8[n], (int16_t)src2.s8[n]); + dest.u16[n] = std::min((int16_t)src1.u16[n], (int16_t)src2.u16[n]); } return IA_NEXT; } @@ -1830,7 +1799,7 @@ uint32_t IntCode_VECTOR_MIN_I32_SIGNED(IntCodeState& ics, const IntCode* i) { const vec128_t& src2 = ics.rf[i->src2_reg].v128; vec128_t& dest = ics.rf[i->dest_reg].v128; for (int n = 0; n < 4; n++) { - dest.i4[n] = std::min((int32_t)src1.i4[n], (int32_t)src2.i4[n]); + dest.u32[n] = std::min((int32_t)src1.u32[n], (int32_t)src2.u32[n]); } return IA_NEXT; } @@ -2337,14 +2306,14 @@ int Translate_DID_SATURATE(TranslationContext& ctx, Instr* i) { } \ return IA_NEXT; -uint32_t IntCode_VECTOR_COMPARE_EQ_I8(IntCodeState& ics, const IntCode* i) { - VECTOR_COMPARER(uint8_t, b16, b16, 16, == )}; -uint32_t IntCode_VECTOR_COMPARE_EQ_I16(IntCodeState& ics, const IntCode* i) { - VECTOR_COMPARER(uint16_t, s8, s8, 8, == )}; -uint32_t IntCode_VECTOR_COMPARE_EQ_I32(IntCodeState& ics, const IntCode* i) { - VECTOR_COMPARER(uint32_t, i4, i4, 4, == )}; -uint32_t IntCode_VECTOR_COMPARE_EQ_F32(IntCodeState& ics, const IntCode* i) { - VECTOR_COMPARER(float, f4, i4, 4, == )}; +uint32_t IntCode_VECTOR_COMPARE_EQ_I8(IntCodeState& ics, const IntCode* i){ + VECTOR_COMPARER(uint8_t, u8, u8, 16, == )}; +uint32_t IntCode_VECTOR_COMPARE_EQ_I16(IntCodeState& ics, const IntCode* i){ + VECTOR_COMPARER(uint16_t, u16, u16, 8, == )}; +uint32_t IntCode_VECTOR_COMPARE_EQ_I32(IntCodeState& ics, const IntCode* i){ + VECTOR_COMPARER(uint32_t, u32, u32, 4, == )}; +uint32_t IntCode_VECTOR_COMPARE_EQ_F32(IntCodeState& ics, const IntCode* i){ + VECTOR_COMPARER(float, f32, u32, 4, == )}; int Translate_VECTOR_COMPARE_EQ(TranslationContext& ctx, Instr* i) { static IntCodeFn fns[] = { IntCode_VECTOR_COMPARE_EQ_I8, IntCode_VECTOR_COMPARE_EQ_I16, @@ -2355,14 +2324,14 @@ int Translate_VECTOR_COMPARE_EQ(TranslationContext& ctx, Instr* i) { return DispatchToC(ctx, i, fns[i->flags]); } -uint32_t IntCode_VECTOR_COMPARE_SGT_I8(IntCodeState& ics, const IntCode* i) { - VECTOR_COMPARER(int8_t, b16, b16, 16, > )}; -uint32_t IntCode_VECTOR_COMPARE_SGT_I16(IntCodeState& ics, const IntCode* i) { - VECTOR_COMPARER(int16_t, s8, s8, 8, > )}; -uint32_t IntCode_VECTOR_COMPARE_SGT_I32(IntCodeState& ics, const IntCode* i) { - VECTOR_COMPARER(int32_t, i4, i4, 4, > )}; -uint32_t IntCode_VECTOR_COMPARE_SGT_F32(IntCodeState& ics, const IntCode* i) { - VECTOR_COMPARER(float, f4, i4, 4, > )}; +uint32_t IntCode_VECTOR_COMPARE_SGT_I8(IntCodeState& ics, const IntCode* i){ + VECTOR_COMPARER(int8_t, i8, i8, 16, > )}; +uint32_t IntCode_VECTOR_COMPARE_SGT_I16(IntCodeState& ics, const IntCode* i){ + VECTOR_COMPARER(int16_t, i16, i16, 8, > )}; +uint32_t IntCode_VECTOR_COMPARE_SGT_I32(IntCodeState& ics, const IntCode* i){ + VECTOR_COMPARER(int32_t, i32, i32, 4, > )}; +uint32_t IntCode_VECTOR_COMPARE_SGT_F32(IntCodeState& ics, const IntCode* i){ + VECTOR_COMPARER(float, f32, u32, 4, > )}; int Translate_VECTOR_COMPARE_SGT(TranslationContext& ctx, Instr* i) { static IntCodeFn fns[] = { IntCode_VECTOR_COMPARE_SGT_I8, IntCode_VECTOR_COMPARE_SGT_I16, @@ -2373,14 +2342,14 @@ int Translate_VECTOR_COMPARE_SGT(TranslationContext& ctx, Instr* i) { return DispatchToC(ctx, i, fns[i->flags]); } -uint32_t IntCode_VECTOR_COMPARE_SGE_I8(IntCodeState& ics, const IntCode* i) { - VECTOR_COMPARER(int8_t, b16, b16, 16, >= )}; -uint32_t IntCode_VECTOR_COMPARE_SGE_I16(IntCodeState& ics, const IntCode* i) { - VECTOR_COMPARER(int16_t, s8, s8, 8, >= )}; -uint32_t IntCode_VECTOR_COMPARE_SGE_I32(IntCodeState& ics, const IntCode* i) { - 
VECTOR_COMPARER(int32_t, i4, i4, 4, >= )}; -uint32_t IntCode_VECTOR_COMPARE_SGE_F32(IntCodeState& ics, const IntCode* i) { - VECTOR_COMPARER(float, f4, i4, 4, >= )}; +uint32_t IntCode_VECTOR_COMPARE_SGE_I8(IntCodeState& ics, const IntCode* i){ + VECTOR_COMPARER(int8_t, i8, i8, 16, >= )}; +uint32_t IntCode_VECTOR_COMPARE_SGE_I16(IntCodeState& ics, const IntCode* i){ + VECTOR_COMPARER(int16_t, i16, i16, 8, >= )}; +uint32_t IntCode_VECTOR_COMPARE_SGE_I32(IntCodeState& ics, const IntCode* i){ + VECTOR_COMPARER(int32_t, i32, i32, 4, >= )}; +uint32_t IntCode_VECTOR_COMPARE_SGE_F32(IntCodeState& ics, const IntCode* i){ + VECTOR_COMPARER(float, f32, u32, 4, >= )}; int Translate_VECTOR_COMPARE_SGE(TranslationContext& ctx, Instr* i) { static IntCodeFn fns[] = { IntCode_VECTOR_COMPARE_SGE_I8, IntCode_VECTOR_COMPARE_SGE_I16, @@ -2391,14 +2360,14 @@ int Translate_VECTOR_COMPARE_SGE(TranslationContext& ctx, Instr* i) { return DispatchToC(ctx, i, fns[i->flags]); } -uint32_t IntCode_VECTOR_COMPARE_UGT_I8(IntCodeState& ics, const IntCode* i) { - VECTOR_COMPARER(uint8_t, b16, b16, 16, > )}; -uint32_t IntCode_VECTOR_COMPARE_UGT_I16(IntCodeState& ics, const IntCode* i) { - VECTOR_COMPARER(uint16_t, s8, s8, 8, > )}; -uint32_t IntCode_VECTOR_COMPARE_UGT_I32(IntCodeState& ics, const IntCode* i) { - VECTOR_COMPARER(uint32_t, i4, i4, 4, > )}; -uint32_t IntCode_VECTOR_COMPARE_UGT_F32(IntCodeState& ics, const IntCode* i) { - VECTOR_COMPARER(float, f4, i4, 4, > )}; +uint32_t IntCode_VECTOR_COMPARE_UGT_I8(IntCodeState& ics, const IntCode* i){ + VECTOR_COMPARER(uint8_t, u8, u8, 16, > )}; +uint32_t IntCode_VECTOR_COMPARE_UGT_I16(IntCodeState& ics, const IntCode* i){ + VECTOR_COMPARER(uint16_t, u16, u16, 8, > )}; +uint32_t IntCode_VECTOR_COMPARE_UGT_I32(IntCodeState& ics, const IntCode* i){ + VECTOR_COMPARER(uint32_t, u32, u32, 4, > )}; +uint32_t IntCode_VECTOR_COMPARE_UGT_F32(IntCodeState& ics, const IntCode* i){ + VECTOR_COMPARER(float, f32, u32, 4, > )}; int Translate_VECTOR_COMPARE_UGT(TranslationContext& ctx, Instr* i) { static IntCodeFn fns[] = { IntCode_VECTOR_COMPARE_UGT_I8, IntCode_VECTOR_COMPARE_UGT_I16, @@ -2409,14 +2378,14 @@ int Translate_VECTOR_COMPARE_UGT(TranslationContext& ctx, Instr* i) { return DispatchToC(ctx, i, fns[i->flags]); } -uint32_t IntCode_VECTOR_COMPARE_UGE_I8(IntCodeState& ics, const IntCode* i) { - VECTOR_COMPARER(uint8_t, b16, b16, 16, >= )}; -uint32_t IntCode_VECTOR_COMPARE_UGE_I16(IntCodeState& ics, const IntCode* i) { - VECTOR_COMPARER(uint16_t, s8, s8, 8, >= )}; -uint32_t IntCode_VECTOR_COMPARE_UGE_I32(IntCodeState& ics, const IntCode* i) { - VECTOR_COMPARER(uint32_t, i4, i4, 4, >= )}; -uint32_t IntCode_VECTOR_COMPARE_UGE_F32(IntCodeState& ics, const IntCode* i) { - VECTOR_COMPARER(float, f4, i4, 4, >= )}; +uint32_t IntCode_VECTOR_COMPARE_UGE_I8(IntCodeState& ics, const IntCode* i){ + VECTOR_COMPARER(uint8_t, u8, u8, 16, >= )}; +uint32_t IntCode_VECTOR_COMPARE_UGE_I16(IntCodeState& ics, const IntCode* i){ + VECTOR_COMPARER(uint16_t, u16, u16, 8, >= )}; +uint32_t IntCode_VECTOR_COMPARE_UGE_I32(IntCodeState& ics, const IntCode* i){ + VECTOR_COMPARER(uint32_t, u32, u32, 4, >= )}; +uint32_t IntCode_VECTOR_COMPARE_UGE_F32(IntCodeState& ics, const IntCode* i){ + VECTOR_COMPARER(float, f32, u32, 4, >= )}; int Translate_VECTOR_COMPARE_UGE(TranslationContext& ctx, Instr* i) { static IntCodeFn fns[] = { IntCode_VECTOR_COMPARE_UGE_I8, IntCode_VECTOR_COMPARE_UGE_I16, @@ -2477,8 +2446,8 @@ uint32_t IntCode_ADD_F64_F64(IntCodeState& ics, const IntCode* i) { } int Translate_ADD(TranslationContext& 
ctx, Instr* i) { static IntCodeFn fns[] = { - IntCode_ADD_I8_I8, IntCode_ADD_I16_I16, IntCode_ADD_I32_I32, - IntCode_ADD_I64_I64, IntCode_ADD_F32_F32, IntCode_ADD_F64_F64, + IntCode_ADD_I8_I8, IntCode_ADD_I16_I16, IntCode_ADD_I32_I32, + IntCode_ADD_I64_I64, IntCode_ADD_F32_F32, IntCode_ADD_F64_F64, IntCode_INVALID_TYPE, }; return DispatchToC(ctx, i, fns[i->dest->type]); @@ -2556,31 +2525,31 @@ uint32_t Translate_VECTOR_ADD_I8(IntCodeState& ics, const IntCode* i) { if (arithmetic_flags & ARITHMETIC_SATURATE) { if (arithmetic_flags & ARITHMETIC_UNSIGNED) { for (int n = 0; n < 16; n++) { - uint16_t v = VECB16(src1, n) + VECB16(src2, n); + uint16_t v = src1.u8[n] + src2.u8[n]; if (v > 0xFF) { - VECB16(dest, n) = 0xFF; + dest.u8[n] = 0xFF; ics.did_saturate = 1; } else { - VECB16(dest, n) = (uint8_t)v; + dest.u8[n] = (uint8_t)v; } } } else { for (int n = 0; n < 16; n++) { - int16_t v = (int8_t)VECB16(src1, n) + (int8_t)VECB16(src2, n); + int16_t v = (int8_t)src1.u8[n] + (int8_t)src2.u8[n]; if (v > 0x7F) { - VECB16(dest, n) = 0x7F; + dest.u8[n] = 0x7F; ics.did_saturate = 1; } else if (v < -0x80) { - VECB16(dest, n) = -0x80; + dest.u8[n] = -0x80; ics.did_saturate = 1; } else { - VECB16(dest, n) = (uint8_t)v; + dest.u8[n] = (uint8_t)v; } } } } else { for (int n = 0; n < 16; n++) { - VECB16(dest, n) = VECB16(src1, n) + VECB16(src2, n); + dest.u8[n] = src1.u8[n] + src2.u8[n]; } } return IA_NEXT; @@ -2593,31 +2562,31 @@ uint32_t Translate_VECTOR_ADD_I16(IntCodeState& ics, const IntCode* i) { if (arithmetic_flags & ARITHMETIC_SATURATE) { if (arithmetic_flags & ARITHMETIC_UNSIGNED) { for (int n = 0; n < 8; n++) { - uint32_t v = VECS8(src1, n) + VECS8(src2, n); + uint32_t v = src1.u16[n] + src2.u16[n]; if (v > 0xFFFF) { - VECS8(dest, n) = 0xFFFF; + dest.u16[n] = 0xFFFF; ics.did_saturate = 1; } else { - VECS8(dest, n) = (uint16_t)v; + dest.u16[n] = (uint16_t)v; } } } else { for (int n = 0; n < 8; n++) { - int32_t v = (int16_t)VECS8(src1, n) + (int16_t)VECS8(src2, n); + int32_t v = (int16_t)src1.u16[n] + (int16_t)src2.u16[n]; if (v > 0x7FFF) { - VECS8(dest, n) = 0x7FFF; + dest.u16[n] = 0x7FFF; ics.did_saturate = 1; } else if (v < -0x8000) { - VECS8(dest, n) = -0x8000; + dest.u16[n] = -0x8000; ics.did_saturate = 1; } else { - VECS8(dest, n) = (uint16_t)v; + dest.u16[n] = (uint16_t)v; } } } } else { for (int n = 0; n < 8; n++) { - VECS8(dest, n) = VECS8(src1, n) + VECS8(src2, n); + dest.u16[n] = src1.u16[n] + src2.u16[n]; } } return IA_NEXT; @@ -2630,31 +2599,32 @@ uint32_t Translate_VECTOR_ADD_I32(IntCodeState& ics, const IntCode* i) { if (arithmetic_flags & ARITHMETIC_SATURATE) { if (arithmetic_flags & ARITHMETIC_UNSIGNED) { for (int n = 0; n < 4; n++) { - uint64_t v = (uint64_t)VECI4(src1, n) + (uint64_t)VECI4(src2, n); + uint64_t v = (uint64_t)src1.u32[n] + (uint64_t)src2.u32[n]; if (v > 0xFFFFFFFF) { - VECI4(dest, n) = 0xFFFFFFFF; + dest.u32[n] = 0xFFFFFFFF; ics.did_saturate = 1; } else { - VECI4(dest, n) = (uint32_t)v; + dest.u32[n] = (uint32_t)v; } } } else { for (int n = 0; n < 4; n++) { - int64_t v = (int64_t)(int32_t)VECI4(src1, n) + (int64_t)(int32_t)VECI4(src2, n); + int64_t v = + (int64_t)(int32_t)src1.u32[n] + (int64_t)(int32_t)src2.u32[n]; if (v > 0x7FFFFFFF) { - VECI4(dest, n) = 0x7FFFFFFF; + dest.u32[n] = 0x7FFFFFFF; ics.did_saturate = 1; } else if (v < -0x80000000ll) { - VECI4(dest, n) = 0x80000000; + dest.u32[n] = 0x80000000; ics.did_saturate = 1; } else { - VECI4(dest, n) = (uint32_t)v; + dest.u32[n] = (uint32_t)v; } } } } else { for (int n = 0; n < 4; n++) { - VECI4(dest, n) = 
VECI4(src1, n) + VECI4(src2, n); + dest.u32[n] = src1.u32[n] + src2.u32[n]; } } return IA_NEXT; @@ -2664,7 +2634,7 @@ uint32_t Translate_VECTOR_ADD_F32(IntCodeState& ics, const IntCode* i) { const vec128_t& src2 = ics.rf[i->src2_reg].v128; vec128_t& dest = ics.rf[i->dest_reg].v128; for (int n = 0; n < 4; n++) { - dest.f4[n] = src1.f4[n] + src2.f4[n]; + dest.f32[n] = src1.f32[n] + src2.f32[n]; } return IA_NEXT; } @@ -2728,27 +2698,80 @@ uint32_t IntCode_SUB_F64_F64(IntCodeState& ics, const IntCode* i) { } int Translate_SUB(TranslationContext& ctx, Instr* i) { static IntCodeFn fns[] = { - IntCode_SUB_I8_I8, IntCode_SUB_I16_I16, IntCode_SUB_I32_I32, - IntCode_SUB_I64_I64, IntCode_SUB_F32_F32, IntCode_SUB_F64_F64, + IntCode_SUB_I8_I8, IntCode_SUB_I16_I16, IntCode_SUB_I32_I32, + IntCode_SUB_I64_I64, IntCode_SUB_F32_F32, IntCode_SUB_F64_F64, IntCode_INVALID_TYPE, }; return DispatchToC(ctx, i, fns[i->dest->type]); } +uint32_t Translate_VECTOR_SUB_I8(IntCodeState& ics, const IntCode* i) { + const vec128_t& src1 = ics.rf[i->src1_reg].v128; + const vec128_t& src2 = ics.rf[i->src2_reg].v128; + vec128_t& dest = ics.rf[i->dest_reg].v128; + uint32_t flags = i->flags >> 8; + // assert_zero(flags & ARITHMETIC_SATURATE); + if (flags & ARITHMETIC_UNSIGNED) { + for (int n = 0; n < 16; n++) { + dest.u8[n] = src1.u8[n] - src2.u8[n]; + } + } else { + for (int n = 0; n < 16; n++) { + dest.i8[n] = src1.i8[n] - src2.i8[n]; + } + } + return IA_NEXT; +} +uint32_t Translate_VECTOR_SUB_I16(IntCodeState& ics, const IntCode* i) { + const vec128_t& src1 = ics.rf[i->src1_reg].v128; + const vec128_t& src2 = ics.rf[i->src2_reg].v128; + vec128_t& dest = ics.rf[i->dest_reg].v128; + uint32_t flags = i->flags >> 8; + // assert_zero(flags & ARITHMETIC_SATURATE); + if (flags & ARITHMETIC_UNSIGNED) { + for (int n = 0; n < 8; n++) { + dest.u16[n] = src1.u16[n] - src2.u16[n]; + } + } else { + for (int n = 0; n < 8; n++) { + dest.i16[n] = src1.i16[n] - src2.i16[n]; + } + } + return IA_NEXT; +} +uint32_t Translate_VECTOR_SUB_I32(IntCodeState& ics, const IntCode* i) { + const vec128_t& src1 = ics.rf[i->src1_reg].v128; + const vec128_t& src2 = ics.rf[i->src2_reg].v128; + vec128_t& dest = ics.rf[i->dest_reg].v128; + uint32_t flags = i->flags >> 8; + // assert_zero(flags & ARITHMETIC_SATURATE); + if (flags & ARITHMETIC_UNSIGNED) { + for (int n = 0; n < 4; n++) { + dest.u32[n] = src1.u32[n] - src2.u32[n]; + } + } else { + for (int n = 0; n < 4; n++) { + dest.i32[n] = src1.i32[n] - src2.i32[n]; + } + } + return IA_NEXT; +} uint32_t Translate_VECTOR_SUB_F32(IntCodeState& ics, const IntCode* i) { const vec128_t& src1 = ics.rf[i->src1_reg].v128; const vec128_t& src2 = ics.rf[i->src2_reg].v128; vec128_t& dest = ics.rf[i->dest_reg].v128; + uint32_t flags = i->flags >> 8; for (int n = 0; n < 4; n++) { - dest.f4[n] = src1.f4[n] - src2.f4[n]; + dest.f32[n] = src1.f32[n] - src2.f32[n]; } return IA_NEXT; } int Translate_VECTOR_SUB(TranslationContext& ctx, Instr* i) { TypeName part_type = (TypeName)(i->flags & 0xFF); static IntCodeFn fns[] = { - IntCode_INVALID_TYPE, IntCode_INVALID_TYPE, IntCode_INVALID_TYPE, - IntCode_INVALID_TYPE, Translate_VECTOR_SUB_F32, IntCode_INVALID_TYPE, + Translate_VECTOR_SUB_I8, Translate_VECTOR_SUB_I16, + Translate_VECTOR_SUB_I32, IntCode_INVALID_TYPE, + Translate_VECTOR_SUB_F32, IntCode_INVALID_TYPE, IntCode_INVALID_TYPE, }; return DispatchToC(ctx, i, fns[part_type]); @@ -2783,7 +2806,7 @@ uint32_t IntCode_MUL_V128_V128(IntCodeState& ics, const IntCode* i) { const vec128_t& src2 = ics.rf[i->src2_reg].v128; vec128_t& dest 
= ics.rf[i->dest_reg].v128; for (int n = 0; n < 4; n++) { - dest.f4[n] = src1.f4[n] * src2.f4[n]; + dest.f32[n] = src1.f32[n] * src2.f32[n]; } return IA_NEXT; } @@ -2993,7 +3016,7 @@ uint32_t IntCode_DIV_V128_V128(IntCodeState& ics, const IntCode* i) { const vec128_t& src2 = ics.rf[i->src2_reg].v128; vec128_t& dest = ics.rf[i->dest_reg].v128; for (int n = 0; n < 4; n++) { - dest.f4[n] = src1.f4[n] / src2.f4[n]; + dest.f32[n] = src1.f32[n] / src2.f32[n]; } return IA_NEXT; } @@ -3068,7 +3091,7 @@ uint32_t IntCode_MUL_ADD_V128(IntCodeState& ics, const IntCode* i) { const vec128_t& src3 = ics.rf[i->src3_reg].v128; vec128_t& dest = ics.rf[i->dest_reg].v128; for (int n = 0; n < 4; n++) { - dest.f4[n] = src1.f4[n] * src2.f4[n] + src3.f4[n]; + dest.f32[n] = src1.f32[n] * src2.f32[n] + src3.f32[n]; } return IA_NEXT; } @@ -3118,7 +3141,7 @@ uint32_t IntCode_MUL_SUB_V128(IntCodeState& ics, const IntCode* i) { const vec128_t& src3 = ics.rf[i->src3_reg].v128; vec128_t& dest = ics.rf[i->dest_reg].v128; for (int n = 0; n < 4; n++) { - dest.f4[n] = src1.f4[n] * src2.f4[n] - src3.f4[n]; + dest.f32[n] = src1.f32[n] * src2.f32[n] - src3.f32[n]; } return IA_NEXT; } @@ -3159,7 +3182,7 @@ uint32_t IntCode_NEG_V128(IntCodeState& ics, const IntCode* i) { const vec128_t& src1 = ics.rf[i->src1_reg].v128; vec128_t& dest = ics.rf[i->dest_reg].v128; for (size_t i = 0; i < 4; i++) { - dest.f4[i] = -src1.f4[i]; + dest.f32[i] = -src1.f32[i]; } return IA_NEXT; } @@ -3199,7 +3222,7 @@ uint32_t IntCode_ABS_V128(IntCodeState& ics, const IntCode* i) { const vec128_t& src1 = ics.rf[i->src1_reg].v128; vec128_t& dest = ics.rf[i->dest_reg].v128; for (size_t i = 0; i < 4; i++) { - dest.f4[i] = abs(src1.f4[i]); + dest.f32[i] = abs(src1.f32[i]); } return IA_NEXT; } @@ -3255,7 +3278,7 @@ uint32_t IntCode_SQRT_V128(IntCodeState& ics, const IntCode* i) { const vec128_t& src1 = ics.rf[i->src1_reg].v128; vec128_t& dest = ics.rf[i->dest_reg].v128; for (size_t i = 0; i < 4; i++) { - dest.f4[i] = sqrt(src1.f4[i]); + dest.f32[i] = sqrt(src1.f32[i]); } return IA_NEXT; } @@ -3272,7 +3295,7 @@ uint32_t IntCode_RSQRT_V128(IntCodeState& ics, const IntCode* i) { const vec128_t& src1 = ics.rf[i->src1_reg].v128; vec128_t& dest = ics.rf[i->dest_reg].v128; for (int n = 0; n < 4; n++) { - dest.f4[n] = 1 / sqrtf(src1.f4[n]); + dest.f32[n] = 1 / sqrtf(src1.f32[n]); } return IA_NEXT; } @@ -3297,7 +3320,7 @@ uint32_t IntCode_POW2_V128(IntCodeState& ics, const IntCode* i) { const vec128_t& src1 = ics.rf[i->src1_reg].v128; vec128_t& dest = ics.rf[i->dest_reg].v128; for (size_t i = 0; i < 4; i++) { - dest.f4[i] = (float)pow(2, src1.f4[i]); + dest.f32[i] = (float)pow(2, src1.f32[i]); } return IA_NEXT; } @@ -3322,7 +3345,7 @@ uint32_t IntCode_LOG2_V128(IntCodeState& ics, const IntCode* i) { const vec128_t& src1 = ics.rf[i->src1_reg].v128; vec128_t& dest = ics.rf[i->dest_reg].v128; for (size_t i = 0; i < 4; i++) { - dest.f4[i] = log2(src1.f4[i]); + dest.f32[i] = log2(src1.f32[i]); } return IA_NEXT; } @@ -3356,7 +3379,7 @@ uint32_t IntCode_AND_V128_V128(IntCodeState& ics, const IntCode* i) { const vec128_t& src2 = ics.rf[i->src2_reg].v128; vec128_t& dest = ics.rf[i->dest_reg].v128; for (int n = 0; n < 4; n++) { - VECI4(dest, n) = VECI4(src1, n) & VECI4(src2, n); + dest.u32[n] = src1.u32[n] & src2.u32[n]; } return IA_NEXT; } @@ -3390,7 +3413,7 @@ uint32_t IntCode_OR_V128_V128(IntCodeState& ics, const IntCode* i) { const vec128_t& src2 = ics.rf[i->src2_reg].v128; vec128_t& dest = ics.rf[i->dest_reg].v128; for (int n = 0; n < 4; n++) { - VECI4(dest, n) = VECI4(src1, 
n) | VECI4(src2, n); + dest.u32[n] = src1.u32[n] | src2.u32[n]; } return IA_NEXT; } @@ -3424,7 +3447,7 @@ uint32_t IntCode_XOR_V128_V128(IntCodeState& ics, const IntCode* i) { const vec128_t& src2 = ics.rf[i->src2_reg].v128; vec128_t& dest = ics.rf[i->dest_reg].v128; for (int n = 0; n < 4; n++) { - VECI4(dest, n) = VECI4(src1, n) ^ VECI4(src2, n); + dest.u32[n] = src1.u32[n] ^ src2.u32[n]; } return IA_NEXT; } @@ -3457,7 +3480,7 @@ uint32_t IntCode_NOT_V128(IntCodeState& ics, const IntCode* i) { const vec128_t& src1 = ics.rf[i->src1_reg].v128; vec128_t& dest = ics.rf[i->dest_reg].v128; for (int n = 0; n < 4; n++) { - VECI4(dest, n) = ~VECI4(src1, n); + dest.u32[n] = ~src1.u32[n]; } return IA_NEXT; } @@ -3500,7 +3523,7 @@ uint32_t IntCode_VECTOR_SHL_I8(IntCodeState& ics, const IntCode* i) { const vec128_t& src2 = ics.rf[i->src2_reg].v128; vec128_t& dest = ics.rf[i->dest_reg].v128; for (int n = 0; n < 16; n++) { - VECB16(dest, n) = VECB16(src1, n) << (VECB16(src2, n) & 0x7); + dest.u8[n] = src1.u8[n] << (src2.u8[n] & 0x7); } return IA_NEXT; } @@ -3509,7 +3532,7 @@ uint32_t IntCode_VECTOR_SHL_I16(IntCodeState& ics, const IntCode* i) { const vec128_t& src2 = ics.rf[i->src2_reg].v128; vec128_t& dest = ics.rf[i->dest_reg].v128; for (int n = 0; n < 8; n++) { - VECS8(dest, n) = VECS8(src1, n) << (VECS8(src2, n) & 0xF); + dest.u16[n] = src1.u16[n] << (src2.u16[n] & 0xF); } return IA_NEXT; } @@ -3518,7 +3541,7 @@ uint32_t IntCode_VECTOR_SHL_I32(IntCodeState& ics, const IntCode* i) { const vec128_t& src2 = ics.rf[i->src2_reg].v128; vec128_t& dest = ics.rf[i->dest_reg].v128; for (int n = 0; n < 4; n++) { - VECI4(dest, n) = VECI4(src1, n) << (VECI4(src2, n) & 0x1F); + dest.u32[n] = src1.u32[n] << (src2.u32[n] & 0x1F); } return IA_NEXT; } @@ -3561,7 +3584,7 @@ uint32_t IntCode_VECTOR_SHR_I8(IntCodeState& ics, const IntCode* i) { const vec128_t& src2 = ics.rf[i->src2_reg].v128; vec128_t& dest = ics.rf[i->dest_reg].v128; for (int n = 0; n < 16; n++) { - VECB16(dest, n) = VECB16(src1, n) >> (VECB16(src2, n) & 0x7); + dest.u8[n] = src1.u8[n] >> (src2.u8[n] & 0x7); } return IA_NEXT; } @@ -3570,7 +3593,7 @@ uint32_t IntCode_VECTOR_SHR_I16(IntCodeState& ics, const IntCode* i) { const vec128_t& src2 = ics.rf[i->src2_reg].v128; vec128_t& dest = ics.rf[i->dest_reg].v128; for (int n = 0; n < 8; n++) { - VECS8(dest, n) = VECS8(src1, n) >> (VECS8(src2, n) & 0xF); + dest.u16[n] = src1.u16[n] >> (src2.u16[n] & 0xF); } return IA_NEXT; } @@ -3579,7 +3602,7 @@ uint32_t IntCode_VECTOR_SHR_I32(IntCodeState& ics, const IntCode* i) { const vec128_t& src2 = ics.rf[i->src2_reg].v128; vec128_t& dest = ics.rf[i->dest_reg].v128; for (int n = 0; n < 4; n++) { - VECI4(dest, n) = VECI4(src1, n) >> (VECI4(src2, n) & 0x1F); + dest.u32[n] = src1.u32[n] >> (src2.u32[n] & 0x1F); } return IA_NEXT; } @@ -3622,7 +3645,7 @@ uint32_t IntCode_VECTOR_SHA_I8(IntCodeState& ics, const IntCode* i) { const vec128_t& src2 = ics.rf[i->src2_reg].v128; vec128_t& dest = ics.rf[i->dest_reg].v128; for (int n = 0; n < 16; n++) { - VECB16(dest, n) = int8_t(VECB16(src1, n)) >> (VECB16(src2, n) & 0x7); + dest.u8[n] = int8_t(src1.u8[n]) >> (src2.u8[n] & 0x7); } return IA_NEXT; } @@ -3631,7 +3654,7 @@ uint32_t IntCode_VECTOR_SHA_I16(IntCodeState& ics, const IntCode* i) { const vec128_t& src2 = ics.rf[i->src2_reg].v128; vec128_t& dest = ics.rf[i->dest_reg].v128; for (int n = 0; n < 8; n++) { - VECS8(dest, n) = int16_t(VECS8(src1, n)) >> (VECS8(src2, n) & 0xF); + dest.u16[n] = int16_t(src1.u16[n]) >> (src2.u16[n] & 0xF); } return IA_NEXT; } @@ -3640,7 +3663,7 @@ 
uint32_t IntCode_VECTOR_SHA_I32(IntCodeState& ics, const IntCode* i) { const vec128_t& src2 = ics.rf[i->src2_reg].v128; vec128_t& dest = ics.rf[i->dest_reg].v128; for (int n = 0; n < 4; n++) { - VECI4(dest, n) = int32_t(VECI4(src1, n)) >> (VECI4(src2, n) & 0x1F); + dest.u32[n] = int32_t(src1.u32[n]) >> (src2.u32[n] & 0x1F); } return IA_NEXT; } @@ -3684,9 +3707,41 @@ int Translate_ROTATE_LEFT(TranslationContext& ctx, Instr* i) { return DispatchToC(ctx, i, fns[i->dest->type]); } +uint32_t IntCode_VECTOR_ROTATE_LEFT_I8(IntCodeState& ics, const IntCode* i) { + const vec128_t& src1 = ics.rf[i->src1_reg].v128; + const vec128_t& src2 = ics.rf[i->src2_reg].v128; + vec128_t& dest = ics.rf[i->dest_reg].v128; + for (int n = 0; n < 16; n++) { + dest.u8[n] = poly::rotate_left(src1.u8[n], src2.u8[n] & 0x7); + } + return IA_NEXT; +} +uint32_t IntCode_VECTOR_ROTATE_LEFT_I16(IntCodeState& ics, const IntCode* i) { + const vec128_t& src1 = ics.rf[i->src1_reg].v128; + const vec128_t& src2 = ics.rf[i->src2_reg].v128; + vec128_t& dest = ics.rf[i->dest_reg].v128; + for (int n = 0; n < 8; n++) { + dest.u16[n] = poly::rotate_left(src1.u16[n], src2.u16[n] & 0xF); + } + return IA_NEXT; +} +uint32_t IntCode_VECTOR_ROTATE_LEFT_I32(IntCodeState& ics, const IntCode* i) { + const vec128_t& src1 = ics.rf[i->src1_reg].v128; + const vec128_t& src2 = ics.rf[i->src2_reg].v128; + vec128_t& dest = ics.rf[i->dest_reg].v128; + for (int n = 0; n < 4; n++) { + dest.u32[n] = poly::rotate_left(src1.u32[n], src2.u32[n] & 0x1F); + } + return IA_NEXT; +} int Translate_VECTOR_ROTATE_LEFT(TranslationContext& ctx, Instr* i) { - assert_always(); - return 1; + static IntCodeFn fns[] = { + IntCode_VECTOR_ROTATE_LEFT_I8, IntCode_VECTOR_ROTATE_LEFT_I16, + IntCode_VECTOR_ROTATE_LEFT_I32, IntCode_INVALID_TYPE, + IntCode_INVALID_TYPE, IntCode_INVALID_TYPE, + IntCode_INVALID_TYPE, + }; + return DispatchToC(ctx, i, fns[i->flags]); } uint32_t IntCode_BYTE_SWAP_I16(IntCodeState& ics, const IntCode* i) { @@ -3705,7 +3760,7 @@ uint32_t IntCode_BYTE_SWAP_V128(IntCodeState& ics, const IntCode* i) { const vec128_t& src1 = ics.rf[i->src1_reg].v128; vec128_t& dest = ics.rf[i->dest_reg].v128; for (int n = 0; n < 4; n++) { - VECI4(dest, n) = poly::byte_swap(VECI4(src1, n)); + dest.u32[n] = poly::byte_swap(src1.u32[n]); } return IA_NEXT; } @@ -3749,17 +3804,17 @@ int Translate_CNTLZ(TranslationContext& ctx, Instr* i) { uint32_t IntCode_EXTRACT_INT8_V128(IntCodeState& ics, const IntCode* i) { const vec128_t& src1 = ics.rf[i->src1_reg].v128; - ics.rf[i->dest_reg].i8 = VECB16(src1, ics.rf[i->src2_reg].i8); + ics.rf[i->dest_reg].i8 = src1.i8[ics.rf[i->src2_reg].i8 ^ 0x3]; return IA_NEXT; } uint32_t IntCode_EXTRACT_INT16_V128(IntCodeState& ics, const IntCode* i) { const vec128_t& src1 = ics.rf[i->src1_reg].v128; - ics.rf[i->dest_reg].i16 = VECS8(src1, ics.rf[i->src2_reg].i8); + ics.rf[i->dest_reg].i16 = src1.i16[ics.rf[i->src2_reg].i8 ^ 0x1]; return IA_NEXT; } uint32_t IntCode_EXTRACT_INT32_V128(IntCodeState& ics, const IntCode* i) { const vec128_t& src1 = ics.rf[i->src1_reg].v128; - ics.rf[i->dest_reg].i32 = VECI4(src1, ics.rf[i->src2_reg].i8); + ics.rf[i->dest_reg].i32 = src1.i32[ics.rf[i->src2_reg].i8]; return IA_NEXT; } int Translate_EXTRACT(TranslationContext& ctx, Instr* i) { @@ -3800,9 +3855,8 @@ uint32_t IntCode_INSERT_INT8_V128(IntCodeState& ics, const IntCode* i) { const size_t offset = ics.rf[i->src2_reg].i64; const uint8_t part = ics.rf[i->src3_reg].i8; vec128_t& dest = ics.rf[i->dest_reg].v128; - for (size_t n = 0; n < 16; n++) { - VECB16(dest, n) = (n 
== offset) ? part : VECB16(src1, n); - } + dest = src1; + dest.u8[offset ^ 0x3] = part; return IA_NEXT; } uint32_t IntCode_INSERT_INT16_V128(IntCodeState& ics, const IntCode* i) { @@ -3810,9 +3864,8 @@ uint32_t IntCode_INSERT_INT16_V128(IntCodeState& ics, const IntCode* i) { const size_t offset = ics.rf[i->src2_reg].i64; const uint16_t part = ics.rf[i->src3_reg].i16; vec128_t& dest = ics.rf[i->dest_reg].v128; - for (size_t n = 0; n < 8; n++) { - VECS8(dest, n) = (n == offset) ? part : VECS8(src1, n); - } + dest = src1; + dest.u16[offset ^ 0x1] = part; return IA_NEXT; } uint32_t IntCode_INSERT_INT32_V128(IntCodeState& ics, const IntCode* i) { @@ -3820,9 +3873,8 @@ uint32_t IntCode_INSERT_INT32_V128(IntCodeState& ics, const IntCode* i) { const size_t offset = ics.rf[i->src2_reg].i64; const uint32_t part = ics.rf[i->src3_reg].i32; vec128_t& dest = ics.rf[i->dest_reg].v128; - for (size_t n = 0; n < 4; n++) { - VECI4(dest, n) = (n == offset) ? part : VECI4(src1, n); - } + dest = src1; + dest.u32[offset] = part; return IA_NEXT; } int Translate_INSERT(TranslationContext& ctx, Instr* i) { @@ -3862,7 +3914,7 @@ uint32_t IntCode_SPLAT_V128_INT8(IntCodeState& ics, const IntCode* i) { int8_t src1 = ics.rf[i->src1_reg].i8; vec128_t& dest = ics.rf[i->dest_reg].v128; for (size_t i = 0; i < 16; i++) { - VECB16(dest, i) = src1; + dest.u8[i] = src1; } return IA_NEXT; } @@ -3870,7 +3922,7 @@ uint32_t IntCode_SPLAT_V128_INT16(IntCodeState& ics, const IntCode* i) { int16_t src1 = ics.rf[i->src1_reg].i16; vec128_t& dest = ics.rf[i->dest_reg].v128; for (size_t i = 0; i < 8; i++) { - VECS8(dest, i) = src1; + dest.u16[i] = src1; } return IA_NEXT; } @@ -3878,7 +3930,7 @@ uint32_t IntCode_SPLAT_V128_INT32(IntCodeState& ics, const IntCode* i) { int32_t src1 = ics.rf[i->src1_reg].i32; vec128_t& dest = ics.rf[i->dest_reg].v128; for (size_t i = 0; i < 4; i++) { - VECI4(dest, i) = src1; + dest.u32[i] = src1; } return IA_NEXT; } @@ -3886,7 +3938,7 @@ uint32_t IntCode_SPLAT_V128_FLOAT32(IntCodeState& ics, const IntCode* i) { float src1 = ics.rf[i->src1_reg].f32; vec128_t& dest = ics.rf[i->dest_reg].v128; for (size_t i = 0; i < 4; i++) { - dest.f4[i] = src1; + dest.f32[i] = src1; } return IA_NEXT; } @@ -3928,10 +3980,14 @@ uint32_t IntCode_PERMUTE_V128_BY_INT32(IntCodeState& ics, const IntCode* i) { const vec128_t& src2 = ics.rf[i->src2_reg].v128; const vec128_t& src3 = ics.rf[i->src3_reg].v128; vec128_t& dest = ics.rf[i->dest_reg].v128; - for (size_t i = 0; i < 4; i++) { - size_t b = (table >> ((3 - i) * 8)) & 0x7; - VECI4(dest, i) = b < 4 ? VECI4(src2, b) : VECI4(src3, b - 4); - } + dest.i32[0] = + (table & 0x00000004) ? src3.i32[table & 0x3] : src2.i32[table & 0x3]; + dest.i32[1] = (table & 0x00000400) ? src3.i32[(table >> 8) & 0x3] + : src2.i32[(table >> 8) & 0x3]; + dest.i32[2] = (table & 0x00040000) ? src3.i32[(table >> 16) & 0x3] + : src2.i32[(table >> 16) & 0x3]; + dest.i32[3] = (table & 0x04000000) ? src3.i32[(table >> 24) & 0x3] + : src2.i32[(table >> 24) & 0x3]; return IA_NEXT; } uint32_t IntCode_PERMUTE_V128_BY_V128(IntCodeState& ics, const IntCode* i) { @@ -3941,9 +3997,8 @@ uint32_t IntCode_PERMUTE_V128_BY_V128(IntCodeState& ics, const IntCode* i) { vec128_t& dest = ics.rf[i->dest_reg].v128; dest.low = dest.high = 0; for (size_t n = 0; n < 16; n++) { - uint8_t index = VECB16(table, n) & 0x1F; - VECB16(dest, n) = - index < 16 ? VECB16(src2, index) : VECB16(src3, index - 16); + uint8_t index = (table.u8[n] & 0x1F) ^ 0x3; + dest.u8[n] = index < 16 ? 
src2.u8[index] : src3.u8[index - 16]; } return IA_NEXT; } @@ -3984,10 +4039,10 @@ uint32_t IntCode_SWIZZLE_V128(IntCodeState& ics, const IntCode* i) { const vec128_t& src1 = ics.rf[i->src1_reg].v128; uint32_t swizzle_mask = ics.rf[i->src2_reg].u32; vec128_t& dest = ics.rf[i->dest_reg].v128; - VECI4(dest, 0) = VECI4(src1, (swizzle_mask >> 6) & 0x3); - VECI4(dest, 1) = VECI4(src1, (swizzle_mask >> 4) & 0x3); - VECI4(dest, 2) = VECI4(src1, (swizzle_mask >> 2) & 0x3); - VECI4(dest, 3) = VECI4(src1, (swizzle_mask)&0x3); + dest.i32[0] = src1.i32[(swizzle_mask >> 0) & 0x3]; + dest.i32[1] = src1.i32[(swizzle_mask >> 2) & 0x3]; + dest.i32[2] = src1.i32[(swizzle_mask >> 4) & 0x3]; + dest.i32[3] = src1.i32[(swizzle_mask >> 6) & 0x3]; return IA_NEXT; } int Translate_SWIZZLE(TranslationContext& ctx, Instr* i) { @@ -4003,48 +4058,36 @@ uint32_t IntCode_PACK_D3DCOLOR(IntCodeState& ics, const IntCode* i) { const vec128_t& src1 = ics.rf[i->src1_reg].v128; vec128_t& dest = ics.rf[i->dest_reg].v128; // RGBA (XYZW) -> ARGB (WXYZ) - dest.ix = dest.iy = dest.iz = 0; - float r = roundf(((src1.x < 0) ? 0 : ((1 < src1.x) ? 1 : src1.x)) * 255); - float g = roundf(((src1.y < 0) ? 0 : ((1 < src1.y) ? 1 : src1.y)) * 255); - float b = roundf(((src1.z < 0) ? 0 : ((1 < src1.z) ? 1 : src1.z)) * 255); - float a = roundf(((src1.w < 0) ? 0 : ((1 < src1.w) ? 1 : src1.w)) * 255); - dest.iw = ((uint32_t)a << 24) | ((uint32_t)r << 16) | ((uint32_t)g << 8) | - ((uint32_t)b); + dest.ux = dest.uy = dest.uz = 0; + dest.uw = ((src1.uw & 0xFF) << 24) | ((src1.ux & 0xFF) << 16) | + ((src1.uy & 0xFF) << 8) | (src1.uz & 0xFF); return IA_NEXT; } uint32_t IntCode_PACK_FLOAT16_2(IntCodeState& ics, const IntCode* i) { const vec128_t& src1 = ics.rf[i->src1_reg].v128; vec128_t& dest = ics.rf[i->dest_reg].v128; - dest.ix = dest.iy = dest.iz = 0; - dest.iw = (uint32_t(poly::float_to_half(src1.x)) << 16) | + dest.ux = dest.uy = dest.uz = 0; + dest.uw = (uint32_t(poly::float_to_half(src1.x)) << 16) | poly::float_to_half(src1.y); return IA_NEXT; } uint32_t IntCode_PACK_FLOAT16_4(IntCodeState& ics, const IntCode* i) { const vec128_t& src1 = ics.rf[i->src1_reg].v128; vec128_t& dest = ics.rf[i->dest_reg].v128; - dest.ix = dest.iy = 0; - dest.iz = (uint32_t(poly::float_to_half(src1.x)) << 16) | + dest.ux = dest.uy = 0; + dest.uz = (uint32_t(poly::float_to_half(src1.x)) << 16) | poly::float_to_half(src1.y); - dest.iw = (uint32_t(poly::float_to_half(src1.z)) << 16) | + dest.uw = (uint32_t(poly::float_to_half(src1.z)) << 16) | poly::float_to_half(src1.w); return IA_NEXT; } uint32_t IntCode_PACK_SHORT_2(IntCodeState& ics, const IntCode* i) { const vec128_t& src1 = ics.rf[i->src1_reg].v128; vec128_t& dest = ics.rf[i->dest_reg].v128; - // sx = 3 + (x / 1<<22) - // x = (sx - 3) * 1<<22 - float sx = src1.x; - float sy = src1.y; - union { - int16_t dx; - int16_t dy; - }; - dx = (int16_t)((sx - 3.0f) * (float)(1 << 22)); - dy = (int16_t)((sy - 3.0f) * (float)(1 << 22)); - dest.ix = dest.iy = dest.iz = 0; - dest.iw = ((uint32_t)dx << 16) | dy; + int16_t dx = int16_t(poly::saturate(src1.x) * 32767.0f); + int16_t dy = int16_t(poly::saturate(src1.y) * 32767.0f); + dest.ux = dest.uy = dest.uz = 0; + dest.uw = (uint32_t(uint16_t(dx)) << 16) | uint32_t(uint16_t(dy)); return IA_NEXT; } int Translate_PACK(TranslationContext& ctx, Instr* i) { @@ -4061,93 +4104,85 @@ uint32_t IntCode_UNPACK_D3DCOLOR(IntCodeState& ics, const IntCode* i) { vec128_t& dest = ics.rf[i->dest_reg].v128; // ARGB (WXYZ) -> RGBA (XYZW) // XMLoadColor - int32_t src = (int32_t)src1.iw; - dest.f4[0] = 
(float)((src >> 16) & 0xFF) * (1.0f / 255.0f); - dest.f4[1] = (float)((src >> 8) & 0xFF) * (1.0f / 255.0f); - dest.f4[2] = (float)(src & 0xFF) * (1.0f / 255.0f); - dest.f4[3] = (float)((src >> 24) & 0xFF) * (1.0f / 255.0f); + int32_t src = (int32_t)src1.uw; + dest.u32[0] = 0x3F800000 | ((src >> 16) & 0xFF); + dest.u32[1] = 0x3F800000 | ((src >> 8) & 0xFF); + dest.u32[2] = 0x3F800000 | (src & 0xFF); + dest.u32[3] = 0x3F800000 | ((src >> 24) & 0xFF); return IA_NEXT; } uint32_t IntCode_UNPACK_FLOAT16_2(IntCodeState& ics, const IntCode* i) { const vec128_t& src1 = ics.rf[i->src1_reg].v128; vec128_t& dest = ics.rf[i->dest_reg].v128; - uint32_t src = src1.iw; - for (int n = 0; n < 2; n++) { - dest.f4[n] = poly::half_to_float(uint16_t(src)); - src >>= 16; - } - dest.f4[2] = 0.0f; - dest.f4[3] = 1.0f; + dest.f32[0] = poly::half_to_float(uint16_t(src1.uw >> 16)); + dest.f32[1] = poly::half_to_float(uint16_t(src1.uw)); + dest.f32[2] = 0.0f; + dest.f32[3] = 1.0f; return IA_NEXT; } uint32_t IntCode_UNPACK_FLOAT16_4(IntCodeState& ics, const IntCode* i) { const vec128_t& src1 = ics.rf[i->src1_reg].v128; vec128_t& dest = ics.rf[i->dest_reg].v128; - uint64_t src = src1.iz | ((uint64_t)src1.iw << 32); - for (int n = 0; n < 4; n++) { - dest.f4[n] = poly::half_to_float(uint16_t(src)); - src >>= 16; - } + dest.f32[0] = poly::half_to_float(src1.u16[5]); + dest.f32[1] = poly::half_to_float(src1.u16[4]); + dest.f32[2] = poly::half_to_float(src1.u16[7]); + dest.f32[3] = poly::half_to_float(src1.u16[6]); return IA_NEXT; } uint32_t IntCode_UNPACK_SHORT_2(IntCodeState& ics, const IntCode* i) { const vec128_t& src1 = ics.rf[i->src1_reg].v128; vec128_t& dest = ics.rf[i->dest_reg].v128; // XMLoadShortN2 - union { - int16_t sx; - int16_t sy; - }; - sx = (int16_t)(src1.iw >> 16); - sy = (int16_t)src1.iw; - dest.f4[0] = 3.0f + ((float)sx / (float)(1 << 22)); - dest.f4[1] = 3.0f + ((float)sy / (float)(1 << 22)); - dest.f4[2] = 0.0f; - dest.f4[3] = 1.0f; // 3? + uint16_t sx = src1.uw >> 16; + uint16_t sy = src1.uw & 0xFFFF; + dest.u32[0] = sx & 0x8000 ? (0x403F0000 | sx) : (0x40400000 | sx); + dest.u32[1] = sy & 0x8000 ? 
(0x403F0000 | sy) : (0x40400000 | sy); + dest.u32[2] = 0; + dest.u32[3] = 0x3F800000; return IA_NEXT; } uint32_t IntCode_UNPACK_S8_IN_16_LO(IntCodeState& ics, const IntCode* i) { const vec128_t& src1 = ics.rf[i->src1_reg].v128; vec128_t& dest = ics.rf[i->dest_reg].v128; - VECS8(dest, 0) = (int16_t)(int8_t)VECB16(src1, 8 + 0); - VECS8(dest, 1) = (int16_t)(int8_t)VECB16(src1, 8 + 1); - VECS8(dest, 2) = (int16_t)(int8_t)VECB16(src1, 8 + 2); - VECS8(dest, 3) = (int16_t)(int8_t)VECB16(src1, 8 + 3); - VECS8(dest, 4) = (int16_t)(int8_t)VECB16(src1, 8 + 4); - VECS8(dest, 5) = (int16_t)(int8_t)VECB16(src1, 8 + 5); - VECS8(dest, 6) = (int16_t)(int8_t)VECB16(src1, 8 + 6); - VECS8(dest, 7) = (int16_t)(int8_t)VECB16(src1, 8 + 7); + dest.i16[0] = (int16_t)src1.i8[8 + 0]; + dest.i16[1] = (int16_t)src1.i8[8 + 1]; + dest.i16[2] = (int16_t)src1.i8[8 + 2]; + dest.i16[3] = (int16_t)src1.i8[8 + 3]; + dest.i16[4] = (int16_t)src1.i8[8 + 4]; + dest.i16[5] = (int16_t)src1.i8[8 + 5]; + dest.i16[6] = (int16_t)src1.i8[8 + 6]; + dest.i16[7] = (int16_t)src1.i8[8 + 7]; return IA_NEXT; } uint32_t IntCode_UNPACK_S8_IN_16_HI(IntCodeState& ics, const IntCode* i) { const vec128_t& src1 = ics.rf[i->src1_reg].v128; vec128_t& dest = ics.rf[i->dest_reg].v128; - VECS8(dest, 0) = (int16_t)(int8_t)VECB16(src1, 0); - VECS8(dest, 1) = (int16_t)(int8_t)VECB16(src1, 1); - VECS8(dest, 2) = (int16_t)(int8_t)VECB16(src1, 2); - VECS8(dest, 3) = (int16_t)(int8_t)VECB16(src1, 3); - VECS8(dest, 4) = (int16_t)(int8_t)VECB16(src1, 4); - VECS8(dest, 5) = (int16_t)(int8_t)VECB16(src1, 5); - VECS8(dest, 6) = (int16_t)(int8_t)VECB16(src1, 6); - VECS8(dest, 7) = (int16_t)(int8_t)VECB16(src1, 7); + dest.i16[0] = (int16_t)src1.i8[0]; + dest.i16[1] = (int16_t)src1.i8[1]; + dest.i16[2] = (int16_t)src1.i8[2]; + dest.i16[3] = (int16_t)src1.i8[3]; + dest.i16[4] = (int16_t)src1.i8[4]; + dest.i16[5] = (int16_t)src1.i8[5]; + dest.i16[6] = (int16_t)src1.i8[6]; + dest.i16[7] = (int16_t)src1.i8[7]; return IA_NEXT; } uint32_t IntCode_UNPACK_S16_IN_32_LO(IntCodeState& ics, const IntCode* i) { const vec128_t& src1 = ics.rf[i->src1_reg].v128; vec128_t& dest = ics.rf[i->dest_reg].v128; - VECI4(dest, 0) = (int32_t)(int16_t)VECS8(src1, 4 + 0); - VECI4(dest, 1) = (int32_t)(int16_t)VECS8(src1, 4 + 1); - VECI4(dest, 2) = (int32_t)(int16_t)VECS8(src1, 4 + 2); - VECI4(dest, 3) = (int32_t)(int16_t)VECS8(src1, 4 + 3); + dest.i32[0] = (int32_t)src1.i16[4 + 0]; + dest.i32[1] = (int32_t)src1.i16[4 + 1]; + dest.i32[2] = (int32_t)src1.i16[4 + 2]; + dest.i32[3] = (int32_t)src1.i16[4 + 3]; return IA_NEXT; } uint32_t IntCode_UNPACK_S16_IN_32_HI(IntCodeState& ics, const IntCode* i) { const vec128_t& src1 = ics.rf[i->src1_reg].v128; vec128_t& dest = ics.rf[i->dest_reg].v128; - VECI4(dest, 0) = (int32_t)(int16_t)VECS8(src1, 0); - VECI4(dest, 1) = (int32_t)(int16_t)VECS8(src1, 1); - VECI4(dest, 2) = (int32_t)(int16_t)VECS8(src1, 2); - VECI4(dest, 3) = (int32_t)(int16_t)VECS8(src1, 3); + dest.i32[0] = (int32_t)src1.i16[0]; + dest.i32[1] = (int32_t)src1.i16[1]; + dest.i32[2] = (int32_t)src1.i16[2]; + dest.i32[3] = (int32_t)src1.i16[3]; return IA_NEXT; } int Translate_UNPACK(TranslationContext& ctx, Instr* i) { diff --git a/src/alloy/backend/x64/x64_emitter.cc b/src/alloy/backend/x64/x64_emitter.cc index 3762fdd4e..0dac2fb36 100644 --- a/src/alloy/backend/x64/x64_emitter.cc +++ b/src/alloy/backend/x64/x64_emitter.cc @@ -41,6 +41,7 @@ using alloy::runtime::ThreadState; static const size_t MAX_CODE_SIZE = 1 * 1024 * 1024; static const size_t STASH_OFFSET = 32; +static const size_t 
STASH_OFFSET_HIGH = 32 + 16; // If we are running with tracing on we have to store the EFLAGS in the stack, // otherwise our calls out to C to print will clear it before DID_CARRY/etc @@ -786,8 +787,8 @@ void X64Emitter::MovMem64(const RegExp& addr, uint64_t v) { Address X64Emitter::GetXmmConstPtr(XmmConst id) { static const vec128_t xmm_consts[] = { - /* XMMZero */ vec128f(0.0f, 0.0f, 0.0f, 0.0f), - /* XMMOne */ vec128f(1.0f, 1.0f, 1.0f, 1.0f), + /* XMMZero */ vec128f(0.0f), + /* XMMOne */ vec128f(1.0f), /* XMMNegativeOne */ vec128f(-1.0f, -1.0f, -1.0f, -1.0f), /* XMMMaskX16Y16 */ vec128i(0x0000FFFFu, 0xFFFF0000u, 0x00000000u, 0x00000000u), @@ -808,14 +809,24 @@ Address X64Emitter::GetXmmConstPtr(XmmConst id) { 0xFFFFFFFFu, 0x7FFFFFFFu), /* XMMByteSwapMask */ vec128i(0x00010203u, 0x04050607u, 0x08090A0Bu, 0x0C0D0E0Fu), - /* XMMPermuteControl15 */ vec128b(15, 15, 15, 15, 15, 15, 15, 15, 15, - 15, 15, 15, 15, 15, 15, 15), + /* XMMPermuteControl15 */ vec128b(15), /* XMMPackD3DCOLOR */ vec128i(0xFFFFFFFFu, 0xFFFFFFFFu, 0xFFFFFFFFu, 0x0C000408u), /* XMMUnpackD3DCOLOR */ vec128i(0xFFFFFF0Eu, 0xFFFFFF0Du, 0xFFFFFF0Cu, 0xFFFFFF0Fu), - /* XMMOneOver255 */ vec128f(1.0f / 255.0f, 1.0f / 255.0f, - 1.0f / 255.0f, 1.0f / 255.0f), + /* XMMPackFLOAT16_2 */ vec128i(0xFFFFFFFFu, 0xFFFFFFFFu, + 0xFFFFFFFFu, 0x01000302u), + /* XMMUnpackFLOAT16_2 */ vec128i(0x0D0C0F0Eu, 0xFFFFFFFFu, + 0xFFFFFFFFu, 0xFFFFFFFFu), + /* XMMPackFLOAT16_4 */ vec128i(0xFFFFFFFFu, 0xFFFFFFFFu, + 0x05040706u, 0x01000302u), + /* XMMUnpackFLOAT16_4 */ vec128i(0x09080B0Au, 0x0D0C0F0Eu, + 0xFFFFFFFFu, 0xFFFFFFFFu), + /* XMMPackSHORT_2 */ vec128i(0xFFFFFFFFu, 0xFFFFFFFFu, + 0xFFFFFFFFu, 0x01000504u), + /* XMMUnpackSHORT_2 */ vec128i(0xFFFF0F0Eu, 0xFFFF0D0Cu, + 0xFFFFFFFFu, 0xFFFFFFFFu), + /* XMMOneOver255 */ vec128f(1.0f / 255.0f), /* XMMMaskEvenPI16 */ vec128i(0x0000FFFFu, 0x0000FFFFu, 0x0000FFFFu, 0x0000FFFFu), /* XMMShiftMaskEvenPI16 */ vec128i(0x0000000Fu, 0x0000000Fu, @@ -826,8 +837,8 @@ Address X64Emitter::GetXmmConstPtr(XmmConst id) { 0x000000FFu, 0x000000FFu), /* XMMUnsignedDwordMax */ vec128i(0xFFFFFFFFu, 0x00000000u, 0xFFFFFFFFu, 0x00000000u), - /* XMM255 */ vec128f(255.0f, 255.0f, 255.0f, 255.0f), - /* XMMPI32 */ vec128i(32, 32, 32, 32), + /* XMM255 */ vec128f(255.0f), + /* XMMPI32 */ vec128i(32), /* XMMSignMaskI8 */ vec128i(0x80808080u, 0x80808080u, 0x80808080u, 0x80808080u), /* XMMSignMaskI16 */ vec128i(0x80008000u, 0x80008000u, @@ -836,6 +847,8 @@ Address X64Emitter::GetXmmConstPtr(XmmConst id) { 0x80000000u, 0x80000000u), /* XMMSignMaskF32 */ vec128i(0x80000000u, 0x80000000u, 0x80000000u, 0x80000000u), + /* XMMShortMinPS */ vec128f(SHRT_MIN), + /* XMMShortMaxPS */ vec128f(SHRT_MAX), }; // TODO(benvanik): cache base pointer somewhere? stack? It'd be nice to // prevent this move. 
@@ -901,19 +914,12 @@ void X64Emitter::LoadConstantXmm(Xbyak::Xmm dest, double v) { } } -Address X64Emitter::StashXmm(const Xmm& r) { - auto addr = ptr[rsp + STASH_OFFSET]; +Address X64Emitter::StashXmm(int index, const Xmm& r) { + auto addr = ptr[rsp + STASH_OFFSET + (index * 16)]; vmovups(addr, r); return addr; } -Address X64Emitter::StashXmm(const vec128_t& v) { - auto addr = ptr[rsp + STASH_OFFSET]; - LoadConstantXmm(xmm0, v); - vmovups(addr, xmm0); - return addr; -} - } // namespace x64 } // namespace backend } // namespace alloy diff --git a/src/alloy/backend/x64/x64_emitter.h b/src/alloy/backend/x64/x64_emitter.h index e63b72b7f..7755c8cdb 100644 --- a/src/alloy/backend/x64/x64_emitter.h +++ b/src/alloy/backend/x64/x64_emitter.h @@ -56,6 +56,12 @@ enum XmmConst { XMMPermuteControl15, XMMPackD3DCOLOR, XMMUnpackD3DCOLOR, + XMMPackFLOAT16_2, + XMMUnpackFLOAT16_2, + XMMPackFLOAT16_4, + XMMUnpackFLOAT16_4, + XMMPackSHORT_2, + XMMUnpackSHORT_2, XMMOneOver255, XMMMaskEvenPI16, XMMShiftMaskEvenPI16, @@ -68,6 +74,8 @@ enum XmmConst { XMMSignMaskI16, XMMSignMaskI32, XMMSignMaskF32, + XMMShortMinPS, + XMMShortMaxPS, }; // Unfortunately due to the design of xbyak we have to pass this to the ctor. @@ -158,8 +166,7 @@ class X64Emitter : public Xbyak::CodeGenerator { void LoadConstantXmm(Xbyak::Xmm dest, float v); void LoadConstantXmm(Xbyak::Xmm dest, double v); void LoadConstantXmm(Xbyak::Xmm dest, const vec128_t& v); - Xbyak::Address StashXmm(const Xbyak::Xmm& r); - Xbyak::Address StashXmm(const vec128_t& v); + Xbyak::Address StashXmm(int index, const Xbyak::Xmm& r); size_t stack_size() const { return stack_size_; } diff --git a/src/alloy/backend/x64/x64_sequences.cc b/src/alloy/backend/x64/x64_sequences.cc index 2edda3870..f992b9949 100644 --- a/src/alloy/backend/x64/x64_sequences.cc +++ b/src/alloy/backend/x64/x64_sequences.cc @@ -994,24 +994,23 @@ EMITTER_OPCODE_TABLE( // ============================================================================ // OPCODE_LOAD_VECTOR_SHL // ============================================================================ -static vec128_t lvsl_table[17] = { - vec128b( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15), - vec128b( 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16), - vec128b( 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17), - vec128b( 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18), - vec128b( 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19), - vec128b( 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20), - vec128b( 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21), - vec128b( 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22), - vec128b( 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23), - vec128b( 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24), - vec128b(10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25), - vec128b(11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26), - vec128b(12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27), - vec128b(13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28), - vec128b(14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29), - vec128b(15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30), - vec128b(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31), +static const vec128_t lvsl_table[16] = { + vec128b(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15), + vec128b(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16), + vec128b(2, 3, 4, 5, 6, 7, 8, 
9, 10, 11, 12, 13, 14, 15, 16, 17), + vec128b(3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18), + vec128b(4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19), + vec128b(5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20), + vec128b(6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21), + vec128b(7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22), + vec128b(8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23), + vec128b(9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24), + vec128b(10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25), + vec128b(11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26), + vec128b(12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27), + vec128b(13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28), + vec128b(14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29), + vec128b(15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30), }; EMITTER(LOAD_VECTOR_SHL_I8, MATCH(I, I8<>>)) { static void Emit(X64Emitter& e, const EmitArgType& i) { @@ -1021,17 +1020,10 @@ EMITTER(LOAD_VECTOR_SHL_I8, MATCH(I, I8<>>)) { e.mov(e.rax, (uintptr_t)&lvsl_table[sh]); e.vmovaps(i.dest, e.ptr[e.rax]); } else { -#if XE_DEBUG - // We should only ever be getting values in [0,16]. Assert that. - Xbyak::Label skip; - e.cmp(i.src1, 17); - e.jb(skip); - e.Trap(); - e.L(skip); -#endif // XE_DEBUG // TODO(benvanik): find a cheaper way of doing this. e.movzx(e.rdx, i.src1); - e.shl(e.rdx, 4); + e.and(e.dx, 0xF); + e.shl(e.dx, 4); e.mov(e.rax, (uintptr_t)lvsl_table); e.vmovaps(i.dest, e.ptr[e.rax + e.rdx]); e.ReloadEDX(); @@ -1046,24 +1038,23 @@ EMITTER_OPCODE_TABLE( // ============================================================================ // OPCODE_LOAD_VECTOR_SHR // ============================================================================ -static vec128_t lvsr_table[17] = { - vec128b(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31), - vec128b(15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30), - vec128b(14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29), - vec128b(13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28), - vec128b(12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27), - vec128b(11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26), - vec128b(10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25), - vec128b( 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24), - vec128b( 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23), - vec128b( 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22), - vec128b( 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21), - vec128b( 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20), - vec128b( 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19), - vec128b( 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18), - vec128b( 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17), - vec128b( 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16), - vec128b( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15), +static const vec128_t lvsr_table[16] = { + vec128b(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31), + vec128b(15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30), + vec128b(14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29), + vec128b(13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28), + vec128b(12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 
25, 26, 27), + vec128b(11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26), + vec128b(10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25), + vec128b(9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24), + vec128b(8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23), + vec128b(7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22), + vec128b(6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21), + vec128b(5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20), + vec128b(4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19), + vec128b(3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18), + vec128b(2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17), + vec128b(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16), }; EMITTER(LOAD_VECTOR_SHR_I8, MATCH(I, I8<>>)) { static void Emit(X64Emitter& e, const EmitArgType& i) { @@ -1073,17 +1064,10 @@ EMITTER(LOAD_VECTOR_SHR_I8, MATCH(I, I8<>>)) { e.mov(e.rax, (uintptr_t)&lvsr_table[sh]); e.vmovaps(i.dest, e.ptr[e.rax]); } else { -#if XE_DEBUG - // We should only ever be getting values in [0,16]. Assert that. - Xbyak::Label skip; - e.cmp(i.src1, 17); - e.jb(skip); - e.Trap(); - e.L(skip); -#endif // XE_DEBUG // TODO(benvanik): find a cheaper way of doing this. e.movzx(e.rdx, i.src1); - e.shl(e.rdx, 4); + e.and(e.dx, 0xF); + e.shl(e.dx, 4); e.mov(e.rax, (uintptr_t)lvsr_table); e.vmovaps(i.dest, e.ptr[e.rax + e.rdx]); e.ReloadEDX(); @@ -2676,7 +2660,41 @@ EMITTER_OPCODE_TABLE( // ============================================================================ // OPCODE_VECTOR_ADD // ============================================================================ -EMITTER(VECTOR_ADD, MATCH(I, V128<>, V128<>>)) { +EMITTER(VECTOR_ADD, MATCH(I, V128<>, V128<>>)){ + static __m128i EmulateVectorAddUnsignedSatI32(void*, __m128i src1, + __m128i src2){ + alignas(16) uint32_t a[4]; + alignas(16) uint32_t b[4]; + _mm_store_si128(reinterpret_cast<__m128i*>(a), src1); + _mm_store_si128(reinterpret_cast<__m128i*>(b), src2); + for (size_t i = 0; i < 4; ++i) { + uint64_t v = (uint64_t)a[i] + (uint64_t)b[i]; + if (v > 0xFFFFFFFF) { + a[i] = 0xFFFFFFFF; + } else { + a[i] = (uint32_t)v; + } + } + return _mm_load_si128(reinterpret_cast<__m128i*>(a)); + } + static __m128i EmulateVectorAddSignedSatI32(void*, __m128i src1, + __m128i src2){ + alignas(16) int32_t a[4]; + alignas(16) int32_t b[4]; + _mm_store_si128(reinterpret_cast<__m128i*>(a), src1); + _mm_store_si128(reinterpret_cast<__m128i*>(b), src2); + for (size_t i = 0; i < 4; ++i) { + int64_t v = (int64_t)a[i] + (int64_t)b[i]; + if (v > 0x7FFFFFFF) { + a[i] = 0x7FFFFFFF; + } else if (v < -0x80000000ll) { + a[i] = 0x80000000; + } else { + a[i] = (uint32_t)v; + } + } + return _mm_load_si128(reinterpret_cast<__m128i*>(a)); + } static void Emit(X64Emitter& e, const EmitArgType& i) { EmitCommutativeBinaryXmmOp(e, i, [&i](X64Emitter& e, const Xmm& dest, const Xmm& src1, const Xmm& src2) { @@ -2712,44 +2730,66 @@ EMITTER(VECTOR_ADD, MATCH(I, V128<>, V128<>>)) { case INT32_TYPE: if (saturate) { if (is_unsigned) { - // We reuse all these temps... - assert_true(src1 != e.xmm0 && src1 != e.xmm1 && src1 != e.xmm2); - assert_true(src2 != e.xmm0 && src2 != e.xmm1 && src2 != e.xmm2); - // Clamp to 0xFFFFFFFF. - // Wish there was a vpaddusd... 
- // | A | B | C | D | - // | B | D | - e.vpsllq(e.xmm0, src1, 32); - e.vpsllq(e.xmm1, src2, 32); - e.vpsrlq(e.xmm0, 32); - e.vpsrlq(e.xmm1, 32); - e.vpaddq(e.xmm0, e.xmm1); - e.vpcmpgtq(e.xmm0, e.GetXmmConstPtr(XMMUnsignedDwordMax)); - e.vpsllq(e.xmm0, 32); - e.vpsrlq(e.xmm0, 32); - // | A | C | - e.vpsrlq(e.xmm1, src1, 32); - e.vpsrlq(e.xmm2, src2, 32); - e.vpaddq(e.xmm1, e.xmm2); - e.vpcmpgtq(e.xmm1, e.GetXmmConstPtr(XMMUnsignedDwordMax)); - e.vpsllq(e.xmm1, 32); - // xmm0 = mask for with saturated dwords == 111... - e.vpor(e.xmm0, e.xmm1); - e.vpaddd(dest, src1, src2); - // dest.f[n] = xmm1.f[n] ? xmm1.f[n] : dest.f[n]; - e.vblendvps(dest, dest, e.xmm1, e.xmm1); + // TODO(benvanik): broken with UINT32MAX+1 + //// We reuse all these temps... + //assert_true(src1 != e.xmm0 && src1 != e.xmm1 && src1 != e.xmm2); + //assert_true(src2 != e.xmm0 && src2 != e.xmm1 && src2 != e.xmm2); + //// Clamp to 0xFFFFFFFF. + //// Wish there was a vpaddusd... + //// | A | B | C | D | + //// | B | D | + //e.vpsllq(e.xmm0, src1, 32); + //e.vpsllq(e.xmm1, src2, 32); + //e.vpsrlq(e.xmm0, 32); + //e.vpsrlq(e.xmm1, 32); + //e.vpaddq(e.xmm0, e.xmm1); + //e.vpcmpgtq(e.xmm0, e.GetXmmConstPtr(XMMUnsignedDwordMax)); + //e.vpsllq(e.xmm0, 32); + //e.vpsrlq(e.xmm0, 32); + //// | A | C | + //e.vpsrlq(e.xmm1, src1, 32); + //e.vpsrlq(e.xmm2, src2, 32); + //e.vpaddq(e.xmm1, e.xmm2); + //e.vpcmpgtq(e.xmm1, e.GetXmmConstPtr(XMMUnsignedDwordMax)); + //e.vpsllq(e.xmm1, 32); + //// xmm0 = mask for with saturated dwords == 111... + //e.vpor(e.xmm0, e.xmm1); + //e.vpaddd(dest, src1, src2); + //// dest.f[n] = xmm1.f[n] ? xmm1.f[n] : dest.f[n]; + //e.vblendvps(dest, dest, e.xmm1, e.xmm1); + if (i.src2.is_constant) { + e.LoadConstantXmm(e.xmm0, i.src2.constant()); + e.lea(e.r9, e.StashXmm(1, e.xmm0)); + } else { + e.lea(e.r9, e.StashXmm(1, i.src2)); + } + e.lea(e.r8, e.StashXmm(0, i.src1)); + e.CallNativeSafe( + reinterpret_cast(EmulateVectorAddUnsignedSatI32)); + e.vmovaps(i.dest, e.xmm0); } else { // https://software.intel.com/en-us/forums/topic/285219 + // TODO(benvanik): this is broken with INTMAX+1. // We reuse all these temps... 
- assert_true(src1 != e.xmm0 && src1 != e.xmm1 && src1 != e.xmm2); - assert_true(src2 != e.xmm0 && src2 != e.xmm1 && src2 != e.xmm2); - e.vpaddd(e.xmm0, src1, src2); // res - e.vpand(e.xmm1, src1, src2); // sign_and - e.vpandn(e.xmm2, e.xmm0, e.xmm1); // min_sat_mask - e.vblendvps(e.xmm2, e.xmm0, e.GetXmmConstPtr(XMMSignMaskPS), e.xmm2); - e.vpor(e.xmm1, src1, src2); // sign_or - e.vpandn(e.xmm1, e.xmm0); // max_sat_mask - e.vblendvps(dest, e.GetXmmConstPtr(XMMAbsMaskPS), e.xmm1); + //assert_true(src1 != e.xmm0 && src1 != e.xmm1 && src1 != e.xmm2); + //assert_true(src2 != e.xmm0 && src2 != e.xmm1 && src2 != e.xmm2); + //e.vpaddd(e.xmm0, src1, src2); // res + //e.vpand(e.xmm1, src1, src2); // sign_and + //e.vpandn(e.xmm2, e.xmm0, e.xmm1); // min_sat_mask + //e.vblendvps(dest, e.xmm0, e.GetXmmConstPtr(XMMSignMaskPS), e.xmm2); + //e.vpor(e.xmm1, src1, src2); // sign_or + //e.vpandn(e.xmm1, e.xmm0); // max_sat_mask + //e.vblendvps(dest, e.GetXmmConstPtr(XMMAbsMaskPS), e.xmm1); + if (i.src2.is_constant) { + e.LoadConstantXmm(e.xmm0, i.src2.constant()); + e.lea(e.r9, e.StashXmm(1, e.xmm0)); + } else { + e.lea(e.r9, e.StashXmm(1, i.src2)); + } + e.lea(e.r8, e.StashXmm(0, i.src1)); + e.CallNativeSafe( + reinterpret_cast(EmulateVectorAddSignedSatI32)); + e.vmovaps(i.dest, e.xmm0); } } else { e.vpaddd(dest, src1, src2); @@ -2852,17 +2892,17 @@ EMITTER_OPCODE_TABLE( // OPCODE_VECTOR_SUB // ============================================================================ EMITTER(VECTOR_SUB, MATCH(I, V128<>, V128<>>)) { - static __m128i EmulateVectorSubSignedSatI32(__m128i src1, __m128i src2) { + static __m128i EmulateVectorSubSignedSatI32(void*, __m128i src1, __m128i src2) { alignas(16) int32_t src1v[4]; alignas(16) int32_t src2v[4]; alignas(16) int32_t value[4]; - _mm_store_si128(reinterpret_cast<__m128i*>(&src1v), src1); - _mm_store_si128(reinterpret_cast<__m128i*>(&src2v), src2); + _mm_store_si128(reinterpret_cast<__m128i*>(src1v), src1); + _mm_store_si128(reinterpret_cast<__m128i*>(src2v), src2); for (size_t i = 0; i < 4; ++i) { auto t = int64_t(src1v[i]) - int64_t(src2v[i]); value[i] = t < INT_MIN ? INT_MIN : (t > INT_MAX ? 
INT_MAX : int32_t(t)); } - return _mm_load_si128(reinterpret_cast<__m128i*>(&value)); + return _mm_load_si128(reinterpret_cast<__m128i*>(value)); } static void Emit(X64Emitter& e, const EmitArgType& i) { EmitCommutativeBinaryXmmOp(e, i, @@ -2901,8 +2941,8 @@ EMITTER(VECTOR_SUB, MATCH(I, V128<>, V128<>>)) { if (is_unsigned) { assert_always(); } else { - e.lea(e.r8, e.StashXmm(i.src1)); - e.lea(e.r9, e.StashXmm(i.src2)); + e.lea(e.r8, e.StashXmm(0, i.src1)); + e.lea(e.r9, e.StashXmm(1, i.src2)); e.CallNativeSafe( reinterpret_cast(EmulateVectorSubSignedSatI32)); e.vmovaps(i.dest, e.xmm0); @@ -3638,7 +3678,7 @@ EMITTER_OPCODE_TABLE( // TODO(benvanik): use approx here: // http://jrfonseca.blogspot.com/2008/09/fast-sse2-pow-tables-or-polynomials.html EMITTER(POW2_F32, MATCH(I, F32<>>)) { - static __m128 EmulatePow2(__m128 src) { + static __m128 EmulatePow2(void*, __m128 src) { float src_value; _mm_store_ss(&src_value, src); float result = std::pow(2.0f, src_value); @@ -3646,13 +3686,13 @@ EMITTER(POW2_F32, MATCH(I, F32<>>)) { } static void Emit(X64Emitter& e, const EmitArgType& i) { assert_always(); - e.lea(e.r8, e.StashXmm(i.src1)); + e.lea(e.r8, e.StashXmm(0, i.src1)); e.CallNativeSafe(reinterpret_cast(EmulatePow2)); e.vmovaps(i.dest, e.xmm0); } }; EMITTER(POW2_F64, MATCH(I, F64<>>)) { - static __m128d EmulatePow2(__m128d src) { + static __m128d EmulatePow2(void*, __m128d src) { double src_value; _mm_store_sd(&src_value, src); double result = std::pow(2, src_value); @@ -3660,13 +3700,13 @@ EMITTER(POW2_F64, MATCH(I, F64<>>)) { } static void Emit(X64Emitter& e, const EmitArgType& i) { assert_always(); - e.lea(e.r8, e.StashXmm(i.src1)); + e.lea(e.r8, e.StashXmm(0, i.src1)); e.CallNativeSafe(reinterpret_cast(EmulatePow2)); e.vmovaps(i.dest, e.xmm0); } }; EMITTER(POW2_V128, MATCH(I, V128<>>)) { - static __m128 EmulatePow2(__m128 src) { + static __m128 EmulatePow2(void*, __m128 src) { alignas(16) float values[4]; _mm_store_ps(values, src); for (size_t i = 0; i < 4; ++i) { @@ -3675,7 +3715,7 @@ EMITTER(POW2_V128, MATCH(I, V128<>>)) { return _mm_load_ps(values); } static void Emit(X64Emitter& e, const EmitArgType& i) { - e.lea(e.r8, e.StashXmm(i.src1)); + e.lea(e.r8, e.StashXmm(0, i.src1)); e.CallNativeSafe(reinterpret_cast(EmulatePow2)); e.vmovaps(i.dest, e.xmm0); } @@ -3694,7 +3734,7 @@ EMITTER_OPCODE_TABLE( // http://jrfonseca.blogspot.com/2008/09/fast-sse2-pow-tables-or-polynomials.html // TODO(benvanik): this emulated fn destroys all xmm registers! don't do it! 
EMITTER(LOG2_F32, MATCH(I, F32<>>)) { - static __m128 EmulateLog2(__m128 src) { + static __m128 EmulateLog2(void*, __m128 src) { float src_value; _mm_store_ss(&src_value, src); float result = std::log2(src_value); @@ -3702,13 +3742,13 @@ EMITTER(LOG2_F32, MATCH(I, F32<>>)) { } static void Emit(X64Emitter& e, const EmitArgType& i) { assert_always(); - e.lea(e.r8, e.StashXmm(i.src1)); + e.lea(e.r8, e.StashXmm(0, i.src1)); e.CallNativeSafe(reinterpret_cast(EmulateLog2)); e.vmovaps(i.dest, e.xmm0); } }; EMITTER(LOG2_F64, MATCH(I, F64<>>)) { - static __m128d EmulateLog2(__m128d src) { + static __m128d EmulateLog2(void*, __m128d src) { double src_value; _mm_store_sd(&src_value, src); double result = std::log2(src_value); @@ -3716,13 +3756,13 @@ EMITTER(LOG2_F64, MATCH(I, F64<>>)) { } static void Emit(X64Emitter& e, const EmitArgType& i) { assert_always(); - e.lea(e.r8, e.StashXmm(i.src1)); + e.lea(e.r8, e.StashXmm(0, i.src1)); e.CallNativeSafe(reinterpret_cast(EmulateLog2)); e.vmovaps(i.dest, e.xmm0); } }; EMITTER(LOG2_V128, MATCH(I, V128<>>)) { - static __m128 EmulateLog2(__m128 src) { + static __m128 EmulateLog2(void*, __m128 src) { alignas(16) float values[4]; _mm_store_ps(values, src); for (size_t i = 0; i < 4; ++i) { @@ -3731,7 +3771,7 @@ EMITTER(LOG2_V128, MATCH(I, V128<>>)) { return _mm_load_ps(values); } static void Emit(X64Emitter& e, const EmitArgType& i) { - e.lea(e.r8, e.StashXmm(i.src1)); + e.lea(e.r8, e.StashXmm(0, i.src1)); e.CallNativeSafe(reinterpret_cast(EmulateLog2)); e.vmovaps(i.dest, e.xmm0); } @@ -4132,131 +4172,83 @@ EMITTER(VECTOR_SHL_V128, MATCH(I, V128<>, V128<>>)) { break; } } - static void EmitInt8(X64Emitter& e, const EmitArgType& i) { - if (i.src2.is_constant) { - const auto& shamt = i.src2.constant(); - bool all_same = true; - for (size_t n = 0; n < 16 - n; ++n) { - if (shamt.b16[n] != shamt.b16[n + 1]) { - all_same = false; - break; - } - } - if (all_same) { - // Every count is the same. - uint8_t sh = shamt.b16[0] & 0x7; - if (!sh) { - // No shift? - e.vmovaps(i.dest, i.src1); - } else { - // Even bytes. - e.vpsrlw(e.xmm0, i.src1, 8); - e.vpsllw(e.xmm0, sh + 8); - // Odd bytes. - e.vpsllw(i.dest, i.src1, 8); - e.vpsrlw(i.dest, 8 - sh); - // Mix. - e.vpor(i.dest, e.xmm0); - } - } else { - // Counts differ, so pre-mask and load constant. - assert_always(); - } - } else { - // Fully variable shift. - // TODO(benvanik): find a better sequence. 
- Xmm temp = i.dest; - if (i.dest == i.src1 || i.dest == i.src2) { - temp = e.xmm2; - } - auto byte_mask = e.GetXmmConstPtr(XMMShiftByteMask); - // AABBCCDD|EEFFGGHH|IIJJKKLL|MMNNOOPP - // DD| HH| LL| PP - e.vpand(e.xmm0, i.src1, byte_mask); - e.vpand(e.xmm1, i.src2, byte_mask); - e.vpsllvd(temp, e.xmm0, e.xmm1); - // CC | GG | KK | OO - e.vpsrld(e.xmm0, i.src1, 8); - e.vpand(e.xmm0, byte_mask); - e.vpsrld(e.xmm1, i.src2, 8); - e.vpand(e.xmm1, byte_mask); - e.vpsllvd(e.xmm0, e.xmm0, e.xmm1); - e.vpslld(e.xmm0, 8); - e.vpor(temp, e.xmm0); - // BB | FF | JJ | NN - e.vpsrld(e.xmm0, i.src1, 16); - e.vpand(e.xmm0, byte_mask); - e.vpsrld(e.xmm1, i.src2, 16); - e.vpand(e.xmm1, byte_mask); - e.vpsllvd(e.xmm0, e.xmm0, e.xmm1); - e.vpslld(e.xmm0, 16); - e.vpor(temp, e.xmm0); - // AA |EE |II |MM - e.vpsrld(e.xmm0, i.src1, 24); - e.vpand(e.xmm0, byte_mask); - e.vpsrld(e.xmm1, i.src2, 24); - e.vpand(e.xmm1, byte_mask); - e.vpsllvd(e.xmm0, e.xmm0, e.xmm1); - e.vpslld(e.xmm0, 24); - e.vpor(i.dest, temp, e.xmm0); + static __m128i EmulateVectorShlI8(void*, __m128i src1, __m128i src2) { + alignas(16) uint8_t value[16]; + alignas(16) uint8_t shamt[16]; + _mm_store_si128(reinterpret_cast<__m128i*>(value), src1); + _mm_store_si128(reinterpret_cast<__m128i*>(shamt), src2); + for (size_t i = 0; i < 16; ++i) { + value[i] = value[i] << (shamt[i] & 0x7); } + return _mm_load_si128(reinterpret_cast<__m128i*>(value)); + } + static void EmitInt8(X64Emitter& e, const EmitArgType& i) { + // TODO(benvanik): native version (with shift magic). + if (i.src2.is_constant) { + e.LoadConstantXmm(e.xmm0, i.src2.constant()); + e.lea(e.r9, e.StashXmm(1, e.xmm0)); + } else { + e.lea(e.r9, e.StashXmm(1, i.src2)); + } + e.lea(e.r8, e.StashXmm(0, i.src1)); + e.CallNativeSafe(reinterpret_cast(EmulateVectorShlI8)); + e.vmovaps(i.dest, e.xmm0); + } + static __m128i EmulateVectorShlI16(void*, __m128i src1, __m128i src2) { + alignas(16) uint16_t value[8]; + alignas(16) uint16_t shamt[8]; + _mm_store_si128(reinterpret_cast<__m128i*>(value), src1); + _mm_store_si128(reinterpret_cast<__m128i*>(shamt), src2); + for (size_t i = 0; i < 8; ++i) { + value[i] = value[i] << (shamt[i] & 0xF); + } + return _mm_load_si128(reinterpret_cast<__m128i*>(value)); } static void EmitInt16(X64Emitter& e, const EmitArgType& i) { if (i.src2.is_constant) { const auto& shamt = i.src2.constant(); bool all_same = true; for (size_t n = 0; n < 8 - n; ++n) { - if (shamt.s8[n] != shamt.s8[n + 1]) { + if (shamt.u16[n] != shamt.u16[n + 1]) { all_same = false; break; } } if (all_same) { // Every count is the same, so we can use vpsllw. - e.vpsllw(i.dest, i.src1, shamt.s8[0] & 0xF); - } else { - // Counts differ, so pre-mask and load constant. - assert_always(); + e.vpsllw(i.dest, i.src1, shamt.u16[0] & 0xF); + return; } - } else { - // Fully variable shift. - // TODO(benvanik): find a better sequence. - Xmm src1 = !i.src1.is_constant ? i.src1 : e.xmm2; - if (i.src1.is_constant) { - e.LoadConstantXmm(src1, i.src1.constant()); - } - // Even: - e.vpand(e.xmm0, i.src2, e.GetXmmConstPtr(XMMShiftMaskEvenPI16)); - e.vpsllvd(e.xmm1, src1, e.xmm0); - e.vpand(e.xmm1, e.GetXmmConstPtr(XMMMaskEvenPI16)); - // Odd: - e.vpsrld(e.xmm0, i.src2, 16); - e.vpand(e.xmm0, e.GetXmmConstPtr(XMMShiftMaskEvenPI16)); - e.vpsrld(i.dest, src1, 16); - e.vpsllvd(i.dest, i.dest, e.xmm0); - e.vpslld(i.dest, 8); - // Merge: - e.vpor(i.dest, e.xmm1); } + // TODO(benvanik): native version (with shift magic). 
+ if (i.src2.is_constant) { + e.LoadConstantXmm(e.xmm0, i.src2.constant()); + e.lea(e.r9, e.StashXmm(1, e.xmm0)); + } else { + e.lea(e.r9, e.StashXmm(1, i.src2)); + } + e.lea(e.r8, e.StashXmm(0, i.src1)); + e.CallNativeSafe(reinterpret_cast(EmulateVectorShlI16)); + e.vmovaps(i.dest, e.xmm0); } static void EmitInt32(X64Emitter& e, const EmitArgType& i) { if (i.src2.is_constant) { const auto& shamt = i.src2.constant(); bool all_same = true; for (size_t n = 0; n < 4 - n; ++n) { - if (shamt.i4[n] != shamt.i4[n + 1]) { + if (shamt.u32[n] != shamt.u32[n + 1]) { all_same = false; break; } } if (all_same) { // Every count is the same, so we can use vpslld. - e.vpslld(i.dest, i.src1, shamt.b16[0] & 0x1F); + e.vpslld(i.dest, i.src1, shamt.u8[0] & 0x1F); } else { // Counts differ, so pre-mask and load constant. vec128_t masked = i.src2.constant(); for (size_t n = 0; n < 4; ++n) { - masked.i4[n] &= 0x1F; + masked.u32[n] &= 0x1F; } e.LoadConstantXmm(e.xmm0, masked); e.vpsllvd(i.dest, i.src1, e.xmm0); @@ -4295,81 +4287,83 @@ EMITTER(VECTOR_SHR_V128, MATCH(I, V128<>, V128<>>)) { break; } } - static void EmitInt8(X64Emitter& e, const EmitArgType& i) { - if (i.src2.is_constant) { - const auto& shamt = i.src2.constant(); - bool all_same = true; - for (size_t n = 0; n < 16 - n; ++n) { - if (shamt.b16[n] != shamt.b16[n + 1]) { - all_same = false; - break; - } - } - if (all_same) { - // Every count is the same. - uint8_t sh = shamt.b16[0] & 0x7; - if (!sh) { - // No shift? - e.vmovaps(i.dest, i.src1); - } else { - // Even bytes. - e.vpsllw(e.xmm0, i.src1, 8); - e.vpsrlw(e.xmm0, sh + 8); - // Odd bytes. - e.vpsrlw(i.dest, i.src1, 8); - e.vpsllw(i.dest, 8 - sh); - // Mix. - e.vpor(i.dest, e.xmm0); - } - } else { - // Counts differ, so pre-mask and load constant. - assert_always(); - } - } else { - // Fully variable shift. - assert_always(); + static __m128i EmulateVectorShrI8(void*, __m128i src1, __m128i src2) { + alignas(16) uint8_t value[16]; + alignas(16) uint8_t shamt[16]; + _mm_store_si128(reinterpret_cast<__m128i*>(value), src1); + _mm_store_si128(reinterpret_cast<__m128i*>(shamt), src2); + for (size_t i = 0; i < 16; ++i) { + value[i] = value[i] >> (shamt[i] & 0x7); } + return _mm_load_si128(reinterpret_cast<__m128i*>(value)); + } + static void EmitInt8(X64Emitter& e, const EmitArgType& i) { + // TODO(benvanik): native version (with shift magic). + if (i.src2.is_constant) { + e.LoadConstantXmm(e.xmm0, i.src2.constant()); + e.lea(e.r9, e.StashXmm(1, e.xmm0)); + } else { + e.lea(e.r9, e.StashXmm(1, i.src2)); + } + e.lea(e.r8, e.StashXmm(0, i.src1)); + e.CallNativeSafe(reinterpret_cast(EmulateVectorShrI8)); + e.vmovaps(i.dest, e.xmm0); + } + static __m128i EmulateVectorShrI16(void*, __m128i src1, __m128i src2) { + alignas(16) uint16_t value[8]; + alignas(16) uint16_t shamt[8]; + _mm_store_si128(reinterpret_cast<__m128i*>(value), src1); + _mm_store_si128(reinterpret_cast<__m128i*>(shamt), src2); + for (size_t i = 0; i < 8; ++i) { + value[i] = value[i] >> (shamt[i] & 0xF); + } + return _mm_load_si128(reinterpret_cast<__m128i*>(value)); } static void EmitInt16(X64Emitter& e, const EmitArgType& i) { if (i.src2.is_constant) { const auto& shamt = i.src2.constant(); bool all_same = true; for (size_t n = 0; n < 8 - n; ++n) { - if (shamt.s8[n] != shamt.s8[n + 1]) { + if (shamt.u16[n] != shamt.u16[n + 1]) { all_same = false; break; } } if (all_same) { // Every count is the same, so we can use vpsllw. - e.vpsrlw(i.dest, i.src1, shamt.s8[0] & 0xF); - } else { - // Counts differ, so pre-mask and load constant. 
- assert_always(); + e.vpsrlw(i.dest, i.src1, shamt.u16[0] & 0xF); + return; } - } else { - // Fully variable shift. - assert_always(); } + // TODO(benvanik): native version (with shift magic). + if (i.src2.is_constant) { + e.LoadConstantXmm(e.xmm0, i.src2.constant()); + e.lea(e.r9, e.StashXmm(1, e.xmm0)); + } else { + e.lea(e.r9, e.StashXmm(1, i.src2)); + } + e.lea(e.r8, e.StashXmm(0, i.src1)); + e.CallNativeSafe(reinterpret_cast(EmulateVectorShrI16)); + e.vmovaps(i.dest, e.xmm0); } static void EmitInt32(X64Emitter& e, const EmitArgType& i) { if (i.src2.is_constant) { const auto& shamt = i.src2.constant(); bool all_same = true; for (size_t n = 0; n < 4 - n; ++n) { - if (shamt.i4[n] != shamt.i4[n + 1]) { + if (shamt.u32[n] != shamt.u32[n + 1]) { all_same = false; break; } } if (all_same) { // Every count is the same, so we can use vpslld. - e.vpsrld(i.dest, i.src1, shamt.b16[0] & 0x1F); + e.vpsrld(i.dest, i.src1, shamt.u8[0] & 0x1F); } else { // Counts differ, so pre-mask and load constant. vec128_t masked = i.src2.constant(); for (size_t n = 0; n < 4; ++n) { - masked.i4[n] &= 0x1F; + masked.u32[n] &= 0x1F; } e.LoadConstantXmm(e.xmm0, masked); e.vpsrlvd(i.dest, i.src1, e.xmm0); @@ -4392,26 +4386,61 @@ EMITTER_OPCODE_TABLE( // OPCODE_VECTOR_SHA // ============================================================================ EMITTER(VECTOR_SHA_V128, MATCH(I, V128<>, V128<>>)) { + static __m128i EmulateVectorShaI8(void*, __m128i src1, __m128i src2) { + alignas(16) int8_t value[16]; + alignas(16) int8_t shamt[16]; + _mm_store_si128(reinterpret_cast<__m128i*>(value), src1); + _mm_store_si128(reinterpret_cast<__m128i*>(shamt), src2); + for (size_t i = 0; i < 16; ++i) { + value[i] = value[i] >> (shamt[i] & 0x7); + } + return _mm_load_si128(reinterpret_cast<__m128i*>(value)); + } + static __m128i EmulateVectorShaI16(void*, __m128i src1, __m128i src2) { + alignas(16) int16_t value[8]; + alignas(16) int16_t shamt[8]; + _mm_store_si128(reinterpret_cast<__m128i*>(value), src1); + _mm_store_si128(reinterpret_cast<__m128i*>(shamt), src2); + for (size_t i = 0; i < 8; ++i) { + value[i] = value[i] >> (shamt[i] & 0xF); + } + return _mm_load_si128(reinterpret_cast<__m128i*>(value)); + } static void Emit(X64Emitter& e, const EmitArgType& i) { switch (i.instr->flags) { + case INT8_TYPE: + // TODO(benvanik): native version (with shift magic). + if (i.src2.is_constant) { + e.LoadConstantXmm(e.xmm0, i.src2.constant()); + e.lea(e.r9, e.StashXmm(1, e.xmm0)); + } else { + e.lea(e.r9, e.StashXmm(1, i.src2)); + } + e.lea(e.r8, e.StashXmm(0, i.src1)); + e.CallNativeSafe(reinterpret_cast(EmulateVectorShaI8)); + e.vmovaps(i.dest, e.xmm0); + break; case INT16_TYPE: - // Even halfwords: - e.vpand(e.xmm0, i.src2, e.GetXmmConstPtr(XMMShiftMaskEvenPI16)); - e.vpslld(e.xmm1, i.src1, 16); - e.vpsrad(e.xmm1, 8); - e.vpsravd(e.xmm1, e.xmm1, e.xmm0); - // Odd halfwords: - e.vpsrld(e.xmm0, i.src2, 16); - e.vpand(e.xmm0, e.GetXmmConstPtr(XMMShiftMaskEvenPI16)); - e.vpslld(i.dest, i.src1, 16); - e.vpsravd(i.dest, i.dest, e.xmm0); - // Merge: - e.vpor(i.dest, e.xmm1); + // TODO(benvanik): native version (with shift magic). + if (i.src2.is_constant) { + e.LoadConstantXmm(e.xmm0, i.src2.constant()); + e.lea(e.r9, e.StashXmm(1, e.xmm0)); + } else { + e.lea(e.r9, e.StashXmm(1, i.src2)); + } + e.lea(e.r8, e.StashXmm(0, i.src1)); + e.CallNativeSafe(reinterpret_cast(EmulateVectorShaI16)); + e.vmovaps(i.dest, e.xmm0); break; case INT32_TYPE: // src shift mask may have values >31, and x86 sets to zero when // that happens so we mask. 
- e.vandps(e.xmm0, i.src2, e.GetXmmConstPtr(XMMShiftMaskPS)); + if (i.src2.is_constant) { + e.LoadConstantXmm(e.xmm0, i.src2.constant()); + e.vandps(e.xmm0, e.GetXmmConstPtr(XMMShiftMaskPS)); + } else { + e.vandps(e.xmm0, i.src2, e.GetXmmConstPtr(XMMShiftMaskPS)); + } e.vpsravd(i.dest, i.src1, e.xmm0); break; default: @@ -4490,39 +4519,39 @@ EMITTER_OPCODE_TABLE( // ============================================================================ // TODO(benvanik): AVX512 has a native variable rotate (rolv). EMITTER(VECTOR_ROTATE_LEFT_V128, MATCH(I, V128<>, V128<>>)) { - static __m128i EmulateVectorRotateLeftI8(__m128i src1, __m128i src2) { + static __m128i EmulateVectorRotateLeftI8(void*, __m128i src1, __m128i src2) { alignas(16) uint8_t value[16]; alignas(16) uint8_t shamt[16]; - _mm_store_si128(reinterpret_cast<__m128i*>(&value), src1); - _mm_store_si128(reinterpret_cast<__m128i*>(&shamt), src2); + _mm_store_si128(reinterpret_cast<__m128i*>(value), src1); + _mm_store_si128(reinterpret_cast<__m128i*>(shamt), src2); for (size_t i = 0; i < 16; ++i) { - value[i] = poly::rotate_left(value[i], shamt[i] & 0x3); + value[i] = poly::rotate_left(value[i], shamt[i] & 0x7); } - return _mm_load_si128(reinterpret_cast<__m128i*>(&value)); + return _mm_load_si128(reinterpret_cast<__m128i*>(value)); } - static __m128i EmulateVectorRotateLeftI16(__m128i src1, __m128i src2) { + static __m128i EmulateVectorRotateLeftI16(void*, __m128i src1, __m128i src2) { alignas(16) uint16_t value[8]; alignas(16) uint16_t shamt[8]; - _mm_store_si128(reinterpret_cast<__m128i*>(&value), src1); - _mm_store_si128(reinterpret_cast<__m128i*>(&shamt), src2); + _mm_store_si128(reinterpret_cast<__m128i*>(value), src1); + _mm_store_si128(reinterpret_cast<__m128i*>(shamt), src2); for (size_t i = 0; i < 8; ++i) { value[i] = poly::rotate_left(value[i], shamt[i] & 0xF); } - return _mm_load_si128(reinterpret_cast<__m128i*>(&value)); + return _mm_load_si128(reinterpret_cast<__m128i*>(value)); } static void Emit(X64Emitter& e, const EmitArgType& i) { switch (i.instr->flags) { case INT8_TYPE: // TODO(benvanik): native version (with shift magic). - e.lea(e.r8, e.StashXmm(i.src1)); - e.lea(e.r9, e.StashXmm(i.src2)); + e.lea(e.r8, e.StashXmm(0, i.src1)); + e.lea(e.r9, e.StashXmm(1, i.src2)); e.CallNativeSafe(reinterpret_cast(EmulateVectorRotateLeftI8)); e.vmovaps(i.dest, e.xmm0); break; case INT16_TYPE: // TODO(benvanik): native version (with shift magic). 
- e.lea(e.r8, e.StashXmm(i.src1)); - e.lea(e.r9, e.StashXmm(i.src2)); + e.lea(e.r8, e.StashXmm(0, i.src1)); + e.lea(e.r9, e.StashXmm(1, i.src2)); e.CallNativeSafe(reinterpret_cast(EmulateVectorRotateLeftI16)); e.vmovaps(i.dest, e.xmm0); break; @@ -4537,7 +4566,7 @@ EMITTER(VECTOR_ROTATE_LEFT_V128, MATCH(I, V128 // Shift right (to get low bits): e.vmovaps(temp, e.GetXmmConstPtr(XMMPI32)); e.vpsubd(temp, e.xmm0); - e.vpsrlvd(i.dest, i.src1, e.xmm0); + e.vpsrlvd(i.dest, i.src1, temp); // Merge: e.vpor(i.dest, e.xmm1); break; @@ -4629,6 +4658,23 @@ EMITTER_OPCODE_TABLE( // ============================================================================ // OPCODE_INSERT // ============================================================================ +EMITTER(INSERT_I8, MATCH(I, V128<>, I64<>, I8<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + } +}; +EMITTER(INSERT_I16, MATCH(I, V128<>, I64<>, I16<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + } +}; +EMITTER(INSERT_I32, MATCH(I, V128<>, I64<>, I32<>>)) { + static void Emit(X64Emitter& e, const EmitArgType& i) { + } +}; +EMITTER_OPCODE_TABLE( + OPCODE_INSERT, + INSERT_I8, + INSERT_I16, + INSERT_I32); // ============================================================================ @@ -4643,14 +4689,13 @@ EMITTER(EXTRACT_I8, MATCH(I, V128<>, I8<>>)) { if (i.src2.is_constant) { e.vpextrb(i.dest.reg().cvt32(), i.src1, VEC128_B(i.src2.constant())); } else { - assert_always(); - // TODO(benvanik): try out hlide's version: - // e.mov(e.eax, 0x80808003); - // e.xor(e.al, i.src2); - // e.and(e.al, 15); - // e.vmovd(e.xmm0, e.eax); - // e.vpshufb(e.xmm0, i.src1, e.xmm0); - // e.vmovd(i.dest.reg().cvt32(), e.xmm0); + e.mov(e.eax, 0x00000003); + e.xor(e.al, i.src2); + e.and(e.al, 0x1F); + e.vmovd(e.xmm0, e.eax); + e.vpshufb(e.xmm0, i.src1, e.xmm0); + e.vmovd(i.dest.reg().cvt32(), e.xmm0); + e.and(i.dest, uint8_t(0xFF)); } } }; @@ -4659,20 +4704,21 @@ EMITTER(EXTRACT_I16, MATCH(I, V128<>, I8<>>)) { if (i.src2.is_constant) { e.vpextrw(i.dest.reg().cvt32(), i.src1, VEC128_W(i.src2.constant())); } else { - // TODO(benvanik): try out hlide's version: e.mov(e.al, i.src2); - e.xor(e.al, 0x1); + e.xor(e.al, 0x01); + e.shl(e.al, 1); e.mov(e.ah, e.al); e.add(e.ah, 1); e.vmovd(e.xmm0, e.eax); e.vpshufb(e.xmm0, i.src1, e.xmm0); e.vmovd(i.dest.reg().cvt32(), e.xmm0); + e.and(i.dest.reg().cvt32(), 0xFFFFu); } } }; EMITTER(EXTRACT_I32, MATCH(I, V128<>, I8<>>)) { static void Emit(X64Emitter& e, const EmitArgType& i) { - static vec128_t extract_table_32[4] = { + static const vec128_t extract_table_32[4] = { vec128b( 3, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), vec128b( 7, 6, 5, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), vec128b(11, 10, 9, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), @@ -4706,29 +4752,11 @@ EMITTER(EXTRACT_I32, MATCH(I, V128<>, I8<>>)) { } } }; -EMITTER(EXTRACT_F32, MATCH(I, V128<>, I8<>>)) { - static void Emit(X64Emitter& e, const EmitArgType& i) { - if (i.src2.is_constant) { - e.vextractps(i.dest, i.src1, VEC128_F(i.src2.constant())); - } else { - assert_always(); - // TODO(benvanik): try out hlide's version: - // e.mov(e.eax, 3); - // e.and(e.al, i.src2); // eax = [(i&3), 0, 0, 0] - // e.imul(e.eax, 0x04040404); // [(i&3)*4, (i&3)*4, (i&3)*4, (i&3)*4] - // e.add(e.eax, 0x00010203); // [((i&3)*4)+3, ((i&3)*4)+2, ((i&3)*4)+1, ((i&3)*4)+0] - // e.vmovd(e.xmm0, e.eax); - // e.vpshufb(e.xmm0, i.src1, e.xmm0); - // e.vmovd(i.dest, e.xmm0); - } - } -}; EMITTER_OPCODE_TABLE( OPCODE_EXTRACT, EXTRACT_I8, EXTRACT_I16, - EXTRACT_I32, - 
EXTRACT_F32); + EXTRACT_I32); // ============================================================================ @@ -4805,15 +4833,15 @@ EMITTER(PERMUTE_I32, MATCH(I, I32<>, V128<>, V128<>>)) { // Shuffle things into the right places in dest & xmm0, // then we blend them together. uint32_t src_control = - (((control >> 24) & 0x3) << 0) | - (((control >> 16) & 0x3) << 2) | - (((control >> 8) & 0x3) << 4) | - (((control >> 0) & 0x3) << 6); + (((control >> 24) & 0x3) << 6) | + (((control >> 16) & 0x3) << 4) | + (((control >> 8) & 0x3) << 2) | + (((control >> 0) & 0x3) << 0); uint32_t blend_control = - (((control >> 26) & 0x1) << 0) | - (((control >> 18) & 0x1) << 1) | - (((control >> 10) & 0x1) << 2) | - (((control >> 2) & 0x1) << 3); + (((control >> 26) & 0x1) << 3) | + (((control >> 18) & 0x1) << 2) | + (((control >> 10) & 0x1) << 1) | + (((control >> 2) & 0x1) << 0); // TODO(benvanik): if src2/src3 are constants, shuffle now! Xmm src2; if (i.src2.is_constant) { @@ -4923,11 +4951,6 @@ EMITTER(SWIZZLE, MATCH(I, V128<>, OffsetOp>)) { assert_always(); } else if (element_type == INT32_TYPE || element_type == FLOAT32_TYPE) { uint8_t swizzle_mask = static_cast(i.src2.value); - swizzle_mask = - (((swizzle_mask >> 6) & 0x3) << 0) | - (((swizzle_mask >> 4) & 0x3) << 2) | - (((swizzle_mask >> 2) & 0x3) << 4) | - (((swizzle_mask >> 0) & 0x3) << 6); e.vpshufd(i.dest, i.src1, swizzle_mask); } else if (element_type == INT64_TYPE || element_type == FLOAT64_TYPE) { assert_always(); @@ -4976,54 +4999,40 @@ EMITTER(PACK, MATCH(I, V128<>>)) { } static void EmitD3DCOLOR(X64Emitter& e, const EmitArgType& i) { // RGBA (XYZW) -> ARGB (WXYZ) - // float r = roundf(((src1.x < 0) ? 0 : ((1 < src1.x) ? 1 : src1.x)) * 255); - // float g = roundf(((src1.y < 0) ? 0 : ((1 < src1.y) ? 1 : src1.y)) * 255); - // float b = roundf(((src1.z < 0) ? 0 : ((1 < src1.z) ? 1 : src1.z)) * 255); - // float a = roundf(((src1.w < 0) ? 0 : ((1 < src1.w) ? 
1 : src1.w)) * 255); - // dest.iw = ((uint32_t)a << 24) | - // ((uint32_t)r << 16) | - // ((uint32_t)g << 8) | - // ((uint32_t)b); - // f2i(clamp(src, 0, 1) * 255) - e.vpxor(e.xmm0, e.xmm0); + // w = ((src1.uw & 0xFF) << 24) | ((src1.ux & 0xFF) << 16) | + // ((src1.uy & 0xFF) << 8) | (src1.uz & 0xFF) if (i.src1.is_constant) { - e.LoadConstantXmm(e.xmm1, i.src1.constant()); - e.vmaxps(e.xmm0, e.xmm1); + e.LoadConstantXmm(i.dest, i.src1.constant()); + e.vpshufb(i.dest, i.dest, e.GetXmmConstPtr(XMMPackD3DCOLOR)); } else { - e.vmaxps(e.xmm0, i.src1); + e.vpshufb(i.dest, i.src1, e.GetXmmConstPtr(XMMPackD3DCOLOR)); } - e.vminps(e.xmm0, e.GetXmmConstPtr(XMMOne)); - e.vmulps(e.xmm0, e.GetXmmConstPtr(XMM255)); - e.vcvttps2dq(e.xmm0, e.xmm0); - e.vpshufb(i.dest, e.xmm0, e.GetXmmConstPtr(XMMPackD3DCOLOR)); } static void EmitFLOAT16_2(X64Emitter& e, const EmitArgType& i) { // http://blogs.msdn.com/b/chuckw/archive/2012/09/11/directxmath-f16c-and-fma.aspx // dest = [(src1.x | src1.y), 0, 0, 0] // 0|0|0|0|W|Z|Y|X - e.vcvtps2ph(e.xmm0, i.src1, B00000011); - // Y|X|W|Z|0|0|0|0 - e.vpshufd(e.xmm0, e.xmm0, B00011011); - // Shuffle to X|Y|Z|W|0|0|0|0 - e.vpshufhw(e.xmm0, e.xmm0, B10110001); - // Select just X|Y - e.vxorps(i.dest, i.dest); - e.vpblendw(i.dest, e.xmm0, B11000000); + e.vcvtps2ph(i.dest, i.dest, B00000011); + // Shuffle to X|Y|0|0|0|0|0|0 + e.vpshufb(i.dest, i.dest, e.GetXmmConstPtr(XMMPackFLOAT16_2)); } static void EmitFLOAT16_4(X64Emitter& e, const EmitArgType& i) { // dest = [(src1.x | src1.y), (src1.z | src1.w), 0, 0] // 0|0|0|0|W|Z|Y|X e.vcvtps2ph(e.xmm0, i.src1, B00000011); - // Y|X|W|Z|0|0|0|0 - e.vpshufd(e.xmm0, e.xmm0, B00011011); // Shuffle to X|Y|Z|W|0|0|0|0 - e.vpshufhw(e.xmm0, e.xmm0, B10110001); - // Select just X|Y|Z|W - e.vxorps(i.dest, i.dest); - e.vpblendw(i.dest, e.xmm0, B11110000); + e.vpshufb(i.dest, i.dest, e.GetXmmConstPtr(XMMPackFLOAT16_4)); } static void EmitSHORT_2(X64Emitter& e, const EmitArgType& i) { - assert_always(); + // Saturate. + e.vmaxps(i.dest, i.src1, e.GetXmmConstPtr(XMMNegativeOne)); + e.vminps(i.dest, i.dest, e.GetXmmConstPtr(XMMOne)); + // Multiply by SHRT_MAX. + e.vmulps(i.dest, i.dest, e.GetXmmConstPtr(XMMShortMaxPS)); + // Convert to int32. + e.vcvtps2dq(i.dest, i.dest); + // Pack. + e.vpshufb(i.dest, i.dest, e.GetXmmConstPtr(XMMPackSHORT_2)); } static void EmitS8_IN_16_LO(X64Emitter& e, const EmitArgType& i) { assert_always(); @@ -5079,23 +5088,14 @@ EMITTER(UNPACK, MATCH(I, V128<>>)) { static void EmitD3DCOLOR(X64Emitter& e, const EmitArgType& i) { // ARGB (WXYZ) -> RGBA (XYZW) // XMLoadColor - // int32_t src = (int32_t)src1.iw; - // dest.f4[0] = (float)((src >> 16) & 0xFF) * (1.0f / 255.0f); - // dest.f4[1] = (float)((src >> 8) & 0xFF) * (1.0f / 255.0f); - // dest.f4[2] = (float)(src & 0xFF) * (1.0f / 255.0f); - // dest.f4[3] = (float)((src >> 24) & 0xFF) * (1.0f / 255.0f); if (i.src1.is_constant) { - e.vpxor(i.dest, i.dest); - return; + assert_always(); } - // src = ZZYYXXWW - // unpack to 000000ZZ,000000YY,000000XX,000000WW + // Unpack to 000000ZZ,000000YY,000000XX,000000WW e.vpshufb(i.dest, i.src1, e.GetXmmConstPtr(XMMUnpackD3DCOLOR)); - // int -> float - e.vcvtdq2ps(i.dest, i.dest); - // mult by 1/255 - e.vmulps(i.dest, e.GetXmmConstPtr(XMMOneOver255)); + // Add 1.0f to each. 
+ e.vpor(i.dest, e.GetXmmConstPtr(XMMOne)); } static void EmitFLOAT16_2(X64Emitter& e, const EmitArgType& i) { // 1 bit sign, 5 bit exponent, 10 bit mantissa @@ -5114,13 +5114,17 @@ EMITTER(UNPACK, MATCH(I, V128<>>)) { // XMConvertHalfToFloat(sy), // 0.0, // 1.0 }; - e.vcvtph2ps(i.dest, i.src1); + // Shuffle to 0|0|0|0|0|0|Y|X + e.vpshufb(i.dest, i.src1, e.GetXmmConstPtr(XMMUnpackFLOAT16_2)); + e.vcvtph2ps(i.dest, i.dest); e.vpshufd(i.dest, i.dest, B10100100); e.vpor(i.dest, e.GetXmmConstPtr(XMM0001)); } static void EmitFLOAT16_4(X64Emitter& e, const EmitArgType& i) { // src = [(dest.x | dest.y), (dest.z | dest.w), 0, 0] - e.vcvtph2ps(i.dest, i.src1); + // Shuffle to 0|0|0|0|W|Z|Y|X + e.vpshufb(i.dest, i.src1, e.GetXmmConstPtr(XMMUnpackFLOAT16_4)); + e.vcvtph2ps(i.dest, i.dest); } static void EmitSHORT_2(X64Emitter& e, const EmitArgType& i) { // (VD.x) = 3.0 + (VB.x>>16)*2^-22 @@ -5131,31 +5135,23 @@ EMITTER(UNPACK, MATCH(I, V128<>>)) { // XMLoadShortN2 plus 3,3,0,3 (for some reason) // src is (xx,xx,xx,VALUE) // (VALUE,VALUE,VALUE,VALUE) + Xmm src; if (i.src1.is_constant) { if (i.src1.value->IsConstantZero()) { - e.vpxor(i.dest, i.dest); + e.vmovdqa(i.dest, e.GetXmmConstPtr(XMM3301)); + return; } else { - // TODO(benvanik): check other common constants. - e.LoadConstantXmm(i.dest, i.src1.constant()); - e.vbroadcastss(i.dest, i.src1); + // TODO(benvanik): check other common constants/perform shuffle/or here. + src = e.xmm0; + e.LoadConstantXmm(src, i.src1.constant()); } } else { - e.vbroadcastss(i.dest, i.src1); + src = i.src1; } - // (VALUE&0xFFFF,VALUE&0xFFFF0000,0,0) - e.vandps(i.dest, e.GetXmmConstPtr(XMMMaskX16Y16)); - // Sign extend. - e.vxorps(i.dest, e.GetXmmConstPtr(XMMFlipX16Y16)); - // Convert int->float. - e.cvtpi2ps(i.dest, e.StashXmm(i.dest)); - // 0x8000 to undo sign. - e.vaddps(i.dest, e.GetXmmConstPtr(XMMFixX16Y16)); - // Normalize. - e.vmulps(i.dest, e.GetXmmConstPtr(XMMNormalizeX16Y16)); - // Clamp. - e.vmaxps(i.dest, e.GetXmmConstPtr(XMMNegativeOne)); + // Shuffle bytes. + e.vpshufb(i.dest, src, e.GetXmmConstPtr(XMMUnpackSHORT_2)); // Add 3,3,0,1. 
- e.vaddps(i.dest, e.GetXmmConstPtr(XMM3301)); + e.vpor(i.dest, e.GetXmmConstPtr(XMM3301)); } static void EmitS8_IN_16_LO(X64Emitter& e, const EmitArgType& i) { e.vpunpckhbw(i.dest, i.src1, i.src1); @@ -5368,7 +5364,7 @@ void RegisterSequences() { REGISTER_EMITTER_OPCODE_TABLE(OPCODE_VECTOR_ROTATE_LEFT); REGISTER_EMITTER_OPCODE_TABLE(OPCODE_BYTE_SWAP); REGISTER_EMITTER_OPCODE_TABLE(OPCODE_CNTLZ); - //REGISTER_EMITTER_OPCODE_TABLE(OPCODE_INSERT); + REGISTER_EMITTER_OPCODE_TABLE(OPCODE_INSERT); REGISTER_EMITTER_OPCODE_TABLE(OPCODE_EXTRACT); REGISTER_EMITTER_OPCODE_TABLE(OPCODE_SPLAT); REGISTER_EMITTER_OPCODE_TABLE(OPCODE_PERMUTE); diff --git a/src/alloy/frontend/ppc/ppc_emit_altivec.cc b/src/alloy/frontend/ppc/ppc_emit_altivec.cc index b99c003b8..479c94f79 100644 --- a/src/alloy/frontend/ppc/ppc_emit_altivec.cc +++ b/src/alloy/frontend/ppc/ppc_emit_altivec.cc @@ -28,6 +28,7 @@ Value* CalculateEA_0(PPCHIRBuilder& f, uint32_t ra, uint32_t rb); // Most of this file comes from: // http://biallas.net/doc/vmx128/vmx128.txt // https://github.com/kakaroto/ps3ida/blob/master/plugins/PPCAltivec/src/main.cpp +// http://sannybuilder.com/forums/viewtopic.php?id=190 #define OP(x) ((((uint32_t)(x)) & 0x3f) << 26) #define VX128(op, xop) (OP(op) | (((uint32_t)(xop)) & 0x3d0)) @@ -154,7 +155,7 @@ XEEMITTER(lvxl128, VX128_1(4, 707), VX128_1)(PPCHIRBuilder& f, InstrData& i) { XEEMITTER(stvebx, 0x7C00010E, X)(PPCHIRBuilder& f, InstrData& i) { Value* ea = CalculateEA_0(f, i.X.RA, i.X.RB); - Value* el = f.And(ea, f.LoadConstant(0xFull)); + Value* el = f.And(f.Truncate(ea, INT8_TYPE), f.LoadConstant(uint8_t(0xF))); Value* v = f.Extract(f.LoadVR(i.X.RT), el, INT8_TYPE); f.Store(ea, v); return 0; @@ -163,7 +164,8 @@ XEEMITTER(stvebx, 0x7C00010E, X)(PPCHIRBuilder& f, InstrData& i) { XEEMITTER(stvehx, 0x7C00014E, X)(PPCHIRBuilder& f, InstrData& i) { Value* ea = CalculateEA_0(f, i.X.RA, i.X.RB); ea = f.And(ea, f.LoadConstant(~0x1ull)); - Value* el = f.Shr(f.And(ea, f.LoadConstant(0xFull)), 1); + Value* el = + f.Shr(f.And(f.Truncate(ea, INT8_TYPE), f.LoadConstant(uint8_t(0xF))), 1); Value* v = f.Extract(f.LoadVR(i.X.RT), el, INT16_TYPE); f.Store(ea, f.ByteSwap(v)); return 0; @@ -173,7 +175,8 @@ int InstrEmit_stvewx_(PPCHIRBuilder& f, InstrData& i, uint32_t vd, uint32_t ra, uint32_t rb) { Value* ea = CalculateEA_0(f, ra, rb); ea = f.And(ea, f.LoadConstant(~0x3ull)); - Value* el = f.Shr(f.And(ea, f.LoadConstant(0xFull)), 2); + Value* el = + f.Shr(f.And(f.Truncate(ea, INT8_TYPE), f.LoadConstant(uint8_t(0xF))), 2); Value* v = f.Extract(f.LoadVR(vd), el, INT32_TYPE); f.Store(ea, f.ByteSwap(v)); return 0; @@ -239,8 +242,8 @@ int InstrEmit_lvrx_(PPCHIRBuilder& f, InstrData& i, uint32_t vd, uint32_t ra, ea = f.And(ea, f.LoadConstant(~0xFull)); // v = (new >> (16 - eb)) Value* v = f.Permute(f.LoadVectorShr(f.Sub(f.LoadConstant((int8_t)16), eb)), - f.LoadZero(VEC128_TYPE), - f.ByteSwap(f.Load(ea, VEC128_TYPE)), INT8_TYPE); + f.ByteSwap(f.Load(ea, VEC128_TYPE)), + f.LoadZero(VEC128_TYPE), INT8_TYPE); f.StoreVR(vd, v); return 0; } @@ -935,8 +938,8 @@ int InstrEmit_vmrghw_(PPCHIRBuilder& f, uint32_t vd, uint32_t va, uint32_t vb) { // (VD.y) = (VB.x) // (VD.z) = (VA.y) // (VD.w) = (VB.y) - Value* v = f.Permute(f.LoadConstant(0x00040105), f.LoadVR(va), f.LoadVR(vb), - INT32_TYPE); + Value* v = f.Permute(f.LoadConstant(PERMUTE_MASK(0, 0, 1, 0, 0, 1, 1, 1)), + f.LoadVR(va), f.LoadVR(vb), INT32_TYPE); f.StoreVR(vd, v); return 0; } @@ -962,8 +965,8 @@ int InstrEmit_vmrglw_(PPCHIRBuilder& f, uint32_t vd, uint32_t va, uint32_t vb) { // 
(VD.y) = (VB.z) // (VD.z) = (VA.w) // (VD.w) = (VB.w) - Value* v = f.Permute(f.LoadConstant(0x02060307), f.LoadVR(va), f.LoadVR(vb), - INT32_TYPE); + Value* v = f.Permute(f.LoadConstant(PERMUTE_MASK(0, 2, 1, 2, 0, 3, 1, 3)), + f.LoadVR(va), f.LoadVR(vb), INT32_TYPE); f.StoreVR(vd, v); return 0; } @@ -1140,7 +1143,8 @@ XEEMITTER(vpermwi128, VX128_P(6, 528), VX128_P)(PPCHIRBuilder& f, const uint32_t vd = i.VX128_P.VD128l | (i.VX128_P.VD128h << 5); const uint32_t vb = i.VX128_P.VB128l | (i.VX128_P.VB128h << 5); uint32_t uimm = i.VX128_P.PERMl | (i.VX128_P.PERMh << 5); - Value* v = f.Swizzle(f.LoadVR(vb), INT32_TYPE, uimm); + uint32_t mask = SWIZZLE_MASK(uimm >> 6, uimm >> 4, uimm >> 2, uimm >> 0); + Value* v = f.Swizzle(f.LoadVR(vb), INT32_TYPE, mask); f.StoreVR(vd, v); return 0; } @@ -1213,14 +1217,16 @@ XEEMITTER(vrfiz128, VX128_3(6, 1008), VX128_3)(PPCHIRBuilder& f, InstrData& i) { XEEMITTER(vrlb, 0x10000004, VX)(PPCHIRBuilder& f, InstrData& i) { // (VD) <- ROTL((VA), (VB)&0x3) - Value* v = f.VectorRotateLeft(f.LoadVR(i.VX.VA), f.LoadVR(i.VX.VB), INT8_TYPE); + Value* v = + f.VectorRotateLeft(f.LoadVR(i.VX.VA), f.LoadVR(i.VX.VB), INT8_TYPE); f.StoreVR(i.VX.VD, v); return 0; } XEEMITTER(vrlh, 0x10000044, VX)(PPCHIRBuilder& f, InstrData& i) { // (VD) <- ROTL((VA), (VB)&0xF) - Value* v = f.VectorRotateLeft(f.LoadVR(i.VX.VA), f.LoadVR(i.VX.VB), INT16_TYPE); + Value* v = + f.VectorRotateLeft(f.LoadVR(i.VX.VA), f.LoadVR(i.VX.VB), INT16_TYPE); f.StoreVR(i.VX.VD, v); return 0; } @@ -1244,10 +1250,10 @@ XEEMITTER(vrlimi128, VX128_4(6, 1808), VX128_4)(PPCHIRBuilder& f, const uint32_t vb = i.VX128_4.VB128l | (i.VX128_4.VB128h << 5); uint32_t blend_mask_src = i.VX128_4.IMM; uint32_t blend_mask = 0; - for (int n = 3; n >= 0; n--) { - blend_mask |= ((blend_mask_src & 0x1) ? n : (4 + n)) << ((3 - n) * 8); - blend_mask_src >>= 1; - } + blend_mask |= (((blend_mask_src >> 3) & 0x1) ? 0 : 4) << 0; + blend_mask |= (((blend_mask_src >> 2) & 0x1) ? 1 : 5) << 8; + blend_mask |= (((blend_mask_src >> 1) & 0x1) ? 2 : 6) << 16; + blend_mask |= (((blend_mask_src >> 0) & 0x1) ? 3 : 7) << 24; uint32_t rotate = i.VX128_4.z; // This is just a fancy permute. 
// X Y Z W, rotated left by 2 = Z W X Y @@ -1278,7 +1284,7 @@ XEEMITTER(vrlimi128, VX128_4(6, 1808), VX128_4)(PPCHIRBuilder& f, } else { v = f.LoadVR(vb); } - if (blend_mask != 0x00010203) { + if (blend_mask != PERMUTE_IDENTITY) { v = f.Permute(f.LoadConstant(blend_mask), v, f.LoadVR(vd), INT32_TYPE); } f.StoreVR(vd, v); @@ -1382,7 +1388,7 @@ int InstrEmit_vsldoi_(PPCHIRBuilder& f, uint32_t vd, uint32_t va, uint32_t vb, // (VA << SH) OR (VB >> (16 - SH)) vec128_t shift = *((vec128_t*)(__vsldoi_table[sh])); for (int i = 0; i < 4; ++i) { - shift.i4[i] = poly::byte_swap(shift.i4[i]); + shift.u32[i] = poly::byte_swap(shift.u32[i]); } Value* control = f.LoadConstant(shift); Value* v = f.Permute(control, f.LoadVR(va), f.LoadVR(vb), INT8_TYPE); @@ -1410,7 +1416,7 @@ XEEMITTER(vspltb, 0x1000020C, VX)(PPCHIRBuilder& f, InstrData& i) { // b <- UIMM*8 // do i = 0 to 127 by 8 // (VD)[i:i+7] <- (VB)[b:b+7] - Value* b = f.Extract(f.LoadVR(i.VX.VB), (i.VX.VA & 0xF), INT8_TYPE); + Value* b = f.Extract(f.LoadVR(i.VX.VB), i.VX.VA & 0xF, INT8_TYPE); Value* v = f.Splat(b, VEC128_TYPE); f.StoreVR(i.VX.VD, v); return 0; @@ -1418,7 +1424,7 @@ XEEMITTER(vspltb, 0x1000020C, VX)(PPCHIRBuilder& f, InstrData& i) { XEEMITTER(vsplth, 0x1000024C, VX)(PPCHIRBuilder& f, InstrData& i) { // (VD.xyzw) <- (VB.uimm) - Value* h = f.Extract(f.LoadVR(i.VX.VB), (i.VX.VA & 0x7), INT16_TYPE); + Value* h = f.Extract(f.LoadVR(i.VX.VB), i.VX.VA & 0x7, INT16_TYPE); Value* v = f.Splat(h, VEC128_TYPE); f.StoreVR(i.VX.VD, v); return 0; @@ -1427,7 +1433,7 @@ XEEMITTER(vsplth, 0x1000024C, VX)(PPCHIRBuilder& f, InstrData& i) { int InstrEmit_vspltw_(PPCHIRBuilder& f, uint32_t vd, uint32_t vb, uint32_t uimm) { // (VD.xyzw) <- (VB.uimm) - Value* w = f.Extract(f.LoadVR(vb), (uimm & 0x3), INT32_TYPE); + Value* w = f.Extract(f.LoadVR(vb), uimm & 0x3, INT32_TYPE); Value* v = f.Splat(w, VEC128_TYPE); f.StoreVR(vd, v); return 0; @@ -1856,8 +1862,8 @@ XEEMITTER(vpkd3d128, VX128_4(6, 1552), VX128_4)(PPCHIRBuilder& f, } // http://hlssmod.net/he_code/public/pixelwriter.h // control = prev:0123 | new:4567 - uint32_t control = 0x00010203; // original - uint32_t src = xerotl(0x04050607, shift * 8); + uint32_t control = PERMUTE_IDENTITY; // original + uint32_t src = xerotl(0x07060504, shift * 8); uint32_t mask = 0; switch (pack) { case 1: // VPACK_32 @@ -1870,8 +1876,8 @@ XEEMITTER(vpkd3d128, VX128_4(6, 1552), VX128_4)(PPCHIRBuilder& f, mask = 0x0000FFFF << (shift * 8); } else { // w - src = 0x00000007; - mask = 0x000000FF; + src = 0x07000000; + mask = 0xFF000000; } control = (control & ~mask) | (src & mask); break; @@ -1880,7 +1886,7 @@ XEEMITTER(vpkd3d128, VX128_4(6, 1552), VX128_4)(PPCHIRBuilder& f, mask = 0x0000FFFF << (shift * 8); } else { // z - src = 0x00000006; + src = 0x00000004; mask = 0x000000FF; } control = (control & ~mask) | (src & mask); diff --git a/src/alloy/frontend/ppc/ppc_emit_alu.cc b/src/alloy/frontend/ppc/ppc_emit_alu.cc index 8638a0216..acfa78e47 100644 --- a/src/alloy/frontend/ppc/ppc_emit_alu.cc +++ b/src/alloy/frontend/ppc/ppc_emit_alu.cc @@ -1072,7 +1072,7 @@ XEEMITTER(sradx, 0x7C000634, X)(PPCHIRBuilder& f, InstrData& i) { Value* v = f.LoadGPR(i.X.RT); Value* sh = f.And(f.Truncate(f.LoadGPR(i.X.RB), INT8_TYPE), - f.LoadConstant((int8_t)0x7F)); + f.LoadConstant((int8_t)0x3F)); // CA is set if any bits are shifted out of the right and if the result // is negative. Start tracking that here. 
@@ -1137,14 +1137,15 @@ XEEMITTER(srawx, 0x7C000630, X)(PPCHIRBuilder& f, InstrData& i) { // if n >= 32: rA <- 64 sign bits of rS, XER[CA] = sign bit of lo_32(rS) Value* v = f.Truncate(f.LoadGPR(i.X.RT), INT32_TYPE); Value* sh = - f.And(f.Truncate(f.LoadGPR(i.X.RB), INT32_TYPE), f.LoadConstant(0x7F)); + f.And(f.Truncate(f.LoadGPR(i.X.RB), INT32_TYPE), f.LoadConstant(0x1F)); // CA is set if any bits are shifted out of the right and if the result // is negative. Value* mask = f.Not(f.Shl(f.LoadConstant(-1), sh)); Value* ca = f.And(f.Truncate(f.Shr(v, 31), INT8_TYPE), f.IsTrue(f.And(v, mask))); f.StoreCA(ca); - v = f.Sha(v, sh), v = f.SignExtend(v, INT64_TYPE); + v = f.Sha(v, sh); + v = f.SignExtend(v, INT64_TYPE); f.StoreGPR(i.X.RA, v); if (i.X.Rc) { f.UpdateCR(0, v); diff --git a/src/alloy/hir/opcodes.h b/src/alloy/hir/opcodes.h index cf84944cc..ae3b9ca33 100644 --- a/src/alloy/hir/opcodes.h +++ b/src/alloy/hir/opcodes.h @@ -51,11 +51,14 @@ enum ArithmeticFlags { ARITHMETIC_UNSIGNED = (1 << 2), ARITHMETIC_SATURATE = (1 << 3), }; +#define PERMUTE_MASK(sel_x, x, sel_y, y, sel_z, z, sel_w, w) \ + ((((x)&0x3) << 0) | (sel_x << 2) | (((y)&0x3) << 8) | (sel_y << 10) | \ + (((z)&0x3) << 16) | (sel_z << 18) | (((w)&0x3) << 24) | (sel_w << 26)) enum Permutes { - PERMUTE_XY_ZW = 0x00010405, + PERMUTE_IDENTITY = PERMUTE_MASK(0, 0, 0, 1, 0, 2, 0, 3), }; #define SWIZZLE_MASK(x, y, z, w) \ - ((((x)&0x3) << 6) | (((y)&0x3) << 4) | (((z)&0x3) << 2) | (((w)&0x3))) + ((((x)&0x3) << 0) | (((y)&0x3) << 2) | (((z)&0x3) << 4) | (((w)&0x3) << 6)) enum Swizzles { SWIZZLE_XYZW_TO_XYZW = SWIZZLE_MASK(0, 1, 2, 3), SWIZZLE_XYZW_TO_YZWX = SWIZZLE_MASK(1, 2, 3, 0), diff --git a/src/alloy/hir/value.cc b/src/alloy/hir/value.cc index 211481c93..ae90900d4 100644 --- a/src/alloy/hir/value.cc +++ b/src/alloy/hir/value.cc @@ -582,7 +582,7 @@ void Value::ByteSwap() { break; case VEC128_TYPE: for (int n = 0; n < 4; n++) { - constant.v128.i4[n] = poly::byte_swap(constant.v128.i4[n]); + constant.v128.u32[n] = poly::byte_swap(constant.v128.u32[n]); } break; default: diff --git a/src/alloy/vec128.h b/src/alloy/vec128.h index 9a7b8f728..8a4a9a039 100644 --- a/src/alloy/vec128.h +++ b/src/alloy/vec128.h @@ -14,6 +14,58 @@ namespace alloy { +// The first rule of vector programming is to only rely on exact positions +// when absolutely required - prefer dumb loops to exact offsets. +// Vectors in memory are laid out as in AVX registers on little endian +// machines. Note that little endian is dumb, so the byte at index 0 in +// the vector is really byte 15 (or the high byte of short 7 or int 3). +// Because of this, all byte access should be via the accessors instead of +// the direct array.
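+// For example (hypothetical helpers, not part of this header), the index mapping spelled out below could be wrapped so callers never touch the raw arrays: +// inline uint8_t ppc_u8(const vec128_t& v, size_t i) { return v.u8[i ^ 0xF]; } +// inline uint16_t ppc_u16(const vec128_t& v, size_t i) { return v.u16[i ^ 0x7]; } +// inline uint32_t ppc_u32(const vec128_t& v, size_t i) { return v.u32[i ^ 0x3]; } +// e.g. ppc_u8(v, 0) reads what Altivec order calls byte 0 (AVX byte 15).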
+ +// Altivec big endian layout: AVX little endian layout: +// +---------+---------+---------+ +---------+---------+---------+ +// | int32 0 | int16 0 | int8 0 | | int32 3 | int16 7 | int8 15 | +// | | +---------+ | | +---------+ +// | | | int8 1 | | | | int8 14 | +// | +---------+---------+ | +---------+---------+ +// | | int16 1 | int8 2 | | | int16 6 | int8 13 | +// | | +---------+ | | +---------+ +// | | | int8 3 | | | | int8 12 | +// +---------+---------+---------+ +---------+---------+---------+ +// | int32 1 | int16 2 | int8 4 | | int32 2 | int16 5 | int8 11 | +// | | +---------+ | | +---------+ +// | | | int8 5 | | | | int8 10 | +// | +---------+---------+ | +---------+---------+ +// | | int16 3 | int8 6 | | | int16 4 | int8 9 | +// | | +---------+ | | +---------+ +// | | | int8 7 | | | | int8 8 | +// +---------+---------+---------+ +---------+---------+---------+ +// | int32 2 | int16 4 | int8 8 | | int32 1 | int16 3 | int8 7 | +// | | +---------+ | | +---------+ +// | | | int8 9 | | | | int8 6 | +// | +---------+---------+ | +---------+---------+ +// | | int16 5 | int8 10 | | | int16 2 | int8 5 | +// | | +---------+ | | +---------+ +// | | | int8 11 | | | | int8 4 | +// +---------+---------+---------+ +---------+---------+---------+ +// | int32 3 | int16 6 | int8 12 | | int32 0 | int16 1 | int8 3 | +// | | +---------+ | | +---------+ +// | | | int8 13 | | | | int8 2 | +// | +---------+---------+ | +---------+---------+ +// | | int16 7 | int8 14 | | | int16 0 | int8 1 | +// | | +---------+ | | +---------+ +// | | | int8 15 | | | | int8 0 | +// +---------+---------+---------+ +---------+---------+---------+ +// +// Logical order: +// +-----+-----+-----+-----+ +-----+-----+-----+-----+ +// | X | Y | Z | W | | W | Z | Y | X | +// +-----+-----+-----+-----+ +-----+-----+-----+-----+ +// +// Mapping indices is easy: +// int32[i ^ 0x3] +// int16[i ^ 0x7] +// int8[i ^ 0xF] typedef struct alignas(16) vec128_s { union { struct { @@ -23,15 +75,26 @@ typedef struct alignas(16) vec128_s { float w; }; struct { - uint32_t ix; - uint32_t iy; - uint32_t iz; - uint32_t iw; + int32_t ix; + int32_t iy; + int32_t iz; + int32_t iw; }; - float f4[4]; - uint32_t i4[4]; - uint16_t s8[8]; - uint8_t b16[16]; + struct { + uint32_t ux; + uint32_t uy; + uint32_t uz; + uint32_t uw; + }; + float f32[4]; + int8_t i8[16]; + uint8_t u8[16]; + int16_t i16[8]; + uint16_t u16[8]; + int32_t i32[4]; + uint32_t u32[4]; + int64_t i64[2]; + uint64_t u64[2]; struct { uint64_t low; uint64_t high; @@ -42,40 +105,41 @@ typedef struct alignas(16) vec128_s { return low == b.low && high == b.high; } } vec128_t; + static inline vec128_t vec128i(uint32_t src) { vec128_t v; for (auto i = 0; i < 4; ++i) { - v.i4[i] = src; + v.u32[i] = src; } return v; } static inline vec128_t vec128i(uint32_t x, uint32_t y, uint32_t z, uint32_t w) { vec128_t v; - v.i4[0] = x; - v.i4[1] = y; - v.i4[2] = z; - v.i4[3] = w; + v.u32[0] = x; + v.u32[1] = y; + v.u32[2] = z; + v.u32[3] = w; return v; } static inline vec128_t vec128f(float src) { vec128_t v; for (auto i = 0; i < 4; ++i) { - v.f4[i] = src; + v.f32[i] = src; } return v; } static inline vec128_t vec128f(float x, float y, float z, float w) { vec128_t v; - v.f4[0] = x; - v.f4[1] = y; - v.f4[2] = z; - v.f4[3] = w; + v.f32[0] = x; + v.f32[1] = y; + v.f32[2] = z; + v.f32[3] = w; return v; } static inline vec128_t vec128s(uint16_t src) { vec128_t v; for (auto i = 0; i < 8; ++i) { - v.s8[i] = src; + v.u16[i] = src; } return v; } @@ -83,20 +147,20 @@ static inline vec128_t vec128s(uint16_t x0, uint16_t 
x1, uint16_t y0, uint16_t y1, uint16_t z0, uint16_t z1, uint16_t w0, uint16_t w1) { vec128_t v; - v.s8[0] = x0; - v.s8[1] = x1; - v.s8[2] = y0; - v.s8[3] = y1; - v.s8[4] = z0; - v.s8[5] = z1; - v.s8[6] = w0; - v.s8[7] = w1; + v.u16[0] = x1; + v.u16[1] = x0; + v.u16[2] = y1; + v.u16[3] = y0; + v.u16[4] = z1; + v.u16[5] = z0; + v.u16[6] = w1; + v.u16[7] = w0; return v; } static inline vec128_t vec128b(uint8_t src) { vec128_t v; for (auto i = 0; i < 16; ++i) { - v.b16[i] = src; + v.u8[i] = src; } return v; } @@ -105,22 +169,22 @@ static inline vec128_t vec128b(uint8_t x0, uint8_t x1, uint8_t x2, uint8_t x3, uint8_t z0, uint8_t z1, uint8_t z2, uint8_t z3, uint8_t w0, uint8_t w1, uint8_t w2, uint8_t w3) { vec128_t v; - v.b16[0] = x3; - v.b16[1] = x2; - v.b16[2] = x1; - v.b16[3] = x0; - v.b16[4] = y3; - v.b16[5] = y2; - v.b16[6] = y1; - v.b16[7] = y0; - v.b16[8] = z3; - v.b16[9] = z2; - v.b16[10] = z1; - v.b16[11] = z0; - v.b16[12] = w3; - v.b16[13] = w2; - v.b16[14] = w1; - v.b16[15] = w0; + v.u8[0] = x3; + v.u8[1] = x2; + v.u8[2] = x1; + v.u8[3] = x0; + v.u8[4] = y3; + v.u8[5] = y2; + v.u8[6] = y1; + v.u8[7] = y0; + v.u8[8] = z3; + v.u8[9] = z2; + v.u8[10] = z1; + v.u8[11] = z0; + v.u8[12] = w3; + v.u8[13] = w2; + v.u8[14] = w1; + v.u8[15] = w0; return v; } diff --git a/src/poly/math.h b/src/poly/math.h index b8ab8c6de..2bd43619c 100644 --- a/src/poly/math.h +++ b/src/poly/math.h @@ -12,6 +12,7 @@ #include +#include #include #include #include @@ -38,6 +39,10 @@ T round_up(T value, V multiple) { return value + multiple - 1 - (value - 1) % multiple; } +inline float saturate(float value) { + return std::max(std::min(1.0f, value), -1.0f); +} + // Gets the next power of two value that is greater than or equal to the given // value. template diff --git a/tools/alloy-sandbox/alloy-sandbox.gypi b/tools/alloy-sandbox/alloy-sandbox.gypi index e5b947eac..7988e708f 100644 --- a/tools/alloy-sandbox/alloy-sandbox.gypi +++ b/tools/alloy-sandbox/alloy-sandbox.gypi @@ -13,6 +13,7 @@ 'dependencies': [ 'alloy', + 'xenia', ], 'include_dirs': [ diff --git a/tools/alloy-test/alloy-test.gypi b/tools/alloy-test/alloy-test.gypi index 103ce7f8d..85cc9bc5c 100644 --- a/tools/alloy-test/alloy-test.gypi +++ b/tools/alloy-test/alloy-test.gypi @@ -13,6 +13,7 @@ 'dependencies': [ 'alloy', + 'xenia', ], 'include_dirs': [ @@ -40,11 +41,11 @@ #'test_div.cc', #'test_dot_product_3.cc', #'test_dot_product_4.cc', - #'test_extract.cc', - #'test_insert.cc', + 'test_extract.cc', + 'test_insert.cc', #'test_is_true_false.cc', #'test_load_clock.cc', - #'test_load_vector.cc', + 'test_load_vector_shl_shr.cc', #'test_log2.cc', #'test_max.cc', #'test_min.cc', @@ -55,32 +56,32 @@ #'test_neg.cc', #'test_not.cc', #'test_or.cc', - #'test_pack.cc', - #'test_permute.cc', + 'test_pack.cc', + 'test_permute.cc', #'test_pow2.cc', #'test_rotate_left.cc', #'test_round.cc', #'test_rsqrt.cc', #'test_select.cc', - #'test_sha.cc', - #'test_shl.cc', - #'test_shr.cc', + 'test_sha.cc', + 'test_shl.cc', + 'test_shr.cc', #'test_sign_extend.cc', #'test_splat.cc', #'test_sqrt.cc', #'test_sub.cc', - #'test_swizzle.cc', + 'test_swizzle.cc', #'test_truncate.cc', - #'test_unpack.cc', + 'test_unpack.cc', 'test_vector_add.cc', #'test_vector_compare.cc', #'test_vector_convert.cc', 'test_vector_max.cc', 'test_vector_min.cc', - #'test_vector_rotate_left.cc', - #'test_vector_sha.cc', - #'test_vector_shl.cc', - #'test_vector_shr.cc', + 'test_vector_rotate_left.cc', + 'test_vector_sha.cc', + 'test_vector_shl.cc', + 'test_vector_shr.cc', #'test_vector_sub.cc', #'test_xor.cc', 
#'test_zero_extend.cc', diff --git a/tools/alloy-test/test_extract.cc b/tools/alloy-test/test_extract.cc new file mode 100644 index 000000000..9e8935399 --- /dev/null +++ b/tools/alloy-test/test_extract.cc @@ -0,0 +1,141 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2014 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#include + +#include + +using namespace alloy; +using namespace alloy::hir; +using namespace alloy::runtime; +using namespace alloy::test; +using alloy::frontend::ppc::PPCContext; + +TEST_CASE("EXTRACT_INT8", "[instr]") { + TestFunction test([](hir::HIRBuilder& b) { + StoreGPR(b, 3, b.ZeroExtend(b.Extract(LoadVR(b, 4), + b.Truncate(LoadGPR(b, 4), INT8_TYPE), + INT8_TYPE), + INT64_TYPE)); + b.Return(); + }); + for (int i = 0; i < 16; ++i) { + test.Run([i](PPCContext* ctx) { + ctx->r[4] = i; + ctx->v[4] = vec128b(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, + 14, 15); + }, + [i](PPCContext* ctx) { + auto result = ctx->r[3]; + REQUIRE(result == i); + }); + } +} + +TEST_CASE("EXTRACT_INT8_CONSTANT", "[instr]") { + for (int i = 0; i < 16; ++i) { + TestFunction([i](hir::HIRBuilder& b) { + StoreGPR(b, 3, + b.ZeroExtend( + b.Extract(LoadVR(b, 4), + b.LoadConstant(int8_t(i)), INT8_TYPE), + INT64_TYPE)); + b.Return(); + }).Run([i](PPCContext* ctx) { + ctx->r[4] = i; + ctx->v[4] = vec128b(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, + 11, 12, 13, 14, 15); + }, + [i](PPCContext* ctx) { + auto result = ctx->r[3]; + REQUIRE(result == i); + }); + } +} + +TEST_CASE("EXTRACT_INT16", "[instr]") { + TestFunction test([](hir::HIRBuilder& b) { + StoreGPR(b, 3, b.ZeroExtend(b.Extract(LoadVR(b, 4), + b.Truncate(LoadGPR(b, 4), INT8_TYPE), + INT16_TYPE), + INT64_TYPE)); + b.Return(); + }); + for (int i = 0; i < 8; ++i) { + test.Run([i](PPCContext* ctx) { + ctx->r[4] = i; + ctx->v[4] = vec128s(0x0000, 0x1001, 0x2002, 0x3003, 0x4004, + 0x5005, 0x6006, 0x7007); + }, + [i](PPCContext* ctx) { + auto result = ctx->r[3]; + REQUIRE(result == (i | (i << 12))); + }); + } +} + +TEST_CASE("EXTRACT_INT16_CONSTANT", "[instr]") { + for (int i = 0; i < 8; ++i) { + TestFunction([i](hir::HIRBuilder& b) { + StoreGPR(b, 3, + b.ZeroExtend(b.Extract(LoadVR(b, 4), + b.LoadConstant(int8_t(i)), + INT16_TYPE), + INT64_TYPE)); + b.Return(); + }).Run([i](PPCContext* ctx) { + ctx->r[4] = i; + ctx->v[4] = vec128s(0, 1, 2, 3, 4, 5, 6, 7); + }, + [i](PPCContext* ctx) { + auto result = ctx->r[3]; + REQUIRE(result == i); + }); + } +} + +TEST_CASE("EXTRACT_INT32", "[instr]") { + TestFunction test([](hir::HIRBuilder& b) { + StoreGPR(b, 3, b.ZeroExtend(b.Extract(LoadVR(b, 4), + b.Truncate(LoadGPR(b, 4), INT8_TYPE), + INT32_TYPE), + INT64_TYPE)); + b.Return(); + }); + for (int i = 0; i < 4; ++i) { + test.Run([i](PPCContext* ctx) { + ctx->r[4] = i; + ctx->v[4] = vec128i(0, 1, 2, 3); + }, + [i](PPCContext* ctx) { + auto result = ctx->r[3]; + REQUIRE(result == i); + }); + } +} + +TEST_CASE("EXTRACT_INT32_CONSTANT", "[instr]") { + for (int i = 0; i < 4; ++i) { + TestFunction([i](hir::HIRBuilder& b) { + StoreGPR(b, 3, + b.ZeroExtend(b.Extract(LoadVR(b, 4), + b.LoadConstant(int8_t(i)), + INT32_TYPE), + INT64_TYPE)); + b.Return(); + }).Run([i](PPCContext* ctx) { + ctx->r[4] = i; + ctx->v[4] = vec128i(0, 1, 2, 3); + }, 
+ [i](PPCContext* ctx) { + auto result = ctx->r[3]; + REQUIRE(result == i); + }); + } +} diff --git a/tools/alloy-test/test_insert.cc b/tools/alloy-test/test_insert.cc new file mode 100644 index 000000000..c14e57051 --- /dev/null +++ b/tools/alloy-test/test_insert.cc @@ -0,0 +1,83 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2014 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#include + +#include + +using namespace alloy; +using namespace alloy::hir; +using namespace alloy::runtime; +using namespace alloy::test; +using alloy::frontend::ppc::PPCContext; + +TEST_CASE("INSERT_INT8", "[instr]") { + TestFunction test([](hir::HIRBuilder& b) { + StoreVR(b, 3, b.Insert(LoadVR(b, 4), LoadGPR(b, 4), + b.Truncate(LoadGPR(b, 5), INT8_TYPE))); + b.Return(); + }); + for (int i = 0; i < 16; ++i) { + test.Run([i](PPCContext* ctx) { + ctx->v[4] = vec128b(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, + 14, 15); + ctx->r[4] = i; + ctx->r[5] = 100 + i; + }, + [i](PPCContext* ctx) { + auto result = ctx->v[3]; + auto expected = vec128b(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, + 13, 14, 15); + expected.i8[i ^ 0x3] = 100 + i; + REQUIRE(result == expected); + }); + } +} + +TEST_CASE("INSERT_INT16", "[instr]") { + TestFunction test([](hir::HIRBuilder& b) { + StoreVR(b, 3, b.Insert(LoadVR(b, 4), LoadGPR(b, 4), + b.Truncate(LoadGPR(b, 5), INT16_TYPE))); + b.Return(); + }); + for (int i = 0; i < 8; ++i) { + test.Run([i](PPCContext* ctx) { + ctx->v[4] = vec128s(0, 1, 2, 3, 4, 5, 6, 7); + ctx->r[4] = i; + ctx->r[5] = 100 + i; + }, + [i](PPCContext* ctx) { + auto result = ctx->v[3]; + auto expected = vec128s(0, 1, 2, 3, 4, 5, 6, 7); + expected.i16[i ^ 0x1] = 100 + i; + REQUIRE(result == expected); + }); + } +} + +TEST_CASE("INSERT_INT32", "[instr]") { + TestFunction test([](hir::HIRBuilder& b) { + StoreVR(b, 3, b.Insert(LoadVR(b, 4), LoadGPR(b, 4), + b.Truncate(LoadGPR(b, 5), INT32_TYPE))); + b.Return(); + }); + for (int i = 0; i < 4; ++i) { + test.Run([i](PPCContext* ctx) { + ctx->v[4] = vec128i(0, 1, 2, 3); + ctx->r[4] = i; + ctx->r[5] = 100 + i; + }, + [i](PPCContext* ctx) { + auto result = ctx->v[3]; + auto expected = vec128i(0, 1, 2, 3); + expected.i32[i] = 100 + i; + REQUIRE(result == expected); + }); + } +} diff --git a/tools/alloy-test/test_load_vector_shl_shr.cc b/tools/alloy-test/test_load_vector_shl_shr.cc new file mode 100644 index 000000000..dee8f07d2 --- /dev/null +++ b/tools/alloy-test/test_load_vector_shl_shr.cc @@ -0,0 +1,78 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2014 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. 
* + ****************************************************************************** + */ + +#include + +using namespace alloy; +using namespace alloy::hir; +using namespace alloy::runtime; +using namespace alloy::test; +using alloy::frontend::ppc::PPCContext; + +TEST_CASE("LOAD_VECTOR_SHL", "[instr]") { + TestFunction test([](hir::HIRBuilder& b) { + StoreVR(b, 3, b.LoadVectorShl(b.Truncate(LoadGPR(b, 4), INT8_TYPE))); + b.Return(); + }); + test.Run([](PPCContext* ctx) { ctx->r[4] = 0; }, + [](PPCContext* ctx) { + auto result = ctx->v[3]; + REQUIRE(result == vec128b(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, + 13, 14, 15)); + }); + test.Run([](PPCContext* ctx) { ctx->r[4] = 7; }, + [](PPCContext* ctx) { + auto result = ctx->v[3]; + REQUIRE(result == vec128b(7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, + 18, 19, 20, 21, 22)); + }); + test.Run([](PPCContext* ctx) { ctx->r[4] = 15; }, + [](PPCContext* ctx) { + auto result = ctx->v[3]; + REQUIRE(result == vec128b(15, 16, 17, 18, 19, 20, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30)); + }); + test.Run([](PPCContext* ctx) { ctx->r[4] = 16; }, + [](PPCContext* ctx) { + auto result = ctx->v[3]; + REQUIRE(result == vec128b(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, + 13, 14, 15)); + }); +} + +TEST_CASE("LOAD_VECTOR_SHR", "[instr]") { + TestFunction test([](hir::HIRBuilder& b) { + StoreVR(b, 3, b.LoadVectorShr(b.Truncate(LoadGPR(b, 4), INT8_TYPE))); + b.Return(); + }); + test.Run([](PPCContext* ctx) { ctx->r[4] = 0; }, + [](PPCContext* ctx) { + auto result = ctx->v[3]; + REQUIRE(result == vec128b(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, + 26, 27, 28, 29, 30, 31)); + }); + test.Run([](PPCContext* ctx) { ctx->r[4] = 7; }, + [](PPCContext* ctx) { + auto result = ctx->v[3]; + REQUIRE(result == vec128b(9, 10, 11, 12, 13, 14, 15, 16, 17, 18, + 19, 20, 21, 22, 23, 24)); + }); + test.Run([](PPCContext* ctx) { ctx->r[4] = 15; }, + [](PPCContext* ctx) { + auto result = ctx->v[3]; + REQUIRE(result == vec128b(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, + 13, 14, 15, 16)); + }); + test.Run([](PPCContext* ctx) { ctx->r[4] = 16; }, + [](PPCContext* ctx) { + auto result = ctx->v[3]; + REQUIRE(result == vec128b(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, + 26, 27, 28, 29, 30, 31)); + }); +} diff --git a/tools/alloy-test/test_pack.cc b/tools/alloy-test/test_pack.cc new file mode 100644 index 000000000..4c693e729 --- /dev/null +++ b/tools/alloy-test/test_pack.cc @@ -0,0 +1,111 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2014 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. 
* + ****************************************************************************** + */ + +#include + +using namespace alloy; +using namespace alloy::hir; +using namespace alloy::runtime; +using namespace alloy::test; +using alloy::frontend::ppc::PPCContext; + +TEST_CASE("PACK_D3DCOLOR", "[instr]") { + TestFunction test([](hir::HIRBuilder& b) { + StoreVR(b, 3, b.Pack(LoadVR(b, 4), PACK_TYPE_D3DCOLOR)); + b.Return(); + }); + test.Run([](PPCContext* ctx) { ctx->v[4] = vec128f(1.0f); }, + [](PPCContext* ctx) { + auto result = ctx->v[3]; + REQUIRE(result == vec128i(0)); + }); + test.Run([](PPCContext* ctx) { + ctx->v[4] = + vec128i(0x3F800050, 0x3F800060, 0x3F800070, 0x3F800080); + }, + [](PPCContext* ctx) { + auto result = ctx->v[3]; + REQUIRE(result == vec128i(0, 0, 0, 0x80506070)); + }); +} + +TEST_CASE("PACK_FLOAT16_2", "[instr]") { + TestFunction test([](hir::HIRBuilder& b) { + StoreVR(b, 3, b.Pack(LoadVR(b, 4), PACK_TYPE_FLOAT16_2)); + b.Return(); + }); + test.Run([](PPCContext* ctx) { ctx->v[4] = vec128i(0, 0, 0, 0x3F800000); }, + [](PPCContext* ctx) { + auto result = ctx->v[3]; + REQUIRE(result == vec128i(0)); + }); + test.Run([](PPCContext* ctx) { + ctx->v[4] = + vec128i(0x47FFE000, 0xC7FFE000, 0x00000000, 0x3F800000); + }, + [](PPCContext* ctx) { + auto result = ctx->v[3]; + REQUIRE(result == vec128i(0, 0, 0, 0x7FFFFFFF)); + }); + test.Run([](PPCContext* ctx) { + ctx->v[4] = + vec128i(0x42AAA000, 0x44CCC000, 0x00000000, 0x3F800000); + }, + [](PPCContext* ctx) { + auto result = ctx->v[3]; + REQUIRE(result == vec128i(0, 0, 0, 0x55556666)); + }); +} + +TEST_CASE("PACK_FLOAT16_4", "[instr]") { + TestFunction test([](hir::HIRBuilder& b) { + StoreVR(b, 3, b.Pack(LoadVR(b, 4), PACK_TYPE_FLOAT16_4)); + b.Return(); + }); + test.Run([](PPCContext* ctx) { ctx->v[4] = vec128i(0, 0, 0, 0); }, + [](PPCContext* ctx) { + auto result = ctx->v[3]; + REQUIRE(result == vec128i(0)); + }); + test.Run([](PPCContext* ctx) { + ctx->v[4] = + vec128i(0x449A4000, 0x45B17000, 0x41103261, 0x40922B6B); + }, + [](PPCContext* ctx) { + auto result = ctx->v[3]; + REQUIRE(result == + vec128i(0x00000000, 0x00000000, 0x64D26D8C, 0x48824491)); + }); +} + +TEST_CASE("PACK_SHORT_2", "[instr]") { + TestFunction test([](hir::HIRBuilder& b) { + StoreVR(b, 3, b.Pack(LoadVR(b, 4), PACK_TYPE_SHORT_2)); + b.Return(); + }); + test.Run([](PPCContext* ctx) { ctx->v[4] = vec128i(0); }, + [](PPCContext* ctx) { + auto result = ctx->v[3]; + REQUIRE(result == vec128i(0)); + }); + test.Run([](PPCContext* ctx) { + ctx->v[4] = vec128i(0x43817E00, 0xC37CFC00, 0, 0); + }, + [](PPCContext* ctx) { + auto result = ctx->v[3]; + REQUIRE(result == vec128i(0, 0, 0, 0x7FFF8001)); + }); + test.Run([](PPCContext* ctx) { + ctx->v[4] = vec128i(0xC0D47D97, 0xC2256E9D, 0, 0); + }, + [](PPCContext* ctx) { + auto result = ctx->v[3]; + REQUIRE(result == vec128i(0, 0, 0, 0x80018001)); + }); +} diff --git a/tools/alloy-test/test_permute.cc b/tools/alloy-test/test_permute.cc new file mode 100644 index 000000000..ab16ebc68 --- /dev/null +++ b/tools/alloy-test/test_permute.cc @@ -0,0 +1,139 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2014 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. 
* + ****************************************************************************** + */ + +#include + +using namespace alloy; +using namespace alloy::hir; +using namespace alloy::runtime; +using namespace alloy::test; +using alloy::frontend::ppc::PPCContext; + +TEST_CASE("PERMUTE_V128_BY_INT32_CONSTANT", "[instr]") { + { + uint32_t mask = PERMUTE_MASK(0, 0, 0, 1, 0, 2, 0, 3); + TestFunction([mask](hir::HIRBuilder& b) { + StoreVR(b, 3, b.Permute(b.LoadConstant(mask), LoadVR(b, 4), + LoadVR(b, 5), INT32_TYPE)); + b.Return(); + }).Run([](PPCContext* ctx) { + ctx->v[4] = vec128i(0, 1, 2, 3); + ctx->v[5] = vec128i(4, 5, 6, 7); + }, + [](PPCContext* ctx) { + auto result = ctx->v[3]; + REQUIRE(result == vec128i(0, 1, 2, 3)); + }); + } + { + uint32_t mask = PERMUTE_MASK(1, 0, 1, 1, 1, 2, 1, 3); + TestFunction([mask](hir::HIRBuilder& b) { + StoreVR(b, 3, b.Permute(b.LoadConstant(mask), LoadVR(b, 4), + LoadVR(b, 5), INT32_TYPE)); + b.Return(); + }).Run([](PPCContext* ctx) { + ctx->v[4] = vec128i(0, 1, 2, 3); + ctx->v[5] = vec128i(4, 5, 6, 7); + }, + [](PPCContext* ctx) { + auto result = ctx->v[3]; + REQUIRE(result == vec128i(4, 5, 6, 7)); + }); + } + { + uint32_t mask = PERMUTE_MASK(0, 3, 0, 2, 0, 1, 0, 0); + TestFunction([mask](hir::HIRBuilder& b) { + StoreVR(b, 3, b.Permute(b.LoadConstant(mask), LoadVR(b, 4), + LoadVR(b, 5), INT32_TYPE)); + b.Return(); + }).Run([](PPCContext* ctx) { + ctx->v[4] = vec128i(0, 1, 2, 3); + ctx->v[5] = vec128i(4, 5, 6, 7); + }, + [](PPCContext* ctx) { + auto result = ctx->v[3]; + REQUIRE(result == vec128i(3, 2, 1, 0)); + }); + } + { + uint32_t mask = PERMUTE_MASK(1, 3, 1, 2, 1, 1, 1, 0); + TestFunction([mask](hir::HIRBuilder& b) { + StoreVR(b, 3, b.Permute(b.LoadConstant(mask), LoadVR(b, 4), + LoadVR(b, 5), INT32_TYPE)); + b.Return(); + }).Run([](PPCContext* ctx) { + ctx->v[4] = vec128i(0, 1, 2, 3); + ctx->v[5] = vec128i(4, 5, 6, 7); + }, + [](PPCContext* ctx) { + auto result = ctx->v[3]; + REQUIRE(result == vec128i(7, 6, 5, 4)); + }); + } +} + +TEST_CASE("PERMUTE_V128_BY_V128", "[instr]") { + TestFunction test([](hir::HIRBuilder& b) { + StoreVR(b, 3, + b.Permute(LoadVR(b, 3), LoadVR(b, 4), LoadVR(b, 5), VEC128_TYPE)); + b.Return(); + }); + test.Run([](PPCContext* ctx) { + ctx->v[3] = + vec128b(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + ctx->v[4] = vec128b(100, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, + 14, 15); + ctx->v[5] = vec128b(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, + 28, 29, 30, 31); + }, + [](PPCContext* ctx) { + auto result = ctx->v[3]; + REQUIRE(result == vec128b(100, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, + 12, 13, 14, 15)); + }); + test.Run([](PPCContext* ctx) { + ctx->v[3] = vec128b(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, + 28, 29, 30, 31); + ctx->v[4] = + vec128b(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + ctx->v[5] = vec128b(116, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, + 27, 28, 29, 30, 31); + }, + [](PPCContext* ctx) { + auto result = ctx->v[3]; + REQUIRE(result == vec128b(116, 17, 18, 19, 20, 21, 22, 23, 24, 25, + 26, 27, 28, 29, 30, 31)); + }); + test.Run([](PPCContext* ctx) { + ctx->v[3] = + vec128b(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + ctx->v[4] = vec128b(100, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, + 14, 15); + ctx->v[5] = vec128b(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, + 28, 29, 30, 31); + }, + [](PPCContext* ctx) { + auto result = ctx->v[3]; + REQUIRE(result == vec128b(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, + 3, 2, 1, 100)); + }); + test.Run([](PPCContext* ctx) { + 
ctx->v[3] = vec128b(31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, + 19, 18, 17, 16); + ctx->v[4] = + vec128b(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + ctx->v[5] = vec128b(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, + 28, 29, 30, 131); + }, + [](PPCContext* ctx) { + auto result = ctx->v[3]; + REQUIRE(result == vec128b(131, 30, 29, 28, 27, 26, 25, 24, 23, 22, + 21, 20, 19, 18, 17, 16)); + }); +} diff --git a/tools/alloy-test/test_sha.cc b/tools/alloy-test/test_sha.cc new file mode 100644 index 000000000..61e4b0007 --- /dev/null +++ b/tools/alloy-test/test_sha.cc @@ -0,0 +1,211 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2014 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#include + +using namespace alloy; +using namespace alloy::hir; +using namespace alloy::runtime; +using namespace alloy::test; +using alloy::frontend::ppc::PPCContext; + +TEST_CASE("SHA_I8", "[instr]") { + TestFunction test([](hir::HIRBuilder& b) { + StoreGPR(b, 3, b.ZeroExtend(b.Sha(b.Truncate(LoadGPR(b, 4), INT8_TYPE), + b.Truncate(LoadGPR(b, 5), INT8_TYPE)), + INT64_TYPE)); + b.Return(); + }); + test.Run([](PPCContext* ctx) { + ctx->r[4] = 0xF0; + ctx->r[5] = 4; + }, + [](PPCContext* ctx) { + auto result = static_cast(ctx->r[3]); + REQUIRE(result == 0xFF); + }); + test.Run([](PPCContext* ctx) { + ctx->r[4] = 0xFF; + ctx->r[5] = 0; + }, + [](PPCContext* ctx) { + auto result = static_cast(ctx->r[3]); + REQUIRE(result == 0xFF); + }); + test.Run([](PPCContext* ctx) { + ctx->r[4] = 0xFF; + ctx->r[5] = 1; + }, + [](PPCContext* ctx) { + auto result = static_cast(ctx->r[3]); + REQUIRE(result == 0xFF); + }); + test.Run([](PPCContext* ctx) { + ctx->r[4] = 0x80; + ctx->r[5] = 8; + }, + [](PPCContext* ctx) { + auto result = static_cast(ctx->r[3]); + REQUIRE(result == 0xFF); + }); + test.Run([](PPCContext* ctx) { + ctx->r[4] = 0x7F; + ctx->r[5] = 7; + }, + [](PPCContext* ctx) { + auto result = static_cast(ctx->r[3]); + REQUIRE(result == 0); + }); +} + +TEST_CASE("SHA_I16", "[instr]") { + TestFunction test([](hir::HIRBuilder& b) { + StoreGPR(b, 3, b.ZeroExtend(b.Sha(b.Truncate(LoadGPR(b, 4), INT16_TYPE), + b.Truncate(LoadGPR(b, 5), INT8_TYPE)), + INT64_TYPE)); + b.Return(); + }); + test.Run([](PPCContext* ctx) { + ctx->r[4] = 0xFF00; + ctx->r[5] = 8; + }, + [](PPCContext* ctx) { + auto result = static_cast(ctx->r[3]); + REQUIRE(result == 0xFFFF); + }); + test.Run([](PPCContext* ctx) { + ctx->r[4] = 0xFFFF; + ctx->r[5] = 0; + }, + [](PPCContext* ctx) { + auto result = static_cast(ctx->r[3]); + REQUIRE(result == 0xFFFF); + }); + test.Run([](PPCContext* ctx) { + ctx->r[4] = 0xFFFE; + ctx->r[5] = 1; + }, + [](PPCContext* ctx) { + auto result = static_cast(ctx->r[3]); + REQUIRE(result == 0xFFFF); + }); + test.Run([](PPCContext* ctx) { + ctx->r[4] = 0x8000; + ctx->r[5] = 16; + }, + [](PPCContext* ctx) { + auto result = static_cast(ctx->r[3]); + REQUIRE(result == 0xFFFF); + }); + test.Run([](PPCContext* ctx) { + ctx->r[4] = 0x7FFF; + ctx->r[5] = 15; + }, + [](PPCContext* ctx) { + auto result = static_cast(ctx->r[3]); + REQUIRE(result == 0); + }); +} + +TEST_CASE("SHA_I32", "[instr]") { + TestFunction test([](hir::HIRBuilder& b) { + StoreGPR(b, 3, 
b.ZeroExtend(b.Sha(b.Truncate(LoadGPR(b, 4), INT32_TYPE), + b.Truncate(LoadGPR(b, 5), INT8_TYPE)), + INT64_TYPE)); + b.Return(); + }); + test.Run([](PPCContext* ctx) { + ctx->r[4] = 0xFFFF0000; + ctx->r[5] = 16; + }, + [](PPCContext* ctx) { + auto result = static_cast(ctx->r[3]); + REQUIRE(result == 0xFFFFFFFF); + }); + test.Run([](PPCContext* ctx) { + ctx->r[4] = 0xFFFFFFFF; + ctx->r[5] = 0; + }, + [](PPCContext* ctx) { + auto result = static_cast(ctx->r[3]); + REQUIRE(result == 0xFFFFFFFF); + }); + test.Run([](PPCContext* ctx) { + ctx->r[4] = 0xFFFFFFFE; + ctx->r[5] = 1; + }, + [](PPCContext* ctx) { + auto result = static_cast(ctx->r[3]); + REQUIRE(result == 0xFFFFFFFF); + }); + test.Run([](PPCContext* ctx) { + ctx->r[4] = 0x80000000; + ctx->r[5] = 32; + }, + [](PPCContext* ctx) { + auto result = static_cast(ctx->r[3]); + REQUIRE(result == 0x80000000); + }); + test.Run([](PPCContext* ctx) { + ctx->r[4] = 0x7FFFFFFF; + ctx->r[5] = 31; + }, + [](PPCContext* ctx) { + auto result = static_cast(ctx->r[3]); + REQUIRE(result == 0); + }); +} + +TEST_CASE("SHA_I64", "[instr]") { + TestFunction test([](hir::HIRBuilder& b) { + StoreGPR(b, 3, b.Sha(b.Truncate(LoadGPR(b, 4), INT64_TYPE), + b.Truncate(LoadGPR(b, 5), INT8_TYPE))); + b.Return(); + }); + test.Run([](PPCContext* ctx) { + ctx->r[4] = 0xFFFFFFFF00000000ull; + ctx->r[5] = 32; + }, + [](PPCContext* ctx) { + auto result = static_cast(ctx->r[3]); + REQUIRE(result == 0xFFFFFFFFFFFFFFFFull); + }); + test.Run([](PPCContext* ctx) { + ctx->r[4] = 0xFFFFFFFFFFFFFFFFull; + ctx->r[5] = 0; + }, + [](PPCContext* ctx) { + auto result = static_cast(ctx->r[3]); + REQUIRE(result == 0xFFFFFFFFFFFFFFFFull); + }); + test.Run([](PPCContext* ctx) { + ctx->r[4] = 0xFFFFFFFFFFFFFFFEull; + ctx->r[5] = 1; + }, + [](PPCContext* ctx) { + auto result = static_cast(ctx->r[3]); + REQUIRE(result == 0xFFFFFFFFFFFFFFFFull); + }); + test.Run([](PPCContext* ctx) { + ctx->r[4] = 0x8000000000000000ull; + ctx->r[5] = 64; + }, + [](PPCContext* ctx) { + auto result = static_cast(ctx->r[3]); + REQUIRE(result == 0x8000000000000000ull); + }); + test.Run([](PPCContext* ctx) { + ctx->r[4] = 0x7FFFFFFFFFFFFFFFull; + ctx->r[5] = 63; + }, + [](PPCContext* ctx) { + auto result = static_cast(ctx->r[3]); + REQUIRE(result == 0); + }); +} diff --git a/tools/alloy-test/test_shl.cc b/tools/alloy-test/test_shl.cc new file mode 100644 index 000000000..c05bb7071 --- /dev/null +++ b/tools/alloy-test/test_shl.cc @@ -0,0 +1,211 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2014 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. 
* + ****************************************************************************** + */ + +#include + +using namespace alloy; +using namespace alloy::hir; +using namespace alloy::runtime; +using namespace alloy::test; +using alloy::frontend::ppc::PPCContext; + +TEST_CASE("SHL_I8", "[instr]") { + TestFunction test([](hir::HIRBuilder& b) { + StoreGPR(b, 3, b.ZeroExtend(b.Shl(b.Truncate(LoadGPR(b, 4), INT8_TYPE), + b.Truncate(LoadGPR(b, 5), INT8_TYPE)), + INT64_TYPE)); + b.Return(); + }); + test.Run([](PPCContext* ctx) { + ctx->r[4] = 0x0F; + ctx->r[5] = 4; + }, + [](PPCContext* ctx) { + auto result = static_cast(ctx->r[3]); + REQUIRE(result == 0xF0); + }); + test.Run([](PPCContext* ctx) { + ctx->r[4] = 0xFF; + ctx->r[5] = 0; + }, + [](PPCContext* ctx) { + auto result = static_cast(ctx->r[3]); + REQUIRE(result == 0xFF); + }); + test.Run([](PPCContext* ctx) { + ctx->r[4] = 0xFF; + ctx->r[5] = 1; + }, + [](PPCContext* ctx) { + auto result = static_cast(ctx->r[3]); + REQUIRE(result == 0xFE); + }); + test.Run([](PPCContext* ctx) { + ctx->r[4] = 0x80; + ctx->r[5] = 8; + }, + [](PPCContext* ctx) { + auto result = static_cast(ctx->r[3]); + REQUIRE(result == 0); + }); + test.Run([](PPCContext* ctx) { + ctx->r[4] = 0x7F; + ctx->r[5] = 7; + }, + [](PPCContext* ctx) { + auto result = static_cast(ctx->r[3]); + REQUIRE(result == 0x80); + }); +} + +TEST_CASE("SHL_I16", "[instr]") { + TestFunction test([](hir::HIRBuilder& b) { + StoreGPR(b, 3, b.ZeroExtend(b.Shl(b.Truncate(LoadGPR(b, 4), INT16_TYPE), + b.Truncate(LoadGPR(b, 5), INT8_TYPE)), + INT64_TYPE)); + b.Return(); + }); + test.Run([](PPCContext* ctx) { + ctx->r[4] = 0x00FF; + ctx->r[5] = 8; + }, + [](PPCContext* ctx) { + auto result = static_cast(ctx->r[3]); + REQUIRE(result == 0xFF00); + }); + test.Run([](PPCContext* ctx) { + ctx->r[4] = 0xFFFF; + ctx->r[5] = 0; + }, + [](PPCContext* ctx) { + auto result = static_cast(ctx->r[3]); + REQUIRE(result == 0xFFFF); + }); + test.Run([](PPCContext* ctx) { + ctx->r[4] = 0x7FFF; + ctx->r[5] = 1; + }, + [](PPCContext* ctx) { + auto result = static_cast(ctx->r[3]); + REQUIRE(result == 0xFFFE); + }); + test.Run([](PPCContext* ctx) { + ctx->r[4] = 0x8000; + ctx->r[5] = 16; + }, + [](PPCContext* ctx) { + auto result = static_cast(ctx->r[3]); + REQUIRE(result == 0); + }); + test.Run([](PPCContext* ctx) { + ctx->r[4] = 0x7FFF; + ctx->r[5] = 15; + }, + [](PPCContext* ctx) { + auto result = static_cast(ctx->r[3]); + REQUIRE(result == 0x8000); + }); +} + +TEST_CASE("SHL_I32", "[instr]") { + TestFunction test([](hir::HIRBuilder& b) { + StoreGPR(b, 3, b.ZeroExtend(b.Shl(b.Truncate(LoadGPR(b, 4), INT32_TYPE), + b.Truncate(LoadGPR(b, 5), INT8_TYPE)), + INT64_TYPE)); + b.Return(); + }); + test.Run([](PPCContext* ctx) { + ctx->r[4] = 0x0000FFFF; + ctx->r[5] = 16; + }, + [](PPCContext* ctx) { + auto result = static_cast(ctx->r[3]); + REQUIRE(result == 0xFFFF0000); + }); + test.Run([](PPCContext* ctx) { + ctx->r[4] = 0xFFFFFFFF; + ctx->r[5] = 0; + }, + [](PPCContext* ctx) { + auto result = static_cast(ctx->r[3]); + REQUIRE(result == 0xFFFFFFFF); + }); + test.Run([](PPCContext* ctx) { + ctx->r[4] = 0x7FFFFFFF; + ctx->r[5] = 1; + }, + [](PPCContext* ctx) { + auto result = static_cast(ctx->r[3]); + REQUIRE(result == 0xFFFFFFFE); + }); + test.Run([](PPCContext* ctx) { + ctx->r[4] = 0x80000000; + ctx->r[5] = 32; + }, + [](PPCContext* ctx) { + auto result = static_cast(ctx->r[3]); + REQUIRE(result == 0x80000000); + }); + test.Run([](PPCContext* ctx) { + ctx->r[4] = 0x7FFFFFFF; + ctx->r[5] = 31; + }, + [](PPCContext* ctx) { + auto 
result = static_cast(ctx->r[3]); + REQUIRE(result == 0x80000000); + }); +} + +TEST_CASE("SHL_I64", "[instr]") { + TestFunction test([](hir::HIRBuilder& b) { + StoreGPR(b, 3, b.Shl(b.Truncate(LoadGPR(b, 4), INT64_TYPE), + b.Truncate(LoadGPR(b, 5), INT8_TYPE))); + b.Return(); + }); + test.Run([](PPCContext* ctx) { + ctx->r[4] = 0x00000000FFFFFFFFull; + ctx->r[5] = 32; + }, + [](PPCContext* ctx) { + auto result = static_cast(ctx->r[3]); + REQUIRE(result == 0xFFFFFFFF00000000ull); + }); + test.Run([](PPCContext* ctx) { + ctx->r[4] = 0xFFFFFFFFFFFFFFFFull; + ctx->r[5] = 0; + }, + [](PPCContext* ctx) { + auto result = static_cast(ctx->r[3]); + REQUIRE(result == 0xFFFFFFFFFFFFFFFFull); + }); + test.Run([](PPCContext* ctx) { + ctx->r[4] = 0x7FFFFFFFFFFFFFFFull; + ctx->r[5] = 1; + }, + [](PPCContext* ctx) { + auto result = static_cast(ctx->r[3]); + REQUIRE(result == 0xFFFFFFFFFFFFFFFEull); + }); + test.Run([](PPCContext* ctx) { + ctx->r[4] = 0x8000000000000000ull; + ctx->r[5] = 64; + }, + [](PPCContext* ctx) { + auto result = static_cast(ctx->r[3]); + REQUIRE(result == 0x8000000000000000ull); + }); + test.Run([](PPCContext* ctx) { + ctx->r[4] = 0x7FFFFFFFFFFFFFFFull; + ctx->r[5] = 63; + }, + [](PPCContext* ctx) { + auto result = static_cast(ctx->r[3]); + REQUIRE(result == 0x8000000000000000ull); + }); +} diff --git a/tools/alloy-test/test_shr.cc b/tools/alloy-test/test_shr.cc new file mode 100644 index 000000000..3ddfc6dd7 --- /dev/null +++ b/tools/alloy-test/test_shr.cc @@ -0,0 +1,211 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2014 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. 
* + ****************************************************************************** + */ + +#include + +using namespace alloy; +using namespace alloy::hir; +using namespace alloy::runtime; +using namespace alloy::test; +using alloy::frontend::ppc::PPCContext; + +TEST_CASE("SHR_I8", "[instr]") { + TestFunction test([](hir::HIRBuilder& b) { + StoreGPR(b, 3, b.ZeroExtend(b.Shr(b.Truncate(LoadGPR(b, 4), INT8_TYPE), + b.Truncate(LoadGPR(b, 5), INT8_TYPE)), + INT64_TYPE)); + b.Return(); + }); + test.Run([](PPCContext* ctx) { + ctx->r[4] = 0xF0; + ctx->r[5] = 4; + }, + [](PPCContext* ctx) { + auto result = static_cast(ctx->r[3]); + REQUIRE(result == 0x0F); + }); + test.Run([](PPCContext* ctx) { + ctx->r[4] = 0xFF; + ctx->r[5] = 0; + }, + [](PPCContext* ctx) { + auto result = static_cast(ctx->r[3]); + REQUIRE(result == 0xFF); + }); + test.Run([](PPCContext* ctx) { + ctx->r[4] = 0xFF; + ctx->r[5] = 1; + }, + [](PPCContext* ctx) { + auto result = static_cast(ctx->r[3]); + REQUIRE(result == 0x7F); + }); + test.Run([](PPCContext* ctx) { + ctx->r[4] = 0x80; + ctx->r[5] = 8; + }, + [](PPCContext* ctx) { + auto result = static_cast(ctx->r[3]); + REQUIRE(result == 0); + }); + test.Run([](PPCContext* ctx) { + ctx->r[4] = 0x7F; + ctx->r[5] = 7; + }, + [](PPCContext* ctx) { + auto result = static_cast(ctx->r[3]); + REQUIRE(result == 0); + }); +} + +TEST_CASE("SHR_I16", "[instr]") { + TestFunction test([](hir::HIRBuilder& b) { + StoreGPR(b, 3, b.ZeroExtend(b.Shr(b.Truncate(LoadGPR(b, 4), INT16_TYPE), + b.Truncate(LoadGPR(b, 5), INT8_TYPE)), + INT64_TYPE)); + b.Return(); + }); + test.Run([](PPCContext* ctx) { + ctx->r[4] = 0xFF00; + ctx->r[5] = 8; + }, + [](PPCContext* ctx) { + auto result = static_cast(ctx->r[3]); + REQUIRE(result == 0x00FF); + }); + test.Run([](PPCContext* ctx) { + ctx->r[4] = 0xFFFF; + ctx->r[5] = 0; + }, + [](PPCContext* ctx) { + auto result = static_cast(ctx->r[3]); + REQUIRE(result == 0xFFFF); + }); + test.Run([](PPCContext* ctx) { + ctx->r[4] = 0xFFFE; + ctx->r[5] = 1; + }, + [](PPCContext* ctx) { + auto result = static_cast(ctx->r[3]); + REQUIRE(result == 0x7FFF); + }); + test.Run([](PPCContext* ctx) { + ctx->r[4] = 0x8000; + ctx->r[5] = 16; + }, + [](PPCContext* ctx) { + auto result = static_cast(ctx->r[3]); + REQUIRE(result == 0); + }); + test.Run([](PPCContext* ctx) { + ctx->r[4] = 0x7FFF; + ctx->r[5] = 15; + }, + [](PPCContext* ctx) { + auto result = static_cast(ctx->r[3]); + REQUIRE(result == 0); + }); +} + +TEST_CASE("SHR_I32", "[instr]") { + TestFunction test([](hir::HIRBuilder& b) { + StoreGPR(b, 3, b.ZeroExtend(b.Shr(b.Truncate(LoadGPR(b, 4), INT32_TYPE), + b.Truncate(LoadGPR(b, 5), INT8_TYPE)), + INT64_TYPE)); + b.Return(); + }); + test.Run([](PPCContext* ctx) { + ctx->r[4] = 0xFFFF0000; + ctx->r[5] = 16; + }, + [](PPCContext* ctx) { + auto result = static_cast(ctx->r[3]); + REQUIRE(result == 0x0000FFFF); + }); + test.Run([](PPCContext* ctx) { + ctx->r[4] = 0xFFFFFFFF; + ctx->r[5] = 0; + }, + [](PPCContext* ctx) { + auto result = static_cast(ctx->r[3]); + REQUIRE(result == 0xFFFFFFFF); + }); + test.Run([](PPCContext* ctx) { + ctx->r[4] = 0xFFFFFFFE; + ctx->r[5] = 1; + }, + [](PPCContext* ctx) { + auto result = static_cast(ctx->r[3]); + REQUIRE(result == 0x7FFFFFFF); + }); + test.Run([](PPCContext* ctx) { + ctx->r[4] = 0x80000000; + ctx->r[5] = 32; + }, + [](PPCContext* ctx) { + auto result = static_cast(ctx->r[3]); + REQUIRE(result == 0x80000000); + }); + test.Run([](PPCContext* ctx) { + ctx->r[4] = 0x7FFFFFFF; + ctx->r[5] = 31; + }, + [](PPCContext* ctx) { + auto result = 
static_cast(ctx->r[3]); + REQUIRE(result == 0); + }); +} + +TEST_CASE("SHR_I64", "[instr]") { + TestFunction test([](hir::HIRBuilder& b) { + StoreGPR(b, 3, b.Shr(b.Truncate(LoadGPR(b, 4), INT64_TYPE), + b.Truncate(LoadGPR(b, 5), INT8_TYPE))); + b.Return(); + }); + test.Run([](PPCContext* ctx) { + ctx->r[4] = 0xFFFFFFFF00000000ull; + ctx->r[5] = 32; + }, + [](PPCContext* ctx) { + auto result = static_cast(ctx->r[3]); + REQUIRE(result == 0x00000000FFFFFFFFull); + }); + test.Run([](PPCContext* ctx) { + ctx->r[4] = 0xFFFFFFFFFFFFFFFFull; + ctx->r[5] = 0; + }, + [](PPCContext* ctx) { + auto result = static_cast(ctx->r[3]); + REQUIRE(result == 0xFFFFFFFFFFFFFFFFull); + }); + test.Run([](PPCContext* ctx) { + ctx->r[4] = 0xFFFFFFFFFFFFFFFEull; + ctx->r[5] = 1; + }, + [](PPCContext* ctx) { + auto result = static_cast(ctx->r[3]); + REQUIRE(result == 0x7FFFFFFFFFFFFFFFull); + }); + test.Run([](PPCContext* ctx) { + ctx->r[4] = 0x8000000000000000ull; + ctx->r[5] = 64; + }, + [](PPCContext* ctx) { + auto result = static_cast(ctx->r[3]); + REQUIRE(result == 0x8000000000000000ull); + }); + test.Run([](PPCContext* ctx) { + ctx->r[4] = 0x7FFFFFFFFFFFFFFFull; + ctx->r[5] = 63; + }, + [](PPCContext* ctx) { + auto result = static_cast(ctx->r[3]); + REQUIRE(result == 0); + }); +} diff --git a/tools/alloy-test/test_swizzle.cc b/tools/alloy-test/test_swizzle.cc new file mode 100644 index 000000000..270d6fb84 --- /dev/null +++ b/tools/alloy-test/test_swizzle.cc @@ -0,0 +1,46 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2014 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#include + +using namespace alloy; +using namespace alloy::hir; +using namespace alloy::runtime; +using namespace alloy::test; +using alloy::frontend::ppc::PPCContext; + +TEST_CASE("SWIZZLE_V128", "[instr]") { + TestFunction([](hir::HIRBuilder& b) { + StoreVR(b, 3, b.Swizzle(LoadVR(b, 4), INT32_TYPE, + SWIZZLE_MASK(0, 1, 2, 3))); + b.Return(); + }).Run([](PPCContext* ctx) { ctx->v[4] = vec128i(0, 1, 2, 3); }, + [](PPCContext* ctx) { + auto result = ctx->v[3]; + REQUIRE(result == vec128i(0, 1, 2, 3)); + }); + TestFunction([](hir::HIRBuilder& b) { + StoreVR(b, 3, b.Swizzle(LoadVR(b, 4), INT32_TYPE, + SWIZZLE_MASK(3, 2, 1, 0))); + b.Return(); + }).Run([](PPCContext* ctx) { ctx->v[4] = vec128i(0, 1, 2, 3); }, + [](PPCContext* ctx) { + auto result = ctx->v[3]; + REQUIRE(result == vec128i(3, 2, 1, 0)); + }); + TestFunction([](hir::HIRBuilder& b) { + StoreVR(b, 3, b.Swizzle(LoadVR(b, 4), INT32_TYPE, + SWIZZLE_MASK(1, 1, 2, 2))); + b.Return(); + }).Run([](PPCContext* ctx) { ctx->v[4] = vec128i(0, 1, 2, 3); }, + [](PPCContext* ctx) { + auto result = ctx->v[3]; + REQUIRE(result == vec128i(1, 1, 2, 2)); + }); +} diff --git a/tools/alloy-test/test_unpack.cc b/tools/alloy-test/test_unpack.cc new file mode 100644 index 000000000..2af4b0ef0 --- /dev/null +++ b/tools/alloy-test/test_unpack.cc @@ -0,0 +1,162 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2014 Ben Vanik. All rights reserved. 
* + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#include + +using namespace alloy; +using namespace alloy::hir; +using namespace alloy::runtime; +using namespace alloy::test; +using alloy::frontend::ppc::PPCContext; + +TEST_CASE("UNPACK_D3DCOLOR", "[instr]") { + TestFunction test([](hir::HIRBuilder& b) { + StoreVR(b, 3, b.Unpack(LoadVR(b, 4), PACK_TYPE_D3DCOLOR)); + b.Return(); + }); + test.Run([](PPCContext* ctx) { + uint32_t value = 0; + ctx->v[4] = vec128i(0, 0, 0, value); + }, + [](PPCContext* ctx) { + auto result = ctx->v[3]; + REQUIRE(result == vec128f(1.0f, 1.0f, 1.0f, 1.0f)); + }); + test.Run([](PPCContext* ctx) { + uint32_t value = 0x80506070; + ctx->v[4] = vec128i(0, 0, 0, value); + }, + [](PPCContext* ctx) { + auto result = ctx->v[3]; + REQUIRE(result == + vec128i(0x3F800050, 0x3F800060, 0x3F800070, 0x3F800080)); + }); +} + +TEST_CASE("UNPACK_FLOAT16_2", "[instr]") { + TestFunction test([](hir::HIRBuilder& b) { + StoreVR(b, 3, b.Unpack(LoadVR(b, 4), PACK_TYPE_FLOAT16_2)); + b.Return(); + }); + test.Run([](PPCContext* ctx) { ctx->v[4] = vec128i(0); }, + [](PPCContext* ctx) { + auto result = ctx->v[3]; + REQUIRE(result == vec128i(0, 0, 0, 0x3F800000)); + }); + test.Run([](PPCContext* ctx) { ctx->v[4] = vec128i(0, 0, 0, 0x7FFFFFFF); }, + [](PPCContext* ctx) { + auto result = ctx->v[3]; + REQUIRE(result == + vec128i(0x47FFE000, 0xC7FFE000, 0x00000000, 0x3F800000)); + }); + test.Run([](PPCContext* ctx) { ctx->v[4] = vec128i(0, 0, 0, 0x55556666); }, + [](PPCContext* ctx) { + auto result = ctx->v[3]; + REQUIRE(result == + vec128i(0x42AAA000, 0x44CCC000, 0x00000000, 0x3F800000)); + }); +} + +TEST_CASE("UNPACK_FLOAT16_4", "[instr]") { + TestFunction test([](hir::HIRBuilder& b) { + StoreVR(b, 3, b.Unpack(LoadVR(b, 4), PACK_TYPE_FLOAT16_4)); + b.Return(); + }); + test.Run([](PPCContext* ctx) { ctx->v[4] = vec128i(0); }, + [](PPCContext* ctx) { + auto result = ctx->v[3]; + REQUIRE(result == vec128i(0)); + }); + test.Run([](PPCContext* ctx) { + ctx->v[4] = vec128s(0, 0, 0, 0, 0x64D2, 0x6D8B, 0x4881, 0x4491); + }, + [](PPCContext* ctx) { + auto result = ctx->v[3]; + REQUIRE(result == + vec128i(0x449A4000, 0x45B16000, 0x41102000, 0x40922000)); + }); +} + +TEST_CASE("UNPACK_SHORT_2", "[instr]") { + TestFunction test([](hir::HIRBuilder& b) { + StoreVR(b, 3, b.Unpack(LoadVR(b, 4), PACK_TYPE_SHORT_2)); + b.Return(); + }); + test.Run([](PPCContext* ctx) { ctx->v[4] = vec128i(0); }, + [](PPCContext* ctx) { + auto result = ctx->v[3]; + REQUIRE(result == + vec128i(0x40400000, 0x40400000, 0x00000000, 0x3F800000)); + }); + test.Run([](PPCContext* ctx) { + ctx->v[4] = + vec128i(0x7004FD60, 0x8201C990, 0x00000000, 0x7FFF8001); + }, + [](PPCContext* ctx) { + auto result = ctx->v[3]; + REQUIRE(result == + vec128i(0x40407FFF, 0x403F8001, 0x00000000, 0x3F800000)); + }); + test.Run([](PPCContext* ctx) { + ctx->v[4] = vec128i(0, 0, 0, (0x1234u << 16) | 0x5678u); + }, + [](PPCContext* ctx) { + auto result = ctx->v[3]; + REQUIRE(result == + vec128i(0x40401234, 0x40405678, 0x00000000, 0x3F800000)); + }); +} + +// TEST_CASE("UNPACK_S8_IN_16_LO", "[instr]") { +// TestFunction test([](hir::HIRBuilder& b) { +// StoreVR(b, 3, b.Unpack(LoadVR(b, 4), PACK_TYPE_S8_IN_16_LO)); +// b.Return(); +// }); +// test.Run([](PPCContext* ctx) { ctx->v[4] = vec128b(0); }, +// [](PPCContext* ctx) { +// auto result = ctx->v[3]; +// REQUIRE(result == vec128b(0)); +// }); +//} +// +// TEST_CASE("UNPACK_S8_IN_16_HI", 
"[instr]") { +// TestFunction test([](hir::HIRBuilder& b) { +// StoreVR(b, 3, b.Unpack(LoadVR(b, 4), PACK_TYPE_S8_IN_16_HI)); +// b.Return(); +// }); +// test.Run([](PPCContext* ctx) { ctx->v[4] = vec128b(0); }, +// [](PPCContext* ctx) { +// auto result = ctx->v[3]; +// REQUIRE(result == vec128b(0)); +// }); +//} +// +// TEST_CASE("UNPACK_S16_IN_32_LO", "[instr]") { +// TestFunction test([](hir::HIRBuilder& b) { +// StoreVR(b, 3, b.Unpack(LoadVR(b, 4), PACK_TYPE_S16_IN_32_LO)); +// b.Return(); +// }); +// test.Run([](PPCContext* ctx) { ctx->v[4] = vec128b(0); }, +// [](PPCContext* ctx) { +// auto result = ctx->v[3]; +// REQUIRE(result == vec128b(0)); +// }); +//} +// +// TEST_CASE("UNPACK_S16_IN_32_HI", "[instr]") { +// TestFunction test([](hir::HIRBuilder& b) { +// StoreVR(b, 3, b.Unpack(LoadVR(b, 4), PACK_TYPE_S16_IN_32_HI)); +// b.Return(); +// }); +// test.Run([](PPCContext* ctx) { ctx->v[4] = vec128b(0); }, +// [](PPCContext* ctx) { +// auto result = ctx->v[3]; +// REQUIRE(result == vec128b(0)); +// }); +//} diff --git a/tools/alloy-test/test_vector_rotate_left.cc b/tools/alloy-test/test_vector_rotate_left.cc new file mode 100644 index 000000000..4f96c237c --- /dev/null +++ b/tools/alloy-test/test_vector_rotate_left.cc @@ -0,0 +1,71 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2014 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#include +#include + +using namespace alloy; +using namespace alloy::hir; +using namespace alloy::runtime; +using namespace alloy::test; +using alloy::frontend::ppc::PPCContext; + +TEST_CASE("VECTOR_ROTATE_LEFT_I8", "[instr]") { + TestFunction test([](hir::HIRBuilder& b) { + StoreVR(b, 3, b.VectorRotateLeft(LoadVR(b, 4), LoadVR(b, 5), INT8_TYPE)); + b.Return(); + }); + test.Run([](PPCContext* ctx) { + ctx->v[4] = vec128b(B00000001); + ctx->v[5] = + vec128b(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + }, + [](PPCContext* ctx) { + auto result = ctx->v[3]; + REQUIRE(result == + vec128b(B00000001, B00000010, B00000100, B00001000, + B00010000, B00100000, B01000000, B10000000, + B00000001, B00000010, B00000100, B00001000, + B00010000, B00100000, B01000000, B10000000)); + }); +} + +TEST_CASE("VECTOR_ROTATE_LEFT_I16", "[instr]") { + TestFunction test([](hir::HIRBuilder& b) { + StoreVR(b, 3, b.VectorRotateLeft(LoadVR(b, 4), LoadVR(b, 5), INT16_TYPE)); + b.Return(); + }); + test.Run([](PPCContext* ctx) { + ctx->v[4] = vec128s(0x0001, 0x0001, 0x0001, 0x0001, 0x1000, 0x1000, + 0x1000, 0x1000); + ctx->v[5] = vec128s(0, 1, 2, 3, 14, 15, 16, 17); + }, + [](PPCContext* ctx) { + auto result = ctx->v[3]; + REQUIRE(result == vec128s(0x0001, 0x0002, 0x0004, 0x0008, 0x0400, + 0x0800, 0x1000, 0x2000)); + }); +} + +TEST_CASE("VECTOR_ROTATE_LEFT_I32", "[instr]") { + TestFunction test([](hir::HIRBuilder& b) { + StoreVR(b, 3, b.VectorRotateLeft(LoadVR(b, 4), LoadVR(b, 5), INT32_TYPE)); + b.Return(); + }); + test.Run([](PPCContext* ctx) { + ctx->v[4] = + vec128i(0x00000001, 0x00000001, 0x80000000, 0x80000000); + ctx->v[5] = vec128i(0, 1, 1, 2); + }, + [](PPCContext* ctx) { + auto result = ctx->v[3]; + REQUIRE(result == + vec128i(0x00000001, 0x00000002, 0x00000001, 0x00000002)); + }); +} diff --git a/tools/alloy-test/test_vector_sha.cc 
b/tools/alloy-test/test_vector_sha.cc new file mode 100644 index 000000000..6c9cdc16a --- /dev/null +++ b/tools/alloy-test/test_vector_sha.cc @@ -0,0 +1,145 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2014 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#include + +using namespace alloy; +using namespace alloy::hir; +using namespace alloy::runtime; +using namespace alloy::test; +using alloy::frontend::ppc::PPCContext; + +TEST_CASE("VECTOR_SHA_I8", "[instr]") { + TestFunction test([](hir::HIRBuilder& b) { + StoreVR(b, 3, b.VectorSha(LoadVR(b, 4), LoadVR(b, 5), INT8_TYPE)); + b.Return(); + }); + test.Run([](PPCContext* ctx) { + ctx->v[4] = + vec128b(0x7E, 0x7E, 0x7E, 0x7F, 0x80, 0xFF, 0x01, 0x12, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00); + ctx->v[5] = + vec128b(0, 1, 2, 8, 4, 4, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + }, + [](PPCContext* ctx) { + auto result = ctx->v[3]; + REQUIRE(result == vec128b(0x7E, 0x3F, 0x1F, 0x7F, 0xF8, 0xFF, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00)); + }); +} + +TEST_CASE("VECTOR_SHA_I8_CONSTANT", "[instr]") { + TestFunction test([](hir::HIRBuilder& b) { + StoreVR(b, 3, b.VectorSha(LoadVR(b, 4), b.LoadConstant(vec128b( + 0, 1, 2, 8, 4, 4, 6, 7, 8, 9, + 10, 11, 12, 13, 14, 15)), + INT8_TYPE)); + b.Return(); + }); + test.Run([](PPCContext* ctx) { + ctx->v[4] = + vec128b(0x7E, 0x7E, 0x7E, 0x7F, 0x80, 0xFF, 0x01, 0x12, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00); + }, + [](PPCContext* ctx) { + auto result = ctx->v[3]; + REQUIRE(result == vec128b(0x7E, 0x3F, 0x1F, 0x7F, 0xF8, 0xFF, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00)); + }); +} + +TEST_CASE("VECTOR_SHA_I16", "[instr]") { + TestFunction test([](hir::HIRBuilder& b) { + StoreVR(b, 3, b.VectorSha(LoadVR(b, 4), LoadVR(b, 5), INT16_TYPE)); + b.Return(); + }); + test.Run([](PPCContext* ctx) { + ctx->v[4] = vec128s(0x7FFE, 0x7FFE, 0x7FFE, 0x7FFF, 0x8000, 0xFFFF, + 0x0001, 0x1234); + ctx->v[5] = vec128s(0, 1, 8, 15, 15, 8, 1, 16); + }, + [](PPCContext* ctx) { + auto result = ctx->v[3]; + REQUIRE(result == vec128s(0x7FFE, 0x3FFF, 0x007F, 0x0000, 0xFFFF, + 0xFFFF, 0x0000, 0x1234)); + }); +} + +TEST_CASE("VECTOR_SHA_I16_CONSTANT", "[instr]") { + TestFunction test([](hir::HIRBuilder& b) { + StoreVR(b, 3, b.VectorSha(LoadVR(b, 4), b.LoadConstant(vec128s( + 0, 1, 8, 15, 15, 8, 1, 16)), + INT16_TYPE)); + b.Return(); + }); + test.Run([](PPCContext* ctx) { + ctx->v[4] = vec128s(0x7FFE, 0x7FFE, 0x7FFE, 0x7FFF, 0x8000, 0xFFFF, + 0x0001, 0x1234); + }, + [](PPCContext* ctx) { + auto result = ctx->v[3]; + REQUIRE(result == vec128s(0x7FFE, 0x3FFF, 0x007F, 0x0000, 0xFFFF, + 0xFFFF, 0x0000, 0x1234)); + }); +} + +TEST_CASE("VECTOR_SHA_I32", "[instr]") { + TestFunction test([](hir::HIRBuilder& b) { + StoreVR(b, 3, b.VectorSha(LoadVR(b, 4), LoadVR(b, 5), INT32_TYPE)); + b.Return(); + }); + test.Run([](PPCContext* ctx) { + ctx->v[4] = + vec128i(0x7FFFFFFE, 0x7FFFFFFE, 0x7FFFFFFE, 0x7FFFFFFF); + ctx->v[5] = vec128i(0, 1, 16, 31); + }, + [](PPCContext* ctx) { + auto result = ctx->v[3]; + REQUIRE(result == + vec128i(0x7FFFFFFE, 0x3FFFFFFF, 0x00007FFF, 0x00000000)); + }); + test.Run([](PPCContext* ctx) { + ctx->v[4] = + vec128i(0x80000000, 0xFFFFFFFF, 
0x00000001, 0x12345678);
+             ctx->v[5] = vec128i(31, 16, 1, 32);
+           },
+           [](PPCContext* ctx) {
+             auto result = ctx->v[3];
+             REQUIRE(result ==
+                     vec128i(0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x12345678));
+           });
+}
+
+TEST_CASE("VECTOR_SHA_I32_CONSTANT", "[instr]") {
+  TestFunction test([](hir::HIRBuilder& b) {
+    StoreVR(b, 3,
+            b.VectorSha(LoadVR(b, 4), b.LoadConstant(vec128i(0, 1, 16, 31)),
+                        INT32_TYPE));
+    StoreVR(b, 4,
+            b.VectorSha(LoadVR(b, 5), b.LoadConstant(vec128i(31, 16, 1, 32)),
+                        INT32_TYPE));
+    b.Return();
+  });
+  test.Run([](PPCContext* ctx) {
+             ctx->v[4] =
+                 vec128i(0x7FFFFFFE, 0x7FFFFFFE, 0x7FFFFFFE, 0x7FFFFFFF);
+             ctx->v[5] =
+                 vec128i(0x80000000, 0xFFFFFFFF, 0x00000001, 0x12345678);
+           },
+           [](PPCContext* ctx) {
+             auto result1 = ctx->v[3];
+             REQUIRE(result1 ==
+                     vec128i(0x7FFFFFFE, 0x3FFFFFFF, 0x00007FFF, 0x00000000));
+             auto result2 = ctx->v[4];
+             REQUIRE(result2 ==
+                     vec128i(0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x12345678));
+           });
+}
diff --git a/tools/alloy-test/test_vector_shl.cc b/tools/alloy-test/test_vector_shl.cc
new file mode 100644
index 000000000..50228d310
--- /dev/null
+++ b/tools/alloy-test/test_vector_shl.cc
@@ -0,0 +1,145 @@
+/**
+ ******************************************************************************
+ * Xenia : Xbox 360 Emulator Research Project                                 *
+ ******************************************************************************
+ * Copyright 2014 Ben Vanik. All rights reserved.                             *
+ * Released under the BSD license - see LICENSE in the root for more details. *
+ ******************************************************************************
+ */
+
+#include
+
+using namespace alloy;
+using namespace alloy::hir;
+using namespace alloy::runtime;
+using namespace alloy::test;
+using alloy::frontend::ppc::PPCContext;
+
+TEST_CASE("VECTOR_SHL_I8", "[instr]") {
+  TestFunction test([](hir::HIRBuilder& b) {
+    StoreVR(b, 3, b.VectorShl(LoadVR(b, 4), LoadVR(b, 5), INT8_TYPE));
+    b.Return();
+  });
+  test.Run([](PPCContext* ctx) {
+             ctx->v[4] =
+                 vec128b(0x7E, 0x7E, 0x7E, 0x7F, 0x80, 0xFF, 0x01, 0x12, 0x00,
+                         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
+             ctx->v[5] =
+                 vec128b(0, 1, 2, 8, 4, 4, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+           },
+           [](PPCContext* ctx) {
+             auto result = ctx->v[3];
+             REQUIRE(result == vec128b(0x7E, 0xFC, 0xF8, 0x7F, 0x00, 0xF0, 0x40,
+                                       0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+                                       0x00, 0x00));
+           });
+}
+
+TEST_CASE("VECTOR_SHL_I8_CONSTANT", "[instr]") {
+  TestFunction test([](hir::HIRBuilder& b) {
+    StoreVR(b, 3, b.VectorShl(LoadVR(b, 4), b.LoadConstant(vec128b(
+                                                0, 1, 2, 8, 4, 4, 6, 7, 8, 9,
+                                                10, 11, 12, 13, 14, 15)),
+                              INT8_TYPE));
+    b.Return();
+  });
+  test.Run([](PPCContext* ctx) {
+             ctx->v[4] =
+                 vec128b(0x7E, 0x7E, 0x7E, 0x7F, 0x80, 0xFF, 0x01, 0x12, 0x00,
+                         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
+           },
+           [](PPCContext* ctx) {
+             auto result = ctx->v[3];
+             REQUIRE(result == vec128b(0x7E, 0xFC, 0xF8, 0x7F, 0x00, 0xF0, 0x40,
+                                       0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+                                       0x00, 0x00));
+           });
+}
+
+TEST_CASE("VECTOR_SHL_I16", "[instr]") {
+  TestFunction test([](hir::HIRBuilder& b) {
+    StoreVR(b, 3, b.VectorShl(LoadVR(b, 4), LoadVR(b, 5), INT16_TYPE));
+    b.Return();
+  });
+  test.Run([](PPCContext* ctx) {
+             ctx->v[4] = vec128s(0x7FFE, 0x7FFE, 0x7FFE, 0x7FFF, 0x8000, 0xFFFF,
+                                 0x0001, 0x1234);
+             ctx->v[5] = vec128s(0, 1, 8, 15, 15, 8, 1, 16);
+           },
+           [](PPCContext* ctx) {
+             auto result = ctx->v[3];
+             REQUIRE(result == vec128s(0x7FFE, 0xFFFC, 0xFE00, 0x8000, 0x0000,
+                                       0xFF00, 0x0002, 0x1234));
+           });
+}
+
+TEST_CASE("VECTOR_SHL_I16_CONSTANT", "[instr]") {
+  TestFunction test([](hir::HIRBuilder& b) {
+    StoreVR(b, 3, b.VectorShl(LoadVR(b, 4), b.LoadConstant(vec128s(
+                                                0, 1, 8, 15, 15, 8, 1, 16)),
+                              INT16_TYPE));
+    b.Return();
+  });
+  test.Run([](PPCContext* ctx) {
+             ctx->v[4] = vec128s(0x7FFE, 0x7FFE, 0x7FFE, 0x7FFF, 0x8000, 0xFFFF,
+                                 0x0001, 0x1234);
+           },
+           [](PPCContext* ctx) {
+             auto result = ctx->v[3];
+             REQUIRE(result == vec128s(0x7FFE, 0xFFFC, 0xFE00, 0x8000, 0x0000,
+                                       0xFF00, 0x0002, 0x1234));
+           });
+}
+
+TEST_CASE("VECTOR_SHL_I32", "[instr]") {
+  TestFunction test([](hir::HIRBuilder& b) {
+    StoreVR(b, 3, b.VectorShl(LoadVR(b, 4), LoadVR(b, 5), INT32_TYPE));
+    b.Return();
+  });
+  test.Run([](PPCContext* ctx) {
+             ctx->v[4] =
+                 vec128i(0x7FFFFFFE, 0x7FFFFFFE, 0x7FFFFFFE, 0x7FFFFFFF);
+             ctx->v[5] = vec128i(0, 1, 16, 31);
+           },
+           [](PPCContext* ctx) {
+             auto result = ctx->v[3];
+             REQUIRE(result ==
+                     vec128i(0x7FFFFFFE, 0xFFFFFFFC, 0xFFFE0000, 0x80000000));
+           });
+  test.Run([](PPCContext* ctx) {
+             ctx->v[4] =
+                 vec128i(0x80000000, 0xFFFFFFFF, 0x00000001, 0x12345678);
+             ctx->v[5] = vec128i(31, 16, 1, 32);
+           },
+           [](PPCContext* ctx) {
+             auto result = ctx->v[3];
+             REQUIRE(result ==
+                     vec128i(0x00000000, 0xFFFF0000, 0x00000002, 0x12345678));
+           });
+}
+
+TEST_CASE("VECTOR_SHL_I32_CONSTANT", "[instr]") {
+  TestFunction test([](hir::HIRBuilder& b) {
+    StoreVR(b, 3,
+            b.VectorShl(LoadVR(b, 4), b.LoadConstant(vec128i(0, 1, 16, 31)),
+                        INT32_TYPE));
+    StoreVR(b, 4,
+            b.VectorShl(LoadVR(b, 5), b.LoadConstant(vec128i(31, 16, 1, 32)),
+                        INT32_TYPE));
+    b.Return();
+  });
+  test.Run([](PPCContext* ctx) {
+             ctx->v[4] =
+                 vec128i(0x7FFFFFFE, 0x7FFFFFFE, 0x7FFFFFFE, 0x7FFFFFFF);
+             ctx->v[5] =
+                 vec128i(0x80000000, 0xFFFFFFFF, 0x00000001, 0x12345678);
+           },
+           [](PPCContext* ctx) {
+             auto result1 = ctx->v[3];
+             REQUIRE(result1 ==
+                     vec128i(0x7FFFFFFE, 0xFFFFFFFC, 0xFFFE0000, 0x80000000));
+             auto result2 = ctx->v[4];
+             REQUIRE(result2 ==
+                     vec128i(0x00000000, 0xFFFF0000, 0x00000002, 0x12345678));
+           });
+}
diff --git a/tools/alloy-test/test_vector_shr.cc b/tools/alloy-test/test_vector_shr.cc
new file mode 100644
index 000000000..25ae6c91d
--- /dev/null
+++ b/tools/alloy-test/test_vector_shr.cc
@@ -0,0 +1,145 @@
+/**
+ ******************************************************************************
+ * Xenia : Xbox 360 Emulator Research Project                                 *
+ ******************************************************************************
+ * Copyright 2014 Ben Vanik. All rights reserved.                             *
+ * Released under the BSD license - see LICENSE in the root for more details. *
+ ******************************************************************************
+ */
+
+#include
+
+using namespace alloy;
+using namespace alloy::hir;
+using namespace alloy::runtime;
+using namespace alloy::test;
+using alloy::frontend::ppc::PPCContext;
+
+TEST_CASE("VECTOR_SHR_I8", "[instr]") {
+  TestFunction test([](hir::HIRBuilder& b) {
+    StoreVR(b, 3, b.VectorShr(LoadVR(b, 4), LoadVR(b, 5), INT8_TYPE));
+    b.Return();
+  });
+  test.Run([](PPCContext* ctx) {
+             ctx->v[4] =
+                 vec128b(0x7E, 0x7E, 0x7E, 0x7F, 0x80, 0xFF, 0x01, 0x12, 0x00,
+                         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
+             ctx->v[5] =
+                 vec128b(0, 1, 2, 8, 4, 4, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+           },
+           [](PPCContext* ctx) {
+             auto result = ctx->v[3];
+             REQUIRE(result == vec128b(0x7E, 0x3F, 0x1F, 0x7F, 0x08, 0x0F, 0x00,
+                                       0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+                                       0x00, 0x00));
+           });
+}
+
+TEST_CASE("VECTOR_SHR_I8_CONSTANT", "[instr]") {
+  TestFunction test([](hir::HIRBuilder& b) {
+    StoreVR(b, 3, b.VectorShr(LoadVR(b, 4), b.LoadConstant(vec128b(
+                                                0, 1, 2, 8, 4, 4, 6, 7, 8, 9,
+                                                10, 11, 12, 13, 14, 15)),
+                              INT8_TYPE));
+    b.Return();
+  });
+  test.Run([](PPCContext* ctx) {
+             ctx->v[4] =
+                 vec128b(0x7E, 0x7E, 0x7E, 0x7F, 0x80, 0xFF, 0x01, 0x12, 0x00,
+                         0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
+           },
+           [](PPCContext* ctx) {
+             auto result = ctx->v[3];
+             REQUIRE(result == vec128b(0x7E, 0x3F, 0x1F, 0x7F, 0x08, 0x0F, 0x00,
+                                       0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+                                       0x00, 0x00));
+           });
+}
+
+TEST_CASE("VECTOR_SHR_I16", "[instr]") {
+  TestFunction test([](hir::HIRBuilder& b) {
+    StoreVR(b, 3, b.VectorShr(LoadVR(b, 4), LoadVR(b, 5), INT16_TYPE));
+    b.Return();
+  });
+  test.Run([](PPCContext* ctx) {
+             ctx->v[4] = vec128s(0x7FFE, 0x7FFE, 0x7FFE, 0x7FFF, 0x8000, 0xFFFF,
+                                 0x0001, 0x1234);
+             ctx->v[5] = vec128s(0, 1, 8, 15, 15, 8, 1, 16);
+           },
+           [](PPCContext* ctx) {
+             auto result = ctx->v[3];
+             REQUIRE(result == vec128s(0x7FFE, 0x3FFF, 0x007F, 0x0000, 0x0001,
+                                       0x00FF, 0x0000, 0x1234));
+           });
+}
+
+TEST_CASE("VECTOR_SHR_I16_CONSTANT", "[instr]") {
+  TestFunction test([](hir::HIRBuilder& b) {
+    StoreVR(b, 3, b.VectorShr(LoadVR(b, 4), b.LoadConstant(vec128s(
+                                                0, 1, 8, 15, 15, 8, 1, 16)),
+                              INT16_TYPE));
+    b.Return();
+  });
+  test.Run([](PPCContext* ctx) {
+             ctx->v[4] = vec128s(0x7FFE, 0x7FFE, 0x7FFE, 0x7FFF, 0x8000, 0xFFFF,
+                                 0x0001, 0x1234);
+           },
+           [](PPCContext* ctx) {
+             auto result = ctx->v[3];
+             REQUIRE(result == vec128s(0x7FFE, 0x3FFF, 0x007F, 0x0000, 0x0001,
+                                       0x00FF, 0x0000, 0x1234));
+           });
+}
+
+TEST_CASE("VECTOR_SHR_I32", "[instr]") {
+  TestFunction test([](hir::HIRBuilder& b) {
+    StoreVR(b, 3, b.VectorShr(LoadVR(b, 4), LoadVR(b, 5), INT32_TYPE));
+    b.Return();
+  });
+  test.Run([](PPCContext* ctx) {
+             ctx->v[4] =
+                 vec128i(0x7FFFFFFE, 0x7FFFFFFE, 0x7FFFFFFE, 0x7FFFFFFF);
+             ctx->v[5] = vec128i(0, 1, 16, 31);
+           },
+           [](PPCContext* ctx) {
+             auto result = ctx->v[3];
+             REQUIRE(result ==
+                     vec128i(0x7FFFFFFE, 0x3FFFFFFF, 0x00007FFF, 0x00000000));
+           });
+  test.Run([](PPCContext* ctx) {
+             ctx->v[4] =
+                 vec128i(0x80000000, 0xFFFFFFFF, 0x00000001, 0x12345678);
+             ctx->v[5] = vec128i(31, 16, 1, 32);
+           },
+           [](PPCContext* ctx) {
+             auto result = ctx->v[3];
+             REQUIRE(result ==
+                     vec128i(0x00000001, 0x0000FFFF, 0x00000000, 0x12345678));
+           });
+}
+
+TEST_CASE("VECTOR_SHR_I32_CONSTANT", "[instr]") {
+  TestFunction test([](hir::HIRBuilder& b) {
+    StoreVR(b, 3,
+            b.VectorShr(LoadVR(b, 4), b.LoadConstant(vec128i(0, 1, 16, 31)),
+                        INT32_TYPE));
+    StoreVR(b, 4,
+            b.VectorShr(LoadVR(b, 5), b.LoadConstant(vec128i(31, 16, 1, 32)),
+                        INT32_TYPE));
+    b.Return();
+  });
+  test.Run([](PPCContext* ctx) {
+             ctx->v[4] =
+                 vec128i(0x7FFFFFFE, 0x7FFFFFFE, 0x7FFFFFFE, 0x7FFFFFFF);
+             ctx->v[5] =
+                 vec128i(0x80000000, 0xFFFFFFFF, 0x00000001, 0x12345678);
+           },
+           [](PPCContext* ctx) {
+             auto result1 = ctx->v[3];
+             REQUIRE(result1 ==
+                     vec128i(0x7FFFFFFE, 0x3FFFFFFF, 0x00007FFF, 0x00000000));
+             auto result2 = ctx->v[4];
+             REQUIRE(result2 ==
+                     vec128i(0x00000001, 0x0000FFFF, 0x00000000, 0x12345678));
+           });
+}
diff --git a/tools/alloy-test/util.h b/tools/alloy-test/util.h
index e6673dfe5..56924dd82 100644
--- a/tools/alloy-test/util.h
+++ b/tools/alloy-test/util.h
@@ -23,7 +23,7 @@
 #include
 
 #define ALLOY_TEST_IVM 1
-//#define ALLOY_TEST_X64 1
+#define ALLOY_TEST_X64 1
 
 namespace alloy {
 namespace test {