diff --git a/src/alloy/backend/ivm/ivm_intcode.cc b/src/alloy/backend/ivm/ivm_intcode.cc index 77dd2c472..dfbb49545 100644 --- a/src/alloy/backend/ivm/ivm_intcode.cc +++ b/src/alloy/backend/ivm/ivm_intcode.cc @@ -45,6 +45,17 @@ namespace ivm { //#define DPRINT if (ics.thread_state->thread_id() == 1) printf //#define DFLUSH() fflush(stdout) +#if XE_CPU_BIGENDIAN +#define VECTORBYTEOFFSET(n) (n) +#else +static const uint8_t __vector_byte_offset_table[16] = { + 3, 2, 1, 0, + 7, 6, 5, 4, + 11, 10, 9, 8, + 15, 14, 13, 12, +}; +#define VECTORBYTEOFFSET(n) (__vector_byte_offset_table[n]) +#endif uint32_t IntCode_INT_LOAD_CONSTANT(IntCodeState& ics, const IntCode* i) { // TODO(benvanik): optimize on type to avoid 16b copy per load. @@ -1245,7 +1256,7 @@ uint32_t IntCode_LOAD_VECTOR_SHL(IntCodeState& ics, const IntCode* i) { int8_t sh = MIN(16, ics.rf[i->src1_reg].i8); vec128_t& dest = ics.rf[i->dest_reg].v128; for (int n = 0; n < 16; n++) { - dest.b16[n] = __lvsl_table[sh][n]; + dest.b16[n] = __lvsl_table[sh][VECTORBYTEOFFSET(n)]; } return IA_NEXT; } @@ -1257,7 +1268,7 @@ uint32_t IntCode_LOAD_VECTOR_SHR(IntCodeState& ics, const IntCode* i) { int8_t sh = MIN(16, ics.rf[i->src1_reg].i8); vec128_t& dest = ics.rf[i->dest_reg].v128; for (int n = 0; n < 16; n++) { - dest.b16[n] = __lvsr_table[sh][n]; + dest.b16[n] = __lvsr_table[sh][VECTORBYTEOFFSET(n)]; } return IA_NEXT; } @@ -3681,16 +3692,10 @@ uint32_t IntCode_PERMUTE_V128_BY_INT32(IntCodeState& ics, const IntCode* i) { } return IA_NEXT; } -static const uint8_t __swap_table[16] = { - 3, 2, 1, 0, - 7, 6, 5, 4, - 11, 10, 9, 8, - 15, 14, 13, 12, -}; uint8_t grab(const vec128_t& src, uint8_t index) { return (index < 8 - ? (src.low >> (__swap_table[index] << 3)) - : (src.high >> ((__swap_table[index - 8]) << 3))) & 0xFF; + ? (src.low >> (VECTORBYTEOFFSET(index) << 3)) + : (src.high >> ((VECTORBYTEOFFSET(index - 8)) << 3))) & 0xFF; } uint32_t IntCode_PERMUTE_V128_BY_V128(IntCodeState& ics, const IntCode* i) { const vec128_t& table = ics.rf[i->src1_reg].v128; @@ -3699,12 +3704,12 @@ uint32_t IntCode_PERMUTE_V128_BY_V128(IntCodeState& ics, const IntCode* i) { vec128_t& dests = ics.rf[i->dest_reg].v128; dests.low = dests.high = 0; for (size_t n = 0; n < 16; n++) { - uint8_t index = table.b16[n] & 0x1F; + uint8_t index = table.b16[VECTORBYTEOFFSET(n)] & 0x1F; uint8_t value = index < 16 ? grab(src2, index) : grab(src3, index - 16); uint64_t& dest = n < 8 ? dests.low : dests.high; - uint8_t shift = __swap_table[(n < 8 ? n : (n - 8))] << 3; + uint8_t shift = VECTORBYTEOFFSET((n < 8 ? n : (n - 8))) << 3; dest |= (((uint64_t)value) << shift); } return IA_NEXT;