diff --git a/src/alloy/backend/ivm/ivm_intcode.cc b/src/alloy/backend/ivm/ivm_intcode.cc index 1d7342b83..910c32477 100644 --- a/src/alloy/backend/ivm/ivm_intcode.cc +++ b/src/alloy/backend/ivm/ivm_intcode.cc @@ -3681,16 +3681,31 @@ uint32_t IntCode_PERMUTE_V128_BY_INT32(IntCodeState& ics, const IntCode* i) { } return IA_NEXT; } +static const uint8_t __swap_table[16] = { + 3, 2, 1, 0, + 7, 6, 5, 4, + 11, 10, 9, 8, + 15, 14, 13, 12, +}; +uint8_t grab(const vec128_t& src, uint8_t index) { + return (index < 8 + ? (src.low >> (__swap_table[index] << 3)) + : (src.high >> ((__swap_table[index - 8]) << 3))) & 0xFF; +} uint32_t IntCode_PERMUTE_V128_BY_V128(IntCodeState& ics, const IntCode* i) { - const vec128_t& src1 = ics.rf[i->src1_reg].v128; + const vec128_t& table = ics.rf[i->src1_reg].v128; const vec128_t& src2 = ics.rf[i->src2_reg].v128; const vec128_t& src3 = ics.rf[i->src3_reg].v128; - vec128_t& dest = ics.rf[i->dest_reg].v128; - for (size_t i = 0; i < 16; i++) { - size_t b = src1.b16[i] & 0x1F; - dest.b16[i] = b < 16 ? - src2.b16[b] : - src3.b16[b - 16]; + vec128_t& dests = ics.rf[i->dest_reg].v128; + dests.low = dests.high = 0; + for (size_t n = 0; n < 16; n++) { + uint8_t index = table.b16[n] & 0x1F; + uint8_t value = index < 16 + ? grab(src2, index) + : grab(src3, index - 16); + uint64_t& dest = n < 8 ? dests.low : dests.high; + uint8_t shift = __swap_table[(n < 8 ? n : (n - 8))] << 3; + dest |= (((uint64_t)value) << shift); } return IA_NEXT; }