diff --git a/src/alloy/backend/ivm/ivm_intcode.cc b/src/alloy/backend/ivm/ivm_intcode.cc index 555a904bf..f38e0f2b8 100644 --- a/src/alloy/backend/ivm/ivm_intcode.cc +++ b/src/alloy/backend/ivm/ivm_intcode.cc @@ -3071,12 +3071,10 @@ uint32_t IntCode_PERMUTE_V128_BY_INT32(IntCodeState& ics, const IntCode* i) { const vec128_t& src3 = ics.rf[i->src3_reg].v128; vec128_t& dest = ics.rf[i->dest_reg].v128; for (size_t i = 0; i < 4; i++) { -#define SWAP_INLINE(x) (((x) & ~0x3) + (3 - ((x) % 4))) - size_t m = SWAP_INLINE(i); - size_t b = (src1 >> (m * 8)) & 0x3; - dest.i4[m] = b < 4 ? - src2.i4[SWAP_INLINE(b)] : - src3.i4[SWAP_INLINE(b - 4)]; + size_t b = (src1 >> ((3 - i) * 8)) & 0x7; + dest.i4[i] = b < 4 ? + src2.i4[b] : + src3.i4[b - 4]; } return IA_NEXT; } @@ -3086,12 +3084,10 @@ uint32_t IntCode_PERMUTE_V128_BY_V128(IntCodeState& ics, const IntCode* i) { const vec128_t& src3 = ics.rf[i->src3_reg].v128; vec128_t& dest = ics.rf[i->dest_reg].v128; for (size_t i = 0; i < 16; i++) { -#define SWAP_INLINE(x) (((x) & ~0x3) + (3 - ((x) % 4))) - size_t m = SWAP_INLINE(i); - size_t b = src1.b16[m] & 0x1F; - dest.b16[m] = b < 16 ? - src2.b16[SWAP_INLINE(b)] : - src3.b16[SWAP_INLINE(b - 16)]; + size_t b = src1.b16[i] & 0x1F; + dest.b16[i] = b < 16 ? 
+ src2.b16[b] : + src3.b16[b - 16]; } return IA_NEXT; } diff --git a/src/alloy/frontend/ppc/ppc_emit_altivec.cc b/src/alloy/frontend/ppc/ppc_emit_altivec.cc index 018f518b7..742d981c6 100644 --- a/src/alloy/frontend/ppc/ppc_emit_altivec.cc +++ b/src/alloy/frontend/ppc/ppc_emit_altivec.cc @@ -801,7 +801,7 @@ int InstrEmit_vmrghw_(PPCFunctionBuilder& f, uint32_t vd, uint32_t va, uint32_t // (VD.z) = (VA.y) // (VD.w) = (VB.y) Value* v = f.Permute( - f.LoadConstant(0x05010400), + f.LoadConstant(0x00040105), f.LoadVR(va), f.LoadVR(vb), INT32_TYPE); @@ -831,7 +831,7 @@ int InstrEmit_vmrglw_(PPCFunctionBuilder& f, uint32_t vd, uint32_t va, uint32_t // (VD.z) = (VA.w) // (VD.w) = (VB.w) Value* v = f.Permute( - f.LoadConstant(0x07030602), + f.LoadConstant(0x02060307), f.LoadVR(va), f.LoadVR(vb), INT32_TYPE); @@ -1164,7 +1164,11 @@ XEEMITTER(vrlw128, VX128(6, 80), VX128 )(PPCFunctionBuilder& f, Inst XEEMITTER(vrlimi128, VX128_4(6, 1808), VX128_4)(PPCFunctionBuilder& f, InstrData& i) { const uint32_t vd = i.VX128_4.VD128l | (i.VX128_4.VD128h << 5); const uint32_t vb = i.VX128_4.VB128l | (i.VX128_4.VB128h << 5); - uint32_t blend_mask = i.VX128_4.IMM; + uint32_t blend_mask_src = i.VX128_4.IMM; + uint32_t blend_mask = 0; + for (int n = 0; n < 4; n++) { + blend_mask |= (((blend_mask_src >> n) & 0x1) ? n : (n + 4)) << ((3 - n) * 8); + } uint32_t rotate = i.VX128_4.z; // This is just a fancy permute. 
// X Y Z W, rotated left by 2 = Z W X Y @@ -1193,8 +1197,10 @@ XEEMITTER(vrlimi128, VX128_4(6, 1808), VX128_4)(PPCFunctionBuilder& f, Inst } else { v = f.LoadVR(vb); } - v = f.Permute( - f.LoadConstant(blend_mask), v, f.LoadVR(vd), FLOAT32_TYPE); + if (blend_mask != 0x00010203) { + v = f.Permute( + f.LoadConstant(blend_mask), v, f.LoadVR(vd), INT32_TYPE); + } f.StoreVR(vd, v); return 0; } diff --git a/src/alloy/hir/opcodes.h b/src/alloy/hir/opcodes.h index bbb93c68b..4e32ef80c 100644 --- a/src/alloy/hir/opcodes.h +++ b/src/alloy/hir/opcodes.h @@ -49,7 +49,7 @@ enum ArithmeticFlags { ARITHMETIC_SET_CARRY = (1 << 1), }; enum Permutes { - PERMUTE_XY_ZW = 0x05040100, + PERMUTE_XY_ZW = 0x00010405, }; enum Swizzles { SWIZZLE_XYZW_TO_XYZW = 0xE4,