From d1528e24bb27c0f6577ebea29d9f4e610068b28a Mon Sep 17 00:00:00 2001
From: Ben Vanik
Date: Mon, 6 Jan 2014 22:17:49 -0800
Subject: [PATCH] Removing REM, adding MUL_HI, renaming MULADD/MULSUB.

---
 src/alloy/backend/ivm/ivm_intcode.cc        | 301 +++++++++++++++---
 .../x64/lowering/lowering_sequences.cc      |  10 +-
 .../passes/constant_propagation_pass.cc     |  11 +-
 src/alloy/frontend/ppc/ppc_emit_alu.cc      |  53 ++-
 src/alloy/hir/hir_builder.cc                |  19 +-
 src/alloy/hir/hir_builder.h                 |   6 +-
 src/alloy/hir/opcodes.h                     |   7 +-
 src/alloy/hir/opcodes.inl                   |  16 +-
 src/alloy/hir/value.cc                      |   5 -
 src/alloy/hir/value.h                       |   1 -
 10 files changed, 315 insertions(+), 114 deletions(-)

diff --git a/src/alloy/backend/ivm/ivm_intcode.cc b/src/alloy/backend/ivm/ivm_intcode.cc
index 277819185..923cdb621 100644
--- a/src/alloy/backend/ivm/ivm_intcode.cc
+++ b/src/alloy/backend/ivm/ivm_intcode.cc
@@ -2204,6 +2204,22 @@ uint32_t IntCode_MUL_V128_V128(IntCodeState& ics, const IntCode* i) {
   }
   return IA_NEXT;
 }
+uint32_t IntCode_MUL_I8_I8_U(IntCodeState& ics, const IntCode* i) {
+  ics.rf[i->dest_reg].u8 = ics.rf[i->src1_reg].u8 * ics.rf[i->src2_reg].u8;
+  return IA_NEXT;
+}
+uint32_t IntCode_MUL_I16_I16_U(IntCodeState& ics, const IntCode* i) {
+  ics.rf[i->dest_reg].u16 = ics.rf[i->src1_reg].u16 * ics.rf[i->src2_reg].u16;
+  return IA_NEXT;
+}
+uint32_t IntCode_MUL_I32_I32_U(IntCodeState& ics, const IntCode* i) {
+  ics.rf[i->dest_reg].u32 = ics.rf[i->src1_reg].u32 * ics.rf[i->src2_reg].u32;
+  return IA_NEXT;
+}
+uint32_t IntCode_MUL_I64_I64_U(IntCodeState& ics, const IntCode* i) {
+  ics.rf[i->dest_reg].u64 = ics.rf[i->src1_reg].u64 * ics.rf[i->src2_reg].u64;
+  return IA_NEXT;
+}
 int Translate_MUL(TranslationContext& ctx, Instr* i) {
   static IntCodeFn fns[] = {
     IntCode_MUL_I8_I8,
@@ -2214,34 +2230,184 @@ int Translate_MUL(TranslationContext& ctx, Instr* i) {
     IntCode_MUL_F64_F64,
     IntCode_MUL_V128_V128,
   };
-  return DispatchToC(ctx, i, fns[i->dest->type]);
+  static IntCodeFn fns_unsigned[] = {
+    IntCode_MUL_I8_I8_U,
+    IntCode_MUL_I16_I16_U,
+    IntCode_MUL_I32_I32_U,
+    IntCode_MUL_I64_I64_U,
+    IntCode_INVALID_TYPE,
+    IntCode_INVALID_TYPE,
+    IntCode_INVALID_TYPE,
+  };
+  if (i->flags & ARITHMETIC_UNSIGNED) {
+    return DispatchToC(ctx, i, fns_unsigned[i->dest->type]);
+  } else {
+    return DispatchToC(ctx, i, fns[i->dest->type]);
+  }
 }
-uint32_t IntCode_DIV_I8(IntCodeState& ics, const IntCode* i) {
-  ics.rf[i->dest_reg].i8 = ics.rf[i->src1_reg].u8 / ics.rf[i->src2_reg].u8;
+namespace {
+uint64_t Mul128(uint64_t xi_low, uint64_t xi_high,
+                uint64_t yi_low, uint64_t yi_high) {
+  // 128bit multiply, simplified for two input 64bit integers.
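+  // Only the high 64 bits of the 128-bit product are returned; rv2_lo
+  // below assembles the low half but is otherwise unused. Each operand is
+  // split into 32-bit limbs (a..d for x, e..h for y) and the partial
+  // products are summed column by column, detecting carry out of each
+  // 64-bit accumulation with the (sum < addend) comparison.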
+  // http://mrob.com/pub/math/int128.c.txt
+#define HI_WORD 0xFFFFFFFF00000000LL
+#define LO_WORD 0x00000000FFFFFFFFLL
+  uint64_t d = xi_low & LO_WORD;
+  uint64_t c = (xi_low & HI_WORD) >> 32LL;
+  uint64_t b = xi_high & LO_WORD;
+  uint64_t a = (xi_high & HI_WORD) >> 32LL;
+  uint64_t h = yi_low & LO_WORD;
+  uint64_t g = (yi_low & HI_WORD) >> 32LL;
+  uint64_t f = yi_high & LO_WORD;
+  uint64_t e = (yi_high & HI_WORD) >> 32LL;
+  uint64_t acc = d * h;
+  uint64_t o1 = acc & LO_WORD;
+  acc >>= 32LL;
+  uint64_t carry = 0;
+
+  uint64_t ac2 = acc + c * h; if (ac2 < acc) { carry++; }
+  acc = ac2 + d * g; if (acc < ac2) { carry++; }
+  uint64_t rv2_lo = o1 | (acc << 32LL);
+  ac2 = (acc >> 32LL) | (carry << 32LL); carry = 0;
+
+  acc = ac2 + b * h; if (acc < ac2) { carry++; }
+  ac2 = acc + c * g; if (ac2 < acc) { carry++; }
+  acc = ac2 + d * f; if (acc < ac2) { carry++; }
+  uint64_t o2 = acc & LO_WORD;
+  ac2 = (acc >> 32LL) | (carry << 32LL);
+
+  acc = ac2 + a * h;
+  ac2 = acc + b * g;
+  acc = ac2 + c * f;
+  ac2 = acc + d * e;
+  uint64_t rv2_hi = (ac2 << 32LL) | o2;
+
+  return rv2_hi;
+}
+}
+
+uint32_t IntCode_MUL_HI_I8_I8(IntCodeState& ics, const IntCode* i) {
+  int16_t v =
+      (int16_t)ics.rf[i->src1_reg].i8 * (int16_t)ics.rf[i->src2_reg].i8;
+  ics.rf[i->dest_reg].i8 = (v >> 8);
   return IA_NEXT;
 }
-uint32_t IntCode_DIV_I16(IntCodeState& ics, const IntCode* i) {
-  ics.rf[i->dest_reg].i16 = ics.rf[i->src1_reg].u16 / ics.rf[i->src2_reg].u16;
+uint32_t IntCode_MUL_HI_I16_I16(IntCodeState& ics, const IntCode* i) {
+  int32_t v =
+      (int32_t)ics.rf[i->src1_reg].i16 * (int32_t)ics.rf[i->src2_reg].i16;
+  ics.rf[i->dest_reg].i16 = (v >> 16);
   return IA_NEXT;
 }
-uint32_t IntCode_DIV_I32(IntCodeState& ics, const IntCode* i) {
-  ics.rf[i->dest_reg].i32 = ics.rf[i->src1_reg].u32 / ics.rf[i->src2_reg].u32;
+uint32_t IntCode_MUL_HI_I32_I32(IntCodeState& ics, const IntCode* i) {
+  int64_t v =
+      (int64_t)ics.rf[i->src1_reg].i32 * (int64_t)ics.rf[i->src2_reg].i32;
+  ics.rf[i->dest_reg].i32 = (v >> 32);
   return IA_NEXT;
 }
-uint32_t IntCode_DIV_I64(IntCodeState& ics, const IntCode* i) {
-  ics.rf[i->dest_reg].i64 = ics.rf[i->src1_reg].u64 / ics.rf[i->src2_reg].u64;
+uint32_t IntCode_MUL_HI_I64_I64(IntCodeState& ics, const IntCode* i) {
+#if !XE_COMPILER(MSVC)
+  // GCC can, in theory, do this:
+  __int128 v =
+      (__int128)ics.rf[i->src1_reg].i64 * (__int128)ics.rf[i->src2_reg].i64;
+  ics.rf[i->dest_reg].i64 = (v >> 64);
+#else
+  // 128bit multiply, simplified for two input 64bit integers.
+  // http://mrob.com/pub/math/int128.c.txt
+  int64_t xi_low = ics.rf[i->src1_reg].i64;
+  int64_t xi_high = xi_low < 0 ? -1 : 0;
+  int64_t yi_low = ics.rf[i->src2_reg].i64;
+  int64_t yi_high = yi_low < 0 ? -1 : 0;
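+  // Sign-extending the high words makes the unsigned helper compute the
+  // two's-complement product: modulo 2**128 the results are identical, so
+  // the top 64 bits that come back are the signed high half.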
+  ics.rf[i->dest_reg].i64 = Mul128(xi_low, xi_high, yi_low, yi_high);
+#endif  // !MSVC
   return IA_NEXT;
 }
-uint32_t IntCode_DIV_F32(IntCodeState& ics, const IntCode* i) {
+uint32_t IntCode_MUL_HI_I8_I8_U(IntCodeState& ics, const IntCode* i) {
+  uint16_t v =
+      (uint16_t)ics.rf[i->src1_reg].u8 * (uint16_t)ics.rf[i->src2_reg].u8;
+  ics.rf[i->dest_reg].u8 = (v >> 8);
+  return IA_NEXT;
+}
+uint32_t IntCode_MUL_HI_I16_I16_U(IntCodeState& ics, const IntCode* i) {
+  uint32_t v =
+      (uint32_t)ics.rf[i->src1_reg].u16 * (uint32_t)ics.rf[i->src2_reg].u16;
+  ics.rf[i->dest_reg].u16 = (v >> 16);
+  return IA_NEXT;
+}
+uint32_t IntCode_MUL_HI_I32_I32_U(IntCodeState& ics, const IntCode* i) {
+  uint64_t v =
+      (uint64_t)ics.rf[i->src1_reg].u32 * (uint64_t)ics.rf[i->src2_reg].u32;
+  ics.rf[i->dest_reg].u32 = (v >> 32);
+  return IA_NEXT;
+}
+uint32_t IntCode_MUL_HI_I64_I64_U(IntCodeState& ics, const IntCode* i) {
+#if !XE_COMPILER(MSVC)
+  // GCC can, in theory, do this:
+  unsigned __int128 v = (unsigned __int128)ics.rf[i->src1_reg].u64 *
+                        (unsigned __int128)ics.rf[i->src2_reg].u64;
+  ics.rf[i->dest_reg].u64 = (v >> 64);
+#else
+  // 128bit multiply, simplified for two input 64bit integers.
+  // http://mrob.com/pub/math/int128.c.txt
+  uint64_t xi_low = ics.rf[i->src1_reg].u64;
+  uint64_t xi_high = 0;
+  uint64_t yi_low = ics.rf[i->src2_reg].u64;
+  uint64_t yi_high = 0;
+  ics.rf[i->dest_reg].u64 = Mul128(xi_low, xi_high, yi_low, yi_high);
+#endif  // !MSVC
+  return IA_NEXT;
+}
+int Translate_MUL_HI(TranslationContext& ctx, Instr* i) {
+  static IntCodeFn fns[] = {
+    IntCode_MUL_HI_I8_I8,
+    IntCode_MUL_HI_I16_I16,
+    IntCode_MUL_HI_I32_I32,
+    IntCode_MUL_HI_I64_I64,
+    IntCode_INVALID_TYPE,
+    IntCode_INVALID_TYPE,
+    IntCode_INVALID_TYPE,
+  };
+  static IntCodeFn fns_unsigned[] = {
+    IntCode_MUL_HI_I8_I8_U,
+    IntCode_MUL_HI_I16_I16_U,
+    IntCode_MUL_HI_I32_I32_U,
+    IntCode_MUL_HI_I64_I64_U,
+    IntCode_INVALID_TYPE,
+    IntCode_INVALID_TYPE,
+    IntCode_INVALID_TYPE,
+  };
+  if (i->flags & ARITHMETIC_UNSIGNED) {
+    return DispatchToC(ctx, i, fns_unsigned[i->dest->type]);
+  } else {
+    return DispatchToC(ctx, i, fns[i->dest->type]);
+  }
+}
+
+uint32_t IntCode_DIV_I8_I8(IntCodeState& ics, const IntCode* i) {
+  ics.rf[i->dest_reg].i8 = ics.rf[i->src1_reg].i8 / ics.rf[i->src2_reg].i8;
+  return IA_NEXT;
+}
+uint32_t IntCode_DIV_I16_I16(IntCodeState& ics, const IntCode* i) {
+  ics.rf[i->dest_reg].i16 = ics.rf[i->src1_reg].i16 / ics.rf[i->src2_reg].i16;
+  return IA_NEXT;
+}
+uint32_t IntCode_DIV_I32_I32(IntCodeState& ics, const IntCode* i) {
+  ics.rf[i->dest_reg].i32 = ics.rf[i->src1_reg].i32 / ics.rf[i->src2_reg].i32;
+  return IA_NEXT;
+}
+uint32_t IntCode_DIV_I64_I64(IntCodeState& ics, const IntCode* i) {
+  ics.rf[i->dest_reg].i64 = ics.rf[i->src1_reg].i64 / ics.rf[i->src2_reg].i64;
+  return IA_NEXT;
+}
+uint32_t IntCode_DIV_F32_F32(IntCodeState& ics, const IntCode* i) {
   ics.rf[i->dest_reg].f32 = ics.rf[i->src1_reg].f32 / ics.rf[i->src2_reg].f32;
   return IA_NEXT;
 }
-uint32_t IntCode_DIV_F64(IntCodeState& ics, const IntCode* i) {
+uint32_t IntCode_DIV_F64_F64(IntCodeState& ics, const IntCode* i) {
   ics.rf[i->dest_reg].f64 = ics.rf[i->src1_reg].f64 / ics.rf[i->src2_reg].f64;
   return IA_NEXT;
 }
-uint32_t IntCode_DIV_V128(IntCodeState& ics, const IntCode* i) {
+uint32_t IntCode_DIV_V128_V128(IntCodeState& ics, const IntCode* i) {
   const vec128_t& src1 = ics.rf[i->src1_reg].v128;
   const vec128_t& src2 = ics.rf[i->src2_reg].v128;
   vec128_t& dest = ics.rf[i->dest_reg].v128;
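
Worth noting about the handlers above: every narrow MUL_HI simply widens to the next integer size, multiplies, and keeps the top half; only the 64-bit case needs __int128 or the Mul128 helper. The signed and unsigned variants are not interchangeable, which a small host-side check makes obvious (standalone C++, illustrative only, not part of the patch):

    #include <cassert>
    #include <cstdint>

    // Mirrors IntCode_MUL_HI_I32_I32: widen, multiply, keep the top 32 bits.
    int32_t mul_hi_i32(int32_t a, int32_t b) {
      int64_t v = (int64_t)a * (int64_t)b;
      return (int32_t)(v >> 32);
    }

    // Mirrors IntCode_MUL_HI_I32_I32_U: same shape, unsigned throughout.
    uint32_t mul_hi_u32(uint32_t a, uint32_t b) {
      uint64_t v = (uint64_t)a * (uint64_t)b;
      return (uint32_t)(v >> 32);
    }

    int main() {
      // Same bit pattern, different answers: (-1) * (-1) = 1 has a zero
      // high half, while 0xFFFFFFFF * 0xFFFFFFFF = 0xFFFFFFFE00000001.
      assert(mul_hi_i32(-1, -1) == 0);
      assert(mul_hi_u32(0xFFFFFFFFu, 0xFFFFFFFFu) == 0xFFFFFFFEu);
      return 0;
    }
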
@@ -2250,45 +2416,74 @@ uint32_t IntCode_DIV_V128(IntCodeState& ics, const IntCode* i) {
   }
   return IA_NEXT;
 }
+uint32_t IntCode_DIV_I8_I8_U(IntCodeState& ics, const IntCode* i) {
+  ics.rf[i->dest_reg].u8 = ics.rf[i->src1_reg].u8 / ics.rf[i->src2_reg].u8;
+  return IA_NEXT;
+}
+uint32_t IntCode_DIV_I16_I16_U(IntCodeState& ics, const IntCode* i) {
+  ics.rf[i->dest_reg].u16 = ics.rf[i->src1_reg].u16 / ics.rf[i->src2_reg].u16;
+  return IA_NEXT;
+}
+uint32_t IntCode_DIV_I32_I32_U(IntCodeState& ics, const IntCode* i) {
+  ics.rf[i->dest_reg].u32 = ics.rf[i->src1_reg].u32 / ics.rf[i->src2_reg].u32;
+  return IA_NEXT;
+}
+uint32_t IntCode_DIV_I64_I64_U(IntCodeState& ics, const IntCode* i) {
+  ics.rf[i->dest_reg].u64 = ics.rf[i->src1_reg].u64 / ics.rf[i->src2_reg].u64;
+  return IA_NEXT;
+}
 int Translate_DIV(TranslationContext& ctx, Instr* i) {
   static IntCodeFn fns[] = {
-    IntCode_DIV_I8,
-    IntCode_DIV_I16,
-    IntCode_DIV_I32,
-    IntCode_DIV_I64,
-    IntCode_DIV_F32,
-    IntCode_DIV_F64,
-    IntCode_DIV_V128,
+    IntCode_DIV_I8_I8,
+    IntCode_DIV_I16_I16,
+    IntCode_DIV_I32_I32,
+    IntCode_DIV_I64_I64,
+    IntCode_DIV_F32_F32,
+    IntCode_DIV_F64_F64,
+    IntCode_DIV_V128_V128,
   };
-  return DispatchToC(ctx, i, fns[i->dest->type]);
+  static IntCodeFn fns_unsigned[] = {
+    IntCode_DIV_I8_I8_U,
+    IntCode_DIV_I16_I16_U,
+    IntCode_DIV_I32_I32_U,
+    IntCode_DIV_I64_I64_U,
+    IntCode_INVALID_TYPE,
+    IntCode_INVALID_TYPE,
+    IntCode_INVALID_TYPE,
+  };
+  if (i->flags & ARITHMETIC_UNSIGNED) {
+    return DispatchToC(ctx, i, fns_unsigned[i->dest->type]);
+  } else {
+    return DispatchToC(ctx, i, fns[i->dest->type]);
+  }
 }
 
 // TODO(benvanik): use intrinsics or something
-uint32_t IntCode_MULADD_I8(IntCodeState& ics, const IntCode* i) {
+uint32_t IntCode_MUL_ADD_I8(IntCodeState& ics, const IntCode* i) {
   ics.rf[i->dest_reg].i8 = ics.rf[i->src1_reg].i8 * ics.rf[i->src2_reg].i8 + ics.rf[i->src3_reg].i8;
   return IA_NEXT;
 }
-uint32_t IntCode_MULADD_I16(IntCodeState& ics, const IntCode* i) {
+uint32_t IntCode_MUL_ADD_I16(IntCodeState& ics, const IntCode* i) {
   ics.rf[i->dest_reg].i16 = ics.rf[i->src1_reg].i16 * ics.rf[i->src2_reg].i16 + ics.rf[i->src3_reg].i16;
   return IA_NEXT;
 }
-uint32_t IntCode_MULADD_I32(IntCodeState& ics, const IntCode* i) {
+uint32_t IntCode_MUL_ADD_I32(IntCodeState& ics, const IntCode* i) {
   ics.rf[i->dest_reg].i32 = ics.rf[i->src1_reg].i32 * ics.rf[i->src2_reg].i32 + ics.rf[i->src3_reg].i32;
   return IA_NEXT;
 }
-uint32_t IntCode_MULADD_I64(IntCodeState& ics, const IntCode* i) {
+uint32_t IntCode_MUL_ADD_I64(IntCodeState& ics, const IntCode* i) {
   ics.rf[i->dest_reg].i64 = ics.rf[i->src1_reg].i64 * ics.rf[i->src2_reg].i64 + ics.rf[i->src3_reg].i64;
   return IA_NEXT;
 }
-uint32_t IntCode_MULADD_F32(IntCodeState& ics, const IntCode* i) {
+uint32_t IntCode_MUL_ADD_F32(IntCodeState& ics, const IntCode* i) {
   ics.rf[i->dest_reg].f32 = ics.rf[i->src1_reg].f32 * ics.rf[i->src2_reg].f32 + ics.rf[i->src3_reg].f32;
   return IA_NEXT;
 }
-uint32_t IntCode_MULADD_F64(IntCodeState& ics, const IntCode* i) {
+uint32_t IntCode_MUL_ADD_F64(IntCodeState& ics, const IntCode* i) {
   ics.rf[i->dest_reg].f64 = ics.rf[i->src1_reg].f64 * ics.rf[i->src2_reg].f64 + ics.rf[i->src3_reg].f64;
   return IA_NEXT;
 }
-uint32_t IntCode_MULADD_V128(IntCodeState& ics, const IntCode* i) {
+uint32_t IntCode_MUL_ADD_V128(IntCodeState& ics, const IntCode* i) {
   const vec128_t& src1 = ics.rf[i->src1_reg].v128;
   const vec128_t& src2 = ics.rf[i->src2_reg].v128;
   const vec128_t& src3 = ics.rf[i->src3_reg].v128;
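
The duplicated table in Translate_DIV exists because the quotient depends on whether the operands' bits are interpreted as signed or unsigned; the register file only stores bits, so signedness has to ride along on the instruction flags. A standalone illustration (assumed values, not from the patch):

    #include <cassert>
    #include <cstdint>

    int main() {
      uint32_t bits = 0xFFFFFFF6u;  // the two's-complement encoding of -10
      // Signed view: -10 / 3 == -3 (C++ division truncates toward zero).
      assert((int32_t)bits / 3 == -3);
      // Unsigned view: 4294967286 / 3 == 1431655762.
      assert(bits / 3u == 1431655762u);
      return 0;
    }
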
@@ -2298,45 +2493,45 @@ uint32_t IntCode_MULADD_V128(IntCodeState& ics, const IntCode* i) {
   }
   return IA_NEXT;
 }
-int Translate_MULADD(TranslationContext& ctx, Instr* i) {
+int Translate_MUL_ADD(TranslationContext& ctx, Instr* i) {
   static IntCodeFn fns[] = {
-    IntCode_MULADD_I8,
-    IntCode_MULADD_I16,
-    IntCode_MULADD_I32,
-    IntCode_MULADD_I64,
-    IntCode_MULADD_F32,
-    IntCode_MULADD_F64,
-    IntCode_MULADD_V128,
+    IntCode_MUL_ADD_I8,
+    IntCode_MUL_ADD_I16,
+    IntCode_MUL_ADD_I32,
+    IntCode_MUL_ADD_I64,
+    IntCode_MUL_ADD_F32,
+    IntCode_MUL_ADD_F64,
+    IntCode_MUL_ADD_V128,
   };
   return DispatchToC(ctx, i, fns[i->dest->type]);
 }
 
 // TODO(benvanik): use intrinsics or something
-uint32_t IntCode_MULSUB_I8(IntCodeState& ics, const IntCode* i) {
+uint32_t IntCode_MUL_SUB_I8(IntCodeState& ics, const IntCode* i) {
   ics.rf[i->dest_reg].i8 = ics.rf[i->src1_reg].i8 * ics.rf[i->src2_reg].i8 - ics.rf[i->src3_reg].i8;
   return IA_NEXT;
 }
-uint32_t IntCode_MULSUB_I16(IntCodeState& ics, const IntCode* i) {
+uint32_t IntCode_MUL_SUB_I16(IntCodeState& ics, const IntCode* i) {
   ics.rf[i->dest_reg].i16 = ics.rf[i->src1_reg].i16 * ics.rf[i->src2_reg].i16 - ics.rf[i->src3_reg].i16;
   return IA_NEXT;
 }
-uint32_t IntCode_MULSUB_I32(IntCodeState& ics, const IntCode* i) {
+uint32_t IntCode_MUL_SUB_I32(IntCodeState& ics, const IntCode* i) {
   ics.rf[i->dest_reg].i32 = ics.rf[i->src1_reg].i32 * ics.rf[i->src2_reg].i32 - ics.rf[i->src3_reg].i32;
   return IA_NEXT;
 }
-uint32_t IntCode_MULSUB_I64(IntCodeState& ics, const IntCode* i) {
+uint32_t IntCode_MUL_SUB_I64(IntCodeState& ics, const IntCode* i) {
   ics.rf[i->dest_reg].i64 = ics.rf[i->src1_reg].i64 * ics.rf[i->src2_reg].i64 - ics.rf[i->src3_reg].i64;
   return IA_NEXT;
 }
-uint32_t IntCode_MULSUB_F32(IntCodeState& ics, const IntCode* i) {
+uint32_t IntCode_MUL_SUB_F32(IntCodeState& ics, const IntCode* i) {
   ics.rf[i->dest_reg].f32 = ics.rf[i->src1_reg].f32 * ics.rf[i->src2_reg].f32 - ics.rf[i->src3_reg].f32;
   return IA_NEXT;
 }
-uint32_t IntCode_MULSUB_F64(IntCodeState& ics, const IntCode* i) {
+uint32_t IntCode_MUL_SUB_F64(IntCodeState& ics, const IntCode* i) {
   ics.rf[i->dest_reg].f64 = ics.rf[i->src1_reg].f64 * ics.rf[i->src2_reg].f64 - ics.rf[i->src3_reg].f64;
   return IA_NEXT;
 }
-uint32_t IntCode_MULSUB_V128(IntCodeState& ics, const IntCode* i) {
+uint32_t IntCode_MUL_SUB_V128(IntCodeState& ics, const IntCode* i) {
   const vec128_t& src1 = ics.rf[i->src1_reg].v128;
   const vec128_t& src2 = ics.rf[i->src2_reg].v128;
   const vec128_t& src3 = ics.rf[i->src3_reg].v128;
@@ -2346,15 +2541,15 @@ uint32_t IntCode_MULSUB_V128(IntCodeState& ics, const IntCode* i) {
   }
   return IA_NEXT;
 }
-int Translate_MULSUB(TranslationContext& ctx, Instr* i) {
+int Translate_MUL_SUB(TranslationContext& ctx, Instr* i) {
   static IntCodeFn fns[] = {
-    IntCode_MULSUB_I8,
-    IntCode_MULSUB_I16,
-    IntCode_MULSUB_I32,
-    IntCode_MULSUB_I64,
-    IntCode_MULSUB_F32,
-    IntCode_MULSUB_F64,
-    IntCode_MULSUB_V128,
+    IntCode_MUL_SUB_I8,
+    IntCode_MUL_SUB_I16,
+    IntCode_MUL_SUB_I32,
+    IntCode_MUL_SUB_I64,
+    IntCode_MUL_SUB_F32,
+    IntCode_MUL_SUB_F64,
+    IntCode_MUL_SUB_V128,
   };
   return DispatchToC(ctx, i, fns[i->dest->type]);
 }
@@ -3273,10 +3468,10 @@ static const TranslateFn dispatch_table[] = {
   Translate_ADD_CARRY,
   Translate_SUB,
   Translate_MUL,
+  Translate_MUL_HI,
   Translate_DIV,
-  TranslateInvalid,  //Translate_REM,
-  Translate_MULADD,
-  Translate_MULSUB,
+  Translate_MUL_ADD,
+  Translate_MUL_SUB,
   Translate_NEG,
   Translate_ABS,
   Translate_SQRT,
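
MUL_ADD and MUL_SUB keep the (value1 * value2) ± value3 operand order documented in hir_builder.h. The TODO about intrinsics presumably has host fused-multiply-add instructions in mind; a sketch of the scalar semantics (illustrative, not the backend code):

    #include <cassert>
    #include <cmath>

    int main() {
      float a = 2.0f, b = 3.0f, c = 1.0f;
      assert(a * b + c == 7.0f);  // mul_add: (1 * 2) + 3
      assert(a * b - c == 5.0f);  // mul_sub: (1 * 2) - 3
      // A true FMA rounds once instead of twice; for exactly representable
      // values like these it agrees with the two-step form.
      assert(std::fma(a, b, c) == 7.0f);
      return 0;
    }
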
diff --git a/src/alloy/backend/x64/lowering/lowering_sequences.cc b/src/alloy/backend/x64/lowering/lowering_sequences.cc
index ec215f84e..0419effa8 100644
--- a/src/alloy/backend/x64/lowering/lowering_sequences.cc
+++ b/src/alloy/backend/x64/lowering/lowering_sequences.cc
@@ -448,19 +448,13 @@ void alloy::backend::x64::lowering::RegisterSequences(LoweringTable* table) {
     return true;
   });
 
-  table->AddSequence(OPCODE_REM, [](LIRBuilder& lb, Instr*& instr) {
+  table->AddSequence(OPCODE_MUL_ADD, [](LIRBuilder& lb, Instr*& instr) {
     // TODO
     instr = instr->next;
     return true;
   });
 
-  table->AddSequence(OPCODE_MULADD, [](LIRBuilder& lb, Instr*& instr) {
-    // TODO
-    instr = instr->next;
-    return true;
-  });
-
-  table->AddSequence(OPCODE_MULSUB, [](LIRBuilder& lb, Instr*& instr) {
+  table->AddSequence(OPCODE_MUL_SUB, [](LIRBuilder& lb, Instr*& instr) {
     // TODO
     instr = instr->next;
     return true;
diff --git a/src/alloy/compiler/passes/constant_propagation_pass.cc b/src/alloy/compiler/passes/constant_propagation_pass.cc
index af00e1608..0bf269334 100644
--- a/src/alloy/compiler/passes/constant_propagation_pass.cc
+++ b/src/alloy/compiler/passes/constant_propagation_pass.cc
@@ -209,15 +209,8 @@ int ConstantPropagationPass::Run(HIRBuilder* builder) {
           i->Remove();
         }
         break;
-      case OPCODE_REM:
-        if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
-          v->set_from(i->src1.value);
-          v->Rem(i->src2.value);
-          i->Remove();
-        }
-        break;
-      // case OPCODE_MULADD:
-      // case OPCODE_MULSUB
+      // case OPCODE_MUL_ADD:
+      // case OPCODE_MUL_SUB
       case OPCODE_NEG:
         if (i->src1.value->IsConstant()) {
           v->set_from(i->src1.value);
diff --git a/src/alloy/frontend/ppc/ppc_emit_alu.cc b/src/alloy/frontend/ppc/ppc_emit_alu.cc
index 019b99b52..ed9fbdf38 100644
--- a/src/alloy/frontend/ppc/ppc_emit_alu.cc
+++ b/src/alloy/frontend/ppc/ppc_emit_alu.cc
@@ -212,8 +212,8 @@ XEEMITTER(divdux, 0x7C000392, XO )(PPCHIRBuilder& f, InstrData& i) {
   // TODO(benvanik): check if zero
   // if OE=1, set XER[OV] = 1
   // else skip the divide
-  Value* v = f.Div(f.LoadGPR(i.XO.RA), divisor);
-f.StoreGPR(i.XO.RT, v);
+  Value* v = f.Div(f.LoadGPR(i.XO.RA), divisor, ARITHMETIC_UNSIGNED);
+  f.StoreGPR(i.XO.RT, v);
   if (i.XO.OE) {
     // If we are OE=1 we need to clear the overflow bit.
     //e.update_xer_with_overflow(e.get_uint64(0));
@@ -240,7 +240,7 @@ XEEMITTER(divwx, 0x7C0003D6, XO )(PPCHIRBuilder& f, InstrData& i) {
   // if OE=1, set XER[OV] = 1
   // else skip the divide
   Value* v = f.Div(f.Truncate(f.LoadGPR(i.XO.RA), INT32_TYPE), divisor);
-  v = f.ZeroExtend(v, INT64_TYPE);
+  v = f.SignExtend(v, INT64_TYPE);
   f.StoreGPR(i.XO.RT, v);
   if (i.XO.OE) {
     // If we are OE=1 we need to clear the overflow bit.
@@ -267,7 +267,8 @@ XEEMITTER(divwux, 0x7C000396, XO )(PPCHIRBuilder& f, InstrData& i) {
   // TODO(benvanik): check if zero
   // if OE=1, set XER[OV] = 1
   // else skip the divide
-  Value* v = f.Div(f.Truncate(f.LoadGPR(i.XO.RA), INT32_TYPE), divisor);
+  Value* v = f.Div(f.Truncate(f.LoadGPR(i.XO.RA), INT32_TYPE), divisor,
+                   ARITHMETIC_UNSIGNED);
   v = f.ZeroExtend(v, INT64_TYPE);
   f.StoreGPR(i.XO.RT, v);
   if (i.XO.OE) {
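
The divwx change above is a sign bug fix as much as a rename: a 32-bit quotient can be negative, and zero-extending it into the 64-bit GPR plants garbage in the high word. (The PowerPC spec leaves the upper 32 bits of RT undefined for divw, but keeping 32-bit results sign-extended in 64-bit registers is the convention these emitters follow.) A standalone illustration:

    #include <cassert>
    #include <cstdint>

    int main() {
      int32_t quotient = -2;  // e.g. a divw result of -6 / 3
      // SignExtend: the 64-bit register holds -2.
      assert((int64_t)quotient == -2LL);
      // ZeroExtend (the old behavior): the register holds 0x00000000FFFFFFFE.
      assert((uint64_t)(uint32_t)quotient == 0x00000000FFFFFFFEull);
      return 0;
    }
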
@@ -283,13 +284,34 @@
 }
 
 XEEMITTER(mulhdx, 0x7C000092, XO )(PPCHIRBuilder& f, InstrData& i) {
-  XEINSTRNOTIMPLEMENTED();
-  return 1;
+  // RT <- ((RA) × (RB) as 128)[0:63]
+  if (i.XO.OE) {
+    // With XER update.
+    XEINSTRNOTIMPLEMENTED();
+    return 1;
+  }
+  Value* v = f.MulHi(f.LoadGPR(i.XO.RA), f.LoadGPR(i.XO.RB));
+  f.StoreGPR(i.XO.RT, v);
+  if (i.XO.Rc) {
+    f.UpdateCR(0, v);
+  }
+  return 0;
 }
 
 XEEMITTER(mulhdux, 0x7C000012, XO )(PPCHIRBuilder& f, InstrData& i) {
-  XEINSTRNOTIMPLEMENTED();
-  return 1;
+  // RT <- ((RA) × (RB) as 128)[0:63]
+  if (i.XO.OE) {
+    // With XER update.
+    XEINSTRNOTIMPLEMENTED();
+    return 1;
+  }
+  Value* v = f.MulHi(
+      f.LoadGPR(i.XO.RA), f.LoadGPR(i.XO.RB), ARITHMETIC_UNSIGNED);
+  f.StoreGPR(i.XO.RT, v);
+  if (i.XO.Rc) {
+    f.UpdateCR(0, v);
+  }
+  return 0;
 }
 
 XEEMITTER(mulhwx, 0x7C000096, XO )(PPCHIRBuilder& f, InstrData& i) {
@@ -299,10 +321,9 @@ XEEMITTER(mulhwx, 0x7C000096, XO )(PPCHIRBuilder& f, InstrData& i) {
     XEINSTRNOTIMPLEMENTED();
     return 1;
   }
-  Value* v = f.Mul(
-      f.ZeroExtend(f.Truncate(f.LoadGPR(i.XO.RA), INT32_TYPE), INT64_TYPE),
-      f.ZeroExtend(f.Truncate(f.LoadGPR(i.XO.RB), INT32_TYPE), INT64_TYPE));
-  v = f.Shr(v, 32);
+  Value* v = f.SignExtend(f.MulHi(
+      f.Truncate(f.LoadGPR(i.XO.RA), INT32_TYPE),
+      f.Truncate(f.LoadGPR(i.XO.RB), INT32_TYPE)), INT64_TYPE);
   f.StoreGPR(i.XO.RT, v);
   if (i.XO.Rc) {
     f.UpdateCR(0, v);
@@ -317,10 +338,10 @@ XEEMITTER(mulhwux, 0x7C000016, XO )(PPCHIRBuilder& f, InstrData& i) {
     XEINSTRNOTIMPLEMENTED();
     return 1;
   }
-  Value* v = f.Mul(
-      f.ZeroExtend(f.Truncate(f.LoadGPR(i.XO.RA), INT32_TYPE), INT64_TYPE),
-      f.ZeroExtend(f.Truncate(f.LoadGPR(i.XO.RB), INT32_TYPE), INT64_TYPE));
-  v = f.Shr(v, 32);
+  Value* v = f.ZeroExtend(f.MulHi(
+      f.Truncate(f.LoadGPR(i.XO.RA), INT32_TYPE),
+      f.Truncate(f.LoadGPR(i.XO.RB), INT32_TYPE),
+      ARITHMETIC_UNSIGNED), INT64_TYPE);
   f.StoreGPR(i.XO.RT, v);
   if (i.XO.Rc) {
     f.UpdateCR(0, v, false);
diff --git a/src/alloy/hir/hir_builder.cc b/src/alloy/hir/hir_builder.cc
index b31907438..861e70853 100644
--- a/src/alloy/hir/hir_builder.cc
+++ b/src/alloy/hir/hir_builder.cc
@@ -1145,11 +1145,12 @@ Value* HIRBuilder::Sub(
   return i->dest;
 }
 
-Value* HIRBuilder::Mul(Value* value1, Value* value2) {
+Value* HIRBuilder::Mul(
+    Value* value1, Value* value2, uint32_t arithmetic_flags) {
   ASSERT_TYPES_EQUAL(value1, value2);
 
   Instr* i = AppendInstr(
-      OPCODE_MUL_info, 0,
+      OPCODE_MUL_info, arithmetic_flags,
       AllocValue(value1->type));
   i->set_src1(value1);
   i->set_src2(value2);
@@ -1157,11 +1158,12 @@ Value* HIRBuilder::Mul(Value* value1, Value* value2) {
   return i->dest;
 }
 
-Value* HIRBuilder::Div(Value* value1, Value* value2) {
+Value* HIRBuilder::MulHi(
+    Value* value1, Value* value2, uint32_t arithmetic_flags) {
   ASSERT_TYPES_EQUAL(value1, value2);
 
   Instr* i = AppendInstr(
-      OPCODE_DIV_info, 0,
+      OPCODE_MUL_HI_info, arithmetic_flags,
       AllocValue(value1->type));
   i->set_src1(value1);
   i->set_src2(value2);
@@ -1169,11 +1171,12 @@ Value* HIRBuilder::Div(Value* value1, Value* value2) {
   return i->dest;
 }
 
-Value* HIRBuilder::Rem(Value* value1, Value* value2) {
+Value* HIRBuilder::Div(
+    Value* value1, Value* value2, uint32_t arithmetic_flags) {
   ASSERT_TYPES_EQUAL(value1, value2);
 
   Instr* i = AppendInstr(
-      OPCODE_REM_info, 0,
+      OPCODE_DIV_info, arithmetic_flags,
       AllocValue(value1->type));
   i->set_src1(value1);
   i->set_src2(value2);
@@ -1194,7 +1197,7 @@ Value* HIRBuilder::MulAdd(Value* value1, Value* value2, Value* value3) {
   }
 
   Instr* i = AppendInstr(
-      OPCODE_MULADD_info, 0,
+      OPCODE_MUL_ADD_info, 0,
       AllocValue(value1->type));
   i->set_src1(value1);
   i->set_src2(value2);
@@ -1215,7 +1218,7 @@ Value* HIRBuilder::MulSub(Value* value1, Value* value2, Value* value3) {
   }
 
   Instr* i = AppendInstr(
-      OPCODE_MULSUB_info, 0,
+      OPCODE_MUL_SUB_info, 0,
       AllocValue(value1->type));
   i->set_src1(value1);
   i->set_src2(value2);
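
With MulHi in the builder, mulhwx and mulhwux no longer zero-extend to 64 bits, multiply, and shift right by 32; they ask for the high word directly and then extend it. In scalar terms the new mulhwux lowering computes roughly the following (a sketch of the semantics, not the emitter code):

    #include <cassert>
    #include <cstdint>

    uint64_t mulhwux(uint64_t ra, uint64_t rb) {
      uint32_t a = (uint32_t)ra;  // Truncate(..., INT32_TYPE)
      uint32_t b = (uint32_t)rb;
      uint32_t hi = (uint32_t)(((uint64_t)a * b) >> 32);  // unsigned MulHi
      return hi;                  // ZeroExtend(..., INT64_TYPE)
    }

    int main() {
      // 0x80000000 * 2 == 0x100000000, so the high 32 bits are 1.
      assert(mulhwux(0x80000000ull, 2) == 1);
      return 0;
    }
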
diff --git a/src/alloy/hir/hir_builder.h b/src/alloy/hir/hir_builder.h
index 117ff79f3..86dd94784 100644
--- a/src/alloy/hir/hir_builder.h
+++ b/src/alloy/hir/hir_builder.h
@@ -152,9 +152,9 @@ public:
              uint32_t arithmetic_flags = 0);
   Value* Sub(Value* value1, Value* value2,
              uint32_t arithmetic_flags = 0);
-  Value* Mul(Value* value1, Value* value2);
-  Value* Div(Value* value1, Value* value2);
-  Value* Rem(Value* value1, Value* value2);
+  Value* Mul(Value* value1, Value* value2, uint32_t arithmetic_flags = 0);
+  Value* MulHi(Value* value1, Value* value2, uint32_t arithmetic_flags = 0);
+  Value* Div(Value* value1, Value* value2, uint32_t arithmetic_flags = 0);
   Value* MulAdd(Value* value1, Value* value2, Value* value3); // (1 * 2) + 3
   Value* MulSub(Value* value1, Value* value2, Value* value3); // (1 * 2) - 3
   Value* Neg(Value* value);
diff --git a/src/alloy/hir/opcodes.h b/src/alloy/hir/opcodes.h
index e6b0e3b54..abe7f3940 100644
--- a/src/alloy/hir/opcodes.h
+++ b/src/alloy/hir/opcodes.h
@@ -47,6 +47,7 @@ enum PrefetchFlags {
 };
 enum ArithmeticFlags {
   ARITHMETIC_SET_CARRY = (1 << 1),
+  ARITHMETIC_UNSIGNED = (1 << 2),
 };
 enum Permutes {
   PERMUTE_XY_ZW = 0x00010405,
@@ -134,10 +135,10 @@ enum Opcode {
   OPCODE_ADD_CARRY,
   OPCODE_SUB,
   OPCODE_MUL,
+  OPCODE_MUL_HI,  // TODO(benvanik): remove this and add INT128 type.
   OPCODE_DIV,
-  OPCODE_REM,
-  OPCODE_MULADD,
-  OPCODE_MULSUB,
+  OPCODE_MUL_ADD,
+  OPCODE_MUL_SUB,
   OPCODE_NEG,
   OPCODE_ABS,
   OPCODE_SQRT,
diff --git a/src/alloy/hir/opcodes.inl b/src/alloy/hir/opcodes.inl
index a1b56775e..ef47de819 100644
--- a/src/alloy/hir/opcodes.inl
+++ b/src/alloy/hir/opcodes.inl
@@ -343,6 +343,12 @@ DEFINE_OPCODE(
     OPCODE_SIG_V_V_V,
     OPCODE_FLAG_COMMUNATIVE);
 
+DEFINE_OPCODE(
+    OPCODE_MUL_HI,
+    "mul_hi",
+    OPCODE_SIG_V_V_V,
+    OPCODE_FLAG_COMMUNATIVE);
+
 DEFINE_OPCODE(
     OPCODE_DIV,
     "div",
@@ -350,19 +356,13 @@ DEFINE_OPCODE(
     0);
 
 DEFINE_OPCODE(
-    OPCODE_REM,
-    "rem",
-    OPCODE_SIG_V_V_V,
-    0);
-
-DEFINE_OPCODE(
-    OPCODE_MULADD,
+    OPCODE_MUL_ADD,
     "mul_add",
     OPCODE_SIG_V_V_V_V,
     0);
 
 DEFINE_OPCODE(
-    OPCODE_MULSUB,
+    OPCODE_MUL_SUB,
     "mul_sub",
     OPCODE_SIG_V_V_V_V,
     0);
diff --git a/src/alloy/hir/value.cc b/src/alloy/hir/value.cc
index 88201ccf1..43d40d647 100644
--- a/src/alloy/hir/value.cc
+++ b/src/alloy/hir/value.cc
@@ -278,11 +278,6 @@ void Value::Div(Value* other) {
   }
 }
 
-void Value::Rem(Value* other) {
-  // TODO(benvanik): big matrix.
-  XEASSERTALWAYS();
-}
-
 void Value::MulAdd(Value* dest, Value* value1, Value* value2, Value* value3) {
   // TODO(benvanik): big matrix.
   XEASSERTALWAYS();
diff --git a/src/alloy/hir/value.h b/src/alloy/hir/value.h
index 8bf9f0135..814fe081d 100644
--- a/src/alloy/hir/value.h
+++ b/src/alloy/hir/value.h
@@ -185,7 +185,6 @@ public:
   void Sub(Value* other);
   void Mul(Value* other);
   void Div(Value* other);
-  void Rem(Value* other);
   static void MulAdd(Value* dest, Value* value1, Value* value2, Value* value3);
   static void MulSub(Value* dest, Value* value1, Value* value2, Value* value3);
   void Neg();
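
REM could be dropped wholesale because it was never really implemented: the IVM dispatch table already mapped it to TranslateInvalid, the x64 lowering was a stub, and Value::Rem only asserted. If a frontend later needs a remainder, it can be composed from the surviving opcodes through the truncating-division identity a % b == a - (a / b) * b; a hypothetical scalar helper:

    #include <cassert>
    #include <cstdint>

    // A frontend would emit the same shape with f.Div/f.Mul/f.Sub on values.
    int32_t rem_via_div(int32_t a, int32_t b) {
      return a - (a / b) * b;
    }

    int main() {
      assert(rem_via_div(7, 3) == 1);
      assert(rem_via_div(-7, 3) == -1);  // matches C/C++ truncated division
      return 0;
    }
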