// xenia-canary/src/xenia/cpu/codegen/function_generator.cc


/**
******************************************************************************
* Xenia : Xbox 360 Emulator Research Project *
******************************************************************************
* Copyright 2013 Ben Vanik. All rights reserved. *
* Released under the BSD license - see LICENSE in the root for more details. *
******************************************************************************
*/
#include <xenia/cpu/codegen/function_generator.h>
#include <llvm/IR/Intrinsics.h>
#include <xenia/cpu/cpu-private.h>
#include <xenia/cpu/ppc/state.h>
using namespace llvm;
using namespace xe::cpu::codegen;
using namespace xe::cpu::ppc;
using namespace xe::cpu::sdb;
DEFINE_bool(memory_address_verification, false,
"Whether to add additional checks to generated memory load/stores.");
DEFINE_bool(log_codegen, false,
"Log codegen to stdout.");
/**
* This generates function code.
* One context is created for each function to generate. Each basic block in
* the function is created and stashed in one pass, then filled in the next.
*
* This context object is a stateful representation of the current machine state
* and all accessors to registers should occur through it. By doing so it's
* possible to exploit the SSA nature of LLVM to reuse register values within
* a function without needing to flush to memory.
*
* Function calls (any branch outside of the function) will result in an
* expensive flush of registers.
*
* TODO(benvanik): track arguments by looking for register reads without writes
* TODO(benvanik): avoid flushing registers for leaf nodes
 * TODO(benvanik): pass return value in LLVM return, not by memory
*/
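// Illustrative sketch of the emitted IR for a small function (names and the
// exact signature are assumptions; the real signature comes from the module
// setup elsewhere, with the ppc state pointer as the first argument):
//
//   define void @some_fn(i8* %state, ...) {
//   entry:
//     ; allocas for used registers, FillRegisters() from %state
//     br label %loc_82001234
//   loc_82001234:
//     ; translated instructions operating on the locals
//     br label %return
//   return:
//     ; SpillRegisters() back to %state
//     ret void
//   }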
FunctionGenerator::FunctionGenerator(
xe_memory_ref memory, SymbolDatabase* sdb, FunctionSymbol* fn,
LLVMContext* context, Module* gen_module, Function* gen_fn) {
memory_ = memory;
sdb_ = sdb;
fn_ = fn;
context_ = context;
gen_module_ = gen_module;
gen_fn_ = gen_fn;
builder_ = new IRBuilder<>(*context_);
fn_block_ = NULL;
return_block_ = NULL;
internal_indirection_block_ = NULL;
external_indirection_block_ = NULL;
bb_ = NULL;
access_bits_.Clear();
locals_.indirection_target = NULL;
locals_.indirection_cia = NULL;
locals_.xer = NULL;
locals_.lr = NULL;
locals_.ctr = NULL;
for (size_t n = 0; n < XECOUNT(locals_.cr); n++) {
locals_.cr[n] = NULL;
}
for (size_t n = 0; n < XECOUNT(locals_.gpr); n++) {
locals_.gpr[n] = NULL;
}
for (size_t n = 0; n < XECOUNT(locals_.fpr); n++) {
locals_.fpr[n] = NULL;
}
if (FLAGS_log_codegen) {
printf("%s:\n", fn->name());
}
}
FunctionGenerator::~FunctionGenerator() {
delete builder_;
}
SymbolDatabase* FunctionGenerator::sdb() {
return sdb_;
}
FunctionSymbol* FunctionGenerator::fn() {
return fn_;
}
llvm::LLVMContext* FunctionGenerator::context() {
return context_;
}
llvm::Module* FunctionGenerator::gen_module() {
return gen_module_;
}
llvm::Function* FunctionGenerator::gen_fn() {
return gen_fn_;
}
FunctionBlock* FunctionGenerator::fn_block() {
return fn_block_;
}
void FunctionGenerator::PushInsertPoint() {
IRBuilder<>& b = *builder_;
insert_points_.push_back(std::pair<BasicBlock*, BasicBlock::iterator>(
b.GetInsertBlock(), b.GetInsertPoint()));
}
void FunctionGenerator::PopInsertPoint() {
IRBuilder<>& b = *builder_;
std::pair<BasicBlock*, BasicBlock::iterator> back = insert_points_.back();
b.SetInsertPoint(back.first, back.second);
insert_points_.pop_back();
}
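// Illustrative use of the insert-point stack (the same pattern SetupLocal
// uses below, with b bound to *builder_): hop to the entry block to emit an
// alloca, then resume emitting where we left off.
//
//   PushInsertPoint();
//   b.SetInsertPoint(&gen_fn_->getEntryBlock());
//   Value* v = b.CreateAlloca(b.getInt64Ty(), 0, "tmp");
//   PopInsertPoint();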
void FunctionGenerator::GenerateBasicBlocks() {
IRBuilder<>& b = *builder_;
// Always add an entry block.
BasicBlock* entry = BasicBlock::Create(*context_, "entry", gen_fn_);
b.SetInsertPoint(entry);
if (FLAGS_trace_user_calls) {
SpillRegisters();
Value* traceUserCall = gen_module_->getFunction("XeTraceUserCall");
b.CreateCall4(
traceUserCall,
gen_fn_->arg_begin(),
b.getInt64(fn_->start_address),
++gen_fn_->arg_begin(),
b.getInt64((uint64_t)fn_));
}
// If this function is empty, abort!
if (!fn_->blocks.size()) {
b.CreateRetVoid();
return;
}
// Create a return block.
// This spills registers and returns. All non-tail returns should branch
// here to do the return and ensure registers are spilled.
return_block_ = BasicBlock::Create(*context_, "return", gen_fn_);
// Pass 1 creates all of the blocks - this way we can branch to them.
  // We also track registers used so that we know which ones to fill/spill.
for (std::map<uint32_t, FunctionBlock*>::iterator it = fn_->blocks.begin();
it != fn_->blocks.end(); ++it) {
FunctionBlock* block = it->second;
XEIGNORE(PrepareBasicBlock(block));
}
// Setup all local variables now that we know what we need.
SetupLocals();
// Pass 2 fills in instructions.
for (std::map<uint32_t, FunctionBlock*>::iterator it = fn_->blocks.begin();
it != fn_->blocks.end(); ++it) {
FunctionBlock* block = it->second;
GenerateBasicBlock(block);
}
// Setup the shared return/indirection/etc blocks now that we know all the
// blocks we need and all the registers used.
GenerateSharedBlocks();
}
void FunctionGenerator::GenerateSharedBlocks() {
IRBuilder<>& b = *builder_;
Value* indirect_branch = gen_module_->getFunction("XeIndirectBranch");
// Setup initial register fill in the entry block.
// We can only do this once all the locals have been created.
b.SetInsertPoint(&gen_fn_->getEntryBlock());
FillRegisters();
// Entry always falls through to the second block.
b.CreateBr(bbs_.begin()->second);
// Setup the spill block in return.
b.SetInsertPoint(return_block_);
SpillRegisters();
b.CreateRetVoid();
// Build indirection block on demand.
// We have already prepped all basic blocks, so we can build these tables now.
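  // Illustrative shape of the indirection blocks built here:
  //   external_indirection_block:
  //     ; spill registers
  //     call XeIndirectBranch(state, target, cia)
  //     ret void
  //   internal_indirection_block:
  //     switch i64 %target, label %external_indirection_block [
  //       i64 <block address>, label %loc_XXXXXXXX
  //       ...
  //     ]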
if (external_indirection_block_) {
// This will spill registers and call the external function.
// It is only meant for LK=0.
b.SetInsertPoint(external_indirection_block_);
SpillRegisters();
b.CreateCall3(indirect_branch,
gen_fn_->arg_begin(),
b.CreateLoad(locals_.indirection_target),
b.CreateLoad(locals_.indirection_cia));
b.CreateRetVoid();
}
if (internal_indirection_block_) {
// This will not spill registers and instead try to switch on local blocks.
// If it fails then the external indirection path is taken.
// NOTE: we only generate this if a likely local branch is taken.
b.SetInsertPoint(internal_indirection_block_);
SwitchInst* switch_i = b.CreateSwitch(
b.CreateLoad(locals_.indirection_target),
external_indirection_block_,
static_cast<int>(bbs_.size()));
for (std::map<uint32_t, BasicBlock*>::iterator it = bbs_.begin();
it != bbs_.end(); ++it) {
switch_i->addCase(b.getInt64(it->first), it->second);
}
}
}
int FunctionGenerator::PrepareBasicBlock(FunctionBlock* block) {
// Create the basic block that will end up getting filled during
// generation.
char name[32];
xesnprintfa(name, XECOUNT(name), "loc_%.8X", block->start_address);
BasicBlock* bb = BasicBlock::Create(*context_, name, gen_fn_);
bbs_.insert(std::pair<uint32_t, BasicBlock*>(block->start_address, bb));
// Scan and disassemble each instruction in the block to get accurate
// register access bits. In the future we could do other optimization checks
// in this pass.
// TODO(benvanik): perhaps we want to stash this for each basic block?
// We could use this for faster checking of cr/ca checks/etc.
InstrAccessBits access_bits;
uint8_t* p = xe_memory_addr(memory_, 0);
for (uint32_t ia = block->start_address; ia <= block->end_address; ia += 4) {
InstrData i;
i.address = ia;
i.code = XEGETUINT32BE(p + ia);
i.type = ppc::GetInstrType(i.code);
// Ignore unknown or ones with no disassembler fn.
if (!i.type || !i.type->disassemble) {
continue;
}
// We really need to know the registers modified, so die if we've been lazy
// and haven't implemented the disassemble method yet.
ppc::InstrDisasm d;
XEASSERTNOTNULL(i.type->disassemble);
int result_code = i.type->disassemble(i, d);
XEASSERTZERO(result_code);
if (result_code) {
return result_code;
}
// Accumulate access bits.
access_bits.Extend(d.access_bits);
}
// Add in access bits to function access bits.
access_bits_.Extend(access_bits);
return 0;
}
void FunctionGenerator::GenerateBasicBlock(FunctionBlock* block) {
IRBuilder<>& b = *builder_;
BasicBlock* bb = GetBasicBlock(block->start_address);
XEASSERTNOTNULL(bb);
if (FLAGS_log_codegen) {
printf(" bb %.8X-%.8X:\n", block->start_address, block->end_address);
}
fn_block_ = block;
bb_ = bb;
// Move the builder to this block and setup.
b.SetInsertPoint(bb);
//i->setMetadata("some.name", MDNode::get(context, MDString::get(context, pname)));
Value* invalidInstruction =
gen_module_->getFunction("XeInvalidInstruction");
Value* traceInstruction =
gen_module_->getFunction("XeTraceInstruction");
// Walk instructions in block.
uint8_t* p = xe_memory_addr(memory_, 0);
for (uint32_t ia = block->start_address; ia <= block->end_address; ia += 4) {
InstrData i;
i.address = ia;
i.code = XEGETUINT32BE(p + ia);
i.type = ppc::GetInstrType(i.code);
if (FLAGS_trace_instructions) {
SpillRegisters();
b.CreateCall3(
traceInstruction,
gen_fn_->arg_begin(),
b.getInt32(i.address),
b.getInt32(i.code));
}
if (!i.type) {
XELOGCPU(XT("Invalid instruction %.8X %.8X"), ia, i.code);
SpillRegisters();
b.CreateCall3(
invalidInstruction,
gen_fn_->arg_begin(),
b.getInt32(i.address),
b.getInt32(i.code));
continue;
}
if (FLAGS_log_codegen) {
if (i.type->disassemble) {
ppc::InstrDisasm d;
i.type->disassemble(i, d);
std::string disasm;
d.Dump(disasm);
printf(" %.8X: %.8X %s\n", ia, i.code, disasm.c_str());
} else {
printf(" %.8X: %.8X %s ???\n", ia, i.code, i.type->name);
}
}
// TODO(benvanik): debugging information? source/etc?
    // builder_->SetCurrentDebugLocation(DebugLoc::get(
// ia >> 8, ia & 0xFF, ctx->cu));
typedef int (*InstrEmitter)(FunctionGenerator& g, IRBuilder<>& b,
InstrData& i);
InstrEmitter emit = (InstrEmitter)i.type->emit;
if (!i.type->emit || emit(*this, *builder_, i)) {
// This printf is handy for sort/uniquify to find instructions.
//printf("unimplinstr %s\n", i.type->name);
XELOGCPU(XT("Unimplemented instr %.8X %.8X %s"),
ia, i.code, i.type->name);
SpillRegisters();
b.CreateCall3(
invalidInstruction,
gen_fn_->arg_begin(),
b.getInt32(i.address),
b.getInt32(i.code));
}
}
// If we fall through, create the branch.
if (block->outgoing_type == FunctionBlock::kTargetNone) {
BasicBlock* next_bb = GetNextBasicBlock();
XEASSERTNOTNULL(next_bb);
b.CreateBr(next_bb);
} else if (block->outgoing_type == FunctionBlock::kTargetUnknown) {
// Hrm.
// TODO(benvanik): assert this doesn't occur - means a bad sdb run!
XELOGCPU(XT("SDB function scan error in %.8X: bb %.8X has unknown exit"),
fn_->start_address, block->start_address);
b.CreateRetVoid();
}
// TODO(benvanik): finish up BB
}
BasicBlock* FunctionGenerator::GetBasicBlock(uint32_t address) {
std::map<uint32_t, BasicBlock*>::iterator it = bbs_.find(address);
if (it != bbs_.end()) {
return it->second;
}
return NULL;
}
BasicBlock* FunctionGenerator::GetNextBasicBlock() {
std::map<uint32_t, BasicBlock*>::iterator it = bbs_.find(
fn_block_->start_address);
++it;
if (it != bbs_.end()) {
return it->second;
}
return NULL;
}
BasicBlock* FunctionGenerator::GetReturnBasicBlock() {
return return_block_;
}
Function* FunctionGenerator::GetFunction(FunctionSymbol* fn) {
Function* result = gen_module_->getFunction(StringRef(fn->name()));
if (!result) {
XELOGE(XT("Static function not found: %.8X %s"),
fn->start_address, fn->name());
}
XEASSERTNOTNULL(result);
return result;
}
int FunctionGenerator::GenerateIndirectionBranch(uint32_t cia, Value* target,
bool lk, bool likely_local) {
// This function is called by the control emitters when they know that an
// indirect branch is required.
  // It first checks whether the branch targets an address within the function
  // and, if so, uses a local switch table. If that fails because we don't know
  // the target block, the function must be regenerated (ACK!). If the target
  // is external then an external call occurs.
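  // In short, three cases (matching the code below):
  //   likely_local && !lk: store target/cia, test target against
  //       [start_address, end_address), branch to the internal or external
  //       indirection block.
  //   !lk: store target/cia and jump to the shared external indirection block.
  //   lk: spill, call XeIndirectBranch directly, then refill and continue
  //       (or return if there is no next block, i.e. a tail position).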
IRBuilder<>& b = *builder_;
BasicBlock* next_block = GetNextBasicBlock();
PushInsertPoint();
// Request builds of the indirection blocks on demand.
// We can't build here because we don't know what registers will be needed
// yet, so we just create the blocks and let GenerateSharedBlocks handle it
// after we are done with all user instructions.
if (!external_indirection_block_) {
// Setup locals in the entry block.
b.SetInsertPoint(&gen_fn_->getEntryBlock());
locals_.indirection_target = b.CreateAlloca(
b.getInt64Ty(), 0, "indirection_target");
locals_.indirection_cia = b.CreateAlloca(
b.getInt64Ty(), 0, "indirection_cia");
external_indirection_block_ = BasicBlock::Create(
*context_, "external_indirection_block", gen_fn_, return_block_);
}
if (likely_local && !internal_indirection_block_) {
internal_indirection_block_ = BasicBlock::Create(
*context_, "internal_indirection_block", gen_fn_, return_block_);
}
PopInsertPoint();
// Check to see if the target address is within the function.
  // If it is, jump to that basic block. If the basic block is not found, it means
// we have a jump inside the function that wasn't identified via static
// analysis. These are bad as they require function regeneration.
if (likely_local) {
// Note that we only support LK=0, as we are using shared tables.
XEASSERT(!lk);
b.CreateStore(target, locals_.indirection_target);
b.CreateStore(b.getInt64(cia), locals_.indirection_cia);
Value* fn_ge_cmp = b.CreateICmpUGE(target, b.getInt64(fn_->start_address));
Value* fn_l_cmp = b.CreateICmpULT(target, b.getInt64(fn_->end_address));
Value* fn_target_cmp = b.CreateAnd(fn_ge_cmp, fn_l_cmp);
b.CreateCondBr(fn_target_cmp,
internal_indirection_block_, external_indirection_block_);
return 0;
}
// If we are LK=0 jump to the shared indirection block. This prevents us
// from needing to fill the registers again after the call and shares more
// code.
if (!lk) {
b.CreateStore(target, locals_.indirection_target);
b.CreateStore(b.getInt64(cia), locals_.indirection_cia);
b.CreateBr(external_indirection_block_);
} else {
// Slowest path - spill, call the external function, and fill.
// We should avoid this at all costs.
// Spill registers. We could probably share this.
SpillRegisters();
// TODO(benvanik): keep function pointer lookup local.
Value* indirect_branch = gen_module_->getFunction("XeIndirectBranch");
b.CreateCall3(indirect_branch,
gen_fn_->arg_begin(),
target,
b.getInt64(cia));
if (next_block) {
// Only refill if not a tail call.
FillRegisters();
b.CreateBr(next_block);
} else {
b.CreateRetVoid();
}
}
return 0;
}
Value* FunctionGenerator::LoadStateValue(uint32_t offset, Type* type,
const char* name) {
IRBuilder<>& b = *builder_;
PointerType* pointerTy = PointerType::getUnqual(type);
Function::arg_iterator args = gen_fn_->arg_begin();
Value* state_ptr = args;
Value* address = b.CreateInBoundsGEP(state_ptr, b.getInt32(offset));
Value* ptr = b.CreatePointerCast(address, pointerTy);
return b.CreateLoad(ptr, name);
}
void FunctionGenerator::StoreStateValue(uint32_t offset, Type* type,
Value* value) {
IRBuilder<>& b = *builder_;
PointerType* pointerTy = PointerType::getUnqual(type);
Function::arg_iterator args = gen_fn_->arg_begin();
Value* state_ptr = args;
Value* address = b.CreateInBoundsGEP(state_ptr, b.getInt32(offset));
Value* ptr = b.CreatePointerCast(address, pointerTy);
b.CreateStore(value, ptr);
}
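// LoadStateValue/StoreStateValue emit the IR equivalent of (illustrative C,
// assuming the first function argument is the raw xe_ppc_state_t pointer
// passed as an i8*):
//
//   *(T*)((uint8_t*)state + offset)
//
// i.e. a byte-offset GEP off the state pointer followed by a pointer cast to
// the requested type.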
void FunctionGenerator::SetupLocals() {
IRBuilder<>& b = *builder_;
uint64_t spr_t = access_bits_.spr;
if (spr_t & 0x3) {
locals_.xer = SetupLocal(b.getInt64Ty(), "xer");
}
spr_t >>= 2;
if (spr_t & 0x3) {
locals_.lr = SetupLocal(b.getInt64Ty(), "lr");
}
spr_t >>= 2;
if (spr_t & 0x3) {
locals_.ctr = SetupLocal(b.getInt64Ty(), "ctr");
}
spr_t >>= 2;
  // TODO: FPSCR
char name[32];
uint64_t cr_t = access_bits_.cr;
for (int n = 0; n < 8; n++) {
if (cr_t & 3) {
xesnprintfa(name, XECOUNT(name), "cr%d", n);
locals_.cr[n] = SetupLocal(b.getInt8Ty(), name);
}
cr_t >>= 2;
}
uint64_t gpr_t = access_bits_.gpr;
for (int n = 0; n < 32; n++) {
if (gpr_t & 3) {
xesnprintfa(name, XECOUNT(name), "r%d", n);
locals_.gpr[n] = SetupLocal(b.getInt64Ty(), name);
}
gpr_t >>= 2;
}
uint64_t fpr_t = access_bits_.fpr;
for (int n = 0; n < 32; n++) {
if (fpr_t & 3) {
xesnprintfa(name, XECOUNT(name), "f%d", n);
locals_.fpr[n] = SetupLocal(b.getDoubleTy(), name);
}
fpr_t >>= 2;
}
}
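// Worked example of the mapping above: each register gets two bits in
// access_bits_ (assumed to be read/write flags), so gpr bits 2n..2n+1
// correspond to rN. If bits 6-7 of access_bits_.gpr are set, the n == 3
// iteration allocates the "r3" local; registers never touched by the function
// get no local and are skipped by FillRegisters/SpillRegisters.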
Value* FunctionGenerator::SetupLocal(llvm::Type* type, const char* name) {
IRBuilder<>& b = *builder_;
// Insert into the entry block.
PushInsertPoint();
b.SetInsertPoint(&gen_fn_->getEntryBlock());
Value* v = b.CreateAlloca(type, 0, name);
PopInsertPoint();
return v;
}
Value* FunctionGenerator::cia_value() {
return builder_->getInt32(cia_);
}
void FunctionGenerator::FillRegisters() {
// This updates all of the local register values from the state memory.
// It should be called on function entry for initial setup and after any
// calls that may modify the registers.
// TODO(benvanik): use access flags to see if we need to do reads/writes.
// Though LLVM may do a better job than we can, except across calls.
IRBuilder<>& b = *builder_;
if (locals_.xer) {
b.CreateStore(LoadStateValue(
offsetof(xe_ppc_state_t, xer),
b.getInt64Ty()), locals_.xer);
}
if (locals_.lr) {
b.CreateStore(LoadStateValue(
offsetof(xe_ppc_state_t, lr),
b.getInt64Ty()), locals_.lr);
}
if (locals_.ctr) {
b.CreateStore(LoadStateValue(
offsetof(xe_ppc_state_t, ctr),
b.getInt64Ty()), locals_.ctr);
}
// Fill the split CR values by extracting each one from the CR.
// This could probably be done faster via an extractvalues or something.
// Perhaps we could also change it to be a vector<8*i8>.
Value* cr = NULL;
for (size_t n = 0; n < XECOUNT(locals_.cr); n++) {
Value* cr_n = locals_.cr[n];
if (!cr_n) {
continue;
}
if (!cr) {
cr = LoadStateValue(
offsetof(xe_ppc_state_t, cr),
b.getInt64Ty());
}
b.CreateStore(
b.CreateTrunc(b.CreateAnd(b.CreateLShr(cr, (28 - n * 4)), 0xF),
b.getInt8Ty()), cr_n);
}
for (size_t n = 0; n < XECOUNT(locals_.gpr); n++) {
if (locals_.gpr[n]) {
b.CreateStore(LoadStateValue(
offsetof(xe_ppc_state_t, r) + 8 * n,
b.getInt64Ty()), locals_.gpr[n]);
}
}
for (size_t n = 0; n < XECOUNT(locals_.fpr); n++) {
if (locals_.fpr[n]) {
b.CreateStore(LoadStateValue(
offsetof(xe_ppc_state_t, f) + 8 * n,
b.getDoubleTy()), locals_.fpr[n]);
}
}
}
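// Example of the CR split above: for cr2 (n == 2) the shift is 28 - 2 * 4 = 20,
// so bits 20-23 of the CR state value are masked with 0xF and truncated into
// the i8 "cr2" local. SpillRegisters performs the inverse below.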
void FunctionGenerator::SpillRegisters() {
// This flushes all local registers (if written) to the register bank and
// resets their values.
//
// TODO(benvanik): only flush if actually required, or selective flushes.
IRBuilder<>& b = *builder_;
if (locals_.xer) {
StoreStateValue(
offsetof(xe_ppc_state_t, xer),
b.getInt64Ty(),
b.CreateLoad(locals_.xer));
}
if (locals_.lr) {
StoreStateValue(
offsetof(xe_ppc_state_t, lr),
b.getInt64Ty(),
b.CreateLoad(locals_.lr));
}
if (locals_.ctr) {
StoreStateValue(
offsetof(xe_ppc_state_t, ctr),
b.getInt64Ty(),
b.CreateLoad(locals_.ctr));
}
// Stitch together all split CR values.
// TODO(benvanik): don't flush across calls?
Value* cr = NULL;
for (size_t n = 0; n < XECOUNT(locals_.cr); n++) {
Value* cr_n = locals_.cr[n];
if (!cr_n) {
continue;
}
    cr_n = b.CreateZExt(b.CreateLoad(cr_n), b.getInt64Ty());
    // Shift each field to the same bit position FillRegisters reads it from.
    if (!cr) {
      cr = b.CreateShl(cr_n, 28 - n * 4);
    } else {
      cr = b.CreateOr(cr, b.CreateShl(cr_n, 28 - n * 4));
    }
}
if (cr) {
StoreStateValue(
offsetof(xe_ppc_state_t, cr),
b.getInt64Ty(),
cr);
}
for (uint32_t n = 0; n < XECOUNT(locals_.gpr); n++) {
Value* v = locals_.gpr[n];
if (v) {
StoreStateValue(
offsetof(xe_ppc_state_t, r) + 8 * n,
b.getInt64Ty(),
b.CreateLoad(locals_.gpr[n]));
}
}
for (uint32_t n = 0; n < XECOUNT(locals_.fpr); n++) {
Value* v = locals_.fpr[n];
if (v) {
StoreStateValue(
offsetof(xe_ppc_state_t, f) + 8 * n,
b.getDoubleTy(),
b.CreateLoad(locals_.fpr[n]));
}
}
}
Value* FunctionGenerator::xer_value() {
XEASSERTNOTNULL(locals_.xer);
IRBuilder<>& b = *builder_;
return b.CreateLoad(locals_.xer);
}
void FunctionGenerator::update_xer_value(Value* value) {
XEASSERTNOTNULL(locals_.xer);
IRBuilder<>& b = *builder_;
// Extend to 64bits if needed.
if (!value->getType()->isIntegerTy(64)) {
value = b.CreateZExt(value, b.getInt64Ty());
}
b.CreateStore(value, locals_.xer);
}
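// XER bit layout relied on by the helpers below (bit numbers are LSB = 0
// within the low 32 bits of the 64-bit value): SO = bit 31, OV = bit 30,
// CA = bit 29. SO is sticky, which is why the overflow update clears only OV
// before OR-ing the new flag into both SO and OV.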
void FunctionGenerator::update_xer_with_overflow(Value* value) {
XEASSERTNOTNULL(locals_.xer);
IRBuilder<>& b = *builder_;
  // Expects an i1 indicating overflow.
// Trust the caller that if it's larger than that it's already truncated.
if (!value->getType()->isIntegerTy(64)) {
value = b.CreateZExt(value, b.getInt64Ty());
}
Value* xer = xer_value();
xer = b.CreateAnd(xer, 0xFFFFFFFFBFFFFFFF); // clear bit 30
xer = b.CreateOr(xer, b.CreateShl(value, 31));
xer = b.CreateOr(xer, b.CreateShl(value, 30));
b.CreateStore(xer, locals_.xer);
}
void FunctionGenerator::update_xer_with_carry(Value* value) {
XEASSERTNOTNULL(locals_.xer);
IRBuilder<>& b = *builder_;
  // Expects an i1 indicating carry.
// Trust the caller that if it's larger than that it's already truncated.
if (!value->getType()->isIntegerTy(64)) {
value = b.CreateZExt(value, b.getInt64Ty());
}
Value* xer = xer_value();
xer = b.CreateAnd(xer, 0xFFFFFFFFDFFFFFFF); // clear bit 29
xer = b.CreateOr(xer, b.CreateShl(value, 29));
b.CreateStore(xer, locals_.xer);
}
void FunctionGenerator::update_xer_with_overflow_and_carry(Value* value) {
XEASSERTNOTNULL(locals_.xer);
IRBuilder<>& b = *builder_;
  // Expects an i1 indicating overflow.
// Trust the caller that if it's larger than that it's already truncated.
if (!value->getType()->isIntegerTy(64)) {
value = b.CreateZExt(value, b.getInt64Ty());
}
  // This is effectively an update_xer_with_overflow followed by an
  // update_xer_with_carry; since the logic is largely the same, share it.
Value* xer = xer_value();
xer = b.CreateAnd(xer, 0xFFFFFFFF9FFFFFFF); // clear bit 30 & 29
xer = b.CreateOr(xer, b.CreateShl(value, 31));
xer = b.CreateOr(xer, b.CreateShl(value, 30));
xer = b.CreateOr(xer, b.CreateShl(value, 29));
b.CreateStore(xer, locals_.xer);
}
Value* FunctionGenerator::lr_value() {
XEASSERTNOTNULL(locals_.lr);
IRBuilder<>& b = *builder_;
return b.CreateLoad(locals_.lr);
}
void FunctionGenerator::update_lr_value(Value* value) {
XEASSERTNOTNULL(locals_.lr);
IRBuilder<>& b = *builder_;
// Extend to 64bits if needed.
if (!value->getType()->isIntegerTy(64)) {
value = b.CreateZExt(value, b.getInt64Ty());
}
b.CreateStore(value, locals_.lr);
}
Value* FunctionGenerator::ctr_value() {
XEASSERTNOTNULL(locals_.ctr);
IRBuilder<>& b = *builder_;
return b.CreateLoad(locals_.ctr);
}
void FunctionGenerator::update_ctr_value(Value* value) {
XEASSERTNOTNULL(locals_.ctr);
IRBuilder<>& b = *builder_;
// Extend to 64bits if needed.
if (!value->getType()->isIntegerTy(64)) {
value = b.CreateZExt(value, b.getInt64Ty());
}
b.CreateStore(value, locals_.ctr);
}
Value* FunctionGenerator::cr_value(uint32_t n) {
  XEASSERT(n < 8);
XEASSERTNOTNULL(locals_.cr[n]);
IRBuilder<>& b = *builder_;
Value* v = b.CreateLoad(locals_.cr[n]);
v = b.CreateZExt(v, b.getInt64Ty());
return v;
}
void FunctionGenerator::update_cr_value(uint32_t n, Value* value) {
  XEASSERT(n < 8);
XEASSERTNOTNULL(locals_.cr[n]);
IRBuilder<>& b = *builder_;
// Truncate to 8 bits if needed.
// TODO(benvanik): also widen?
if (!value->getType()->isIntegerTy(8)) {
value = b.CreateTrunc(value, b.getInt8Ty());
}
b.CreateStore(value, locals_.cr[n]);
}
void FunctionGenerator::update_cr_with_cond(
uint32_t n, Value* lhs, Value* rhs, bool is_signed) {
IRBuilder<>& b = *builder_;
// bit0 = RA < RB
// bit1 = RA > RB
// bit2 = RA = RB
// bit3 = XER[SO]
// TODO(benvanik): inline this using the x86 cmp instruction - this prevents
// the need for a lot of the compares and ensures we lower to the best
// possible x86.
// Value* cmp = InlineAsm::get(
// FunctionType::get(),
// "cmp $0, $1 \n"
// "mov from compare registers \n",
// "r,r", ??
// true);
Value* is_lt = is_signed ?
b.CreateICmpSLT(lhs, rhs) : b.CreateICmpULT(lhs, rhs);
Value* is_gt = is_signed ?
b.CreateICmpSGT(lhs, rhs) : b.CreateICmpUGT(lhs, rhs);
Value* cp = b.CreateSelect(is_gt, b.getInt8(1 << 1), b.getInt8(1 << 2));
Value* c = b.CreateSelect(is_lt, b.getInt8(1 << 0), cp);
// TODO(benvanik): set bit 4 to XER[SO]
// Insert the 4 bits into their location in the CR.
update_cr_value(n, c);
}
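// Worked example: for a signed compare with lhs < rhs, is_lt is true and c is
// 1 << 0 (LT); lhs > rhs yields 1 << 1 (GT); equality yields 1 << 2 (EQ). The
// SO copy into 1 << 3 is still TODO above.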
Value* FunctionGenerator::gpr_value(uint32_t n) {
  XEASSERT(n < 32);
XEASSERTNOTNULL(locals_.gpr[n]);
IRBuilder<>& b = *builder_;
// Actually r0 is writable, even though nobody should ever do that.
// Perhaps we can check usage and enable this if safe?
// if (n == 0) {
// // Always force zero to a constant - this should help LLVM.
// return b.getInt64(0);
// }
return b.CreateLoad(locals_.gpr[n]);
}
void FunctionGenerator::update_gpr_value(uint32_t n, Value* value) {
  XEASSERT(n < 32);
XEASSERTNOTNULL(locals_.gpr[n]);
IRBuilder<>& b = *builder_;
// See above - r0 can be written.
// if (n == 0) {
// // Ignore writes to zero.
// return;
// }
// Extend to 64bits if needed.
if (!value->getType()->isIntegerTy(64)) {
value = b.CreateZExt(value, b.getInt64Ty());
}
b.CreateStore(value, locals_.gpr[n]);
}
Value* FunctionGenerator::fpr_value(uint32_t n) {
  XEASSERT(n < 32);
XEASSERTNOTNULL(locals_.fpr[n]);
IRBuilder<>& b = *builder_;
return b.CreateLoad(locals_.fpr[n]);
}
void FunctionGenerator::update_fpr_value(uint32_t n, Value* value) {
  XEASSERT(n < 32);
XEASSERTNOTNULL(locals_.fpr[n]);
IRBuilder<>& b = *builder_;
value = b.CreateFPExtOrFPTrunc(value, b.getDoubleTy());
b.CreateStore(value, locals_.fpr[n]);
}
Value* FunctionGenerator::GetMembase() {
Value* v = gen_module_->getGlobalVariable("xe_memory_base");
return builder_->CreateLoad(v);
}
Value* FunctionGenerator::GetMemoryAddress(uint32_t cia, Value* addr) {
IRBuilder<>& b = *builder_;
// Input address is always in 32-bit space.
addr = b.CreateAnd(addr, UINT_MAX);
// Add runtime memory address checks, if needed.
if (FLAGS_memory_address_verification) {
BasicBlock* invalid_bb = BasicBlock::Create(*context_, "", gen_fn_);
BasicBlock* valid_bb = BasicBlock::Create(*context_, "", gen_fn_);
// The heap starts at 0x1000 - if we write below that we're boned.
Value* gt = b.CreateICmpUGE(addr, b.getInt64(0x00001000));
b.CreateCondBr(gt, valid_bb, invalid_bb);
b.SetInsertPoint(invalid_bb);
Value* access_violation = gen_module_->getFunction("XeAccessViolation");
SpillRegisters();
b.CreateCall3(access_violation,
gen_fn_->arg_begin(),
b.getInt32(cia),
addr);
b.CreateBr(valid_bb);
b.SetInsertPoint(valid_bb);
}
// Rebase off of memory base pointer.
return b.CreateInBoundsGEP(GetMembase(), addr);
}
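// The emitted address computation is roughly (illustrative C):
//
//   host_ptr = xe_memory_base + (guest_addr & 0xFFFFFFFF);
//
// with the optional lower-bound (>= 0x1000) check routed through
// XeAccessViolation when --memory_address_verification is set.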
Value* FunctionGenerator::ReadMemory(
uint32_t cia, Value* addr, uint32_t size, bool acquire) {
IRBuilder<>& b = *builder_;
Type* dataTy = NULL;
bool needs_swap = false;
switch (size) {
case 1:
dataTy = b.getInt8Ty();
break;
case 2:
dataTy = b.getInt16Ty();
needs_swap = true;
break;
case 4:
dataTy = b.getInt32Ty();
needs_swap = true;
break;
case 8:
dataTy = b.getInt64Ty();
needs_swap = true;
break;
default:
XEASSERTALWAYS();
return NULL;
}
PointerType* pointerTy = PointerType::getUnqual(dataTy);
Value* address = GetMemoryAddress(cia, addr);
Value* ptr = b.CreatePointerCast(address, pointerTy);
LoadInst* load_value = b.CreateLoad(ptr);
if (acquire) {
load_value->setAlignment(size);
load_value->setVolatile(true);
load_value->setAtomic(Acquire);
}
Value* value = load_value;
// Swap after loading.
// TODO(benvanik): find a way to avoid this!
if (needs_swap) {
Function* bswap = Intrinsic::getDeclaration(
gen_module_, Intrinsic::bswap, dataTy);
value = b.CreateCall(bswap, value);
}
return value;
}
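// Byte-swap example: the guest (big-endian) word 0x12345678 sits in host
// memory as bytes 12 34 56 78; a little-endian host load yields 0x78563412,
// and the llvm.bswap call restores 0x12345678 before the value is used.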
void FunctionGenerator::WriteMemory(
uint32_t cia, Value* addr, uint32_t size, Value* value, bool release) {
IRBuilder<>& b = *builder_;
Type* dataTy = NULL;
bool needs_swap = false;
switch (size) {
case 1:
dataTy = b.getInt8Ty();
break;
case 2:
dataTy = b.getInt16Ty();
needs_swap = true;
break;
case 4:
dataTy = b.getInt32Ty();
needs_swap = true;
break;
case 8:
dataTy = b.getInt64Ty();
needs_swap = true;
break;
default:
XEASSERTALWAYS();
return;
}
PointerType* pointerTy = PointerType::getUnqual(dataTy);
Value* address = GetMemoryAddress(cia, addr);
Value* ptr = b.CreatePointerCast(address, pointerTy);
// Truncate, if required.
if (value->getType() != dataTy) {
value = b.CreateTrunc(value, dataTy);
}
// Swap before storing.
// TODO(benvanik): find a way to avoid this!
if (needs_swap) {
Function* bswap = Intrinsic::getDeclaration(
gen_module_, Intrinsic::bswap, dataTy);
value = b.CreateCall(bswap, value);
}
StoreInst* store_value = b.CreateStore(value, ptr);
if (release) {
store_value->setAlignment(size);
store_value->setVolatile(true);
store_value->setAtomic(Release);
}
}
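// Illustrative use from an instruction emitter (a sketch only; ra/rt/d stand
// in for decoded instruction fields, and the acquire/release flags are
// presumably reserved for synchronizing accesses such as lwarx/stwcx., with
// plain loads/stores passing false):
//
//   Value* ea = b.CreateAdd(g.gpr_value(ra), b.getInt64(d));
//   Value* v = g.ReadMemory(i.address, ea, 4, /* acquire */ false);
//   g.update_gpr_value(rt, v);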