A probably-working register allocator.

This commit is contained in:
Ben Vanik 2014-02-10 21:16:38 -08:00
parent 6bd214af0b
commit 4a584129d2
13 changed files with 613 additions and 29 deletions

View File

@ -74,15 +74,19 @@ int IVMAssembler::Assemble(
builder->ResetLabelTags();
// Function prologue.
size_t stack_size = 0;
size_t stack_offset = 0;
auto locals = builder->locals();
for (auto it = locals.begin(); it != locals.end(); ++it) {
auto slot = *it;
size_t stack_offset = stack_size;
size_t type_size = GetTypeSize(slot->type);
// Align to natural size.
stack_offset = XEALIGN(stack_offset, type_size);
slot->set_constant(stack_offset);
stack_size += GetTypeSize(slot->type);
stack_offset += type_size;
}
ctx.stack_size = stack_size;
// Ensure 16b alignment.
stack_offset = XEALIGN(stack_offset, 16);
ctx.stack_size = stack_offset;
auto block = builder->first_block();
while (block) {

View File

@ -38,14 +38,14 @@ int IVMBackend::Initialize() {
0,
"gpr",
MachineInfo::RegisterSet::INT_TYPES,
10,
6,
};
machine_info_.register_sets[1] = {
1,
"vec",
MachineInfo::RegisterSet::FLOAT_TYPES |
MachineInfo::RegisterSet::VEC_TYPES,
10,
6,
};
alloy::tracing::WriteEvent(EventType::Init({

View File

@ -98,8 +98,6 @@ void* X64Emitter::Emplace(size_t stack_size) {
return new_address;
}
#define XEALIGN(value, align) ((value + align - 1) & ~(align - 1))
int X64Emitter::Emit(HIRBuilder* builder, size_t& out_stack_size) {
// These are the registers we will not be using. All others are fair game.
const uint32_t reserved_regs =
@ -220,7 +218,7 @@ void X64Emitter::ResetRegisters(uint32_t reserved_regs) {
if (live_regs & 0x1) {
auto v = reg_state_.reg_values[n];
if (v) {
v->reg = -1;
v->reg.index = -1;
}
}
reg_state_.reg_values[n] = 0;

View File

@ -15,8 +15,9 @@
#include <alloy/compiler/passes/context_promotion_pass.h>
#include <alloy/compiler/passes/data_flow_analysis_pass.h>
#include <alloy/compiler/passes/dead_code_elimination_pass.h>
#include <alloy/compiler/passes/finalization_pass.h>
//#include <alloy/compiler/passes/dead_store_elimination_pass.h>
#include <alloy/compiler/passes/finalization_pass.h>
#include <alloy/compiler/passes/register_allocation_pass.h>
#include <alloy/compiler/passes/simplification_pass.h>
#include <alloy/compiler/passes/validation_pass.h>
#include <alloy/compiler/passes/value_reduction_pass.h>
@ -137,5 +138,42 @@
// store_context +302, v5
// branch_true v5, ...
//
// - X86Canonicalization
// For various opcodes add copies/commute the arguments to match x86
// operand semantics. This makes code generation easier and if done
// before register allocation can prevent a lot of extra shuffling in
// the emitted code.
//
// Example:
// <block0>:
// v0 = ...
// v1 = ...
// v2 = add v0, v1 <-- v1 now unused
// Becomes:
// v0 = ...
// v1 = ...
// v1 = add v1, v0 <-- src1 = dest/src, so reuse for both
// by commuting and setting dest = src1
//
// - RegisterAllocation
// Given a machine description (register classes, counts) run over values
// and assign them to registers, adding spills as needed. It should be
// possible to directly emit code from this form.
//
// Example:
// <block0>:
// v0 = load_context +0
// v1 = load_context +1
// v0 = add v0, v1
// ...
// v2 = mul v0, v1
// Becomes:
// reg0 = load_context +0
// reg1 = load_context +1
// reg2 = add reg0, reg1
// store_local +123, reg2 <-- spill inserted
// ...
// reg0 = load_local +123 <-- load inserted
// reg0 = mul reg0, reg1
#endif // ALLOY_COMPILER_COMPILER_PASSES_H_

View File

@ -41,19 +41,21 @@ int ControlFlowAnalysisPass::Run(HIRBuilder* builder) {
// Add edges.
auto block = builder->first_block();
while (block) {
auto instr = block->instr_head;
auto instr = block->instr_tail;
while (instr) {
if (instr->opcode->flags & OPCODE_FLAG_BRANCH) {
if (instr->opcode == &OPCODE_BRANCH_info) {
auto label = instr->src1.label;
builder->AddEdge(block, label->block, Edge::UNCONDITIONAL);
break;
} else if (instr->opcode == &OPCODE_BRANCH_TRUE_info ||
instr->opcode == &OPCODE_BRANCH_FALSE_info) {
auto label = instr->src2.label;
builder->AddEdge(block, label->block, 0);
break;
}
}
instr = instr->next;
instr = instr->prev;
}
block = block->next;
}

View File

@ -0,0 +1,471 @@
/**
******************************************************************************
* Xenia : Xbox 360 Emulator Research Project *
******************************************************************************
* Copyright 2014 Ben Vanik. All rights reserved. *
* Released under the BSD license - see LICENSE in the root for more details. *
******************************************************************************
*/
#include <alloy/compiler/passes/register_allocation_pass.h>
using namespace alloy;
using namespace alloy::backend;
using namespace alloy::compiler;
using namespace alloy::compiler::passes;
using namespace alloy::hir;
// A live interval for a single value, measured in global instruction
// ordinals, together with the intrusive doubly-linked-list links used to
// thread it through the unhandled/active/handled lists.
struct RegisterAllocationPass::Interval {
  // Ordinal of the first instruction covered by this interval.
  uint32_t start_ordinal;
  // Ordinal of the last instruction covered by this interval.
  uint32_t end_ordinal;
  // Value this interval describes.
  Value* value;
  // Register set the value must be allocated from.
  RegisterFreeUntilSet* free_until_set;

  // TODO(benvanik): reduce to offsets in arena?
  struct Interval* next;
  struct Interval* prev;

  // Pushes this interval onto the front of the list rooted at |list_head|.
  // The interval must not currently be a member of any list.
  void AddToList(Interval** list_head) {
    auto list_next = *list_head;
    // We become the head, so make sure no stale back-link survives.
    this->prev = NULL;
    this->next = list_next;
    if (list_next) {
      list_next->prev = this;
    }
    *list_head = this;
  }

  // Inserts this interval into the list rooted at |list_head|, keeping the
  // list ordered by ascending start_ordinal. Intervals with an equal start
  // stay ahead of the newly inserted one.
  void InsertIntoList(Interval** list_head) {
    auto it = *list_head;
    if (!it) {
      // Empty list: this interval becomes the only element. Without this the
      // interval would be silently dropped and never processed.
      this->prev = NULL;
      this->next = NULL;
      *list_head = this;
      return;
    }
    while (it) {
      if (it->start_ordinal > this->start_ordinal) {
        // Went too far. Insert before this interval.
        this->prev = it->prev;
        this->next = it;
        if (it->prev) {
          it->prev->next = this;
        } else {
          *list_head = this;
        }
        it->prev = this;
        return;
      }
      if (!it->next) {
        // None found, add at tail.
        it->next = this;
        this->prev = it;
        this->next = NULL;
        return;
      }
      it = it->next;
    }
  }

  // Unlinks this interval from the list rooted at |list_head| and clears its
  // links so it can be added to another list.
  void RemoveFromList(Interval** list_head) {
    if (this->next) {
      this->next->prev = this->prev;
    }
    if (this->prev) {
      this->prev->next = this->next;
    } else {
      *list_head = this->next;
    }
    this->next = this->prev = NULL;
  }
};
// The working lists used while scanning. Each is the head of an intrusive
// doubly-linked list threaded through Interval::next/prev.
struct RegisterAllocationPass::Intervals {
// Intervals that have not been processed yet, ordered by start_ordinal.
Interval* unhandled;
// Intervals that currently hold a register.
Interval* active;
// Intervals that have expired or have been spilled.
Interval* handled;
};
// Builds one RegisterFreeUntilSet per register set described by
// |machine_info| and records which set serves the int, float and vector value
// types. The register_sets array is treated as terminated by an entry whose
// count is zero.
RegisterAllocationPass::RegisterAllocationPass(
    const MachineInfo* machine_info) :
    // Listed in actual initialization order: base class first, then members.
    CompilerPass(),
    machine_info_(machine_info) {
  // Initialize register sets. The values of these will be
  // cleared before use, so just the structure is required.
  auto mi_sets = machine_info->register_sets;
  xe_zero_struct(&free_until_sets_, sizeof(free_until_sets_));
  uint32_t n = 0;
  // Stop at the terminator or when our fixed-size table is full, whichever
  // comes first, so extra machine register sets can never overflow all_sets.
  while (n < XECOUNT(free_until_sets_.all_sets) && mi_sets[n].count) {
    auto& mi_set = mi_sets[n];
    auto free_until_set = new RegisterFreeUntilSet();
    free_until_sets_.all_sets[n] = free_until_set;
    // pos[] is a fixed-size array; the set must fit in it.
    XEASSERT(mi_set.count <= XECOUNT(free_until_set->pos));
    free_until_set->count = mi_set.count;
    free_until_set->set = &mi_set;
    // A single machine set may serve several type classes.
    if (mi_set.types & MachineInfo::RegisterSet::INT_TYPES) {
      free_until_sets_.int_set = free_until_set;
    }
    if (mi_set.types & MachineInfo::RegisterSet::FLOAT_TYPES) {
      free_until_sets_.float_set = free_until_set;
    }
    if (mi_set.types & MachineInfo::RegisterSet::VEC_TYPES) {
      free_until_sets_.vec_set = free_until_set;
    }
    n++;
  }
}
// Releases the register sets allocated by the constructor.
RegisterAllocationPass::~RegisterAllocationPass() {
  // Entries are populated contiguously from index 0, so the first empty
  // entry marks the end of the allocated sets.
  const size_t capacity = XECOUNT(free_until_sets_.all_sets);
  for (size_t i = 0; i < capacity && free_until_sets_.all_sets[i]; ++i) {
    delete free_until_sets_.all_sets[i];
  }
}
// Assigns a register (set + index) to every value defined in |builder|,
// calling AllocateBlockedReg to spill when no register is free.
// Works in three phases: renumber blocks/instructions, build one interval per
// defined value, then scan the intervals in order.
// Always returns 0.
int RegisterAllocationPass::Run(HIRBuilder* builder) {
// A (probably broken) implementation of a linear scan register allocator
// that operates directly on SSA form:
// http://www.christianwimmer.at/Publications/Wimmer10a/Wimmer10a.pdf
//
// Requirements:
// - SSA form (single definition for variables)
// - block should be in linear order:
// - dominators *should* come before (a->b->c)
// - loop block sequences *should not* have intervening non-loop blocks
auto arena = scratch_arena();
// Renumber everything.
uint32_t block_ordinal = 0;
uint32_t instr_ordinal = 0;
auto block = builder->first_block();
while (block) {
// Sequential block ordinals.
block->ordinal = block_ordinal++;
auto instr = block->instr_head;
while (instr) {
// Sequential global instruction ordinals.
instr->ordinal = instr_ordinal++;
instr = instr->next;
}
block = block->next;
}
// Compute all liveness ranges by walking forward through all
// blocks/instructions and checking the last use of each value. This lets
// us know the exact order in (block#,instr#) form, which is then used to
// setup the range.
// TODO(benvanik): ideally we would have a list of all values and not have
// to keep walking instructions over and over.
Interval* prev_interval = NULL;
Interval* head_interval = NULL;
block = builder->first_block();
while (block) {
auto instr = block->instr_head;
while (instr) {
// Compute last-use for the dest value.
// Since we know all values of importance must be defined, we can avoid
// having to check every value and just look at dest.
const OpcodeInfo* info = instr->opcode;
if (GET_OPCODE_SIG_TYPE_DEST(info->signature) == OPCODE_SIG_TYPE_V) {
auto v = instr->dest;
if (!v->last_use) {
ComputeLastUse(v);
}
// Add interval.
auto interval = arena->Alloc<Interval>();
interval->start_ordinal = instr->ordinal;
// A value with no uses gets an interval that ends at its definition.
interval->end_ordinal = v->last_use ?
v->last_use->ordinal : v->def->ordinal;
interval->value = v;
// Append to the tail so the list stays in instruction order.
interval->next = NULL;
interval->prev = prev_interval;
if (prev_interval) {
prev_interval->next = interval;
} else {
head_interval = interval;
}
prev_interval = interval;
// Grab register set to use.
// We do this now so it's only once per interval, and it makes it easy
// to only compare intervals that overlap their sets.
if (v->type <= INT64_TYPE) {
interval->free_until_set = free_until_sets_.int_set;
} else if (v->type <= FLOAT64_TYPE) {
interval->free_until_set = free_until_sets_.float_set;
} else {
interval->free_until_set = free_until_sets_.vec_set;
}
}
instr = instr->next;
}
block = block->next;
}
// Intervals were appended in instruction order, so the list is already
// sorted by start_ordinal.
Intervals intervals;
intervals.unhandled = head_interval;
intervals.active = intervals.handled = NULL;
while (intervals.unhandled) {
// Get next unhandled interval.
auto current = intervals.unhandled;
// Advance the head first; RemoveFromList then only has to fix up the
// neighbour's back-link and clear current's own links.
intervals.unhandled = intervals.unhandled->next;
current->RemoveFromList(&intervals.unhandled);
// Check for intervals in active that are handled or inactive.
auto it = intervals.active;
while (it) {
// Save the successor now: moving |it| to another list rewrites its links.
auto next = it->next;
if (it->end_ordinal <= current->start_ordinal) {
// Move from active to handled.
it->RemoveFromList(&intervals.active);
it->AddToList(&intervals.handled);
}
it = next;
}
// Find a register for current.
if (!TryAllocateFreeReg(current, intervals)) {
// Failed, spill.
AllocateBlockedReg(builder, current, intervals);
}
// Only intervals that actually received a register become active.
if (current->value->reg.index!= -1) {
// Add current to active.
current->AddToList(&intervals.active);
}
}
return 0;
}
// Sets value->last_use to the using instruction with the highest ordinal, or
// to NULL when the value has no uses. When several uses share the highest
// ordinal, the one that appears latest in the use list wins.
void RegisterAllocationPass::ComputeLastUse(Value* value) {
  // TODO(benvanik): compute during construction?
  // The use list is unordered, so every entry has to be visited.
  Value::Use* latest = NULL;
  for (auto use = value->use_head; use; use = use->next) {
    if (latest && use->instr->ordinal < latest->instr->ordinal) {
      // Strictly earlier than the best candidate so far.
      continue;
    }
    latest = use;
  }
  value->last_use = latest ? latest->instr : NULL;
}
// Tries to give |current| a register that no active interval of the same
// register set is holding. On success writes the set and index into
// current->value->reg and returns true. Returns false when every register is
// taken, or when the chosen register is not free for the whole interval.
bool RegisterAllocationPass::TryAllocateFreeReg(
    Interval* current, Intervals& intervals) {
  auto regs = current->free_until_set;
  const uint32_t reg_count = regs->count;

  // Start with every register free "forever" (all bits set).
  for (uint32_t i = 0; i < reg_count; ++i) {
    regs->pos[i] = static_cast<uint32_t>(-1);
  }

  // Registers held by active intervals of this set are free until 0, i.e.
  // not free at all.
  // TODO(benvanik): keep some kind of bitvector so that this is instant?
  for (auto held = intervals.active; held; held = held->next) {
    if (held->free_until_set != regs) {
      continue;
    }
    regs->pos[held->value->reg.index] = 0;
  }

  // Pick the register that stays free the longest; ties go to the lowest
  // index.
  uint32_t best = 0;
  for (uint32_t i = 1; i < reg_count; ++i) {
    if (regs->pos[i] > regs->pos[best]) {
      best = i;
    }
  }

  const uint32_t free_until = regs->pos[best];
  if (!free_until) {
    // No register available without spilling.
    return false;
  }
  if (current->end_ordinal >= free_until) {
    // Register only available for the first part of the interval. This is
    // where the interval would be split at the point it hits the next one.
    // TODO(benvanik): actually split -- for now we just spill.
    return false;
  }

  // Register available for the whole interval.
  current->value->reg.set = regs->set;
  current->value->reg.index = best;
  return true;
}
// Frees a register for |current| by spilling one active interval from the
// same register set.
//
// The victim value is stored to a local slot (allocated on first spill) and
// reloaded into a new value right before its next use. Later uses are renamed
// to the reloaded value, whose interval is queued on the unhandled list. The
// victim's interval is cut short at current's start and retired to handled,
// then allocation for |current| is retried.
//
// If no active interval can be spilled, |current| is left without a register
// (reg.index stays -1) and the function returns.
void RegisterAllocationPass::AllocateBlockedReg(
    HIRBuilder* builder, Interval* current, Intervals& intervals) {
  auto free_until_set = current->free_until_set;

  // TODO(benvanik): smart heuristics.
  //     wimmer AllocateBlockedReg has some stuff for deciding whether to
  //     spill current or some other active interval - which we ignore.

  // Pick the first active interval that can actually be split around current.
  auto spill_interval = intervals.active;
  Value* spill_value = NULL;
  Instr* prev_use = NULL;
  Instr* next_use = NULL;
  while (spill_interval) {
    if (spill_interval->free_until_set != free_until_set ||
        spill_interval->start_ordinal == current->start_ordinal) {
      // Only interested in ones of the same register set.
      // We also ensure that ones at the same ordinal as us are ignored,
      // which can happen with multiple local inserts/etc.
      spill_interval = spill_interval->next;
      continue;
    }
    spill_value = spill_interval->value;

    // Each candidate is judged on its own uses only; results left over from a
    // previously rejected candidate must not leak into this one.
    prev_use = NULL;
    next_use = NULL;

    // Find the uses right before/after current. Instructions with ordinal -1
    // were inserted after numbering and are ignored.
    auto use = spill_value->use_head;
    while (use) {
      if (use->instr->ordinal != -1) {
        if (use->instr->ordinal < current->start_ordinal) {
          if (!prev_use || prev_use->ordinal < use->instr->ordinal) {
            prev_use = use->instr;
          }
        } else if (use->instr->ordinal > current->start_ordinal) {
          if (!next_use || next_use->ordinal > use->instr->ordinal) {
            next_use = use->instr;
          }
        }
      }
      use = use->next;
    }
    if (!prev_use) {
      prev_use = spill_value->def;
    }
    if (!next_use || prev_use->next == next_use) {
      // Either there is no later use to reload in front of, or the interval
      // is way too short to be worth splitting. Try the next candidate.
      spill_interval = spill_interval->next;
      continue;
    }
    XEASSERT(prev_use->ordinal != -1);
    break;
  }
  XEASSERTNOTNULL(spill_interval);
  if (!spill_interval) {
    // Nothing can be spilled. Leave current unallocated rather than
    // dereferencing a NULL interval below.
    return;
  }
  XEASSERT(spill_interval->free_until_set == free_until_set);

  // Find the real last use -- paired ops may require sequences to stay
  // intact. This is a bad design.
  auto prev_def_tail = prev_use;
  while (prev_def_tail &&
         prev_def_tail->opcode->flags & OPCODE_FLAG_PAIRED_PREV) {
    prev_def_tail = prev_def_tail->prev;
  }

  // If the value already has a slot it was stored earlier, so only the reload
  // is needed and the value's last use becomes the use before current.
  // Otherwise a slot is allocated and a store is inserted, and that store
  // becomes the value's last use.
  Instr* spilled_last_use = prev_use;
  if (!spill_value->local_slot) {
    XEASSERTNOTNULL(prev_def_tail);
    // Allocate a local slot.
    spill_value->local_slot = builder->AllocLocal(spill_value->type);
    // Insert a spill right after the def.
    builder->StoreLocal(spill_value->local_slot, spill_value);
    auto spill_store = builder->last_instr();
    spill_store->MoveBefore(prev_def_tail->next);
    spilled_last_use = spill_store;
  }

  // Cut the spilled interval short at current's start; the remainder is
  // handed to the reloaded value below.
  uint32_t end_ordinal = spill_interval->end_ordinal;
  spill_interval->end_ordinal = current->start_ordinal;//prev_def_tail->ordinal;
  XEASSERT(end_ordinal != -1);
  XEASSERT(spill_interval->end_ordinal != -1);

  // Insert a load right before the next use.
  Value* new_value = builder->LoadLocal(spill_value->local_slot);
  builder->last_instr()->MoveBefore(next_use);

  // Update last use info.
  new_value->last_use = spill_value->last_use;
  spill_value->last_use = spilled_last_use;

  // Reuse the same local slot. Hooray SSA.
  new_value->local_slot = spill_value->local_slot;

  // Rename all future uses to that loaded value.
  // NOTE(review): set_srcN presumably edits this use list; if one instruction
  // uses the value in two operands, the saved |next| may itself be unlinked.
  // Confirm against Instr::set_src*.
  auto use = spill_value->use_head;
  while (use) {
    // TODO(benvanik): keep use list sorted so we don't have to do this.
    if (use->instr->ordinal <= spill_interval->end_ordinal ||
        use->instr->ordinal == -1) {
      use = use->next;
      continue;
    }
    auto next = use->next;
    auto instr = use->instr;
    uint32_t signature = instr->opcode->signature;
    if (GET_OPCODE_SIG_TYPE_SRC1(signature) == OPCODE_SIG_TYPE_V) {
      if (instr->src1.value == spill_value) {
        instr->set_src1(new_value);
      }
    }
    if (GET_OPCODE_SIG_TYPE_SRC2(signature) == OPCODE_SIG_TYPE_V) {
      if (instr->src2.value == spill_value) {
        instr->set_src2(new_value);
      }
    }
    if (GET_OPCODE_SIG_TYPE_SRC3(signature) == OPCODE_SIG_TYPE_V) {
      if (instr->src3.value == spill_value) {
        instr->set_src3(new_value);
      }
    }
    use = next;
  }

  // Create new interval covering the reloaded value.
  auto arena = scratch_arena();
  auto new_interval = arena->Alloc<Interval>();
  new_interval->start_ordinal = new_value->def->ordinal;
  new_interval->end_ordinal = end_ordinal;
  new_interval->value = new_value;
  new_interval->next = NULL;
  new_interval->prev = NULL;
  if (new_value->type <= INT64_TYPE) {
    new_interval->free_until_set = free_until_sets_.int_set;
  } else if (new_value->type <= FLOAT64_TYPE) {
    new_interval->free_until_set = free_until_sets_.float_set;
  } else {
    new_interval->free_until_set = free_until_sets_.vec_set;
  }

  // Remove the old interval from the active list, as it's been spilled.
  spill_interval->RemoveFromList(&intervals.active);
  spill_interval->AddToList(&intervals.handled);

  // Insert interval into the right place in the list.
  // We know it's ahead of us.
  new_interval->InsertIntoList(&intervals.unhandled);

  // TODO(benvanik): use the register we just freed?
  //current->value->reg.set = free_until_set->set;
  //current->value->reg.index = spill_interval->value->reg.index;
  bool allocated = TryAllocateFreeReg(current, intervals);
  XEASSERTTRUE(allocated);
}

View File

@ -0,0 +1,60 @@
/**
******************************************************************************
* Xenia : Xbox 360 Emulator Research Project *
******************************************************************************
* Copyright 2014 Ben Vanik. All rights reserved. *
* Released under the BSD license - see LICENSE in the root for more details. *
******************************************************************************
*/
#ifndef ALLOY_COMPILER_PASSES_REGISTER_ALLOCATION_PASS_H_
#define ALLOY_COMPILER_PASSES_REGISTER_ALLOCATION_PASS_H_
#include <alloy/backend/machine_info.h>
#include <alloy/compiler/compiler_pass.h>
namespace alloy {
namespace compiler {
namespace passes {
// Compiler pass that assigns machine registers to HIR values using a linear
// scan over live intervals, inserting local loads/stores when it must spill.
class RegisterAllocationPass : public CompilerPass {
public:
  // |machine_info| supplies the register sets available on the target.
  RegisterAllocationPass(const backend::MachineInfo* machine_info);
  virtual ~RegisterAllocationPass();

  // Runs allocation over every instruction reachable from |builder|.
  virtual int Run(hir::HIRBuilder* builder);

private:
  // Defined in the implementation file.
  struct Interval;
  struct Intervals;

  // Per-register-set scratch state used while choosing a register.
  struct RegisterFreeUntilSet {
    // Number of registers in the set.
    uint32_t count;
    // pos[n]: 0 when register n is held by an active interval, otherwise all
    // bits set.
    uint32_t pos[32];
    // Machine description of the set.
    const backend::MachineInfo::RegisterSet* set;
  };

  // Lookup from value type class to the set that serves it. The typed
  // pointers alias entries of all_sets, which is what the pass owns.
  struct RegisterFreeUntilSets {
    RegisterFreeUntilSet* int_set;
    RegisterFreeUntilSet* float_set;
    RegisterFreeUntilSet* vec_set;
    RegisterFreeUntilSet* all_sets[3];
  };

  // Records the highest-ordinal using instruction in value->last_use.
  void ComputeLastUse(hir::Value* value);
  // Attempts to allocate a currently unused register; false if none fits.
  bool TryAllocateFreeReg(Interval* current, Intervals& intervals);
  // Spills an active interval to make room for |current|.
  void AllocateBlockedReg(hir::HIRBuilder* builder,
                          Interval* current, Intervals& intervals);

  const backend::MachineInfo* machine_info_;
  RegisterFreeUntilSets free_until_sets_;
};
} // namespace passes
} // namespace compiler
} // namespace alloy
#endif // ALLOY_COMPILER_PASSES_REGISTER_ALLOCATION_PASS_H_

View File

@ -15,6 +15,8 @@
'finalization_pass.h',
#'dead_store_elimination_pass.cc',
#'dead_store_elimination_pass.h',
'register_allocation_pass.cc',
'register_allocation_pass.h',
'simplification_pass.cc',
'simplification_pass.h',
'validation_pass.cc',

View File

@ -46,7 +46,7 @@ PPCTranslator::PPCTranslator(PPCFrontend* frontend) :
// Passes are executed in the order they are added. Multiple of the same
// pass type may be used.
if (validate) compiler_->AddPass(new passes::ValidationPass());
//compiler_->AddPass(new passes::ContextPromotionPass());
compiler_->AddPass(new passes::ContextPromotionPass());
if (validate) compiler_->AddPass(new passes::ValidationPass());
compiler_->AddPass(new passes::SimplificationPass());
if (validate) compiler_->AddPass(new passes::ValidationPass());
@ -59,18 +59,16 @@ PPCTranslator::PPCTranslator(PPCFrontend* frontend) :
compiler_->AddPass(new passes::DeadCodeEliminationPass());
if (validate) compiler_->AddPass(new passes::ValidationPass());
// Adds local load/stores.
compiler_->AddPass(new passes::DataFlowAnalysisPass());
if (validate) compiler_->AddPass(new passes::ValidationPass());
compiler_->AddPass(new passes::SimplificationPass());
if (validate) compiler_->AddPass(new passes::ValidationPass());
//// Removes all unneeded variables. Try not to add new ones after this.
//compiler_->AddPass(new passes::ValueReductionPass());
//if (validate) compiler_->AddPass(new passes::ValidationPass());
// Run DCE one more time to cleanup any local manipulation.
compiler_->AddPass(new passes::DeadCodeEliminationPass());
if (validate) compiler_->AddPass(new passes::ValidationPass());
// Removes all unneeded variables. Try not to add new ones after this.
compiler_->AddPass(new passes::ValueReductionPass());
// Register allocation for the target backend.
// Will modify the HIR to add loads/stores.
// This should be the last pass before finalization, as after this all
// registers are assigned and ready to be emitted.
compiler_->AddPass(new passes::RegisterAllocationPass(
backend->machine_info()));
if (validate) compiler_->AddPass(new passes::ValidationPass());
// Must come last. The HIR is not really HIR after this.

View File

@ -108,6 +108,9 @@ void HIRBuilder::DumpValue(StringBuffer* str, Value* value) {
};
str->Append("v%d.%s", value->ordinal, type_names[value->type]);
}
if (value->reg.index != -1) {
str->Append("<%s%d>", value->reg.set->name, value->reg.index);
}
}
void HIRBuilder::DumpOp(
@ -453,6 +456,7 @@ Instr* HIRBuilder::AppendInstr(
if (!block->instr_head) {
block->instr_head = instr;
}
instr->ordinal = -1;
instr->block = block;
instr->opcode = &opcode_info;
instr->flags = flags;
@ -477,7 +481,8 @@ Value* HIRBuilder::AllocValue(TypeName type) {
value->last_use = NULL;
value->local_slot = NULL;
value->tag = NULL;
value->reg = -1;
value->reg.set = NULL;
value->reg.index = -1;
return value;
}
@ -492,7 +497,8 @@ Value* HIRBuilder::CloneValue(Value* source) {
value->last_use = NULL;
value->local_slot = NULL;
value->tag = NULL;
value->reg = -1;
value->reg.set = NULL;
value->reg.index = -1;
return value;
}

View File

@ -52,7 +52,7 @@ public:
const OpcodeInfo* opcode;
uint16_t flags;
uint16_t ordinal;
uint32_t ordinal;
typedef union {
runtime::FunctionInfo* symbol_info;

View File

@ -11,6 +11,7 @@
#define ALLOY_HIR_VALUE_H_
#include <alloy/core.h>
#include <alloy/backend/machine_info.h>
#include <alloy/hir/opcodes.h>
@ -90,7 +91,10 @@ public:
TypeName type;
uint32_t flags;
uint32_t reg;
struct {
const backend::MachineInfo::RegisterSet* set;
int32_t index;
} reg;
ConstantValue constant;
Instr* def;

View File

@ -145,6 +145,7 @@ typedef XECACHEALIGN volatile void xe_aligned_void_t;
// Rounds |v| up to the nearest power of two. Values that are already a power
// of two are returned unchanged. 0 and anything above 2^31 wrap to 0.
static inline uint32_t XENEXTPOW2(uint32_t v) {
  --v;
  // Smear the highest set bit into every lower bit position.
  for (uint32_t shift = 1; shift < 32; shift <<= 1) {
    v |= v >> shift;
  }
  return v + 1;
}
// Rounds |value| up to the next multiple of |align|, which must be a power of
// two. Both arguments are parenthesized so expression arguments (for example
// `a | b` or `x + y`) are not broken apart by operator precedence.
// Note: |align| is evaluated twice; avoid arguments with side effects.
#define XEALIGN(value, align) (((value) + (align) - 1) & ~((align) - 1))
#define XESUCCEED() goto XECLEANUP
#define XEFAIL() goto XECLEANUP