From 327cc9eff540bfcaf45d99b2f1e45aadc3d3f1af Mon Sep 17 00:00:00 2001 From: "chss95cs@gmail.com" Date: Sat, 25 Jun 2022 09:58:13 -0700 Subject: [PATCH 1/2] Drastically reduce the size of the final generated code for rlwinm by adding special paths for rotations of 0 and for masks that discard the rotated bits, and by using And with UINT_MAX instead of truncate/zero-extend. Add a special case to TYPE_INT64's EmitAnd for a UINT_MAX mask: when detected, emit a 32-bit to 32-bit mov to take advantage of implicit zero extension and register renaming. Add a helper function for skipping assignment defs in instr. Add a helper function for checking whether an opcode is of binary value type. Add several new optimizations to simplificationpass, plus weak NZM (maybe-nonzero mask) calculation code (a better, full evaluation of Z/NZ will be done later). List of optimizations: If a value is ANDed with a bitmask that it was already masked against, reuse the old value (this cuts out most FPSCR update garbage, although it does cause a local variable to be allocated for the masked FPSCR, and it still repeatedly stores the masked value to the context). If masking a value that was ORed with another, check whether our mask only considers bits from one value or the other; if so, change the operand to the OR input that actually matters. If the only use of a rotate left's output is an AND against a mask that discards the bits that were rotated in, change the opcode to SHIFT_LEFT. If masking against all ones, become an assign. If XOR or OR against 0, become an assign (additional FPSCR codegen cleanup). If XOR against all ones, become a NOT. Add a direct CPUID check to x64_emitter for lzcnt: the version of xbyak we are using skips the lzcnt check on all non-Intel CPUs, meaning we were generating the much slower bitscan path for AMD CPUs. 
--- src/xenia/cpu/backend/x64/x64_emitter.cc | 11 +- src/xenia/cpu/backend/x64/x64_sequences.cc | 8 +- .../compiler/passes/simplification_pass.cc | 233 +++++++++++++++++- .../cpu/compiler/passes/simplification_pass.h | 13 + src/xenia/cpu/hir/instr.cc | 13 + src/xenia/cpu/hir/instr.h | 46 ++++ src/xenia/cpu/hir/opcodes.h | 4 + src/xenia/cpu/hir/value.h | 26 ++ src/xenia/cpu/ppc/ppc_emit_alu.cc | 61 ++++- 9 files changed, 399 insertions(+), 16 deletions(-) diff --git a/src/xenia/cpu/backend/x64/x64_emitter.cc b/src/xenia/cpu/backend/x64/x64_emitter.cc index 1cf4dc416..3d9323af9 100644 --- a/src/xenia/cpu/backend/x64/x64_emitter.cc +++ b/src/xenia/cpu/backend/x64/x64_emitter.cc @@ -106,7 +106,16 @@ X64Emitter::X64Emitter(X64Backend* backend, XbyakAllocator* allocator) #undef TEST_EMIT_FEATURE - + /* + workaround for an xbyak bug/omission: amd cpus are never checked for lzcnt (fixed in the latest version of xbyak), so query cpuid leaf 0x80000001 directly and test bit 5 of data[2] (NOTE(review): expected to be ECX, the ABM/LZCNT flag; confirm against xbyak's getCpuid layout) +*/ + unsigned int data[4]; + Xbyak::util::Cpu::getCpuid(0x80000001, data); + if (data[2] & (1U << 5)) { + if ((cvars::x64_extension_mask & kX64EmitLZCNT) == kX64EmitLZCNT) { + feature_flags_ |= kX64EmitLZCNT; + } + } if (cpu_.has(Xbyak::util::Cpu::tAMD)) { bool is_zennish = cpu_.displayFamily >= 0x17; diff --git a/src/xenia/cpu/backend/x64/x64_sequences.cc b/src/xenia/cpu/backend/x64/x64_sequences.cc index b4b9a70e2..b647ff404 100644 --- a/src/xenia/cpu/backend/x64/x64_sequences.cc +++ b/src/xenia/cpu/backend/x64/x64_sequences.cc @@ -2749,11 +2749,17 @@ struct AND_I32 : Sequence> { }; struct AND_I64 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { - EmitAndXX(e, i); + if (i.src2.is_constant && i.src2.constant() == 0xFFFFFFFF) { + // special case for rlwinm codegen: a mask of 0xFFFFFFFF only needs a 32-bit register move, relying on the implicit zero extension of 32-bit moves + e.mov(((Reg64)i.dest).cvt32(), ((Reg64)i.src1).cvt32()); + } else { + EmitAndXX(e, i); + } } }; struct AND_V128 : Sequence> { static void Emit(X64Emitter& e, const EmitArgType& i) { + EmitCommutativeBinaryXmmOp(e, i, [](X64Emitter& e, Xmm dest, Xmm src1, Xmm src2) { e.vpand(dest, src1, 
src2); diff --git a/src/xenia/cpu/compiler/passes/simplification_pass.cc b/src/xenia/cpu/compiler/passes/simplification_pass.cc index 3569887a4..6c00ef9c1 100644 --- a/src/xenia/cpu/compiler/passes/simplification_pass.cc +++ b/src/xenia/cpu/compiler/passes/simplification_pass.cc @@ -9,8 +9,8 @@ #include "xenia/cpu/compiler/passes/simplification_pass.h" +#include "xenia/base/byte_order.h" #include "xenia/base/profiling.h" - namespace xe { namespace cpu { namespace compiler { @@ -29,11 +29,241 @@ SimplificationPass::~SimplificationPass() {} bool SimplificationPass::Run(HIRBuilder* builder, bool& result) { result = false; + result |= SimplifyBitArith(builder); result |= EliminateConversions(builder); result |= SimplifyAssignments(builder); return true; } +// simplification shared by or and xor: when one operand is a constant zero the instruction becomes an assign of the other operand +bool SimplificationPass::CheckOrXorZero(hir::Instr* i) { + auto [constant_value, variable_value] = i->BinaryValueArrangeAsConstAndVar(); + if (constant_value && constant_value->IsConstantZero()) { + i->Replace(&OPCODE_ASSIGN_info, 0); + i->set_src1(variable_value); + return true; + } + return false; +} +bool SimplificationPass::CheckOr(hir::Instr* i) { return CheckOrXorZero(i); } +bool SimplificationPass::CheckXor(hir::Instr* i) { + if (CheckOrXorZero(i)) { + return true; + } else { + uint64_t type_mask = GetScalarTypeMask(i->dest->type); + + auto [constant_value, variable_value] = + i->BinaryValueArrangeAsConstAndVar(); + + if (!constant_value) return false; + + if (constant_value->AsUint64() == type_mask) { + i->Replace(&OPCODE_NOT_info, 0); + i->set_src1(variable_value); + return true; + } + } + return false; +} +bool SimplificationPass::Is1BitOpcode(hir::Opcode def_opcode) { + return def_opcode >= OPCODE_IS_TRUE && def_opcode <= OPCODE_DID_SATURATE; +} +uint64_t SimplificationPass::GetScalarNZM(hir::Value* value, hir::Instr* def, + + uint64_t typemask, + hir::Opcode def_opcode) { + if (def_opcode == OPCODE_SHL) { + hir::Value* shifted = def->src1.value; + 
hir::Value* shiftby = def->src2.value; + // todo: nzm shift (a non-constant shift amount falls through to the conservative typemask returned at the end) + if (shiftby->IsConstant()) { + uint64_t shifted_nzm = GetScalarNZM(shifted); + return shifted_nzm << shiftby->AsUint64(); + } + } else if (def_opcode == OPCODE_SHR) { + hir::Value* shifted = def->src1.value; + hir::Value* shiftby = def->src2.value; + // todo: nzm shift + if (shiftby->IsConstant()) { + uint64_t shifted_nzm = GetScalarNZM(shifted); + return shifted_nzm >> shiftby->AsUint64(); + } + } + // todo : sha, check signbit + else if (def_opcode == OPCODE_ROTATE_LEFT) { + hir::Value* shifted = def->src1.value; + hir::Value* shiftby = def->src2.value; + // todo: nzm shift. NOTE(review): shifted_nzm is a uint64_t, so this looks like a 64-bit rotate of the mask; for narrower operand types the bits that wrap around inside the type may be missed - confirm + if (shiftby->IsConstant()) { + uint64_t shifted_nzm = GetScalarNZM(shifted); + return xe::rotate_left(shifted_nzm, + static_cast(shiftby->AsUint64())); + } + } else if (def_opcode == OPCODE_XOR || def_opcode == OPCODE_OR) { + return GetScalarNZM(def->src1.value) | GetScalarNZM(def->src2.value); + } else if (def_opcode == OPCODE_NOT) { + return ~GetScalarNZM(def->src1.value); + } else if (def_opcode == OPCODE_ASSIGN) { + return GetScalarNZM(def->src1.value); + } else if (def_opcode == OPCODE_BYTE_SWAP) { + uint64_t input_nzm = GetScalarNZM(def->src1.value); + switch (GetTypeSize(def->dest->type)) { + case 1: + return input_nzm; + case 2: + return xe::byte_swap( + static_cast(input_nzm)); + + case 4: + return xe::byte_swap( + static_cast(input_nzm)); + case 8: + return xe::byte_swap(input_nzm); + default: + xenia_assert(0); + return typemask; + } + } else if (def_opcode == OPCODE_ZERO_EXTEND) { + return GetScalarNZM(def->src1.value); + } else if (def_opcode == OPCODE_TRUNCATE) { + return GetScalarNZM(def->src1.value); // caller will truncate by masking + } else if (def_opcode == OPCODE_AND) { + return GetScalarNZM(def->src1.value) & GetScalarNZM(def->src2.value); + } else if (def_opcode == OPCODE_SELECT) { + return GetScalarNZM(def->src2.value) | GetScalarNZM(def->src3.value); + } else if (def_opcode == OPCODE_MIN) { + /* + the nzm 
will be that of the narrowest operand, because if one value is + capable of being much larger than the other it can never actually reach + a value that is outside the range of the other value's nzm, because that + would make it not the minimum of the two + + ahh, actually, we have to be careful about constants then.... for now, + just return the or of both operands' nzm + */ + return GetScalarNZM(def->src2.value) | GetScalarNZM(def->src1.value); + } else if (def_opcode == OPCODE_MAX) { + return GetScalarNZM(def->src2.value) | GetScalarNZM(def->src1.value); + } else if (Is1BitOpcode(def_opcode)) { + return 1ULL; + } else if (def_opcode == OPCODE_CAST) { + return GetScalarNZM(def->src1.value); + } + + return typemask; +} +uint64_t SimplificationPass::GetScalarNZM(hir::Value* value) { + if (value->IsConstant()) { + return value->AsUint64(); + } + + uint64_t default_return = GetScalarTypeMask(value->type); + + hir::Instr* def = value->def; + if (!def) { + return default_return; + } + return GetScalarNZM(value, def, default_return, def->opcode->num) & + default_return; +} +bool SimplificationPass::CheckAnd(hir::Instr* i) { +retry_and_simplification: + auto [constant_value, variable_value] = i->BinaryValueArrangeAsConstAndVar(); + if (!constant_value) return false; + + // todo: check if masking with mask that covers all of zero extension source + uint64_t type_mask = GetScalarTypeMask(i->dest->type); + // if masking with entire width, pointless instruction so become an assign + + if (constant_value->AsUint64() == type_mask) { + i->Replace(&OPCODE_ASSIGN_info, 0); + i->set_src1(variable_value); + return true; + } + + auto variable_def = variable_value->def; + + if (variable_def) { + auto true_variable_def = variable_def->GetDestDefSkipAssigns(); + if (true_variable_def) { + if (true_variable_def->opcode == &OPCODE_AND_info) { + auto [variable_def_constant, variable_def_variable] = + true_variable_def->BinaryValueArrangeAsConstAndVar(); + + if (variable_def_constant) { + // todo: check if masked 
with mask that was a subset of the current + // one and elim if so + if (variable_def_constant->AsUint64() == constant_value->AsUint64()) { + // we already masked the input with the same mask + i->Replace(&OPCODE_ASSIGN_info, 0); + i->set_src1(variable_value); + return true; + } + } + } else if (true_variable_def->opcode == &OPCODE_OR_info) { + Value* or_left = true_variable_def->src1.value; + Value* or_right = true_variable_def->src2.value; + + uint64_t left_nzm = GetScalarNZM(or_left); + + // the mask shares no bits with the left input's nzm, so and the right or input directly instead of the or output + if ((constant_value->AsUint64() & left_nzm) == 0) { + i->Replace(&OPCODE_AND_info, 0); + i->set_src1(or_right); + i->set_src2(constant_value); + return true; + } + + uint64_t right_nzm = GetScalarNZM(or_right); + + if ((constant_value->AsUint64() & right_nzm) == 0) { + i->Replace(&OPCODE_AND_info, 0); + i->set_src1(or_left); + i->set_src2(constant_value); + return true; + } + } else if (true_variable_def->opcode == &OPCODE_ROTATE_LEFT_info) { + if (true_variable_def->src2.value->IsConstant()) { + if (((type_mask << true_variable_def->src2.value->AsUint64()) & + type_mask) == + constant_value->AsUint64()) { // rotated bits are unused, convert + // to shift if we are the only use + if (true_variable_def->dest->use_head->next == nullptr) { + // one use, convert to shift. NOTE(review): after the retry no branch matches the new SHL def, so this path returns false even though the def's opcode was changed - confirm that is intended + true_variable_def->opcode = &OPCODE_SHL_info; + goto retry_and_simplification; + } + } + } + } + } + } + + return false; +} +bool SimplificationPass::SimplifyBitArith(hir::HIRBuilder* builder) { + bool result = false; + auto block = builder->first_block(); + while (block) { + auto i = block->instr_head; + while (i) { + // vector types use the same opcodes as scalar ones for AND/OR/XOR! 
we + // don't handle these in our simplifications, so skip + if (i->dest && i->dest->type != VEC128_TYPE) { + if (i->opcode == &OPCODE_OR_info) { + result |= CheckOr(i); + } else if (i->opcode == &OPCODE_XOR_info) { + result |= CheckXor(i); + } else if (i->opcode == &OPCODE_AND_info) { + result |= CheckAnd(i); + } + } + i = i->next; + } + block = block->next; + } + return result; +} bool SimplificationPass::EliminateConversions(HIRBuilder* builder) { // First, we check for truncates/extensions that can be skipped. // This generates some assignments which then the second step will clean up. @@ -158,6 +388,7 @@ bool SimplificationPass::SimplifyAssignments(HIRBuilder* builder) { i->set_src3(CheckValue(i->src3.value, modified)); result |= modified; } + i = i->next; } block = block->next; diff --git a/src/xenia/cpu/compiler/passes/simplification_pass.h b/src/xenia/cpu/compiler/passes/simplification_pass.h index 2ba6efad7..b2ef9bd95 100644 --- a/src/xenia/cpu/compiler/passes/simplification_pass.h +++ b/src/xenia/cpu/compiler/passes/simplification_pass.h @@ -31,6 +31,19 @@ class SimplificationPass : public ConditionalGroupSubpass { bool SimplifyAssignments(hir::HIRBuilder* builder); hir::Value* CheckValue(hir::Value* value, bool& result); + bool SimplifyBitArith(hir::HIRBuilder* builder); + // handle an or or an xor with a constant 0 (becomes an assign) + bool CheckOrXorZero(hir::Instr* i); + bool CheckOr(hir::Instr* i); + bool CheckXor(hir::Instr* i); + bool CheckAnd(hir::Instr* i); + static bool Is1BitOpcode(hir::Opcode def_opcode); + static uint64_t GetScalarNZM(hir::Value* value, hir::Instr* def, + uint64_t typemask, hir::Opcode def_opcode); + // todo: use valuemask + // returns the maybe-nonzero mask (nzm) for value (mask of bits that may possibly hold + // information) + static uint64_t GetScalarNZM(hir::Value* value); }; } // namespace passes diff --git a/src/xenia/cpu/hir/instr.cc b/src/xenia/cpu/hir/instr.cc index 657dc5f53..4096d8e4a 100644 --- a/src/xenia/cpu/hir/instr.cc +++ 
b/src/xenia/cpu/hir/instr.cc @@ -114,7 +114,20 @@ void Instr::Remove() { block->instr_tail = prev; } } +Instr* Instr::GetDestDefSkipAssigns() { + Instr* current_def = this; + while (current_def->opcode == &OPCODE_ASSIGN_info) { + Instr* next_def = current_def->src1.value->def; + + if (!next_def) { + return nullptr; + } + + current_def = next_def; + } + return current_def; +} } // namespace hir } // namespace cpu } // namespace xe diff --git a/src/xenia/cpu/hir/instr.h b/src/xenia/cpu/hir/instr.h index 67fe47ede..035be44d8 100644 --- a/src/xenia/cpu/hir/instr.h +++ b/src/xenia/cpu/hir/instr.h @@ -59,6 +59,52 @@ class Instr { void MoveBefore(Instr* other); void Replace(const OpcodeInfo* new_opcode, uint16_t new_flags); void Remove(); + + template + std::pair BinaryValueArrangeByPredicateExclusive( + TPredicate&& pred) { + auto src1_value = src1.value; + auto src2_value = src2.value; + if (!src1_value || !src2_value) return {nullptr, nullptr}; + + if (!opcode) return {nullptr, nullptr}; // impossible! + + // check if binary opcode taking two values. 
we don't care if the dest is a + // value + + if (!IsOpcodeBinaryValue(opcode->signature)) return {nullptr, nullptr}; + + if (pred(src1_value)) { + if (pred(src2_value)) { + return {nullptr, nullptr}; + } else { + return {src1_value, src2_value}; + } + } else if (pred(src2_value)) { + return {src2_value, src1_value}; + } else { + return {nullptr, nullptr}; + } + } + + /* arranges the two operands as [constant, variable]: +if src1 is constant, and src2 is not, return [src1, src2] +if src2 is constant, and src1 is not, return [src2, src1] +if neither is constant, return [nullptr, nullptr] +if both are constant, return [nullptr, nullptr] +*/ + std::pair BinaryValueArrangeAsConstAndVar() { + return BinaryValueArrangeByPredicateExclusive( + [](Value* value) { return value->IsConstant(); }); + } + std::pair BinaryValueArrangeByDefiningOpcode( + const OpcodeInfo* op_ptr) { + return BinaryValueArrangeByPredicateExclusive([op_ptr](Value* value) { + return value->def && value->def->opcode == op_ptr; + }); + } + + Instr* GetDestDefSkipAssigns(); }; } // namespace hir diff --git a/src/xenia/cpu/hir/opcodes.h b/src/xenia/cpu/hir/opcodes.h index 6f45bb8da..8e681c757 100644 --- a/src/xenia/cpu/hir/opcodes.h +++ b/src/xenia/cpu/hir/opcodes.h @@ -347,6 +347,10 @@ enum OpcodeSignature { #define GET_OPCODE_SIG_TYPE_SRC1(sig) (OpcodeSignatureType)((sig >> 3) & 0x7) #define GET_OPCODE_SIG_TYPE_SRC2(sig) (OpcodeSignatureType)((sig >> 6) & 0x7) #define GET_OPCODE_SIG_TYPE_SRC3(sig) (OpcodeSignatureType)((sig >> 9) & 0x7) +static bool IsOpcodeBinaryValue(uint32_t signature) { + return (signature & ~(0x7)) == + ((OPCODE_SIG_TYPE_V << 3) | (OPCODE_SIG_TYPE_V << 6)); +} typedef struct { uint32_t flags; diff --git a/src/xenia/cpu/hir/value.h b/src/xenia/cpu/hir/value.h index e3426a816..94a8c5965 100644 --- a/src/xenia/cpu/hir/value.h +++ b/src/xenia/cpu/hir/value.h @@ -57,6 +57,15 @@ inline size_t GetTypeSize(TypeName type_name) { return 0; } } +inline uint64_t GetScalarTypeMask(TypeName type_name) { + size_t mask_width = GetTypeSize(type_name); + 
+ if (mask_width == 8) { + return ~0ULL; + } else { + return (1ULL << (mask_width * CHAR_BIT)) - 1; + } +} enum ValueFlags { VALUE_IS_CONSTANT = (1 << 1), @@ -68,6 +77,23 @@ struct RegAssignment { int32_t index; }; +struct ValueMask { + uint64_t low; // low 64 bits, usually for scalar values + uint64_t high; // high 64 bits, only used for vector types + + ValueMask(uint64_t _low, uint64_t _high) : low(_low), high(_high) {} + + ValueMask operator&(ValueMask other) const { + return ValueMask{low & other.low, high & other.high}; + } + ValueMask operator|(ValueMask other) const { + return ValueMask{low | other.low, high | other.high}; + } + ValueMask operator^(ValueMask other) const { + return ValueMask{low ^ other.low, high ^ other.high}; + } +}; + class Value { public: typedef struct Use_s { diff --git a/src/xenia/cpu/ppc/ppc_emit_alu.cc b/src/xenia/cpu/ppc/ppc_emit_alu.cc index 81cd2a05b..7139e4c50 100644 --- a/src/xenia/cpu/ppc/ppc_emit_alu.cc +++ b/src/xenia/cpu/ppc/ppc_emit_alu.cc @@ -1023,6 +1023,17 @@ int InstrEmit_rlwimix(PPCHIRBuilder& f, const InstrData& i) { } return 0; } +static bool InstrCheck_rlx_only_needs_low(unsigned rotation, uint64_t mask) { + uint32_t mask32 = static_cast(mask); + if (static_cast(mask32) != mask) { + return false; + } + uint32_t all_ones_32 = ~0U; + all_ones_32 <<= rotation; + + return all_ones_32 == mask32; // mask fits in 32 bits and equals ~0U shifted left by the + // rotation, so all bits from the rotation are discarded +} int InstrEmit_rlwinmx(PPCHIRBuilder& f, const InstrData& i) { // n <- SH @@ -1031,23 +1042,47 @@ int InstrEmit_rlwinmx(PPCHIRBuilder& f, const InstrData& i) { // RA <- r & m Value* v = f.LoadGPR(i.M.RT); - // (x||x) - v = f.Or(f.Shl(v, 32), f.ZeroExtend(f.Truncate(v, INT32_TYPE), INT64_TYPE)); + unsigned rotation = i.M.SH; - // TODO(benvanik): optimize srwi - // TODO(benvanik): optimize slwi - // The compiler will generate a bunch of these for the special case of SH=0. - // Which seems to just select some bits and set cr0 for use with a branch. 
- // We can detect this and do less work. - if (i.M.SH) { - v = f.RotateLeft(v, f.LoadConstantInt8(i.M.SH)); - } - // Compiler sometimes masks with 0xFFFFFFFF (identity) - avoid the work here - // as our truncation/zero-extend does it for us. uint64_t m = XEMASK(i.M.MB + 32, i.M.ME + 32); - if (m != 0xFFFFFFFFFFFFFFFFull) { + + // in uint32 range (so no register concat/truncate/zero-extend needed) and no rotation + if (m < (1ULL << 32) && (rotation == 0)) { v = f.And(v, f.LoadConstantUint64(m)); } + // masks out all the bits that are rotated in from the right, so just do a + // shift followed by an and. the and with 0xFFFFFFFF is done instead of a + // truncate/zero-extend because we have a special case for it in the emitters that will just do a + // single instruction (mov reg32, lowpartofreg64), otherwise we generate + // significantly more code from setting up the operands of the truncate/zero-extend + else if (InstrCheck_rlx_only_needs_low(rotation, m)) { + // this path is taken for like 90% of all rlwinms + v = f.And(f.Shl(v, rotation), f.LoadConstantUint64(0xFFFFFFFF)); + } + + else { + // (x||x) + // cs: changed this to mask with UINT32_MAX instead of doing the + // truncate/extend, this generates better code in the backend and is easier + // to do analysis on + v = f.And(v, f.LoadConstantUint64(0xFFFFFFFF)); + + v = f.Or(f.Shl(v, 32), v); + + // TODO(benvanik): optimize srwi + // TODO(benvanik): optimize slwi + // The compiler will generate a bunch of these for the special case of SH=0. + // Which seems to just select some bits and set cr0 for use with a branch. + // We can detect this and do less work. + if (i.M.SH) { + v = f.RotateLeft(v, f.LoadConstantInt8(rotation)); + } + // Compiler sometimes masks with 0xFFFFFFFF (identity) - avoid the work here + // as our truncation/zero-extend does it for us. 
+ if (m != 0xFFFFFFFFFFFFFFFFull) { + v = f.And(v, f.LoadConstantUint64(m)); + } + } f.StoreGPR(i.M.RA, v); if (i.M.Rc) { f.UpdateCR(0, v); From 08232de8ccc4f6d0bd059cea2608b0354eb03775 Mon Sep 17 00:00:00 2001 From: chrisps Date: Sun, 26 Jun 2022 09:30:56 -0700 Subject: [PATCH 2/2] patch a mistake in NZM calculation for OPCODE_NOT --- src/xenia/cpu/compiler/passes/simplification_pass.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/xenia/cpu/compiler/passes/simplification_pass.cc b/src/xenia/cpu/compiler/passes/simplification_pass.cc index 6c00ef9c1..3ab9276a6 100644 --- a/src/xenia/cpu/compiler/passes/simplification_pass.cc +++ b/src/xenia/cpu/compiler/passes/simplification_pass.cc @@ -102,7 +102,7 @@ uint64_t SimplificationPass::GetScalarNZM(hir::Value* value, hir::Instr* def, } else if (def_opcode == OPCODE_XOR || def_opcode == OPCODE_OR) { return GetScalarNZM(def->src1.value) | GetScalarNZM(def->src2.value); } else if (def_opcode == OPCODE_NOT) { - return ~GetScalarNZM(def->src1.value); + return typemask; } else if (def_opcode == OPCODE_ASSIGN) { return GetScalarNZM(def->src1.value); } else if (def_opcode == OPCODE_BYTE_SWAP) {