From 9dfbef8acf26f37ccc20ed885c603fd82c889195 Mon Sep 17 00:00:00 2001
From: chrisps
Date: Fri, 17 Jan 2020 07:28:36 -0800
Subject: [PATCH 1/5] Smaller ComputeMemoryAddress/Offset sequence

Replace the movzx after setae in both ComputeMemoryAddressOffset and
ComputeMemoryAddress with an xor_ of eax prior to the cmp (the xor has to
come first because it clobbers the flags). This shortens both sequences by
one byte and, given how frequently these sequences are emitted, should
moderately reduce ICache usage.
---
 src/xenia/cpu/backend/x64/x64_seq_memory.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/xenia/cpu/backend/x64/x64_seq_memory.cc b/src/xenia/cpu/backend/x64/x64_seq_memory.cc
index f57d8352d..191146e6f 100644
--- a/src/xenia/cpu/backend/x64/x64_seq_memory.cc
+++ b/src/xenia/cpu/backend/x64/x64_seq_memory.cc
@@ -52,9 +52,9 @@ RegExp ComputeMemoryAddressOffset(X64Emitter& e, const T& guest,
   if (xe::memory::allocation_granularity() > 0x1000) {
     // Emulate the 4 KB physical address offset in 0xE0000000+ when can't do
     // it via memory mapping.
+    e.xor_(e.eax, e.eax);
     e.cmp(guest.reg().cvt32(), 0xE0000000 - offset_const);
     e.setae(e.al);
-    e.movzx(e.eax, e.al);
     e.shl(e.eax, 12);
     e.add(e.eax, guest.reg().cvt32());
   } else {
@@ -89,9 +89,9 @@ RegExp ComputeMemoryAddress(X64Emitter& e, const T& guest) {
   if (xe::memory::allocation_granularity() > 0x1000) {
     // Emulate the 4 KB physical address offset in 0xE0000000+ when can't do
     // it via memory mapping.
+    e.xor_(e.eax, e.eax);
     e.cmp(guest.reg().cvt32(), 0xE0000000);
     e.setae(e.al);
-    e.movzx(e.eax, e.al);
     e.shl(e.eax, 12);
     e.add(e.eax, guest.reg().cvt32());
   } else {
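
The sequence above computes a branchless bias: guest addresses that land at or
above 0xE0000000 (after the constant offset is folded in) get an extra 0x1000
added when the host allocation granularity is too coarse to express that 4 KB
offset through memory mapping, and pre-clearing eax lets setae write the low
byte of an already zero-extended register. A minimal C++ sketch of the value
the xor/cmp/setae/shl/add sequence leaves in eax (illustration only; the helper
name is hypothetical and not part of the patch):

// Illustration only -- not code from the patch; the emitted sequence computes
// the same value entirely in registers.
#include <cstdint>

static uint32_t BiasedGuestAddress(uint32_t guest_address,
                                   uint32_t offset_const) {
  // setae produces 0 or 1; shifting it left by 12 adds the 4 KB page bias only
  // for guest addresses at or above 0xE0000000.
  uint32_t at_or_above =
      (guest_address >= 0xE0000000u - offset_const) ? 1u : 0u;
  return guest_address + (at_or_above << 12);
}
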
From 3ad80810b5ab11248580d71bf0b04a8754d0ab46 Mon Sep 17 00:00:00 2001
From: chrisps
Date: Wed, 15 Jan 2020 15:57:09 -0800
Subject: [PATCH 2/5] Optimized CONVERT_I64_F64 (f64 -> i64) with a neat
 overflow trick

Reduced the instruction count from 11 to 8 and eliminated a movq stall.
---
 src/xenia/cpu/backend/x64/x64_sequences.cc | 27 ++++++++-------------------
 1 file changed, 8 insertions(+), 19 deletions(-)

diff --git a/src/xenia/cpu/backend/x64/x64_sequences.cc b/src/xenia/cpu/backend/x64/x64_sequences.cc
index 34cef4f7d..07883ba00 100644
--- a/src/xenia/cpu/backend/x64/x64_sequences.cc
+++ b/src/xenia/cpu/backend/x64/x64_sequences.cc
@@ -317,31 +317,20 @@ struct CONVERT_I32_F64
 struct CONVERT_I64_F64
     : Sequence<CONVERT_I64_F64, I<OPCODE_CONVERT, I64Op, F64Op>> {
   static void Emit(X64Emitter& e, const EmitArgType& i) {
-    // Copy src1.
-    e.movq(e.rcx, i.src1);
+    e.xor_(e.eax, e.eax);

-    // TODO(benvanik): saturation check? cvtt* (trunc?)
+    e.vcomisd(i.src1, e.GetXmmConstPtr(XmmConst::XMMZero));
     if (i.instr->flags == ROUND_TO_ZERO) {
       e.vcvttsd2si(i.dest, i.src1);
     } else {
       e.vcvtsd2si(i.dest, i.src1);
     }
-
-    // 0x8000000000000000
-    e.mov(e.rax, 0x1);
-    e.shl(e.rax, 63);
-
-    // Saturate positive overflow
-    // TODO(DrChat): Find a shorter equivalent sequence.
-    // if (result ind. && src1 >= 0)
-    //   result = 0x7FFFFFFFFFFFFFFF;
-    e.cmp(e.rax, i.dest);
-    e.sete(e.al);
-    e.movzx(e.rax, e.al);
-    e.shr(e.rcx, 63);
-    e.xor_(e.rcx, 0x01);
-    e.and_(e.rax, e.rcx);
-
+    // the vcomisd above sets cf if src1 < 0.0
+    e.setnc(e.cl);
+    e.cmp(i.dest, -1LL);
+    // if dest == 0x8000000000000000 and not inp < 0 then dest = 0x7FFFFFFFFFFFFFFF
+    e.seto(e.al);
+    e.and_(e.al, e.cl);
     e.sub(i.dest, e.rax);
   }
 };
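
The trick leans on two x86 facts: vcvtsd2si/vcvttsd2si return the integer
indefinite value 0x8000000000000000 for NaN and out-of-range inputs, and
vcomisd against zero leaves CF set when src1 is below zero (or unordered). The
final sub of the 0-or-1 in rax is meant to turn a positive-overflow result into
0x7FFFFFFFFFFFFFFF by wrapping 0x8000000000000000 down by one. A plain C++
sketch of the conversion behaviour the comments describe (illustration only;
the helper name is hypothetical and this is not the emitter code):

// Illustration only -- a scalar sketch of the behaviour described by the
// comments above, not the emitted sequence itself.
#include <cmath>
#include <cstdint>
#include <limits>

static int64_t ConvertF64ToI64Saturating(double value) {
  constexpr int64_t kMin = std::numeric_limits<int64_t>::min();
  constexpr int64_t kMax = std::numeric_limits<int64_t>::max();
  // cvtsd2si reports NaN and out-of-range inputs as 0x8000000000000000.
  if (std::isnan(value)) {
    return kMin;
  }
  if (value >= 9223372036854775808.0) {  // 2^63: positive overflow
    // The emitted code reaches this result by subtracting 1 from
    // 0x8000000000000000, which wraps to 0x7FFFFFFFFFFFFFFF.
    return kMax;
  }
  if (value < -9223372036854775808.0) {  // below -2^63: negative overflow
    return kMin;
  }
  return static_cast<int64_t>(value);
}
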
From 3675b3860a2cceaa0f2e3b792276c3d5eecb73e4 Mon Sep 17 00:00:00 2001
From: "chss95cs@gmail.com"
Date: Sat, 3 Aug 2019 13:26:54 -0700
Subject: [PATCH 3/5] Add constant folding for OPCODE_ROTATE_LEFT

---
 .../passes/constant_propagation_pass.cc |  9 ++++++++-
 src/xenia/cpu/hir/value.cc              | 23 +++++++++++++++++++++++
 src/xenia/cpu/hir/value.h               |  1 +
 3 files changed, 32 insertions(+), 1 deletion(-)

diff --git a/src/xenia/cpu/compiler/passes/constant_propagation_pass.cc b/src/xenia/cpu/compiler/passes/constant_propagation_pass.cc
index b6b0376fa..acd1b5d64 100644
--- a/src/xenia/cpu/compiler/passes/constant_propagation_pass.cc
+++ b/src/xenia/cpu/compiler/passes/constant_propagation_pass.cc
@@ -712,7 +712,14 @@ bool ConstantPropagationPass::Run(HIRBuilder* builder, bool& result) {
           result = true;
         }
         break;
-      // TODO(benvanik): ROTATE_LEFT
+      case OPCODE_ROTATE_LEFT:
+        if (i->src1.value->IsConstant() && i->src2.value->IsConstant()) {
+          v->set_from(i->src1.value);
+          v->RotateLeft(i->src2.value);
+          i->Remove();
+          result = true;
+        }
+        break;
       case OPCODE_BYTE_SWAP:
         if (i->src1.value->IsConstant()) {
          v->set_from(i->src1.value);
diff --git a/src/xenia/cpu/hir/value.cc b/src/xenia/cpu/hir/value.cc
index 28ed07ee7..40796064c 100644
--- a/src/xenia/cpu/hir/value.cc
+++ b/src/xenia/cpu/hir/value.cc
@@ -813,6 +813,29 @@ void Value::Sha(Value* other) {
   }
 }

+void Value::RotateLeft(Value* other) {
+  assert_true(other->type == INT8_TYPE);
+  auto rotation = other->constant.u8;
+
+  switch (type) {
+    case INT8_TYPE:
+      constant.u8 = rotate_left(constant.u8, rotation);
+      break;
+    case INT16_TYPE:
+      constant.u16 = rotate_left(constant.u16, rotation);
+      break;
+    case INT32_TYPE:
+      constant.u32 = rotate_left(constant.u32, rotation);
+      break;
+    case INT64_TYPE:
+      constant.u64 = rotate_left(constant.u64, rotation);
+      break;
+    default:
+      assert_unhandled_case(type);
+      break;
+  }
+}
+
 void Value::Extract(Value* vec, Value* index) {
   assert_true(vec->type == VEC128_TYPE);
   switch (type) {
diff --git a/src/xenia/cpu/hir/value.h b/src/xenia/cpu/hir/value.h
index dcc95ca8c..d2e0fbf6d 100644
--- a/src/xenia/cpu/hir/value.h
+++ b/src/xenia/cpu/hir/value.h
@@ -519,6 +519,7 @@ class Value {
   void Shl(Value* other);
   void Shr(Value* other);
   void Sha(Value* other);
+  void RotateLeft(Value* other);
   void Extract(Value* vec, Value* index);
   void Select(Value* other, Value* ctrl);
   void Splat(Value* other);
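
The new Value::RotateLeft defers to a rotate_left helper that is defined
elsewhere in the tree and is not part of this diff. For reference, a minimal
stand-in with the same shape (illustration only; it assumes unsigned operands
and tolerates rotation amounts of zero or more than the bit width, and it is
not the helper xenia actually uses):

// Illustration only -- a hypothetical stand-in for the rotate_left helper the
// folding above calls.
#include <cstdint>
#include <limits>

template <typename T>
static T RotateLeftExample(T value, uint8_t amount) {
  constexpr unsigned kBits = std::numeric_limits<T>::digits;
  const unsigned n = amount % kBits;  // avoid an undefined shift by kBits
  if (n == 0) {
    return value;
  }
  return static_cast<T>((value << n) | (value >> (kBits - n)));
}

// Example: RotateLeftExample<uint8_t>(0x81, 1) == 0x03.
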
From 8a8ff6ae461034433ec22c6bd678fbd8b9e853c4 Mon Sep 17 00:00:00 2001
From: "chss95cs@gmail.com"
Date: Sat, 3 Aug 2019 13:44:16 -0700
Subject: [PATCH 4/5] Reuse flag results in OPCODE_BRANCH_TRUE codegen if the
 preceding instruction was a comparison that already set the CPU flags

---
 src/xenia/cpu/backend/x64/x64_seq_control.cc | 92 +++++++++++++++++---
 1 file changed, 79 insertions(+), 13 deletions(-)

diff --git a/src/xenia/cpu/backend/x64/x64_seq_control.cc b/src/xenia/cpu/backend/x64/x64_seq_control.cc
index 80eeeebc7..46c879218 100644
--- a/src/xenia/cpu/backend/x64/x64_seq_control.cc
+++ b/src/xenia/cpu/backend/x64/x64_seq_control.cc
@@ -20,7 +20,59 @@ namespace backend {
 namespace x64 {

 volatile int anchor_control = 0;
-
+template <typename T>
+static void EmitFusedBranch(X64Emitter& e, const T& i) {
+  bool valid = i.instr->prev && i.instr->prev->dest == i.src1.value;
+  auto opcode = valid ? i.instr->prev->opcode->num : -1;
+  if (valid) {
+    auto name = i.src2.value->name;
+    switch (opcode) {
+      case OPCODE_IS_TRUE:
+        e.jnz(name, e.T_NEAR);
+        break;
+      case OPCODE_IS_FALSE:
+        e.jz(name, e.T_NEAR);
+        break;
+      case OPCODE_COMPARE_EQ:
+        e.je(name, e.T_NEAR);
+        break;
+      case OPCODE_COMPARE_NE:
+        e.jne(name, e.T_NEAR);
+        break;
+      case OPCODE_COMPARE_SLT:
+        e.jl(name, e.T_NEAR);
+        break;
+      case OPCODE_COMPARE_SLE:
+        e.jle(name, e.T_NEAR);
+        break;
+      case OPCODE_COMPARE_SGT:
+        e.jg(name, e.T_NEAR);
+        break;
+      case OPCODE_COMPARE_SGE:
+        e.jge(name, e.T_NEAR);
+        break;
+      case OPCODE_COMPARE_ULT:
+        e.jb(name, e.T_NEAR);
+        break;
+      case OPCODE_COMPARE_ULE:
+        e.jbe(name, e.T_NEAR);
+        break;
+      case OPCODE_COMPARE_UGT:
+        e.ja(name, e.T_NEAR);
+        break;
+      case OPCODE_COMPARE_UGE:
+        e.jae(name, e.T_NEAR);
+        break;
+      default:
+        e.test(i.src1, i.src1);
+        e.jnz(name, e.T_NEAR);
+        break;
+    }
+  } else {
+    e.test(i.src1, i.src1);
+    e.jnz(i.src2.value->name, e.T_NEAR);
+  }
+}
 // ============================================================================
 // OPCODE_DEBUG_BREAK
 // ============================================================================
@@ -450,43 +502,57 @@ EMITTER_OPCODE_TABLE(OPCODE_BRANCH, BRANCH);
 struct BRANCH_TRUE_I8
     : Sequence<BRANCH_TRUE_I8, I<OPCODE_BRANCH_TRUE, VoidOp, I8Op, LabelOp>> {
   static void Emit(X64Emitter& e, const EmitArgType& i) {
-    e.test(i.src1, i.src1);
-    e.jnz(i.src2.value->name, e.T_NEAR);
+    EmitFusedBranch(e, i);
   }
 };
 struct BRANCH_TRUE_I16
     : Sequence<BRANCH_TRUE_I16, I<OPCODE_BRANCH_TRUE, VoidOp, I16Op, LabelOp>> {
   static void Emit(X64Emitter& e, const EmitArgType& i) {
-    e.test(i.src1, i.src1);
-    e.jnz(i.src2.value->name, e.T_NEAR);
+    EmitFusedBranch(e, i);
   }
 };
 struct BRANCH_TRUE_I32
     : Sequence<BRANCH_TRUE_I32, I<OPCODE_BRANCH_TRUE, VoidOp, I32Op, LabelOp>> {
   static void Emit(X64Emitter& e, const EmitArgType& i) {
-    e.test(i.src1, i.src1);
-    e.jnz(i.src2.value->name, e.T_NEAR);
+    EmitFusedBranch(e, i);
   }
 };
 struct BRANCH_TRUE_I64
     : Sequence<BRANCH_TRUE_I64, I<OPCODE_BRANCH_TRUE, VoidOp, I64Op, LabelOp>> {
   static void Emit(X64Emitter& e, const EmitArgType& i) {
-    e.test(i.src1, i.src1);
-    e.jnz(i.src2.value->name, e.T_NEAR);
+    EmitFusedBranch(e, i);
   }
 };
 struct BRANCH_TRUE_F32
     : Sequence<BRANCH_TRUE_F32, I<OPCODE_BRANCH_TRUE, VoidOp, F32Op, LabelOp>> {
   static void Emit(X64Emitter& e, const EmitArgType& i) {
-    e.vptest(i.src1, i.src1);
-    e.jnz(i.src2.value->name, e.T_NEAR);
+    if (i.instr->prev && i.instr->prev->opcode == &OPCODE_IS_TRUE_info &&
+        i.instr->prev->dest == i.src1.value) {
+      e.jnz(i.src2.value->name, e.T_NEAR);
+    } else if (i.instr->prev &&
+               i.instr->prev->opcode == &OPCODE_IS_FALSE_info &&
+               i.instr->prev->dest == i.src1.value) {
+      e.jz(i.src2.value->name, e.T_NEAR);
+    } else {
+      e.vptest(i.src1, i.src1);
+      e.jnz(i.src2.value->name, e.T_NEAR);
+    }
   }
 };
 struct BRANCH_TRUE_F64
     : Sequence<BRANCH_TRUE_F64, I<OPCODE_BRANCH_TRUE, VoidOp, F64Op, LabelOp>> {
   static void Emit(X64Emitter& e, const EmitArgType& i) {
-    e.vptest(i.src1, i.src1);
-    e.jnz(i.src2.value->name, e.T_NEAR);
+    if (i.instr->prev && i.instr->prev->opcode == &OPCODE_IS_TRUE_info &&
+        i.instr->prev->dest == i.src1.value) {
+      e.jnz(i.src2.value->name, e.T_NEAR);
+    } else if (i.instr->prev &&
+               i.instr->prev->opcode == &OPCODE_IS_FALSE_info &&
+               i.instr->prev->dest == i.src1.value) {
+      e.jz(i.src2.value->name, e.T_NEAR);
+    } else {
+      e.vptest(i.src1, i.src1);
+      e.jnz(i.src2.value->name, e.T_NEAR);
+    }
   }
 };
 EMITTER_OPCODE_TABLE(OPCODE_BRANCH_TRUE, BRANCH_TRUE_I8, BRANCH_TRUE_I16,
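
The switch in EmitFusedBranch only changes which jcc is emitted; the part worth
spelling out is that signed and unsigned HIR compares must map to different
condition codes even though the operand bits are identical. A small standalone
C++ example of why (illustration only, unrelated to the emitter):

// Illustration only: the same 32-bit pattern orders differently under
// unsigned (jb/ja) and signed (jl/jg) interpretation, which is why the switch
// distinguishes OPCODE_COMPARE_ULT from OPCODE_COMPARE_SLT and so on.
#include <cstdint>
#include <cstdio>

int main() {
  uint32_t a = 0xFFFFFFFFu;  // -1 when reinterpreted as int32_t
  uint32_t b = 1u;
  bool ult = a < b;                                              // false: jb falls through
  bool slt = static_cast<int32_t>(a) < static_cast<int32_t>(b);  // true: jl is taken
  std::printf("ULT(a, b) = %d, SLT(a, b) = %d\n", ult, slt);
  return 0;
}
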
From e4fd015886c5e4361185b6ec9466707ef4253b2b Mon Sep 17 00:00:00 2001
From: chrisps
Date: Fri, 17 Jun 2022 14:01:17 +0200
Subject: [PATCH 5/5] Assorted x64 codegen optimizations: reuse XMM constant
 pool entries, use imul/lea for constant multiplies, and emit shorter
 zero-extend, add-carry and AND-with-mask sequences

---
 src/xenia/cpu/backend/x64/x64_emitter.cc   |  30 +++++-
 src/xenia/cpu/backend/x64/x64_sequences.cc | 106 +++++++++++++++++++--
 2 files changed, 122 insertions(+), 14 deletions(-)

diff --git a/src/xenia/cpu/backend/x64/x64_emitter.cc b/src/xenia/cpu/backend/x64/x64_emitter.cc
index 97b14e03e..fb1fe138a 100644
--- a/src/xenia/cpu/backend/x64/x64_emitter.cc
+++ b/src/xenia/cpu/backend/x64/x64_emitter.cc
@@ -818,6 +818,12 @@ void X64Emitter::LoadConstantXmm(Xbyak::Xmm dest, const vec128_t& v) {
     // 1111...
     vpcmpeqb(dest, dest);
   } else {
+    for (size_t i = 0; i < (kConstDataSize / sizeof(vec128_t)); ++i) {
+      if (xmm_consts[i] == v) {
+        vmovapd(dest, GetXmmConstPtr((XmmConst)i));
+        return;
+      }
+    }
     // TODO(benvanik): see what other common values are.
     // TODO(benvanik): build constant table - 99% are reused.
     MovMem64(rsp + kStashOffset, v.low);
@@ -833,11 +839,19 @@ void X64Emitter::LoadConstantXmm(Xbyak::Xmm dest, float v) {
   } x = {v};
   if (!x.i) {
     // +0.0f (but not -0.0f because it may be used to flip the sign via xor).
-    vpxor(dest, dest);
+    vxorps(dest, dest);
   } else if (x.i == ~uint32_t(0)) {
     // 1111...
-    vpcmpeqb(dest, dest);
+    vcmpeqss(dest, dest);
   } else {
+    unsigned raw_bits = *reinterpret_cast<unsigned int*>(&v);
+
+    for (size_t i = 0; i < (kConstDataSize / sizeof(vec128_t)); ++i) {
+      if (xmm_consts[i].u32[0] == raw_bits) {
+        vmovss(dest, GetXmmConstPtr((XmmConst)i));
+        return;
+      }
+    }
     // TODO(benvanik): see what other common values are.
     // TODO(benvanik): build constant table - 99% are reused.
     mov(eax, x.i);
@@ -852,11 +866,19 @@ void X64Emitter::LoadConstantXmm(Xbyak::Xmm dest, double v) {
   } x = {v};
   if (!x.i) {
     // +0.0 (but not -0.0 because it may be used to flip the sign via xor).
-    vpxor(dest, dest);
+    vxorpd(dest, dest);
   } else if (x.i == ~uint64_t(0)) {
     // 1111...
-    vpcmpeqb(dest, dest);
+    vcmpeqpd(dest, dest);
   } else {
+    uint64_t raw_bits = *reinterpret_cast<uint64_t*>(&v);
+
+    for (size_t i = 0; i < (kConstDataSize / sizeof(vec128_t)); ++i) {
+      if (xmm_consts[i].u64[0] == raw_bits) {
+        vmovsd(dest, GetXmmConstPtr((XmmConst)i));
+        return;
+      }
+    }
     // TODO(benvanik): see what other common values are.
     // TODO(benvanik): build constant table - 99% are reused.
     mov(rax, x.i);
diff --git a/src/xenia/cpu/backend/x64/x64_sequences.cc b/src/xenia/cpu/backend/x64/x64_sequences.cc
index 07883ba00..44d8bc439 100644
--- a/src/xenia/cpu/backend/x64/x64_sequences.cc
+++ b/src/xenia/cpu/backend/x64/x64_sequences.cc
@@ -175,7 +175,7 @@ struct ZERO_EXTEND_I32_I8
 struct ZERO_EXTEND_I64_I8
     : Sequence<ZERO_EXTEND_I64_I8, I<OPCODE_ZERO_EXTEND, I64Op, I8Op>> {
   static void Emit(X64Emitter& e, const EmitArgType& i) {
-    e.movzx(i.dest, i.src1);
+    e.movzx(i.dest.reg().cvt32(), i.src1);
   }
 };
 struct ZERO_EXTEND_I32_I16
@@ -187,7 +187,7 @@ struct ZERO_EXTEND_I32_I16
 struct ZERO_EXTEND_I64_I16
     : Sequence<ZERO_EXTEND_I64_I16, I<OPCODE_ZERO_EXTEND, I64Op, I16Op>> {
   static void Emit(X64Emitter& e, const EmitArgType& i) {
-    e.movzx(i.dest, i.src1);
+    e.movzx(i.dest.reg().cvt32(), i.src1);
   }
 };
 struct ZERO_EXTEND_I64_I32
@@ -1209,14 +1209,7 @@ void EmitAddCarryXX(X64Emitter& e, const ARGS& i) {
       e.clc();
     }
   } else {
-    if (i.src3.reg().getIdx() <= 4) {
-      // Can move from A/B/C/DX to AH.
-      e.mov(e.ah, i.src3.reg().cvt8());
-    } else {
-      e.mov(e.al, i.src3);
-      e.mov(e.ah, e.al);
-    }
-    e.sahf();
+    e.bt(i.src3.reg().cvt32(), 0);
   }
   SEQ::EmitCommutativeBinaryOp(
       e, i,
@@ -1326,6 +1319,18 @@ EMITTER_OPCODE_TABLE(OPCODE_SUB, SUB_I8, SUB_I16, SUB_I32, SUB_I64, SUB_F32,
 // We exploit mulx here to avoid creating too much register pressure.
 struct MUL_I8 : Sequence<MUL_I8, I<OPCODE_MUL, I8Op, I8Op, I8Op>> {
   static void Emit(X64Emitter& e, const EmitArgType& i) {
+    if (i.src1.is_constant || i.src2.is_constant) {
+      uint64_t cval =
+          i.src1.is_constant ? i.src1.constant() : i.src2.constant();
+
+      if (cval < (1ull << 32)) {
+        auto& whichevs = i.src1.is_constant ? i.src2 : i.src1;
+
+        e.imul(i.dest, whichevs, (int)cval);
+        return;
+      }
+    }
+
     if (e.IsFeatureEnabled(kX64EmitBMI2)) {
       // mulx: $1:$2 = EDX * $3

@@ -1367,6 +1372,18 @@ struct MUL_I8 : Sequence<MUL_I8, I<OPCODE_MUL, I8Op, I8Op, I8Op>> {
 };
 struct MUL_I16 : Sequence<MUL_I16, I<OPCODE_MUL, I16Op, I16Op, I16Op>> {
   static void Emit(X64Emitter& e, const EmitArgType& i) {
+    if (i.src1.is_constant || i.src2.is_constant) {
+      uint64_t cval =
+          i.src1.is_constant ? i.src1.constant() : i.src2.constant();
+
+      if (cval < (1ull << 32)) {
+        auto& whichevs = i.src1.is_constant ? i.src2 : i.src1;
+
+        e.imul(i.dest, whichevs, (int)cval);
+        return;
+      }
+    }
+
     if (e.IsFeatureEnabled(kX64EmitBMI2)) {
       // mulx: $1:$2 = EDX * $3

@@ -1408,6 +1425,26 @@ struct MUL_I16 : Sequence<MUL_I16, I<OPCODE_MUL, I16Op, I16Op, I16Op>> {
 };
 struct MUL_I32 : Sequence<MUL_I32, I<OPCODE_MUL, I32Op, I32Op, I32Op>> {
   static void Emit(X64Emitter& e, const EmitArgType& i) {
+    if (i.src2.is_constant) {
+      uint32_t multiplier = i.src2.value->constant.u32;
+      if (multiplier == 3 || multiplier == 5 || multiplier == 9) {
+        e.lea(i.dest, e.ptr[i.src1.reg() * (multiplier - 1) + i.src1.reg()]);
+        return;
+      }
+    }
+
+    if (i.src1.is_constant || i.src2.is_constant) {
+      uint64_t cval =
+          i.src1.is_constant ? i.src1.constant() : i.src2.constant();
+
+      if (cval < (1ull << 32)) {
+        auto& whichevs = i.src1.is_constant ? i.src2 : i.src1;
+
+        e.imul(i.dest, whichevs, (int)cval);
+        return;
+      }
+    }
+
     if (e.IsFeatureEnabled(kX64EmitBMI2)) {
       // mulx: $1:$2 = EDX * $3

@@ -1450,6 +1487,27 @@ struct MUL_I32 : Sequence<MUL_I32, I<OPCODE_MUL, I32Op, I32Op, I32Op>> {
 };
 struct MUL_I64 : Sequence<MUL_I64, I<OPCODE_MUL, I64Op, I64Op, I64Op>> {
   static void Emit(X64Emitter& e, const EmitArgType& i) {
+    if (i.src2.is_constant) {
+      uint64_t multiplier = i.src2.value->constant.u64;
+      if (multiplier == 3 || multiplier == 5 || multiplier == 9) {
+        e.lea(i.dest,
+              e.ptr[i.src1.reg() * ((int)multiplier - 1) + i.src1.reg()]);
+        return;
+      }
+    }
+
+    if (i.src1.is_constant || i.src2.is_constant) {
+      uint64_t cval =
+          i.src1.is_constant ? i.src1.constant() : i.src2.constant();
+
+      if (cval < (1ull << 32)) {
+        auto& whichevs = i.src1.is_constant ? i.src2 : i.src1;
+
+        e.imul(i.dest, whichevs, (int)cval);
+        return;
+      }
+    }
+
     if (e.IsFeatureEnabled(kX64EmitBMI2)) {
       // mulx: $1:$2 = RDX * $3

@@ -2617,6 +2675,34 @@ void EmitAndXX(X64Emitter& e, const ARGS& i) {
         e.and_(dest_src, src);
       },
      [](X64Emitter& e, const REG& dest_src, int32_t constant) {
+        if (constant == 0xFF) {
+          if (dest_src.getBit() == 16 || dest_src.getBit() == 32) {
+            e.movzx(dest_src, dest_src.cvt8());
+            return;
+          } else if (dest_src.getBit() == 64) {
+            // take advantage of automatic zeroing of upper 32 bits
+            e.movzx(dest_src.cvt32(), dest_src.cvt8());
+            return;
+          }
+        } else if (constant == 0xFFFF) {
+          if (dest_src.getBit() == 32) {
+            e.movzx(dest_src, dest_src.cvt16());
+            return;
+          } else if (dest_src.getBit() == 64) {
+            e.movzx(dest_src.cvt32(), dest_src.cvt16());
+            return;
+          }
+        } else if (constant == -1) {
+          if (dest_src.getBit() == 64) {
+            // todo: verify that mov eax, eax will properly zero upper 32 bits
+          }
+        } else if (dest_src.getBit() == 64 && constant > 0) {
+          // do 32 bit and, not the full 64, because the upper 32 of the mask
+          // are zero and the 32 bit op will auto clear the top, save space on
+          // the immediate and avoid a rex prefix
+          e.and_(dest_src.cvt32(), constant);
+          return;
+        }
         e.and_(dest_src, constant);
       });
 }
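
One of the multiply changes above is a classic strength reduction: for constant
multipliers of 3, 5 and 9, a single lea with scale factor 2, 4 or 8 replaces
the general multiply. A self-contained C++ check of the arithmetic the emitted
lea performs (illustration only, not emitter code; the helper name is
hypothetical):

// Illustration only: lea dest, [src * (m - 1) + src] equals src * m for
// m in {3, 5, 9}, because 2, 4 and 8 are the only scale factors lea accepts.
#include <cassert>
#include <cstdint>

static uint64_t MultiplyViaLea(uint64_t src, uint64_t multiplier) {
  // Same address arithmetic the emitted lea performs, written in plain C++.
  return src * (multiplier - 1) + src;
}

int main() {
  const uint64_t multipliers[] = {3, 5, 9};
  for (uint64_t m : multipliers) {
    assert(MultiplyViaLea(12345, m) == 12345 * m);
  }
  return 0;
}
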