diff --git a/src/core/cpu_code_cache.cpp b/src/core/cpu_code_cache.cpp index 0ce38c24f..249e8ce72 100644 --- a/src/core/cpu_code_cache.cpp +++ b/src/core/cpu_code_cache.cpp @@ -9,8 +9,8 @@ Log_SetChannel(CPU::CodeCache); namespace CPU { -bool USE_CODE_CACHE = false; -bool USE_RECOMPILER = false; +bool USE_CODE_CACHE = true; +bool USE_RECOMPILER = true; CodeCache::CodeCache() = default; @@ -48,13 +48,21 @@ void CodeCache::Execute() continue; } +#if 0 + const u32 tick = m_system->GetGlobalTickCounter() + m_core->GetPendingTicks(); + if (tick == 8950812) + __debugbreak(); +#endif + reexecute_block: if (USE_RECOMPILER) block->host_code(m_core); else InterpretCachedBlock(*block); - // LogCurrentState(); +#if 0 + LogCurrentState(); +#endif next_block_key = GetNextBlockKey(); if (next_block_key.bits == block->key.bits) @@ -91,10 +99,10 @@ void CodeCache::LogCurrentState() "tick=%u pc=%08X npc=%08X zero=%08X at=%08X v0=%08X v1=%08X a0=%08X a1=%08X a2=%08X a3=%08X t0=%08X " "t1=%08X t2=%08X t3=%08X t4=%08X t5=%08X t6=%08X t7=%08X s0=%08X s1=%08X s2=%08X s3=%08X s4=%08X " "s5=%08X s6=%08X s7=%08X t8=%08X t9=%08X k0=%08X k1=%08X gp=%08X sp=%08X fp=%08X ra=%08X\n", - m_system->GetGlobalTickCounter(), regs.pc, regs.npc, regs.zero, regs.at, regs.v0, regs.v1, regs.a0, regs.a1, - regs.a2, regs.a3, regs.t0, regs.t1, regs.t2, regs.t3, regs.t4, regs.t5, regs.t6, regs.t7, regs.s0, regs.s1, regs.s2, - regs.s3, regs.s4, regs.s5, regs.s6, regs.s7, regs.t8, regs.t9, regs.k0, regs.k1, regs.gp, regs.sp, regs.fp, - regs.ra); + m_system->GetGlobalTickCounter() + m_core->GetPendingTicks(), regs.pc, regs.npc, regs.zero, regs.at, regs.v0, + regs.v1, regs.a0, regs.a1, regs.a2, regs.a3, regs.t0, regs.t1, regs.t2, regs.t3, regs.t4, regs.t5, regs.t6, regs.t7, + regs.s0, regs.s1, regs.s2, regs.s3, regs.s4, regs.s5, regs.s6, regs.s7, regs.t8, regs.t9, regs.k0, regs.k1, regs.gp, + regs.sp, regs.fp, regs.ra); } CodeBlockKey CodeCache::GetNextBlockKey() const diff --git 
a/src/core/cpu_recompiler_code_generator.cpp b/src/core/cpu_recompiler_code_generator.cpp
index 00fa0df15..0d03d888f 100644
--- a/src/core/cpu_recompiler_code_generator.cpp
+++ b/src/core/cpu_recompiler_code_generator.cpp
@@ -121,6 +121,11 @@ bool CodeGenerator::CompileInstruction(const CodeBlockInstruction& cbi)
           result = Compile_MoveHiLo(cbi);
           break;
 
+        case InstructionFunct::mult:
+        case InstructionFunct::multu:
+          result = Compile_Multiply(cbi);
+          break;
+
         default:
           result = Compile_Fallback(cbi);
           break;
@@ -285,6 +290,95 @@ Value CodeGenerator::AddValues(const Value& lhs, const Value& rhs)
   }
 }
 
+// Multiplies two equally sized values and returns the double-width product as a {high half, low half} pair.
+// Constant operands are folded at compile time; otherwise two scratch registers are allocated for EmitMul().
+std::pair<Value, Value> CodeGenerator::MulValues(const Value& lhs, const Value& rhs, bool signed_multiply)
+{
+  DebugAssert(lhs.size == rhs.size);
+  if (lhs.IsConstant() && rhs.IsConstant())
+  {
+    // compile-time
+    switch (lhs.size)
+    {
+      case RegSize_8:
+      {
+        u16 res;
+        if (signed_multiply)
+          res = u16(s16(s8(lhs.constant_value)) * s16(s8(rhs.constant_value)));
+        else
+          res = u16(u8(lhs.constant_value)) * u16(u8(rhs.constant_value));
+
+        return std::make_pair(Value::FromConstantU8(Truncate8(res >> 8)), Value::FromConstantU8(Truncate8(res)));
+      }
+
+      case RegSize_16:
+      {
+        u32 res;
+        if (signed_multiply)
+          res = u32(s32(s16(lhs.constant_value)) * s32(s16(rhs.constant_value)));
+        else
+          res = u32(u16(lhs.constant_value)) * u32(u16(rhs.constant_value));
+
+        return std::make_pair(Value::FromConstantU16(Truncate16(res >> 16)), Value::FromConstantU16(Truncate16(res)));
+      }
+
+      case RegSize_32:
+      {
+        u64 res;
+        if (signed_multiply)
+          res = u64(s64(s32(lhs.constant_value)) * s64(s32(rhs.constant_value)));
+        else
+          res = u64(u32(lhs.constant_value)) * u64(u32(rhs.constant_value));
+
+        return std::make_pair(Value::FromConstantU32(Truncate32(res >> 32)), Value::FromConstantU32(Truncate32(res)));
+      }
+
+      case RegSize_64:
+      {
+        // Assemble the 128-bit product from 32-bit limbs so that no 128-bit integer type is required.
+        const u64 a = u64(lhs.constant_value);
+        const u64 b = u64(rhs.constant_value);
+        const u64 a_lo = a & 0xFFFFFFFFu;
+        const u64 a_hi = a >> 32;
+        const u64 b_lo = b & 0xFFFFFFFFu;
+        const u64 b_hi = b >> 32;
+        const u64 lo_lo = a_lo * b_lo;
+        const u64 lo_hi = a_lo * b_hi;
+        const u64 hi_lo = a_hi * b_lo;
+        const u64 hi_hi = a_hi * b_hi;
+
+        // Carry out of the low 64 bits. The three terms are each below 2^32, so the sum cannot overflow.
+        const u64 carry = ((lo_lo >> 32) + (lo_hi & 0xFFFFFFFFu) + (hi_lo & 0xFFFFFFFFu)) >> 32;
+        u64 res_hi = hi_hi + (lo_hi >> 32) + (hi_lo >> 32) + carry;
+
+        // The low half is identical for signed and unsigned, and unsigned math avoids signed-overflow UB.
+        const u64 res_lo = a * b;
+
+        // Signed high half: subtract the other operand once for each negative operand (modulo 2^64).
+        if (signed_multiply)
+        {
+          if ((a >> 63) != 0)
+            res_hi -= b;
+          if ((b >> 63) != 0)
+            res_hi -= a;
+        }
+
+        return std::make_pair(Value::FromConstantU64(res_hi), Value::FromConstantU64(res_lo));
+      }
+
+      default:
+        // Unrecognised operand size: fall back to a zero product.
+        return std::make_pair(Value::FromConstantU64(0), Value::FromConstantU64(0));
+    }
+  }
+
+  // We need two registers for both components.
+  Value hi = m_register_cache.AllocateScratch(lhs.size);
+  Value lo = m_register_cache.AllocateScratch(lhs.size);
+  EmitMul(hi.host_reg, lo.host_reg, lhs, rhs, signed_multiply);
+  return std::make_pair(std::move(hi), std::move(lo));
+}
+
 Value CodeGenerator::ShlValues(const Value& lhs, const Value& rhs)
 {
   DebugAssert(lhs.size == rhs.size);
@@ -911,6 +1005,23 @@ bool CodeGenerator::Compile_MoveHiLo(const CodeBlockInstruction& cbi)
   return true;
 }
 
+// Compiles mult/multu: multiplies guest registers rs and rt and writes the product halves to hi and lo.
+bool CodeGenerator::Compile_Multiply(const CodeBlockInstruction& cbi)
+{
+  InstructionPrologue(cbi, 1);
+
+  const bool signed_multiply = (cbi.instruction.r.funct == InstructionFunct::mult);
+  // NOTE(review): both ReadGuestRegister() calls are arguments of one call, so their relative order is
+  // unspecified; confirm that this cannot change the emitted code.
+  std::pair<Value, Value> result = MulValues(m_register_cache.ReadGuestRegister(cbi.instruction.r.rs),
+                                             m_register_cache.ReadGuestRegister(cbi.instruction.r.rt), signed_multiply);
+  m_register_cache.WriteGuestRegister(Reg::hi, std::move(result.first));
+  m_register_cache.WriteGuestRegister(Reg::lo, std::move(result.second));
+
+  InstructionEpilogue(cbi);
+  return true;
+}
+
 bool CodeGenerator::Compile_lui(const CodeBlockInstruction& cbi)
 {
   InstructionPrologue(cbi, 1);
diff --git a/src/core/cpu_recompiler_code_generator.h b/src/core/cpu_recompiler_code_generator.h
index ed1497706..861c04522 100644
--- a/src/core/cpu_recompiler_code_generator.h
+++ b/src/core/cpu_recompiler_code_generator.h
@@ -52,6 +52,7 @@ public:
   void EmitCopyValue(HostReg to_reg, const Value& value);
   void EmitAdd(HostReg to_reg, const Value& value);
   void EmitSub(HostReg to_reg, const Value& value);
+  void EmitMul(HostReg to_reg_hi, HostReg to_reg_lo, const Value& lhs, const Value& rhs, bool signed_multiply);
   void EmitCmp(HostReg to_reg, const Value& value);
   void EmitInc(HostReg to_reg, RegSize size);
   void EmitDec(HostReg to_reg, RegSize size);
@@ -130,7 +131,7 @@ public:
 
   // Value ops
   Value AddValues(const Value& lhs, const Value& rhs);
-  Value MulValues(const Value& lhs, const Value& rhs);
+  std::pair<Value, Value> MulValues(const Value& lhs, const Value& rhs, bool signed_multiply);
   Value ShlValues(const Value& lhs, const Value& rhs);
   Value ShrValues(const Value& lhs, const Value& rhs);
   Value SarValues(const Value& lhs, const Value& rhs);
@@ -170,6 +171,7 @@ private:
   bool Compile_Load(const CodeBlockInstruction& cbi);
   bool Compile_Store(const CodeBlockInstruction& cbi);
   bool Compile_MoveHiLo(const CodeBlockInstruction& cbi);
+  bool Compile_Multiply(const CodeBlockInstruction& cbi);
   bool Compile_lui(const CodeBlockInstruction& cbi);
 
   bool Compile_addiu(const CodeBlockInstruction& cbi);
diff --git a/src/core/cpu_recompiler_code_generator_x64.cpp b/src/core/cpu_recompiler_code_generator_x64.cpp
index 5f5b6310b..89b775c65 100644
--- a/src/core/cpu_recompiler_code_generator_x64.cpp
+++ b/src/core/cpu_recompiler_code_generator_x64.cpp
@@ -427,6 +427,119 @@ void CodeGenerator::EmitSub(HostReg to_reg, const Value& value)
   }
 }
 
+// Emits a widening multiply of lhs by rhs using the one-operand MUL/IMUL forms, placing the high half of the
+// product in to_reg_hi and the low half in to_reg_lo. RAX and RDX are preserved unless they are destinations.
+void CodeGenerator::EmitMul(HostReg to_reg_hi, HostReg to_reg_lo, const Value& lhs, const Value& rhs,
+                            bool signed_multiply)
+{
+  // The multiply overwrites RAX/RDX, so save whichever of them is not going to receive a result.
+  const bool save_rax = (to_reg_hi != Xbyak::Operand::RAX && to_reg_lo != Xbyak::Operand::RAX);
+  const bool save_rdx = (to_reg_hi != Xbyak::Operand::RDX && to_reg_lo != Xbyak::Operand::RDX);
+
+  if (save_rax)
+    m_emit.push(m_emit.rax);
+
+  if (save_rdx)
+    m_emit.push(m_emit.rdx);
+
+  // Multiplies the accumulator by src, with src narrowed to the operand width.
+  const auto do_mul = [&](const auto& src) {
+    int bits = 64;
+    if (lhs.size == RegSize_8)
+      bits = 8;
+    else if (lhs.size == RegSize_16)
+      bits = 16;
+    else if (lhs.size == RegSize_32)
+      bits = 32;
+
+    if (signed_multiply)
+      m_emit.imul(src.changeBit(bits));
+    else
+      m_emit.mul(src.changeBit(bits));
+  };
+
+  if (lhs.IsInHostRegister() && rhs.IsInHostRegister() && lhs.GetHostRegister() == rhs.GetHostRegister())
+  {
+    // x*x
+    if (lhs.GetHostRegister() != Xbyak::Operand::RAX)
+      EmitCopyValue(Xbyak::Operand::RAX, lhs);
+
+    do_mul(m_emit.rax);
+  }
+  else if (lhs.IsInHostRegister() && lhs.GetHostRegister() == Xbyak::Operand::RAX)
+  {
+    // lhs is already in the accumulator; stage rhs through RDX if it is not in a register.
+    if (!rhs.IsInHostRegister())
+    {
+      EmitCopyValue(Xbyak::Operand::RDX, rhs);
+      do_mul(m_emit.rdx);
+    }
+    else
+    {
+      do_mul(GetHostReg64(rhs));
+    }
+  }
+  else if (rhs.IsInHostRegister() && rhs.GetHostRegister() == Xbyak::Operand::RAX)
+  {
+    // rhs is already in the accumulator; stage lhs through RDX if it is not in a register.
+    if (!lhs.IsInHostRegister())
+    {
+      EmitCopyValue(Xbyak::Operand::RDX, lhs);
+      do_mul(m_emit.rdx);
+    }
+    else
+    {
+      do_mul(GetHostReg64(lhs));
+    }
+  }
+  else if (lhs.IsInHostRegister())
+  {
+    EmitCopyValue(Xbyak::Operand::RAX, rhs);
+    do_mul(GetHostReg64(lhs));
+  }
+  else if (rhs.IsInHostRegister())
+  {
+    EmitCopyValue(Xbyak::Operand::RAX, lhs);
+    do_mul(GetHostReg64(rhs));
+  }
+  else
+  {
+    EmitCopyValue(Xbyak::Operand::RAX, lhs);
+    EmitCopyValue(Xbyak::Operand::RDX, rhs);
+    do_mul(m_emit.rdx);
+  }
+
+  // The 8-bit forms leave the product in AX (AH:AL) instead of DX:AX, so move the high byte into DL, where the
+  // code below expects the high half.
+  if (lhs.size == RegSize_8)
+    m_emit.mov(m_emit.dl, m_emit.ah);
+
+  if (to_reg_hi == Xbyak::Operand::RDX && to_reg_lo == Xbyak::Operand::RAX)
+  {
+    // ideal case: registers are the ones we want: don't have to do anything
+  }
+  else if (to_reg_hi == Xbyak::Operand::RAX && to_reg_lo == Xbyak::Operand::RDX)
+  {
+    // what we want, but swapped, so exchange them
+    m_emit.xchg(m_emit.rax, m_emit.rdx);
+  }
+  else
+  {
+    // store to the registers we want.. this could be optimized better
+    m_emit.push(m_emit.rdx);
+    m_emit.push(m_emit.rax);
+    m_emit.pop(GetHostReg64(to_reg_lo));
+    m_emit.pop(GetHostReg64(to_reg_hi));
+  }
+
+  // restore original contents
+  if (save_rdx)
+    m_emit.pop(m_emit.rdx);
+
+  if (save_rax)
+    m_emit.pop(m_emit.rax);
+}
+
 void CodeGenerator::EmitCmp(HostReg to_reg, const Value& value)
 {
   DebugAssert(value.IsConstant() || value.IsInHostRegister());