From 68c451f008be9341894c3c534dceb4a8a7b93b62 Mon Sep 17 00:00:00 2001 From: magumagu9 Date: Wed, 31 Dec 2008 01:39:35 +0000 Subject: [PATCH] Some WIP work on the JIT... only marginally usable at the moment, but I wanted to back this up somewhere, and the people familiar with the JIT might have comments. There's a big comment in Jit64IL/IR.cpp with a high-level overview of what this is. git-svn-id: https://dolphin-emu.googlecode.com/svn/trunk@1724 8ced0084-cf51-0410-be5f-012b33b47a6e --- Source/Core/Core/Src/PowerPC/Jit64IL/IR.cpp | 1095 +++++++++++++++++ Source/Core/Core/Src/PowerPC/Jit64IL/IR.h | 322 +++++ Source/Core/Core/Src/PowerPC/Jit64IL/Jit.cpp | 528 ++++++++ Source/Core/Core/Src/PowerPC/Jit64IL/Jit.h | 299 +++++ .../Core/Core/Src/PowerPC/Jit64IL/JitAsm.cpp | 277 +++++ Source/Core/Core/Src/PowerPC/Jit64IL/JitAsm.h | 88 ++ .../Core/Src/PowerPC/Jit64IL/JitBackpatch.cpp | 215 ++++ .../Core/Src/PowerPC/Jit64IL/JitCache.cpp | 346 ++++++ .../Core/Core/Src/PowerPC/Jit64IL/JitCache.h | 116 ++ .../Core/Src/PowerPC/Jit64IL/JitRegCache.cpp | 395 ++++++ .../Core/Src/PowerPC/Jit64IL/JitRegCache.h | 150 +++ .../Core/Src/PowerPC/Jit64IL/Jit_Branch.cpp | 200 +++ .../Src/PowerPC/Jit64IL/Jit_FloatingPoint.cpp | 224 ++++ .../Core/Src/PowerPC/Jit64IL/Jit_Integer.cpp | 520 ++++++++ .../Src/PowerPC/Jit64IL/Jit_LoadStore.cpp | 198 +++ .../PowerPC/Jit64IL/Jit_LoadStoreFloating.cpp | 322 +++++ .../PowerPC/Jit64IL/Jit_LoadStorePaired.cpp | 458 +++++++ .../Core/Src/PowerPC/Jit64IL/Jit_Paired.cpp | 407 ++++++ .../PowerPC/Jit64IL/Jit_SystemRegisters.cpp | 149 +++ .../Core/Src/PowerPC/Jit64IL/Jit_Util.cpp | 161 +++ 20 files changed, 6470 insertions(+) create mode 100644 Source/Core/Core/Src/PowerPC/Jit64IL/IR.cpp create mode 100644 Source/Core/Core/Src/PowerPC/Jit64IL/IR.h create mode 100644 Source/Core/Core/Src/PowerPC/Jit64IL/Jit.cpp create mode 100644 Source/Core/Core/Src/PowerPC/Jit64IL/Jit.h create mode 100644 Source/Core/Core/Src/PowerPC/Jit64IL/JitAsm.cpp create mode 100644 Source/Core/Core/Src/PowerPC/Jit64IL/JitAsm.h create mode 100644 Source/Core/Core/Src/PowerPC/Jit64IL/JitBackpatch.cpp create mode 100644 Source/Core/Core/Src/PowerPC/Jit64IL/JitCache.cpp create mode 100644 Source/Core/Core/Src/PowerPC/Jit64IL/JitCache.h create mode 100644 Source/Core/Core/Src/PowerPC/Jit64IL/JitRegCache.cpp create mode 100644 Source/Core/Core/Src/PowerPC/Jit64IL/JitRegCache.h create mode 100644 Source/Core/Core/Src/PowerPC/Jit64IL/Jit_Branch.cpp create mode 100644 Source/Core/Core/Src/PowerPC/Jit64IL/Jit_FloatingPoint.cpp create mode 100644 Source/Core/Core/Src/PowerPC/Jit64IL/Jit_Integer.cpp create mode 100644 Source/Core/Core/Src/PowerPC/Jit64IL/Jit_LoadStore.cpp create mode 100644 Source/Core/Core/Src/PowerPC/Jit64IL/Jit_LoadStoreFloating.cpp create mode 100644 Source/Core/Core/Src/PowerPC/Jit64IL/Jit_LoadStorePaired.cpp create mode 100644 Source/Core/Core/Src/PowerPC/Jit64IL/Jit_Paired.cpp create mode 100644 Source/Core/Core/Src/PowerPC/Jit64IL/Jit_SystemRegisters.cpp create mode 100644 Source/Core/Core/Src/PowerPC/Jit64IL/Jit_Util.cpp diff --git a/Source/Core/Core/Src/PowerPC/Jit64IL/IR.cpp b/Source/Core/Core/Src/PowerPC/Jit64IL/IR.cpp new file mode 100644 index 0000000000..bf109005ed --- /dev/null +++ b/Source/Core/Core/Src/PowerPC/Jit64IL/IR.cpp @@ -0,0 +1,1095 @@ +// Copyright (C) 2003-2008 Dolphin Project. + +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, version 2.0. 
+ +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License 2.0 for more details. + +// A copy of the GPL 2.0 should have been included with the program. +// If not, see http://www.gnu.org/licenses/ + +// Official SVN repository and contact information can be found at +// http://code.google.com/p/dolphin-emu/ + +/* + +IR implementation comments: +This file implements code generation for a new IR-based JIT. The idea of +the IR is that as much as possible, it strips away the complexities +of the PPC instruction set into a simpler instruction set. In its current +form, the semantics are very simple: each instruction does its calculation +and performs its side effects in order. For an instruction with a result, +the instruction also represents the returned value. This works quite +simply because jumps within a block are not allowed. + +The IR treats loads and stores to PPC registers as separate steps from actual +calculations. This allows the instruction set to be significantly simpler, +because one PPC instruction can be mapped to multiple IR instructions. It +also allows optimizing out dead register stores: this reduces register +pressure and allows dead code elimination to completely remove instructions +which produce unused values, or the carry flag of srawx. + +The actual IR representation uses a few tricks I picked up from nanojit: +each instruction is a single 32-bit integer, the operands are 8-bit offsets +back from the current instruction, and there's a special Tramp instruction +to reference operands that are too far away to reference otherwise. + +The first step of code generation is producing the IR; this is roughly +equivalent to all of code generation in the previous code. In addition +to storing the IR, some optimizations occur in this step: the primary +optimizations are that redundant register loads/stores are eliminated, +and constant-folding is done. + +The second step is a quick pass over the IL to figure out liveness: this +information is used both for dead code elimination and to find the last +use of an instruction, which is allowed to destroy the value. + +The third step is the actual code generation: this just goes through each +instruction and generates code. Dead code elimination works in this step, +by simply skipping unused instructions. The register allocator is a dumb, +greedy allocator: at the moment, it's really a bit too dumb, but it's +actually not as bad as it looks: unless a block is relatively long, spills +are rarely needed. ECX is used as a scratch register: requiring a scratch +register isn't ideal, but the register allocator is too dumb to handle +instructions that need a specific register at the moment. + +In addition to the optimizations that are deeply tied to the IR, +I've implemented one additional trick: fast memory for 32-bit machines. +This works off of the observation that loads and stores can be classified +at runtime: any particular load instruction will always load similar addresses, +and any store will store to similar addresses. Using this observation, every +block is JIT-ed twice: the first time, the block includes extra code to +instrument the loads. Then, at the end of the block, it jumps back into the JIT +to recompile itself. The second recompilation looks at the address of each load +and store, and bakes the information into the generated code. 
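The profile-and-recompile trick just described can be hard to picture from prose alone. Below is a stand-alone editorial sketch (not part of the patch) that reduces it to plain C++: the first pass instruments a load to record which address region it touches, and the second pass bakes in either a direct RAM access or the checked fallback. The names (fakeRam, LoadProfile, etc.) and the region test are simplified stand-ins for what regEmitMemLoad and ProfiledLoads[] do further down in this file.

// ---- editorial sketch begins (not part of the patch) ----
#include <cstdint>
#include <cstdio>
#include <vector>

static uint8_t fakeRam[0x100];                    // stands in for Memory::base

static uint8_t slowRead(uint32_t addr) {          // stands in for Memory::Read_U8
    return fakeRam[addr & 0xFF];
}

struct LoadProfile { std::vector<uint32_t> observed; };

// First compile: the load is instrumented to record the addresses it sees.
static uint8_t firstPassLoad(LoadProfile& p, uint32_t addr) {
    p.observed.push_back(addr);
    return slowRead(addr);
}

// Second compile: if every observed address was plain RAM, bake in the direct
// access (no mask, no branch); otherwise keep the checked fallback.
static uint8_t secondPassLoad(const LoadProfile& p, uint32_t addr) {
    bool allRam = true;
    for (uint32_t a : p.observed)
        if (a & 0x0C000000) allRam = false;       // MMIO/uncached region seen
    return allRam ? fakeRam[addr & 0xFF] : slowRead(addr);
}

int main() {
    fakeRam[0x42] = 7;
    LoadProfile p;
    firstPassLoad(p, 0x42);
    printf("%d\n", secondPassLoad(p, 0x42));      // takes the fast path
    return 0;
}
// ---- editorial sketch ends ----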
This allows removing +the overhead for both the mask and the branch normally required for loads on 32-bit +machines. This optimization isn't completely safe: it depends on a guarantee which +isn't made by the PPC instruction set. That said, it's reliable enough that games +work without any fallback, and it's a large performance boost. Also, if it turns +out it isn't completely reliable, we can use a solution using segments which is +similar to the 64-bit fast memory implementation. + +The speed with this JIT is better than I expected, but not at much as I hoped... +on the test I've been working on (which bounded by JIT performance and doesn't +use any floating-point), it's roughly 25% faster than the current JIT, with the +edge over the current JIT mostly due to the fast memory optimization. + +TODO (in no particular order): +Floating-point JIT (both paired and unpaired): currently falls back + to the interpreter +Improve register allocator to deal with long live intervals. +Optimize conditions for conditional branches. +Inter-block dead register elimination, especially for CR0. +Inter-block inlining. +Track down a few correctness bugs. +Known zero bits: eliminate unneeded AND instructions for rlwinm/rlwimi +Implement a select instruction +64-bit compat (it should only be a few tweaks to register allocation and + the load/store code) +Scheduling to reduce register pressure: PowerPC compilers like to push + uses far away from definitions, but it's rather unfriendly to modern + x86 processors, which are short on registers and extremely good at + instruction reordering. +Common subexpression elimination +Optimize load of sum using complex addressing +Implement idle-skipping + +*/ + +#include "IR.h" +#include "../PPCTables.h" +#include "../../CoreTiming.h" +#include "Thunk.h" +#include "../../HW/Memmap.h" +#include "JitAsm.h" +#include "Jit.h" +#include "../../HW/GPFifo.h" +using namespace Gen; + +namespace IREmitter { + +InstLoc IRBuilder::EmitZeroOp(unsigned Opcode, unsigned extra = 0) { + InstLoc curIndex = &InstList[InstList.size()]; + InstList.push_back(Opcode | (extra << 8)); + return curIndex; +} + +InstLoc IRBuilder::EmitUOp(unsigned Opcode, InstLoc Op1, unsigned extra) { + InstLoc curIndex = &InstList[InstList.size()]; + unsigned backOp1 = curIndex - 1 - Op1; + if (backOp1 >= 256) { + InstList.push_back(Tramp | backOp1 << 8); + backOp1 = 0; + curIndex++; + } + InstList.push_back(Opcode | (backOp1 << 8) | (extra << 16)); + return curIndex; +} + +InstLoc IRBuilder::EmitBiOp(unsigned Opcode, InstLoc Op1, InstLoc Op2) { + InstLoc curIndex = &InstList[InstList.size()]; + unsigned backOp1 = curIndex - 1 - Op1; + if (backOp1 >= 255) { + InstList.push_back(Tramp | backOp1 << 8); + backOp1 = 0; + curIndex++; + } + unsigned backOp2 = curIndex - 1 - Op2; + if (backOp2 >= 256) { + InstList.push_back(Tramp | backOp2 << 8); + backOp2 = 0; + backOp1++; + curIndex++; + } + InstList.push_back(Opcode | backOp1 << 8 | backOp2 << 16); + return curIndex; +} + +#if 0 +InstLoc IRBuilder::EmitTriOp(unsigned Opcode, InstLoc Op1, InstLoc Op2, + InstLoc Op3) { + InstLoc curIndex = &InstList[InstList.size()]; + unsigned backOp1 = curIndex - 1 - Op1; + if (backOp1 >= 254) { + InstList.push_back(Tramp | backOp1 << 8); + backOp1 = 0; + curIndex++; + } + unsigned backOp2 = curIndex - 1 - Op2; + if (backOp2 >= 255) { + InstList.push_back((Tramp | backOp2 << 8)); + backOp2 = 0; + backOp1++; + curIndex++; + } + unsigned backOp3 = curIndex - 1 - Op3; + if (backOp3 >= 256) { + InstList.push_back(Tramp | (backOp3 << 8)); 
+ backOp3 = 0; + backOp2++; + backOp1++; + curIndex++; + } + InstList.push_back(Opcode | (backOp1 << 8) | (backOp2 << 16) | + (backOp3 << 24)); + return curIndex; +} +#endif + +InstLoc IRBuilder::FoldZeroOp(unsigned Opcode, unsigned extra) { + if (Opcode == LoadGReg) { + // Reg load folding: if we already loaded the value, + // load it again + if (!GRegCache[extra]) + GRegCache[extra] = EmitZeroOp(LoadGReg, extra); + return GRegCache[extra]; + } + if (Opcode == LoadCarry) { + if (!CarryCache) + CarryCache = EmitZeroOp(LoadGReg, extra); + return CarryCache; + } + if (Opcode == LoadCR) { + if (!CRCache[extra]) + CRCache[extra] = EmitZeroOp(LoadCR, extra); + return CRCache[extra]; + } + + return EmitZeroOp(Opcode, extra); +} + +InstLoc IRBuilder::FoldUOp(unsigned Opcode, InstLoc Op1, unsigned extra) { + if (Opcode == StoreGReg) { + // Reg store folding: save the value for load folding. + // If there's a previous store, zap it because it's dead. + GRegCache[extra] = Op1; + if (GRegCacheStore[extra]) { + *GRegCacheStore[extra] = 0; + } + GRegCacheStore[extra] = EmitUOp(StoreGReg, Op1, extra); + return GRegCacheStore[extra]; + } + if (Opcode == StoreCarry) { + CarryCache = Op1; + if (CarryCacheStore) { + *CarryCacheStore = 0; + } + CarryCacheStore = EmitUOp(StoreCarry, Op1, extra); + return CarryCacheStore; + } + if (Opcode == StoreCR) { + CRCache[extra] = Op1; + if (CRCacheStore[extra]) { + *CRCacheStore[extra] = 0; + } + CRCacheStore[extra] = EmitUOp(StoreCR, Op1, extra); + return CRCacheStore[extra]; + } + + return EmitUOp(Opcode, Op1, extra); +} + +InstLoc IRBuilder::FoldAdd(InstLoc Op1, InstLoc Op2) { + if (isImm(*Op1)) { + if (isImm(*Op2)) + return EmitIntConst(GetImmValue(Op1) + + GetImmValue(Op2)); + return FoldAdd(Op2, Op1); + } + if (isImm(*Op2)) { + if (!GetImmValue(Op2)) return Op1; + if (getOpcode(*Op1) == Add && isImm(*getOp2(Op1))) { + unsigned RHS = GetImmValue(Op2) + + GetImmValue(getOp2(Op1)); + return FoldAdd(getOp1(Op1), EmitIntConst(RHS)); + } + } + return EmitBiOp(Add, Op1, Op2); +} + +InstLoc IRBuilder::FoldAnd(InstLoc Op1, InstLoc Op2) { + if (isImm(*Op1)) { + if (isImm(*Op2)) + return EmitIntConst(GetImmValue(Op1) & + GetImmValue(Op2)); + return FoldAnd(Op2, Op1); + } + if (isImm(*Op2)) { + if (!GetImmValue(Op2)) return EmitIntConst(0); + if (GetImmValue(Op2) == -1U) return Op1; + if (getOpcode(*Op1) == And && isImm(*getOp2(Op1))) { + unsigned RHS = GetImmValue(Op2) & + GetImmValue(getOp2(Op1)); + return FoldAnd(getOp1(Op1), EmitIntConst(RHS)); + } else if (getOpcode(*Op1) == Rol && isImm(*getOp2(Op1))) { + unsigned shiftMask1 = -1U << (GetImmValue(getOp2(Op1)) & 31); + if (GetImmValue(Op2) == shiftMask1) + return FoldShl(getOp1(Op1), getOp2(Op1)); + unsigned shiftAmt2 = ((32 - GetImmValue(getOp2(Op1))) & 31); + unsigned shiftMask2 = -1U >> shiftAmt2; + if (GetImmValue(Op2) == shiftMask2) { + return FoldShrl(getOp1(Op1), EmitIntConst(shiftAmt2)); + } + } + } + if (Op1 == Op2) return Op1; + + return EmitBiOp(And, Op1, Op2); +} + +InstLoc IRBuilder::FoldOr(InstLoc Op1, InstLoc Op2) { + if (isImm(*Op1)) { + if (isImm(*Op2)) + return EmitIntConst(GetImmValue(Op1) | + GetImmValue(Op2)); + return FoldOr(Op2, Op1); + } + if (isImm(*Op2)) { + if (!GetImmValue(Op2)) return Op1; + if (GetImmValue(Op2) == -1U) return EmitIntConst(-1U); + if (getOpcode(*Op1) == Or && isImm(*getOp2(Op1))) { + unsigned RHS = GetImmValue(Op2) | + GetImmValue(getOp2(Op1)); + return FoldOr(getOp1(Op1), EmitIntConst(RHS)); + } + } + if (Op1 == Op2) return Op1; + + return EmitBiOp(Or, Op1, Op2); +} + 
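The Emit*Op functions above implement the packed encoding described in the header comment: opcode in the low 8 bits, operands as 8-bit offsets counting back from the current instruction, with a Tramp emitted when an operand is too far away. Here is a stand-alone editorial sketch (not part of the patch) of how such a word can be packed and read back; it mirrors the getOpcode/getOp1/getOp2 helpers declared in IR.h, with hypothetical opcode numbers.

// ---- editorial sketch begins (not part of the patch) ----
#include <cassert>
#include <cstdint>

typedef uint32_t Inst;
typedef Inst* InstLoc;

enum { OpAdd = 1, OpTramp = 2 };                  // hypothetical opcode numbers

// Opcode in the low 8 bits, operands as 8-bit offsets counting back from the
// previous instruction (so 0 means "the instruction right before this one").
static Inst packBiOp(unsigned op, unsigned back1, unsigned back2) {
    assert(back1 < 256 && back2 < 256);           // else a Tramp is needed first
    return op | (back1 << 8) | (back2 << 16);
}

static unsigned opcodeOf(Inst i)    { return i & 255; }
static InstLoc  operand1(InstLoc i) { return i - 1 - ((*i >> 8) & 255); }
static InstLoc  operand2(InstLoc i) { return i - 1 - ((*i >> 16) & 255); }

// A Tramp only forwards to a farther-away instruction: an operand that lands on
// one is followed through to the real definition.
static InstLoc resolve(InstLoc i) {
    return opcodeOf(*i) == OpTramp ? i - 1 - (*i >> 8) : i;
}

int main() {
    Inst buf[4] = {0, 0, 0, 0};                   // buf[0], buf[1]: earlier defs
    buf[2] = OpTramp | (1u << 8);                 // trampoline back to buf[0]
    buf[3] = packBiOp(OpAdd, 0, 1);               // op1 = the Tramp, op2 = buf[1]
    assert(resolve(operand1(&buf[3])) == &buf[0]);
    assert(resolve(operand2(&buf[3])) == &buf[1]);
    return 0;
}
// ---- editorial sketch ends ----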
+InstLoc IRBuilder::FoldXor(InstLoc Op1, InstLoc Op2) { + if (isImm(*Op1)) { + if (isImm(*Op2)) + return EmitIntConst(GetImmValue(Op1) ^ + GetImmValue(Op2)); + return FoldXor(Op2, Op1); + } + if (isImm(*Op2)) { + if (!GetImmValue(Op2)) return Op1; + if (getOpcode(*Op1) == Xor && isImm(*getOp2(Op1))) { + unsigned RHS = GetImmValue(Op2) ^ + GetImmValue(getOp2(Op1)); + return FoldXor(getOp1(Op1), EmitIntConst(RHS)); + } + } + if (Op1 == Op2) return Op1; + + return EmitBiOp(Xor, Op1, Op2); +} + +InstLoc IRBuilder::FoldShl(InstLoc Op1, InstLoc Op2) { + if (isImm(*Op2)) { + if (isImm(*Op1)) + return EmitIntConst(GetImmValue(Op1) << (GetImmValue(Op2) & 31)); + } + return EmitBiOp(Shl, Op1, Op2); +} + +InstLoc IRBuilder::FoldShrl(InstLoc Op1, InstLoc Op2) { + if (isImm(*Op2)) { + if (isImm(*Op1)) + return EmitIntConst(GetImmValue(Op1) >> (GetImmValue(Op2) & 31)); + } + return EmitBiOp(Shrl, Op1, Op2); +} + +InstLoc IRBuilder::FoldRol(InstLoc Op1, InstLoc Op2) { + if (isImm(*Op2)) { + if (isImm(*Op1)) + return EmitIntConst(_rotl(GetImmValue(Op1), + GetImmValue(Op2))); + if (!(GetImmValue(Op2) & 31)) return Op1; + } + return EmitBiOp(Rol, Op1, Op2); +} + +InstLoc IRBuilder::FoldInterpreterFallback(InstLoc Op1, InstLoc Op2) { + for (unsigned i = 0; i < 32; i++) { + GRegCache[i] = 0; + GRegCacheStore[i] = 0; + } + CarryCache = 0; + CarryCacheStore = 0; + for (unsigned i = 0; i < 8; i++) { + CRCache[i] = 0; + CRCacheStore[i] = 0; + } + return EmitBiOp(InterpreterFallback, Op1, Op2); +} + +InstLoc IRBuilder::FoldBiOp(unsigned Opcode, InstLoc Op1, InstLoc Op2) { + switch (Opcode) { + case Add: return FoldAdd(Op1, Op2); + case And: return FoldAnd(Op1, Op2); + case Or: return FoldOr(Op1, Op2); + case Xor: return FoldXor(Op1, Op2); + case Shl: return FoldShl(Op1, Op2); + case Shrl: return FoldShrl(Op1, Op2); + case Rol: return FoldRol(Op1, Op2); + case InterpreterFallback: return FoldInterpreterFallback(Op1, Op2); + default: return EmitBiOp(Opcode, Op1, Op2); + } +} + +InstLoc IRBuilder::EmitIntConst(unsigned value) { + InstLoc curIndex = &InstList[InstList.size()]; + InstList.push_back(CInt32 | (ConstList.size() << 8)); + ConstList.push_back(value); + return curIndex; +} + +unsigned IRBuilder::GetImmValue(InstLoc I) { + return ConstList[*I >> 8]; +} + +} + +using namespace IREmitter; + +/* +Actual codegen is a backward pass followed by a forward pass. + +The first pass to actually doing codegen is a liveness analysis pass. +Liveness is important for two reasons: one, it lets us do dead code +elimination, which results both from earlier folding, PPC +instructions with unused parts like srawx, and just random strangeness. +The other bit is that is allows us to identify the last instruction to +use a value: this is absolutely essential for register allocation +because it the allocator needs to be able to free unused registers. +In addition, this allows eliminating redundant mov instructions in a lot +of cases. + +The register allocation is just a simple forward greedy allocator. 
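To make the two passes concrete, here is an editorial sketch (not part of the patch) of the same idea on a toy linear IR: a backward pass marks the last use of each value, and a forward pass hands out registers greedily, freeing one as soon as its value dies. Spilling and the ECX scratch-register rule of the real allocator are left out, and all names are hypothetical.

// ---- editorial sketch begins (not part of the patch) ----
#include <cstdio>
#include <vector>

struct ToyInst { int op1, op2; bool lastUseOp1 = false, lastUseOp2 = false; };
// op1/op2 are indices of earlier instructions, or -1 for "no operand".

static void markLastUses(std::vector<ToyInst>& code) {
    std::vector<bool> seen(code.size(), false);
    for (int i = (int)code.size() - 1; i >= 0; --i) {       // backward pass
        ToyInst& I = code[i];
        if (I.op1 >= 0 && !seen[I.op1]) { I.lastUseOp1 = true; seen[I.op1] = true; }
        if (I.op2 >= 0 && !seen[I.op2]) { I.lastUseOp2 = true; seen[I.op2] = true; }
    }
}

static void greedyAllocate(const std::vector<ToyInst>& code) {
    const int kRegs = 3;                                    // toy machine
    int owner[kRegs];
    for (int r = 0; r < kRegs; r++) owner[r] = -1;
    for (int i = 0; i < (int)code.size(); ++i) {            // forward pass
        // Free registers whose value dies at this instruction.
        for (int r = 0; r < kRegs; r++) {
            if (owner[r] == code[i].op1 && code[i].lastUseOp1) owner[r] = -1;
            if (owner[r] == code[i].op2 && code[i].lastUseOp2) owner[r] = -1;
        }
        for (int r = 0; r < kRegs; r++)                     // grab the first free one
            if (owner[r] == -1) { owner[r] = i; printf("inst %d -> r%d\n", i, r); break; }
    }
}

int main() {
    std::vector<ToyInst> code = { {-1,-1}, {-1,-1}, {0,1}, {2,0} };
    markLastUses(code);
    greedyAllocate(code);                                   // shows register reuse
    return 0;
}
// ---- editorial sketch ends ----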
+*/ + +struct RegInfo { + Jit64* Jit; + IRBuilder* Build; + InstLoc FirstI; + std::vector IInfo; + InstLoc regs[16]; + unsigned numSpills; + bool MakeProfile; + bool UseProfile; + unsigned numProfiledLoads; + unsigned exitNumber; + + RegInfo(Jit64* j, InstLoc f, unsigned insts) : Jit(j), FirstI(f), IInfo(insts) { + for (unsigned i = 0; i < 16; i++) + regs[i] = 0; + numSpills = 0; + numProfiledLoads = 0; + exitNumber = 0; + MakeProfile = UseProfile = false; + } +}; + +static void regMarkUse(RegInfo& R, InstLoc I, InstLoc Op, unsigned OpNum) { + unsigned& info = R.IInfo[Op - R.FirstI]; + if (info == 0) R.IInfo[I - R.FirstI] |= 1 << (OpNum + 1); + if (info < 2) info++; +} + +static unsigned regReadUse(RegInfo& R, InstLoc I) { + return R.IInfo[I - R.FirstI] & 3; +} + +static unsigned SlotSet[1000]; +static unsigned ProfiledLoads[1000]; + +static OpArg regLocForSlot(RegInfo& RI, unsigned slot) { + return M(&SlotSet[slot - 1]); +} + +static unsigned regCreateSpill(RegInfo& RI, InstLoc I) { + unsigned newSpill = ++RI.numSpills; + RI.IInfo[I - RI.FirstI] |= newSpill << 16; + return newSpill; +} + +static unsigned regGetSpill(RegInfo& RI, InstLoc I) { + return RI.IInfo[I - RI.FirstI] >> 16; +} + +static void regSpill(RegInfo& RI, X64Reg reg) { + if (!RI.regs[reg]) return; + unsigned slot = regGetSpill(RI, RI.regs[reg]); + if (!slot) { + slot = regCreateSpill(RI, RI.regs[reg]); + RI.Jit->MOV(32, regLocForSlot(RI, slot), R(reg)); + } + RI.regs[reg] = 0; +} + +static X64Reg regFindFreeReg(RegInfo& RI) { + if (RI.regs[EDI] == 0) return EDI; + if (RI.regs[ESI] == 0) return ESI; + if (RI.regs[EBP] == 0) return EBP; + if (RI.regs[EBX] == 0) return EBX; + if (RI.regs[EDX] == 0) return EDX; + if (RI.regs[EAX] == 0) return EAX; + // ECX is scratch; never allocate it! 
+ regSpill(RI, EDI); + return EDI; +} + +static OpArg regLocForInst(RegInfo& RI, InstLoc I) { + if (RI.regs[EDI] == I) return R(EDI); + if (RI.regs[ESI] == I) return R(ESI); + if (RI.regs[EBP] == I) return R(EBP); + if (RI.regs[EBX] == I) return R(EBX); + if (RI.regs[EDX] == I) return R(EDX); + if (RI.regs[EAX] == I) return R(EAX); + if (RI.regs[ECX] == I) return R(ECX); + + if (regGetSpill(RI, I) == 0) + PanicAlert("Retrieving unknown spill slot?!"); + return regLocForSlot(RI, regGetSpill(RI, I)); +} + +static void regClearInst(RegInfo& RI, InstLoc I) { + if (RI.regs[EDI] == I) { + RI.regs[EDI] = 0; + } + if (RI.regs[ESI] == I) { + RI.regs[ESI] = 0; + } + if (RI.regs[EBP] == I) { + RI.regs[EBP] = 0; + } + if (RI.regs[EBX] == I) { + RI.regs[EBX] = 0; + } + if (RI.regs[EDX] == I) { + RI.regs[EDX] = 0; + } + if (RI.regs[EAX] == I) { + RI.regs[EAX] = 0; + } + if (RI.regs[ECX] == I) { + RI.regs[ECX] = 0; + } +} + +static X64Reg regEnsureInReg(RegInfo& RI, InstLoc I) { + OpArg loc = regLocForInst(RI, I); + if (!loc.IsSimpleReg()) { + X64Reg newReg = regFindFreeReg(RI); + RI.Jit->MOV(32, R(newReg), loc); + loc = R(newReg); + } + return loc.GetSimpleReg(); +} + +static void regSpillCallerSaved(RegInfo& RI) { + regSpill(RI, EDX); + regSpill(RI, ECX); + regSpill(RI, EAX); +} + +static X64Reg regBinLHSReg(RegInfo& RI, InstLoc I) { + if (RI.IInfo[I - RI.FirstI] & 4) { + return regEnsureInReg(RI, getOp1(I)); + } + X64Reg reg = regFindFreeReg(RI); + RI.Jit->MOV(32, R(reg), regLocForInst(RI, getOp1(I))); + return reg; +} + +static void regEmitBinInst(RegInfo& RI, InstLoc I, + void (Jit64::*op)(int, const OpArg&, + const OpArg&)) { + X64Reg reg = regBinLHSReg(RI, I); + if (isImm(*getOp2(I))) { + unsigned RHS = RI.Build->GetImmValue(getOp2(I)); + if (RHS + 128 < 256) { + (RI.Jit->*op)(32, R(reg), Imm8(RHS)); + } else { + (RI.Jit->*op)(32, R(reg), Imm32(RHS)); + } + } else { + (RI.Jit->*op)(32, R(reg), regLocForInst(RI, getOp2(I))); + } + RI.regs[reg] = I; +} + +static void regEmitMemLoad(RegInfo& RI, InstLoc I, unsigned Size) { + X64Reg reg = regBinLHSReg(RI, I); + if (RI.UseProfile) { + unsigned curLoad = ProfiledLoads[RI.numProfiledLoads++]; + if (!(curLoad & 0x0C000000)) { + if (regReadUse(RI, I)) { + unsigned addr = (u32)Memory::base - (curLoad & 0xC0000000); + RI.Jit->MOVZX(32, Size, reg, MDisp(reg, addr)); + RI.Jit->BSWAP(Size, reg); + RI.regs[reg] = I; + } + return; + } + } + if (RI.MakeProfile) { + RI.Jit->MOV(32, M(&ProfiledLoads[RI.numProfiledLoads++]), R(reg)); + } + RI.Jit->TEST(32, R(reg), Imm32(0x0C000000)); + FixupBranch argh = RI.Jit->J_CC(CC_Z); + if (reg != EAX) + RI.Jit->PUSH(32, R(EAX)); + switch (Size) + { + case 32: RI.Jit->ABI_CallFunctionR(thunks.ProtectFunction((void *)&Memory::Read_U32, 1), reg); break; + case 16: RI.Jit->ABI_CallFunctionR(thunks.ProtectFunction((void *)&Memory::Read_U16, 1), reg); break; + case 8: RI.Jit->ABI_CallFunctionR(thunks.ProtectFunction((void *)&Memory::Read_U8, 1), reg); break; + } + if (reg != EAX) { + RI.Jit->MOV(32, R(reg), R(EAX)); + RI.Jit->POP(32, R(EAX)); + } + FixupBranch arg2 = RI.Jit->J(); + RI.Jit->SetJumpTarget(argh); + RI.Jit->UnsafeLoadRegToReg(reg, reg, Size, 0, false); + RI.Jit->SetJumpTarget(arg2); + if (regReadUse(RI, I)) + RI.regs[reg] = I; +} + +static void regEmitMemStore(RegInfo& RI, InstLoc I, unsigned Size) { + if (RI.UseProfile) { + unsigned curStore = ProfiledLoads[RI.numProfiledLoads++]; + if (!(curStore & 0x0C000000)) { + X64Reg reg = regEnsureInReg(RI, getOp2(I)); + RI.Jit->MOV(32, R(ECX), regLocForInst(RI, getOp1(I))); + 
RI.Jit->BSWAP(Size, ECX); + unsigned addr = (u32)Memory::base - (curStore & 0xC0000000); + RI.Jit->MOV(Size, MDisp(reg, addr), R(ECX)); + return; + } else if ((curStore & 0xFFFFF000) == 0xCC008000) { + regSpill(RI, EAX); + RI.Jit->MOV(32, R(ECX), regLocForInst(RI, getOp1(I))); + RI.Jit->BSWAP(Size, ECX); + RI.Jit->MOV(32, R(EAX), M(&GPFifo::m_gatherPipeCount)); + RI.Jit->MOV(Size, MDisp(EAX, (u32)GPFifo::m_gatherPipe), R(ECX)); + RI.Jit->ADD(32, R(EAX), Imm8(Size >> 3)); + RI.Jit->MOV(32, M(&GPFifo::m_gatherPipeCount), R(EAX)); + RI.Jit->js.fifoBytesThisBlock += Size >> 3; + return; + } + } + regSpill(RI, EAX); + RI.Jit->MOV(32, R(EAX), regLocForInst(RI, getOp1(I))); + RI.Jit->MOV(32, R(ECX), regLocForInst(RI, getOp2(I))); + if (RI.MakeProfile) { + RI.Jit->MOV(32, M(&ProfiledLoads[RI.numProfiledLoads++]), R(ECX)); + } + RI.Jit->SafeWriteRegToReg(EAX, ECX, Size, 0); +} + +static void regEmitShiftInst(RegInfo& RI, InstLoc I, + void (Jit64::*op)(int, OpArg, OpArg)) +{ + X64Reg reg = regBinLHSReg(RI, I); + if (isImm(*getOp2(I))) { + unsigned RHS = RI.Build->GetImmValue(getOp2(I)); + (RI.Jit->*op)(32, R(reg), Imm8(RHS)); + RI.regs[reg] = I; + return; + } + // FIXME: prevent regBinLHSReg from finding ecx! + RI.Jit->MOV(32, R(ECX), regLocForInst(RI, getOp2(I))); + (RI.Jit->*op)(32, R(reg), R(ECX)); + RI.regs[reg] = I; +} + +static void regStoreInstToConstLoc(RegInfo& RI, unsigned width, InstLoc I, + void* loc) { + if (width != 32) { + PanicAlert("Not implemented!"); + return; + } + if (isImm(*I)) { + RI.Jit->MOV(32, M(loc), Imm32(RI.Build->GetImmValue(I))); + return; + } + X64Reg reg = regEnsureInReg(RI, I); + RI.Jit->MOV(32, M(loc), R(reg)); +} + +static void regEmitCmp(RegInfo& RI, InstLoc I) { + if (isImm(*getOp2(I))) { + unsigned RHS = RI.Build->GetImmValue(getOp2(I)); + RI.Jit->CMP(32, regLocForInst(RI, getOp1(I)), Imm32(RHS)); + } else { + X64Reg reg = regEnsureInReg(RI, getOp1(I)); + RI.Jit->CMP(32, R(reg), regLocForInst(RI, getOp2(I))); + } +} + +static void regWriteExit(RegInfo& RI, InstLoc dest) { + if (RI.MakeProfile) { + if (isImm(*dest)) { + RI.Jit->MOV(32, M(&PC), Imm32(RI.Build->GetImmValue(dest))); + } else { + RI.Jit->MOV(32, R(EAX), regLocForInst(RI, dest)); + RI.Jit->MOV(32, M(&PC), R(EAX)); + } + RI.Jit->Cleanup(); + RI.Jit->SUB(32, M(&CoreTiming::downcount), Imm32(RI.Jit->js.downcountAmount)); + RI.Jit->JMP(asm_routines.doReJit, true); + return; + } + if (isImm(*dest)) { + RI.Jit->WriteExit(RI.Build->GetImmValue(dest), RI.exitNumber++); + } else { + RI.Jit->MOV(32, R(EAX), regLocForInst(RI, dest)); + RI.Jit->WriteExitDestInEAX(RI.exitNumber++); + } +} + +static void DoWriteCode(IRBuilder* ibuild, Jit64* Jit, bool UseProfile) { + //printf("Writing block: %x\n", js.blockStart); + RegInfo RI(Jit, ibuild->getFirstInst(), ibuild->getNumInsts()); + RI.Build = ibuild; + RI.UseProfile = UseProfile; + RI.MakeProfile = !RI.UseProfile; + // Pass to compute liveness + // Note that despite this marking, we never materialize immediates; + // on x86, they almost always fold into the instruction, and it's at + // best a code-size reduction in the cases where they don't. 
+ ibuild->StartBackPass(); + for (unsigned index = RI.IInfo.size() - 1; index != -1U; --index) { + InstLoc I = ibuild->ReadBackward(); + unsigned op = getOpcode(*I); + bool thisUsed = regReadUse(RI, I); + switch (op) { + default: + PanicAlert("Unexpected inst!"); + case Nop: + case CInt16: + case CInt32: + case LoadGReg: + case LoadLink: + case LoadCR: + case LoadCarry: + case LoadCTR: + case LoadMSR: + case BlockEnd: + case BlockStart: + case InterpreterFallback: + // No liveness effects + break; + case Tramp: + if (thisUsed) + regMarkUse(RI, I, I - 1 - (*I >> 8), 1); + break; + case SExt8: + case SExt16: + case BSwap32: + case BSwap16: + if (thisUsed) + regMarkUse(RI, I, getOp1(I), 1); + break; + case Load8: + case Load16: + case Load32: + case StoreGReg: + case StoreCR: + case StoreLink: + case StoreCarry: + case StoreCTR: + case StoreMSR: + regMarkUse(RI, I, getOp1(I), 1); + break; + case Add: + case Sub: + case And: + case Or: + case Xor: + case Mul: + case Rol: + case Shl: + case Shrl: + case Sarl: + case ICmpCRUnsigned: + case ICmpCRSigned: + case ICmpEq: + case ICmpUgt: + if (thisUsed) { + regMarkUse(RI, I, getOp1(I), 1); + if (!isImm(*getOp2(I))) + regMarkUse(RI, I, getOp2(I), 2); + } + break; + case Store8: + case Store16: + case Store32: + regMarkUse(RI, I, getOp1(I), 1); + regMarkUse(RI, I, getOp2(I), 2); + break; + case BranchUncond: + if (!isImm(*getOp1(I))) + regMarkUse(RI, I, getOp1(I), 1); + break; + case BranchCond: + regMarkUse(RI, I, getOp1(I), 1); + if (!isImm(*getOp2(I))) + regMarkUse(RI, I, getOp2(I), 2); + break; + } + } + + ibuild->StartForwardPass(); + for (unsigned i = 0; i != RI.IInfo.size(); i++) { + InstLoc I = ibuild->ReadForward(); + bool thisUsed = regReadUse(RI, I); + switch (getOpcode(*I)) { + case InterpreterFallback: { + unsigned InstCode = ibuild->GetImmValue(getOp1(I)); + unsigned InstLoc = ibuild->GetImmValue(getOp2(I)); + regSpillCallerSaved(RI); + Jit->MOV(32, M(&PC), Imm32(InstLoc)); + Jit->MOV(32, M(&NPC), Imm32(InstLoc+4)); + Jit->ABI_CallFunctionC((void*)GetInterpreterOp(InstCode), + InstCode); + break; + } + case LoadGReg: { + if (!thisUsed) break; + X64Reg reg = regFindFreeReg(RI); + unsigned ppcreg = *I >> 8; + Jit->MOV(32, R(reg), M(&PowerPC::ppcState.gpr[ppcreg])); + RI.regs[reg] = I; + break; + } + case LoadCR: { + if (!thisUsed) break; + X64Reg reg = regFindFreeReg(RI); + unsigned ppcreg = *I >> 8; + Jit->MOVZX(32, 8, reg, M(&PowerPC::ppcState.cr_fast[ppcreg])); + RI.regs[reg] = I; + break; + } + case LoadCTR: { + if (!thisUsed) break; + X64Reg reg = regFindFreeReg(RI); + Jit->MOV(32, R(reg), M(&CTR)); + RI.regs[reg] = I; + break; + } + case LoadLink: { + if (!thisUsed) break; + X64Reg reg = regFindFreeReg(RI); + Jit->MOV(32, R(reg), M(&LR)); + RI.regs[reg] = I; + break; + } + case LoadMSR: { + if (!thisUsed) break; + X64Reg reg = regFindFreeReg(RI); + Jit->MOV(32, R(reg), M(&MSR)); + RI.regs[reg] = I; + break; + } + case StoreGReg: { + unsigned ppcreg = *I >> 16; + regStoreInstToConstLoc(RI, 32, getOp1(I), + &PowerPC::ppcState.gpr[ppcreg]); + break; + } + case StoreCR: { + Jit->MOV(32, R(ECX), regLocForInst(RI, getOp1(I))); + unsigned ppcreg = *I >> 16; + // CAUTION: uses 8-bit reg! 
+ Jit->MOV(8, M(&PowerPC::ppcState.cr_fast[ppcreg]), R(ECX)); + break; + } + case StoreLink: { + regStoreInstToConstLoc(RI, 32, getOp1(I), &LR); + break; + } + case StoreCTR: { + regStoreInstToConstLoc(RI, 32, getOp1(I), &CTR); + break; + } + case StoreMSR: { + regStoreInstToConstLoc(RI, 32, getOp1(I), &MSR); + break; + } + case StoreCarry: { + Jit->CMP(32, regLocForInst(RI, getOp1(I)), Imm8(0)); + FixupBranch nocarry = Jit->J_CC(CC_Z); + Jit->JitSetCA(); + FixupBranch cont = Jit->J(); + Jit->SetJumpTarget(nocarry); + Jit->JitClearCA(); + Jit->SetJumpTarget(cont); + break; + } + case Load8: { + regEmitMemLoad(RI, I, 8); + break; + } + case Load16: { + regEmitMemLoad(RI, I, 16); + break; + } + case Load32: { + regEmitMemLoad(RI, I, 32); + break; + } + case Store8: { + regEmitMemStore(RI, I, 8); + break; + } + case Store16: { + regEmitMemStore(RI, I, 16); + break; + } + case Store32: { + regEmitMemStore(RI, I, 32); + break; + } + case SExt8: { + if (!thisUsed) break; + X64Reg reg = regFindFreeReg(RI); + Jit->MOV(32, R(ECX), regLocForInst(RI, getOp1(I))); + Jit->MOVSX(32, 8, reg, R(ECX)); + RI.regs[reg] = I; + break; + } + case SExt16: { + if (!thisUsed) break; + X64Reg reg = regFindFreeReg(RI); + Jit->MOVSX(32, 16, reg, regLocForInst(RI, getOp1(I))); + RI.regs[reg] = I; + break; + } + case And: { + if (!thisUsed) break; + regEmitBinInst(RI, I, &Jit64::AND); + break; + } + case Xor: { + if (!thisUsed) break; + regEmitBinInst(RI, I, &Jit64::XOR); + break; + } + case Sub: { + if (!thisUsed) break; + regEmitBinInst(RI, I, &Jit64::SUB); + break; + } + case Or: { + if (!thisUsed) break; + regEmitBinInst(RI, I, &Jit64::OR); + break; + } + case Add: { + if (!thisUsed) break; + regEmitBinInst(RI, I, &Jit64::ADD); + break; + } + case Mul: { + if (!thisUsed) break; + // FIXME: Use three-address capability of IMUL! + X64Reg reg = regBinLHSReg(RI, I); + if (isImm(*getOp2(I))) { + unsigned RHS = RI.Build->GetImmValue(getOp2(I)); + if (RHS + 128 < 256) { + Jit->IMUL(32, reg, Imm8(RHS)); + } else { + Jit->IMUL(32, reg, Imm32(RHS)); + } + } else { + Jit->IMUL(32, reg, regLocForInst(RI, getOp2(I))); + } + RI.regs[reg] = I; + break; + } + case Rol: { + if (!thisUsed) break; + regEmitShiftInst(RI, I, &Jit64::ROL); + break; + } + case Shl: { + if (!thisUsed) break; + regEmitShiftInst(RI, I, &Jit64::SHL); + break; + } + case Shrl: { + if (!thisUsed) break; + regEmitShiftInst(RI, I, &Jit64::SHR); + break; + } + case Sarl: { + if (!thisUsed) break; + regEmitShiftInst(RI, I, &Jit64::SAR); + break; + } + case ICmpEq: { + if (!thisUsed) break; + regEmitCmp(RI, I); + Jit->SETcc(CC_Z, R(ECX)); // Caution: SETCC uses 8-bit regs! + X64Reg reg = regFindFreeReg(RI); + Jit->MOVZX(32, 8, reg, R(ECX)); + RI.regs[reg] = I; + break; + } + case ICmpUgt: { + if (!thisUsed) break; + regEmitCmp(RI, I); + Jit->SETcc(CC_A, R(ECX)); // Caution: SETCC uses 8-bit regs! 
+ X64Reg reg = regFindFreeReg(RI); + Jit->MOVZX(32, 8, reg, R(ECX)); + RI.regs[reg] = I; + break; + } + case ICmpCRUnsigned: { + if (!thisUsed) break; + regEmitCmp(RI, I); + X64Reg reg = regFindFreeReg(RI); + FixupBranch pLesser = Jit->J_CC(CC_B); + FixupBranch pGreater = Jit->J_CC(CC_A); + Jit->MOV(32, R(reg), Imm32(0x2)); // _x86Reg == 0 + FixupBranch continue1 = Jit->J(); + Jit->SetJumpTarget(pGreater); + Jit->MOV(32, R(reg), Imm32(0x4)); // _x86Reg > 0 + FixupBranch continue2 = Jit->J(); + Jit->SetJumpTarget(pLesser); + Jit->MOV(32, R(reg), Imm32(0x8)); // _x86Reg < 0 + Jit->SetJumpTarget(continue1); + Jit->SetJumpTarget(continue2); + RI.regs[reg] = I; + break; + } + case ICmpCRSigned: { + if (!thisUsed) break; + regEmitCmp(RI, I); + X64Reg reg = regFindFreeReg(RI); + FixupBranch pLesser = Jit->J_CC(CC_L); + FixupBranch pGreater = Jit->J_CC(CC_G); + Jit->MOV(32, R(reg), Imm32(0x2)); // _x86Reg == 0 + FixupBranch continue1 = Jit->J(); + Jit->SetJumpTarget(pGreater); + Jit->MOV(32, R(reg), Imm32(0x4)); // _x86Reg > 0 + FixupBranch continue2 = Jit->J(); + Jit->SetJumpTarget(pLesser); + Jit->MOV(32, R(reg), Imm32(0x8)); // _x86Reg < 0 + Jit->SetJumpTarget(continue1); + Jit->SetJumpTarget(continue2); + RI.regs[reg] = I; + break; + } + case CInt32: + case CInt16: { + if (!thisUsed) break; + X64Reg reg = regFindFreeReg(RI); + Jit->MOV(32, R(reg), Imm32(ibuild->GetImmValue(I))); + RI.regs[reg] = I; + break; + } + case BlockStart: + case BlockEnd: + break; + case BranchCond: { + Jit->CMP(32, regLocForInst(RI, getOp1(I)), Imm8(0)); + FixupBranch cont = Jit->J_CC(CC_NZ); + regWriteExit(RI, getOp2(I)); + Jit->SetJumpTarget(cont); + break; + } + case BranchUncond: { + regWriteExit(RI, getOp1(I)); + break; + } + case Tramp: { + if (!thisUsed) break; + // FIXME: Optimize! + InstLoc Op = I - 1 - (*I >> 8); + X64Reg reg = regFindFreeReg(RI); + Jit->MOV(32, R(reg), regLocForInst(RI, Op)); + RI.regs[reg] = I; + if (RI.IInfo[I - RI.FirstI] & 4) + regClearInst(RI, Op); + break; + } + case Nop: break; + default: + PanicAlert("Unknown JIT instruction; aborting!"); + exit(1); + } + if (getOpcode(*I) != Tramp) { + if (RI.IInfo[I - RI.FirstI] & 4) + regClearInst(RI, getOp1(I)); + if (RI.IInfo[I - RI.FirstI] & 8) + regClearInst(RI, getOp2(I)); + } + } + for (unsigned i = 0; i < 8; i++) { + if (RI.regs[i]) { + PanicAlert("Incomplete cleanup!"); + exit(1); + } + } + + printf("Block: %x, numspills %d\n", Jit->js.blockStart, RI.numSpills); + + Jit->MOV(32, R(EAX), M(&NPC)); + Jit->WriteRfiExitDestInEAX(); + Jit->UD2(); +} + +void Jit64::WriteCode() { + DoWriteCode(&ibuild, this, false); +} + +void ProfiledReJit() { + u8* x = (u8*)jit.GetCodePtr(); + jit.SetCodePtr(jit.js.normalEntry); + DoWriteCode(&jit.ibuild, &jit, true); + jit.js.curBlock->codeSize = jit.GetCodePtr() - jit.js.normalEntry; + jit.SetCodePtr(x); +} diff --git a/Source/Core/Core/Src/PowerPC/Jit64IL/IR.h b/Source/Core/Core/Src/PowerPC/Jit64IL/IR.h new file mode 100644 index 0000000000..2a3f3c67c6 --- /dev/null +++ b/Source/Core/Core/Src/PowerPC/Jit64IL/IR.h @@ -0,0 +1,322 @@ +// Copyright (C) 2003-2008 Dolphin Project. + +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, version 2.0. + +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the +// GNU General Public License 2.0 for more details. + +// A copy of the GPL 2.0 should have been included with the program. +// If not, see http://www.gnu.org/licenses/ + +// Official SVN repository and contact information can be found at +// http://code.google.com/p/dolphin-emu/ + +#ifndef IR_H +#define IR_H + +#include "x64Emitter.h" +#include + +namespace IREmitter { + + enum Opcode { + Nop = 0, + + // "Zero-operand" operators + // Register load operators + LoadGReg, + LoadLink, + LoadCR, + LoadCarry, + LoadCTR, + LoadMSR, + + // Unary operators + // Integer unary operators + SExt8, + SExt16, + BSwap32, + BSwap16, + Load8, // These loads zext + Load16, + Load32, + // Branches + BranchUncond, + // Register store operators + StoreGReg, + StoreCR, + StoreLink, + StoreCarry, + StoreCTR, + StoreMSR, + // Arbitrary interpreter instruction + InterpreterFallback, + + // Binary operators + // Commutative integer operators + Add, + Mul, + And, + Or, + Xor, + // Non-commutative integer operators + Sub, + Shl, // Note that shifts ignore bits above the bottom 5 + Shrl, + Sarl, + Rol, + ICmpCRSigned, // CR for signed int compare + ICmpCRUnsigned, // CR for unsigned int compare + ICmpEq, // One if equal, zero otherwise + ICmpUgt, // One if op1 > op2, zero otherwise + // Memory store operators + Store8, + Store16, + Store32, + BranchCond, + + // "Trinary" operators + // FIXME: Need to change representation! + //Select, // Equivalent to C "Op1 ? Op2 : Op3" + + // Integer constants + CInt16, + CInt32, + + // "Opcode" representing a register too far away to + // reference directly; this is a size optimization + Tramp, + // "Opcode"s representing the start and end + BlockStart, BlockEnd + }; + + typedef unsigned Inst; + typedef Inst* InstLoc; + + unsigned inline getOpcode(Inst i) { + return i & 255; + } + + unsigned inline isImm(Inst i) { + return getOpcode(i) >= CInt16 && getOpcode(i) <= CInt32; + } + + unsigned inline isUnary(Inst i) { + return getOpcode(i) >= SExt8 && getOpcode(i) <= BSwap16; + } + + unsigned inline isBinary(Inst i) { + return getOpcode(i) >= Add && getOpcode(i) <= ICmpCRUnsigned; + } + + unsigned inline isMemLoad(Inst i) { + return getOpcode(i) >= Load8 && getOpcode(i) <= Load32; + } + + unsigned inline isMemStore(Inst i) { + return getOpcode(i) >= Store8 && getOpcode(i) <= Store32; + } + + unsigned inline isRegLoad(Inst i) { + return getOpcode(i) >= LoadGReg && getOpcode(i) <= LoadCR; + } + + unsigned inline isRegStore(Inst i) { + return getOpcode(i) >= LoadGReg && getOpcode(i) <= LoadCR; + } + + unsigned inline isBranch(Inst i) { + return getOpcode(i) >= BranchUncond && + getOpcode(i) <= BranchCond; + } + + unsigned inline isInterpreterFallback(Inst i) { + return getOpcode(i) == InterpreterFallback; + } + + InstLoc inline getOp1(InstLoc i) { + return i - 1 - ((*i >> 8) & 255); + } + + InstLoc inline getOp2(InstLoc i) { + return i - 1 - ((*i >> 16) & 255); + } + + class IRBuilder { + InstLoc EmitZeroOp(unsigned Opcode, unsigned extra); + InstLoc EmitUOp(unsigned OpCode, InstLoc Op1, + unsigned extra = 0); + InstLoc EmitBiOp(unsigned OpCode, InstLoc Op1, InstLoc Op2); + + InstLoc FoldAdd(InstLoc Op1, InstLoc Op2); + InstLoc FoldAnd(InstLoc Op1, InstLoc Op2); + InstLoc FoldOr(InstLoc Op1, InstLoc Op2); + InstLoc FoldRol(InstLoc Op1, InstLoc Op2); + InstLoc FoldShl(InstLoc Op1, InstLoc Op2); + InstLoc FoldShrl(InstLoc Op1, InstLoc Op2); + InstLoc FoldXor(InstLoc Op1, InstLoc Op2); + + InstLoc FoldInterpreterFallback(InstLoc Op1, InstLoc Op2); + + InstLoc FoldZeroOp(unsigned 
Opcode, unsigned extra); + InstLoc FoldUOp(unsigned OpCode, InstLoc Op1, + unsigned extra = 0); + InstLoc FoldBiOp(unsigned OpCode, InstLoc Op1, InstLoc Op2); + + public: + InstLoc EmitIntConst(unsigned value); + InstLoc EmitStoreLink(InstLoc val) { + return FoldUOp(StoreLink, val); + } + InstLoc EmitBranchUncond(InstLoc val) { + return FoldUOp(BranchUncond, val); + } + InstLoc EmitBranchCond(InstLoc check, InstLoc dest) { + return FoldBiOp(BranchCond, check, dest); + } + InstLoc EmitLoadCR(unsigned crreg) { + return FoldZeroOp(LoadCR, crreg); + } + InstLoc EmitStoreCR(InstLoc value, unsigned crreg) { + return FoldUOp(StoreCR, value, crreg); + } + InstLoc EmitLoadLink() { + return FoldZeroOp(LoadLink, 0); + } + InstLoc EmitLoadMSR() { + return FoldZeroOp(LoadMSR, 0); + } + InstLoc EmitStoreMSR(InstLoc val) { + return FoldUOp(StoreMSR, val); + } + InstLoc EmitLoadGReg(unsigned reg) { + return FoldZeroOp(LoadGReg, reg); + } + InstLoc EmitStoreGReg(InstLoc value, unsigned reg) { + return FoldUOp(StoreGReg, value, reg); + } + InstLoc EmitAnd(InstLoc op1, InstLoc op2) { + return FoldBiOp(And, op1, op2); + } + InstLoc EmitXor(InstLoc op1, InstLoc op2) { + return FoldBiOp(Xor, op1, op2); + } + InstLoc EmitSub(InstLoc op1, InstLoc op2) { + return FoldBiOp(Sub, op1, op2); + } + InstLoc EmitOr(InstLoc op1, InstLoc op2) { + return FoldBiOp(Or, op1, op2); + } + InstLoc EmitAdd(InstLoc op1, InstLoc op2) { + return FoldBiOp(Add, op1, op2); + } + InstLoc EmitMul(InstLoc op1, InstLoc op2) { + return FoldBiOp(Mul, op1, op2); + } + InstLoc EmitRol(InstLoc op1, InstLoc op2) { + return FoldBiOp(Rol, op1, op2); + } + InstLoc EmitShl(InstLoc op1, InstLoc op2) { + return FoldBiOp(Shl, op1, op2); + } + InstLoc EmitShrl(InstLoc op1, InstLoc op2) { + return FoldBiOp(Shrl, op1, op2); + } + InstLoc EmitSarl(InstLoc op1, InstLoc op2) { + return FoldBiOp(Sarl, op1, op2); + } + InstLoc EmitLoadCTR() { + return FoldZeroOp(LoadCTR, 0); + } + InstLoc EmitStoreCTR(InstLoc op1) { + return FoldUOp(StoreCTR, op1); + } + InstLoc EmitICmpEq(InstLoc op1, InstLoc op2) { + return FoldBiOp(ICmpEq, op1, op2); + } + InstLoc EmitICmpUgt(InstLoc op1, InstLoc op2) { + return FoldBiOp(ICmpUgt, op1, op2); + } + InstLoc EmitLoad8(InstLoc op1) { + return FoldUOp(Load8, op1); + } + InstLoc EmitLoad16(InstLoc op1) { + return FoldUOp(Load16, op1); + } + InstLoc EmitLoad32(InstLoc op1) { + return FoldUOp(Load32, op1); + } + InstLoc EmitStore8(InstLoc op1, InstLoc op2) { + return FoldBiOp(Store8, op1, op2); + } + InstLoc EmitStore16(InstLoc op1, InstLoc op2) { + return FoldBiOp(Store16, op1, op2); + } + InstLoc EmitStore32(InstLoc op1, InstLoc op2) { + return FoldBiOp(Store32, op1, op2); + } + InstLoc EmitSExt16(InstLoc op1) { + return FoldUOp(SExt16, op1); + } + InstLoc EmitSExt8(InstLoc op1) { + return FoldUOp(SExt8, op1); + } + InstLoc EmitICmpCRSigned(InstLoc op1, InstLoc op2) { + return FoldBiOp(ICmpCRSigned, op1, op2); + } + InstLoc EmitICmpCRUnsigned(InstLoc op1, InstLoc op2) { + return FoldBiOp(ICmpCRUnsigned, op1, op2); + } + InstLoc EmitInterpreterFallback(InstLoc op1, InstLoc op2) { + return FoldBiOp(InterpreterFallback, op1, op2); + } + InstLoc EmitStoreCarry(InstLoc op1) { + return FoldUOp(StoreCarry, op1); + } + + void StartBackPass() { curReadPtr = &InstList[InstList.size()]; } + void StartForwardPass() { curReadPtr = &InstList[0]; } + InstLoc ReadForward() { return curReadPtr++; } + InstLoc ReadBackward() { return --curReadPtr; } + InstLoc getFirstInst() { return &InstList[0]; } + unsigned getNumInsts() { return InstList.size(); } 
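As an editorial aside (not part of this header): a front-end built on the emitters above might lower the PPC instruction addi rD, rA, SIMM roughly as follows. Thanks to the folding in IR.cpp, the LoadGReg is shared with earlier uses of rA in the block, and the StoreGReg kills any previous dead store to rD. LowerAddi is a hypothetical helper shown only to illustrate the API.

// ---- editorial sketch begins (not part of this header) ----
// Assumes the IRBuilder declarations above.
void LowerAddi(IREmitter::IRBuilder& build, int rd, int ra, int simm) {
    IREmitter::InstLoc val;
    if (ra == 0)                                  // addi with rA = 0 is "load immediate"
        val = build.EmitIntConst((unsigned)simm);
    else
        val = build.EmitAdd(build.EmitLoadGReg(ra),
                            build.EmitIntConst((unsigned)simm));
    build.EmitStoreGReg(val, rd);
}
// ---- editorial sketch ends ----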
+ unsigned ReadInst(InstLoc I) { return *I; } + unsigned GetImmValue(InstLoc I); + + void Reset() { + InstList.clear(); + InstList.reserve(100000); + for (unsigned i = 0; i < 32; i++) { + GRegCache[i] = 0; + GRegCacheStore[i] = 0; + } + CarryCache = 0; + CarryCacheStore = 0; + for (unsigned i = 0; i < 8; i++) { + CRCache[i] = 0; + CRCacheStore[i] = 0; + } + } + + IRBuilder() { Reset(); } + + private: + std::vector InstList; // FIXME: We must ensure this is + // continuous! + std::vector ConstList; + InstLoc curReadPtr; + InstLoc GRegCache[32]; + InstLoc GRegCacheStore[32]; + InstLoc CarryCache; + InstLoc CarryCacheStore; + InstLoc CRCache[8]; + InstLoc CRCacheStore[8]; + }; + +}; + +#endif diff --git a/Source/Core/Core/Src/PowerPC/Jit64IL/Jit.cpp b/Source/Core/Core/Src/PowerPC/Jit64IL/Jit.cpp new file mode 100644 index 0000000000..bc3ba82d3f --- /dev/null +++ b/Source/Core/Core/Src/PowerPC/Jit64IL/Jit.cpp @@ -0,0 +1,528 @@ +// Copyright (C) 2003-2008 Dolphin Project. + +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, version 2.0. + +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License 2.0 for more details. + +// A copy of the GPL 2.0 should have been included with the program. +// If not, see http://www.gnu.org/licenses/ + +// Official SVN repository and contact information can be found at +// http://code.google.com/p/dolphin-emu/ + +#include + +#include "Common.h" +#include "x64Emitter.h" +#include "ABI.h" +#include "Thunk.h" +#include "../../HLE/HLE.h" +#include "../../Core.h" +#include "../../PatchEngine.h" +#include "../../CoreTiming.h" +#include "../../Debugger/Debugger_BreakPoints.h" +#include "../PowerPC.h" +#include "../Profiler.h" +#include "../PPCTables.h" +#include "../PPCAnalyst.h" +#include "../../HW/Memmap.h" +#include "../../HW/GPFifo.h" +#include "Jit.h" +#include "JitAsm.h" +#include "JitCache.h" +#include "JitRegCache.h" + +using namespace Gen; +using namespace PowerPC; + +extern int blocksExecuted; + +// Dolphin's PowerPC->x86 JIT dynamic recompiler +// (Nearly) all code by ector (hrydgard) +// Features: +// * x86 & x64 support, lots of shared code. +// * Basic block linking +// * Fast dispatcher + +// Unfeatures: +// * Does not recompile all instructions - sometimes falls back to inserting a CALL to the corresponding JIT function. + +// Various notes below + +// Register allocation +// RAX - Generic quicktemp register +// RBX - point to base of memory map +// RSI RDI R12 R13 R14 R15 - free for allocation +// RCX RDX R8 R9 R10 R11 - allocate in emergencies. These need to be flushed before functions are called. +// RSP - stack pointer, do not generally use, very dangerous +// RBP - ? + +// IMPORTANT: +// Make sure that all generated code and all emulator state sits under the 2GB boundary so that +// RIP addressing can be used easily. Windows will always allocate static code under the 2GB boundary. +// Also make sure to use VirtualAlloc and specify EXECUTE permission. + +// Open questions +// * Should there be any statically allocated registers? r3, r4, r5, r8, r0 come to mind.. maybe sp +// * Does it make sense to finish off the remaining non-jitted instructions? Seems we are hitting diminishing returns. +// * Why is the FPU exception handling not working 100%? 
Several games still get corrupted floating point state.
+// This can even be seen in one homebrew Wii demo - RayTracer.elf
+
+// Other considerations
+//
+// Many instructions have shorter forms for EAX. However, I believe their performance boost
+// will be so small as to be negligible, so I haven't dirtied up the code with that. AMD recommends it in their
+// optimization manuals, though.
+//
+// We support block linking. Reserve space at the exits of every block for a full 5-byte jmp. Save 16-bit offsets
+// from the starts of each block, marking the exits so that they can be nicely patched at any time.
+//
+// Blocks do NOT use call/ret, they only jmp to each other and to the dispatcher when necessary.
+//
+// All blocks that can be precompiled will be precompiled. Code will be memory protected - any write will mark
+// the region as non-compilable, and all links to the page will be torn out and replaced with dispatcher jmps.
+//
+// Alternatively, the icbi instruction SHOULD mark where we can't compile
+//
+// Seldom-happening events are handled by adding a decrement of a counter to all blr instructions (which are
+// expensive anyway since we need to return to the dispatcher, except when they can be predicted).
+
+// TODO: SERIOUS synchronization problem with the video plugin setting tokens and breakpoints in dual core mode!!!
+// Somewhat fixed by disabling idle skipping when certain interrupts are enabled
+// This is not a permanent, reliable fix
+// TODO: Zeldas go whacko when you hang the gfx thread
+
+// Idea - Accurate exception handling
+// Compute register state at a certain instruction by running the JIT in "dry mode", and stopping at the right place.
+// Not likely to be done :P
+
+
+// Optimization Ideas -
+/*
+ * Assume SP is in main RAM (in Wii mode too?) - partly done
+ * Assume all floating point loads and double precision loads+stores are to/from main ram
+ (single precision can be used in write gather pipe, specialized fast check added)
+ * AMD only - use movaps instead of movapd when loading ps from memory?
+ * HLE functions like floorf, sin, memcpy, etc - they can be much faster
+ * ABI optimizations - drop F0-F13 on blr, for example. Watch out for context switching.
+ CR2-CR4 are non-volatile, rest of CR is volatile -> dropped on blr.
+ R5-R12 are volatile -> dropped on blr.
+ * classic inlining across calls.
+
+Low hanging fruit:
+stfd -- guaranteed in memory
+cmpl
+mulli
+stfs
+stwu
+lb/stzx
+
+bcx - optimize!
+bcctr +stfs +psq_st +addx +orx +rlwimix +fcmpo +DSP_UpdateARAMDMA +lfd +stwu +cntlzwx +bcctrx +WriteBigEData + +TODO +lha +srawx +addic_rc +addex +subfcx +subfex + +fmaddx +fmulx +faddx +fnegx +frspx +frsqrtex +ps_sum0 +ps_muls0 +ps_adds1 + +*/ + +Jit64 jit; + +int CODE_SIZE = 1024*1024*16; + +namespace CPUCompare +{ + extern u32 m_BlockStart; +} + + void Jit(u32 em_address) + { + jit.Jit(em_address); + } + + void Jit64::Init() + { + asm_routines.compareEnabled = ::Core::g_CoreStartupParameter.bRunCompareClient; + if (Core::g_CoreStartupParameter.bJITUnlimitedCache) + CODE_SIZE = 1024*1024*8*8; + + jo.optimizeStack = true; + jo.enableBlocklink = true; // Speed boost, but not 100% safe +#ifdef _M_X64 + jo.enableFastMem = Core::GetStartupParameter().bUseFastMem; +#else + jo.enableFastMem = false; +#endif + jo.assumeFPLoadFromMem = true; + jo.fpAccurateFlags = true; + jo.optimizeGatherPipe = true; + jo.fastInterrupts = false; + jo.accurateSinglePrecision = false; + + gpr.SetEmitter(this); + fpr.SetEmitter(this); + + trampolines.Init(); + AllocCodeSpace(CODE_SIZE); + + blocks.Init(); + asm_routines.Init(); + } + + void Jit64::Shutdown() + { + FreeCodeSpace(); + + blocks.Shutdown(); + trampolines.Shutdown(); + asm_routines.Shutdown(); + } + + + void Jit64::WriteCallInterpreter(UGeckoInstruction inst) + { + gpr.Flush(FLUSH_ALL); + fpr.Flush(FLUSH_ALL); + if (js.isLastInstruction) + { + MOV(32, M(&PC), Imm32(js.compilerPC)); + MOV(32, M(&NPC), Imm32(js.compilerPC + 4)); + } + Interpreter::_interpreterInstruction instr = GetInterpreterOp(inst); + ABI_CallFunctionC((void*)instr, inst.hex); + if (js.isLastInstruction) + { + MOV(32, R(EAX), M(&NPC)); + WriteRfiExitDestInEAX(); + } + } + + void Jit64::unknown_instruction(UGeckoInstruction inst) + { + // CCPU::Break(); + PanicAlert("unknown_instruction %08x - Fix me ;)", inst.hex); + } + + void Jit64::Default(UGeckoInstruction _inst) + { + ibuild.EmitInterpreterFallback( + ibuild.EmitIntConst(_inst.hex), + ibuild.EmitIntConst(js.compilerPC)); + } + + void Jit64::HLEFunction(UGeckoInstruction _inst) + { + gpr.Flush(FLUSH_ALL); + fpr.Flush(FLUSH_ALL); + ABI_CallFunctionCC((void*)&HLE::Execute, js.compilerPC, _inst.hex); + MOV(32, R(EAX), M(&NPC)); + WriteExitDestInEAX(0); + } + + void Jit64::DoNothing(UGeckoInstruction _inst) + { + // Yup, just don't do anything. + } + + void Jit64::NotifyBreakpoint(u32 em_address, bool set) + { + int block_num = blocks.GetBlockNumberFromStartAddress(em_address); + if (block_num >= 0) + { + blocks.DestroyBlock(block_num, false); + } + } + + static const bool ImHereDebug = false; + static const bool ImHereLog = false; + static std::map been_here; + + void ImHere() + { + static FILE *f = 0; + if (ImHereLog) { + if (!f) + { +#ifdef _M_X64 + f = fopen("log64.txt", "w"); +#else + f = fopen("log32.txt", "w"); +#endif + } + fprintf(f, "%08x\n", PC); + } + if (been_here.find(PC) != been_here.end()) { + been_here.find(PC)->second++; + if ((been_here.find(PC)->second) & 1023) + return; + } + LOG(DYNA_REC, "I'm here - PC = %08x , LR = %08x", PC, LR); + printf("I'm here - PC = %08x , LR = %08x", PC, LR); + been_here[PC] = 1; + } + + void Jit64::Cleanup() + { + if (jo.optimizeGatherPipe && js.fifoBytesThisBlock > 0) + ABI_CallFunction((void *)&GPFifo::CheckGatherPipe); + } + + void Jit64::WriteExit(u32 destination, int exit_num) + { + Cleanup(); + SUB(32, M(&CoreTiming::downcount), js.downcountAmount > 127 ? 
Imm32(js.downcountAmount) : Imm8(js.downcountAmount)); + + //If nobody has taken care of this yet (this can be removed when all branches are done) + JitBlock *b = js.curBlock; + b->exitAddress[exit_num] = destination; + b->exitPtrs[exit_num] = GetWritableCodePtr(); + + // Link opportunity! + int block = blocks.GetBlockNumberFromStartAddress(destination); + if (block >= 0 && jo.enableBlocklink) + { + // It exists! Joy of joy! + JMP(blocks.GetBlock(block)->checkedEntry, true); + b->linkStatus[exit_num] = true; + } + else + { + MOV(32, M(&PC), Imm32(destination)); + JMP(asm_routines.dispatcher, true); + } + } + + void Jit64::WriteExitDestInEAX(int exit_num) + { + MOV(32, M(&PC), R(EAX)); + Cleanup(); + SUB(32, M(&CoreTiming::downcount), js.downcountAmount > 127 ? Imm32(js.downcountAmount) : Imm8(js.downcountAmount)); + JMP(asm_routines.dispatcher, true); + } + + void Jit64::WriteRfiExitDestInEAX() + { + MOV(32, M(&PC), R(EAX)); + Cleanup(); + SUB(32, M(&CoreTiming::downcount), js.downcountAmount > 127 ? Imm32(js.downcountAmount) : Imm8(js.downcountAmount)); + JMP(asm_routines.testExceptions, true); + } + + void Jit64::WriteExceptionExit(u32 exception) + { + Cleanup(); + OR(32, M(&PowerPC::ppcState.Exceptions), Imm32(exception)); + MOV(32, M(&PC), Imm32(js.compilerPC + 4)); + JMP(asm_routines.testExceptions, true); + } + + void STACKALIGN Jit64::Run() + { + CompiledCode pExecAddr = (CompiledCode)asm_routines.enterCode; + pExecAddr(); + //Will return when PowerPC::state changes + } + + void Jit64::SingleStep() + { + // NOT USED, NOT TESTED, PROBABLY NOT WORKING YET + // PanicAlert("Single"); + /* + JitBlock temp_block; + PPCAnalyst::CodeBuffer temp_codebuffer(1); // Only room for one instruction! Single step! + const u8 *code = DoJit(PowerPC::ppcState.pc, &temp_codebuffer, &temp_block); + CompiledCode pExecAddr = (CompiledCode)code; + pExecAddr();*/ + } + + void STACKALIGN Jit64::Jit(u32 em_address) + { + if (GetSpaceLeft() < 0x10000 || blocks.IsFull()) + { + LOG(DYNA_REC, "JIT cache full - clearing.") + if (Core::g_CoreStartupParameter.bJITUnlimitedCache) + { + PanicAlert("What? JIT cache still full - clearing."); + } + ClearCache(); + } + int block_num = blocks.AllocateBlock(em_address); + JitBlock *b = blocks.GetBlock(block_num); + blocks.FinalizeBlock(block_num, jo.enableBlocklink, DoJit(em_address, &code_buffer, b)); + } + + const u8* Jit64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buffer, JitBlock *b) + { + Core::g_CoreStartupParameter.bJITLoadStoreOff = true; + Core::g_CoreStartupParameter.bJITLoadStoreFloatingOff = true; + Core::g_CoreStartupParameter.bJITLoadStorePairedOff = true; + Core::g_CoreStartupParameter.bJITFloatingPointOff = true; + Core::g_CoreStartupParameter.bJITIntegerOff = true; + Core::g_CoreStartupParameter.bJITPairedOff = true; + Core::g_CoreStartupParameter.bJITSystemRegistersOff = true; + Core::g_CoreStartupParameter.bJITBranchOff = true; + if (em_address == 0) + PanicAlert("ERROR : Trying to compile at 0. LR=%08x", LR); + + int size; + js.isLastInstruction = false; + js.blockStart = em_address; + js.fifoBytesThisBlock = 0; + js.curBlock = b; + js.blockSetsQuantizers = false; + js.block_flags = 0; + js.cancel = false; + + //Analyze the block, collect all instructions it is made of (including inlining, + //if that is enabled), reorder instructions for optimal performance, and join joinable instructions. 
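Before DoJit's body continues below: the SUB on CoreTiming::downcount emitted by WriteExit above, together with the J_CC(CC_NBE)-guarded check DoJit emits at each block entry, forms a simple scheduling loop. An editorial sketch (not part of the patch) of what that amounts to, with hypothetical names and cycle numbers:

// ---- editorial sketch begins (not part of the patch) ----
#include <cstdio>

static int downcount = 0;                // stands in for CoreTiming::downcount

static void RunTimingEvents() {          // roughly what doTiming leads to
    printf("timing slice finished\n");
    downcount += 1000;                   // hypothetical length of the next slice
}

// The SUB emitted by WriteExit at every block exit.
static void BlockExit(int downcountAmount) {
    downcount -= downcountAmount;
}

// The check emitted at block entry: if the previous exit drove the counter to
// zero or below, go service timing events instead of running straight on.
static void BlockPrologue() {
    if (downcount <= 0)
        RunTimingEvents();
}

int main() {
    RunTimingEvents();                   // prime the first slice
    for (int i = 0; i < 5; i++) {
        BlockPrologue();
        BlockExit(300);                  // this block costs ~300 cycles
    }
    return 0;
}
// ---- editorial sketch ends ----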
+ PPCAnalyst::Flatten(em_address, &size, &js.st, &js.gpa, &js.fpa, code_buffer); + PPCAnalyst::CodeOp *ops = code_buffer->codebuffer; + + const u8 *start = AlignCode4(); //TODO: Test if this or AlignCode16 make a difference from GetCodePtr + b->checkedEntry = start; + b->runCount = 0; + + // Downcount flag check. The last block decremented downcounter, and the flag should still be available. + FixupBranch skip = J_CC(CC_NBE); + MOV(32, M(&PC), Imm32(js.blockStart)); + JMP(asm_routines.doTiming, true); // downcount hit zero - go doTiming. + SetJumpTarget(skip); + + const u8 *normalEntry = GetCodePtr(); + js.normalEntry = (u8*)normalEntry; + + if (ImHereDebug) + ABI_CallFunction((void *)&ImHere); //Used to get a trace of the last few blocks before a crash, sometimes VERY useful + + if (false && js.fpa.any) + { + //This block uses FPU - needs to add FP exception bailout + TEST(32, M(&PowerPC::ppcState.msr), Imm32(1 << 13)); //Test FP enabled bit + FixupBranch b1 = J_CC(CC_NZ); + MOV(32, M(&PC), Imm32(js.blockStart)); + JMP(asm_routines.fpException, true); + SetJumpTarget(b1); + } + + if (false && jo.fastInterrupts) + { + // This does NOT yet work. + TEST(32, M(&PowerPC::ppcState.Exceptions), Imm32(0xFFFFFFFF)); + FixupBranch b1 = J_CC(CC_Z); + MOV(32, M(&PC), Imm32(js.blockStart)); + JMP(asm_routines.testExceptions, true); + SetJumpTarget(b1); + } + + // Conditionally add profiling code. + if (Profiler::g_ProfileBlocks) { + ADD(32, M(&b->runCount), Imm8(1)); +#ifdef _WIN32 + b->ticCounter.QuadPart = 0; + b->ticStart.QuadPart = 0; + b->ticStop.QuadPart = 0; +#else +//TODO +#endif + // get start tic + PROFILER_QUERY_PERFORMACE_COUNTER(&b->ticStart); + } + + //Start up the register allocators + //They use the information in gpa/fpa to preload commonly used registers. + //gpr.Start(js.gpa); + //fpr.Start(js.fpa); + ibuild.Reset(); + + js.downcountAmount = js.st.numCycles + PatchEngine::GetSpeedhackCycles(em_address); + js.blockSize = size; + // Translate instructions + for (int i = 0; i < (int)size; i++) + { + // gpr.Flush(FLUSH_ALL); + // if (PPCTables::UsesFPU(_inst)) + // fpr.Flush(FLUSH_ALL); + js.compilerPC = ops[i].address; + js.op = &ops[i]; + js.instructionNumber = i; + if (i == (int)size - 1) + { + // WARNING - cmp->branch merging will screw this up. + js.isLastInstruction = true; + js.next_inst = 0; + if (Profiler::g_ProfileBlocks) { + // CAUTION!!! push on stack regs you use, do your stuff, then pop + PROFILER_VPUSH; + // get end tic + PROFILER_QUERY_PERFORMACE_COUNTER(&b->ticStop); + // tic counter += (end tic - start tic) + PROFILER_ADD_DIFF_LARGE_INTEGER(&b->ticCounter, &b->ticStop, &b->ticStart); + PROFILER_VPOP; + } + } + else + { + // help peephole optimizations + js.next_inst = ops[i + 1].inst; + js.next_compilerPC = ops[i + 1].address; + } + + if (jo.optimizeGatherPipe && js.fifoBytesThisBlock >= 32) + { + js.fifoBytesThisBlock -= 32; + ABI_CallFunction(thunks.ProtectFunction((void *)&GPFifo::CheckGatherPipe, 0)); + } + + // If starting from the breakpointed instruction, we don't break. 
+ if (em_address != ops[i].address && BreakPoints::IsAddressBreakPoint(ops[i].address)) + { + + } + + if (!ops[i].skip) + PPCTables::CompileInstruction(ops[i].inst); + + gpr.SanityCheck(); + fpr.SanityCheck(); + if (js.cancel) + break; + } + + WriteCode(); + + b->flags = js.block_flags; + b->codeSize = (u32)(GetCodePtr() - normalEntry); + b->originalSize = size; + return normalEntry; + } diff --git a/Source/Core/Core/Src/PowerPC/Jit64IL/Jit.h b/Source/Core/Core/Src/PowerPC/Jit64IL/Jit.h new file mode 100644 index 0000000000..8ddaf59cbc --- /dev/null +++ b/Source/Core/Core/Src/PowerPC/Jit64IL/Jit.h @@ -0,0 +1,299 @@ +// Copyright (C) 2003-2008 Dolphin Project. + +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, version 2.0. + +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License 2.0 for more details. + +// A copy of the GPL 2.0 should have been included with the program. +// If not, see http://www.gnu.org/licenses/ + +// Official SVN repository and contact information can be found at +// http://code.google.com/p/dolphin-emu/ + +// ======================== +// See comments in Jit.cpp. +// ======================== + +// Mystery: Capcom vs SNK 800aa278 + +// CR flags approach: +// * Store that "N+Z flag contains CR0" or "S+Z flag contains CR3". +// * All flag altering instructions flush this +// * A flush simply does a conditional write to the appropriate CRx. +// * If flag available, branch code can become absolutely trivial. + +#ifndef _JIT_H +#define _JIT_H + +#include "../PPCAnalyst.h" +#include "JitCache.h" +#include "JitRegCache.h" +#include "x64Emitter.h" +#include "x64Analyzer.h" +#include "IR.h" + +#ifdef _WIN32 + +#include + +#else + +// A bit of a hack to get things building under linux. We manually fill in this structure as needed +// from the real context. +struct CONTEXT +{ +#ifdef _M_X64 + u64 Rip; + u64 Rax; +#else + u32 Eip; + u32 Eax; +#endif +}; + +#endif + + +class TrampolineCache : public Gen::XCodeBlock +{ +public: + void Init(); + void Shutdown(); + + const u8 *GetReadTrampoline(const InstructionInfo &info); + const u8 *GetWriteTrampoline(const InstructionInfo &info); +}; + + +class Jit64 : public Gen::XCodeBlock +{ +private: + struct JitState + { + u32 compilerPC; + u32 next_compilerPC; + u32 blockStart; + bool cancel; + UGeckoInstruction next_inst; // for easy peephole opt. + int blockSize; + int instructionNumber; + int downcountAmount; + int block_flags; + + bool isLastInstruction; + bool blockSetsQuantizers; + bool forceUnsafeLoad; + + int fifoBytesThisBlock; + + PPCAnalyst::BlockStats st; + PPCAnalyst::BlockRegStats gpa; + PPCAnalyst::BlockRegStats fpa; + PPCAnalyst::CodeOp *op; + u8* normalEntry; + + JitBlock *curBlock; + }; + + struct JitOptions + { + bool optimizeStack; + bool assumeFPLoadFromMem; + bool enableBlocklink; + bool fpAccurateFlags; + bool enableFastMem; + bool optimizeGatherPipe; + bool fastInterrupts; + bool accurateSinglePrecision; + }; + + JitBlockCache blocks; + TrampolineCache trampolines; + GPRRegCache gpr; + FPURegCache fpr; + + // The default code buffer. We keep it around to not have to alloc/dealloc a + // large chunk of memory for each recompiled block. 
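+	// (Its capacity is fixed at 32000 ops in the Jit64 constructor below;
+	// DoJit takes the buffer as a parameter, so callers such as SingleStep
+	// can hand in a smaller temporary buffer instead.)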
+ PPCAnalyst::CodeBuffer code_buffer; + +public: + Jit64() : code_buffer(32000) {} + ~Jit64() {} + + JitState js; + JitOptions jo; + IREmitter::IRBuilder ibuild; + + // Initialization, etc + + void Init(); + void Shutdown(); + + // Jit! + + void Jit(u32 em_address); + const u8* DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buffer, JitBlock *b); + + JitBlockCache *GetBlockCache() { return &blocks; } + + void NotifyBreakpoint(u32 em_address, bool set); + + void ClearCache() + { + blocks.Clear(); + trampolines.ClearCodeSpace(); + } + + // Run! + + void Run(); + void SingleStep(); + + const u8 *BackPatch(u8 *codePtr, int accessType, u32 em_address, CONTEXT *ctx); + +#define JIT_OPCODE 0 + + // Utilities for use by opcodes + + void WriteExit(u32 destination, int exit_num); + void WriteExitDestInEAX(int exit_num); + void WriteExceptionExit(u32 exception); + void WriteRfiExitDestInEAX(); + void WriteCallInterpreter(UGeckoInstruction _inst); + void Cleanup(); + + void UnsafeLoadRegToReg(Gen::X64Reg reg_addr, Gen::X64Reg reg_value, int accessSize, s32 offset = 0, bool signExtend = false); + void UnsafeWriteRegToReg(Gen::X64Reg reg_value, Gen::X64Reg reg_addr, int accessSize, s32 offset = 0); + void SafeLoadRegToEAX(Gen::X64Reg reg, int accessSize, s32 offset, bool signExtend = false); + void SafeWriteRegToReg(Gen::X64Reg reg_value, Gen::X64Reg reg_addr, int accessSize, s32 offset); + + void WriteToConstRamAddress(int accessSize, const Gen::OpArg& arg, u32 address); + void WriteFloatToConstRamAddress(const Gen::X64Reg& xmm_reg, u32 address); + void GenerateCarry(Gen::X64Reg temp_reg); + + void ForceSinglePrecisionS(Gen::X64Reg xmm); + void ForceSinglePrecisionP(Gen::X64Reg xmm); + void JitClearCA(); + void JitSetCA(); + void tri_op(int d, int a, int b, bool reversible, void (XEmitter::*op)(Gen::X64Reg, Gen::OpArg)); + typedef u32 (*Operation)(u32 a, u32 b); + void regimmop(int d, int a, bool binary, u32 value, Operation doop, void (XEmitter::*op)(int, const Gen::OpArg&, const Gen::OpArg&), bool Rc = false, bool carry = false); + void fp_tri_op(int d, int a, int b, bool reversible, bool dupe, void (XEmitter::*op)(Gen::X64Reg, Gen::OpArg)); + + void WriteCode(); + + // OPCODES + void unknown_instruction(UGeckoInstruction _inst); + void Default(UGeckoInstruction _inst); + void DoNothing(UGeckoInstruction _inst); + void HLEFunction(UGeckoInstruction _inst); + + void DynaRunTable4(UGeckoInstruction _inst); + void DynaRunTable19(UGeckoInstruction _inst); + void DynaRunTable31(UGeckoInstruction _inst); + void DynaRunTable59(UGeckoInstruction _inst); + void DynaRunTable63(UGeckoInstruction _inst); + + void addx(UGeckoInstruction inst); + void orx(UGeckoInstruction inst); + void xorx(UGeckoInstruction inst); + void andx(UGeckoInstruction inst); + void mulli(UGeckoInstruction inst); + void mulhwux(UGeckoInstruction inst); + void mullwx(UGeckoInstruction inst); + void divwux(UGeckoInstruction inst); + void srawix(UGeckoInstruction inst); + void srawx(UGeckoInstruction inst); + void addex(UGeckoInstruction inst); + + void extsbx(UGeckoInstruction inst); + void extshx(UGeckoInstruction inst); + + void sc(UGeckoInstruction _inst); + void rfi(UGeckoInstruction _inst); + + void bx(UGeckoInstruction inst); + void bclrx(UGeckoInstruction _inst); + void bcctrx(UGeckoInstruction _inst); + void bcx(UGeckoInstruction inst); + + void mtspr(UGeckoInstruction inst); + void mfspr(UGeckoInstruction inst); + void mtmsr(UGeckoInstruction inst); + void mfmsr(UGeckoInstruction inst); + void mftb(UGeckoInstruction inst); + 
void mtcrf(UGeckoInstruction inst); + void mfcr(UGeckoInstruction inst); + + void reg_imm(UGeckoInstruction inst); + + void ps_sel(UGeckoInstruction inst); + void ps_mr(UGeckoInstruction inst); + void ps_sign(UGeckoInstruction inst); //aggregate + void ps_arith(UGeckoInstruction inst); //aggregate + void ps_mergeXX(UGeckoInstruction inst); + void ps_maddXX(UGeckoInstruction inst); + void ps_rsqrte(UGeckoInstruction inst); + void ps_sum(UGeckoInstruction inst); + void ps_muls(UGeckoInstruction inst); + + void fp_arith_s(UGeckoInstruction inst); + + void fcmpx(UGeckoInstruction inst); + void fmrx(UGeckoInstruction inst); + + void cmpXX(UGeckoInstruction inst); + + void cntlzwx(UGeckoInstruction inst); + + void lfs(UGeckoInstruction inst); + void lfd(UGeckoInstruction inst); + void stfd(UGeckoInstruction inst); + void stfs(UGeckoInstruction inst); + void stfsx(UGeckoInstruction inst); + void psq_l(UGeckoInstruction inst); + void psq_st(UGeckoInstruction inst); + + void fmaddXX(UGeckoInstruction inst); + void stX(UGeckoInstruction inst); //stw sth stb + void lXz(UGeckoInstruction inst); + void lha(UGeckoInstruction inst); + void rlwinmx(UGeckoInstruction inst); + void rlwimix(UGeckoInstruction inst); + void rlwnmx(UGeckoInstruction inst); + void negx(UGeckoInstruction inst); + void slwx(UGeckoInstruction inst); + void srwx(UGeckoInstruction inst); + void dcbz(UGeckoInstruction inst); + void lfsx(UGeckoInstruction inst); + + void subfic(UGeckoInstruction inst); + void subfcx(UGeckoInstruction inst); + void subfx(UGeckoInstruction inst); + void subfex(UGeckoInstruction inst); + + void lbzx(UGeckoInstruction inst); + void lwzx(UGeckoInstruction inst); + void lhax(UGeckoInstruction inst); + + void lwzux(UGeckoInstruction inst); + + void stXx(UGeckoInstruction inst); + + void lmw(UGeckoInstruction inst); + void stmw(UGeckoInstruction inst); +}; + +extern Jit64 jit; + +void Jit(u32 em_address); + +void ProfiledReJit(); + +#endif + diff --git a/Source/Core/Core/Src/PowerPC/Jit64IL/JitAsm.cpp b/Source/Core/Core/Src/PowerPC/Jit64IL/JitAsm.cpp new file mode 100644 index 0000000000..dc54c4a33e --- /dev/null +++ b/Source/Core/Core/Src/PowerPC/Jit64IL/JitAsm.cpp @@ -0,0 +1,277 @@ +// Copyright (C) 2003-2008 Dolphin Project. + +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, version 2.0. + +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License 2.0 for more details. + +// A copy of the GPL 2.0 should have been included with the program. 
+// If not, see http://www.gnu.org/licenses/ + +// Official SVN repository and contact information can be found at +// http://code.google.com/p/dolphin-emu/ + +#include "ABI.h" +#include "x64Emitter.h" + +#include "../../HW/Memmap.h" + +#include "../PowerPC.h" +#include "../../CoreTiming.h" +#include "MemoryUtil.h" + +#include "ABI.h" +#include "Jit.h" +#include "JitCache.h" + +#include "../../HW/CPUCompare.h" +#include "../../HW/GPFifo.h" +#include "../../Core.h" +#include "JitAsm.h" + +using namespace Gen; +int blocksExecuted; + +static int temp32; + +bool compareEnabled = false; + +//TODO - make an option +//#if _DEBUG +static bool enableDebug = false; +//#else +// bool enableDebug = false; +//#endif + +static bool enableStatistics = false; + +//GLOBAL STATIC ALLOCATIONS x86 +//EAX - ubiquitous scratch register - EVERYBODY scratches this + +//GLOBAL STATIC ALLOCATIONS x64 +//EAX - ubiquitous scratch register - EVERYBODY scratches this +//RBX - Base pointer of memory +//R15 - Pointer to array of block pointers + +AsmRoutineManager asm_routines; + +// PLAN: no more block numbers - crazy opcodes just contain offset within +// dynarec buffer +// At this offset - 4, there is an int specifying the block number. + + +void AsmRoutineManager::Generate() +{ + enterCode = AlignCode16(); + ABI_PushAllCalleeSavedRegsAndAdjustStack(); +#ifndef _M_IX86 + // Two statically allocated registers. + MOV(64, R(RBX), Imm64((u64)Memory::base)); + MOV(64, R(R15), Imm64((u64)jit.GetBlockCache()->GetCodePointers())); //It's below 2GB so 32 bits are good enough +#endif + + const u8 *outerLoop = GetCodePtr(); + ABI_CallFunction(reinterpret_cast(&CoreTiming::Advance)); + FixupBranch skipToRealDispatch = J(); //skip the sync and compare first time + + dispatcher = GetCodePtr(); + //This is the place for CPUCompare! + + //The result of slice decrementation should be in flags if somebody jumped here + FixupBranch bail = J_CC(CC_S); + SetJumpTarget(skipToRealDispatch); + + dispatcherNoCheck = GetCodePtr(); + MOV(32, R(EAX), M(&PowerPC::ppcState.pc)); + dispatcherPcInEAX = GetCodePtr(); +#ifdef _M_IX86 + AND(32, R(EAX), Imm32(Memory::MEMVIEW32_MASK)); + MOV(32, R(EBX), Imm32((u32)Memory::base)); + MOV(32, R(EAX), MComplex(EBX, EAX, SCALE_1, 0)); +#else + MOV(32, R(EAX), MComplex(RBX, RAX, SCALE_1, 0)); +#endif + TEST(32, R(EAX), Imm32(0xFC)); + FixupBranch notfound = J_CC(CC_NZ); + BSWAP(32, EAX); + //IDEA - we have 26 bits, why not just use offsets from base of code? 
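+	// How the lookup above works: FinalizeBlock overwrites the first word of a
+	// compiled block in emulated RAM with (JIT_OPCODE << 26) | block_num. That
+	// word sits in memory big-endian, so the TEST against 0xFC checks the six
+	// opcode bits before the BSWAP; after the BSWAP, EAX holds the block
+	// number, which indexes the blockCodePointers array below (via EDX on x86,
+	// R15 on x64).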
+ if (enableStatistics) + { + ADD(32, M(&blocksExecuted), Imm8(1)); + } + if (enableDebug) + { + ADD(32, M(&PowerPC::ppcState.DebugCount), Imm8(1)); + } + //grab from list and jump to it +#ifdef _M_IX86 + MOV(32, R(EDX), ImmPtr(jit.GetBlockCache()->GetCodePointers())); + JMPptr(MComplex(EDX, EAX, 4, 0)); +#else + JMPptr(MComplex(R15, RAX, 8, 0)); +#endif + SetJumpTarget(notfound); + + //Ok, no block, let's jit +#ifdef _M_IX86 + ABI_AlignStack(4); + PUSH(32, M(&PowerPC::ppcState.pc)); + CALL(reinterpret_cast(&Jit)); + ABI_RestoreStack(4); +#else + MOV(32, R(ABI_PARAM1), M(&PowerPC::ppcState.pc)); + CALL((void *)&Jit); +#endif + JMP(dispatcherNoCheck); // no point in special casing this + + //FP blocks test for FPU available, jump here if false + fpException = AlignCode4(); + MOV(32, R(EAX), M(&PC)); + MOV(32, M(&NPC), R(EAX)); + OR(32, M(&PowerPC::ppcState.Exceptions), Imm32(EXCEPTION_FPU_UNAVAILABLE)); + ABI_CallFunction(reinterpret_cast(&PowerPC::CheckExceptions)); + MOV(32, R(EAX), M(&NPC)); + MOV(32, M(&PC), R(EAX)); + JMP(dispatcher); + + SetJumpTarget(bail); + doTiming = GetCodePtr(); + + ABI_CallFunction(reinterpret_cast(&CoreTiming::Advance)); + + testExceptions = GetCodePtr(); + TEST(32, M(&PowerPC::ppcState.Exceptions), Imm32(0xFFFFFFFF)); + FixupBranch skipExceptions = J_CC(CC_Z); + MOV(32, R(EAX), M(&PC)); + MOV(32, M(&NPC), R(EAX)); + ABI_CallFunction(reinterpret_cast(&PowerPC::CheckExceptions)); + MOV(32, R(EAX), M(&NPC)); + MOV(32, M(&PC), R(EAX)); + SetJumpTarget(skipExceptions); + + TEST(32, M((void*)&PowerPC::state), Imm32(0xFFFFFFFF)); + J_CC(CC_Z, outerLoop, true); + + //Landing pad for drec space + ABI_PopAllCalleeSavedRegsAndAdjustStack(); + RET(); + + breakpointBailout = GetCodePtr(); + //Landing pad for drec space + ABI_PopAllCalleeSavedRegsAndAdjustStack(); + RET(); + + GenerateCommon(); +} + + +void AsmRoutineManager::GenFifoWrite(int size) +{ + // Assume value in ABI_PARAM1 + PUSH(ESI); + if (size != 32) + PUSH(EDX); + BSWAP(size, ABI_PARAM1); + MOV(32, R(EAX), Imm32((u32)(u64)GPFifo::m_gatherPipe)); + MOV(32, R(ESI), M(&GPFifo::m_gatherPipeCount)); + if (size != 32) { + MOV(32, R(EDX), R(ABI_PARAM1)); + MOV(size, MComplex(RAX, RSI, 1, 0), R(EDX)); + } else { + MOV(size, MComplex(RAX, RSI, 1, 0), R(ABI_PARAM1)); + } + ADD(32, R(ESI), Imm8(size >> 3)); + MOV(32, M(&GPFifo::m_gatherPipeCount), R(ESI)); + if (size != 32) + POP(EDX); + POP(ESI); + RET(); +} + +void AsmRoutineManager::GenFifoFloatWrite() +{ + // Assume value in XMM0 + PUSH(ESI); + PUSH(EDX); + MOVSS(M(&temp32), XMM0); + MOV(32, R(EDX), M(&temp32)); + BSWAP(32, EDX); + MOV(32, R(EAX), Imm32((u32)(u64)GPFifo::m_gatherPipe)); + MOV(32, R(ESI), M(&GPFifo::m_gatherPipeCount)); + MOV(32, MComplex(RAX, RSI, 1, 0), R(EDX)); + ADD(32, R(ESI), Imm8(4)); + MOV(32, M(&GPFifo::m_gatherPipeCount), R(ESI)); + POP(EDX); + POP(ESI); + RET(); +} + +void AsmRoutineManager::GenFifoXmm64Write() +{ + // Assume value in XMM0. Assume pre-byteswapped (unlike the others here!) 
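+	// Same pattern as the other fifo writers above: append the value at
+	// GPFifo::m_gatherPipe + m_gatherPipeCount, then advance the count - by
+	// 8 bytes here, since a full 64-bit XMM value is written.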
+ PUSH(ESI); + MOV(32, R(EAX), Imm32((u32)(u64)GPFifo::m_gatherPipe)); + MOV(32, R(ESI), M(&GPFifo::m_gatherPipeCount)); + MOVQ_xmm(MComplex(RAX, RSI, 1, 0), XMM0); + ADD(32, R(ESI), Imm8(8)); + MOV(32, M(&GPFifo::m_gatherPipeCount), R(ESI)); + POP(ESI); + RET(); +} + +void AsmRoutineManager::GenerateCommon() +{ + // USES_CR + computeRc = AlignCode16(); + CMP(32, R(EAX), Imm8(0)); + FixupBranch pLesser = J_CC(CC_L); + FixupBranch pGreater = J_CC(CC_G); + MOV(8, M(&PowerPC::ppcState.cr_fast[0]), Imm8(0x2)); // _x86Reg == 0 + RET(); + SetJumpTarget(pGreater); + MOV(8, M(&PowerPC::ppcState.cr_fast[0]), Imm8(0x4)); // _x86Reg > 0 + RET(); + SetJumpTarget(pLesser); + MOV(8, M(&PowerPC::ppcState.cr_fast[0]), Imm8(0x8)); // _x86Reg < 0 + RET(); + + fifoDirectWrite8 = AlignCode4(); + GenFifoWrite(8); + fifoDirectWrite16 = AlignCode4(); + GenFifoWrite(16); + fifoDirectWrite32 = AlignCode4(); + GenFifoWrite(32); + fifoDirectWriteFloat = AlignCode4(); + GenFifoFloatWrite(); + fifoDirectWriteXmm64 = AlignCode4(); + GenFifoXmm64Write(); + + doReJit = AlignCode4(); + ABI_AlignStack(0); + CALL(reinterpret_cast(&ProfiledReJit)); + ABI_RestoreStack(0); + SUB(32, M(&CoreTiming::downcount), Imm8(0)); + JMP(dispatcher, true); + + computeRcFp = AlignCode16(); + //CMPSD(R(XMM0), M(&zero), + // TODO + + // Fast write routines - special case the most common hardware write + // TODO: use this. + // Even in x86, the param values will be in the right registers. + /* + const u8 *fastMemWrite8 = AlignCode16(); + CMP(32, R(ABI_PARAM2), Imm32(0xCC008000)); + FixupBranch skip_fast_write = J_CC(CC_NE, false); + MOV(32, EAX, M(&m_gatherPipeCount)); + MOV(8, MDisp(EAX, (u32)&m_gatherPipe), ABI_PARAM1); + ADD(32, 1, M(&m_gatherPipeCount)); + RET(); + SetJumpTarget(skip_fast_write); + CALL((void *)&Memory::Write_U8);*/ +} diff --git a/Source/Core/Core/Src/PowerPC/Jit64IL/JitAsm.h b/Source/Core/Core/Src/PowerPC/Jit64IL/JitAsm.h new file mode 100644 index 0000000000..e5ef03647c --- /dev/null +++ b/Source/Core/Core/Src/PowerPC/Jit64IL/JitAsm.h @@ -0,0 +1,88 @@ +// Copyright (C) 2003-2008 Dolphin Project. + +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, version 2.0. + +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License 2.0 for more details. + +// A copy of the GPL 2.0 should have been included with the program. +// If not, see http://www.gnu.org/licenses/ + +// Official SVN repository and contact information can be found at +// http://code.google.com/p/dolphin-emu/ + +#ifndef _JITASM_H +#define _JITASM_H + +#include "x64Emitter.h" + +// In Dolphin, we don't use inline assembly. Instead, we generate all machine-near +// code at runtime. In the case of fixed code like this, after writing it, we write +// protect the memory, essentially making it work just like precompiled code. + +// There are some advantages to this approach: +// 1) No need to setup an external assembler in the build. +// 2) Cross platform, as long as it's x86/x64. +// 3) Can optimize code at runtime for the specific CPU model. +// There aren't really any disadvantages other than having to maintain a x86 emitter, +// which we have to do anyway :) +// +// To add a new asm routine, just add another const here, and add the code to Generate. 
+// Also, possibly increase the size of the code buffer. + +class AsmRoutineManager : public Gen::XCodeBlock +{ +private: + void Generate(); + void GenerateCommon(); + void GenFifoWrite(int size); + void GenFifoFloatWrite(); + void GenFifoXmm64Write(); + +public: + void Init() { + AllocCodeSpace(8192); + Generate(); + WriteProtect(); + } + + void Shutdown() { + FreeCodeSpace(); + } + + + // Public generated functions. Just CALL(M((void*)func)) them. + + const u8 *enterCode; + + const u8 *dispatcher; + const u8 *dispatcherNoCheck; + const u8 *dispatcherPcInEAX; + + const u8 *fpException; + const u8 *computeRc; + const u8 *computeRcFp; + const u8 *testExceptions; + const u8 *dispatchPcInEAX; + const u8 *doTiming; + + const u8 *fifoDirectWrite8; + const u8 *fifoDirectWrite16; + const u8 *fifoDirectWrite32; + const u8 *fifoDirectWriteFloat; + const u8 *fifoDirectWriteXmm64; + + const u8 *breakpointBailout; + + const u8 *doReJit; + + bool compareEnabled; +}; + +extern AsmRoutineManager asm_routines; + +#endif diff --git a/Source/Core/Core/Src/PowerPC/Jit64IL/JitBackpatch.cpp b/Source/Core/Core/Src/PowerPC/Jit64IL/JitBackpatch.cpp new file mode 100644 index 0000000000..65e5bbdea2 --- /dev/null +++ b/Source/Core/Core/Src/PowerPC/Jit64IL/JitBackpatch.cpp @@ -0,0 +1,215 @@ +// Copyright (C) 2003-2008 Dolphin Project. + +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, version 2.0. + +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License 2.0 for more details. + +// A copy of the GPL 2.0 should have been included with the program. +// If not, see http://www.gnu.org/licenses/ + +// Official SVN repository and contact information can be found at +// http://code.google.com/p/dolphin-emu/ + +#include + +#include "Common.h" +#include "disasm.h" +#include "JitAsm.h" +#include "../../HW/Memmap.h" + +#include "x64Emitter.h" +#include "ABI.h" +#include "Thunk.h" +#include "x64Analyzer.h" + +#include "StringUtil.h" +#include "Jit.h" + +using namespace Gen; + +extern u8 *trampolineCodePtr; + +void BackPatchError(const std::string &text, u8 *codePtr, u32 emAddress) { + u64 code_addr = (u64)codePtr; + disassembler disasm; + char disbuf[256]; + memset(disbuf, 0, 256); +#ifdef _M_IX86 + disasm.disasm32(0, code_addr, codePtr, disbuf); +#else + disasm.disasm64(0, code_addr, codePtr, disbuf); +#endif + PanicAlert("%s\n\n" + "Error encountered accessing emulated address %08x.\n" + "Culprit instruction: \n%s\nat %08x%08x", + text.c_str(), emAddress, disbuf, code_addr>>32, code_addr); + return; +} + + +void TrampolineCache::Init() +{ + AllocCodeSpace(1024 * 1024); +} + +void TrampolineCache::Shutdown() +{ + AllocCodeSpace(1024 * 1024); +} + +// Extremely simplistic - just generate the requested trampoline. May reuse them in the future. +const u8 *TrampolineCache::GetReadTrampoline(const InstructionInfo &info) +{ + if (GetSpaceLeft() < 1024) + PanicAlert("Trampoline cache full"); + + X64Reg addrReg = (X64Reg)info.scaledReg; + X64Reg dataReg = (X64Reg)info.regOperandReg; + const u8 *trampoline = GetCodePtr(); +#ifdef _M_X64 + // It's a read. Easy. 
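+	// Shape of the generated stub, for orientation: save the caller-saved
+	// registers, move the effective address into ABI_PARAM1 (adding the
+	// displacement if any), call Memory::Read_U32 through the thunk protector,
+	// restore the registers, and copy the result from EAX into the register
+	// the patched MOV originally targeted.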
+ ABI_PushAllCallerSavedRegsAndAdjustStack(); + if (addrReg != ABI_PARAM1) + MOV(32, R(ABI_PARAM1), R((X64Reg)addrReg)); + if (info.displacement) { + ADD(32, R(ABI_PARAM1), Imm32(info.displacement)); + } + switch (info.operandSize) { + case 4: + CALL(thunks.ProtectFunction((void *)&Memory::Read_U32, 1)); + break; + } + ABI_PopAllCallerSavedRegsAndAdjustStack(); + MOV(32, R(dataReg), R(EAX)); + RET(); +#endif + return trampoline; +} + +// Extremely simplistic - just generate the requested trampoline. May reuse them in the future. +const u8 *TrampolineCache::GetWriteTrampoline(const InstructionInfo &info) +{ + if (GetSpaceLeft() < 1024) + PanicAlert("Trampoline cache full"); + + X64Reg addrReg = (X64Reg)info.scaledReg; + X64Reg dataReg = (X64Reg)info.regOperandReg; + if (dataReg != EAX) + PanicAlert("Backpatch write - not through EAX"); + + const u8 *trampoline = GetCodePtr(); + +#ifdef _M_X64 + + // It's a write. Yay. Remember that we don't have to be super efficient since it's "just" a + // hardware access - we can take shortcuts. + //if (emAddress == 0xCC008000) + // PanicAlert("caught a fifo write"); + CMP(32, R(addrReg), Imm32(0xCC008000)); + FixupBranch skip_fast = J_CC(CC_NE, false); + MOV(32, R(ABI_PARAM1), R((X64Reg)dataReg)); + CALL((void*)asm_routines.fifoDirectWrite32); + RET(); + SetJumpTarget(skip_fast); + ABI_PushAllCallerSavedRegsAndAdjustStack(); + if (addrReg != ABI_PARAM1) { + MOV(32, R(ABI_PARAM1), R((X64Reg)dataReg)); + MOV(32, R(ABI_PARAM2), R((X64Reg)addrReg)); + } else { + MOV(32, R(ABI_PARAM2), R((X64Reg)addrReg)); + MOV(32, R(ABI_PARAM1), R((X64Reg)dataReg)); + } + if (info.displacement) { + ADD(32, R(ABI_PARAM2), Imm32(info.displacement)); + } + switch (info.operandSize) { + case 4: + CALL(thunks.ProtectFunction((void *)&Memory::Write_U32, 2)); + break; + } + ABI_PopAllCallerSavedRegsAndAdjustStack(); + RET(); +#endif + + return trampoline; +} + + +// This generates some fairly heavy trampolines, but: +// 1) It's really necessary. We don't know anything about the context. +// 2) It doesn't really hurt. Only instructions that access I/O will get these, and there won't be +// that many of them in a typical program/game. +const u8 *Jit64::BackPatch(u8 *codePtr, int accessType, u32 emAddress, CONTEXT *ctx) +{ +#ifdef _M_X64 + if (!jit.IsInCodeSpace(codePtr)) + return 0; // this will become a regular crash real soon after this + + InstructionInfo info; + if (!DisassembleMov(codePtr, info, accessType)) { + BackPatchError("BackPatch - failed to disassemble MOV instruction", codePtr, emAddress); + } + + /* + if (info.isMemoryWrite) { + if (!Memory::IsRAMAddress(emAddress, true)) { + PanicAlert("Exception: Caught write to invalid address %08x", emAddress); + return; + } + BackPatchError("BackPatch - determined that MOV is write, not yet supported and should have been caught before", + codePtr, emAddress); + }*/ + + if (info.operandSize != 4) { + BackPatchError(StringFromFormat("BackPatch - no support for operand size %i", info.operandSize), codePtr, emAddress); + } + + if (info.otherReg != RBX) + PanicAlert("BackPatch : Base reg not RBX." + "\n\nAttempted to access %08x.", emAddress); + + if (accessType == OP_ACCESS_WRITE) + PanicAlert("BackPatch : Currently only supporting reads." + "\n\nAttempted to write to %08x.", emAddress); + + // In the first iteration, we assume that all accesses are 32-bit. We also only deal with reads. 
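+	// Read patching, sketched: the faulting MOV is overwritten with a 5-byte
+	// CALL to a stub from TrampolineCache::GetReadTrampoline, and the leftover
+	// bytes of the original MOV plus the BSWAP that follows it (2 or 3 bytes,
+	// depending on whether it carries a REX prefix) are padded out with NOPs
+	// so the instruction stream stays valid.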
+ if (accessType == 0) + { + XEmitter emitter(codePtr); + int bswapNopCount; + // Check the following BSWAP for REX byte + if ((codePtr[info.instructionSize] & 0xF0) == 0x40) + bswapNopCount = 3; + else + bswapNopCount = 2; + const u8 *trampoline = trampolines.GetReadTrampoline(info); + emitter.CALL((void *)trampoline); + emitter.NOP((int)info.instructionSize + bswapNopCount - 5); + return codePtr; + } + else if (accessType == 1) + { + // TODO: special case FIFO writes. Also, support 32-bit mode. + // Also, debug this so that it actually works correctly :P + XEmitter emitter(codePtr - 2); + // We know it's EAX so the BSWAP before will be two byte. Overwrite it. + const u8 *trampoline = trampolines.GetWriteTrampoline(info); + emitter.CALL((void *)trampoline); + emitter.NOP((int)info.instructionSize - 3); + if (info.instructionSize < 3) + PanicAlert("instruction too small"); + // We entered here with a BSWAP-ed EAX. We'll have to swap it back. + ctx->Rax = Common::swap32((u32)ctx->Rax); + return codePtr - 2; + } + return 0; +#else + return 0; +#endif +} + diff --git a/Source/Core/Core/Src/PowerPC/Jit64IL/JitCache.cpp b/Source/Core/Core/Src/PowerPC/Jit64IL/JitCache.cpp new file mode 100644 index 0000000000..43ef7260a0 --- /dev/null +++ b/Source/Core/Core/Src/PowerPC/Jit64IL/JitCache.cpp @@ -0,0 +1,346 @@ +// Copyright (C) 2003-2008 Dolphin Project. + +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, version 2.0. + +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License 2.0 for more details. + +// A copy of the GPL 2.0 should have been included with the program. +// If not, see http://www.gnu.org/licenses/ + +// Official SVN repository and contact information can be found at +// http://code.google.com/p/dolphin-emu/ + +// Enable define below to enable oprofile integration. For this to work, +// it requires at least oprofile version 0.9.4, and changing the build +// system to link the Dolphin executable against libopagent. Since the +// dependency is a little inconvenient and this is possibly a slight +// performance hit, it's not enabled by default, but it's useful for +// locating performance issues. + +#define OPROFILE_REPORT + +#include "Common.h" +#include "../../Core.h" +#include "MemoryUtil.h" + +#include "../../HW/Memmap.h" +#include "../../CoreTiming.h" + +#include "../PowerPC.h" +#include "../PPCTables.h" +#include "../PPCAnalyst.h" + +#include "x64Emitter.h" +#include "x64Analyzer.h" + +#include "Jit.h" +#include "JitCache.h" +#include "JitAsm.h" + +#include "disasm.h" + +#ifdef OPROFILE_REPORT +#include +#endif + +#ifdef OPROFILE_REPORT + op_agent_t agent; +#endif + +using namespace Gen; + +#define INVALID_EXIT 0xFFFFFFFF + + +bool JitBlock::ContainsAddress(u32 em_address) +{ + // WARNING - THIS DOES NOT WORK WITH INLINING ENABLED. 
+ return (em_address >= originalAddress && em_address < originalAddress + originalSize); +} + + bool JitBlockCache::IsFull() const + { + return GetNumBlocks() >= MAX_NUM_BLOCKS - 1; + } + + void JitBlockCache::Init() + { + MAX_NUM_BLOCKS = 65536*2; + if (Core::g_CoreStartupParameter.bJITUnlimitedCache) + { + MAX_NUM_BLOCKS = 65536*8; + } + +#ifdef OPROFILE_REPORT + agent = op_open_agent(); +#endif + blocks = new JitBlock[MAX_NUM_BLOCKS]; + blockCodePointers = new const u8*[MAX_NUM_BLOCKS]; + + Clear(); + } + + void JitBlockCache::Shutdown() + { + delete [] blocks; + delete [] blockCodePointers; + blocks = 0; + blockCodePointers = 0; + num_blocks = 0; +#ifdef OPROFILE_REPORT + op_close_agent(agent); +#endif + } + + // This clears the JIT cache. It's called from JitCache.cpp when the JIT cache + // is full and when saving and loading states. + void JitBlockCache::Clear() + { + Core::DisplayMessage("Cleared code cache.", 3000); + // Is destroying the blocks really necessary? + for (int i = 0; i < num_blocks; i++) + { + DestroyBlock(i, false); + } + links_to.clear(); + num_blocks = 0; + memset(blockCodePointers, 0, sizeof(u8*)*MAX_NUM_BLOCKS); + } + + void JitBlockCache::DestroyBlocksWithFlag(BlockFlag death_flag) + { + for (int i = 0; i < num_blocks; i++) + { + if (blocks[i].flags & death_flag) + { + DestroyBlock(i, false); + } + } + } + + void JitBlockCache::Reset() + { + Shutdown(); + Init(); + } + + JitBlock *JitBlockCache::GetBlock(int no) + { + return &blocks[no]; + } + + int JitBlockCache::GetNumBlocks() const + { + return num_blocks; + } + + bool JitBlockCache::RangeIntersect(int s1, int e1, int s2, int e2) const + { + // check if any endpoint is inside the other range + if ((s1 >= s2 && s1 <= e2) || + (e1 >= s2 && e1 <= e2) || + (s2 >= s1 && s2 <= e1) || + (e2 >= s1 && e2 <= e1)) + return true; + else + return false; + } + + int JitBlockCache::AllocateBlock(u32 em_address) + { + JitBlock &b = blocks[num_blocks]; + b.invalid = false; + b.originalAddress = em_address; + b.originalFirstOpcode = Memory::ReadFast32(em_address); + b.exitAddress[0] = INVALID_EXIT; + b.exitAddress[1] = INVALID_EXIT; + b.exitPtrs[0] = 0; + b.exitPtrs[1] = 0; + b.linkStatus[0] = false; + b.linkStatus[1] = false; + num_blocks++; //commit the current block + return num_blocks - 1; + } + + void JitBlockCache::FinalizeBlock(int block_num, bool block_link, const u8 *code_ptr) + { + blockCodePointers[block_num] = code_ptr; + JitBlock &b = blocks[block_num]; + Memory::WriteUnchecked_U32((JIT_OPCODE << 26) | block_num, blocks[block_num].originalAddress); + if (block_link) + { + for (int i = 0; i < 2; i++) + { + if (b.exitAddress[i] != INVALID_EXIT) + links_to.insert(std::pair(b.exitAddress[i], block_num)); + } + + LinkBlock(block_num); + LinkBlockExits(block_num); + } + +#ifdef OPROFILE_REPORT + char buf[100]; + sprintf(buf, "EmuCode%x", b.originalAddress); + const u8* blockStart = blockCodePointers[block_num]; + op_write_native_code(agent, buf, (uint64_t)blockStart, + blockStart, b.codeSize); +#endif + } + + const u8 **JitBlockCache::GetCodePointers() + { + return blockCodePointers; + } + + int JitBlockCache::GetBlockNumberFromStartAddress(u32 addr) + { + if (!blocks) + return -1; + u32 code = Memory::ReadFast32(addr); + if ((code >> 26) == JIT_OPCODE) + { + // Jitted code. 
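+		// The low 26 bits are the block number that FinalizeBlock baked into
+		// the first opcode; the checks below guard against stale markers left
+		// behind after the cache was cleared or the block was replaced.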
+ unsigned int block = code & 0x03FFFFFF; + if (block >= (unsigned int)num_blocks) { + return -1; + } + + if (blocks[block].originalAddress != addr) + { + //_assert_msg_(DYNA_REC, 0, "GetBlockFromAddress %08x - No match - This is BAD", addr); + return -1; + } + return block; + } + else + { + return -1; + } + } + +void JitBlockCache::GetBlockNumbersFromAddress(u32 em_address, std::vector *block_numbers) +{ + for (int i = 0; i < num_blocks; i++) + if (blocks[i].ContainsAddress(em_address)) + block_numbers->push_back(i); +} + + u32 JitBlockCache::GetOriginalCode(u32 address) + { + int num = GetBlockNumberFromStartAddress(address); + if (num == -1) + return Memory::ReadUnchecked_U32(address); + else + return blocks[num].originalFirstOpcode; + } + + CompiledCode JitBlockCache::GetCompiledCodeFromBlock(int blockNumber) + { + return (CompiledCode)blockCodePointers[blockNumber]; + } + + //Block linker + //Make sure to have as many blocks as possible compiled before calling this + //It's O(N), so it's fast :) + //Can be faster by doing a queue for blocks to link up, and only process those + //Should probably be done + + void JitBlockCache::LinkBlockExits(int i) + { + JitBlock &b = blocks[i]; + if (b.invalid) + { + // This block is dead. Don't relink it. + return; + } + for (int e = 0; e < 2; e++) + { + if (b.exitAddress[e] != INVALID_EXIT && !b.linkStatus[e]) + { + int destinationBlock = GetBlockNumberFromStartAddress(b.exitAddress[e]); + if (destinationBlock != -1) + { + XEmitter emit(b.exitPtrs[e]); + emit.JMP(blocks[destinationBlock].checkedEntry, true); + b.linkStatus[e] = true; + } + } + } + } + + using namespace std; + + void JitBlockCache::LinkBlock(int i) + { + LinkBlockExits(i); + JitBlock &b = blocks[i]; + std::map::iterator iter; + pair::iterator, multimap::iterator> ppp; + // equal_range(b) returns pair representing the range + // of element with key b + ppp = links_to.equal_range(b.originalAddress); + if (ppp.first == ppp.second) + return; + for (multimap::iterator iter2 = ppp.first; iter2 != ppp.second; ++iter2) { + // PanicAlert("Linking block %i to block %i", iter2->second, i); + LinkBlockExits(iter2->second); + } + } + + void JitBlockCache::DestroyBlock(int blocknum, bool invalidate) + { + u32 codebytes = (JIT_OPCODE << 26) | blocknum; //generate from i + JitBlock &b = blocks[blocknum]; + b.invalid = 1; + if (codebytes == Memory::ReadFast32(b.originalAddress)) + { + //nobody has changed it, good + Memory::WriteUnchecked_U32(b.originalFirstOpcode, b.originalAddress); + } + else if (!invalidate) + { + //PanicAlert("Detected code overwrite"); + //else, we may be in trouble, since we apparently know of this block but it's been + //overwritten. We should have thrown it out before, on instruction cache invalidate or something. + //Not ne cessarily bad though , if a game has simply thrown away a lot of code and is now using the space + //for something else, then it's fine. + LOG(MASTER_LOG, "WARNING - ClearCache detected code overwrite @ %08x", blocks[blocknum].originalAddress); + } + + // We don't unlink blocks, we just send anyone who tries to run them back to the dispatcher. + // Not entirely ideal, but .. pretty good. + + // TODO - make sure that the below stuff really is safe. 
+ + // Spurious entrances from previously linked blocks can only come through checkedEntry + XEmitter emit((u8 *)b.checkedEntry); + emit.MOV(32, M(&PC), Imm32(b.originalAddress)); + emit.JMP(asm_routines.dispatcher, true); + + emit.SetCodePtr((u8 *)blockCodePointers[blocknum]); + emit.MOV(32, M(&PC), Imm32(b.originalAddress)); + emit.JMP(asm_routines.dispatcher, true); + } + + + void JitBlockCache::InvalidateCodeRange(u32 address, u32 length) + { + if (!jit.jo.enableBlocklink) + return; + return; + //This is slow but should be safe (zelda needs it for block linking) + for (int i = 0; i < num_blocks; i++) + { + if (RangeIntersect(blocks[i].originalAddress, blocks[i].originalAddress + blocks[i].originalSize, + address, address + length)) + { + DestroyBlock(i, true); + } + } + } diff --git a/Source/Core/Core/Src/PowerPC/Jit64IL/JitCache.h b/Source/Core/Core/Src/PowerPC/Jit64IL/JitCache.h new file mode 100644 index 0000000000..f75a5e1db2 --- /dev/null +++ b/Source/Core/Core/Src/PowerPC/Jit64IL/JitCache.h @@ -0,0 +1,116 @@ +// Copyright (C) 2003-2008 Dolphin Project. + +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, version 2.0. + +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License 2.0 for more details. + +// A copy of the GPL 2.0 should have been included with the program. +// If not, see http://www.gnu.org/licenses/ + +// Official SVN repository and contact information can be found at +// http://code.google.com/p/dolphin-emu/ + +#ifndef _JITCACHE_H +#define _JITCACHE_H + +#include +#include + +#include "../Gekko.h" +#include "../PPCAnalyst.h" + +#ifdef _WIN32 +#include +#endif + +enum BlockFlag +{ + BLOCK_USE_GQR0 = 0x1, BLOCK_USE_GQR1 = 0x2, BLOCK_USE_GQR2 = 0x4, BLOCK_USE_GQR3 = 0x8, + BLOCK_USE_GQR4 = 0x10, BLOCK_USE_GQR5 = 0x20, BLOCK_USE_GQR6 = 0x40, BLOCK_USE_GQR7 = 0x80, +}; + +// TODO(ector) - optimize this struct for size +struct JitBlock +{ + u32 exitAddress[2]; // 0xFFFFFFFF == unknown + u8 *exitPtrs[2]; // to be able to rewrite the exit jump + bool linkStatus[2]; + + u32 originalAddress; + u32 originalFirstOpcode; //to be able to restore + u32 codeSize; + u32 originalSize; + int runCount; // for profiling. + +#ifdef _WIN32 + // we don't really need to save start and stop + // TODO (mb2): ticStart and ticStop -> "local var" mean "in block" ... low priority ;) + LARGE_INTEGER ticStart; // for profiling - time. + LARGE_INTEGER ticStop; // for profiling - time. + LARGE_INTEGER ticCounter; // for profiling - time. 
+#endif + const u8 *checkedEntry; + bool invalid; + int flags; + + bool ContainsAddress(u32 em_address); +}; + +typedef void (*CompiledCode)(); + +class JitBlockCache +{ + const u8 **blockCodePointers; + JitBlock *blocks; + int num_blocks; + std::multimap links_to; + int MAX_NUM_BLOCKS; + + bool RangeIntersect(int s1, int e1, int s2, int e2) const; + void LinkBlockExits(int i); + void LinkBlock(int i); + +public: + JitBlockCache() {} + + int AllocateBlock(u32 em_address); + void FinalizeBlock(int block_num, bool block_link, const u8 *code_ptr); + + void Clear(); + void Init(); + void Shutdown(); + void Reset(); + + bool IsFull() const; + + // Code Cache + JitBlock *GetBlock(int block_num); + int GetNumBlocks() const; + const u8 **GetCodePointers(); + + // Fast way to get a block. Only works on the first ppc instruction of a block. + int GetBlockNumberFromStartAddress(u32 em_address); + + // slower, but can get numbers from within blocks, not just the first instruction. + // WARNING! WILL NOT WORK WITH INLINING ENABLED (not yet a feature but will be soon) + // Returns a list of block numbers - only one block can start at a particular address, but they CAN overlap. + // This one is slow so should only be used for one-shots from the debugger UI, not for anything during runtime. + void GetBlockNumbersFromAddress(u32 em_address, std::vector *block_numbers); + + u32 GetOriginalCode(u32 address); + CompiledCode GetCompiledCodeFromBlock(int blockNumber); + + // DOES NOT WORK CORRECTLY WITH INLINING + void InvalidateCodeRange(u32 em_address, u32 length); + void DestroyBlock(int blocknum, bool invalidate); + + // Not currently used + void DestroyBlocksWithFlag(BlockFlag death_flag); +}; + +#endif diff --git a/Source/Core/Core/Src/PowerPC/Jit64IL/JitRegCache.cpp b/Source/Core/Core/Src/PowerPC/Jit64IL/JitRegCache.cpp new file mode 100644 index 0000000000..b5f987aa1c --- /dev/null +++ b/Source/Core/Core/Src/PowerPC/Jit64IL/JitRegCache.cpp @@ -0,0 +1,395 @@ +// Copyright (C) 2003-2008 Dolphin Project. + +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, version 2.0. + +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License 2.0 for more details. + +// A copy of the GPL 2.0 should have been included with the program. 
+// If not, see http://www.gnu.org/licenses/ + +// Official SVN repository and contact information can be found at +// http://code.google.com/p/dolphin-emu/ + +#include "../PowerPC.h" +#include "../PPCTables.h" +#include "../PPCAnalyst.h" +#include "Jit.h" +#include "JitCache.h" +#include "JitAsm.h" +#include "JitRegCache.h" + +using namespace Gen; +using namespace PowerPC; + +void RegCache::Start(PPCAnalyst::BlockRegStats &stats) +{ + for (int i = 0; i < NUMXREGS; i++) + { + xregs[i].free = true; + xregs[i].dirty = false; + xlocks[i] = false; + } + for (int i = 0; i < 32; i++) + { + regs[i].location = GetDefaultLocation(i); + regs[i].away = false; + } + + // todo: sort to find the most popular regs + /* + int maxPreload = 2; + for (int i = 0; i < 32; i++) + { + if (stats.numReads[i] > 2 || stats.numWrites[i] >= 2) + { + LoadToX64(i, true, false); //stats.firstRead[i] <= stats.firstWrite[i], false); + maxPreload--; + if (!maxPreload) + break; + } + }*/ + //Find top regs - preload them (load bursts ain't bad) + //But only preload IF written OR reads >= 3 +} + +// these are powerpc reg indices +void RegCache::Lock(int p1, int p2, int p3, int p4) +{ + locks[p1] = true; + if (p2 != 0xFF) locks[p2] = true; + if (p3 != 0xFF) locks[p3] = true; + if (p4 != 0xFF) locks[p4] = true; +} + +// these are x64 reg indices +void RegCache::LockX(int x1, int x2, int x3, int x4) +{ + if (xlocks[x1]) { + PanicAlert("RegCache: x %i already locked!"); + } + xlocks[x1] = true; + if (x2 != 0xFF) xlocks[x2] = true; + if (x3 != 0xFF) xlocks[x3] = true; + if (x4 != 0xFF) xlocks[x4] = true; +} + +bool RegCache::IsFreeX(int xreg) const +{ + return xregs[xreg].free && !xlocks[xreg]; +} + +void RegCache::UnlockAll() +{ + for (int i = 0; i < 32; i++) + locks[i] = false; +} + +void RegCache::UnlockAllX() +{ + for (int i = 0; i < NUMXREGS; i++) + xlocks[i] = false; +} + +X64Reg RegCache::GetFreeXReg() +{ + int aCount; + const int *aOrder = GetAllocationOrder(aCount); + for (int i = 0; i < aCount; i++) + { + X64Reg xr = (X64Reg)aOrder[i]; + if (!xlocks[xr] && xregs[xr].free) + { + return (X64Reg)xr; + } + } + //Okay, not found :( Force grab one + + //TODO - add a pass to grab xregs whose ppcreg is not used in the next 3 instructions + for (int i = 0; i < aCount; i++) + { + X64Reg xr = (X64Reg)aOrder[i]; + if (xlocks[xr]) + continue; + int preg = xregs[xr].ppcReg; + if (!locks[preg]) + { + StoreFromX64(preg); + return xr; + } + } + //Still no dice? Die! 
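+	// Getting here means every register in the allocation order is either
+	// locked or pinned by a locked PPC register, so there is nothing left
+	// to spill.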
+ _assert_msg_(DYNA_REC, 0, "Regcache ran out of regs"); + return (X64Reg) -1; +} + +void RegCache::SaveState() +{ + memcpy(saved_locks, locks, sizeof(locks)); + memcpy(saved_xlocks, xlocks, sizeof(xlocks)); + memcpy(saved_regs, regs, sizeof(regs)); + memcpy(saved_xregs, xregs, sizeof(xregs)); +} + +void RegCache::LoadState() +{ + memcpy(xlocks, saved_xlocks, sizeof(xlocks)); + memcpy(locks, saved_locks, sizeof(locks)); + memcpy(regs, saved_regs, sizeof(regs)); + memcpy(xregs, saved_xregs, sizeof(xregs)); +} + +void RegCache::FlushR(X64Reg reg) +{ + if (reg >= NUMXREGS) + PanicAlert("Flushing non existent reg"); + if (!xregs[reg].free) + { + StoreFromX64(xregs[reg].ppcReg); + } +} + +void RegCache::SanityCheck() const +{ + for (int i = 0; i < 32; i++) { + if (regs[i].away) { + if (regs[i].location.IsSimpleReg()) { + Gen::X64Reg simple = regs[i].location.GetSimpleReg(); + if (xlocks[simple]) { + PanicAlert("%08x : PPC Reg %i is in locked x64 register %i", /*js.compilerPC*/ 0, i, regs[i].location.GetSimpleReg()); + } + if (xregs[simple].ppcReg != i) { + PanicAlert("%08x : Xreg/ppcreg mismatch"); + } + } + } + } +} + +void RegCache::DiscardRegContentsIfCached(int preg) +{ + if (regs[preg].away && regs[preg].location.IsSimpleReg()) + { + xregs[regs[preg].location.GetSimpleReg()].free = true; + xregs[regs[preg].location.GetSimpleReg()].dirty = false; + regs[preg].away = false; + } +} + + +void GPRRegCache::SetImmediate32(int preg, u32 immValue) +{ + DiscardRegContentsIfCached(preg); + regs[preg].away = true; + regs[preg].location = Imm32(immValue); +} + +void GPRRegCache::Start(PPCAnalyst::BlockRegStats &stats) +{ + RegCache::Start(stats); +} + +void FPURegCache::Start(PPCAnalyst::BlockRegStats &stats) +{ + RegCache::Start(stats); +} + +const int *GPRRegCache::GetAllocationOrder(int &count) +{ + static const int allocationOrder[] = + { +#ifdef _M_X64 +#ifdef _WIN32 + RSI, RDI, R12, R13, R14, R8, R9, R10, R11 //, RCX +#else + RBP, R12, R13, R14, R8, R9, R10, R11, //, RCX +#endif +#elif _M_IX86 + ESI, EDI, EBX, EBP, EDX, ECX, +#endif + }; + count = sizeof(allocationOrder) / sizeof(const int); + return allocationOrder; +} + +const int *FPURegCache::GetAllocationOrder(int &count) +{ + static const int allocationOrder[] = + { +#ifdef _M_X64 + XMM6, XMM7, XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, XMM2, XMM3, XMM4, XMM5 +#elif _M_IX86 + XMM2, XMM3, XMM4, XMM5, XMM6, XMM7, +#endif + }; + count = sizeof(allocationOrder) / sizeof(int); + return allocationOrder; +} + +OpArg GPRRegCache::GetDefaultLocation(int reg) const +{ + return M(&ppcState.gpr[reg]); +} + +OpArg FPURegCache::GetDefaultLocation(int reg) const +{ + return M(&ppcState.ps[reg][0]); +} + +void RegCache::KillImmediate(int preg) +{ + if (regs[preg].away && regs[preg].location.IsImm()) + { + LoadToX64(preg, true, true); + } +} + +void GPRRegCache::LoadToX64(int i, bool doLoad, bool makeDirty) +{ + PanicAlert("BADNESS!"); + + if (!regs[i].away && regs[i].location.IsImm()) + PanicAlert("Bad immedaite"); + + if (!regs[i].away || (regs[i].away && regs[i].location.IsImm())) + { + X64Reg xr = GetFreeXReg(); + if (xregs[xr].dirty) PanicAlert("Xreg already dirty"); + if (xlocks[xr]) PanicAlert("GetFreeXReg returned locked register"); + xregs[xr].free = false; + xregs[xr].ppcReg = i; + xregs[xr].dirty = makeDirty || regs[i].location.IsImm(); + OpArg newloc = ::Gen::R(xr); + if (doLoad) + emit->MOV(32, newloc, regs[i].location); + for (int j = 0; j < 32; j++) + { + if (i != j && regs[j].location.IsSimpleReg() && 
regs[j].location.GetSimpleReg() == xr) + { + Crash(); + } + } + regs[i].away = true; + regs[i].location = newloc; + } + else + { + // reg location must be simplereg; memory locations + // and immediates are taken care of above. + xregs[RX(i)].dirty |= makeDirty; + } + if (xlocks[RX(i)]) { + PanicAlert("Seriously WTF, this reg should have been flushed"); + } +} + +void GPRRegCache::StoreFromX64(int i) +{ + if (regs[i].away) + { + bool doStore; + if (regs[i].location.IsSimpleReg()) + { + X64Reg xr = RX(i); + xregs[xr].free = true; + xregs[xr].ppcReg = -1; + doStore = xregs[xr].dirty; + xregs[xr].dirty = false; + } + else + { + //must be immediate - do nothing + doStore = true; + } + OpArg newLoc = GetDefaultLocation(i); + // if (doStore) //<-- Breaks JIT compilation + emit->MOV(32, newLoc, regs[i].location); + regs[i].location = newLoc; + regs[i].away = false; + } +} + +void FPURegCache::LoadToX64(int i, bool doLoad, bool makeDirty) +{ + _assert_msg_(DYNA_REC, !regs[i].location.IsImm(), "WTF - load - imm"); + if (!regs[i].away) + { + // Reg is at home in the memory register file. Let's pull it out. + X64Reg xr = GetFreeXReg(); + _assert_msg_(DYNA_REC, xr >= 0 && xr < NUMXREGS, "WTF - load - invalid reg"); + xregs[xr].ppcReg = i; + xregs[xr].free = false; + xregs[xr].dirty = makeDirty; + OpArg newloc = ::Gen::R(xr); + if (doLoad) + { + if (!regs[i].location.IsImm() && (regs[i].location.offset & 0xF)) + { + PanicAlert("WARNING - misaligned fp register location %i", i); + } + emit->MOVAPD(xr, regs[i].location); + } + regs[i].location = newloc; + regs[i].away = true; + } else { + // There are no immediates in the FPR reg file, so we already had this in a register. Make dirty as necessary. + xregs[RX(i)].dirty |= makeDirty; + } +} + +void FPURegCache::StoreFromX64(int i) +{ + _assert_msg_(DYNA_REC, !regs[i].location.IsImm(), "WTF - store - imm"); + if (regs[i].away) + { + X64Reg xr = regs[i].location.GetSimpleReg(); + _assert_msg_(DYNA_REC, xr >= 0 && xr < NUMXREGS, "WTF - store - invalid reg"); + xregs[xr].free = true; + xregs[xr].dirty = false; + xregs[xr].ppcReg = -1; + OpArg newLoc = GetDefaultLocation(i); + emit->MOVAPD(newLoc, xr); + regs[i].location = newLoc; + regs[i].away = false; + } + else + { + // _assert_msg_(DYNA_REC,0,"already stored"); + } +} + +void RegCache::Flush(FlushMode mode) +{ + for (int i = 0; i < NUMXREGS; i++) { + if (xlocks[i]) + PanicAlert("Somone forgot to unlock X64 reg %i.", i); + } + for (int i = 0; i < 32; i++) + { + if (locks[i]) + { + PanicAlert("Somebody forgot to unlock PPC reg %i.", i); + } + if (regs[i].away) + { + if (regs[i].location.IsSimpleReg()) + { + X64Reg xr = RX(i); + StoreFromX64(i); + xregs[xr].dirty = false; + } + else if (regs[i].location.IsImm()) + { + StoreFromX64(i); + } + else + { + _assert_msg_(DYNA_REC,0,"Jit64 - Flush unhandled case, reg %i", i); + } + } + } +} diff --git a/Source/Core/Core/Src/PowerPC/Jit64IL/JitRegCache.h b/Source/Core/Core/Src/PowerPC/Jit64IL/JitRegCache.h new file mode 100644 index 0000000000..fa65c596c5 --- /dev/null +++ b/Source/Core/Core/Src/PowerPC/Jit64IL/JitRegCache.h @@ -0,0 +1,150 @@ +// Copyright (C) 2003-2008 Dolphin Project. + +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, version 2.0. + +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the +// GNU General Public License 2.0 for more details. + +// A copy of the GPL 2.0 should have been included with the program. +// If not, see http://www.gnu.org/licenses/ + +// Official SVN repository and contact information can be found at +// http://code.google.com/p/dolphin-emu/ + +#ifndef _JITREGCACHE_H +#define _JITREGCACHE_H + +#include "x64Emitter.h" + +using namespace Gen; +enum FlushMode +{ + FLUSH_ALL +}; + +enum GrabMode +{ + M_READ = 1, + M_WRITE = 2, + M_READWRITE = 3, +}; + +struct PPCCachedReg +{ + OpArg location; + bool away; // value not in source register +}; + +struct X64CachedReg +{ + int ppcReg; + bool dirty; + bool free; +}; + +typedef int XReg; +typedef int PReg; + +#ifdef _M_X64 +#define NUMXREGS 16 +#elif _M_IX86 +#define NUMXREGS 8 +#endif + +class RegCache +{ +private: + bool locks[32]; + bool saved_locks[32]; + bool saved_xlocks[NUMXREGS]; + +protected: + bool xlocks[NUMXREGS]; + PPCCachedReg regs[32]; + X64CachedReg xregs[NUMXREGS]; + + PPCCachedReg saved_regs[32]; + X64CachedReg saved_xregs[NUMXREGS]; + + void DiscardRegContentsIfCached(int preg); + virtual const int *GetAllocationOrder(int &count) = 0; + + XEmitter *emit; + +public: + virtual ~RegCache() {} + virtual void Start(PPCAnalyst::BlockRegStats &stats) = 0; + + void SetEmitter(XEmitter *emitter) {emit = emitter;} + + void FlushR(X64Reg reg); + void FlushR(X64Reg reg, X64Reg reg2) {FlushR(reg); FlushR(reg2);} + void FlushLockX(X64Reg reg) { + FlushR(reg); + LockX(reg); + } + void FlushLockX(X64Reg reg1, X64Reg reg2) { + FlushR(reg1); FlushR(reg2); + LockX(reg1); LockX(reg2); + } + virtual void Flush(FlushMode mode); + virtual void Flush(PPCAnalyst::CodeOp *op) {Flush(FLUSH_ALL);} + void SanityCheck() const; + void KillImmediate(int preg); + + //TODO - instead of doload, use "read", "write" + //read only will not set dirty flag + virtual void LoadToX64(int preg, bool doLoad = true, bool makeDirty = true) = 0; + virtual void StoreFromX64(int preg) = 0; + + const OpArg &R(int preg) const {return regs[preg].location;} + X64Reg RX(int preg) const + { + if (regs[preg].away && regs[preg].location.IsSimpleReg()) + return regs[preg].location.GetSimpleReg(); + PanicAlert("Not so simple - %i", preg); + return (X64Reg)-1; + } + virtual OpArg GetDefaultLocation(int reg) const = 0; + + // Register locking. + void Lock(int p1, int p2=0xff, int p3=0xff, int p4=0xff); + void LockX(int x1, int x2=0xff, int x3=0xff, int x4=0xff); + void UnlockAll(); + void UnlockAllX(); + + bool IsFreeX(int xreg) const; + + X64Reg GetFreeXReg(); + + void SaveState(); + void LoadState(); +}; + +class GPRRegCache : public RegCache +{ +public: + void Start(PPCAnalyst::BlockRegStats &stats); + void LoadToX64(int preg, bool doLoad = true, bool makeDirty = true); + void StoreFromX64(int preg); + OpArg GetDefaultLocation(int reg) const; + const int *GetAllocationOrder(int &count); + void SetImmediate32(int preg, u32 immValue); +}; + + +class FPURegCache : public RegCache +{ +public: + void Start(PPCAnalyst::BlockRegStats &stats); + void LoadToX64(int preg, bool doLoad = true, bool makeDirty = true); + void StoreFromX64(int preg); + const int *GetAllocationOrder(int &count); + OpArg GetDefaultLocation(int reg) const; +}; + +#endif diff --git a/Source/Core/Core/Src/PowerPC/Jit64IL/Jit_Branch.cpp b/Source/Core/Core/Src/PowerPC/Jit64IL/Jit_Branch.cpp new file mode 100644 index 0000000000..9be9ee9ff9 --- /dev/null +++ b/Source/Core/Core/Src/PowerPC/Jit64IL/Jit_Branch.cpp @@ -0,0 +1,200 @@ +// Copyright (C) 2003-2008 Dolphin Project. 
+ +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, version 2.0. + +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License 2.0 for more details. + +// A copy of the GPL 2.0 should have been included with the program. +// If not, see http://www.gnu.org/licenses/ + +// Official SVN repository and contact information can be found at +// http://code.google.com/p/dolphin-emu/ +#include "Common.h" +#include "Thunk.h" + +#include "../../Core.h" +#include "../PowerPC.h" +#include "../../CoreTiming.h" +#include "../PPCTables.h" +#include "x64Emitter.h" + +#include "Jit.h" +#include "JitRegCache.h" +#include "JitCache.h" +#include "JitAsm.h" + +// The branches are known good, or at least reasonably good. +// No need for a disable-mechanism. + +// If defined, clears CR0 at blr and bl-s. If the assumption that +// flags never carry over between functions holds, then the task for +// an optimizer becomes much easier. + +// #define ACID_TEST + +// Zelda and many more games seem to pass the Acid Test. + +using namespace Gen; + + void Jit64::sc(UGeckoInstruction inst) + { + if(Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITBranchOff) + {Default(inst); return;} // turn off from debugger + + gpr.Flush(FLUSH_ALL); + fpr.Flush(FLUSH_ALL); + WriteExceptionExit(EXCEPTION_SYSCALL); + } + + void Jit64::rfi(UGeckoInstruction inst) + { + if(Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITBranchOff) + {Default(inst); return;} // turn off from debugger + + gpr.Flush(FLUSH_ALL); + fpr.Flush(FLUSH_ALL); + //Bits SRR1[0, 5-9, 16-23, 25-27, 30-31] are placed into the corresponding bits of the MSR. + //MSR[13] is set to 0. + const u32 mask = 0x87C0FF73; + // MSR = (MSR & ~mask) | (SRR1 & mask); + MOV(32, R(EAX), M(&MSR)); + MOV(32, R(ECX), M(&SRR1)); + AND(32, R(EAX), Imm32(~mask)); + AND(32, R(ECX), Imm32(mask)); + OR(32, R(EAX), R(ECX)); + // MSR &= 0xFFFDFFFF; //TODO: VERIFY + AND(32, R(EAX), Imm32(0xFFFDFFFF)); + MOV(32, M(&MSR), R(EAX)); + // NPC = SRR0; + MOV(32, R(EAX), M(&SRR0)); + WriteRfiExitDestInEAX(); + } + + void Jit64::bx(UGeckoInstruction inst) + { + if (inst.LK) + ibuild.EmitStoreLink(ibuild.EmitIntConst(js.compilerPC + 4)); + + u32 destination; + if (inst.AA) + destination = SignExt26(inst.LI << 2); + else + destination = js.compilerPC + SignExt26(inst.LI << 2); + + ibuild.EmitBranchUncond(ibuild.EmitIntConst(destination)); + } + + // TODO - optimize to hell and beyond + // TODO - make nice easy to optimize special cases for the most common + // variants of this instruction. 
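+ // Rough reading of the IR sequence below: the CR condition is materialized
+ // as LoadCR(BI >> 2) ANDed with the bit mask 8 >> (BI & 3), optionally XORed
+ // with the same mask depending on BO bit 8; CTR is decremented unless
+ // BO_DONT_DECREMENT_FLAG is set and then compared against zero; the two
+ // tests are OR-ed together and fed to EmitBranchCond, and an unconditional
+ // branch to compilerPC + 4 is emitted for the fall-through path.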
+ void Jit64::bcx(UGeckoInstruction inst) + { + if (inst.LK) + ibuild.EmitStoreLink( + ibuild.EmitIntConst(js.compilerPC + 4)); + + IREmitter::InstLoc CRTest = 0, CTRTest = 0; + if ((inst.BO & 16) == 0) // Test a CR bit + { + IREmitter::InstLoc CRReg = ibuild.EmitLoadCR(inst.BI >> 2); + IREmitter::InstLoc CRCmp = ibuild.EmitIntConst(8 >> (inst.BI & 3)); + CRTest = ibuild.EmitAnd(CRReg, CRCmp); + if (inst.BO & 8) + CRTest = ibuild.EmitXor(CRTest, CRCmp); + } + + if ((inst.BO & BO_DONT_DECREMENT_FLAG) == 0) { + IREmitter::InstLoc c = ibuild.EmitLoadCTR(); + c = ibuild.EmitSub(c, ibuild.EmitIntConst(1)); + ibuild.EmitStoreCTR(c); + } + + if ((inst.BO & 4) == 0) { + IREmitter::InstLoc c = ibuild.EmitLoadCTR(); + if (!(inst.BO & 2)) { + CTRTest = ibuild.EmitICmpEq(c, + ibuild.EmitIntConst(0)); + } else { + CTRTest = c; + } + } + + IREmitter::InstLoc Test = CRTest; + if (CTRTest) { + if (Test) + Test = ibuild.EmitOr(Test, CTRTest); + else + Test = CTRTest; + } + + if (!Test) { + PanicAlert("Unconditional conditional branch?!"); + } + + u32 destination; + if(inst.AA) + destination = SignExt16(inst.BD << 2); + else + destination = js.compilerPC + SignExt16(inst.BD << 2); + + ibuild.EmitBranchCond(Test, ibuild.EmitIntConst(destination)); + ibuild.EmitBranchUncond(ibuild.EmitIntConst(js.compilerPC + 4)); + } + + void Jit64::bcctrx(UGeckoInstruction inst) + { + if(Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITBranchOff) + {Default(inst); return;} // turn off from debugger + + gpr.Flush(FLUSH_ALL); + fpr.Flush(FLUSH_ALL); + + // bool fastway = true; + + if ((inst.BO & 16) == 0) + { + PanicAlert("Bizarro bcctrx %08x, not supported.", inst.hex); + _assert_msg_(DYNA_REC, 0, "Bizarro bcctrx"); + /* + fastway = false; + MOV(32, M(&PC), Imm32(js.compilerPC+4)); + MOV(32, R(EAX), M(&CR)); + XOR(32, R(ECX), R(ECX)); + AND(32, R(EAX), Imm32(0x80000000 >> inst.BI)); + + CCFlags branch; + if(inst.BO & 8) + branch = CC_NZ; + else + branch = CC_Z; + */ + // TODO(ector): Why is this commented out? + //SETcc(branch, R(ECX)); + // check for EBX + //TEST(32, R(ECX), R(ECX)); + //linkEnd = J_CC(branch); + } + // NPC = CTR & 0xfffffffc; + MOV(32, R(EAX), M(&CTR)); + if (inst.LK) + MOV(32, M(&LR), Imm32(js.compilerPC + 4)); // LR = PC + 4; + AND(32, R(EAX), Imm32(0xFFFFFFFC)); + WriteExitDestInEAX(0); + } + + + void Jit64::bclrx(UGeckoInstruction inst) + { + if (inst.hex == 0x4e800020) { + ibuild.EmitBranchUncond(ibuild.EmitLoadLink()); + return; + } + Default(inst); + return; + } + diff --git a/Source/Core/Core/Src/PowerPC/Jit64IL/Jit_FloatingPoint.cpp b/Source/Core/Core/Src/PowerPC/Jit64IL/Jit_FloatingPoint.cpp new file mode 100644 index 0000000000..64bb657a40 --- /dev/null +++ b/Source/Core/Core/Src/PowerPC/Jit64IL/Jit_FloatingPoint.cpp @@ -0,0 +1,224 @@ +// Copyright (C) 2003-2008 Dolphin Project. + +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, version 2.0. + +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License 2.0 for more details. + +// A copy of the GPL 2.0 should have been included with the program. 
+// If not, see http://www.gnu.org/licenses/ + +// Official SVN repository and contact information can be found at +// http://code.google.com/p/dolphin-emu/ + +#include "Common.h" + +#include "../../Core.h" +#include "../PowerPC.h" +#include "../PPCTables.h" +#include "x64Emitter.h" + +#include "Jit.h" +#include "JitCache.h" +#include "JitRegCache.h" + +#define INSTRUCTION_START +// #define INSTRUCTION_START Default(inst); return; + + const u64 GC_ALIGNED16(psSignBits2[2]) = {0x8000000000000000ULL, 0x8000000000000000ULL}; + const u64 GC_ALIGNED16(psAbsMask2[2]) = {0x7FFFFFFFFFFFFFFFULL, 0x7FFFFFFFFFFFFFFFULL}; + const double GC_ALIGNED16(psOneOne2[2]) = {1.0, 1.0}; + + void Jit64::fp_tri_op(int d, int a, int b, bool reversible, bool dupe, void (XEmitter::*op)(Gen::X64Reg, Gen::OpArg)) + { + fpr.Lock(d, a, b); + if (d == a) + { + fpr.LoadToX64(d, true); + (this->*op)(fpr.RX(d), fpr.R(b)); + } + else if (d == b && reversible) + { + fpr.LoadToX64(d, true); + (this->*op)(fpr.RX(d), fpr.R(a)); + } + else if (a != d && b != d) + { + // Sources different from d, can use rather quick solution + fpr.LoadToX64(d, !dupe); + MOVSD(fpr.RX(d), fpr.R(a)); + (this->*op)(fpr.RX(d), fpr.R(b)); + } + else if (b != d) + { + fpr.LoadToX64(d, !dupe); + MOVSD(XMM0, fpr.R(b)); + MOVSD(fpr.RX(d), fpr.R(a)); + (this->*op)(fpr.RX(d), Gen::R(XMM0)); + } + else // Other combo, must use two temps :( + { + MOVSD(XMM0, fpr.R(a)); + MOVSD(XMM1, fpr.R(b)); + fpr.LoadToX64(d, !dupe); + (this->*op)(XMM0, Gen::R(XMM1)); + MOVSD(fpr.RX(d), Gen::R(XMM0)); + } + if (dupe) { + ForceSinglePrecisionS(fpr.RX(d)); + MOVDDUP(fpr.RX(d), fpr.R(d)); + } + fpr.UnlockAll(); + } + + void Jit64::fp_arith_s(UGeckoInstruction inst) + { + if(Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITFloatingPointOff) + {Default(inst); return;} // turn off from debugger + INSTRUCTION_START; + if (inst.Rc) { + Default(inst); return; + } + bool dupe = inst.OPCD == 59; + switch (inst.SUBOP5) + { + case 18: fp_tri_op(inst.FD, inst.FA, inst.FB, false, dupe, &XEmitter::DIVSD); break; //div + case 20: fp_tri_op(inst.FD, inst.FA, inst.FB, false, dupe, &XEmitter::SUBSD); break; //sub + case 21: fp_tri_op(inst.FD, inst.FA, inst.FB, true, dupe, &XEmitter::ADDSD); break; //add + case 23: //sel + Default(inst); + break; + case 24: //res + Default(inst); + break; + case 25: fp_tri_op(inst.FD, inst.FA, inst.FC, true, dupe, &XEmitter::MULSD); break; //mul + default: + _assert_msg_(DYNA_REC, 0, "fp_arith_s WTF!!!"); + } + } + + void Jit64::fmaddXX(UGeckoInstruction inst) + { + if(Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITFloatingPointOff) + {Default(inst); return;} // turn off from debugger + INSTRUCTION_START; + if (inst.Rc) { + Default(inst); return; + } + + bool single_precision = inst.OPCD == 59; + + int a = inst.FA; + int b = inst.FB; + int c = inst.FC; + int d = inst.FD; + + fpr.Lock(a, b, c, d); + MOVSD(XMM0, fpr.R(a)); + switch (inst.SUBOP5) + { + case 28: //msub + MULSD(XMM0, fpr.R(c)); + SUBSD(XMM0, fpr.R(b)); + break; + case 29: //madd + MULSD(XMM0, fpr.R(c)); + ADDSD(XMM0, fpr.R(b)); + break; + case 30: //nmsub + MULSD(XMM0, fpr.R(c)); + SUBSD(XMM0, fpr.R(b)); + XORPD(XMM0, M((void*)&psSignBits2)); + break; + case 31: //nmadd + MULSD(XMM0, fpr.R(c)); + ADDSD(XMM0, fpr.R(b)); + XORPD(XMM0, M((void*)&psSignBits2)); + break; + } + fpr.LoadToX64(d, false); + //YES it is necessary to dupe the result :( + //TODO : analysis - does the top reg get used? If so, dupe, if not, don't. 
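+ // (Reason for the dupe: the single-precision forms of these ops write the
+ // result to both ps0 and ps1 of the paired register on the Gekko, which is
+ // why the rounded value is broadcast into both halves with MOVDDUP below.)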
+ if (single_precision) { + ForceSinglePrecisionS(XMM0); + MOVDDUP(fpr.RX(d), R(XMM0)); + } else { + MOVSD(fpr.RX(d), R(XMM0)); + } + fpr.UnlockAll(); + } + + void Jit64::fmrx(UGeckoInstruction inst) + { + if(Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITFloatingPointOff) + {Default(inst); return;} // turn off from debugger + INSTRUCTION_START; + if (inst.Rc) { + Default(inst); return; + } + int d = inst.FD; + int b = inst.FB; + fpr.LoadToX64(d, true); // we don't want to destroy the high bit + MOVSD(fpr.RX(d), fpr.R(b)); + } + + void Jit64::fcmpx(UGeckoInstruction inst) + { + if(Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITFloatingPointOff) + {Default(inst); return;} // turn off from debugger + INSTRUCTION_START; + if (jo.fpAccurateFlags) + { + Default(inst); + return; + } + bool ordered = inst.SUBOP10 == 32; + /* + double fa = rPS0(_inst.FA); + double fb = rPS0(_inst.FB); + u32 compareResult; + + if(IsNAN(fa) || IsNAN(fb)) compareResult = 1; + else if(fa < fb) compareResult = 8; + else if(fa > fb) compareResult = 4; + else compareResult = 2; + + FPSCR.FPRF = compareResult; + CR = (CR & (~(0xf0000000 >> (_inst.CRFD * 4)))) | (compareResult << ((7 - _inst.CRFD) * 4)); +*/ + int a = inst.FA; + int b = inst.FB; + int crf = inst.CRFD; + int shift = crf * 4; + //FPSCR + //XOR(32,R(EAX),R(EAX)); + + fpr.Lock(a,b); + if (a != b) + fpr.LoadToX64(a, true); + + // USES_CR + if (ordered) + COMISD(fpr.R(a).GetSimpleReg(), fpr.R(b)); + else + UCOMISD(fpr.R(a).GetSimpleReg(), fpr.R(b)); + FixupBranch pLesser = J_CC(CC_B); + FixupBranch pGreater = J_CC(CC_A); + // _x86Reg == 0 + MOV(8, M(&PowerPC::ppcState.cr_fast[crf]), Imm8(0x2)); + FixupBranch continue1 = J(); + // _x86Reg > 0 + SetJumpTarget(pGreater); + MOV(8, M(&PowerPC::ppcState.cr_fast[crf]), Imm8(0x4)); + FixupBranch continue2 = J(); + // _x86Reg < 0 + SetJumpTarget(pLesser); + MOV(8, M(&PowerPC::ppcState.cr_fast[crf]), Imm8(0x8)); + SetJumpTarget(continue1); + SetJumpTarget(continue2); + fpr.UnlockAll(); + } diff --git a/Source/Core/Core/Src/PowerPC/Jit64IL/Jit_Integer.cpp b/Source/Core/Core/Src/PowerPC/Jit64IL/Jit_Integer.cpp new file mode 100644 index 0000000000..04cf273211 --- /dev/null +++ b/Source/Core/Core/Src/PowerPC/Jit64IL/Jit_Integer.cpp @@ -0,0 +1,520 @@ +// Copyright (C) 2003-2008 Dolphin Project. + +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, version 2.0. + +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License 2.0 for more details. + +// A copy of the GPL 2.0 should have been included with the program. 
+// If not, see http://www.gnu.org/licenses/ + +// Official SVN repository and contact information can be found at +// http://code.google.com/p/dolphin-emu/ + +#include "../../Core.h" // include "Common.h", "CoreParameter.h", SCoreStartupParameter +#include "../PowerPC.h" +#include "../PPCTables.h" +#include "x64Emitter.h" + +#include "Jit.h" +#include "JitCache.h" +#include "JitRegCache.h" +#include "JitAsm.h" + +// #define INSTRUCTION_START Default(inst); return; +#define INSTRUCTION_START + + static void ComputeRC(IREmitter::IRBuilder& ibuild, + IREmitter::InstLoc val) { + IREmitter::InstLoc res = + ibuild.EmitICmpCRSigned(val, ibuild.EmitIntConst(0)); + ibuild.EmitStoreCR(res, 0); + } + + void Jit64::reg_imm(UGeckoInstruction inst) + { + int d = inst.RD, a = inst.RA, s = inst.RS; + IREmitter::InstLoc val, test, c; + switch (inst.OPCD) + { + case 14: //addi + val = ibuild.EmitIntConst(inst.SIMM_16); + if (a) + val = ibuild.EmitAdd(ibuild.EmitLoadGReg(a), val); + ibuild.EmitStoreGReg(val, d); + break; + case 15: //addis + val = ibuild.EmitIntConst(inst.SIMM_16 << 16); + if (a) + val = ibuild.EmitAdd(ibuild.EmitLoadGReg(a), val); + ibuild.EmitStoreGReg(val, d); + break; + case 24: //ori + val = ibuild.EmitIntConst(inst.UIMM); + val = ibuild.EmitOr(ibuild.EmitLoadGReg(s), val); + ibuild.EmitStoreGReg(val, a); + break; + case 25: //oris + val = ibuild.EmitIntConst(inst.UIMM << 16); + val = ibuild.EmitOr(ibuild.EmitLoadGReg(s), val); + ibuild.EmitStoreGReg(val, a); + break; + case 28: //andi + val = ibuild.EmitIntConst(inst.UIMM); + val = ibuild.EmitAnd(ibuild.EmitLoadGReg(s), val); + ibuild.EmitStoreGReg(val, a); + ComputeRC(ibuild, val); + break; + case 29: //andis + val = ibuild.EmitIntConst(inst.UIMM << 16); + val = ibuild.EmitAnd(ibuild.EmitLoadGReg(s), val); + ibuild.EmitStoreGReg(val, a); + ComputeRC(ibuild, val); + break; + case 26: //xori + val = ibuild.EmitIntConst(inst.UIMM); + val = ibuild.EmitXor(ibuild.EmitLoadGReg(s), val); + ibuild.EmitStoreGReg(val, a); + break; + case 27: //xoris + val = ibuild.EmitIntConst(inst.UIMM << 16); + val = ibuild.EmitXor(ibuild.EmitLoadGReg(s), val); + ibuild.EmitStoreGReg(val, a); + break; + case 12: //addic + case 13: //addic_rc + c = ibuild.EmitIntConst(inst.SIMM_16); + val = ibuild.EmitAdd(ibuild.EmitLoadGReg(a), c); + ibuild.EmitStoreGReg(val, d); + test = ibuild.EmitICmpUgt(c, val); + ibuild.EmitStoreCarry(test); + if (inst.OPCD == 13) + ComputeRC(ibuild, val); + break; + default: + Default(inst); + break; + } + } + + void Jit64::cmpXX(UGeckoInstruction inst) + { + IREmitter::InstLoc lhs, rhs, res; + lhs = ibuild.EmitLoadGReg(inst.RA); + if (inst.OPCD == 31) { + rhs = ibuild.EmitLoadGReg(inst.RB); + if (inst.SUBOP10 == 32) { + res = ibuild.EmitICmpCRUnsigned(lhs, rhs); + } else { + res = ibuild.EmitICmpCRSigned(lhs, rhs); + } + } else if (inst.OPCD == 10) { + rhs = ibuild.EmitIntConst(inst.UIMM); + res = ibuild.EmitICmpCRUnsigned(lhs, rhs); + } else { // inst.OPCD == 11 + rhs = ibuild.EmitIntConst(inst.SIMM_16); + res = ibuild.EmitICmpCRSigned(lhs, rhs); + } + + ibuild.EmitStoreCR(res, inst.CRFD); + } + + void Jit64::orx(UGeckoInstruction inst) + { + IREmitter::InstLoc val = ibuild.EmitLoadGReg(inst.RB); + val = ibuild.EmitOr(ibuild.EmitLoadGReg(inst.RS), val); + ibuild.EmitStoreGReg(val, inst.RA); + if (inst.Rc) + ComputeRC(ibuild, val); + } + + + // m_GPR[_inst.RA] = m_GPR[_inst.RS] ^ m_GPR[_inst.RB]; + void Jit64::xorx(UGeckoInstruction inst) + { + IREmitter::InstLoc val = ibuild.EmitLoadGReg(inst.RB); + val = 
ibuild.EmitXor(ibuild.EmitLoadGReg(inst.RS), val); + ibuild.EmitStoreGReg(val, inst.RA); + if (inst.Rc) + ComputeRC(ibuild, val); + } + + void Jit64::andx(UGeckoInstruction inst) + { + IREmitter::InstLoc val = ibuild.EmitLoadGReg(inst.RB); + val = ibuild.EmitAnd(ibuild.EmitLoadGReg(inst.RS), val); + ibuild.EmitStoreGReg(val, inst.RA); + if (inst.Rc) + ComputeRC(ibuild, val); + } + + void Jit64::extsbx(UGeckoInstruction inst) + { + IREmitter::InstLoc val = ibuild.EmitLoadGReg(inst.RS); + val = ibuild.EmitSExt8(val); + ibuild.EmitStoreGReg(val, inst.RA); + if (inst.Rc) + ComputeRC(ibuild, val); + } + + void Jit64::extshx(UGeckoInstruction inst) + { + IREmitter::InstLoc val = ibuild.EmitLoadGReg(inst.RS); + val = ibuild.EmitSExt16(val); + ibuild.EmitStoreGReg(val, inst.RA); + if (inst.Rc) + ComputeRC(ibuild, val); + } + + void Jit64::subfic(UGeckoInstruction inst) + { + if(Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITIntegerOff) + {Default(inst); return;} // turn off from debugger + + INSTRUCTION_START; + int a = inst.RA, d = inst.RD; + gpr.FlushLockX(ECX); + gpr.Lock(a, d); + gpr.LoadToX64(d, a == d, true); + int imm = inst.SIMM_16; + MOV(32, R(EAX), gpr.R(a)); + NOT(32, R(EAX)); + ADD(32, R(EAX), Imm32(imm + 1)); + MOV(32, gpr.R(d), R(EAX)); + //GenerateCarry(ECX); + gpr.UnlockAll(); + gpr.UnlockAllX(); + // This instruction has no RC flag + } + + void Jit64::subfcx(UGeckoInstruction inst) + { + INSTRUCTION_START; + Default(inst); + return; + /* + u32 a = m_GPR[_inst.RA]; + u32 b = m_GPR[_inst.RB]; + m_GPR[_inst.RD] = b - a; + SetCarry(a == 0 || Helper_Carry(b, 0-a)); + + if (_inst.OE) PanicAlert("OE: subfcx"); + if (_inst.Rc) Helper_UpdateCR0(m_GPR[_inst.RD]); + */ + } + + void Jit64::subfex(UGeckoInstruction inst) + { + INSTRUCTION_START; + Default(inst); + return; + /* + u32 a = m_GPR[_inst.RA]; + u32 b = m_GPR[_inst.RB]; + int carry = GetCarry(); + m_GPR[_inst.RD] = (~a) + b + carry; + SetCarry(Helper_Carry(~a, b) || Helper_Carry((~a) + b, carry)); + + if (_inst.OE) PanicAlert("OE: subfcx"); + if (_inst.Rc) Helper_UpdateCR0(m_GPR[_inst.RD]); + */ + } + + void Jit64::subfx(UGeckoInstruction inst) + { + if (inst.OE) PanicAlert("OE: subfx"); + IREmitter::InstLoc val = ibuild.EmitLoadGReg(inst.RB); + val = ibuild.EmitSub(val, ibuild.EmitLoadGReg(inst.RA)); + ibuild.EmitStoreGReg(val, inst.RD); + if (inst.Rc) + ComputeRC(ibuild, val); + } + + void Jit64::mulli(UGeckoInstruction inst) + { + IREmitter::InstLoc val = ibuild.EmitLoadGReg(inst.RA); + val = ibuild.EmitMul(val, ibuild.EmitIntConst(inst.SIMM_16)); + ibuild.EmitStoreGReg(val, inst.RD); + } + + void Jit64::mullwx(UGeckoInstruction inst) + { + IREmitter::InstLoc val = ibuild.EmitLoadGReg(inst.RB); + val = ibuild.EmitMul(ibuild.EmitLoadGReg(inst.RA), val); + ibuild.EmitStoreGReg(val, inst.RD); + if (inst.Rc) + ComputeRC(ibuild, val); + } + + void Jit64::mulhwux(UGeckoInstruction inst) + { + if(Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITIntegerOff) + {Default(inst); return;} // turn off from debugger + + INSTRUCTION_START; + int a = inst.RA, b = inst.RB, d = inst.RD; + gpr.FlushLockX(EDX); + gpr.Lock(a, b, d); + if (d != a && d != b) { + gpr.LoadToX64(d, false, true); + } else { + gpr.LoadToX64(d, true, true); + } + if (gpr.RX(d) == EDX) + PanicAlert("mulhwux : WTF"); + MOV(32, R(EAX), gpr.R(a)); + gpr.KillImmediate(b); + MUL(32, gpr.R(b)); + gpr.UnlockAll(); + gpr.UnlockAllX(); + if (inst.Rc) { + MOV(32, R(EAX), R(EDX)); + MOV(32, gpr.R(d), R(EDX)); + // result is already in eax 
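+ // (After the MUL above, EDX:EAX holds the full 64-bit product and EDX is
+ // the high word that mulhwux returns; computeRc appears to take its input
+ // in EAX, hence the extra copy when Rc is set.)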
+ CALL((u8*)asm_routines.computeRc); + } else { + MOV(32, gpr.R(d), R(EDX)); + } + } + + // skipped some of the special handling in here - if we get crashes, let the interpreter handle this op + void Jit64::divwux(UGeckoInstruction inst) { + Default(inst); return; + + int a = inst.RA, b = inst.RB, d = inst.RD; + gpr.FlushLockX(EDX); + gpr.Lock(a, b, d); + if (d != a && d != b) { + gpr.LoadToX64(d, false, true); + } else { + gpr.LoadToX64(d, true, true); + } + MOV(32, R(EAX), gpr.R(a)); + XOR(32, R(EDX), R(EDX)); + gpr.KillImmediate(b); + DIV(32, gpr.R(b)); + MOV(32, gpr.R(d), R(EAX)); + gpr.UnlockAll(); + gpr.UnlockAllX(); + if (inst.Rc) { + CALL((u8*)asm_routines.computeRc); + } + } + + u32 Helper_Mask(u8 mb, u8 me) + { + return (((mb > me) ? + ~(((u32)-1 >> mb) ^ ((me >= 31) ? 0 : (u32) -1 >> (me + 1))) + : + (((u32)-1 >> mb) ^ ((me >= 31) ? 0 : (u32) -1 >> (me + 1)))) + ); + } + + void Jit64::addx(UGeckoInstruction inst) + { + IREmitter::InstLoc val = ibuild.EmitLoadGReg(inst.RB); + val = ibuild.EmitAdd(ibuild.EmitLoadGReg(inst.RA), val); + ibuild.EmitStoreGReg(val, inst.RD); + if (inst.Rc) + ComputeRC(ibuild, val); + } + + // This can be optimized + void Jit64::addex(UGeckoInstruction inst) + { + Default(inst); return; + // USES_XER + if(Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITIntegerOff) + {Default(inst); return;} // turn off from debugger + + INSTRUCTION_START; + int a = inst.RA, b = inst.RB, d = inst.RD; + gpr.FlushLockX(ECX); + gpr.Lock(a, b, d); + if (d != a && d != b) + gpr.LoadToX64(d, false); + else + gpr.LoadToX64(d, true); + MOV(32, R(EAX), M(&PowerPC::ppcState.spr[SPR_XER])); + SHR(32, R(EAX), Imm8(30)); // shift the carry flag out into the x86 carry flag + MOV(32, R(EAX), gpr.R(a)); + ADC(32, R(EAX), gpr.R(b)); + MOV(32, gpr.R(d), R(EAX)); + //GenerateCarry(ECX); + gpr.UnlockAll(); + gpr.UnlockAllX(); + if (inst.Rc) + { + CALL((u8*)asm_routines.computeRc); + } + } + + void Jit64::rlwinmx(UGeckoInstruction inst) + { + unsigned mask = Helper_Mask(inst.MB, inst.ME); + IREmitter::InstLoc val = ibuild.EmitLoadGReg(inst.RS); + val = ibuild.EmitRol(val, ibuild.EmitIntConst(inst.SH)); + val = ibuild.EmitAnd(val, ibuild.EmitIntConst(mask)); + ibuild.EmitStoreGReg(val, inst.RA); + if (inst.Rc) + ComputeRC(ibuild, val); + } + + + void Jit64::rlwimix(UGeckoInstruction inst) + { + unsigned mask = Helper_Mask(inst.MB, inst.ME); + IREmitter::InstLoc val = ibuild.EmitLoadGReg(inst.RS); + val = ibuild.EmitRol(val, ibuild.EmitIntConst(inst.SH)); + val = ibuild.EmitAnd(val, ibuild.EmitIntConst(mask)); + IREmitter::InstLoc ival = ibuild.EmitLoadGReg(inst.RA); + ival = ibuild.EmitAnd(ival, ibuild.EmitIntConst(~mask)); + val = ibuild.EmitOr(ival, val); + ibuild.EmitStoreGReg(val, inst.RA); + if (inst.Rc) + ComputeRC(ibuild, val); + } + + void Jit64::rlwnmx(UGeckoInstruction inst) + { + if(Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITIntegerOff) + {Default(inst); return;} // turn off from debugger + + INSTRUCTION_START; + int a = inst.RA, b = inst.RB, s = inst.RS; + if (gpr.R(a).IsImm()) + { + Default(inst); + return; + } + + u32 mask = Helper_Mask(inst.MB, inst.ME); + gpr.FlushLockX(ECX); + gpr.Lock(a, b, s); + MOV(32, R(EAX), gpr.R(s)); + MOV(32, R(ECX), gpr.R(b)); + AND(32, R(ECX), Imm32(0x1f)); + ROL(32, R(EAX), R(ECX)); + AND(32, R(EAX), Imm32(mask)); + MOV(32, gpr.R(a), R(EAX)); + gpr.UnlockAll(); + gpr.UnlockAllX(); + if (inst.Rc) + { + MOV(32, R(EAX), gpr.R(a)); + CALL((u8*)asm_routines.computeRc); + } + } + + void 
Jit64::negx(UGeckoInstruction inst) + { + IREmitter::InstLoc val = ibuild.EmitLoadGReg(inst.RA); + val = ibuild.EmitSub(ibuild.EmitIntConst(0), val); + ibuild.EmitStoreGReg(val, inst.RD); + if (inst.Rc) + ComputeRC(ibuild, val); + } + + void Jit64::srwx(UGeckoInstruction inst) + { + IREmitter::InstLoc val = ibuild.EmitLoadGReg(inst.RS), + samt = ibuild.EmitLoadGReg(inst.RB), + corr; + // FIXME: We can do better with a cmov + // FIXME: We can do better on 64-bit + val = ibuild.EmitShrl(val, samt); + corr = ibuild.EmitShl(samt, ibuild.EmitIntConst(26)); + corr = ibuild.EmitSarl(corr, ibuild.EmitIntConst(31)); + corr = ibuild.EmitXor(corr, ibuild.EmitIntConst(-1)); + val = ibuild.EmitAnd(corr, val); + ibuild.EmitStoreGReg(val, inst.RA); + if (inst.Rc) + ComputeRC(ibuild, val); + } + + void Jit64::slwx(UGeckoInstruction inst) + { + IREmitter::InstLoc val = ibuild.EmitLoadGReg(inst.RS), + samt = ibuild.EmitLoadGReg(inst.RB), + corr; + // FIXME: We can do better with a cmov + // FIXME: We can do better on 64-bit + val = ibuild.EmitShl(val, samt); + corr = ibuild.EmitShl(samt, ibuild.EmitIntConst(26)); + corr = ibuild.EmitSarl(corr, ibuild.EmitIntConst(31)); + corr = ibuild.EmitXor(corr, ibuild.EmitIntConst(-1)); + val = ibuild.EmitAnd(corr, val); + ibuild.EmitStoreGReg(val, inst.RA); + if (inst.Rc) + ComputeRC(ibuild, val); + } + + void Jit64::srawx(UGeckoInstruction inst) + { + // FIXME: We can do a lot better on 64-bit + IREmitter::InstLoc val, samt, mask, mask2, test; + val = ibuild.EmitLoadGReg(inst.RS); + samt = ibuild.EmitLoadGReg(inst.RB); + mask = ibuild.EmitIntConst(-1); + val = ibuild.EmitSarl(val, samt); + mask = ibuild.EmitShl(mask, samt); + samt = ibuild.EmitShl(samt, ibuild.EmitIntConst(26)); + samt = ibuild.EmitSarl(samt, ibuild.EmitIntConst(31)); + samt = ibuild.EmitAnd(samt, ibuild.EmitIntConst(31)); + val = ibuild.EmitSarl(val, samt); + ibuild.EmitStoreGReg(val, inst.RA); + mask = ibuild.EmitShl(mask, samt); + mask2 = ibuild.EmitAnd(mask, ibuild.EmitIntConst(0x7FFFFFFF)); + test = ibuild.EmitOr(val, mask2); + test = ibuild.EmitICmpUgt(test, mask); + ibuild.EmitStoreCarry(test); + } + + void Jit64::srawix(UGeckoInstruction inst) + { + IREmitter::InstLoc val = ibuild.EmitLoadGReg(inst.RS), test; + val = ibuild.EmitSarl(val, ibuild.EmitIntConst(inst.SH)); + ibuild.EmitStoreGReg(val, inst.RA); + unsigned mask = -1u << inst.SH; + test = ibuild.EmitOr(val, ibuild.EmitIntConst(mask & 0x7FFFFFFF)); + test = ibuild.EmitICmpUgt(test, ibuild.EmitIntConst(mask)); + + ibuild.EmitStoreCarry(test); + if (inst.Rc) + ComputeRC(ibuild, val); + } + + // count leading zeroes + void Jit64::cntlzwx(UGeckoInstruction inst) + { + if(Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITIntegerOff) + {Default(inst); return;} // turn off from debugger + + INSTRUCTION_START; + int a = inst.RA; + int s = inst.RS; + if (gpr.R(a).IsImm() || gpr.R(s).IsImm() || s == a) + { + Default(inst); + return; + } + gpr.Lock(a,s); + gpr.LoadToX64(a,false); + BSR(32, gpr.R(a).GetSimpleReg(), gpr.R(s)); + FixupBranch gotone = J_CC(CC_NZ); + MOV(32, gpr.R(a), Imm32(63)); + SetJumpTarget(gotone); + XOR(32, gpr.R(a), Imm8(0x1f)); // flip order + gpr.UnlockAll(); + + if (inst.Rc) + { + MOV(32, R(EAX), gpr.R(a)); + CALL((u8*)asm_routines.computeRc); + // TODO: Check PPC manual too + } + } diff --git a/Source/Core/Core/Src/PowerPC/Jit64IL/Jit_LoadStore.cpp b/Source/Core/Core/Src/PowerPC/Jit64IL/Jit_LoadStore.cpp new file mode 100644 index 0000000000..141942c71b --- /dev/null +++ 
b/Source/Core/Core/Src/PowerPC/Jit64IL/Jit_LoadStore.cpp @@ -0,0 +1,198 @@ +// Copyright (C) 2003-2008 Dolphin Project. + +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, version 2.0. + +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License 2.0 for more details. + +// A copy of the GPL 2.0 should have been included with the program. +// If not, see http://www.gnu.org/licenses/ + +// Official SVN repository and contact information can be found at +// http://code.google.com/p/dolphin-emu/ + +// TODO(ector): Tons of pshufb optimization of the loads/stores, for SSSE3+, possibly SSE4, only. +// Should give a very noticable speed boost to paired single heavy code. + +#include "Common.h" +#include "Thunk.h" + +#include "../PowerPC.h" +#include "../../Core.h" +#include "../../HW/GPFifo.h" +#include "../../HW/CommandProcessor.h" +#include "../../HW/PixelEngine.h" +#include "../../HW/Memmap.h" +#include "../PPCTables.h" +#include "x64Emitter.h" +#include "ABI.h" + +#include "Jit.h" +#include "JitCache.h" +#include "JitAsm.h" +#include "JitRegCache.h" + +// #define INSTRUCTION_START Default(inst); return; +#define INSTRUCTION_START + + void Jit64::lbzx(UGeckoInstruction inst) + { + IREmitter::InstLoc addr = ibuild.EmitLoadGReg(inst.RB); + if (inst.RA) + addr = ibuild.EmitAdd(addr, ibuild.EmitLoadGReg(inst.RA)); + ibuild.EmitStoreGReg(ibuild.EmitLoad8(addr), inst.RD); + } + + void Jit64::lwzx(UGeckoInstruction inst) + { + IREmitter::InstLoc addr = ibuild.EmitLoadGReg(inst.RB); + if (inst.RA) + addr = ibuild.EmitAdd(addr, ibuild.EmitLoadGReg(inst.RA)); + ibuild.EmitStoreGReg(ibuild.EmitLoad32(addr), inst.RD); + } + + void Jit64::lhax(UGeckoInstruction inst) + { + IREmitter::InstLoc addr = ibuild.EmitLoadGReg(inst.RB); + if (inst.RA) + addr = ibuild.EmitAdd(addr, ibuild.EmitLoadGReg(inst.RA)); + IREmitter::InstLoc val = ibuild.EmitLoad16(addr); + val = ibuild.EmitSExt16(val); + ibuild.EmitStoreGReg(val, inst.RD); + } + + void Jit64::lXz(UGeckoInstruction inst) + { + IREmitter::InstLoc addr = ibuild.EmitIntConst(inst.SIMM_16); + if (inst.RA) + addr = ibuild.EmitAdd(addr, ibuild.EmitLoadGReg(inst.RA)); + IREmitter::InstLoc val; + switch (inst.OPCD) + { + case 32: val = ibuild.EmitLoad32(addr); break; //lwz + case 40: val = ibuild.EmitLoad16(addr); break; //lhz + case 34: val = ibuild.EmitLoad8(addr); break; //lbz + default: PanicAlert("lXz: invalid access size"); + } + ibuild.EmitStoreGReg(val, inst.RD); + } + + void Jit64::lha(UGeckoInstruction inst) + { + IREmitter::InstLoc addr = + ibuild.EmitIntConst((s32)(s16)inst.SIMM_16); + if (inst.RA) + addr = ibuild.EmitAdd(addr, ibuild.EmitLoadGReg(inst.RA)); + IREmitter::InstLoc val = ibuild.EmitLoad16(addr); + val = ibuild.EmitSExt16(val); + ibuild.EmitStoreGReg(val, inst.RD); + } + + void Jit64::lwzux(UGeckoInstruction inst) + { + IREmitter::InstLoc addr = ibuild.EmitLoadGReg(inst.RB); + if (inst.RA) { + addr = ibuild.EmitAdd(addr, ibuild.EmitLoadGReg(inst.RA)); + ibuild.EmitStoreGReg(addr, inst.RA); + } + ibuild.EmitStoreGReg(ibuild.EmitLoad32(addr), inst.RD); + } + + // Zero cache line. 
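+ // dcbz zeroes the 32-byte cache block containing EA = (RA ? RA : 0) + RB.
+ // The code below simply rounds the address down to a 32-byte boundary and
+ // clears it with two 16-byte SSE stores straight into the emulated RAM
+ // view; no address checking is done on this path.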
+ void Jit64::dcbz(UGeckoInstruction inst) + { + if(Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITLoadStoreOff) + {Default(inst); return;} // turn off from debugger + INSTRUCTION_START; + + MOV(32, R(EAX), gpr.R(inst.RB)); + if (inst.RA) + ADD(32, R(EAX), gpr.R(inst.RA)); + AND(32, R(EAX), Imm32(~31)); + XORPD(XMM0, R(XMM0)); +#ifdef _M_X64 + MOVAPS(MComplex(EBX, EAX, SCALE_1, 0), XMM0); + MOVAPS(MComplex(EBX, EAX, SCALE_1, 16), XMM0); +#else + AND(32, R(EAX), Imm32(Memory::MEMVIEW32_MASK)); + MOVAPS(MDisp(EAX, (u32)Memory::base), XMM0); + MOVAPS(MDisp(EAX, (u32)Memory::base + 16), XMM0); +#endif + } + + void Jit64::stX(UGeckoInstruction inst) + { + IREmitter::InstLoc addr = ibuild.EmitIntConst(inst.SIMM_16), + value = ibuild.EmitLoadGReg(inst.RS); + if (inst.RA) + addr = ibuild.EmitAdd(ibuild.EmitLoadGReg(inst.RA), addr); + if (inst.OPCD & 1) + ibuild.EmitStoreGReg(addr, inst.RA); + switch (inst.OPCD & ~1) + { + case 36: ibuild.EmitStore32(value, addr); break; //stw + case 44: ibuild.EmitStore16(value, addr); break; //sth + case 38: ibuild.EmitStore8(value, addr); break; //stb + default: _assert_msg_(DYNA_REC, 0, "AWETKLJASDLKF"); return; + } + } + + void Jit64::stXx(UGeckoInstruction inst) + { + IREmitter::InstLoc addr = ibuild.EmitLoadGReg(inst.RB), + value = ibuild.EmitLoadGReg(inst.RS); + addr = ibuild.EmitAdd(addr, ibuild.EmitLoadGReg(inst.RA)); + if (inst.SUBOP10 & 32) + ibuild.EmitStoreGReg(addr, inst.RA); + switch (inst.SUBOP10 & ~32) + { + case 151: ibuild.EmitStore32(value, addr); break; //stw + case 407: ibuild.EmitStore16(value, addr); break; //sth + case 215: ibuild.EmitStore8(value, addr); break; //stb + default: _assert_msg_(DYNA_REC, 0, "AWETKLJASDLKF"); return; + } + } + +// A few games use these heavily in video codecs. +void Jit64::lmw(UGeckoInstruction inst) +{ +#ifdef _M_IX86 + Default(inst); return; +#else + gpr.FlushLockX(ECX); + MOV(32, R(EAX), Imm32((u32)(s32)inst.SIMM_16)); + if (inst.RA) + ADD(32, R(EAX), gpr.R(inst.RA)); + for (int i = inst.RD; i < 32; i++) + { + MOV(32, R(ECX), MComplex(EBX, EAX, SCALE_1, (i - inst.RD) * 4)); + BSWAP(32, ECX); + gpr.LoadToX64(i, false, true); + MOV(32, gpr.R(i), R(ECX)); + } + gpr.UnlockAllX(); +#endif +} + +void Jit64::stmw(UGeckoInstruction inst) +{ +#ifdef _M_IX86 + Default(inst); return; +#else + gpr.FlushLockX(ECX); + MOV(32, R(EAX), Imm32((u32)(s32)inst.SIMM_16)); + if (inst.RA) + ADD(32, R(EAX), gpr.R(inst.RA)); + for (int i = inst.RD; i < 32; i++) + { + MOV(32, R(ECX), gpr.R(i)); + BSWAP(32, ECX); + MOV(32, MComplex(EBX, EAX, SCALE_1, (i - inst.RD) * 4), R(ECX)); + } + gpr.UnlockAllX(); +#endif +} diff --git a/Source/Core/Core/Src/PowerPC/Jit64IL/Jit_LoadStoreFloating.cpp b/Source/Core/Core/Src/PowerPC/Jit64IL/Jit_LoadStoreFloating.cpp new file mode 100644 index 0000000000..a6161e6575 --- /dev/null +++ b/Source/Core/Core/Src/PowerPC/Jit64IL/Jit_LoadStoreFloating.cpp @@ -0,0 +1,322 @@ +// Copyright (C) 2003-2008 Dolphin Project. + +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, version 2.0. + +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License 2.0 for more details. + +// A copy of the GPL 2.0 should have been included with the program. 
+// If not, see http://www.gnu.org/licenses/ + +// Official SVN repository and contact information can be found at +// http://code.google.com/p/dolphin-emu/ + +// TODO(ector): Tons of pshufb optimization of the loads/stores, for SSSE3+, possibly SSE4, only. +// Should give a very noticable speed boost to paired single heavy code. + +#include "Common.h" + +#include "../PowerPC.h" +#include "../../Core.h" // include "Common.h", "CoreParameter.h" +#include "../../HW/GPFifo.h" +#include "../../HW/CommandProcessor.h" +#include "../../HW/PixelEngine.h" +#include "../../HW/Memmap.h" +#include "../PPCTables.h" +#include "CPUDetect.h" +#include "x64Emitter.h" +#include "ABI.h" + +#include "Jit.h" +#include "JitCache.h" +#include "JitAsm.h" +#include "JitRegCache.h" + +// #define INSTRUCTION_START Default(inst); return; +#define INSTRUCTION_START + +// pshufb todo: MOVQ +const u8 GC_ALIGNED16(bswapShuffle1x4[16]) = {3, 2, 1, 0, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; +const u8 GC_ALIGNED16(bswapShuffle2x4[16]) = {3, 2, 1, 0, 7, 6, 5, 4, 8, 9, 10, 11, 12, 13, 14, 15}; +const u8 GC_ALIGNED16(bswapShuffle1x8[16]) = {7, 6, 5, 4, 3, 2, 1, 0, 8, 9, 10, 11, 12, 13, 14, 15}; +const u8 GC_ALIGNED16(bswapShuffle1x8Dupe[16]) = {7, 6, 5, 4, 3, 2, 1, 0, 7, 6, 5, 4, 3, 2, 1, 0}; +const u8 GC_ALIGNED16(bswapShuffle2x8[16]) = {7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8}; + +namespace { + +u64 GC_ALIGNED16(temp64); +u32 GC_ALIGNED16(temp32); +} +// TODO: Add peephole optimizations for multiple consecutive lfd/lfs/stfd/stfs since they are so common, +// and pshufb could help a lot. +// Also add hacks for things like lfs/stfs the same reg consecutively, that is, simple memory moves. + +void Jit64::lfs(UGeckoInstruction inst) +{ + if (Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITLoadStoreFloatingOff) + {Default(inst); return;} // turn off from debugger + INSTRUCTION_START; + + int d = inst.RD; + int a = inst.RA; + if (!a) + { + Default(inst); + return; + } + s32 offset = (s32)(s16)inst.SIMM_16; + gpr.FlushLockX(ABI_PARAM1); + gpr.Lock(a); + MOV(32, R(ABI_PARAM1), gpr.R(a)); + if (jo.assumeFPLoadFromMem) + { + UnsafeLoadRegToReg(ABI_PARAM1, EAX, 32, offset, false); + } + else + { + SafeLoadRegToEAX(ABI_PARAM1, 32, offset); + } + + MOV(32, M(&temp32), R(EAX)); + fpr.Lock(d); + fpr.LoadToX64(d, false); + CVTSS2SD(fpr.RX(d), M(&temp32)); + MOVDDUP(fpr.RX(d), fpr.R(d)); + gpr.UnlockAll(); + gpr.UnlockAllX(); + fpr.UnlockAll(); +} + + +void Jit64::lfd(UGeckoInstruction inst) +{ + if (Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITLoadStoreFloatingOff) + {Default(inst); return;} // turn off from debugger + INSTRUCTION_START; + + int d = inst.RD; + int a = inst.RA; + if (!a) + { + Default(inst); + return; + } + s32 offset = (s32)(s16)inst.SIMM_16; + gpr.FlushLockX(ABI_PARAM1); + gpr.Lock(a); + MOV(32, R(ABI_PARAM1), gpr.R(a)); + // TODO - optimize. This has to load the previous value - upper double should stay unmodified. 
+ fpr.LoadToX64(d, true); + fpr.Lock(d); + X64Reg xd = fpr.RX(d); + if (cpu_info.bSSSE3) { +#ifdef _M_X64 + MOVQ_xmm(XMM0, MComplex(RBX, ABI_PARAM1, SCALE_1, offset)); +#else + AND(32, R(ABI_PARAM1), Imm32(Memory::MEMVIEW32_MASK)); + MOVQ_xmm(XMM0, MDisp(ABI_PARAM1, (u32)Memory::base + offset)); +#endif + PSHUFB(XMM0, M((void *)bswapShuffle1x8Dupe)); + MOVSD(xd, R(XMM0)); + } else { +#ifdef _M_X64 + MOV(64, R(EAX), MComplex(RBX, ABI_PARAM1, SCALE_1, offset)); + BSWAP(64, EAX); + MOV(64, M(&temp64), R(EAX)); + MOVSD(XMM0, M(&temp64)); + MOVSD(xd, R(XMM0)); +#else + AND(32, R(ABI_PARAM1), Imm32(Memory::MEMVIEW32_MASK)); + MOV(32, R(EAX), MDisp(ABI_PARAM1, (u32)Memory::base + offset)); + BSWAP(32, EAX); + MOV(32, M((void*)((u32)&temp64+4)), R(EAX)); + MOV(32, R(EAX), MDisp(ABI_PARAM1, (u32)Memory::base + offset + 4)); + BSWAP(32, EAX); + MOV(32, M(&temp64), R(EAX)); + MOVSD(XMM0, M(&temp64)); + MOVSD(xd, R(XMM0)); +#if 0 + // Alternate implementation; possibly faster + AND(32, R(ABI_PARAM1), Imm32(Memory::MEMVIEW32_MASK)); + MOVQ_xmm(XMM0, MDisp(ABI_PARAM1, (u32)Memory::base + offset)); + PSHUFLW(XMM0, R(XMM0), 0x1B); + PSRLW(XMM0, 8); + MOVSD(xd, R(XMM0)); + MOVQ_xmm(XMM0, MDisp(ABI_PARAM1, (u32)Memory::base + offset)); + PSHUFLW(XMM0, R(XMM0), 0x1B); + PSLLW(XMM0, 8); + POR(xd, R(XMM0)); +#endif +#endif + } + gpr.UnlockAll(); + gpr.UnlockAllX(); + fpr.UnlockAll(); +} + + +void Jit64::stfd(UGeckoInstruction inst) +{ + if (Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITLoadStoreFloatingOff) + {Default(inst); return;} // turn off from debugger + INSTRUCTION_START; + + int s = inst.RS; + int a = inst.RA; + if (!a) + { + Default(inst); + return; + } + s32 offset = (s32)(s16)inst.SIMM_16; + gpr.FlushLockX(ABI_PARAM1); + gpr.Lock(a); + fpr.Lock(s); + MOV(32, R(ABI_PARAM1), gpr.R(a)); +#ifdef _M_IX86 + AND(32, R(ABI_PARAM1), Imm32(Memory::MEMVIEW32_MASK)); +#endif + if (cpu_info.bSSSE3) { + MOVAPD(XMM0, fpr.R(s)); + PSHUFB(XMM0, M((void *)bswapShuffle1x8)); +#ifdef _M_X64 + MOVQ_xmm(MComplex(RBX, ABI_PARAM1, SCALE_1, offset), XMM0); +#else + MOVQ_xmm(MDisp(ABI_PARAM1, (u32)Memory::base + offset), XMM0); +#endif + } else { +#ifdef _M_X64 + fpr.LoadToX64(s, true, false); + MOVSD(M(&temp64), fpr.RX(s)); + MOV(64, R(EAX), M(&temp64)); + BSWAP(64, EAX); + MOV(64, MComplex(RBX, ABI_PARAM1, SCALE_1, offset), R(EAX)); +#else + fpr.LoadToX64(s, true, false); + MOVSD(M(&temp64), fpr.RX(s)); + MOV(32, R(EAX), M(&temp64)); + BSWAP(32, EAX); + MOV(32, MDisp(ABI_PARAM1, (u32)Memory::base + offset + 4), R(EAX)); + MOV(32, R(EAX), M((void*)((u32)&temp64 + 4))); + BSWAP(32, EAX); + MOV(32, MDisp(ABI_PARAM1, (u32)Memory::base + offset), R(EAX)); +#endif + } + gpr.UnlockAll(); + gpr.UnlockAllX(); + fpr.UnlockAll(); +} + + +void Jit64::stfs(UGeckoInstruction inst) +{ + if (Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITLoadStoreFloatingOff) + {Default(inst); return;} // turn off from debugger + INSTRUCTION_START; + + bool update = inst.OPCD & 1; + int s = inst.RS; + int a = inst.RA; + s32 offset = (s32)(s16)inst.SIMM_16; + if (!a || update) { + Default(inst); + return; + } + + if (gpr.R(a).IsImm()) + { + u32 addr = (u32)(gpr.R(a).offset + offset); + if (Memory::IsRAMAddress(addr)) + { + if (cpu_info.bSSSE3) { + CVTSD2SS(XMM0, fpr.R(s)); + PSHUFB(XMM0, M((void *)bswapShuffle1x4)); + WriteFloatToConstRamAddress(XMM0, addr); + return; + } + } + else if (addr == 0xCC008000) + { + // Float directly to write gather pipe! Fun! 
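+ // (0xCC008000 is the write-gather pipe: stores there stream into the GP
+ // FIFO rather than RAM, so this path just converts to single precision,
+ // calls the fifoDirectWriteFloat stub and counts the bytes written in
+ // fifoBytesThisBlock.)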
+ CVTSD2SS(XMM0, fpr.R(s)); + CALL((void*)asm_routines.fifoDirectWriteFloat); + // TODO + js.fifoBytesThisBlock += 4; + return; + } + } + + gpr.FlushLockX(ABI_PARAM1, ABI_PARAM2); + gpr.Lock(a); + fpr.Lock(s); + MOV(32, R(ABI_PARAM2), gpr.R(a)); + ADD(32, R(ABI_PARAM2), Imm32(offset)); + if (update && offset) + { + MOV(32, gpr.R(a), R(ABI_PARAM2)); + } + CVTSD2SS(XMM0, fpr.R(s)); + MOVSS(M(&temp32), XMM0); + MOV(32, R(ABI_PARAM1), M(&temp32)); + SafeWriteRegToReg(ABI_PARAM1, ABI_PARAM2, 32, 0); + gpr.UnlockAll(); + gpr.UnlockAllX(); + fpr.UnlockAll(); +} + + +void Jit64::stfsx(UGeckoInstruction inst) +{ + if (Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITLoadStoreFloatingOff) + {Default(inst); return;} // turn off from debugger + INSTRUCTION_START; + + // We can take a shortcut here - it's not likely that a hardware access would use this instruction. + gpr.FlushLockX(ABI_PARAM1); + fpr.Lock(inst.RS); + MOV(32, R(ABI_PARAM1), gpr.R(inst.RB)); + if (inst.RA) + ADD(32, R(ABI_PARAM1), gpr.R(inst.RA)); + CVTSD2SS(XMM0, fpr.R(inst.RS)); + MOVD_xmm(R(EAX), XMM0); + UnsafeWriteRegToReg(EAX, ABI_PARAM1, 32, 0); + gpr.UnlockAllX(); + fpr.UnlockAll(); +} + + +void Jit64::lfsx(UGeckoInstruction inst) +{ + if (Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITLoadStoreFloatingOff) + {Default(inst); return;} // turn off from debugger + INSTRUCTION_START; + + fpr.Lock(inst.RS); + fpr.LoadToX64(inst.RS, false, true); + MOV(32, R(EAX), gpr.R(inst.RB)); + if (inst.RA) + ADD(32, R(EAX), gpr.R(inst.RA)); + if (cpu_info.bSSSE3) { + X64Reg r = fpr.R(inst.RS).GetSimpleReg(); +#ifdef _M_IX86 + AND(32, R(EAX), Imm32(Memory::MEMVIEW32_MASK)); + MOVD_xmm(r, MDisp(EAX, (u32)Memory::base)); +#else + MOVD_xmm(r, MComplex(RBX, EAX, SCALE_1, 0)); +#endif + PSHUFB(r, M((void *)bswapShuffle1x4)); + CVTSS2SD(r, R(r)); + MOVDDUP(r, R(r)); + } else { + UnsafeLoadRegToReg(EAX, EAX, 32, false); + MOV(32, M(&temp32), R(EAX)); + CVTSS2SD(XMM0, M(&temp32)); + MOVDDUP(fpr.R(inst.RS).GetSimpleReg(), R(XMM0)); + } + fpr.UnlockAll(); +} + diff --git a/Source/Core/Core/Src/PowerPC/Jit64IL/Jit_LoadStorePaired.cpp b/Source/Core/Core/Src/PowerPC/Jit64IL/Jit_LoadStorePaired.cpp new file mode 100644 index 0000000000..d98a0f9ece --- /dev/null +++ b/Source/Core/Core/Src/PowerPC/Jit64IL/Jit_LoadStorePaired.cpp @@ -0,0 +1,458 @@ +// Copyright (C) 2003-2008 Dolphin Project. + +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, version 2.0. + +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License 2.0 for more details. + +// A copy of the GPL 2.0 should have been included with the program. +// If not, see http://www.gnu.org/licenses/ + +// Official SVN repository and contact information can be found at +// http://code.google.com/p/dolphin-emu/ + +// TODO(ector): Tons of pshufb optimization of the loads/stores, for SSSE3+, possibly SSE4, only. +// Should give a very noticable speed boost to paired single heavy code. 
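+
+// psq_l / psq_st are the Gekko quantized paired-single loads and stores: the
+// GQR register selected by inst.I supplies the in-memory type (float or a
+// quantized u8/s8/u16/s16) and a power-of-two scale. Dequantizing on load is
+// roughly "ps = (double)mem_value * 2^-scale" and quantizing on store is the
+// inverse, which is what the two tables of doubles below are for - the scale
+// field indexes straight into them.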
+ +#include "Common.h" + +#include "Thunk.h" +#include "../PowerPC.h" +#include "../../Core.h" +#include "../../HW/GPFifo.h" +#include "../../HW/CommandProcessor.h" +#include "../../HW/PixelEngine.h" +#include "../../HW/Memmap.h" +#include "../PPCTables.h" +#include "CPUDetect.h" +#include "x64Emitter.h" +#include "ABI.h" + +#include "Jit.h" +#include "JitCache.h" +#include "JitAsm.h" +#include "JitRegCache.h" + +#define INSTRUCTION_START +// #define INSTRUCTION_START Default(inst); return; + +const u8 GC_ALIGNED16(pbswapShuffle2x4[16]) = {3, 2, 1, 0, 7, 6, 5, 4, 8, 9, 10, 11, 12, 13, 14, 15}; +const u8 GC_ALIGNED16(pbswapShuffleNoop[16]) = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; + +static double GC_ALIGNED16(psTemp[2]) = {1.0, 1.0}; +static u64 GC_ALIGNED16(temp64); + +// TODO(ector): Improve 64-bit version +static void WriteDual32(u64 value, u32 address) +{ + Memory::Write_U32((u32)(value >> 32), address); + Memory::Write_U32((u32)value, address + 4); +} + +const double GC_ALIGNED16(m_quantizeTableD[]) = +{ + (1 << 0), (1 << 1), (1 << 2), (1 << 3), + (1 << 4), (1 << 5), (1 << 6), (1 << 7), + (1 << 8), (1 << 9), (1 << 10), (1 << 11), + (1 << 12), (1 << 13), (1 << 14), (1 << 15), + (1 << 16), (1 << 17), (1 << 18), (1 << 19), + (1 << 20), (1 << 21), (1 << 22), (1 << 23), + (1 << 24), (1 << 25), (1 << 26), (1 << 27), + (1 << 28), (1 << 29), (1 << 30), (1 << 31), + 1.0 / (1ULL << 32), 1.0 / (1 << 31), 1.0 / (1 << 30), 1.0 / (1 << 29), + 1.0 / (1 << 28), 1.0 / (1 << 27), 1.0 / (1 << 26), 1.0 / (1 << 25), + 1.0 / (1 << 24), 1.0 / (1 << 23), 1.0 / (1 << 22), 1.0 / (1 << 21), + 1.0 / (1 << 20), 1.0 / (1 << 19), 1.0 / (1 << 18), 1.0 / (1 << 17), + 1.0 / (1 << 16), 1.0 / (1 << 15), 1.0 / (1 << 14), 1.0 / (1 << 13), + 1.0 / (1 << 12), 1.0 / (1 << 11), 1.0 / (1 << 10), 1.0 / (1 << 9), + 1.0 / (1 << 8), 1.0 / (1 << 7), 1.0 / (1 << 6), 1.0 / (1 << 5), + 1.0 / (1 << 4), 1.0 / (1 << 3), 1.0 / (1 << 2), 1.0 / (1 << 1), +}; + +const double GC_ALIGNED16(m_dequantizeTableD[]) = +{ + 1.0 / (1 << 0), 1.0 / (1 << 1), 1.0 / (1 << 2), 1.0 / (1 << 3), + 1.0 / (1 << 4), 1.0 / (1 << 5), 1.0 / (1 << 6), 1.0 / (1 << 7), + 1.0 / (1 << 8), 1.0 / (1 << 9), 1.0 / (1 << 10), 1.0 / (1 << 11), + 1.0 / (1 << 12), 1.0 / (1 << 13), 1.0 / (1 << 14), 1.0 / (1 << 15), + 1.0 / (1 << 16), 1.0 / (1 << 17), 1.0 / (1 << 18), 1.0 / (1 << 19), + 1.0 / (1 << 20), 1.0 / (1 << 21), 1.0 / (1 << 22), 1.0 / (1 << 23), + 1.0 / (1 << 24), 1.0 / (1 << 25), 1.0 / (1 << 26), 1.0 / (1 << 27), + 1.0 / (1 << 28), 1.0 / (1 << 29), 1.0 / (1 << 30), 1.0 / (1 << 31), + (1ULL << 32), (1 << 31), (1 << 30), (1 << 29), + (1 << 28), (1 << 27), (1 << 26), (1 << 25), + (1 << 24), (1 << 23), (1 << 22), (1 << 21), + (1 << 20), (1 << 19), (1 << 18), (1 << 17), + (1 << 16), (1 << 15), (1 << 14), (1 << 13), + (1 << 12), (1 << 11), (1 << 10), (1 << 9), + (1 << 8), (1 << 7), (1 << 6), (1 << 5), + (1 << 4), (1 << 3), (1 << 2), (1 << 1), +}; + +// The big problem is likely instructions that set the quantizers in the same block. +// We will have to break block after quantizers are written to. +void Jit64::psq_st(UGeckoInstruction inst) +{ + if(Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITLoadStorePairedOff) + {Default(inst); return;} // turn off from debugger + INSTRUCTION_START; + js.block_flags |= BLOCK_USE_GQR0 << inst.I; + + if (js.blockSetsQuantizers || !Core::GetStartupParameter().bOptimizeQuantizers) + { + Default(inst); + return; + } + if (!inst.RA) + { + // This really should never happen. 
Unless we change this to also support stwux
+ Default(inst);
+ return;
+ }
+
+ const UGQR gqr(rSPR(SPR_GQR0 + inst.I));
+ const EQuantizeType stType = static_cast<EQuantizeType>(gqr.ST_TYPE);
+ int stScale = gqr.ST_SCALE;
+ bool update = inst.OPCD == 61;
+
+ int offset = inst.SIMM_12;
+ int a = inst.RA;
+ int s = inst.RS; // Fp numbers
+
+ if (inst.W) {
+ // PanicAlert("W=1: stType %i stScale %i update %i", (int)stType, (int)stScale, (int)update);
+ // It's fairly common that games write stuff to the pipe using this. Then, it's pretty much only
+ // floats so that's what we'll work on.
+ switch (stType)
+ {
+ case QUANTIZE_FLOAT:
+ {
+ // This one has quite a bit of optimization potential.
+ if (gpr.R(a).IsImm())
+ {
+ PanicAlert("Imm: %08x", gpr.R(a).offset);
+ }
+ gpr.FlushLockX(ABI_PARAM1, ABI_PARAM2);
+ gpr.Lock(a);
+ fpr.Lock(s);
+ if (update)
+ gpr.LoadToX64(a, true, true);
+ MOV(32, R(ABI_PARAM2), gpr.R(a));
+ if (offset)
+ ADD(32, R(ABI_PARAM2), Imm32((u32)offset));
+ TEST(32, R(ABI_PARAM2), Imm32(0x0C000000));
+ if (update && offset)
+ MOV(32, gpr.R(a), R(ABI_PARAM2));
+ CVTSD2SS(XMM0, fpr.R(s));
+ MOVD_xmm(M(&temp64), XMM0);
+ MOV(32, R(ABI_PARAM1), M(&temp64));
+ FixupBranch argh = J_CC(CC_NZ);
+ BSWAP(32, ABI_PARAM1);
+#ifdef _M_X64
+ MOV(32, MComplex(RBX, ABI_PARAM2, SCALE_1, 0), R(ABI_PARAM1));
+#else
+ MOV(32, R(EAX), R(ABI_PARAM2));
+ AND(32, R(EAX), Imm32(Memory::MEMVIEW32_MASK));
+ MOV(32, MDisp(EAX, (u32)Memory::base), R(ABI_PARAM1));
+#endif
+ FixupBranch skip_call = J();
+ SetJumpTarget(argh);
+ ABI_CallFunctionRR(thunks.ProtectFunction((void *)&Memory::Write_U32, 2), ABI_PARAM1, ABI_PARAM2);
+ SetJumpTarget(skip_call);
+ gpr.UnlockAll();
+ gpr.UnlockAllX();
+ fpr.UnlockAll();
+ return;
+ }
+ default:
+ Default(inst);
+ return;
+ }
+ return;
+ }
+
+ if (stType == QUANTIZE_FLOAT)
+ {
+ if (gpr.R(a).IsImm() && !update && cpu_info.bSSSE3)
+ {
+ u32 addr = (u32)(gpr.R(a).offset + offset);
+ if (addr == 0xCC008000) {
+ // Writing to FIFO. Let's do fast method.
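+ // (Same idea as the stfs fast path: 0xCC008000 is the write-gather pipe,
+ // so both singles are converted, byteswapped with PSHUFB and pushed through
+ // fifoDirectWriteXmm64 as a single 8-byte write instead of going through
+ // the normal memory path.)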
+ CVTPD2PS(XMM0, fpr.R(s)); + PSHUFB(XMM0, M((void*)&pbswapShuffle2x4)); + CALL((void*)asm_routines.fifoDirectWriteXmm64); + js.fifoBytesThisBlock += 8; + return; + } + } + + gpr.FlushLockX(ABI_PARAM1, ABI_PARAM2); + gpr.Lock(a); + fpr.Lock(s); + if (update) + gpr.LoadToX64(a, true, true); + MOV(32, R(ABI_PARAM2), gpr.R(a)); + if (offset) + ADD(32, R(ABI_PARAM2), Imm32((u32)offset)); + TEST(32, R(ABI_PARAM2), Imm32(0x0C000000)); + if (update && offset) + MOV(32, gpr.R(a), R(ABI_PARAM2)); + CVTPD2PS(XMM0, fpr.R(s)); + SHUFPS(XMM0, R(XMM0), 1); + MOVQ_xmm(M(&temp64), XMM0); +#ifdef _M_X64 + MOV(64, R(ABI_PARAM1), M(&temp64)); + FixupBranch argh = J_CC(CC_NZ); + BSWAP(64, ABI_PARAM1); + MOV(64, MComplex(RBX, ABI_PARAM2, SCALE_1, 0), R(ABI_PARAM1)); + FixupBranch arg2 = J(); + SetJumpTarget(argh); + CALL(thunks.ProtectFunction((void *)&WriteDual32, 0)); +#else + FixupBranch argh = J_CC(CC_NZ); + MOV(32, R(ABI_PARAM1), M(((char*)&temp64) + 4)); + BSWAP(32, ABI_PARAM1); + AND(32, R(ABI_PARAM2), Imm32(Memory::MEMVIEW32_MASK)); + MOV(32, MDisp(ABI_PARAM2, (u32)Memory::base), R(ABI_PARAM1)); + MOV(32, R(ABI_PARAM1), M(&temp64)); + BSWAP(32, ABI_PARAM1); + MOV(32, MDisp(ABI_PARAM2, 4+(u32)Memory::base), R(ABI_PARAM1)); + FixupBranch arg2 = J(); + SetJumpTarget(argh); + MOV(32, R(ABI_PARAM1), M(((char*)&temp64) + 4)); + ABI_CallFunctionRR(thunks.ProtectFunction((void *)&Memory::Write_U32, 2), ABI_PARAM1, ABI_PARAM2); + MOV(32, R(ABI_PARAM1), M(((char*)&temp64))); + ADD(32, R(ABI_PARAM2), Imm32(4)); + ABI_CallFunctionRR(thunks.ProtectFunction((void *)&Memory::Write_U32, 2), ABI_PARAM1, ABI_PARAM2); +#endif + SetJumpTarget(arg2); + gpr.UnlockAll(); + gpr.UnlockAllX(); + fpr.UnlockAll(); + } + else if (stType == QUANTIZE_U8) + { + gpr.FlushLockX(ABI_PARAM1, ABI_PARAM2); + gpr.Lock(a); + fpr.Lock(s); + if (update) + gpr.LoadToX64(a, true, update); + MOV(32, R(ABI_PARAM2), gpr.R(a)); + if (offset) + ADD(32, R(ABI_PARAM2), Imm32((u32)offset)); + if (update && offset) + MOV(32, gpr.R(a), R(ABI_PARAM2)); + MOVAPD(XMM0, fpr.R(s)); + MOVDDUP(XMM1, M((void*)&m_quantizeTableD[stScale])); + MULPD(XMM0, R(XMM1)); + CVTPD2DQ(XMM0, R(XMM0)); + PACKSSDW(XMM0, R(XMM0)); + PACKUSWB(XMM0, R(XMM0)); + MOVD_xmm(M(&temp64), XMM0); + MOV(16, R(ABI_PARAM1), M(&temp64)); +#ifdef _M_X64 + MOV(16, MComplex(RBX, ABI_PARAM2, SCALE_1, 0), R(ABI_PARAM1)); +#else + MOV(32, R(EAX), R(ABI_PARAM2)); + AND(32, R(EAX), Imm32(Memory::MEMVIEW32_MASK)); + MOV(16, MDisp(EAX, (u32)Memory::base), R(ABI_PARAM1)); +#endif + if (update) + MOV(32, gpr.R(a), R(ABI_PARAM2)); + gpr.UnlockAll(); + gpr.UnlockAllX(); + fpr.UnlockAll(); + } + else if (stType == QUANTIZE_S16) + { + gpr.FlushLockX(ABI_PARAM1, ABI_PARAM2); + gpr.Lock(a); + fpr.Lock(s); + if (update) + gpr.LoadToX64(a, true, update); + MOV(32, R(ABI_PARAM2), gpr.R(a)); + if (offset) + ADD(32, R(ABI_PARAM2), Imm32((u32)offset)); + if (update) + MOV(32, gpr.R(a), R(ABI_PARAM2)); + MOVAPD(XMM0, fpr.R(s)); + MOVDDUP(XMM1, M((void*)&m_quantizeTableD[stScale])); + MULPD(XMM0, R(XMM1)); + SHUFPD(XMM0, R(XMM0), 1); + CVTPD2DQ(XMM0, R(XMM0)); + PACKSSDW(XMM0, R(XMM0)); + MOVD_xmm(M(&temp64), XMM0); + MOV(32, R(ABI_PARAM1), M(&temp64)); + BSWAP(32, ABI_PARAM1); +#ifdef _M_X64 + MOV(32, MComplex(RBX, ABI_PARAM2, SCALE_1, 0), R(ABI_PARAM1)); +#else + MOV(32, R(EAX), R(ABI_PARAM2)); + AND(32, R(EAX), Imm32(Memory::MEMVIEW32_MASK)); + MOV(32, MDisp(EAX, (u32)Memory::base), R(ABI_PARAM1)); +#endif + gpr.UnlockAll(); + gpr.UnlockAllX(); + fpr.UnlockAll(); + } + else { + // Dodger uses this. 
+ // mario tennis
+ //PanicAlert("st %i:%i", stType, inst.W);
+ Default(inst);
+ }
+}
+
+void Jit64::psq_l(UGeckoInstruction inst)
+{
+ if(Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITLoadStorePairedOff)
+ {Default(inst); return;} // turn off from debugger
+ INSTRUCTION_START;
+
+ js.block_flags |= BLOCK_USE_GQR0 << inst.I;
+
+ if (js.blockSetsQuantizers || !Core::GetStartupParameter().bOptimizeQuantizers)
+ {
+ Default(inst);
+ return;
+ }
+
+ const UGQR gqr(rSPR(SPR_GQR0 + inst.I));
+ const EQuantizeType ldType = static_cast<EQuantizeType>(gqr.LD_TYPE);
+ int ldScale = gqr.LD_SCALE;
+ bool update = inst.OPCD == 57;
+ if (!inst.RA || inst.W)
+ {
+ // 0 1 during load
+ //PanicAlert("ld:%i %i", ldType, (int)inst.W);
+ Default(inst);
+ return;
+ }
+ int offset = inst.SIMM_12;
+ switch (ldType) {
+ case QUANTIZE_FLOAT: // We know this is from RAM, so we don't need to check the address.
+ {
+#ifdef _M_X64
+ gpr.LoadToX64(inst.RA, true, update);
+ fpr.LoadToX64(inst.RS, false);
+ if (cpu_info.bSSSE3) {
+ X64Reg xd = fpr.R(inst.RS).GetSimpleReg();
+ MOVQ_xmm(xd, MComplex(RBX, gpr.R(inst.RA).GetSimpleReg(), 1, offset));
+ PSHUFB(xd, M((void *)pbswapShuffle2x4));
+ CVTPS2PD(xd, R(xd));
+ } else {
+ MOV(64, R(RAX), MComplex(RBX, gpr.R(inst.RA).GetSimpleReg(), 1, offset));
+ BSWAP(64, RAX);
+ MOV(64, M(&psTemp[0]), R(RAX));
+ X64Reg r = fpr.R(inst.RS).GetSimpleReg();
+ CVTPS2PD(r, M(&psTemp[0]));
+ SHUFPD(r, R(r), 1);
+ }
+ if (update && offset != 0)
+ ADD(32, gpr.R(inst.RA), Imm32(offset));
+ break;
+#else
+ if (cpu_info.bSSSE3) {
+ gpr.LoadToX64(inst.RA, true, update);
+ fpr.LoadToX64(inst.RS, false);
+ X64Reg xd = fpr.R(inst.RS).GetSimpleReg();
+ MOV(32, R(EAX), gpr.R(inst.RA));
+ AND(32, R(EAX), Imm32(Memory::MEMVIEW32_MASK));
+ MOVQ_xmm(xd, MDisp(EAX, (u32)Memory::base + offset));
+ PSHUFB(xd, M((void *)pbswapShuffle2x4));
+ CVTPS2PD(xd, R(xd));
+ } else {
+ gpr.FlushLockX(ECX);
+ gpr.LoadToX64(inst.RA, true, update);
+ // This can probably be optimized somewhat.
+ LEA(32, ECX, MDisp(gpr.R(inst.RA).GetSimpleReg(), offset));
+ AND(32, R(ECX), Imm32(Memory::MEMVIEW32_MASK));
+ MOV(32, R(EAX), MDisp(ECX, (u32)Memory::base));
+ BSWAP(32, RAX);
+ MOV(32, M(&psTemp[0]), R(RAX));
+ MOV(32, R(EAX), MDisp(ECX, (u32)Memory::base + 4));
+ BSWAP(32, RAX);
+ MOV(32, M(((float *)&psTemp[0]) + 1), R(RAX));
+ fpr.LoadToX64(inst.RS, false, true);
+ X64Reg r = fpr.R(inst.RS).GetSimpleReg();
+ CVTPS2PD(r, M(&psTemp[0]));
+ gpr.UnlockAllX();
+ }
+ if (update && offset != 0)
+ ADD(32, gpr.R(inst.RA), Imm32(offset));
+ break;
+#endif
+ }
+ case QUANTIZE_U8:
+ {
+ gpr.LoadToX64(inst.RA, true, update);
+#ifdef _M_X64
+ MOVZX(32, 16, EAX, MComplex(RBX, gpr.R(inst.RA).GetSimpleReg(), 1, offset));
+#else
+ LEA(32, EAX, MDisp(gpr.R(inst.RA).GetSimpleReg(), offset));
+ AND(32, R(EAX), Imm32(Memory::MEMVIEW32_MASK));
+ MOVZX(32, 16, EAX, MDisp(EAX, (u32)Memory::base));
+#endif
+ MOV(32, M(&temp64), R(EAX));
+ MOVD_xmm(XMM0, M(&temp64));
+ // SSE4 optimization opportunity here.
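+ // (The two u8 components were loaded as one 16-bit word: the
+ // PXOR/PUNPCKLBW/PUNPCKLWD sequence zero-extends them to two 32-bit ints,
+ // CVTDQ2PD turns them into doubles, and the MULPD below applies the GQR
+ // dequantize scale. PMOVZXBD could presumably collapse the unpacking, which
+ // is what the SSE4 note above is about.)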
+ PXOR(XMM1, R(XMM1)); + PUNPCKLBW(XMM0, R(XMM1)); + PUNPCKLWD(XMM0, R(XMM1)); + CVTDQ2PD(XMM0, R(XMM0)); + fpr.LoadToX64(inst.RS, false, true); + X64Reg r = fpr.R(inst.RS).GetSimpleReg(); + MOVDDUP(r, M((void *)&m_dequantizeTableD[ldScale])); + MULPD(r, R(XMM0)); + if (update && offset != 0) + ADD(32, gpr.R(inst.RA), Imm32(offset)); + } + break; + case QUANTIZE_S16: + { + gpr.LoadToX64(inst.RA, true, update); +#ifdef _M_X64 + MOV(32, R(EAX), MComplex(RBX, gpr.R(inst.RA).GetSimpleReg(), 1, offset)); +#else + LEA(32, EAX, MDisp(gpr.R(inst.RA).GetSimpleReg(), offset)); + AND(32, R(EAX), Imm32(Memory::MEMVIEW32_MASK)); + MOV(32, R(EAX), MDisp(EAX, (u32)Memory::base)); +#endif + BSWAP(32, EAX); + MOV(32, M(&temp64), R(EAX)); + fpr.LoadToX64(inst.RS, false, true); + X64Reg r = fpr.R(inst.RS).GetSimpleReg(); + MOVD_xmm(XMM0, M(&temp64)); + PUNPCKLWD(XMM0, R(XMM0)); // unpack to higher word in each dword.. + PSRAD(XMM0, 16); // then use this signed shift to sign extend. clever eh? :P + CVTDQ2PD(XMM0, R(XMM0)); + MOVDDUP(r, M((void*)&m_dequantizeTableD[ldScale])); + MULPD(r, R(XMM0)); + SHUFPD(r, R(r), 1); + if (update && offset != 0) + ADD(32, gpr.R(inst.RA), Imm32(offset)); + } + break; + + /* + Dynamic quantizer. Todo when we have a test set. + MOVZX(32, 8, EAX, M(((char *)&PowerPC::ppcState.spr[SPR_GQR0 + inst.I]) + 3)); // it's in the high byte. + AND(32, R(EAX), Imm8(0x3F)); + MOV(32, R(ECX), Imm32((u32)&m_dequantizeTableD)); + MOVDDUP(r, MComplex(RCX, EAX, 8, 0)); + */ + default: + // 4 0 + // 6 0 //power tennis + // 5 0 + // PanicAlert("ld:%i %i", ldType, (int)inst.W); + Default(inst); + return; + } + + //u32 EA = (m_GPR[_inst.RA] + _inst.SIMM_12) : _inst.SIMM_12; +} diff --git a/Source/Core/Core/Src/PowerPC/Jit64IL/Jit_Paired.cpp b/Source/Core/Core/Src/PowerPC/Jit64IL/Jit_Paired.cpp new file mode 100644 index 0000000000..91ca5829e5 --- /dev/null +++ b/Source/Core/Core/Src/PowerPC/Jit64IL/Jit_Paired.cpp @@ -0,0 +1,407 @@ +// Copyright (C) 2003-2008 Dolphin Project. + +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, version 2.0. + +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License 2.0 for more details. + +// A copy of the GPL 2.0 should have been included with the program. 
+// If not, see http://www.gnu.org/licenses/ + +// Official SVN repository and contact information can be found at +// http://code.google.com/p/dolphin-emu/ + +#include "Common.h" + +#include "../../Core.h" +#include "../PowerPC.h" +#include "../PPCTables.h" +#include "x64Emitter.h" +#include "../../HW/GPFifo.h" + +#include "Jit.h" +#include "JitCache.h" +#include "JitRegCache.h" + +// TODO +// ps_madds0 +// ps_muls0 +// ps_madds1 +// ps_sel +// cmppd, andpd, andnpd, or +// lfsx, ps_merge01 etc + +// #define INSTRUCTION_START Default(inst); return; +#define INSTRUCTION_START + + const u64 GC_ALIGNED16(psSignBits[2]) = {0x8000000000000000ULL, 0x8000000000000000ULL}; + const u64 GC_ALIGNED16(psAbsMask[2]) = {0x7FFFFFFFFFFFFFFFULL, 0x7FFFFFFFFFFFFFFFULL}; + const double GC_ALIGNED16(psOneOne[2]) = {1.0, 1.0}; + const double GC_ALIGNED16(psZeroZero[2]) = {0.0, 0.0}; + + void Jit64::ps_mr(UGeckoInstruction inst) + { + if(Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITPairedOff) + {Default(inst); return;} // turn off from debugger + INSTRUCTION_START; + if (inst.Rc) { + Default(inst); return; + } + int d = inst.FD; + int b = inst.FB; + if (d == b) + return; + fpr.LoadToX64(d, false); + MOVAPD(fpr.RX(d), fpr.R(b)); + } + + void Jit64::ps_sel(UGeckoInstruction inst) + { + if(Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITPairedOff) + {Default(inst); return;} // turn off from debugger + INSTRUCTION_START; + Default(inst); + return; + + if (inst.Rc) { + Default(inst); return; + } + // GRR can't get this to work 100%. Getting artifacts in D.O.N. intro. + int d = inst.FD; + int a = inst.FA; + int b = inst.FB; + int c = inst.FC; + fpr.FlushLockX(XMM7); + fpr.FlushLockX(XMM6); + fpr.Lock(a, b, c, d); + fpr.LoadToX64(a, true, false); + fpr.LoadToX64(d, false, true); + // BLENDPD would have been nice... 
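+ // The intent is the usual SSE2 select idiom: CMPPD (predicate 1 =
+ // less-than) builds an all-ones mask per lane where a < 0, then the result
+ // is combined as (mask & x) | (~mask & y) with ANDPD/ANDNPD/ORPD - the job
+ // BLENDPD would do directly on SSE4.1.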
+ MOVAPD(XMM7, fpr.R(a)); + CMPPD(XMM7, M((void*)psZeroZero), 1); //less-than = 111111 + MOVAPD(XMM6, R(XMM7)); + ANDPD(XMM7, fpr.R(d)); + ANDNPD(XMM6, fpr.R(c)); + MOVAPD(fpr.RX(d), R(XMM7)); + ORPD(fpr.RX(d), R(XMM6)); + fpr.UnlockAll(); + fpr.UnlockAllX(); + } + + void Jit64::ps_sign(UGeckoInstruction inst) + { + if(Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITPairedOff) + {Default(inst); return;} // turn off from debugger + INSTRUCTION_START; + if (inst.Rc) { + Default(inst); return; + } + int d = inst.FD; + int b = inst.FB; + + fpr.Lock(d, b); + if (d != b) + { + fpr.LoadToX64(d, false); + MOVAPD(fpr.RX(d), fpr.R(b)); + } + else + { + fpr.LoadToX64(d, true); + } + + switch (inst.SUBOP10) + { + case 40: //neg + XORPD(fpr.RX(d), M((void*)&psSignBits)); + break; + case 136: //nabs + ORPD(fpr.RX(d), M((void*)&psSignBits)); + break; + case 264: //abs + ANDPD(fpr.RX(d), M((void*)&psAbsMask)); + break; + } + + fpr.UnlockAll(); + } + + void Jit64::ps_rsqrte(UGeckoInstruction inst) + { + if(Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITPairedOff) + {Default(inst); return;} // turn off from debugger + INSTRUCTION_START; + if (inst.Rc) { + Default(inst); return; + } + int d = inst.FD; + int b = inst.FB; + fpr.Lock(d, b); + SQRTPD(XMM0, fpr.R(b)); + MOVAPD(XMM1, M((void*)&psOneOne)); + DIVPD(XMM1, R(XMM0)); + MOVAPD(fpr.R(d), XMM1); + fpr.UnlockAll(); + } + + //add a, b, c + + //mov a, b + //add a, c + //we need: + /* + psq_l + psq_stu + */ + + /* + add a,b,a + */ + + //There's still a little bit more optimization that can be squeezed out of this + void Jit64::tri_op(int d, int a, int b, bool reversible, void (XEmitter::*op)(X64Reg, OpArg)) + { + fpr.Lock(d, a, b); + + if (d == a) + { + fpr.LoadToX64(d, true); + (this->*op)(fpr.RX(d), fpr.R(b)); + } + else if (d == b && reversible) + { + fpr.LoadToX64(d, true); + (this->*op)(fpr.RX(d), fpr.R(a)); + } + else if (a != d && b != d) + { + //sources different from d, can use rather quick solution + fpr.LoadToX64(d, false); + MOVAPD(fpr.RX(d), fpr.R(a)); + (this->*op)(fpr.RX(d), fpr.R(b)); + } + else if (b != d) + { + fpr.LoadToX64(d, false); + MOVAPD(XMM0, fpr.R(b)); + MOVAPD(fpr.RX(d), fpr.R(a)); + (this->*op)(fpr.RX(d), Gen::R(XMM0)); + } + else //Other combo, must use two temps :( + { + MOVAPD(XMM0, fpr.R(a)); + MOVAPD(XMM1, fpr.R(b)); + fpr.LoadToX64(d, false); + (this->*op)(XMM0, Gen::R(XMM1)); + MOVAPD(fpr.RX(d), Gen::R(XMM0)); + } + ForceSinglePrecisionP(fpr.RX(d)); + fpr.UnlockAll(); + } + + void Jit64::ps_arith(UGeckoInstruction inst) + { + if(Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITPairedOff) + {Default(inst); return;} // turn off from debugger + INSTRUCTION_START; + if (inst.Rc) { + Default(inst); return; + } + switch (inst.SUBOP5) + { + case 18: tri_op(inst.FD, inst.FA, inst.FB, false, &XEmitter::DIVPD); break; //div + case 20: tri_op(inst.FD, inst.FA, inst.FB, false, &XEmitter::SUBPD); break; //sub + case 21: tri_op(inst.FD, inst.FA, inst.FB, true, &XEmitter::ADDPD); break; //add + case 23://sel + Default(inst); + break; + case 24://res + Default(inst); + break; + case 25: tri_op(inst.FD, inst.FA, inst.FC, true, &XEmitter::MULPD); break; //mul + default: + _assert_msg_(DYNA_REC, 0, "ps_arith WTF!!!"); + } + } + + void Jit64::ps_sum(UGeckoInstruction inst) + { + if(Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITPairedOff) + {Default(inst); return;} // turn off from debugger + INSTRUCTION_START; + if (inst.Rc) { + Default(inst); 
return; + } + int d = inst.FD; + int a = inst.FA; + int b = inst.FB; + int c = inst.FC; + fpr.Lock(a,b,c,d); + fpr.LoadToX64(d, d == a || d == b || d == c, true); + switch (inst.SUBOP5) + { + case 10: + // Do the sum in upper subregisters, merge uppers + MOVDDUP(XMM0, fpr.R(a)); + MOVAPD(XMM1, fpr.R(b)); + ADDPD(XMM0, R(XMM1)); + UNPCKHPD(XMM0, fpr.R(c)); //merge + MOVAPD(fpr.R(d), XMM0); + break; + case 11: + // Do the sum in lower subregisters, merge lowers + MOVAPD(XMM0, fpr.R(a)); + MOVAPD(XMM1, fpr.R(b)); + SHUFPD(XMM1, R(XMM1), 5); // copy higher to lower + ADDPD(XMM0, R(XMM1)); // sum lowers + MOVAPD(XMM1, fpr.R(c)); + UNPCKLPD(XMM1, R(XMM0)); // merge + MOVAPD(fpr.R(d), XMM1); + break; + default: + PanicAlert("ps_sum WTF!!!"); + } + ForceSinglePrecisionP(fpr.RX(d)); + fpr.UnlockAll(); + } + + + void Jit64::ps_muls(UGeckoInstruction inst) + { + if(Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITPairedOff) + {Default(inst); return;} // turn off from debugger + INSTRUCTION_START; + if (inst.Rc) { + Default(inst); return; + } + int d = inst.FD; + int a = inst.FA; + int c = inst.FC; + fpr.Lock(a, c, d); + fpr.LoadToX64(d, d == a || d == c, true); + switch (inst.SUBOP5) + { + case 12: + // Single multiply scalar high + // TODO - faster version for when regs are different + MOVAPD(XMM0, fpr.R(a)); + MOVDDUP(XMM1, fpr.R(c)); + MULPD(XMM0, R(XMM1)); + MOVAPD(fpr.R(d), XMM0); + break; + case 13: + // TODO - faster version for when regs are different + MOVAPD(XMM0, fpr.R(a)); + MOVAPD(XMM1, fpr.R(c)); + SHUFPD(XMM1, R(XMM1), 3); // copy higher to lower + MULPD(XMM0, R(XMM1)); + MOVAPD(fpr.R(d), XMM0); + break; + default: + PanicAlert("ps_muls WTF!!!"); + } + ForceSinglePrecisionP(fpr.RX(d)); + fpr.UnlockAll(); + } + + + //TODO: find easy cases and optimize them, do a breakout like ps_arith + void Jit64::ps_mergeXX(UGeckoInstruction inst) + { + if(Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITPairedOff) + {Default(inst); return;} // turn off from debugger + INSTRUCTION_START; + if (inst.Rc) { + Default(inst); return; + } + int d = inst.FD; + int a = inst.FA; + int b = inst.FB; + fpr.Lock(a,b,d); + + MOVAPD(XMM0, fpr.R(a)); + switch (inst.SUBOP10) + { + case 528: + UNPCKLPD(XMM0, fpr.R(b)); //unpck is faster than shuf + break; //00 + case 560: + SHUFPD(XMM0, fpr.R(b), 2); //must use shuf here + break; //01 + case 592: + SHUFPD(XMM0, fpr.R(b), 1); + break; //10 + case 624: + UNPCKHPD(XMM0, fpr.R(b)); + break; //11 + default: + _assert_msg_(DYNA_REC, 0, "ps_merge - invalid op"); + } + fpr.LoadToX64(d, false); + MOVAPD(fpr.RX(d), Gen::R(XMM0)); + fpr.UnlockAll(); + } + + + //TODO: add optimized cases + void Jit64::ps_maddXX(UGeckoInstruction inst) + { + if(Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITPairedOff) + {Default(inst); return;} // turn off from debugger + INSTRUCTION_START; + if (inst.Rc) { + Default(inst); return; + } + int a = inst.FA; + int b = inst.FB; + int c = inst.FC; + int d = inst.FD; + fpr.Lock(a,b,c,d); + + MOVAPD(XMM0, fpr.R(a)); + switch (inst.SUBOP5) + { + case 14: //madds0 + MOVDDUP(XMM1, fpr.R(c)); + MULPD(XMM0, R(XMM1)); + ADDPD(XMM0, fpr.R(b)); + break; + case 15: //madds1 + MOVAPD(XMM1, fpr.R(c)); + SHUFPD(XMM1, R(XMM1), 3); // copy higher to lower + MULPD(XMM0, R(XMM1)); + ADDPD(XMM0, fpr.R(b)); + break; + case 28: //msub + MULPD(XMM0, fpr.R(c)); + SUBPD(XMM0, fpr.R(b)); + break; + case 29: //madd + MULPD(XMM0, fpr.R(c)); + ADDPD(XMM0, fpr.R(b)); + break; + case 30: //nmsub + MULPD(XMM0, 
fpr.R(c)); + SUBPD(XMM0, fpr.R(b)); + XORPD(XMM0, M((void*)&psSignBits)); + break; + case 31: //nmadd + MULPD(XMM0, fpr.R(c)); + ADDPD(XMM0, fpr.R(b)); + XORPD(XMM0, M((void*)&psSignBits)); + break; + default: + _assert_msg_(DYNA_REC, 0, "ps_maddXX WTF!!!"); + //Default(inst); + //fpr.UnlockAll(); + return; + } + fpr.LoadToX64(d, false); + MOVAPD(fpr.RX(d), Gen::R(XMM0)); + ForceSinglePrecisionP(fpr.RX(d)); + fpr.UnlockAll(); + } diff --git a/Source/Core/Core/Src/PowerPC/Jit64IL/Jit_SystemRegisters.cpp b/Source/Core/Core/Src/PowerPC/Jit64IL/Jit_SystemRegisters.cpp new file mode 100644 index 0000000000..10dec47e99 --- /dev/null +++ b/Source/Core/Core/Src/PowerPC/Jit64IL/Jit_SystemRegisters.cpp @@ -0,0 +1,149 @@ +// Copyright (C) 2003-2008 Dolphin Project. + +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, version 2.0. + +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License 2.0 for more details. + +// A copy of the GPL 2.0 should have been included with the program. +// If not, see http://www.gnu.org/licenses/ + +// Official SVN repository and contact information can be found at +// http://code.google.com/p/dolphin-emu/ + +#include "Common.h" + +#include "../../Core.h" +#include "../../CoreTiming.h" +#include "../../HW/SystemTimers.h" +#include "../PowerPC.h" +#include "../PPCTables.h" +#include "x64Emitter.h" +#include "ABI.h" +#include "Thunk.h" + +#include "Jit.h" +#include "JitCache.h" +#include "JitRegCache.h" + +#define INSTRUCTION_START +// #define INSTRUCTION_START Default(inst); return; + + void Jit64::mtspr(UGeckoInstruction inst) + { + u32 iIndex = (inst.SPRU << 5) | (inst.SPRL & 0x1F); + switch(iIndex) { + case SPR_LR: + ibuild.EmitStoreLink(ibuild.EmitLoadGReg(inst.RD)); + return; + case SPR_CTR: + ibuild.EmitStoreCTR(ibuild.EmitLoadGReg(inst.RD)); + return; + default: + printf("mtspr case %d", iIndex); + Default(inst); + return; + } + } + + void Jit64::mfspr(UGeckoInstruction inst) + { + u32 iIndex = (inst.SPRU << 5) | (inst.SPRL & 0x1F); + switch (iIndex) + { + case SPR_LR: + ibuild.EmitStoreGReg(ibuild.EmitLoadLink(), inst.RD); + return; + case SPR_CTR: + ibuild.EmitStoreGReg(ibuild.EmitLoadCTR(), inst.RD); + return; + default: + printf("mfspr case %d", iIndex); + Default(inst); + return; + } + } + + + // ======================================================================================= + // Don't interpret this, if we do we get thrown out + // -------------- + void Jit64::mtmsr(UGeckoInstruction inst) + { + ibuild.EmitStoreMSR(ibuild.EmitLoadGReg(inst.RS)); + ibuild.EmitBranchUncond(ibuild.EmitIntConst(js.compilerPC + 4)); + } + // ============== + + + void Jit64::mfmsr(UGeckoInstruction inst) + { + ibuild.EmitStoreGReg(ibuild.EmitLoadMSR(), inst.RD); + } + + void Jit64::mftb(UGeckoInstruction inst) + { + if(Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITSystemRegistersOff) + {Default(inst); return;} // turn off from debugger + INSTRUCTION_START; + mfspr(inst); + } + + void Jit64::mfcr(UGeckoInstruction inst) + { + if(Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITSystemRegistersOff) + {Default(inst); return;} // turn off from debugger + INSTRUCTION_START; + // USES_CR + int d = inst.RD; + gpr.LoadToX64(d, false, 
true); + MOV(8, R(EAX), M(&PowerPC::ppcState.cr_fast[0])); + SHL(32, R(EAX), Imm8(4)); + for (int i = 1; i < 7; i++) { + OR(8, R(EAX), M(&PowerPC::ppcState.cr_fast[i])); + SHL(32, R(EAX), Imm8(4)); + } + OR(8, R(EAX), M(&PowerPC::ppcState.cr_fast[7])); + MOV(32, gpr.R(d), R(EAX)); + } + + void Jit64::mtcrf(UGeckoInstruction inst) + { + if(Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITSystemRegistersOff) + {Default(inst); return;} // turn off from debugger + INSTRUCTION_START; + + // USES_CR + u32 mask = 0; + u32 crm = inst.CRM; + if (crm == 0xFF) { + gpr.FlushLockX(ECX); + MOV(32, R(EAX), gpr.R(inst.RS)); + for (int i = 0; i < 8; i++) { + MOV(32, R(ECX), R(EAX)); + SHR(32, R(ECX), Imm8(28 - (i * 4))); + AND(32, R(ECX), Imm32(0xF)); + MOV(8, M(&PowerPC::ppcState.cr_fast[i]), R(ECX)); + } + gpr.UnlockAllX(); + } else { + Default(inst); + return; + + // TODO: translate this to work in new CR model. + for (int i = 0; i < 8; i++) { + if (crm & (1 << i)) + mask |= 0xF << (i*4); + } + MOV(32, R(EAX), gpr.R(inst.RS)); + MOV(32, R(ECX), M(&PowerPC::ppcState.cr)); + AND(32, R(EAX), Imm32(mask)); + AND(32, R(ECX), Imm32(~mask)); + OR(32, R(EAX), R(ECX)); + MOV(32, M(&PowerPC::ppcState.cr), R(EAX)); + } + } diff --git a/Source/Core/Core/Src/PowerPC/Jit64IL/Jit_Util.cpp b/Source/Core/Core/Src/PowerPC/Jit64IL/Jit_Util.cpp new file mode 100644 index 0000000000..20d5caddbf --- /dev/null +++ b/Source/Core/Core/Src/PowerPC/Jit64IL/Jit_Util.cpp @@ -0,0 +1,161 @@ +// Copyright (C) 2003-2008 Dolphin Project. + +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, version 2.0. + +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License 2.0 for more details. + +// A copy of the GPL 2.0 should have been included with the program. +// If not, see http://www.gnu.org/licenses/ + +// Official SVN repository and contact information can be found at +// http://code.google.com/p/dolphin-emu/ + +#include "Common.h" +#include "Thunk.h" + +#include "../PowerPC.h" +#include "../../Core.h" +#include "../../HW/GPFifo.h" +#include "../../HW/CommandProcessor.h" +#include "../../HW/PixelEngine.h" +#include "../../HW/Memmap.h" +#include "../PPCTables.h" +#include "x64Emitter.h" +#include "ABI.h" + +#include "Jit.h" +#include "JitCache.h" +#include "JitAsm.h" +#include "JitRegCache.h" + +void Jit64::JitClearCA() +{ + AND(32, M(&PowerPC::ppcState.spr[SPR_XER]), Imm32(~XER_CA_MASK)); //XER.CA = 0 +} + +void Jit64::JitSetCA() +{ + OR(32, M(&PowerPC::ppcState.spr[SPR_XER]), Imm32(XER_CA_MASK)); //XER.CA = 1 +} + +void Jit64::UnsafeLoadRegToReg(X64Reg reg_addr, X64Reg reg_value, int accessSize, s32 offset, bool signExtend) +{ +#ifdef _M_IX86 + AND(32, R(reg_addr), Imm32(Memory::MEMVIEW32_MASK)); + MOVZX(32, accessSize, reg_value, MDisp(reg_addr, (u32)Memory::base + offset)); +#else + MOVZX(32, accessSize, reg_value, MComplex(RBX, reg_addr, SCALE_1, offset)); +#endif + if (accessSize == 32) + { + BSWAP(32, reg_value); + } + else if (accessSize == 16) + { + BSWAP(32, reg_value); + if (signExtend) + SAR(32, R(reg_value), Imm8(16)); + else + SHR(32, R(reg_value), Imm8(16)); + } else if (signExtend) { + // TODO: bake 8-bit into the original load. 
+ MOVSX(32, accessSize, reg_value, R(reg_value)); + } +} + +void Jit64::SafeLoadRegToEAX(X64Reg reg, int accessSize, s32 offset, bool signExtend) +{ + if (offset) + ADD(32, R(reg), Imm32((u32)offset)); + TEST(32, R(reg), Imm32(0x0C000000)); + FixupBranch argh = J_CC(CC_Z); + switch (accessSize) + { + case 32: ABI_CallFunctionR(thunks.ProtectFunction((void *)&Memory::Read_U32, 1), reg); break; + case 16: ABI_CallFunctionR(thunks.ProtectFunction((void *)&Memory::Read_U16, 1), reg); break; + case 8: ABI_CallFunctionR(thunks.ProtectFunction((void *)&Memory::Read_U8, 1), reg); break; + } + if (signExtend && accessSize < 32) { + // Need to sign extend values coming from the Read_U* functions. + MOVSX(32, accessSize, EAX, R(EAX)); + } + FixupBranch arg2 = J(); + SetJumpTarget(argh); + UnsafeLoadRegToReg(reg, EAX, accessSize, 0, signExtend); + SetJumpTarget(arg2); +} + +void Jit64::UnsafeWriteRegToReg(X64Reg reg_value, X64Reg reg_addr, int accessSize, s32 offset) +{ + if (accessSize == 8 && reg_value >= 4) { + PanicAlert("WARNING: likely incorrect use of UnsafeWriteRegToReg!"); + } + BSWAP(accessSize, reg_value); +#ifdef _M_IX86 + AND(32, R(reg_addr), Imm32(Memory::MEMVIEW32_MASK)); + MOV(accessSize, MDisp(reg_addr, (u32)Memory::base + offset), R(reg_value)); +#else + MOV(accessSize, MComplex(RBX, reg_addr, SCALE_1, offset), R(reg_value)); +#endif +} + +// Destroys both arg registers +void Jit64::SafeWriteRegToReg(X64Reg reg_value, X64Reg reg_addr, int accessSize, s32 offset) +{ + if (offset) + ADD(32, R(reg_addr), Imm32(offset)); + TEST(32, R(reg_addr), Imm32(0x0C000000)); + FixupBranch argh = J_CC(CC_Z); + switch (accessSize) + { + case 32: ABI_CallFunctionRR(thunks.ProtectFunction((void *)&Memory::Write_U32, 2), reg_value, reg_addr); break; + case 16: ABI_CallFunctionRR(thunks.ProtectFunction((void *)&Memory::Write_U16, 2), reg_value, reg_addr); break; + case 8: ABI_CallFunctionRR(thunks.ProtectFunction((void *)&Memory::Write_U8, 2), reg_value, reg_addr); break; + } + FixupBranch arg2 = J(); + SetJumpTarget(argh); + UnsafeWriteRegToReg(reg_value, reg_addr, accessSize, 0); + SetJumpTarget(arg2); +} + +void Jit64::WriteToConstRamAddress(int accessSize, const Gen::OpArg& arg, u32 address) +{ +#ifdef _M_X64 + MOV(accessSize, MDisp(RBX, address & 0x3FFFFFFF), arg); +#else + MOV(accessSize, M((void*)(Memory::base + (address & Memory::MEMVIEW32_MASK))), arg); +#endif +} + +void Jit64::WriteFloatToConstRamAddress(const Gen::X64Reg& xmm_reg, u32 address) +{ +#ifdef _M_X64 + MOV(32, R(RAX), Imm32(address)); + MOVSS(MComplex(RBX, RAX, 1, 0), xmm_reg); +#else + MOVSS(M((void*)((u32)Memory::base + (address & Memory::MEMVIEW32_MASK))), xmm_reg); +#endif +} + +void Jit64::ForceSinglePrecisionS(X64Reg xmm) { + // Most games don't need these. Zelda requires it though - some platforms get stuck without them. + if (jo.accurateSinglePrecision) + { + CVTSD2SS(xmm, R(xmm)); + CVTSS2SD(xmm, R(xmm)); + } +} + +void Jit64::ForceSinglePrecisionP(X64Reg xmm) { + // Most games don't need these. Zelda requires it though - some platforms get stuck without them. + if (jo.accurateSinglePrecision) + { + CVTPD2PS(xmm, R(xmm)); + CVTPS2PD(xmm, R(xmm)); + } +}
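
A note on the condition-register handling in Jit_SystemRegisters.cpp: the mfcr loop packs the eight 4-bit cr_fast fields back into the architectural 32-bit CR by repeatedly shifting and OR-ing, and the mtcrf path for CRM == 0xFF does the inverse. A minimal C++ sketch of the equivalent computation, assuming a cr_fast[8] array laid out like PowerPC::ppcState.cr_fast (this is illustrative only, not the emitted code):

#include <cstdint>

// Packs eight 4-bit CR fields (cr0 ends up in the highest nibble) into one
// 32-bit value, mirroring the SHL/OR sequence emitted by mfcr.
uint32_t PackCR(const uint8_t cr_fast[8])
{
    uint32_t cr = cr_fast[0] & 0xF;
    for (int i = 1; i < 8; i++)
        cr = (cr << 4) | (cr_fast[i] & 0xF);
    return cr;
}

// Inverse operation, mirroring the mtcrf (CRM == 0xFF) loop: each field is
// the corresponding nibble of the source GPR, highest nibble first.
void UnpackCR(uint32_t value, uint8_t cr_fast[8])
{
    for (int i = 0; i < 8; i++)
        cr_fast[i] = (value >> (28 - i * 4)) & 0xF;
}

Only the low nibble of each cr_fast entry carries state, which is why the emitted code can OR 8-bit values straight from memory and let the shifts push everything into place.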
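Similarly, the SafeLoadRegToEAX / UnsafeLoadRegToReg pair in Jit_Util.cpp can be summarized as plain C++. The sketch below only describes the logic the emitted x86 performs; ppc_read_slow, MEM_BASE, and MEMVIEW32_MASK are stand-in names (the real slow path goes through the thunked Memory::Read_U* helpers, and the fast path indexes off RBX or Memory::base):

#include <cstdint>

extern uint8_t* MEM_BASE;                    // stand-in for Memory::base
const uint32_t MEMVIEW32_MASK = 0x3FFFFFFF;  // stand-in for Memory::MEMVIEW32_MASK

// Stand-in for the thunked Memory::Read_U8/U16/U32 calls (MMIO-capable path).
uint32_t ppc_read_slow(uint32_t addr, int accessSize);

static uint32_t bswap32(uint32_t v)
{
    return (v >> 24) | ((v >> 8) & 0xFF00) | ((v << 8) & 0xFF0000) | (v << 24);
}

uint32_t ppc_read(uint32_t addr, int accessSize, bool signExtend)
{
    if (addr & 0x0C000000)
    {
        // Slow path: addresses that may hit MMIO or otherwise need the full
        // memory handlers. Read_U* returns zero-extended, so sign-extend here.
        uint32_t val = ppc_read_slow(addr, accessSize);
        if (signExtend && accessSize == 16) val = (uint32_t)(int16_t)val;
        if (signExtend && accessSize == 8)  val = (uint32_t)(int8_t)val;
        return val;
    }
    // Fast path: read straight out of the mapped view and byteswap from
    // big-endian, as UnsafeLoadRegToReg does.
    const uint8_t* p = MEM_BASE + (addr & MEMVIEW32_MASK);
    if (accessSize == 32)
    {
        uint32_t raw = (uint32_t)p[0] | ((uint32_t)p[1] << 8) |
                       ((uint32_t)p[2] << 16) | ((uint32_t)p[3] << 24);
        return bswap32(raw);
    }
    if (accessSize == 16)
    {
        // Mirrors the MOVZX(16) + BSWAP(32) + SHR/SAR(16) sequence.
        uint32_t raw = (uint32_t)p[0] | ((uint32_t)p[1] << 8);
        raw = bswap32(raw);
        return signExtend ? (uint32_t)((int32_t)raw >> 16) : (raw >> 16);
    }
    // 8-bit load: no swap needed, optionally sign-extend.
    uint32_t val = p[0];
    return signExtend ? (uint32_t)(int8_t)val : val;
}

Stores follow the same split: SafeWriteRegToReg tests the same 0x0C000000 bits and either calls the thunked Memory::Write_U* functions or byteswaps the value and writes it directly into the mapped view.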