Add code for frsqrtex to the JIT. Disable the table-based implementation in the interpreter until we find something that it actually fixes; so far it only seems to break things.
Assorted cleanup around the JIT of flags that we don't need anymore. git-svn-id: https://dolphin-emu.googlecode.com/svn/trunk@4867 8ced0084-cf51-0410-be5f-012b33b47a6e
This commit is contained in:
parent
936664314f
commit
f50e3cf5fe
|
@ -405,6 +405,8 @@ void fresx(UGeckoInstruction _inst)
|
||||||
if (_inst.Rc) Helper_UpdateCR1(rPS0(_inst.FD));
|
if (_inst.Rc) Helper_UpdateCR1(rPS0(_inst.FD));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// #define USE_ACCURATE_FRSQRTEX
|
||||||
|
|
||||||
void frsqrtex(UGeckoInstruction _inst)
|
void frsqrtex(UGeckoInstruction _inst)
|
||||||
{
|
{
|
||||||
double b = rPS0(_inst.FB);
|
double b = rPS0(_inst.FB);
|
||||||
|
@ -415,6 +417,7 @@ void frsqrtex(UGeckoInstruction _inst)
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
|
#ifdef USE_ACCURATE_FRSQRTEX
|
||||||
if (b == 0.0) {
|
if (b == 0.0) {
|
||||||
SetFPException(FPSCR_ZX);
|
SetFPException(FPSCR_ZX);
|
||||||
riPS0(_inst.FD) = 0x7ff0000000000000;
|
riPS0(_inst.FD) = 0x7ff0000000000000;
|
||||||
|
@ -436,6 +439,11 @@ void frsqrtex(UGeckoInstruction _inst)
|
||||||
outa |= frsqrtex_lut[idx] >> 12;
|
outa |= frsqrtex_lut[idx] >> 12;
|
||||||
riPS0(_inst.FD) = ((u64)outa << 32) + (u64)outb;
|
riPS0(_inst.FD) = ((u64)outa << 32) + (u64)outb;
|
||||||
}
|
}
|
||||||
|
#else
|
||||||
|
if (b == 0.0)
|
||||||
|
SetFPException(FPSCR_ZX);
|
||||||
|
rPS0(_inst.FD) = ForceDouble(1.0 / sqrt(b));
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
UpdateFPRF(rPS0(_inst.FD));
|
UpdateFPRF(rPS0(_inst.FD));
|
||||||
if (_inst.Rc) Helper_UpdateCR1(rPS0(_inst.FD));
|
if (_inst.Rc) Helper_UpdateCR1(rPS0(_inst.FD));
|
||||||
|
|
|
@ -443,7 +443,6 @@ const u8* Jit64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitBloc
|
||||||
js.blockStart = em_address;
|
js.blockStart = em_address;
|
||||||
js.fifoBytesThisBlock = 0;
|
js.fifoBytesThisBlock = 0;
|
||||||
js.curBlock = b;
|
js.curBlock = b;
|
||||||
js.blockSetsQuantizers = false;
|
|
||||||
js.block_flags = 0;
|
js.block_flags = 0;
|
||||||
js.cancel = false;
|
js.cancel = false;
|
||||||
|
|
||||||
|
|
|
@ -110,7 +110,6 @@ private:
|
||||||
int block_flags;
|
int block_flags;
|
||||||
|
|
||||||
bool isLastInstruction;
|
bool isLastInstruction;
|
||||||
bool blockSetsQuantizers;
|
|
||||||
|
|
||||||
int fifoBytesThisBlock;
|
int fifoBytesThisBlock;
|
||||||
|
|
||||||
|
@ -247,6 +246,7 @@ public:
|
||||||
void ps_muls(UGeckoInstruction inst);
|
void ps_muls(UGeckoInstruction inst);
|
||||||
|
|
||||||
void fp_arith_s(UGeckoInstruction inst);
|
void fp_arith_s(UGeckoInstruction inst);
|
||||||
|
void frsqrtex(UGeckoInstruction inst);
|
||||||
|
|
||||||
void fcmpx(UGeckoInstruction inst);
|
void fcmpx(UGeckoInstruction inst);
|
||||||
void fmrx(UGeckoInstruction inst);
|
void fmrx(UGeckoInstruction inst);
|
||||||
|
|
|
@ -71,6 +71,9 @@ void Jit64::fp_tri_op(int d, int a, int b, bool reversible, bool dupe, void (XEm
|
||||||
fpr.UnlockAll();
|
fpr.UnlockAll();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
static const double one_const = 1.0f;
|
||||||
|
|
||||||
void Jit64::fp_arith_s(UGeckoInstruction inst)
|
void Jit64::fp_arith_s(UGeckoInstruction inst)
|
||||||
{
|
{
|
||||||
INSTRUCTION_START
|
INSTRUCTION_START
|
||||||
|
@ -79,6 +82,20 @@ void Jit64::fp_arith_s(UGeckoInstruction inst)
|
||||||
Default(inst); return;
|
Default(inst); return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (inst.SUBOP5 == 26) {
|
||||||
|
// frsqrtex
|
||||||
|
int d = inst.FD;
|
||||||
|
int b = inst.FB;
|
||||||
|
fpr.Lock(b, d);
|
||||||
|
fpr.LoadToX64(d, true, true);
|
||||||
|
MOVSD(XMM0, M((void *)&one_const));
|
||||||
|
SQRTSD(XMM1, fpr.R(b));
|
||||||
|
DIVSD(XMM0, R(XMM1));
|
||||||
|
MOVSD(fpr.R(d), XMM0);
|
||||||
|
fpr.UnlockAll();
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
if (inst.SUBOP5 != 18 && inst.SUBOP5 != 20 && inst.SUBOP5 != 21 &&
|
if (inst.SUBOP5 != 18 && inst.SUBOP5 != 20 && inst.SUBOP5 != 21 &&
|
||||||
inst.SUBOP5 != 25) {
|
inst.SUBOP5 != 25) {
|
||||||
Default(inst); return;
|
Default(inst); return;
|
||||||
|
@ -253,3 +270,5 @@ void Jit64::fcmpx(UGeckoInstruction inst)
|
||||||
SetJumpTarget(continue3);
|
SetJumpTarget(continue3);
|
||||||
fpr.UnlockAll();
|
fpr.UnlockAll();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -275,9 +275,7 @@ void Jit64::stfs(UGeckoInstruction inst)
|
||||||
MOV(32, gpr.R(a), R(ABI_PARAM2));
|
MOV(32, gpr.R(a), R(ABI_PARAM2));
|
||||||
}
|
}
|
||||||
CVTSD2SS(XMM0, fpr.R(s));
|
CVTSD2SS(XMM0, fpr.R(s));
|
||||||
MOVSS(M(&temp32), XMM0);
|
SafeWriteFloatToReg(XMM0, ABI_PARAM2);
|
||||||
MOV(32, R(ABI_PARAM1), M(&temp32));
|
|
||||||
SafeWriteRegToReg(ABI_PARAM1, ABI_PARAM2, 32, 0);
|
|
||||||
gpr.UnlockAll();
|
gpr.UnlockAll();
|
||||||
gpr.UnlockAllX();
|
gpr.UnlockAllX();
|
||||||
fpr.UnlockAll();
|
fpr.UnlockAll();
|
||||||
|
|
|
@ -51,9 +51,8 @@ void Jit64::psq_st(UGeckoInstruction inst)
|
||||||
{
|
{
|
||||||
INSTRUCTION_START
|
INSTRUCTION_START
|
||||||
JITDISABLE(LoadStorePaired)
|
JITDISABLE(LoadStorePaired)
|
||||||
js.block_flags |= BLOCK_USE_GQR0 << inst.I;
|
|
||||||
|
|
||||||
if (js.blockSetsQuantizers || !inst.RA)
|
if (!inst.RA)
|
||||||
{
|
{
|
||||||
// TODO: Support these cases if it becomes necessary.
|
// TODO: Support these cases if it becomes necessary.
|
||||||
Default(inst);
|
Default(inst);
|
||||||
|
@ -105,12 +104,13 @@ void Jit64::psq_st(UGeckoInstruction inst)
|
||||||
MOV(32, gpr.R(a), R(ECX));
|
MOV(32, gpr.R(a), R(ECX));
|
||||||
MOVZX(32, 16, EAX, M(&PowerPC::ppcState.spr[SPR_GQR0 + inst.I]));
|
MOVZX(32, 16, EAX, M(&PowerPC::ppcState.spr[SPR_GQR0 + inst.I]));
|
||||||
MOVZX(32, 8, EDX, R(AL));
|
MOVZX(32, 8, EDX, R(AL));
|
||||||
// FIXME: Fix ModR/M encoding to allow [EDX*4+disp32]!
|
// FIXME: Fix ModR/M encoding to allow [EDX*4+disp32] without a base register!
|
||||||
#ifdef _M_IX86
|
#ifdef _M_IX86
|
||||||
SHL(32, R(EDX), Imm8(2));
|
int addr_shift = 2;
|
||||||
#else
|
#else
|
||||||
SHL(32, R(EDX), Imm8(3));
|
int addr_shift = 3;
|
||||||
#endif
|
#endif
|
||||||
|
SHL(32, R(EDX), Imm8(addr_shift));
|
||||||
if (inst.W) {
|
if (inst.W) {
|
||||||
// One value
|
// One value
|
||||||
XORPS(XMM0, R(XMM0)); // TODO: See if we can get rid of this cheaply by tweaking the code in the singleStore* functions.
|
XORPS(XMM0, R(XMM0)); // TODO: See if we can get rid of this cheaply by tweaking the code in the singleStore* functions.
|
||||||
|
@ -130,14 +130,20 @@ void Jit64::psq_l(UGeckoInstruction inst)
|
||||||
INSTRUCTION_START
|
INSTRUCTION_START
|
||||||
JITDISABLE(LoadStorePaired)
|
JITDISABLE(LoadStorePaired)
|
||||||
|
|
||||||
js.block_flags |= BLOCK_USE_GQR0 << inst.I;
|
if (!inst.RA)
|
||||||
|
|
||||||
if (js.blockSetsQuantizers || !inst.RA || inst.W)
|
|
||||||
{
|
{
|
||||||
Default(inst);
|
Default(inst);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const UGQR gqr(rSPR(SPR_GQR0 + inst.I));
|
||||||
|
|
||||||
|
if (inst.W) {
|
||||||
|
// PanicAlert("Single ps load: %i %i", gqr.ST_TYPE, gqr.ST_SCALE);
|
||||||
|
Default(inst);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
bool update = inst.OPCD == 57;
|
bool update = inst.OPCD == 57;
|
||||||
int offset = inst.SIMM_12;
|
int offset = inst.SIMM_12;
|
||||||
|
|
||||||
|
|
|
@ -52,7 +52,6 @@ void Jit64::mtspr(UGeckoInstruction inst)
|
||||||
case SPR_GQR0 + 5:
|
case SPR_GQR0 + 5:
|
||||||
case SPR_GQR0 + 6:
|
case SPR_GQR0 + 6:
|
||||||
case SPR_GQR0 + 7:
|
case SPR_GQR0 + 7:
|
||||||
js.blockSetsQuantizers = true;
|
|
||||||
// Prevent recompiler from compiling in old quantizer values.
|
// Prevent recompiler from compiling in old quantizer values.
|
||||||
// If the value changed, destroy all blocks using this quantizer
|
// If the value changed, destroy all blocks using this quantizer
|
||||||
// This will create a little bit of block churn, but hopefully not too bad.
|
// This will create a little bit of block churn, but hopefully not too bad.
|
||||||
|
|
|
@ -397,8 +397,6 @@ const u8* Jit64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buffer, JitB
|
||||||
js.blockStart = em_address;
|
js.blockStart = em_address;
|
||||||
js.fifoBytesThisBlock = 0;
|
js.fifoBytesThisBlock = 0;
|
||||||
js.curBlock = b;
|
js.curBlock = b;
|
||||||
js.blockSetsQuantizers = false;
|
|
||||||
js.block_flags = 0;
|
|
||||||
js.cancel = false;
|
js.cancel = false;
|
||||||
|
|
||||||
//Analyze the block, collect all instructions it is made of (including inlining,
|
//Analyze the block, collect all instructions it is made of (including inlining,
|
||||||
|
@ -464,7 +462,6 @@ const u8* Jit64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buffer, JitB
|
||||||
// Perform actual code generation
|
// Perform actual code generation
|
||||||
WriteCode();
|
WriteCode();
|
||||||
|
|
||||||
b->flags = js.block_flags;
|
|
||||||
b->codeSize = (u32)(GetCodePtr() - normalEntry);
|
b->codeSize = (u32)(GetCodePtr() - normalEntry);
|
||||||
b->originalSize = size;
|
b->originalSize = size;
|
||||||
return normalEntry;
|
return normalEntry;
|
||||||
|
|
|
@ -98,10 +98,8 @@ private:
|
||||||
UGeckoInstruction next_inst; // for easy peephole opt.
|
UGeckoInstruction next_inst; // for easy peephole opt.
|
||||||
int instructionNumber;
|
int instructionNumber;
|
||||||
int downcountAmount;
|
int downcountAmount;
|
||||||
int block_flags;
|
|
||||||
|
|
||||||
bool isLastInstruction;
|
bool isLastInstruction;
|
||||||
bool blockSetsQuantizers;
|
|
||||||
bool forceUnsafeLoad;
|
bool forceUnsafeLoad;
|
||||||
|
|
||||||
int fifoBytesThisBlock;
|
int fifoBytesThisBlock;
|
||||||
|
|
|
@ -299,6 +299,9 @@ void CommonAsmRoutines::GenQuantizedSingleStores() {
|
||||||
|
|
||||||
// Easy!
|
// Easy!
|
||||||
const u8* storeSingleFloat = AlignCode4();
|
const u8* storeSingleFloat = AlignCode4();
|
||||||
|
SafeWriteFloatToReg(XMM0, ECX);
|
||||||
|
RET();
|
||||||
|
/*
|
||||||
if (cpu_info.bSSSE3) {
|
if (cpu_info.bSSSE3) {
|
||||||
PSHUFB(XMM0, M((void *)pbswapShuffle2x4));
|
PSHUFB(XMM0, M((void *)pbswapShuffle2x4));
|
||||||
// TODO: SafeWriteFloat
|
// TODO: SafeWriteFloat
|
||||||
|
@ -309,8 +312,7 @@ void CommonAsmRoutines::GenQuantizedSingleStores() {
|
||||||
MOVSS(M(&psTemp[0]), XMM0);
|
MOVSS(M(&psTemp[0]), XMM0);
|
||||||
MOV(32, R(EAX), M(&psTemp[0]));
|
MOV(32, R(EAX), M(&psTemp[0]));
|
||||||
SafeWriteRegToReg(EAX, ECX, 32, 0, true);
|
SafeWriteRegToReg(EAX, ECX, 32, 0, true);
|
||||||
}
|
}*/
|
||||||
RET();
|
|
||||||
|
|
||||||
const u8* storeSingleU8 = AlignCode4(); // Used by MKWii
|
const u8* storeSingleU8 = AlignCode4(); // Used by MKWii
|
||||||
SHR(32, R(EAX), Imm8(6));
|
SHR(32, R(EAX), Imm8(6));
|
||||||
|
@ -336,8 +338,7 @@ void CommonAsmRoutines::GenQuantizedSingleStores() {
|
||||||
const u8* storeSingleU16 = AlignCode4(); // Used by MKWii
|
const u8* storeSingleU16 = AlignCode4(); // Used by MKWii
|
||||||
SHR(32, R(EAX), Imm8(6));
|
SHR(32, R(EAX), Imm8(6));
|
||||||
MOVSS(XMM1, MDisp(EAX, (u32)(u64)m_quantizeTableS));
|
MOVSS(XMM1, MDisp(EAX, (u32)(u64)m_quantizeTableS));
|
||||||
PUNPCKLDQ(XMM1, R(XMM1));
|
MULSS(XMM0, R(XMM1));
|
||||||
MULPS(XMM0, R(XMM1));
|
|
||||||
PXOR(XMM1, R(XMM1));
|
PXOR(XMM1, R(XMM1));
|
||||||
MAXSS(XMM0, R(XMM1));
|
MAXSS(XMM0, R(XMM1));
|
||||||
MINSS(XMM0, M((void *)&m_65535));
|
MINSS(XMM0, M((void *)&m_65535));
|
||||||
|
|
|
@ -42,13 +42,6 @@
|
||||||
#define JIT_ICACHE_INVALID_BYTE 0x14
|
#define JIT_ICACHE_INVALID_BYTE 0x14
|
||||||
#define JIT_ICACHE_INVALID_WORD 0x14141414
|
#define JIT_ICACHE_INVALID_WORD 0x14141414
|
||||||
|
|
||||||
|
|
||||||
enum BlockFlag
|
|
||||||
{
|
|
||||||
BLOCK_USE_GQR0 = 0x1, BLOCK_USE_GQR1 = 0x2, BLOCK_USE_GQR2 = 0x4, BLOCK_USE_GQR3 = 0x8,
|
|
||||||
BLOCK_USE_GQR4 = 0x10, BLOCK_USE_GQR5 = 0x20, BLOCK_USE_GQR6 = 0x40, BLOCK_USE_GQR7 = 0x80,
|
|
||||||
};
|
|
||||||
|
|
||||||
// TODO(ector) - optimize this struct for size
|
// TODO(ector) - optimize this struct for size
|
||||||
struct JitBlock
|
struct JitBlock
|
||||||
{
|
{
|
||||||
|
|
|
@ -18,6 +18,7 @@
|
||||||
#include "Common.h"
|
#include "Common.h"
|
||||||
#include "Thunk.h"
|
#include "Thunk.h"
|
||||||
|
|
||||||
|
#include "CPUDetect.h"
|
||||||
#include "../PowerPC.h"
|
#include "../PowerPC.h"
|
||||||
#include "../../Core.h"
|
#include "../../Core.h"
|
||||||
#include "../../HW/GPFifo.h"
|
#include "../../HW/GPFifo.h"
|
||||||
|
@ -139,6 +140,36 @@ void EmuCodeBlock::SafeWriteRegToReg(X64Reg reg_value, X64Reg reg_addr, int acce
|
||||||
SetJumpTarget(arg2);
|
SetJumpTarget(arg2);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static const u8 GC_ALIGNED16(pbswapShuffle1x4[16]) = {3, 2, 1, 0, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
|
||||||
|
static u32 GC_ALIGNED16(float_buffer);
|
||||||
|
|
||||||
|
// Emits x86 code that writes the scalar single-precision value in xmm_value
// to the address held in reg_addr.
//
// The SSSE3 branch below is compiled out by the `false &&` guard, so the code
// actually generated is always the fallback: MOVSS the value to the static
// float_buffer, reload it into EAX as a 32-bit integer, and pass it to
// SafeWriteRegToReg(EAX, reg_addr, 32, 0, true). EAX is therefore clobbered.
//
// NOTE(review): the TEST against 0x0C000000 is emitted unconditionally, but
// the only visible consumer of its flags is the J_CC(CC_Z) inside the disabled
// branch; in the fallback it looks redundant -- confirm before removing.
// NOTE(review): the disabled branch BSWAPs EAX before calling
// Memory::Write_U32. If Write_U32 already performs the byte swap, this would
// swap twice, which may be the "errors" the original comment mentions --
// TODO confirm against Memory::Write_U32.
void EmuCodeBlock::SafeWriteFloatToReg(X64Reg xmm_value, X64Reg reg_addr)
|
||||||
|
{
|
||||||
|
TEST(32, R(reg_addr), Imm32(0x0C000000));
|
||||||
|
if (false && cpu_info.bSSSE3) {
|
||||||
|
// This path should be faster but for some reason it causes errors so I've disabled it.
|
||||||
|
FixupBranch argh = J_CC(CC_Z);
|
||||||
|
MOVSS(M(&float_buffer), xmm_value);
|
||||||
|
MOV(32, R(EAX), M(&float_buffer));
|
||||||
|
BSWAP(32, EAX);
|
||||||
|
ABI_CallFunctionRR(thunks.ProtectFunction(((void *)&Memory::Write_U32), 2), EAX, reg_addr);
|
||||||
|
FixupBranch arg2 = J();
|
||||||
|
SetJumpTarget(argh);
|
||||||
|
PSHUFB(xmm_value, M((void *)pbswapShuffle1x4));
|
||||||
|
#ifdef _M_IX86
|
||||||
|
AND(32, R(reg_addr), Imm32(Memory::MEMVIEW32_MASK));
|
||||||
|
MOVD_xmm(MDisp(reg_addr, (u32)Memory::base), xmm_value);
|
||||||
|
#else
|
||||||
|
MOVD_xmm(MComplex(RBX, reg_addr, SCALE_1, 0), xmm_value);
|
||||||
|
#endif
|
||||||
|
SetJumpTarget(arg2);
|
||||||
|
} else {
|
||||||
|
MOVSS(M(&float_buffer), xmm_value);
|
||||||
|
MOV(32, R(EAX), M(&float_buffer));
|
||||||
|
SafeWriteRegToReg(EAX, reg_addr, 32, 0, true);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
void EmuCodeBlock::WriteToConstRamAddress(int accessSize, const Gen::OpArg& arg, u32 address)
|
void EmuCodeBlock::WriteToConstRamAddress(int accessSize, const Gen::OpArg& arg, u32 address)
|
||||||
{
|
{
|
||||||
#ifdef _M_X64
|
#ifdef _M_X64
|
||||||
|
|
|
@ -29,6 +29,9 @@ public:
|
||||||
void SafeLoadRegToEAX(Gen::X64Reg reg, int accessSize, s32 offset, bool signExtend = false);
|
void SafeLoadRegToEAX(Gen::X64Reg reg, int accessSize, s32 offset, bool signExtend = false);
|
||||||
void SafeWriteRegToReg(Gen::X64Reg reg_value, Gen::X64Reg reg_addr, int accessSize, s32 offset, bool swap = true);
|
void SafeWriteRegToReg(Gen::X64Reg reg_value, Gen::X64Reg reg_addr, int accessSize, s32 offset, bool swap = true);
|
||||||
|
|
||||||
|
// Trashes both inputs and EAX.
|
||||||
|
void SafeWriteFloatToReg(Gen::X64Reg xmm_value, Gen::X64Reg reg_addr);
|
||||||
|
|
||||||
void WriteToConstRamAddress(int accessSize, const Gen::OpArg& arg, u32 address);
|
void WriteToConstRamAddress(int accessSize, const Gen::OpArg& arg, u32 address);
|
||||||
void WriteFloatToConstRamAddress(const Gen::X64Reg& xmm_reg, u32 address);
|
void WriteFloatToConstRamAddress(const Gen::X64Reg& xmm_reg, u32 address);
|
||||||
void JitClearCA();
|
void JitClearCA();
|
||||||
|
|
Loading…
Reference in New Issue