Add code for frsqrtex to the JIT. Disable the table-based implementation in the interpreter until we find something that it actually fixes; so far it only seems to break things.

Assorted cleanup of JIT flags that we no longer need.

git-svn-id: https://dolphin-emu.googlecode.com/svn/trunk@4867 8ced0084-cf51-0410-be5f-012b33b47a6e
This commit is contained in:
hrydgard 2010-01-17 11:47:35 +00:00
parent 936664314f
commit f50e3cf5fe
13 changed files with 83 additions and 31 deletions

View File

@ -405,6 +405,8 @@ void fresx(UGeckoInstruction _inst)
if (_inst.Rc) Helper_UpdateCR1(rPS0(_inst.FD));
}
// #define USE_ACCURATE_FRSQRTEX
void frsqrtex(UGeckoInstruction _inst)
{
double b = rPS0(_inst.FB);
@ -415,6 +417,7 @@ void frsqrtex(UGeckoInstruction _inst)
}
else
{
#ifdef USE_ACCURATE_FRSQRTEX
if (b == 0.0) {
SetFPException(FPSCR_ZX);
riPS0(_inst.FD) = 0x7ff0000000000000;
@ -436,6 +439,11 @@ void frsqrtex(UGeckoInstruction _inst)
outa |= frsqrtex_lut[idx] >> 12;
riPS0(_inst.FD) = ((u64)outa << 32) + (u64)outb;
}
#else
if (b == 0.0)
SetFPException(FPSCR_ZX);
rPS0(_inst.FD) = ForceDouble(1.0 / sqrt(b));
#endif
}
UpdateFPRF(rPS0(_inst.FD));
if (_inst.Rc) Helper_UpdateCR1(rPS0(_inst.FD));

View File

@ -443,7 +443,6 @@ const u8* Jit64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buf, JitBloc
js.blockStart = em_address;
js.fifoBytesThisBlock = 0;
js.curBlock = b;
js.blockSetsQuantizers = false;
js.block_flags = 0;
js.cancel = false;

View File

@ -110,7 +110,6 @@ private:
int block_flags;
bool isLastInstruction;
bool blockSetsQuantizers;
int fifoBytesThisBlock;
@ -247,6 +246,7 @@ public:
void ps_muls(UGeckoInstruction inst);
void fp_arith_s(UGeckoInstruction inst);
void frsqrtex(UGeckoInstruction inst);
void fcmpx(UGeckoInstruction inst);
void fmrx(UGeckoInstruction inst);

View File

@ -71,6 +71,9 @@ void Jit64::fp_tri_op(int d, int a, int b, bool reversible, bool dupe, void (XEm
fpr.UnlockAll();
}
// The value 1.0 as a double, kept in memory so that emitted code can load it
// through a memory operand (it is the dividend in the frsqrtex sequence).
// Written as a double literal: the constant is read as a 64-bit double.
static const double one_const = 1.0;
void Jit64::fp_arith_s(UGeckoInstruction inst)
{
INSTRUCTION_START
@ -79,6 +82,20 @@ void Jit64::fp_arith_s(UGeckoInstruction inst)
Default(inst); return;
}
if (inst.SUBOP5 == 26) {
// frsqrtex
int d = inst.FD;
int b = inst.FB;
fpr.Lock(b, d);
fpr.LoadToX64(d, true, true);
MOVSD(XMM0, M((void *)&one_const));
SQRTSD(XMM1, fpr.R(b));
DIVSD(XMM0, R(XMM1));
MOVSD(fpr.R(d), XMM0);
fpr.UnlockAll();
return;
}
if (inst.SUBOP5 != 18 && inst.SUBOP5 != 20 && inst.SUBOP5 != 21 &&
inst.SUBOP5 != 25) {
Default(inst); return;
@ -253,3 +270,5 @@ void Jit64::fcmpx(UGeckoInstruction inst)
SetJumpTarget(continue3);
fpr.UnlockAll();
}

View File

@ -275,9 +275,7 @@ void Jit64::stfs(UGeckoInstruction inst)
MOV(32, gpr.R(a), R(ABI_PARAM2));
}
CVTSD2SS(XMM0, fpr.R(s));
MOVSS(M(&temp32), XMM0);
MOV(32, R(ABI_PARAM1), M(&temp32));
SafeWriteRegToReg(ABI_PARAM1, ABI_PARAM2, 32, 0);
SafeWriteFloatToReg(XMM0, ABI_PARAM2);
gpr.UnlockAll();
gpr.UnlockAllX();
fpr.UnlockAll();

View File

@ -51,9 +51,8 @@ void Jit64::psq_st(UGeckoInstruction inst)
{
INSTRUCTION_START
JITDISABLE(LoadStorePaired)
js.block_flags |= BLOCK_USE_GQR0 << inst.I;
if (js.blockSetsQuantizers || !inst.RA)
if (!inst.RA)
{
// TODO: Support these cases if it becomes necessary.
Default(inst);
@ -105,12 +104,13 @@ void Jit64::psq_st(UGeckoInstruction inst)
MOV(32, gpr.R(a), R(ECX));
MOVZX(32, 16, EAX, M(&PowerPC::ppcState.spr[SPR_GQR0 + inst.I]));
MOVZX(32, 8, EDX, R(AL));
// FIXME: Fix ModR/M encoding to allow [EDX*4+disp32]!
// FIXME: Fix ModR/M encoding to allow [EDX*4+disp32] without a base register!
#ifdef _M_IX86
SHL(32, R(EDX), Imm8(2));
int addr_shift = 2;
#else
SHL(32, R(EDX), Imm8(3));
int addr_shift = 3;
#endif
SHL(32, R(EDX), Imm8(addr_shift));
if (inst.W) {
// One value
XORPS(XMM0, R(XMM0)); // TODO: See if we can get rid of this cheaply by tweaking the code in the singleStore* functions.
@ -130,14 +130,20 @@ void Jit64::psq_l(UGeckoInstruction inst)
INSTRUCTION_START
JITDISABLE(LoadStorePaired)
js.block_flags |= BLOCK_USE_GQR0 << inst.I;
if (js.blockSetsQuantizers || !inst.RA || inst.W)
if (!inst.RA)
{
Default(inst);
return;
}
const UGQR gqr(rSPR(SPR_GQR0 + inst.I));
if (inst.W) {
// PanicAlert("Single ps load: %i %i", gqr.ST_TYPE, gqr.ST_SCALE);
Default(inst);
return;
}
bool update = inst.OPCD == 57;
int offset = inst.SIMM_12;

View File

@ -52,7 +52,6 @@ void Jit64::mtspr(UGeckoInstruction inst)
case SPR_GQR0 + 5:
case SPR_GQR0 + 6:
case SPR_GQR0 + 7:
js.blockSetsQuantizers = true;
// Prevent recompiler from compiling in old quantizer values.
// If the value changed, destroy all blocks using this quantizer
// This will create a little bit of block churn, but hopefully not too bad.

View File

@ -397,8 +397,6 @@ const u8* Jit64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buffer, JitB
js.blockStart = em_address;
js.fifoBytesThisBlock = 0;
js.curBlock = b;
js.blockSetsQuantizers = false;
js.block_flags = 0;
js.cancel = false;
//Analyze the block, collect all instructions it is made of (including inlining,
@ -464,7 +462,6 @@ const u8* Jit64::DoJit(u32 em_address, PPCAnalyst::CodeBuffer *code_buffer, JitB
// Perform actual code generation
WriteCode();
b->flags = js.block_flags;
b->codeSize = (u32)(GetCodePtr() - normalEntry);
b->originalSize = size;
return normalEntry;

View File

@ -98,10 +98,8 @@ private:
UGeckoInstruction next_inst; // for easy peephole opt.
int instructionNumber;
int downcountAmount;
int block_flags;
bool isLastInstruction;
bool blockSetsQuantizers;
bool forceUnsafeLoad;
int fifoBytesThisBlock;

View File

@ -299,6 +299,9 @@ void CommonAsmRoutines::GenQuantizedSingleStores() {
// Easy!
const u8* storeSingleFloat = AlignCode4();
SafeWriteFloatToReg(XMM0, ECX);
RET();
/*
if (cpu_info.bSSSE3) {
PSHUFB(XMM0, M((void *)pbswapShuffle2x4));
// TODO: SafeWriteFloat
@ -309,8 +312,7 @@ void CommonAsmRoutines::GenQuantizedSingleStores() {
MOVSS(M(&psTemp[0]), XMM0);
MOV(32, R(EAX), M(&psTemp[0]));
SafeWriteRegToReg(EAX, ECX, 32, 0, true);
}
RET();
}*/
const u8* storeSingleU8 = AlignCode4(); // Used by MKWii
SHR(32, R(EAX), Imm8(6));
@ -336,8 +338,7 @@ void CommonAsmRoutines::GenQuantizedSingleStores() {
const u8* storeSingleU16 = AlignCode4(); // Used by MKWii
SHR(32, R(EAX), Imm8(6));
MOVSS(XMM1, MDisp(EAX, (u32)(u64)m_quantizeTableS));
PUNPCKLDQ(XMM1, R(XMM1));
MULPS(XMM0, R(XMM1));
MULSS(XMM0, R(XMM1));
PXOR(XMM1, R(XMM1));
MAXSS(XMM0, R(XMM1));
MINSS(XMM0, M((void *)&m_65535));

View File

@ -42,13 +42,6 @@
#define JIT_ICACHE_INVALID_BYTE 0x14
#define JIT_ICACHE_INVALID_WORD 0x14141414
// Per-block bit flags, one bit per GQR register index (0..7).
// Flag N occupies bit N, so (BLOCK_USE_GQR0 << n) yields the flag for GQRn.
enum BlockFlag
{
	BLOCK_USE_GQR0 = 1 << 0,
	BLOCK_USE_GQR1 = 1 << 1,
	BLOCK_USE_GQR2 = 1 << 2,
	BLOCK_USE_GQR3 = 1 << 3,
	BLOCK_USE_GQR4 = 1 << 4,
	BLOCK_USE_GQR5 = 1 << 5,
	BLOCK_USE_GQR6 = 1 << 6,
	BLOCK_USE_GQR7 = 1 << 7,
};
// TODO(ector) - optimize this struct for size
struct JitBlock
{

View File

@ -18,6 +18,7 @@
#include "Common.h"
#include "Thunk.h"
#include "CPUDetect.h"
#include "../PowerPC.h"
#include "../../Core.h"
#include "../../HW/GPFifo.h"
@ -139,6 +140,36 @@ void EmuCodeBlock::SafeWriteRegToReg(X64Reg reg_value, X64Reg reg_addr, int acce
SetJumpTarget(arg2);
}
// PSHUFB control mask: reverses the order of the lowest four bytes (3,2,1,0)
// and leaves bytes 4..15 in place, i.e. it byte-swaps the low 32-bit lane.
static const u8 GC_ALIGNED16(pbswapShuffle1x4[16]) = {3, 2, 1, 0, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
// Scratch slot the emitted code uses to move a single float from an XMM
// register into EAX (MOVSS to memory, then a 32-bit MOV back).
// NOTE(review): a single static buffer — presumably emitted code only ever
// runs on one thread; confirm.
static u32 GC_ALIGNED16(float_buffer);
// Emits code that writes the low single-precision float of xmm_value to the
// address held in reg_addr.
//
// xmm_value: XMM register holding the float to store.
// reg_addr:  general-purpose register holding the destination address.
//
// The emitted code overwrites EAX. The disabled SSSE3 path would also modify
// xmm_value (PSHUFB) and, on 32-bit builds, reg_addr (AND with the mask).
void EmuCodeBlock::SafeWriteFloatToReg(X64Reg xmm_value, X64Reg reg_addr)
{
// Test the address against the 0x0C000000 mask. Only the disabled SSSE3 path
// below branches on the result; the active path does not read these flags here.
// NOTE(review): SafeWriteRegToReg presumably performs its own address check,
// which would make this TEST redundant on the active path — confirm.
TEST(32, R(reg_addr), Imm32(0x0C000000));
// "false &&" deliberately disables this branch, so the else path is always emitted.
if (false && cpu_info.bSSSE3) {
// This path should be faster but for some reason it causes errors so I've disabled it.
// Mask bits clear -> jump to the direct store at 'argh'; otherwise fall
// through to the Memory::Write_U32 call.
FixupBranch argh = J_CC(CC_Z);
MOVSS(M(&float_buffer), xmm_value);
MOV(32, R(EAX), M(&float_buffer));
// NOTE(review): the value is byte-swapped before being handed to
// Memory::Write_U32. If Write_U32 already converts byte order itself, this
// double-swaps and could be the source of the errors mentioned above — confirm.
BSWAP(32, EAX);
ABI_CallFunctionRR(thunks.ProtectFunction(((void *)&Memory::Write_U32), 2), EAX, reg_addr);
FixupBranch arg2 = J();
SetJumpTarget(argh);
// Direct path: byte-swap the low 32 bits inside the XMM register, then
// store them straight to memory.
PSHUFB(xmm_value, M((void *)pbswapShuffle1x4));
#ifdef _M_IX86
// 32-bit build: mask the address and add Memory::base as a displacement.
AND(32, R(reg_addr), Imm32(Memory::MEMVIEW32_MASK));
MOVD_xmm(MDisp(reg_addr, (u32)Memory::base), xmm_value);
#else
// 64-bit build: address is RBX + reg_addr.
// NOTE(review): assumes RBX holds the memory base pointer — confirm.
MOVD_xmm(MComplex(RBX, reg_addr, SCALE_1, 0), xmm_value);
#endif
SetJumpTarget(arg2);
} else {
// Active path: move the float bits through float_buffer into EAX, then use
// the generic integer store (access size 32, offset 0, swap = true).
MOVSS(M(&float_buffer), xmm_value);
MOV(32, R(EAX), M(&float_buffer));
SafeWriteRegToReg(EAX, reg_addr, 32, 0, true);
}
}
void EmuCodeBlock::WriteToConstRamAddress(int accessSize, const Gen::OpArg& arg, u32 address)
{
#ifdef _M_X64

View File

@ -29,6 +29,9 @@ public:
void SafeLoadRegToEAX(Gen::X64Reg reg, int accessSize, s32 offset, bool signExtend = false);
void SafeWriteRegToReg(Gen::X64Reg reg_value, Gen::X64Reg reg_addr, int accessSize, s32 offset, bool swap = true);
// Trashes both inputs and EAX.
void SafeWriteFloatToReg(Gen::X64Reg xmm_value, Gen::X64Reg reg_addr);
void WriteToConstRamAddress(int accessSize, const Gen::OpArg& arg, u32 address);
void WriteFloatToConstRamAddress(const Gen::X64Reg& xmm_reg, u32 address);
void JitClearCA();