mirror of https://github.com/PCSX2/pcsx2.git
1087 lines
31 KiB
C++
1087 lines
31 KiB
C++
// SPDX-FileCopyrightText: 2002-2024 PCSX2 Dev Team
|
|
// SPDX-License-Identifier: GPL-3.0+
|
|
|
|
#include "Common.h"
|
|
#include "R5900OpcodeTables.h"
|
|
#include "common/emitter/x86emitter.h"
|
|
#include "iR5900.h"
|
|
#include "iFPU.h"
|
|
|
|
/* This is a version of the FPU that emulates an exponent of 0xff and overflow/underflow flags */
|
|
|
|
/* Can be made faster by not converting stuff back and forth between instructions. */
|
|
|
|
|
|
//----------------------------------------------------------------
|
|
// FPU emulation status:
|
|
// ADD, SUB (incl. accumulation stage of MADD/MSUB) - no known problems.
|
|
// Mul (incl. multiplication stage of MADD/MSUB) - incorrect. PS2's result mantissa is sometimes
|
|
// smaller by 0x1 than IEEE's result (with round to zero).
|
|
// DIV, SQRT, RSQRT - incorrect. PS2's result varies between IEEE's result with round to zero
|
|
// and IEEE's result with round to +/-infinity.
|
|
// other stuff - no known problems.
|
|
//----------------------------------------------------------------
|
|
|
|
|
|
using namespace x86Emitter;
|
|
|
|
// Build-time switches for this recompiler. The code below tests each of them by
// value (`if (FPU_...)`), so every switch has to stay defined as 0 or 1.

// 1: set the overflow flags (enable only if FPU_RESULT is 1)
#define FPU_FLAGS_OVERFLOW 1
// 1: set the underflow flags (enable only if FPU_RESULT is 1)
#define FPU_FLAGS_UNDERFLOW 1

// 1: the result is not clamped (gives correct results as on the PS2,
// but can cause problems due to insufficient clamping levels in the VUs)
#define FPU_RESULT 1

// 1: set the I and D flags. Also impacts other aspects of DIV/SQRT/RSQRT correctness
#define FPU_FLAGS_ID 1

// 1: ADD/SUB opcodes produce the same results as the PS2
#define FPU_CORRECT_ADD_SUB 1
|
|
|
|
#ifdef FPU_RECOMPILE
|
|
|
|
//------------------------------------------------------------------
|
|
namespace R5900 {
|
|
namespace Dynarec {
|
|
namespace OpcodeImpl {
|
|
namespace COP1 {
|
|
|
|
namespace DOUBLE {
|
|
|
|
//------------------------------------------------------------------
|
|
// Helper Macros
|
|
//------------------------------------------------------------------
|
|
// COP1 operand names expressed through the instruction-field macros
// (_Rt_/_Rd_/_Sa_ are declared elsewhere in the project).
#define _Ft_ _Rt_ // second source register (ft)
#define _Fs_ _Rd_ // first source register (fs)
#define _Fd_ _Sa_ // destination register (fd)
|
|
|
|
// FCR31 (fpuRegs.fprc[31]) flag masks.
// The S-prefixed masks are always ORed in together with their plain counterpart;
// nothing in this file clears them.
#define FPUflagC 0x00800000 // written by the C.EQ / C.LE / C.LT recompilers
#define FPUflagI 0x00020000 // set for 0/0 and for a negative SQRT/RSQRT operand
#define FPUflagD 0x00010000 // set for x/0 (x != 0)
#define FPUflagO 0x00008000 // overflow
#define FPUflagU 0x00004000 // underflow
#define FPUflagSI 0x00000040 // set alongside FPUflagI
#define FPUflagSD 0x00000020 // set alongside FPUflagD
#define FPUflagSO 0x00000010 // set alongside FPUflagO
#define FPUflagSU 0x00000008 // set alongside FPUflagU
|
|
|
|
//------------------------------------------------------------------
|
|
|
|
//------------------------------------------------------------------
|
|
// *FPU Opcodes!*
|
|
//------------------------------------------------------------------
|
|
|
|
//------------------------------------------------------------------
|
|
// PS2 -> DOUBLE
|
|
//------------------------------------------------------------------
|
|
|
|
// Assemble the raw bit pattern of a 32-bit float from sign / biased exponent / mantissa.
#define SINGLE(sign, exp, mant) (((u32)(sign) << 31) | ((u32)(exp) << 23) | (u32)(mant))
// Same for a 64-bit double. The arguments are token-pasted with ULL, so they must be
// plain integer literals (no expressions, no suffixes).
#define DOUBLE(sign, exp, mant) (((sign##ULL) << 63) | ((exp##ULL) << 52) | (mant##ULL))
|
|
|
|
struct FPUd_Globals
|
|
{
|
|
u32 neg[4], pos[4];
|
|
|
|
u32 pos_inf[4], neg_inf[4],
|
|
one_exp[4];
|
|
|
|
u64 dbl_one_exp[2];
|
|
|
|
u64 dbl_cvt_overflow, // needs special code if above or equal
|
|
dbl_ps2_overflow, // overflow & clamp if above or equal
|
|
dbl_underflow; // underflow if below
|
|
|
|
u64 padding;
|
|
|
|
u64 dbl_s_pos[2];
|
|
//u64 dlb_s_neg[2];
|
|
};
|
|
|
|
// Values for the constant pool; the entries follow the member order of FPUd_Globals.
alignas(32) static const FPUd_Globals s_const =
{
	/* neg     */ {0x80000000, 0xffffffff, 0xffffffff, 0xffffffff},
	/* pos     */ {0x7fffffff, 0xffffffff, 0xffffffff, 0xffffffff},

	/* pos_inf */ {SINGLE(0, 0xff, 0), 0, 0, 0},
	/* neg_inf */ {SINGLE(1, 0xff, 0), 0, 0, 0},
	/* one_exp */ {SINGLE(0, 1, 0), 0, 0, 0},

	/* dbl_one_exp */ {DOUBLE(0, 1, 0), 0},

	/* dbl_cvt_overflow */ DOUBLE(0, 1151, 0),
	/* dbl_ps2_overflow */ DOUBLE(0, 1152, 0),
	/* dbl_underflow    */ DOUBLE(0, 897, 0),

	/* padding */ 0,

	/* dbl_s_pos */ {0x7fffffffffffffffULL, 0},
	//{0x8000000000000000ULL, 0},
};
|
|
|
|
|
|
// ToDouble : converts single-precision PS2 float to double-precision IEEE float
|
|
|
|
void ToDouble(int reg)
|
|
{
|
|
xUCOMI.SS(xRegisterSSE(reg), ptr[s_const.pos_inf]); // Sets ZF if reg is equal or incomparable to pos_inf
|
|
u8* to_complex = JE8(0); // Complex conversion if positive infinity or NaN
|
|
xUCOMI.SS(xRegisterSSE(reg), ptr[s_const.neg_inf]);
|
|
u8* to_complex2 = JE8(0); // Complex conversion if negative infinity
|
|
|
|
xCVTSS2SD(xRegisterSSE(reg), xRegisterSSE(reg)); // Simply convert
|
|
u8* end = JMP8(0);
|
|
|
|
x86SetJ8(to_complex);
|
|
x86SetJ8(to_complex2);
|
|
|
|
// Special conversion for when IEEE sees the value in reg as an INF/NaN
|
|
xPSUB.D(xRegisterSSE(reg), ptr[s_const.one_exp]); // Lower exponent by one
|
|
xCVTSS2SD(xRegisterSSE(reg), xRegisterSSE(reg));
|
|
xPADD.Q(xRegisterSSE(reg), ptr[s_const.dbl_one_exp]); // Raise exponent by one
|
|
|
|
x86SetJ8(end);
|
|
}
|
|
|
|
//------------------------------------------------------------------
|
|
// DOUBLE -> PS2
|
|
//------------------------------------------------------------------
|
|
|
|
// If FPU_RESULT is defined, results are more like the real PS2's FPU.
|
|
// But new issues may happen if the VU isn't clamping all operands since games may transfer FPU results into the VU.
|
|
// Ar tonelico 1 does this with the result from DIV/RSQRT (when a division by zero occurs).
|
|
// Otherwise, results are still usually better than iFPU.cpp.
|
|
|
|
// ToPS2FPU_Full - converts double-precision IEEE float to single-precision PS2 float
|
|
|
|
// converts small normal numbers to PS2 equivalent
|
|
// converts large normal numbers to PS2 equivalent (which represent NaN/inf in IEEE)
|
|
// converts really large normal numbers to PS2 signed max
|
|
// converts really small normal numbers to zero (flush)
|
|
// doesn't handle inf/nan/denormal
|
|
|
|
void ToPS2FPU_Full(int reg, bool flags, int absreg, bool acc, bool addsub)
|
|
{
|
|
if (flags)
|
|
xAND(ptr32[&fpuRegs.fprc[31]], ~(FPUflagO | FPUflagU));
|
|
if (flags && acc)
|
|
xAND(ptr32[&fpuRegs.ACCflag], ~1);
|
|
|
|
xMOVAPS(xRegisterSSE(absreg), xRegisterSSE(reg));
|
|
xAND.PD(xRegisterSSE(absreg), ptr[&s_const.dbl_s_pos]);
|
|
|
|
xUCOMI.SD(xRegisterSSE(absreg), ptr[&s_const.dbl_cvt_overflow]);
|
|
u8* to_complex = JAE8(0);
|
|
|
|
xUCOMI.SD(xRegisterSSE(absreg), ptr[&s_const.dbl_underflow]);
|
|
u8* to_underflow = JB8(0);
|
|
|
|
xCVTSD2SS(xRegisterSSE(reg), xRegisterSSE(reg)); //simply convert
|
|
|
|
u32* end = JMP32(0);
|
|
|
|
x86SetJ8(to_complex);
|
|
xUCOMI.SD(xRegisterSSE(absreg), ptr[&s_const.dbl_ps2_overflow]);
|
|
u8* to_overflow = JAE8(0);
|
|
|
|
xPSUB.Q(xRegisterSSE(reg), ptr[&s_const.dbl_one_exp]); //lower exponent
|
|
xCVTSD2SS(xRegisterSSE(reg), xRegisterSSE(reg)); //convert
|
|
xPADD.D(xRegisterSSE(reg), ptr[s_const.one_exp]); //raise exponent
|
|
|
|
u32* end2 = JMP32(0);
|
|
|
|
x86SetJ8(to_overflow);
|
|
xCVTSD2SS(xRegisterSSE(reg), xRegisterSSE(reg));
|
|
xOR.PS(xRegisterSSE(reg), ptr[&s_const.pos]); //clamp
|
|
if (flags && FPU_FLAGS_OVERFLOW)
|
|
xOR(ptr32[&fpuRegs.fprc[31]], (FPUflagO | FPUflagSO));
|
|
if (flags && FPU_FLAGS_OVERFLOW && acc)
|
|
xOR(ptr32[&fpuRegs.ACCflag], 1);
|
|
u8* end3 = JMP8(0);
|
|
|
|
x86SetJ8(to_underflow);
|
|
u8* end4 = nullptr;
|
|
if (flags && FPU_FLAGS_UNDERFLOW) //set underflow flags if not zero
|
|
{
|
|
xXOR.PD(xRegisterSSE(absreg), xRegisterSSE(absreg));
|
|
xUCOMI.SD(xRegisterSSE(reg), xRegisterSSE(absreg));
|
|
u8* is_zero = JE8(0);
|
|
|
|
xOR(ptr32[&fpuRegs.fprc[31]], (FPUflagU | FPUflagSU));
|
|
if (addsub)
|
|
{
|
|
//On ADD/SUB, the PS2 simply leaves the mantissa bits as they are (after normalization)
|
|
//IEEE either clears them (FtZ) or returns the denormalized result.
|
|
//not thoroughly tested : other operations such as MUL and DIV seem to clear all mantissa bits?
|
|
xMOVAPS(xRegisterSSE(absreg), xRegisterSSE(reg));
|
|
xPSLL.Q(xRegisterSSE(reg), 12); //mantissa bits
|
|
xPSRL.Q(xRegisterSSE(reg), 41);
|
|
xPSRL.Q(xRegisterSSE(absreg), 63); //sign bit
|
|
xPSLL.Q(xRegisterSSE(absreg), 31);
|
|
xPOR(xRegisterSSE(reg), xRegisterSSE(absreg));
|
|
end4 = JMP8(0);
|
|
}
|
|
|
|
x86SetJ8(is_zero);
|
|
}
|
|
xCVTSD2SS(xRegisterSSE(reg), xRegisterSSE(reg));
|
|
xAND.PS(xRegisterSSE(reg), ptr[s_const.neg]); //flush to zero
|
|
|
|
x86SetJ32(end);
|
|
x86SetJ32(end2);
|
|
|
|
x86SetJ8(end3);
|
|
if (flags && FPU_FLAGS_UNDERFLOW && addsub)
|
|
x86SetJ8(end4);
|
|
}
|
|
|
|
void ToPS2FPU(int reg, bool flags, int absreg, bool acc, bool addsub = false)
|
|
{
|
|
if (FPU_RESULT)
|
|
ToPS2FPU_Full(reg, flags, absreg, acc, addsub);
|
|
else
|
|
{
|
|
xCVTSD2SS(xRegisterSSE(reg), xRegisterSSE(reg)); //clamp
|
|
xMIN.SS(xRegisterSSE(reg), ptr[&g_maxvals[0]]);
|
|
xMAX.SS(xRegisterSSE(reg), ptr[&g_minvals[0]]);
|
|
}
|
|
}
|
|
|
|
//sets the maximum (positive or negative) value into regd.
|
|
void SetMaxValue(int regd)
|
|
{
|
|
if (FPU_RESULT)
|
|
xOR.PS(xRegisterSSE(regd), ptr[&s_const.pos[0]]); // set regd to maximum
|
|
else
|
|
{
|
|
xAND.PS(xRegisterSSE(regd), ptr[&s_const.neg[0]]); // Get the sign bit
|
|
xOR.PS(xRegisterSSE(regd), ptr[&g_maxvals[0]]); // regd = +/- Maximum (CLAMP)!
|
|
}
|
|
}
|
|
|
|
// GET_S: load the Fs operand into XMM register index `sreg`, either from its
// allocated XMM register or from fpuRegs. Relies on an `info` variable in the caller's scope.
#define GET_S(sreg) \
	do \
	{ \
		if (info & PROCESS_EE_S) \
		{ \
			xMOVSS(xRegisterSSE(sreg), xRegisterSSE(EEREC_S)); \
		} \
		else \
		{ \
			xMOVSSZX(xRegisterSSE(sreg), ptr[&fpuRegs.fpr[_Fs_]]); \
		} \
	} while (0)

// ALLOC_S: allocate a temporary XMM register into `sreg`, then GET_S into it.
#define ALLOC_S(sreg) \
	do \
	{ \
		(sreg) = _allocTempXMMreg(XMMT_FPS); \
		GET_S(sreg); \
	} while (0)
|
|
|
|
// GET_T: load the Ft operand into XMM register index `treg`, either from its
// allocated XMM register or from fpuRegs. Relies on an `info` variable in the caller's scope.
#define GET_T(treg) \
	do \
	{ \
		if (info & PROCESS_EE_T) \
		{ \
			xMOVSS(xRegisterSSE(treg), xRegisterSSE(EEREC_T)); \
		} \
		else \
		{ \
			xMOVSSZX(xRegisterSSE(treg), ptr[&fpuRegs.fpr[_Ft_]]); \
		} \
	} while (0)

// ALLOC_T: allocate a temporary XMM register into `treg`, then GET_T into it.
#define ALLOC_T(treg) \
	do \
	{ \
		(treg) = _allocTempXMMreg(XMMT_FPS); \
		GET_T(treg); \
	} while (0)
|
|
|
|
// GET_ACC: load the accumulator into XMM register index `areg`, either from its
// allocated XMM register or from fpuRegs.ACC. Relies on an `info` variable in the caller's scope.
#define GET_ACC(areg) \
	do \
	{ \
		if (info & PROCESS_EE_ACC) \
		{ \
			xMOVSS(xRegisterSSE(areg), xRegisterSSE(EEREC_ACC)); \
		} \
		else \
		{ \
			xMOVSSZX(xRegisterSSE(areg), ptr[&fpuRegs.ACC]); \
		} \
	} while (0)

// ALLOC_ACC: allocate a temporary XMM register into `areg`, then GET_ACC into it.
#define ALLOC_ACC(areg) \
	do \
	{ \
		(areg) = _allocTempXMMreg(XMMT_FPS); \
		GET_ACC(areg); \
	} while (0)
|
|
|
|
// CLEAR_OU_FLAGS: emit an AND that clears the O and U bits of FCR31.
#define CLEAR_OU_FLAGS \
	do \
	{ \
		xAND(ptr32[&fpuRegs.fprc[31]], ~(FPUflagO | FPUflagU)); \
	} while (0)
|
|
|
|
|
|
//------------------------------------------------------------------
|
|
// ABS XMM
|
|
//------------------------------------------------------------------
|
|
void recABS_S_xmm(int info)
|
|
{
|
|
EE::Profiler.EmitOp(eeOpcode::ABS_F);
|
|
GET_S(EEREC_D);
|
|
|
|
CLEAR_OU_FLAGS;
|
|
|
|
xAND.PS(xRegisterSSE(EEREC_D), ptr[s_const.pos]);
|
|
}
|
|
|
|
FPURECOMPILE_CONSTCODE(ABS_S, XMMINFO_WRITED | XMMINFO_READS);
|
|
//------------------------------------------------------------------
|
|
|
|
|
|
//------------------------------------------------------------------
|
|
// FPU_ADD_SUB (Used to mimic PS2's FPU add/sub behavior)
|
|
//------------------------------------------------------------------
|
|
// A compliant IEEE FPU uses additional "guard" bits to the right of the mantissa in its computations,
// but the EE FPU doesn't. Subtraction (and addition of a positive and a negative value) may shift the mantissa left,
// causing those bits to appear in the result; this function masks out the bits of the mantissa that will
// get shifted right into the guard bits, to ensure that the guard bits are empty.
// The difference of the exponents = the amount that the smaller operand will be shifted right by.
// Modification - the PS2 uses a single guard bit? (Coded by Nneeve)
|
|
//------------------------------------------------------------------
|
|
void FPU_ADD_SUB(int tempd, int tempt) //tempd and tempt are overwritten, they are floats
|
|
{
|
|
const int xmmtemp = _allocTempXMMreg(XMMT_FPS); //temporary for anding with regd/regt
|
|
xMOVD(ecx, xRegisterSSE(tempd)); //receives regd
|
|
xMOVD(eax, xRegisterSSE(tempt)); //receives regt
|
|
|
|
//mask the exponents
|
|
xSHR(ecx, 23);
|
|
xSHR(eax, 23);
|
|
xAND(ecx, 0xff);
|
|
xAND(eax, 0xff);
|
|
|
|
xSUB(ecx, eax); //tempecx = exponent difference
|
|
xCMP(ecx, 25);
|
|
j8Ptr[0] = JGE8(0);
|
|
xCMP(ecx, 0);
|
|
j8Ptr[1] = JG8(0);
|
|
j8Ptr[2] = JE8(0);
|
|
xCMP(ecx, -25);
|
|
j8Ptr[3] = JLE8(0);
|
|
|
|
//diff = -24 .. -1 , expd < expt
|
|
xNEG(ecx);
|
|
xDEC(ecx);
|
|
xMOV(eax, 0xffffffff);
|
|
xSHL(eax, cl); //temp2 = 0xffffffff << tempecx
|
|
xMOVDZX(xRegisterSSE(xmmtemp), eax);
|
|
xAND.PS(xRegisterSSE(tempd), xRegisterSSE(xmmtemp));
|
|
j8Ptr[4] = JMP8(0);
|
|
|
|
x86SetJ8(j8Ptr[0]);
|
|
//diff = 25 .. 255 , expt < expd
|
|
xAND.PS(xRegisterSSE(tempt), ptr[s_const.neg]);
|
|
j8Ptr[5] = JMP8(0);
|
|
|
|
x86SetJ8(j8Ptr[1]);
|
|
//diff = 1 .. 24, expt < expd
|
|
xDEC(ecx);
|
|
xMOV(eax, 0xffffffff);
|
|
xSHL(eax, cl); //temp2 = 0xffffffff << tempecx
|
|
xMOVDZX(xRegisterSSE(xmmtemp), eax);
|
|
xAND.PS(xRegisterSSE(tempt), xRegisterSSE(xmmtemp));
|
|
j8Ptr[6] = JMP8(0);
|
|
|
|
x86SetJ8(j8Ptr[3]);
|
|
//diff = -255 .. -25, expd < expt
|
|
xAND.PS(xRegisterSSE(tempd), ptr[s_const.neg]);
|
|
|
|
x86SetJ8(j8Ptr[2]);
|
|
//diff == 0
|
|
|
|
x86SetJ8(j8Ptr[4]);
|
|
x86SetJ8(j8Ptr[5]);
|
|
x86SetJ8(j8Ptr[6]);
|
|
|
|
_freeXMMreg(xmmtemp);
|
|
}
|
|
|
|
void FPU_MUL(int info, int regd, int sreg, int treg, bool acc)
|
|
{
|
|
u32* endMul = nullptr;
|
|
|
|
if (CHECK_FPUMULHACK)
|
|
{
|
|
// if ((s == 0x3e800000) && (t == 0x40490fdb))
|
|
// return 0x3f490fda; // needed for Tales of Destiny Remake (only in a very specific room late-game)
|
|
// else
|
|
// return 0;
|
|
|
|
alignas(16) static constexpr const u32 result[4] = { 0x3f490fda };
|
|
|
|
xMOVD(ecx, xRegisterSSE(sreg));
|
|
xMOVD(edx, xRegisterSSE(treg));
|
|
|
|
// if (((s ^ 0x3e800000) | (t ^ 0x40490fdb)) != 0) { hack; }
|
|
xXOR(ecx, 0x3e800000);
|
|
xXOR(edx, 0x40490fdb);
|
|
xOR(edx, ecx);
|
|
|
|
u8* noHack = JNZ8(0);
|
|
xMOVAPS(xRegisterSSE(regd), ptr128[result]);
|
|
endMul = JMP32(0);
|
|
x86SetJ8(noHack);
|
|
}
|
|
|
|
ToDouble(sreg); ToDouble(treg);
|
|
xMUL.SD(xRegisterSSE(sreg), xRegisterSSE(treg));
|
|
ToPS2FPU(sreg, true, treg, acc);
|
|
xMOVSS(xRegisterSSE(regd), xRegisterSSE(sreg));
|
|
|
|
if (CHECK_FPUMULHACK)
|
|
x86SetJ32(endMul);
|
|
}
|
|
|
|
//------------------------------------------------------------------
|
|
// CommutativeOp XMM (used for ADD and SUB opcodes. that's it.)
|
|
//------------------------------------------------------------------
|
|
static void (*recFPUOpXMM_to_XMM[])(x86SSERegType, x86SSERegType) = {
|
|
SSE2_ADDSD_XMM_to_XMM, SSE2_SUBSD_XMM_to_XMM};
|
|
|
|
void recFPUOp(int info, int regd, int op, bool acc)
|
|
{
|
|
int sreg, treg;
|
|
ALLOC_S(sreg); ALLOC_T(treg);
|
|
|
|
if (FPU_CORRECT_ADD_SUB)
|
|
FPU_ADD_SUB(sreg, treg);
|
|
|
|
ToDouble(sreg); ToDouble(treg);
|
|
|
|
recFPUOpXMM_to_XMM[op](sreg, treg);
|
|
|
|
ToPS2FPU(sreg, true, treg, acc, true);
|
|
xMOVSS(xRegisterSSE(regd), xRegisterSSE(sreg));
|
|
|
|
_freeXMMreg(sreg); _freeXMMreg(treg);
|
|
}
|
|
//------------------------------------------------------------------
|
|
|
|
|
|
//------------------------------------------------------------------
|
|
// ADD XMM
|
|
//------------------------------------------------------------------
|
|
void recADD_S_xmm(int info)
|
|
{
|
|
EE::Profiler.EmitOp(eeOpcode::ADD_F);
|
|
recFPUOp(info, EEREC_D, 0, false);
|
|
}
|
|
|
|
FPURECOMPILE_CONSTCODE(ADD_S, XMMINFO_WRITED | XMMINFO_READS | XMMINFO_READT);
|
|
|
|
void recADDA_S_xmm(int info)
|
|
{
|
|
EE::Profiler.EmitOp(eeOpcode::ADDA_F);
|
|
recFPUOp(info, EEREC_ACC, 0, true);
|
|
}
|
|
|
|
FPURECOMPILE_CONSTCODE(ADDA_S, XMMINFO_WRITEACC | XMMINFO_READS | XMMINFO_READT);
|
|
//------------------------------------------------------------------
|
|
|
|
void recCMP(int info)
|
|
{
|
|
int sreg, treg;
|
|
ALLOC_S(sreg); ALLOC_T(treg);
|
|
ToDouble(sreg); ToDouble(treg);
|
|
|
|
xUCOMI.SD(xRegisterSSE(sreg), xRegisterSSE(treg));
|
|
|
|
_freeXMMreg(sreg); _freeXMMreg(treg);
|
|
}
|
|
|
|
//------------------------------------------------------------------
|
|
// C.x.S XMM
|
|
//------------------------------------------------------------------
|
|
void recC_EQ_xmm(int info)
|
|
{
|
|
EE::Profiler.EmitOp(eeOpcode::CEQ_F);
|
|
recCMP(info);
|
|
|
|
j8Ptr[0] = JZ8(0);
|
|
xAND(ptr32[&fpuRegs.fprc[31]], ~FPUflagC);
|
|
j8Ptr[1] = JMP8(0);
|
|
x86SetJ8(j8Ptr[0]);
|
|
xOR(ptr32[&fpuRegs.fprc[31]], FPUflagC);
|
|
x86SetJ8(j8Ptr[1]);
|
|
}
|
|
|
|
FPURECOMPILE_CONSTCODE(C_EQ, XMMINFO_READS | XMMINFO_READT);
|
|
|
|
void recC_LE_xmm(int info)
|
|
{
|
|
EE::Profiler.EmitOp(eeOpcode::CLE_F);
|
|
recCMP(info);
|
|
|
|
j8Ptr[0] = JBE8(0);
|
|
xAND(ptr32[&fpuRegs.fprc[31]], ~FPUflagC);
|
|
j8Ptr[1] = JMP8(0);
|
|
x86SetJ8(j8Ptr[0]);
|
|
xOR(ptr32[&fpuRegs.fprc[31]], FPUflagC);
|
|
x86SetJ8(j8Ptr[1]);
|
|
}
|
|
|
|
FPURECOMPILE_CONSTCODE(C_LE, XMMINFO_READS | XMMINFO_READT);
|
|
|
|
void recC_LT_xmm(int info)
|
|
{
|
|
EE::Profiler.EmitOp(eeOpcode::CLT_F);
|
|
recCMP(info);
|
|
|
|
j8Ptr[0] = JB8(0);
|
|
xAND(ptr32[&fpuRegs.fprc[31]], ~FPUflagC);
|
|
j8Ptr[1] = JMP8(0);
|
|
x86SetJ8(j8Ptr[0]);
|
|
xOR(ptr32[&fpuRegs.fprc[31]], FPUflagC);
|
|
x86SetJ8(j8Ptr[1]);
|
|
}
|
|
|
|
FPURECOMPILE_CONSTCODE(C_LT, XMMINFO_READS | XMMINFO_READT);
|
|
//------------------------------------------------------------------
|
|
|
|
|
|
//------------------------------------------------------------------
|
|
// CVT.x XMM
|
|
//------------------------------------------------------------------
|
|
void recCVT_S_xmm(int info)
|
|
{
|
|
EE::Profiler.EmitOp(eeOpcode::CVTS_F);
|
|
|
|
if (info & PROCESS_EE_D)
|
|
{
|
|
if (info & PROCESS_EE_S)
|
|
xCVTDQ2PS(xRegisterSSE(EEREC_D), xRegisterSSE(EEREC_S));
|
|
else
|
|
xCVTSI2SS(xRegisterSSE(EEREC_D), ptr32[&fpuRegs.fpr[_Fs_]]);
|
|
}
|
|
else
|
|
{
|
|
const int temp = _allocTempXMMreg(XMMT_FPS);
|
|
xCVTSI2SS(xRegisterSSE(temp), ptr32[&fpuRegs.fpr[_Fs_]]);
|
|
xMOVSS(ptr32[&fpuRegs.fpr[_Fd_]], xRegisterSSE(temp));
|
|
_freeXMMreg(temp);
|
|
}
|
|
}
|
|
|
|
FPURECOMPILE_CONSTCODE(CVT_S, XMMINFO_WRITED | XMMINFO_READS);
|
|
|
|
void recCVT_W() //called from iFPU.cpp's recCVT_W
|
|
{
|
|
EE::Profiler.EmitOp(eeOpcode::CVTW);
|
|
int regs = _checkXMMreg(XMMTYPE_FPREG, _Fs_, MODE_READ);
|
|
|
|
if (regs >= 0)
|
|
{
|
|
xCVTTSS2SI(eax, xRegisterSSE(regs));
|
|
xMOVMSKPS(edx, xRegisterSSE(regs)); // extract the signs
|
|
xAND(edx, 1); // keep only LSB
|
|
}
|
|
else
|
|
{
|
|
xCVTTSS2SI(eax, ptr32[&fpuRegs.fpr[_Fs_]]);
|
|
xMOV(edx, ptr[&fpuRegs.fpr[_Fs_]]);
|
|
xSHR(edx, 31); //mov sign to lsb
|
|
}
|
|
|
|
//kill register allocation for dst because we write directly to fpuRegs.fpr[_Fd_]
|
|
_deleteFPtoXMMreg(_Fd_, DELETE_REG_FREE_NO_WRITEBACK);
|
|
|
|
xADD(edx, 0x7FFFFFFF); // 0x7FFFFFFF if positive, 0x8000 0000 if negative
|
|
|
|
xCMP(eax, 0x80000000); // If the result is indefinitive
|
|
xCMOVE(eax, edx); // Saturate it
|
|
|
|
//Write the result
|
|
xMOV(ptr[&fpuRegs.fpr[_Fd_]], eax);
|
|
}
|
|
//------------------------------------------------------------------
|
|
|
|
|
|
//------------------------------------------------------------------
|
|
// DIV XMM
|
|
//------------------------------------------------------------------
|
|
void recDIVhelper1(int regd, int regt) // Sets flags
|
|
{
|
|
u8 *pjmp1, *pjmp2;
|
|
u32 *ajmp32, *bjmp32;
|
|
const int t1reg = _allocTempXMMreg(XMMT_FPS);
|
|
|
|
xAND(ptr32[&fpuRegs.fprc[31]], ~(FPUflagI | FPUflagD)); // Clear I and D flags
|
|
|
|
//--- Check for divide by zero ---
|
|
xXOR.PS(xRegisterSSE(t1reg), xRegisterSSE(t1reg));
|
|
xCMPEQ.SS(xRegisterSSE(t1reg), xRegisterSSE(regt));
|
|
xMOVMSKPS(eax, xRegisterSSE(t1reg));
|
|
xAND(eax, 1); //Check sign (if regt == zero, sign will be set)
|
|
ajmp32 = JZ32(0); //Skip if not set
|
|
|
|
//--- Check for 0/0 ---
|
|
xXOR.PS(xRegisterSSE(t1reg), xRegisterSSE(t1reg));
|
|
xCMPEQ.SS(xRegisterSSE(t1reg), xRegisterSSE(regd));
|
|
xMOVMSKPS(eax, xRegisterSSE(t1reg));
|
|
xAND(eax, 1); //Check sign (if regd == zero, sign will be set)
|
|
pjmp1 = JZ8(0); //Skip if not set
|
|
xOR(ptr32[&fpuRegs.fprc[31]], FPUflagI | FPUflagSI); // Set I and SI flags ( 0/0 )
|
|
pjmp2 = JMP8(0);
|
|
x86SetJ8(pjmp1); //x/0 but not 0/0
|
|
xOR(ptr32[&fpuRegs.fprc[31]], FPUflagD | FPUflagSD); // Set D and SD flags ( x/0 )
|
|
x86SetJ8(pjmp2);
|
|
|
|
//--- Make regd +/- Maximum ---
|
|
xXOR.PS(xRegisterSSE(regd), xRegisterSSE(regt)); // Make regd Positive or Negative
|
|
SetMaxValue(regd); //clamp to max
|
|
bjmp32 = JMP32(0);
|
|
|
|
x86SetJ32(ajmp32);
|
|
|
|
//--- Normal Divide ---
|
|
ToDouble(regd); ToDouble(regt);
|
|
|
|
xDIV.SD(xRegisterSSE(regd), xRegisterSSE(regt));
|
|
|
|
ToPS2FPU(regd, false, regt, false);
|
|
|
|
x86SetJ32(bjmp32);
|
|
|
|
_freeXMMreg(t1reg);
|
|
}
|
|
|
|
void recDIVhelper2(int regd, int regt) // Doesn't sets flags
|
|
{
|
|
ToDouble(regd); ToDouble(regt);
|
|
|
|
xDIV.SD(xRegisterSSE(regd), xRegisterSSE(regt));
|
|
|
|
ToPS2FPU(regd, false, regt, false);
|
|
}
|
|
|
|
// Control-register image with the rounding mode forced to Nearest. The SQRT and RSQRT
// recompilers fill it in from EmuConfig.Cpu.FPUFPCR and emit an LDMXCSR that loads it.
alignas(16) static FPControlRegister roundmode_nearest;
|
|
|
|
void recDIV_S_xmm(int info)
|
|
{
|
|
EE::Profiler.EmitOp(eeOpcode::DIV_F);
|
|
//Console.WriteLn("DIV");
|
|
|
|
if (EmuConfig.Cpu.FPUFPCR.bitmask != EmuConfig.Cpu.FPUDivFPCR.bitmask)
|
|
xLDMXCSR(ptr32[&EmuConfig.Cpu.FPUDivFPCR.bitmask]);
|
|
|
|
int sreg, treg;
|
|
|
|
ALLOC_S(sreg); ALLOC_T(treg);
|
|
|
|
if (FPU_FLAGS_ID)
|
|
recDIVhelper1(sreg, treg);
|
|
else
|
|
recDIVhelper2(sreg, treg);
|
|
|
|
xMOVSS(xRegisterSSE(EEREC_D), xRegisterSSE(sreg));
|
|
|
|
if (EmuConfig.Cpu.FPUFPCR.bitmask != EmuConfig.Cpu.FPUDivFPCR.bitmask)
|
|
xLDMXCSR(ptr32[&EmuConfig.Cpu.FPUFPCR.bitmask]);
|
|
|
|
_freeXMMreg(sreg); _freeXMMreg(treg);
|
|
}
|
|
|
|
FPURECOMPILE_CONSTCODE(DIV_S, XMMINFO_WRITED | XMMINFO_READS | XMMINFO_READT);
|
|
//------------------------------------------------------------------
|
|
|
|
|
|
//------------------------------------------------------------------
|
|
// MADD/MSUB XMM
|
|
//------------------------------------------------------------------
|
|
|
|
// Unlike what the documentation implies, it seems that MADD/MSUB support all numbers just like other operations
|
|
// The complex overflow conditions the document describes apparently test whether the multiplication's result
|
|
// has overflowed and whether the last operation that used ACC as a destination has overflowed.
|
|
// For example, { adda.s -MAX, 0.0 ; madd.s fd, MAX, 1.0 } -> fd = 0
|
|
// while { adda.s -MAX, -MAX ; madd.s fd, MAX, 1.0 } -> fd = -MAX
|
|
// (where MAX is 0x7fffffff and -MAX is 0xffffffff)
|
|
void recMaddsub(int info, int regd, int op, bool acc)
|
|
{
|
|
int sreg, treg;
|
|
ALLOC_S(sreg); ALLOC_T(treg);
|
|
|
|
FPU_MUL(info, sreg, sreg, treg, false);
|
|
|
|
GET_ACC(treg);
|
|
|
|
if (FPU_CORRECT_ADD_SUB)
|
|
FPU_ADD_SUB(treg, sreg); //might be problematic for something!!!!
|
|
|
|
// TEST FOR ACC/MUL OVERFLOWS, PROPOGATE THEM IF THEY OCCUR
|
|
|
|
xTEST(ptr32[&fpuRegs.fprc[31]], FPUflagO);
|
|
u8* mulovf = JNZ8(0);
|
|
ToDouble(sreg); //else, convert
|
|
|
|
xTEST(ptr32[&fpuRegs.ACCflag], 1);
|
|
u8* accovf = JNZ8(0);
|
|
ToDouble(treg); //else, convert
|
|
u8* operation = JMP8(0);
|
|
|
|
x86SetJ8(mulovf);
|
|
if (op == 1) //sub
|
|
xXOR.PS(xRegisterSSE(sreg), ptr[s_const.neg]);
|
|
xMOVAPS(xRegisterSSE(treg), xRegisterSSE(sreg)); //fall through below
|
|
|
|
x86SetJ8(accovf);
|
|
SetMaxValue(treg); //just in case... I think it has to be a MaxValue already here
|
|
CLEAR_OU_FLAGS; //clear U flag
|
|
if (FPU_FLAGS_OVERFLOW)
|
|
xOR(ptr32[&fpuRegs.fprc[31]], FPUflagO | FPUflagSO);
|
|
if (FPU_FLAGS_OVERFLOW && acc)
|
|
xOR(ptr32[&fpuRegs.ACCflag], 1);
|
|
u32* skipall = JMP32(0);
|
|
|
|
// PERFORM THE ACCUMULATION AND TEST RESULT. CONVERT TO SINGLE
|
|
|
|
x86SetJ8(operation);
|
|
if (op == 1)
|
|
xSUB.SD(xRegisterSSE(treg), xRegisterSSE(sreg));
|
|
else
|
|
xADD.SD(xRegisterSSE(treg), xRegisterSSE(sreg));
|
|
|
|
ToPS2FPU(treg, true, sreg, acc, true);
|
|
x86SetJ32(skipall);
|
|
|
|
xMOVSS(xRegisterSSE(regd), xRegisterSSE(treg));
|
|
|
|
_freeXMMreg(sreg); _freeXMMreg(treg);
|
|
}
|
|
|
|
void recMADD_S_xmm(int info)
|
|
{
|
|
EE::Profiler.EmitOp(eeOpcode::MADD_F);
|
|
recMaddsub(info, EEREC_D, 0, false);
|
|
}
|
|
|
|
FPURECOMPILE_CONSTCODE(MADD_S, XMMINFO_WRITED | XMMINFO_READACC | XMMINFO_READS | XMMINFO_READT);
|
|
|
|
void recMADDA_S_xmm(int info)
|
|
{
|
|
EE::Profiler.EmitOp(eeOpcode::MADDA_F);
|
|
recMaddsub(info, EEREC_ACC, 0, true);
|
|
}
|
|
|
|
FPURECOMPILE_CONSTCODE(MADDA_S, XMMINFO_WRITEACC | XMMINFO_READACC | XMMINFO_READS | XMMINFO_READT);
|
|
//------------------------------------------------------------------
|
|
|
|
|
|
//------------------------------------------------------------------
|
|
// MAX / MIN XMM
|
|
//------------------------------------------------------------------
|
|
|
|
// Two 128-bit constants used by recMINMAX on an operand whose float bits were
// copied into every 32-bit lane:
//   [0..3] AND mask: keep lane 0 entirely and only the sign bit of lane 1
//   [4..7] OR  mask: set bit 30 of lane 1
alignas(16) static const u32 minmax_mask[8] =
{
	// AND mask
	0xffffffff, 0x80000000, 0, 0,
	// OR mask
	0, 0x40000000, 0, 0,
};
|
|
// FPU's MAX/MIN work with all numbers (including "denormals"). Check VU's logical min max for more info.
|
|
void recMINMAX(int info, bool ismin)
|
|
{
|
|
int sreg, treg;
|
|
ALLOC_S(sreg); ALLOC_T(treg);
|
|
|
|
CLEAR_OU_FLAGS;
|
|
|
|
xPSHUF.D(xRegisterSSE(sreg), xRegisterSSE(sreg), 0x00);
|
|
xPAND(xRegisterSSE(sreg), ptr[minmax_mask]);
|
|
xPOR(xRegisterSSE(sreg), ptr[&minmax_mask[4]]);
|
|
xPSHUF.D(xRegisterSSE(treg), xRegisterSSE(treg), 0x00);
|
|
xPAND(xRegisterSSE(treg), ptr[minmax_mask]);
|
|
xPOR(xRegisterSSE(treg), ptr[&minmax_mask[4]]);
|
|
if (ismin)
|
|
xMIN.SD(xRegisterSSE(sreg), xRegisterSSE(treg));
|
|
else
|
|
xMAX.SD(xRegisterSSE(sreg), xRegisterSSE(treg));
|
|
|
|
xMOVSS(xRegisterSSE(EEREC_D), xRegisterSSE(sreg));
|
|
|
|
_freeXMMreg(sreg); _freeXMMreg(treg);
|
|
}
|
|
|
|
void recMAX_S_xmm(int info)
|
|
{
|
|
EE::Profiler.EmitOp(eeOpcode::MAX_F);
|
|
recMINMAX(info, false);
|
|
}
|
|
|
|
FPURECOMPILE_CONSTCODE(MAX_S, XMMINFO_WRITED | XMMINFO_READS | XMMINFO_READT);
|
|
|
|
void recMIN_S_xmm(int info)
|
|
{
|
|
EE::Profiler.EmitOp(eeOpcode::MIN_F);
|
|
recMINMAX(info, true);
|
|
}
|
|
|
|
FPURECOMPILE_CONSTCODE(MIN_S, XMMINFO_WRITED | XMMINFO_READS | XMMINFO_READT);
|
|
//------------------------------------------------------------------
|
|
|
|
|
|
//------------------------------------------------------------------
|
|
// MOV XMM
|
|
//------------------------------------------------------------------
|
|
void recMOV_S_xmm(int info)
|
|
{
|
|
EE::Profiler.EmitOp(eeOpcode::MOV_F);
|
|
GET_S(EEREC_D);
|
|
}
|
|
|
|
FPURECOMPILE_CONSTCODE(MOV_S, XMMINFO_WRITED | XMMINFO_READS);
|
|
//------------------------------------------------------------------
|
|
|
|
|
|
//------------------------------------------------------------------
|
|
// MSUB XMM
|
|
//------------------------------------------------------------------
|
|
|
|
void recMSUB_S_xmm(int info)
|
|
{
|
|
EE::Profiler.EmitOp(eeOpcode::MSUB_F);
|
|
recMaddsub(info, EEREC_D, 1, false);
|
|
}
|
|
|
|
FPURECOMPILE_CONSTCODE(MSUB_S, XMMINFO_WRITED | XMMINFO_READACC | XMMINFO_READS | XMMINFO_READT);
|
|
|
|
void recMSUBA_S_xmm(int info)
|
|
{
|
|
EE::Profiler.EmitOp(eeOpcode::MSUBA_F);
|
|
recMaddsub(info, EEREC_ACC, 1, true);
|
|
}
|
|
|
|
FPURECOMPILE_CONSTCODE(MSUBA_S, XMMINFO_WRITEACC | XMMINFO_READACC | XMMINFO_READS | XMMINFO_READT);
|
|
//------------------------------------------------------------------
|
|
|
|
//------------------------------------------------------------------
|
|
// MUL XMM
|
|
//------------------------------------------------------------------
|
|
void recMUL_S_xmm(int info)
|
|
{
|
|
EE::Profiler.EmitOp(eeOpcode::MUL_F);
|
|
int sreg, treg;
|
|
ALLOC_S(sreg); ALLOC_T(treg);
|
|
|
|
FPU_MUL(info, EEREC_D, sreg, treg, false);
|
|
_freeXMMreg(sreg); _freeXMMreg(treg);
|
|
}
|
|
|
|
FPURECOMPILE_CONSTCODE(MUL_S, XMMINFO_WRITED | XMMINFO_READS | XMMINFO_READT);
|
|
|
|
void recMULA_S_xmm(int info)
|
|
{
|
|
EE::Profiler.EmitOp(eeOpcode::MULA_F);
|
|
int sreg, treg;
|
|
ALLOC_S(sreg); ALLOC_T(treg);
|
|
|
|
FPU_MUL(info, EEREC_ACC, sreg, treg, true);
|
|
_freeXMMreg(sreg); _freeXMMreg(treg);
|
|
}
|
|
|
|
FPURECOMPILE_CONSTCODE(MULA_S, XMMINFO_WRITEACC | XMMINFO_READS | XMMINFO_READT);
|
|
//------------------------------------------------------------------
|
|
|
|
|
|
//------------------------------------------------------------------
|
|
// NEG XMM
|
|
//------------------------------------------------------------------
|
|
void recNEG_S_xmm(int info)
|
|
{
|
|
EE::Profiler.EmitOp(eeOpcode::NEG_F);
|
|
GET_S(EEREC_D);
|
|
|
|
CLEAR_OU_FLAGS;
|
|
|
|
xXOR.PS(xRegisterSSE(EEREC_D), ptr[&s_const.neg[0]]);
|
|
}
|
|
|
|
FPURECOMPILE_CONSTCODE(NEG_S, XMMINFO_WRITED | XMMINFO_READS);
|
|
//------------------------------------------------------------------
|
|
|
|
|
|
//------------------------------------------------------------------
|
|
// SUB XMM
|
|
//------------------------------------------------------------------
|
|
|
|
void recSUB_S_xmm(int info)
|
|
{
|
|
EE::Profiler.EmitOp(eeOpcode::SUB_F);
|
|
recFPUOp(info, EEREC_D, 1, false);
|
|
}
|
|
|
|
FPURECOMPILE_CONSTCODE(SUB_S, XMMINFO_WRITED | XMMINFO_READS | XMMINFO_READT);
|
|
|
|
|
|
void recSUBA_S_xmm(int info)
|
|
{
|
|
EE::Profiler.EmitOp(eeOpcode::SUBA_F);
|
|
recFPUOp(info, EEREC_ACC, 1, true);
|
|
}
|
|
|
|
FPURECOMPILE_CONSTCODE(SUBA_S, XMMINFO_WRITEACC | XMMINFO_READS | XMMINFO_READT);
|
|
//------------------------------------------------------------------
|
|
|
|
|
|
//------------------------------------------------------------------
|
|
// SQRT XMM
|
|
//------------------------------------------------------------------
|
|
void recSQRT_S_xmm(int info)
|
|
{
|
|
EE::Profiler.EmitOp(eeOpcode::SQRT_F);
|
|
int roundmodeFlag = 0;
|
|
const int t1reg = _allocTempXMMreg(XMMT_FPS);
|
|
//Console.WriteLn("FPU: SQRT");
|
|
|
|
if (EmuConfig.Cpu.FPUFPCR.GetRoundMode() != FPRoundMode::Nearest)
|
|
{
|
|
// Set roundmode to nearest if it isn't already
|
|
//Console.WriteLn("sqrt to nearest");
|
|
roundmode_nearest = EmuConfig.Cpu.FPUFPCR;
|
|
roundmode_nearest.SetRoundMode(FPRoundMode::Nearest);
|
|
xLDMXCSR(ptr32[&roundmode_nearest.bitmask]);
|
|
roundmodeFlag = 1;
|
|
}
|
|
|
|
GET_T(EEREC_D);
|
|
|
|
if (FPU_FLAGS_ID)
|
|
{
|
|
xAND(ptr32[&fpuRegs.fprc[31]], ~(FPUflagI | FPUflagD)); // Clear I and D flags
|
|
|
|
//--- Check for negative SQRT --- (sqrt(-0) = 0, unlike what the docs say)
|
|
xMOVMSKPS(eax, xRegisterSSE(EEREC_D));
|
|
xAND(eax, 1); //Check sign
|
|
u8* pjmp = JZ8(0); //Skip if none are
|
|
xOR(ptr32[&fpuRegs.fprc[31]], FPUflagI | FPUflagSI); // Set I and SI flags
|
|
xAND.PS(xRegisterSSE(EEREC_D), ptr[&s_const.pos[0]]); // Make EEREC_D Positive
|
|
x86SetJ8(pjmp);
|
|
}
|
|
else
|
|
{
|
|
xAND.PS(xRegisterSSE(EEREC_D), ptr[&s_const.pos[0]]); // Make EEREC_D Positive
|
|
}
|
|
|
|
|
|
ToDouble(EEREC_D);
|
|
|
|
xSQRT.SD(xRegisterSSE(EEREC_D), xRegisterSSE(EEREC_D));
|
|
|
|
ToPS2FPU(EEREC_D, false, t1reg, false);
|
|
|
|
if (roundmodeFlag == 1)
|
|
xLDMXCSR(ptr32[&EmuConfig.Cpu.FPUFPCR.bitmask]);
|
|
|
|
_freeXMMreg(t1reg);
|
|
}
|
|
|
|
FPURECOMPILE_CONSTCODE(SQRT_S, XMMINFO_WRITED | XMMINFO_READT);
|
|
//------------------------------------------------------------------
|
|
|
|
|
|
//------------------------------------------------------------------
|
|
// RSQRT XMM
|
|
//------------------------------------------------------------------
|
|
void recRSQRThelper1(int regd, int regt) // Preforms the RSQRT function when regd <- Fs and regt <- Ft (Sets correct flags)
|
|
{
|
|
u8 *pjmp1, *pjmp2;
|
|
u8 *qjmp1, *qjmp2;
|
|
u32* pjmp32;
|
|
int t1reg = _allocTempXMMreg(XMMT_FPS);
|
|
|
|
xAND(ptr32[&fpuRegs.fprc[31]], ~(FPUflagI | FPUflagD)); // Clear I and D flags
|
|
|
|
//--- (first) Check for negative SQRT ---
|
|
xMOVMSKPS(eax, xRegisterSSE(regt));
|
|
xAND(eax, 1); //Check sign
|
|
pjmp2 = JZ8(0); //Skip if not set
|
|
xOR(ptr32[&fpuRegs.fprc[31]], FPUflagI | FPUflagSI); // Set I and SI flags
|
|
xAND.PS(xRegisterSSE(regt), ptr[&s_const.pos[0]]); // Make regt Positive
|
|
x86SetJ8(pjmp2);
|
|
|
|
//--- Check for zero ---
|
|
xXOR.PS(xRegisterSSE(t1reg), xRegisterSSE(t1reg));
|
|
xCMPEQ.SS(xRegisterSSE(t1reg), xRegisterSSE(regt));
|
|
xMOVMSKPS(eax, xRegisterSSE(t1reg));
|
|
xAND(eax, 1); //Check sign (if regt == zero, sign will be set)
|
|
pjmp1 = JZ8(0); //Skip if not set
|
|
|
|
//--- Check for 0/0 ---
|
|
xXOR.PS(xRegisterSSE(t1reg), xRegisterSSE(t1reg));
|
|
xCMPEQ.SS(xRegisterSSE(t1reg), xRegisterSSE(regd));
|
|
xMOVMSKPS(eax, xRegisterSSE(t1reg));
|
|
xAND(eax, 1); //Check sign (if regd == zero, sign will be set)
|
|
qjmp1 = JZ8(0); //Skip if not set
|
|
xOR(ptr32[&fpuRegs.fprc[31]], FPUflagI | FPUflagSI); // Set I and SI flags ( 0/0 )
|
|
qjmp2 = JMP8(0);
|
|
x86SetJ8(qjmp1); //x/0 but not 0/0
|
|
xOR(ptr32[&fpuRegs.fprc[31]], FPUflagD | FPUflagSD); // Set D and SD flags ( x/0 )
|
|
x86SetJ8(qjmp2);
|
|
|
|
SetMaxValue(regd); //clamp to max
|
|
pjmp32 = JMP32(0);
|
|
x86SetJ8(pjmp1);
|
|
|
|
ToDouble(regt); ToDouble(regd);
|
|
|
|
xSQRT.SD(xRegisterSSE(regt), xRegisterSSE(regt));
|
|
xDIV.SD(xRegisterSSE(regd), xRegisterSSE(regt));
|
|
|
|
ToPS2FPU(regd, false, regt, false);
|
|
x86SetJ32(pjmp32);
|
|
|
|
_freeXMMreg(t1reg);
|
|
}
|
|
|
|
void recRSQRThelper2(int regd, int regt) // Preforms the RSQRT function when regd <- Fs and regt <- Ft (Doesn't set flags)
|
|
{
|
|
xAND.PS(xRegisterSSE(regt), ptr[&s_const.pos[0]]); // Make regt Positive
|
|
|
|
ToDouble(regt); ToDouble(regd);
|
|
|
|
xSQRT.SD(xRegisterSSE(regt), xRegisterSSE(regt));
|
|
xDIV.SD(xRegisterSSE(regd), xRegisterSSE(regt));
|
|
|
|
ToPS2FPU(regd, false, regt, false);
|
|
}
|
|
|
|
void recRSQRT_S_xmm(int info)
|
|
{
|
|
EE::Profiler.EmitOp(eeOpcode::RSQRT_F);
|
|
int sreg, treg;
|
|
|
|
// iFPU (regular FPU) doesn't touch roundmode for rSQRT.
|
|
// Should this do the same? or is changing the roundmode to nearest the better
|
|
// behavior for both recs? --air
|
|
|
|
bool roundmodeFlag = false;
|
|
if (EmuConfig.Cpu.FPUFPCR.GetRoundMode() != FPRoundMode::Nearest)
|
|
{
|
|
// Set roundmode to nearest if it isn't already
|
|
//Console.WriteLn("sqrt to nearest");
|
|
roundmode_nearest = EmuConfig.Cpu.FPUFPCR;
|
|
roundmode_nearest.SetRoundMode(FPRoundMode::Nearest);
|
|
xLDMXCSR(ptr32[&roundmode_nearest.bitmask]);
|
|
roundmodeFlag = true;
|
|
}
|
|
|
|
ALLOC_S(sreg); ALLOC_T(treg);
|
|
|
|
if (FPU_FLAGS_ID)
|
|
recRSQRThelper1(sreg, treg);
|
|
else
|
|
recRSQRThelper2(sreg, treg);
|
|
|
|
xMOVSS(xRegisterSSE(EEREC_D), xRegisterSSE(sreg));
|
|
|
|
_freeXMMreg(treg); _freeXMMreg(sreg);
|
|
|
|
if (roundmodeFlag)
|
|
xLDMXCSR(ptr32[&EmuConfig.Cpu.FPUFPCR.bitmask]);
|
|
}
|
|
|
|
FPURECOMPILE_CONSTCODE(RSQRT_S, XMMINFO_WRITED | XMMINFO_READS | XMMINFO_READT);
|
|
|
|
|
|
} // namespace DOUBLE
|
|
} // namespace COP1
|
|
} // namespace OpcodeImpl
|
|
} // namespace Dynarec
|
|
} // namespace R5900
|
|
#endif
|