pcsx2/pcsx2/x86/iFPU.cpp

1925 lines
56 KiB
C++

// SPDX-FileCopyrightText: 2002-2023 PCSX2 Dev Team
// SPDX-License-Identifier: LGPL-3.0+
#include "Common.h"
#include "R5900OpcodeTables.h"
#include "iR5900.h"
#include "iFPU.h"
using namespace x86Emitter;
// Clamp limits used by the FPU recompiler, stored as raw 32-bit patterns and
// replicated across all four lanes. They are 16-byte aligned so they can be
// used directly as 128-bit memory operands (fpuFloat3 reads them via ptr128).
// g_minvals holds 0xff7fffff (the "-fMax" pattern), g_maxvals holds 0x7f7fffff ("+fMax").
alignas(16) const u32 g_minvals[4] = {0xff7fffff, 0xff7fffff, 0xff7fffff, 0xff7fffff};
alignas(16) const u32 g_maxvals[4] = {0x7f7fffff, 0x7f7fffff, 0x7f7fffff, 0x7f7fffff};
//------------------------------------------------------------------
namespace R5900 {
namespace Dynarec {
namespace OpcodeImpl {
namespace COP1 {
// Forward declarations of the recompiler routines that live in the DOUBLE
// namespace. They are only declared here; the definitions are in another
// translation unit (a comment in recCVT_W below refers to iFPUd.cpp).
// In the visible part of this file, recCVT_W() dispatches to DOUBLE::recCVT_W()
// when CHECK_FPU_FULL is set.
namespace DOUBLE
{
void recABS_S_xmm(int info);
void recADD_S_xmm(int info);
void recADDA_S_xmm(int info);
void recC_EQ_xmm(int info);
void recC_LE_xmm(int info);
void recC_LT_xmm(int info);
void recCVT_S_xmm(int info);
void recCVT_W();
void recDIV_S_xmm(int info);
void recMADD_S_xmm(int info);
void recMADDA_S_xmm(int info);
void recMAX_S_xmm(int info);
void recMIN_S_xmm(int info);
void recMOV_S_xmm(int info);
void recMSUB_S_xmm(int info);
void recMSUBA_S_xmm(int info);
void recMUL_S_xmm(int info);
void recMULA_S_xmm(int info);
void recNEG_S_xmm(int info);
void recSUB_S_xmm(int info);
void recSUBA_S_xmm(int info);
void recSQRT_S_xmm(int info);
void recRSQRT_S_xmm(int info);
}; // namespace DOUBLE
//------------------------------------------------------------------
// Helper Macros
//------------------------------------------------------------------
// FPU operand field names, mapped onto the generic instruction-field macros.
#define _Ft_ _Rt_
#define _Fs_ _Rd_
#define _Fd_ _Sa_
// FCR31 Flags
// C is the condition bit: the C.x.S compares below set/clear it and the BC1x
// branches test it. The divide helper sets I together with SI for 0/0 and
// D together with SD for x/0.
// NOTE(review): the S-prefixed bits are presumably the sticky variants of
// I/D/O/U - confirm against the EE documentation.
#define FPUflagC 0x00800000
#define FPUflagI 0x00020000
#define FPUflagD 0x00010000
#define FPUflagO 0x00008000
#define FPUflagU 0x00004000
#define FPUflagSI 0x00000040
#define FPUflagSD 0x00000020
#define FPUflagSO 0x00000010
#define FPUflagSU 0x00000008
// Add/Sub opcodes produce the same results as the ps2
// (when non-zero, FPU_ADD/FPU_SUB route through FPU_ADD_SUB instead of a plain ADDSS/SUBSS)
#define FPU_CORRECT_ADD_SUB 1
// 128-bit AND masks that only affect lane 0: s_neg keeps just the sign bit of
// lane 0, s_pos clears just the sign bit of lane 0. The upper three lanes are
// all ones, so they pass through an AND unchanged.
alignas(16) static const u32 s_neg[4] = {0x80000000, 0xffffffff, 0xffffffff, 0xffffffff};
alignas(16) static const u32 s_pos[4] = {0x7fffffff, 0xffffffff, 0xffffffff, 0xffffffff};
// REC_FPUBRANCH(f): declares f() and defines rec##f(), an interpreter fallback
// for a branch opcode. The generated function calls iFlushCall(FLUSH_INTERPRETER),
// emits a call to R5900::Interpreter::OpcodeImpl::COP1::f, and then sets
// g_branch = 2.
#define REC_FPUBRANCH(f) \
void f(); \
void rec##f() \
{ \
iFlushCall(FLUSH_INTERPRETER); \
xFastCall((void*)(uptr)R5900::Interpreter::OpcodeImpl::COP1::f); \
g_branch = 2; \
}
// REC_FPUFUNC(f): declares f() and defines rec##f(), an interpreter fallback
// for a non-branch opcode. The generated function calls
// iFlushCall(FLUSH_INTERPRETER) and emits a call to
// R5900::Interpreter::OpcodeImpl::COP1::f. Unlike REC_FPUBRANCH it does not
// touch g_branch.
#define REC_FPUFUNC(f) \
void f(); \
void rec##f() \
{ \
iFlushCall(FLUSH_INTERPRETER); \
xFastCall((void*)(uptr)R5900::Interpreter::OpcodeImpl::COP1::f); \
}
//------------------------------------------------------------------
//------------------------------------------------------------------
// *FPU Opcodes!*
//------------------------------------------------------------------
// These opcodes are marked as special, but I don't understand why we can't run them in the interpreter.
#ifndef FPU_RECOMPILE
// Interpreter fallbacks (used only when FPU_RECOMPILE is not defined): each
// line expands to a recXXX() that flushes and calls the interpreter's COP1::XXX.
REC_FPUFUNC(CFC1);
REC_FPUFUNC(CTC1);
REC_FPUFUNC(MFC1);
REC_FPUFUNC(MTC1);
#else
//------------------------------------------------------------------
// CFC1 / CTC1
//------------------------------------------------------------------
void recCFC1(void)
{
if (!_Rt_)
return;
EE::Profiler.EmitOp(eeOpcode::CFC1);
const int regt = _allocX86reg(X86TYPE_GPR, _Rt_, MODE_WRITE);
if (_Fs_ >= 16)
{
xMOV(xRegister32(regt), ptr32[&fpuRegs.fprc[31]]);
xAND(xRegister32(regt), 0x0083c078); //remove always-zero bits
xOR(xRegister32(regt), 0x01000001); //set always-one bits
xMOVSX(xRegister64(regt), xRegister32(regt));
}
else
{
xMOVSX(xRegister64(regt), ptr32[&fpuRegs.fprc[0]]);
}
}
// CTC1: emits code that stores the low 32 bits of GPR rt into fprc[fs].
// Only fs == 31 is handled; any other destination emits nothing.
// The source is taken from, in order of preference: the known constant value,
// an XMM register caching the GPR, an x86 GPR caching it, or memory (via eax).
void recCTC1()
{
    if (_Fs_ != 31)
        return;

    EE::Profiler.EmitOp(eeOpcode::CTC1);

    // Constant source: store the immediate directly.
    if (GPR_IS_CONST1(_Rt_))
    {
        xMOV(ptr32[&fpuRegs.fprc[_Fs_]], g_cpuConstRegs[_Rt_].UL[0]);
        return;
    }

    // Source already cached in an XMM register.
    const int xmmSrc = _checkXMMreg(XMMTYPE_GPRREG, _Rt_, MODE_READ);
    if (xmmSrc >= 0)
    {
        xMOVSS(ptr[&fpuRegs.fprc[_Fs_]], xRegisterSSE(xmmSrc));
        return;
    }

    // Source already cached in a host GPR.
    const int gprSrc = _checkX86reg(X86TYPE_GPR, _Rt_, MODE_READ);
    if (gprSrc >= 0)
    {
        xMOV(ptr32[&fpuRegs.fprc[_Fs_]], xRegister32(gprSrc));
        return;
    }

    // Not cached anywhere: go through memory using eax as scratch.
    _deleteGPRtoXMMreg(_Rt_, 1);
    xMOV(eax, ptr[&cpuRegs.GPR.r[_Rt_].UL[0]]);
    xMOV(ptr[&fpuRegs.fprc[_Fs_]], eax);
}
//------------------------------------------------------------------
//------------------------------------------------------------------
// MFC1
//------------------------------------------------------------------
// MFC1: emits code that copies FPU register fs into GPR rt, sign-extending the
// 32-bit value to 64 bits. Nothing is emitted when rt is register 0.
// Three code paths exist depending on where the operands currently live:
//   xmm -> xmm, xmm -> host GPR, memory -> host GPR.
void recMFC1()
{
    if (!_Rt_)
        return;

    EE::Profiler.EmitOp(eeOpcode::MFC1);

    // The allocation order (GPR first, then FPU) matches the original code.
    const int gprXmm = _allocIfUsedGPRtoXMM(_Rt_, MODE_READ | MODE_WRITE);
    const int fpuXmm = _allocIfUsedFPUtoXMM(_Fs_, MODE_READ);
    const bool srcInXmm = (fpuXmm >= 0);

    if (srcInXmm && gprXmm >= 0)
    {
        // if we're in xmm, we shouldn't be const
        pxAssert(!GPR_IS_CONST1(_Rt_));

        // Both sides are in XMM registers: build the sign word with an
        // arithmetic shift by 31, move the low dword, then insert the sign
        // word into lane 1 of the destination.
        const int signReg = _allocTempXMMreg(XMMT_FPS);
        xMOVAPS(xRegisterSSE(signReg), xRegisterSSE(fpuXmm));
        xPSRA.D(xRegisterSSE(signReg), 31);
        xMOVSS(xRegisterSSE(gprXmm), xRegisterSSE(fpuXmm));
        xINSERTPS(xRegisterSSE(gprXmm), xRegisterSSE(signReg), _MM_MK_INSERTPS_NDX(0, 1, 0));
        _freeXMMreg(signReg);
        return;
    }

    // Destination goes to a host GPR.
    const int dstGpr = _allocX86reg(X86TYPE_GPR, _Rt_, MODE_WRITE);

    // shouldn't be const after we're writing.
    pxAssert(!GPR_IS_CONST1(_Rt_));

    if (!srcInXmm)
    {
        // mem -> gpr
        xMOVSX(xRegister64(dstGpr), ptr32[&fpuRegs.fpr[_Fs_].UL]);
        return;
    }

    // xmm -> gpr
    xMOVD(xRegister32(dstGpr), xRegisterSSE(fpuXmm));
    xMOVSX(xRegister64(dstGpr), xRegister32(dstGpr));
}
//------------------------------------------------------------------
//------------------------------------------------------------------
// MTC1
//------------------------------------------------------------------
void recMTC1()
{
EE::Profiler.EmitOp(eeOpcode::MTC1);
if (GPR_IS_CONST1(_Rt_))
{
const int xmmreg = _allocIfUsedFPUtoXMM(_Fs_, MODE_WRITE);
if (xmmreg >= 0)
{
// common case: mtc1 zero, fnn
if (g_cpuConstRegs[_Rt_].UL[0] == 0)
{
xPXOR(xRegisterSSE(xmmreg), xRegisterSSE(xmmreg));
}
else
{
// may as well flush the constant register, since we're needing it in a gpr anyway
const int x86reg = _allocX86reg(X86TYPE_GPR, _Rt_, MODE_READ);
xMOVDZX(xRegisterSSE(xmmreg), xRegister32(x86reg));
}
}
else
{
pxAssert(!_hasXMMreg(XMMTYPE_FPREG, _Fs_));
xMOV(ptr32[&fpuRegs.fpr[_Fs_].UL], g_cpuConstRegs[_Rt_].UL[0]);
}
}
else
{
const int xmmgpr = _checkXMMreg(XMMTYPE_GPRREG, _Rt_, MODE_READ);
if (xmmgpr >= 0)
{
if (g_pCurInstInfo->regs[_Rt_] & EEINST_LASTUSE)
{
// transfer the reg directly
_deleteFPtoXMMreg(_Fs_, DELETE_REG_FREE_NO_WRITEBACK);
_reallocateXMMreg(xmmgpr, XMMTYPE_FPREG, _Fs_, MODE_WRITE);
}
else
{
const int xmmreg2 = _allocIfUsedFPUtoXMM(_Fs_, MODE_WRITE);
if (xmmreg2 >= 0)
xMOVSS(xRegisterSSE(xmmreg2), xRegisterSSE(xmmgpr));
else
xMOVSS(ptr[&fpuRegs.fpr[_Fs_].UL], xRegisterSSE(xmmgpr));
}
}
else
{
// may as well cache it..
const int regt = _allocX86reg(X86TYPE_GPR, _Rt_, MODE_READ);
const int mmreg2 = _allocIfUsedFPUtoXMM(_Fs_, MODE_WRITE);
if (mmreg2 >= 0)
{
xMOVDZX(xRegisterSSE(mmreg2), xRegister32(regt));
}
else
{
xMOV(ptr32[&fpuRegs.fpr[_Fs_].UL], xRegister32(regt));
}
}
}
}
#endif
//------------------------------------------------------------------
#ifndef FPU_RECOMPILE // If FPU_RECOMPILE is not defined, then use the interpreter opcodes. (CFC1, CTC1, MFC1, and MTC1 are special because they work specifically with the EE rec so they're defined above)
// Interpreter fallbacks for every remaining COP1 opcode. REC_FPUFUNC is used
// for ordinary opcodes; the four BC1x branches use REC_FPUBRANCH, which
// additionally sets g_branch = 2.
REC_FPUFUNC(ABS_S);
REC_FPUFUNC(ADD_S);
REC_FPUFUNC(ADDA_S);
REC_FPUBRANCH(BC1F);
REC_FPUBRANCH(BC1T);
REC_FPUBRANCH(BC1FL);
REC_FPUBRANCH(BC1TL);
REC_FPUFUNC(C_EQ);
REC_FPUFUNC(C_F);
REC_FPUFUNC(C_LE);
REC_FPUFUNC(C_LT);
REC_FPUFUNC(CVT_S);
REC_FPUFUNC(CVT_W);
REC_FPUFUNC(DIV_S);
REC_FPUFUNC(MAX_S);
REC_FPUFUNC(MIN_S);
REC_FPUFUNC(MADD_S);
REC_FPUFUNC(MADDA_S);
REC_FPUFUNC(MOV_S);
REC_FPUFUNC(MSUB_S);
REC_FPUFUNC(MSUBA_S);
REC_FPUFUNC(MUL_S);
REC_FPUFUNC(MULA_S);
REC_FPUFUNC(NEG_S);
REC_FPUFUNC(SUB_S);
REC_FPUFUNC(SUBA_S);
REC_FPUFUNC(SQRT_S);
REC_FPUFUNC(RSQRT_S);
#else // FPU_RECOMPILE
//------------------------------------------------------------------
// Clamp Functions (Converts NaN's and Infinities to Normal Numbers)
//------------------------------------------------------------------
// Returns the index of an XMM register holding the value of `xmmreg` that the
// caller may clamp destructively.
//  - If FPUINST_USEDTEST(fpureg) is set, a fresh temp is allocated and the
//    value is copied into it, leaving `xmmreg` untouched.
//  - Otherwise `xmmreg` itself is flushed (when FPUINST_LIVETEST is set),
//    re-tagged as a temp register and returned.
// Callers release the result with fpuFreeIfTemp().
static int fpuCopyToTempForClamp(int fpureg, int xmmreg)
{
    const bool usedLater = FPUINST_USEDTEST(fpureg);
    if (!usedLater)
    {
        // flush back the original value, before we mess with it below
        if (FPUINST_LIVETEST(fpureg))
            _flushXMMreg(xmmreg);

        // turn it into a temp, so in case the liveness was incorrect, we don't reuse it after clamp
        _reallocateXMMreg(xmmreg, XMMTYPE_TEMP, 0, 0, true);
        return xmmreg;
    }

    const int scratch = _allocTempXMMreg(XMMT_FPS);
    xMOVSS(xRegisterSSE(scratch), xRegisterSSE(xmmreg));
    return scratch;
}
// Frees `xmmreg` only if the allocator currently marks it as an in-use temp
// register; any other register is left alone.
static void fpuFreeIfTemp(int xmmreg)
{
    const auto& slot = xmmregs[xmmreg];
    if (!slot.inuse || slot.type != XMMTYPE_TEMP)
        return;

    _freeXMMreg(xmmreg);
}
// Unconditional sign-preserving clamp:
//   +NaN -> +fMax, -NaN -> -fMax, +Inf -> +fMax, -Inf -> -fMax
// Implemented with integer minimums on the raw bit patterns: a signed min
// against g_maxvals followed by an unsigned min against g_minvals.
// Both operations are 128 bits wide, so all four lanes of regd are processed.
__fi void fpuFloat3(int regd)
{
    const xRegisterSSE value(regd);
    xPMIN.SD(value, ptr128[&g_maxvals[0]]);
    xPMIN.UD(value, ptr128[&g_minvals[0]]);
}
// Scalar clamp, emitted only when CHECK_FPU_OVERFLOW is enabled:
//   +/-NaN -> +fMax, +Inf -> +fMax, -Inf -> -fMax
__fi void fpuFloat(int regd)
{
    const bool clampEnabled = CHECK_FPU_OVERFLOW;
    if (!clampEnabled)
        return;

    const xRegisterSSE value(regd);
    // MIN() must be before MAX()! So that NaN's become +Maximum
    xMIN.SS(value, ptr[&g_maxvals[0]]);
    xMAX.SS(value, ptr[&g_minvals[0]]);
}
// Sign-preserving clamp (see fpuFloat3), emitted only when CHECK_FPU_OVERFLOW
// is enabled:
//   +NaN -> +fMax, -NaN -> -fMax, +Inf -> +fMax, -Inf -> -fMax
__fi void fpuFloat2(int regd)
{
    const bool clampEnabled = CHECK_FPU_OVERFLOW;
    if (!clampEnabled)
        return;

    fpuFloat3(regd);
}
// Result clamp applied after arithmetic opcodes. This is a thin wrapper around
// fpuFloat(), so it only emits code when CHECK_FPU_OVERFLOW is enabled.
void ClampValues(int regd)
{
    fpuFloat(regd);
}
//------------------------------------------------------------------
//------------------------------------------------------------------
// ABS XMM
//------------------------------------------------------------------
void recABS_S_xmm(int info)
{
EE::Profiler.EmitOp(eeOpcode::ABS_F);
if (info & PROCESS_EE_S)
xMOVSS(xRegisterSSE(EEREC_D), xRegisterSSE(EEREC_S));
else
xMOVSSZX(xRegisterSSE(EEREC_D), ptr[&fpuRegs.fpr[_Fs_]]);
xAND.PS(xRegisterSSE(EEREC_D), ptr[&s_pos[0]]);
//xAND(ptr32[&fpuRegs.fprc[31]], ~(FPUflagO|FPUflagU)); // Clear O and U flags
if (CHECK_FPU_OVERFLOW) // Only need to do positive clamp, since EEREC_D is positive
xMIN.SS(xRegisterSSE(EEREC_D), ptr[&g_maxvals[0]]);
}
FPURECOMPILE_CONSTCODE(ABS_S, XMMINFO_WRITED | XMMINFO_READS);
//------------------------------------------------------------------
//------------------------------------------------------------------
// FPU_ADD_SUB (Used to mimic PS2's FPU add/sub behavior)
//------------------------------------------------------------------
// An IEEE-compliant FPU uses additional "guard" bits to the right of the mantissa during computations,
// but the EE FPU doesn't. Subtraction (and addition of a positive and a negative value) may shift the mantissa left,
// causing those bits to appear in the result; this function masks out the bits of the mantissa that will
// get shifted right into the guard bits, to ensure that the guard bits are empty.
// The difference of the exponents = the amount that the smaller operand will be shifted right by.
// Modification - the PS2 uses a single guard bit? (Coded by Nneeve)
//------------------------------------------------------------------
void FPU_ADD_SUB(int regd, int regt, int issub)
{
const int xmmtemp = _allocTempXMMreg(XMMT_FPS); //temporary for anding with regd/regt
xMOVD(ecx, xRegisterSSE(regd)); // ecx receives regd
xMOVD(eax, xRegisterSSE(regt)); // eax receives regt
//mask the exponents
xSHR(ecx, 23);
xSHR(eax, 23);
xAND(ecx, 0xff);
xAND(eax, 0xff);
xSUB(ecx, eax); //tempecx = exponent difference
xCMP(ecx, 25);
j8Ptr[0] = JGE8(0);
xCMP(ecx, 0);
j8Ptr[1] = JG8(0);
j8Ptr[2] = JE8(0);
xCMP(ecx, -25);
j8Ptr[3] = JLE8(0);
//diff = -24 .. -1 , expd < expt
xNEG(ecx);
xDEC(ecx);
xMOV(eax, 0xffffffff);
xSHL(eax, cl); //temp2 = 0xffffffff << tempecx
xMOVDZX(xRegisterSSE(xmmtemp), eax);
xAND.PS(xRegisterSSE(regd), xRegisterSSE(xmmtemp));
if (issub)
xSUB.SS(xRegisterSSE(regd), xRegisterSSE(regt));
else
xADD.SS(xRegisterSSE(regd), xRegisterSSE(regt));
j8Ptr[4] = JMP8(0);
x86SetJ8(j8Ptr[0]);
//diff = 25 .. 255 , expt < expd
xMOVAPS(xRegisterSSE(xmmtemp), xRegisterSSE(regt));
xAND.PS(xRegisterSSE(xmmtemp), ptr[s_neg]);
if (issub)
xSUB.SS(xRegisterSSE(regd), xRegisterSSE(xmmtemp));
else
xADD.SS(xRegisterSSE(regd), xRegisterSSE(xmmtemp));
j8Ptr[5] = JMP8(0);
x86SetJ8(j8Ptr[1]);
//diff = 1 .. 24, expt < expd
xDEC(ecx);
xMOV(eax, 0xffffffff);
xSHL(eax, cl); //temp2 = 0xffffffff << tempecx
xMOVDZX(xRegisterSSE(xmmtemp), eax);
xAND.PS(xRegisterSSE(xmmtemp), xRegisterSSE(regt));
if (issub)
xSUB.SS(xRegisterSSE(regd), xRegisterSSE(xmmtemp));
else
xADD.SS(xRegisterSSE(regd), xRegisterSSE(xmmtemp));
j8Ptr[6] = JMP8(0);
x86SetJ8(j8Ptr[3]);
//diff = -255 .. -25, expd < expt
xAND.PS(xRegisterSSE(regd), ptr[s_neg]);
if (issub)
xSUB.SS(xRegisterSSE(regd), xRegisterSSE(regt));
else
xADD.SS(xRegisterSSE(regd), xRegisterSSE(regt));
j8Ptr[7] = JMP8(0);
x86SetJ8(j8Ptr[2]);
//diff == 0
if (issub)
xSUB.SS(xRegisterSSE(regd), xRegisterSSE(regt));
else
xADD.SS(xRegisterSSE(regd), xRegisterSSE(regt));
x86SetJ8(j8Ptr[4]);
x86SetJ8(j8Ptr[5]);
x86SetJ8(j8Ptr[6]);
x86SetJ8(j8Ptr[7]);
_freeXMMreg(xmmtemp);
}
// Emits regd += regt. Uses the PS2-accurate FPU_ADD_SUB sequence when
// FPU_CORRECT_ADD_SUB is non-zero, otherwise a plain scalar add.
void FPU_ADD(int regd, int regt)
{
    if (!FPU_CORRECT_ADD_SUB)
    {
        xADD.SS(xRegisterSSE(regd), xRegisterSSE(regt));
        return;
    }

    FPU_ADD_SUB(regd, regt, 0);
}
// Emits regd -= regt. Uses the PS2-accurate FPU_ADD_SUB sequence when
// FPU_CORRECT_ADD_SUB is non-zero, otherwise a plain scalar subtract.
void FPU_SUB(int regd, int regt)
{
    if (!FPU_CORRECT_ADD_SUB)
    {
        xSUB.SS(xRegisterSSE(regd), xRegisterSSE(regt));
        return;
    }

    FPU_ADD_SUB(regd, regt, 1);
}
//------------------------------------------------------------------
// Note: PS2's multiplication uses some variant of booth multiplication with wallace trees:
// It cuts off some bits, resulting in inaccurate and non-commutative results.
// The PS2's result mantissa is either equal to x86's rounding to zero result mantissa
// or SMALLER (by 0x1). (this means that x86's other rounding modes are only less similar to PS2's mul)
//------------------------------------------------------------------
void FPU_MUL(int regd, int regt, bool reverseOperands)
{
u8 *endMul = nullptr;
if (CHECK_FPUMULHACK)
{
// if ((s == 0x3e800000) && (t == 0x40490fdb))
// return 0x3f490fda; // needed for Tales of Destiny Remake (only in a very specific room late-game)
// else
// return 0;
alignas(16) static constexpr const u32 result[4] = { 0x3f490fda };
xMOVD(ecx, xRegisterSSE(reverseOperands ? regt : regd));
xMOVD(edx, xRegisterSSE(reverseOperands ? regd : regt));
// if (((s ^ 0x3e800000) | (t ^ 0x40490fdb)) != 0) { hack; }
xXOR(ecx, 0x3e800000);
xXOR(edx, 0x40490fdb);
xOR(edx, ecx);
u8* noHack = JNZ8(0);
xMOVAPS(xRegisterSSE(regd), ptr128[result]);
endMul = JMP8(0);
x86SetJ8(noHack);
}
xMUL.SS(xRegisterSSE(regd), xRegisterSSE(regt));
if (CHECK_FPUMULHACK)
x86SetJ8(endMul);
}
// Two-argument form: regd *= regt with the operands in their natural order.
void FPU_MUL(int regd, int regt)
{
    FPU_MUL(regd, regt, false);
}
// regd *= regt, telling FPU_MUL that the operands are reversed (regt is "s").
void FPU_MUL_REV(int regd, int regt)
{
    FPU_MUL(regd, regt, true);
}
//------------------------------------------------------------------
// CommutativeOp XMM (used for ADD, MUL, MAX, and MIN opcodes)
//------------------------------------------------------------------
// Emitter dispatch tables indexed by the `op` argument of recCommutativeOp:
// 0 = add, 1 = mul, 2 = max, 3 = min. Each entry takes (destination, source)
// XMM register indices.
static void (*recComOpXMM_to_XMM[])(x86SSERegType, x86SSERegType) = {
FPU_ADD, FPU_MUL, SSE_MAXSS_XMM_to_XMM, SSE_MINSS_XMM_to_XMM};
// Same table for calls where the operands were swapped; it differs only in
// entry 1, which uses FPU_MUL_REV.
static void (*recComOpXMM_to_XMM_REV[])(x86SSERegType, x86SSERegType) = { //reversed operands
FPU_ADD, FPU_MUL_REV, SSE_MAXSS_XMM_to_XMM, SSE_MINSS_XMM_to_XMM};
//static void (*recComOpM32_to_XMM[] )(x86SSERegType, uptr) = {
// SSE_ADDSS_M32_to_XMM, SSE_MULSS_M32_to_XMM, SSE_MAXSS_M32_to_XMM, SSE_MINSS_M32_to_XMM };
int recCommutativeOp(int info, int regd, int op)
{
int t0reg = _allocTempXMMreg(XMMT_FPS);
switch (info & (PROCESS_EE_S | PROCESS_EE_T))
{
case PROCESS_EE_S:
if (regd == EEREC_S)
{
xMOVSSZX(xRegisterSSE(t0reg), ptr[&fpuRegs.fpr[_Ft_]]);
if (CHECK_FPU_EXTRA_OVERFLOW /*&& !CHECK_FPUCLAMPHACK */ || (op >= 2))
{
fpuFloat2(regd);
fpuFloat2(t0reg);
}
recComOpXMM_to_XMM[op](regd, t0reg);
}
else
{
xMOVSSZX(xRegisterSSE(regd), ptr[&fpuRegs.fpr[_Ft_]]);
if (CHECK_FPU_EXTRA_OVERFLOW || (op >= 2))
{
fpuFloat2(regd);
fpuFloat2(EEREC_S);
}
recComOpXMM_to_XMM_REV[op](regd, EEREC_S);
}
break;
case PROCESS_EE_T:
if (regd == EEREC_T)
{
xMOVSSZX(xRegisterSSE(t0reg), ptr[&fpuRegs.fpr[_Fs_]]);
if (CHECK_FPU_EXTRA_OVERFLOW || (op >= 2))
{
fpuFloat2(regd);
fpuFloat2(t0reg);
}
recComOpXMM_to_XMM_REV[op](regd, t0reg);
}
else
{
xMOVSSZX(xRegisterSSE(regd), ptr[&fpuRegs.fpr[_Fs_]]);
if (CHECK_FPU_EXTRA_OVERFLOW || (op >= 2))
{
fpuFloat2(regd);
fpuFloat2(EEREC_T);
}
recComOpXMM_to_XMM[op](regd, EEREC_T);
}
break;
case (PROCESS_EE_S | PROCESS_EE_T):
if (regd == EEREC_T)
{
if (CHECK_FPU_EXTRA_OVERFLOW || (op >= 2))
{
fpuFloat2(regd);
fpuFloat2(EEREC_S);
}
recComOpXMM_to_XMM_REV[op](regd, EEREC_S);
}
else
{
xMOVSS(xRegisterSSE(regd), xRegisterSSE(EEREC_S));
if (CHECK_FPU_EXTRA_OVERFLOW || (op >= 2))
{
fpuFloat2(regd);
fpuFloat2(EEREC_T);
}
recComOpXMM_to_XMM[op](regd, EEREC_T);
}
break;
default:
Console.WriteLn(Color_Magenta, "FPU: recCommutativeOp case 4");
xMOVSSZX(xRegisterSSE(regd), ptr[&fpuRegs.fpr[_Fs_]]);
xMOVSSZX(xRegisterSSE(t0reg), ptr[&fpuRegs.fpr[_Ft_]]);
if (CHECK_FPU_EXTRA_OVERFLOW || (op >= 2))
{
fpuFloat2(regd);
fpuFloat2(t0reg);
}
recComOpXMM_to_XMM[op](regd, t0reg);
break;
}
_freeXMMreg(t0reg);
return regd;
}
//------------------------------------------------------------------
//------------------------------------------------------------------
// ADD XMM
//------------------------------------------------------------------
// ADD.S: fd = fs + ft, via recCommutativeOp (op 0), followed by the result clamp.
void recADD_S_xmm(int info)
{
    EE::Profiler.EmitOp(eeOpcode::ADD_F);
    //xAND(ptr32[&fpuRegs.fprc[31]], ~(FPUflagO|FPUflagU)); // Clear O and U flags
    const int resultReg = recCommutativeOp(info, EEREC_D, 0);
    ClampValues(resultReg);
    //REC_FPUOP(ADD_S);
}
FPURECOMPILE_CONSTCODE(ADD_S, XMMINFO_WRITED | XMMINFO_READS | XMMINFO_READT);
// ADDA.S: ACC = fs + ft, via recCommutativeOp (op 0) targeting the accumulator
// register, followed by the result clamp.
void recADDA_S_xmm(int info)
{
    EE::Profiler.EmitOp(eeOpcode::ADDA_F);
    //xAND(ptr32[&fpuRegs.fprc[31]], ~(FPUflagO|FPUflagU)); // Clear O and U flags
    const int resultReg = recCommutativeOp(info, EEREC_ACC, 0);
    ClampValues(resultReg);
}
FPURECOMPILE_CONSTCODE(ADDA_S, XMMINFO_WRITEACC | XMMINFO_READS | XMMINFO_READT);
//------------------------------------------------------------------
//------------------------------------------------------------------
// BC1x XMM
//------------------------------------------------------------------
// Shared prologue of the BC1x branches: flushes dirty EE state, then emits a
// TEST of the condition bit so the caller can follow up with JZ/JNZ.
//
// COP1 branch conditionals are based on the following expression:
//   (fpuRegs.fprc[31] & 0x00800000)
// BC1F/BC1FL branch when it is false, BC1T/BC1TL branch when it is true.
// The generated code uses eax as scratch.
static void _setupBranchTest()
{
    _eeFlushAllDirty();

    xMOV(eax, ptr[&fpuRegs.fprc[31]]);
    xTEST(eax, FPUflagC);
}
void recBC1F()
{
EE::Profiler.EmitOp(eeOpcode::BC1F);
const u32 branchTo = ((s32)_Imm_ * 4) + pc;
const bool swap = TrySwapDelaySlot(0, 0, 0, true);
_setupBranchTest();
recDoBranchImm(branchTo, JNZ32(0), false, swap);
}
void recBC1T()
{
EE::Profiler.EmitOp(eeOpcode::BC1T);
const u32 branchTo = ((s32)_Imm_ * 4) + pc;
const bool swap = TrySwapDelaySlot(0, 0, 0, true);
_setupBranchTest();
recDoBranchImm(branchTo, JZ32(0), false, swap);
}
void recBC1FL()
{
EE::Profiler.EmitOp(eeOpcode::BC1FL);
const u32 branchTo = ((s32)_Imm_ * 4) + pc;
_setupBranchTest();
recDoBranchImm(branchTo, JNZ32(0), true, false);
}
void recBC1TL()
{
EE::Profiler.EmitOp(eeOpcode::BC1TL);
const u32 branchTo = ((s32)_Imm_ * 4) + pc;
_setupBranchTest();
recDoBranchImm(branchTo, JZ32(0), true, false);
}
//------------------------------------------------------------------
//------------------------------------------------------------------
// C.x.S XMM
//------------------------------------------------------------------
void recC_EQ_xmm(int info)
{
EE::Profiler.EmitOp(eeOpcode::CEQ_F);
//Console.WriteLn("recC_EQ_xmm()");
switch (info & (PROCESS_EE_S | PROCESS_EE_T))
{
case PROCESS_EE_S:
{
const int regs = fpuCopyToTempForClamp(_Fs_, EEREC_S);
fpuFloat3(regs);
const int t0reg = _allocTempXMMreg(XMMT_FPS);
xMOVSSZX(xRegisterSSE(t0reg), ptr[&fpuRegs.fpr[_Ft_]]);
fpuFloat3(t0reg);
xUCOMI.SS(xRegisterSSE(regs), xRegisterSSE(t0reg));
_freeXMMreg(t0reg);
fpuFreeIfTemp(regs);
}
break;
case PROCESS_EE_T:
{
const int regt = fpuCopyToTempForClamp(_Ft_, EEREC_T);
fpuFloat3(regt);
const int t0reg = _allocTempXMMreg(XMMT_FPS);
xMOVSSZX(xRegisterSSE(t0reg), ptr[&fpuRegs.fpr[_Fs_]]);
fpuFloat3(t0reg);
xUCOMI.SS(xRegisterSSE(t0reg), xRegisterSSE(regt));
_freeXMMreg(t0reg);
fpuFreeIfTemp(regt);
}
break;
case (PROCESS_EE_S | PROCESS_EE_T):
{
const int regs = fpuCopyToTempForClamp(_Fs_, EEREC_S);
fpuFloat3(regs);
const int regt = fpuCopyToTempForClamp(_Ft_, EEREC_T);
fpuFloat3(regt);
xUCOMI.SS(xRegisterSSE(regs), xRegisterSSE(regt));
fpuFreeIfTemp(regs);
fpuFreeIfTemp(regt);
}
break;
default:
Console.WriteLn(Color_Magenta, "recC_EQ_xmm: Default");
xMOV(eax, ptr[&fpuRegs.fpr[_Fs_]]);
xCMP(eax, ptr[&fpuRegs.fpr[_Ft_]]);
j8Ptr[0] = JZ8(0);
xAND(ptr32[&fpuRegs.fprc[31]], ~FPUflagC);
j8Ptr[1] = JMP8(0);
x86SetJ8(j8Ptr[0]);
xOR(ptr32[&fpuRegs.fprc[31]], FPUflagC);
x86SetJ8(j8Ptr[1]);
return;
}
j8Ptr[0] = JZ8(0);
xAND(ptr32[&fpuRegs.fprc[31]], ~FPUflagC);
j8Ptr[1] = JMP8(0);
x86SetJ8(j8Ptr[0]);
xOR(ptr32[&fpuRegs.fprc[31]], FPUflagC);
x86SetJ8(j8Ptr[1]);
}
FPURECOMPILE_CONSTCODE(C_EQ, XMMINFO_READS | XMMINFO_READT);
//REC_FPUFUNC(C_EQ);
// C.F.S: always-false compare; emits a single AND that clears the condition
// bit in FCR31.
void recC_F()
{
    EE::Profiler.EmitOp(eeOpcode::CF_F);

    xAND(ptr32[&fpuRegs.fprc[31]], ~FPUflagC);
}
//REC_FPUFUNC(C_F);
void recC_LE_xmm(int info)
{
EE::Profiler.EmitOp(eeOpcode::CLE_F);
//Console.WriteLn("recC_LE_xmm()");
switch (info & (PROCESS_EE_S | PROCESS_EE_T))
{
case PROCESS_EE_S:
{
const int regs = fpuCopyToTempForClamp(_Fs_, EEREC_S);
fpuFloat3(regs);
const int t0reg = _allocTempXMMreg(XMMT_FPS);
xMOVSSZX(xRegisterSSE(t0reg), ptr[&fpuRegs.fpr[_Ft_]]);
fpuFloat3(t0reg);
xUCOMI.SS(xRegisterSSE(regs), xRegisterSSE(t0reg));
_freeXMMreg(t0reg);
fpuFreeIfTemp(regs);
}
break;
case PROCESS_EE_T:
{
const int regt = fpuCopyToTempForClamp(_Ft_, EEREC_T);
fpuFloat3(regt);
const int t0reg = _allocTempXMMreg(XMMT_FPS);
xMOVSSZX(xRegisterSSE(t0reg), ptr[&fpuRegs.fpr[_Fs_]]);
fpuFloat3(t0reg);
xUCOMI.SS(xRegisterSSE(t0reg), xRegisterSSE(regt));
_freeXMMreg(t0reg);
fpuFreeIfTemp(regt);
}
break;
case (PROCESS_EE_S | PROCESS_EE_T):
{
const int regs = fpuCopyToTempForClamp(_Fs_, EEREC_S);
fpuFloat3(regs);
const int regt = fpuCopyToTempForClamp(_Ft_, EEREC_T);
fpuFloat3(regt);
xUCOMI.SS(xRegisterSSE(regs), xRegisterSSE(regt));
fpuFreeIfTemp(regs);
fpuFreeIfTemp(regt);
}
break;
default: // Untested and incorrect, but this case is never reached AFAIK (cottonvibes)
Console.WriteLn(Color_Magenta, "recC_LE_xmm: Default");
xMOV(eax, ptr[&fpuRegs.fpr[_Fs_]]);
xCMP(eax, ptr[&fpuRegs.fpr[_Ft_]]);
j8Ptr[0] = JLE8(0);
xAND(ptr32[&fpuRegs.fprc[31]], ~FPUflagC);
j8Ptr[1] = JMP8(0);
x86SetJ8(j8Ptr[0]);
xOR(ptr32[&fpuRegs.fprc[31]], FPUflagC);
x86SetJ8(j8Ptr[1]);
return;
}
j8Ptr[0] = JBE8(0);
xAND(ptr32[&fpuRegs.fprc[31]], ~FPUflagC);
j8Ptr[1] = JMP8(0);
x86SetJ8(j8Ptr[0]);
xOR(ptr32[&fpuRegs.fprc[31]], FPUflagC);
x86SetJ8(j8Ptr[1]);
}
FPURECOMPILE_CONSTCODE(C_LE, XMMINFO_READS | XMMINFO_READT);
//REC_FPUFUNC(C_LE);
void recC_LT_xmm(int info)
{
EE::Profiler.EmitOp(eeOpcode::CLT_F);
//Console.WriteLn("recC_LT_xmm()");
switch (info & (PROCESS_EE_S | PROCESS_EE_T))
{
case PROCESS_EE_S:
{
const int regs = fpuCopyToTempForClamp(_Fs_, EEREC_S);
fpuFloat3(regs);
const int t0reg = _allocTempXMMreg(XMMT_FPS);
xMOVSSZX(xRegisterSSE(t0reg), ptr[&fpuRegs.fpr[_Ft_]]);
fpuFloat3(t0reg);
xUCOMI.SS(xRegisterSSE(regs), xRegisterSSE(t0reg));
_freeXMMreg(t0reg);
fpuFreeIfTemp(regs);
}
break;
case PROCESS_EE_T:
{
const int regt = fpuCopyToTempForClamp(_Ft_, EEREC_T);
fpuFloat3(regt);
const int t0reg = _allocTempXMMreg(XMMT_FPS);
xMOVSSZX(xRegisterSSE(t0reg), ptr[&fpuRegs.fpr[_Fs_]]);
fpuFloat3(t0reg);
xUCOMI.SS(xRegisterSSE(t0reg), xRegisterSSE(regt));
_freeXMMreg(t0reg);
fpuFreeIfTemp(regt);
}
break;
case (PROCESS_EE_S | PROCESS_EE_T):
{
const int regs = fpuCopyToTempForClamp(_Fs_, EEREC_S);
fpuFloat3(regs);
const int regt = fpuCopyToTempForClamp(_Ft_, EEREC_T);
fpuFloat3(regt);
xUCOMI.SS(xRegisterSSE(regs), xRegisterSSE(regt));
fpuFreeIfTemp(regs);
fpuFreeIfTemp(regt);
}
break;
default:
Console.WriteLn(Color_Magenta, "recC_LT_xmm: Default");
xMOV(eax, ptr[&fpuRegs.fpr[_Fs_]]);
xCMP(eax, ptr[&fpuRegs.fpr[_Ft_]]);
j8Ptr[0] = JL8(0);
xAND(ptr32[&fpuRegs.fprc[31]], ~FPUflagC);
j8Ptr[1] = JMP8(0);
x86SetJ8(j8Ptr[0]);
xOR(ptr32[&fpuRegs.fprc[31]], FPUflagC);
x86SetJ8(j8Ptr[1]);
return;
}
j8Ptr[0] = JB8(0);
xAND(ptr32[&fpuRegs.fprc[31]], ~FPUflagC);
j8Ptr[1] = JMP8(0);
x86SetJ8(j8Ptr[0]);
xOR(ptr32[&fpuRegs.fprc[31]], FPUflagC);
x86SetJ8(j8Ptr[1]);
}
FPURECOMPILE_CONSTCODE(C_LT, XMMINFO_READS | XMMINFO_READT);
//REC_FPUFUNC(C_LT);
//------------------------------------------------------------------
//------------------------------------------------------------------
// CVT.x XMM
//------------------------------------------------------------------
void recCVT_S_xmm(int info)
{
EE::Profiler.EmitOp(eeOpcode::CVTS_F);
if (info & PROCESS_EE_D)
{
if (info & PROCESS_EE_S)
xCVTDQ2PS(xRegisterSSE(EEREC_D), xRegisterSSE(EEREC_S));
else
xCVTSI2SS(xRegisterSSE(EEREC_D), ptr32[&fpuRegs.fpr[_Fs_]]);
}
else
{
const int temp = _allocTempXMMreg(XMMT_FPS);
xCVTSI2SS(xRegisterSSE(temp), ptr32[&fpuRegs.fpr[_Fs_]]);
xMOVSS(ptr32[&fpuRegs.fpr[_Fd_]], xRegisterSSE(temp));
_freeXMMreg(temp);
}
}
FPURECOMPILE_CONSTCODE(CVT_S, XMMINFO_WRITED | XMMINFO_READS);
void recCVT_W()
{
if (CHECK_FPU_FULL)
{
DOUBLE::recCVT_W();
return;
}
// If we have the following EmitOP() on the top then it'll get calculated twice when CHECK_FPU_FULL is true
// as we also have an EmitOP() at recCVT_W() on iFPUd.cpp. hence we have it below the possible return.
EE::Profiler.EmitOp(eeOpcode::CVTW);
int regs = _checkXMMreg(XMMTYPE_FPREG, _Fs_, MODE_READ);
if (regs >= 0)
{
if (CHECK_FPU_EXTRA_OVERFLOW)
fpuFloat2(regs);
xCVTTSS2SI(eax, xRegisterSSE(regs));
xMOVMSKPS(edx, xRegisterSSE(regs)); //extract the signs
xAND(edx, 1); // keep only LSB
}
else
{
xCVTTSS2SI(eax, ptr32[&fpuRegs.fpr[_Fs_]]);
xMOV(edx, ptr[&fpuRegs.fpr[_Fs_]]);
xSHR(edx, 31); // mov sign to lsb
}
//kill register allocation for dst because we write directly to fpuRegs.fpr[_Fd_]
_deleteFPtoXMMreg(_Fd_, DELETE_REG_FREE_NO_WRITEBACK);
xADD(edx, 0x7FFFFFFF); // 0x7FFFFFFF if positive, 0x8000 0000 if negative
xCMP(eax, 0x80000000); // If the result is indefinitive
xCMOVE(eax, edx); // Saturate it
//Write the result
xMOV(ptr[&fpuRegs.fpr[_Fd_]], eax);
}
//------------------------------------------------------------------
//------------------------------------------------------------------
// DIV XMM
//------------------------------------------------------------------
void recDIVhelper1(int regd, int regt) // Sets flags
{
u8 *pjmp1, *pjmp2;
u32 *ajmp32, *bjmp32;
const int t1reg = _allocTempXMMreg(XMMT_FPS);
xAND(ptr32[&fpuRegs.fprc[31]], ~(FPUflagI | FPUflagD)); // Clear I and D flags
/*--- Check for divide by zero ---*/
xXOR.PS(xRegisterSSE(t1reg), xRegisterSSE(t1reg));
xCMPEQ.SS(xRegisterSSE(t1reg), xRegisterSSE(regt));
xMOVMSKPS(eax, xRegisterSSE(t1reg));
xAND(eax, 1); //Check sign (if regt == zero, sign will be set)
ajmp32 = JZ32(0); //Skip if not set
/*--- Check for 0/0 ---*/
xXOR.PS(xRegisterSSE(t1reg), xRegisterSSE(t1reg));
xCMPEQ.SS(xRegisterSSE(t1reg), xRegisterSSE(regd));
xMOVMSKPS(eax, xRegisterSSE(t1reg));
xAND(eax, 1); //Check sign (if regd == zero, sign will be set)
pjmp1 = JZ8(0); //Skip if not set
xOR(ptr32[&fpuRegs.fprc[31]], FPUflagI | FPUflagSI); // Set I and SI flags ( 0/0 )
pjmp2 = JMP8(0);
x86SetJ8(pjmp1); //x/0 but not 0/0
xOR(ptr32[&fpuRegs.fprc[31]], FPUflagD | FPUflagSD); // Set D and SD flags ( x/0 )
x86SetJ8(pjmp2);
/*--- Make regd +/- Maximum ---*/
xXOR.PS(xRegisterSSE(regd), xRegisterSSE(regt)); // Make regd Positive or Negative
xAND.PS(xRegisterSSE(regd), ptr[&s_neg[0]]); // Get the sign bit
xOR.PS(xRegisterSSE(regd), ptr[&g_maxvals[0]]); // regd = +/- Maximum
//xMOVSSZX(xRegisterSSE(regd), ptr[&g_maxvals[0]]);
bjmp32 = JMP32(0);
x86SetJ32(ajmp32);
/*--- Normal Divide ---*/
if (CHECK_FPU_EXTRA_OVERFLOW) { fpuFloat2(regd); fpuFloat2(regt); }
xDIV.SS(xRegisterSSE(regd), xRegisterSSE(regt));
ClampValues(regd);
x86SetJ32(bjmp32);
_freeXMMreg(t1reg);
}
// Emits regd = regd / regt without touching the FCR31 flags: optional operand
// clamp (CHECK_FPU_EXTRA_OVERFLOW), scalar divide, then the result clamp.
void recDIVhelper2(int regd, int regt) // Doesn't set flags
{
    if (CHECK_FPU_EXTRA_OVERFLOW)
    {
        fpuFloat2(regd);
        fpuFloat2(regt);
    }

    xDIV.SS(xRegisterSSE(regd), xRegisterSSE(regt));
    ClampValues(regd);
}
// NOTE(review): not referenced anywhere in the portion of the file visible
// here (recDIV_S_xmm below loads EmuConfig.Cpu.FPUDivFPCR instead) - confirm
// whether this is still used before removing it.
alignas(16) static FPControlRegister roundmode_nearest;
// DIV.S: fd = fs / ft.
//
// If the divide control register (FPUDivFPCR) differs from the general FPU one
// (FPUFPCR), an LDMXCSR is emitted before the divide to switch to it and
// another afterwards to switch back. The operands are moved so that the
// dividend ends up in the destination register. The divisor is either ft's own
// register or, when that register is also the destination or ft is only in
// memory, a temporary. The divide itself is emitted by recDIVhelper1 (with
// flags) or recDIVhelper2 (without), selected by CHECK_FPU_EXTRA_FLAGS.
void recDIV_S_xmm(int info)
{
    EE::Profiler.EmitOp(eeOpcode::DIV_F);

    const int scratch = _allocTempXMMreg(XMMT_FPS);
    const int dReg = EEREC_D;
    const int sReg = EEREC_S;
    const int tReg = EEREC_T;
    //Console.WriteLn("DIV");

    const bool needsDivMxcsr = (EmuConfig.Cpu.FPUFPCR.bitmask != EmuConfig.Cpu.FPUDivFPCR.bitmask);
    if (needsDivMxcsr)
        xLDMXCSR(ptr32[&EmuConfig.Cpu.FPUDivFPCR.bitmask]);

    const auto emitDivide = [](int dividend, int divisor) {
        if (CHECK_FPU_EXTRA_FLAGS)
            recDIVhelper1(dividend, divisor);
        else
            recDIVhelper2(dividend, divisor);
    };
    const auto loadFs = [](int xmm) { xMOVSSZX(xRegisterSSE(xmm), ptr[&fpuRegs.fpr[_Fs_]]); };
    const auto loadFt = [](int xmm) { xMOVSSZX(xRegisterSSE(xmm), ptr[&fpuRegs.fpr[_Ft_]]); };
    const auto copyReg = [](int to, int from) { xMOVSS(xRegisterSSE(to), xRegisterSSE(from)); };

    switch (info & (PROCESS_EE_S | PROCESS_EE_T))
    {
        case PROCESS_EE_S:
            //Console.WriteLn("FPU: DIV case 1");
            copyReg(dReg, sReg);
            loadFt(scratch);
            emitDivide(dReg, scratch);
            break;

        case PROCESS_EE_T:
            //Console.WriteLn("FPU: DIV case 2");
            if (dReg == tReg)
            {
                // The destination aliases the divisor: save the divisor first.
                copyReg(scratch, tReg);
                loadFs(dReg);
                emitDivide(dReg, scratch);
            }
            else
            {
                loadFs(dReg);
                emitDivide(dReg, tReg);
            }
            break;

        case (PROCESS_EE_S | PROCESS_EE_T):
            //Console.WriteLn("FPU: DIV case 3");
            if (dReg == tReg)
            {
                // The destination aliases the divisor: save the divisor first.
                copyReg(scratch, tReg);
                copyReg(dReg, sReg);
                emitDivide(dReg, scratch);
            }
            else
            {
                copyReg(dReg, sReg);
                emitDivide(dReg, tReg);
            }
            break;

        default:
            //Console.WriteLn("FPU: DIV case 4");
            loadFt(scratch);
            loadFs(dReg);
            emitDivide(dReg, scratch);
            break;
    }

    if (needsDivMxcsr)
        xLDMXCSR(ptr32[&EmuConfig.Cpu.FPUFPCR.bitmask]);

    _freeXMMreg(scratch);
}
FPURECOMPILE_CONSTCODE(DIV_S, XMMINFO_WRITED | XMMINFO_READS | XMMINFO_READT);
//------------------------------------------------------------------
//------------------------------------------------------------------
// MADD XMM
//------------------------------------------------------------------
// Emits the shared body of MADD.S / MADDA.S: regd = ACC + (Fs * Ft).
//
// info - PROCESS_EE_* bits telling which of Fs (EEREC_S), Ft (EEREC_T) and ACC
//        (EEREC_ACC) are already held in XMM registers. An operand whose bit is
//        clear is loaded from fpuRegs instead, and its EEREC_* value must not
//        be used as a register index.
// regd - XMM register that receives the result (callers in this file pass
//        EEREC_D or EEREC_ACC).
//
// When CHECK_FPU_EXTRA_OVERFLOW is set, both multiply operands are clamped
// with fpuFloat2() and both add operands with fpuFloat(). The final value is
// always passed through ClampValues().
void recMADDtemp(int info, int regd)
{
	const int t0reg = _allocTempXMMreg(XMMT_FPS);

	switch (info & (PROCESS_EE_S | PROCESS_EE_T))
	{
		case PROCESS_EE_S: // Fs is register-cached, Ft is read from memory
			if (regd == EEREC_S)
			{
				xMOVSSZX(xRegisterSSE(t0reg), ptr[&fpuRegs.fpr[_Ft_]]);
				if (CHECK_FPU_EXTRA_OVERFLOW) { fpuFloat2(regd); fpuFloat2(t0reg); }
				xMUL.SS(xRegisterSSE(regd), xRegisterSSE(t0reg));
				if (info & PROCESS_EE_ACC)
				{
					if (CHECK_FPU_EXTRA_OVERFLOW) { fpuFloat(EEREC_ACC); fpuFloat(regd); }
					FPU_ADD(regd, EEREC_ACC);
				}
				else
				{
					xMOVSSZX(xRegisterSSE(t0reg), ptr[&fpuRegs.ACC]);
					// ACC is not register-cached on this path, so EEREC_ACC is not a valid
					// register here; clamp the two registers that are actually added.
					if (CHECK_FPU_EXTRA_OVERFLOW) { fpuFloat(regd); fpuFloat(t0reg); }
					FPU_ADD(regd, t0reg);
				}
			}
			else if ((info & PROCESS_EE_ACC) && regd == EEREC_ACC)
			{
				// Destination is ACC itself: build the product in t0reg and add it in place.
				xMOVSSZX(xRegisterSSE(t0reg), ptr[&fpuRegs.fpr[_Ft_]]);
				if (CHECK_FPU_EXTRA_OVERFLOW) { fpuFloat2(EEREC_S); fpuFloat2(t0reg); }
				xMUL.SS(xRegisterSSE(t0reg), xRegisterSSE(EEREC_S));
				if (CHECK_FPU_EXTRA_OVERFLOW) { fpuFloat(regd); fpuFloat(t0reg); }
				FPU_ADD(regd, t0reg);
			}
			else
			{
				xMOVSSZX(xRegisterSSE(regd), ptr[&fpuRegs.fpr[_Ft_]]);
				if (CHECK_FPU_EXTRA_OVERFLOW) { fpuFloat2(regd); fpuFloat2(EEREC_S); }
				xMUL.SS(xRegisterSSE(regd), xRegisterSSE(EEREC_S));
				if (info & PROCESS_EE_ACC)
				{
					if (CHECK_FPU_EXTRA_OVERFLOW) { fpuFloat(EEREC_ACC); fpuFloat(regd); }
					FPU_ADD(regd, EEREC_ACC);
				}
				else
				{
					xMOVSSZX(xRegisterSSE(t0reg), ptr[&fpuRegs.ACC]);
					// ACC lives in t0reg here (not in EEREC_ACC); clamp the real add operands.
					if (CHECK_FPU_EXTRA_OVERFLOW) { fpuFloat(regd); fpuFloat(t0reg); }
					FPU_ADD(regd, t0reg);
				}
			}
			break;
		case PROCESS_EE_T: // Ft is register-cached, Fs is read from memory
			if (regd == EEREC_T)
			{
				xMOVSSZX(xRegisterSSE(t0reg), ptr[&fpuRegs.fpr[_Fs_]]);
				if (CHECK_FPU_EXTRA_OVERFLOW) { fpuFloat2(regd); fpuFloat2(t0reg); }
				xMUL.SS(xRegisterSSE(regd), xRegisterSSE(t0reg));
				if (info & PROCESS_EE_ACC)
				{
					if (CHECK_FPU_EXTRA_OVERFLOW) { fpuFloat(EEREC_ACC); fpuFloat(regd); }
					FPU_ADD(regd, EEREC_ACC);
				}
				else
				{
					xMOVSSZX(xRegisterSSE(t0reg), ptr[&fpuRegs.ACC]);
					// ACC lives in t0reg here (not in EEREC_ACC); clamp the real add operands.
					if (CHECK_FPU_EXTRA_OVERFLOW) { fpuFloat(regd); fpuFloat(t0reg); }
					FPU_ADD(regd, t0reg);
				}
			}
			else if ((info & PROCESS_EE_ACC) && regd == EEREC_ACC)
			{
				// Destination is ACC itself: build the product in t0reg and add it in place.
				xMOVSSZX(xRegisterSSE(t0reg), ptr[&fpuRegs.fpr[_Fs_]]);
				if (CHECK_FPU_EXTRA_OVERFLOW) { fpuFloat2(EEREC_T); fpuFloat2(t0reg); }
				xMUL.SS(xRegisterSSE(t0reg), xRegisterSSE(EEREC_T));
				if (CHECK_FPU_EXTRA_OVERFLOW) { fpuFloat(regd); fpuFloat(t0reg); }
				FPU_ADD(regd, t0reg);
			}
			else
			{
				xMOVSSZX(xRegisterSSE(regd), ptr[&fpuRegs.fpr[_Fs_]]);
				if (CHECK_FPU_EXTRA_OVERFLOW) { fpuFloat2(regd); fpuFloat2(EEREC_T); }
				xMUL.SS(xRegisterSSE(regd), xRegisterSSE(EEREC_T));
				if (info & PROCESS_EE_ACC)
				{
					if (CHECK_FPU_EXTRA_OVERFLOW) { fpuFloat(EEREC_ACC); fpuFloat(regd); }
					FPU_ADD(regd, EEREC_ACC);
				}
				else
				{
					xMOVSSZX(xRegisterSSE(t0reg), ptr[&fpuRegs.ACC]);
					// ACC lives in t0reg here (not in EEREC_ACC); clamp the real add operands.
					if (CHECK_FPU_EXTRA_OVERFLOW) { fpuFloat(regd); fpuFloat(t0reg); }
					FPU_ADD(regd, t0reg);
				}
			}
			break;
		case (PROCESS_EE_S | PROCESS_EE_T): // both Fs and Ft are register-cached
			if (regd == EEREC_S)
			{
				if (CHECK_FPU_EXTRA_OVERFLOW) { fpuFloat2(regd); fpuFloat2(EEREC_T); }
				xMUL.SS(xRegisterSSE(regd), xRegisterSSE(EEREC_T));
				if (info & PROCESS_EE_ACC)
				{
					if (CHECK_FPU_EXTRA_OVERFLOW) { fpuFloat(regd); fpuFloat(EEREC_ACC); }
					FPU_ADD(regd, EEREC_ACC);
				}
				else
				{
					xMOVSSZX(xRegisterSSE(t0reg), ptr[&fpuRegs.ACC]);
					if (CHECK_FPU_EXTRA_OVERFLOW) { fpuFloat(regd); fpuFloat(t0reg); }
					FPU_ADD(regd, t0reg);
				}
			}
			else if (regd == EEREC_T)
			{
				if (CHECK_FPU_EXTRA_OVERFLOW) { fpuFloat2(regd); fpuFloat2(EEREC_S); }
				xMUL.SS(xRegisterSSE(regd), xRegisterSSE(EEREC_S));
				if (info & PROCESS_EE_ACC)
				{
					if (CHECK_FPU_EXTRA_OVERFLOW) { fpuFloat(regd); fpuFloat(EEREC_ACC); }
					FPU_ADD(regd, EEREC_ACC);
				}
				else
				{
					xMOVSSZX(xRegisterSSE(t0reg), ptr[&fpuRegs.ACC]);
					if (CHECK_FPU_EXTRA_OVERFLOW) { fpuFloat(regd); fpuFloat(t0reg); }
					FPU_ADD(regd, t0reg);
				}
			}
			else if ((info & PROCESS_EE_ACC) && regd == EEREC_ACC)
			{
				// Destination is ACC itself: build the product in t0reg and add it in place.
				xMOVSS(xRegisterSSE(t0reg), xRegisterSSE(EEREC_S));
				if (CHECK_FPU_EXTRA_OVERFLOW) { fpuFloat2(t0reg); fpuFloat2(EEREC_T); }
				xMUL.SS(xRegisterSSE(t0reg), xRegisterSSE(EEREC_T));
				if (CHECK_FPU_EXTRA_OVERFLOW) { fpuFloat(regd); fpuFloat(t0reg); }
				FPU_ADD(regd, t0reg);
			}
			else
			{
				xMOVSS(xRegisterSSE(regd), xRegisterSSE(EEREC_S));
				if (CHECK_FPU_EXTRA_OVERFLOW) { fpuFloat2(regd); fpuFloat2(EEREC_T); }
				xMUL.SS(xRegisterSSE(regd), xRegisterSSE(EEREC_T));
				if (info & PROCESS_EE_ACC)
				{
					if (CHECK_FPU_EXTRA_OVERFLOW) { fpuFloat(regd); fpuFloat(EEREC_ACC); }
					FPU_ADD(regd, EEREC_ACC);
				}
				else
				{
					xMOVSSZX(xRegisterSSE(t0reg), ptr[&fpuRegs.ACC]);
					if (CHECK_FPU_EXTRA_OVERFLOW) { fpuFloat(regd); fpuFloat(t0reg); }
					FPU_ADD(regd, t0reg);
				}
			}
			break;
		default: // neither Fs nor Ft is register-cached
			if ((info & PROCESS_EE_ACC) && regd == EEREC_ACC)
			{
				// regd already holds ACC, so a second temporary is needed for Ft.
				const int t1reg = _allocTempXMMreg(XMMT_FPS);
				xMOVSSZX(xRegisterSSE(t0reg), ptr[&fpuRegs.fpr[_Fs_]]);
				xMOVSSZX(xRegisterSSE(t1reg), ptr[&fpuRegs.fpr[_Ft_]]);
				if (CHECK_FPU_EXTRA_OVERFLOW) { fpuFloat2(t0reg); fpuFloat2(t1reg); }
				xMUL.SS(xRegisterSSE(t0reg), xRegisterSSE(t1reg));
				if (CHECK_FPU_EXTRA_OVERFLOW) { fpuFloat(regd); fpuFloat(t0reg); }
				FPU_ADD(regd, t0reg);
				_freeXMMreg(t1reg);
			}
			else
			{
				xMOVSSZX(xRegisterSSE(regd), ptr[&fpuRegs.fpr[_Fs_]]);
				xMOVSSZX(xRegisterSSE(t0reg), ptr[&fpuRegs.fpr[_Ft_]]);
				if (CHECK_FPU_EXTRA_OVERFLOW) { fpuFloat2(regd); fpuFloat2(t0reg); }
				xMUL.SS(xRegisterSSE(regd), xRegisterSSE(t0reg));
				if (info & PROCESS_EE_ACC)
				{
					if (CHECK_FPU_EXTRA_OVERFLOW) { fpuFloat(regd); fpuFloat(EEREC_ACC); }
					FPU_ADD(regd, EEREC_ACC);
				}
				else
				{
					xMOVSSZX(xRegisterSSE(t0reg), ptr[&fpuRegs.ACC]);
					if (CHECK_FPU_EXTRA_OVERFLOW) { fpuFloat(regd); fpuFloat(t0reg); }
					FPU_ADD(regd, t0reg);
				}
			}
			break;
	}

	ClampValues(regd);
	_freeXMMreg(t0reg);
}
// MADD.S: records the op with the profiler, then emits the multiply-add via
// recMADDtemp() with EEREC_D as the destination register.
void recMADD_S_xmm(int info)
{
	EE::Profiler.EmitOp(eeOpcode::MADD_F);

	// Clearing of the O and U flags is currently disabled:
	//xAND(ptr32[&fpuRegs.fprc[31]], ~(FPUflagO|FPUflagU)); // Clear O and U flags

	const int destReg = EEREC_D;
	recMADDtemp(info, destReg);
}

FPURECOMPILE_CONSTCODE(MADD_S, XMMINFO_WRITED | XMMINFO_READACC | XMMINFO_READS | XMMINFO_READT);
// MADDA.S: records the op with the profiler, then emits the multiply-add via
// recMADDtemp() with EEREC_ACC as the destination register.
void recMADDA_S_xmm(int info)
{
	EE::Profiler.EmitOp(eeOpcode::MADDA_F);

	// Clearing of the O and U flags is currently disabled:
	//xAND(ptr32[&fpuRegs.fprc[31]], ~(FPUflagO|FPUflagU)); // Clear O and U flags

	const int accReg = EEREC_ACC;
	recMADDtemp(info, accReg);
}

FPURECOMPILE_CONSTCODE(MADDA_S, XMMINFO_WRITEACC | XMMINFO_READACC | XMMINFO_READS | XMMINFO_READT);
//------------------------------------------------------------------
//------------------------------------------------------------------
// MAX / MIN XMM
//------------------------------------------------------------------
// MAX.S: records the op with the profiler and emits it through
// recCommutativeOp() using op selector 2, writing to EEREC_D.
// The value returned by recCommutativeOp() is not used here.
void recMAX_S_xmm(int info)
{
	EE::Profiler.EmitOp(eeOpcode::MAX_F);

	// Clearing of the O and U flags is currently disabled:
	//xAND(ptr32[&fpuRegs.fprc[31]], ~(FPUflagO|FPUflagU)); // Clear O and U flags

	const int destReg = EEREC_D;
	recCommutativeOp(info, destReg, 2);
}

FPURECOMPILE_CONSTCODE(MAX_S, XMMINFO_WRITED | XMMINFO_READS | XMMINFO_READT);
// MIN.S: records the op with the profiler and emits it through
// recCommutativeOp() using op selector 3, writing to EEREC_D.
// The value returned by recCommutativeOp() is not used here.
void recMIN_S_xmm(int info)
{
	EE::Profiler.EmitOp(eeOpcode::MIN_F);

	// Clearing of the O and U flags is currently disabled:
	//xAND(ptr32[&fpuRegs.fprc[31]], ~(FPUflagO|FPUflagU)); // Clear O and U flags

	const int destReg = EEREC_D;
	recCommutativeOp(info, destReg, 3);
}

FPURECOMPILE_CONSTCODE(MIN_S, XMMINFO_WRITED | XMMINFO_READS | XMMINFO_READT);
//------------------------------------------------------------------
//------------------------------------------------------------------
// MOV XMM
//------------------------------------------------------------------
void recMOV_S_xmm(int info)
{
EE::Profiler.EmitOp(eeOpcode::MOV_F);
if (info & PROCESS_EE_S)
xMOVSS(xRegisterSSE(EEREC_D), xRegisterSSE(EEREC_S));
else
xMOVSSZX(xRegisterSSE(EEREC_D), ptr[&fpuRegs.fpr[_Fs_]]);
}
FPURECOMPILE_CONSTCODE(MOV_S, XMMINFO_WRITED | XMMINFO_READS);
//------------------------------------------------------------------
//------------------------------------------------------------------
// MSUB XMM
//------------------------------------------------------------------
// Emits the shared body of MSUB.S / MSUBA.S: regd = ACC - (Fs * Ft).
//
// info - PROCESS_EE_* bits telling which of Fs (EEREC_S), Ft (EEREC_T) and ACC
//        (EEREC_ACC) are already held in XMM registers; the others are loaded
//        from fpuRegs.
// regd - XMM register that receives the result (callers in this file pass
//        EEREC_D or EEREC_ACC).
//
// The result is always passed through ClampValues().
void recMSUBtemp(int info, int regd)
{
	int t0reg = _allocTempXMMreg(XMMT_FPS);

	const bool accInXmm = (info & PROCESS_EE_ACC) != 0;
	const bool dstIsAcc = accInXmm && regd == EEREC_ACC;

	// dst *= src. With extra overflow checking both operands are clamped first;
	// clampSrcFirst selects which of the two gets clamped first.
	const auto mulClamped = [&](int dst, int src, bool clampSrcFirst = false) {
		if (CHECK_FPU_EXTRA_OVERFLOW)
		{
			fpuFloat2(clampSrcFirst ? src : dst);
			fpuFloat2(clampSrcFirst ? dst : src);
		}
		xMUL.SS(xRegisterSSE(dst), xRegisterSSE(src));
	};

	// regd holds the product: fetch ACC into t0reg, subtract the product from
	// it, then move the difference back into regd.
	const auto accMinusProduct = [&]() {
		if (accInXmm)
			xMOVSS(xRegisterSSE(t0reg), xRegisterSSE(EEREC_ACC));
		else
			xMOVSSZX(xRegisterSSE(t0reg), ptr[&fpuRegs.ACC]);
		if (CHECK_FPU_EXTRA_OVERFLOW)
		{
			fpuFloat(regd);
			fpuFloat(t0reg);
		}
		FPU_SUB(t0reg, regd);
		xMOVSS(xRegisterSSE(regd), xRegisterSSE(t0reg));
	};

	// regd is ACC itself and t0reg holds the product: subtract in place.
	const auto subProductInPlace = [&]() {
		if (CHECK_FPU_EXTRA_OVERFLOW)
		{
			fpuFloat(regd);
			fpuFloat(t0reg);
		}
		FPU_SUB(regd, t0reg);
	};

	switch (info & (PROCESS_EE_S | PROCESS_EE_T))
	{
		case PROCESS_EE_S: // Fs is register-cached, Ft is read from memory
			if (regd == EEREC_S)
			{
				xMOVSSZX(xRegisterSSE(t0reg), ptr[&fpuRegs.fpr[_Ft_]]);
				mulClamped(regd, t0reg);
				accMinusProduct();
			}
			else if (dstIsAcc)
			{
				xMOVSSZX(xRegisterSSE(t0reg), ptr[&fpuRegs.fpr[_Ft_]]);
				mulClamped(t0reg, EEREC_S, true);
				subProductInPlace();
			}
			else
			{
				xMOVSSZX(xRegisterSSE(regd), ptr[&fpuRegs.fpr[_Ft_]]);
				mulClamped(regd, EEREC_S);
				accMinusProduct();
			}
			break;

		case PROCESS_EE_T: // Ft is register-cached, Fs is read from memory
			if (regd == EEREC_T)
			{
				xMOVSSZX(xRegisterSSE(t0reg), ptr[&fpuRegs.fpr[_Fs_]]);
				mulClamped(regd, t0reg);
				accMinusProduct();
			}
			else if (dstIsAcc)
			{
				xMOVSSZX(xRegisterSSE(t0reg), ptr[&fpuRegs.fpr[_Fs_]]);
				mulClamped(t0reg, EEREC_T, true);
				subProductInPlace();
			}
			else
			{
				xMOVSSZX(xRegisterSSE(regd), ptr[&fpuRegs.fpr[_Fs_]]);
				mulClamped(regd, EEREC_T);
				accMinusProduct();
			}
			break;

		case (PROCESS_EE_S | PROCESS_EE_T): // both Fs and Ft are register-cached
			if (regd == EEREC_S)
			{
				mulClamped(regd, EEREC_T);
				accMinusProduct();
			}
			else if (regd == EEREC_T)
			{
				mulClamped(regd, EEREC_S);
				accMinusProduct();
			}
			else if (dstIsAcc)
			{
				xMOVSS(xRegisterSSE(t0reg), xRegisterSSE(EEREC_S));
				mulClamped(t0reg, EEREC_T);
				subProductInPlace();
			}
			else
			{
				xMOVSS(xRegisterSSE(regd), xRegisterSSE(EEREC_S));
				mulClamped(regd, EEREC_T);
				accMinusProduct();
			}
			break;

		default: // neither Fs nor Ft is register-cached
			if (dstIsAcc)
			{
				// regd already holds ACC, so a second temporary is needed for Ft.
				const int t1reg = _allocTempXMMreg(XMMT_FPS);
				xMOVSSZX(xRegisterSSE(t0reg), ptr[&fpuRegs.fpr[_Fs_]]);
				xMOVSSZX(xRegisterSSE(t1reg), ptr[&fpuRegs.fpr[_Ft_]]);
				mulClamped(t0reg, t1reg);
				subProductInPlace();
				_freeXMMreg(t1reg);
			}
			else
			{
				xMOVSSZX(xRegisterSSE(regd), ptr[&fpuRegs.fpr[_Fs_]]);
				xMOVSSZX(xRegisterSSE(t0reg), ptr[&fpuRegs.fpr[_Ft_]]);
				mulClamped(regd, t0reg);
				accMinusProduct();
			}
			break;
	}

	ClampValues(regd);
	_freeXMMreg(t0reg);
}
// MSUB.S: records the op with the profiler, then emits the multiply-subtract
// via recMSUBtemp() with EEREC_D as the destination register.
void recMSUB_S_xmm(int info)
{
	EE::Profiler.EmitOp(eeOpcode::MSUB_F);

	// Clearing of the O and U flags is currently disabled:
	//xAND(ptr32[&fpuRegs.fprc[31]], ~(FPUflagO|FPUflagU)); // Clear O and U flags

	const int destReg = EEREC_D;
	recMSUBtemp(info, destReg);
}

FPURECOMPILE_CONSTCODE(MSUB_S, XMMINFO_WRITED | XMMINFO_READACC | XMMINFO_READS | XMMINFO_READT);
// MSUBA.S: records the op with the profiler, then emits the multiply-subtract
// via recMSUBtemp() with EEREC_ACC as the destination register.
void recMSUBA_S_xmm(int info)
{
	EE::Profiler.EmitOp(eeOpcode::MSUBA_F);

	// Clearing of the O and U flags is currently disabled:
	//xAND(ptr32[&fpuRegs.fprc[31]], ~(FPUflagO|FPUflagU)); // Clear O and U flags

	const int accReg = EEREC_ACC;
	recMSUBtemp(info, accReg);
}

FPURECOMPILE_CONSTCODE(MSUBA_S, XMMINFO_WRITEACC | XMMINFO_READACC | XMMINFO_READS | XMMINFO_READT);
//------------------------------------------------------------------
//------------------------------------------------------------------
// MUL XMM
//------------------------------------------------------------------
// MUL.S: emits the multiply through recCommutativeOp() (op selector 1) with
// EEREC_D as destination, then clamps the register it returns.
void recMUL_S_xmm(int info)
{
	EE::Profiler.EmitOp(eeOpcode::MUL_F);

	// Clearing of the O and U flags is currently disabled:
	//xAND(ptr32[&fpuRegs.fprc[31]], ~(FPUflagO|FPUflagU)); // Clear O and U flags

	const auto productReg = recCommutativeOp(info, EEREC_D, 1);
	ClampValues(productReg);
}

FPURECOMPILE_CONSTCODE(MUL_S, XMMINFO_WRITED | XMMINFO_READS | XMMINFO_READT);
// MULA.S: emits the multiply through recCommutativeOp() (op selector 1) with
// EEREC_ACC as destination, then clamps the register it returns.
void recMULA_S_xmm(int info)
{
	EE::Profiler.EmitOp(eeOpcode::MULA_F);

	// Clearing of the O and U flags is currently disabled:
	//xAND(ptr32[&fpuRegs.fprc[31]], ~(FPUflagO|FPUflagU)); // Clear O and U flags

	const auto productReg = recCommutativeOp(info, EEREC_ACC, 1);
	ClampValues(productReg);
}

FPURECOMPILE_CONSTCODE(MULA_S, XMMINFO_WRITEACC | XMMINFO_READS | XMMINFO_READT);
//------------------------------------------------------------------
//------------------------------------------------------------------
// NEG XMM
//------------------------------------------------------------------
void recNEG_S_xmm(int info)
{
EE::Profiler.EmitOp(eeOpcode::NEG_F);
if (info & PROCESS_EE_S)
xMOVSS(xRegisterSSE(EEREC_D), xRegisterSSE(EEREC_S));
else
xMOVSSZX(xRegisterSSE(EEREC_D), ptr[&fpuRegs.fpr[_Fs_]]);
//xAND(ptr32[&fpuRegs.fprc[31]], ~(FPUflagO|FPUflagU)); // Clear O and U flags
xXOR.PS(xRegisterSSE(EEREC_D), ptr[&s_neg[0]]);
// Always preserve sign. Using float clamping here would result in
// +inf to become +fMax instead of -fMax, which is definitely wrong.
fpuFloat3(EEREC_D);
}
FPURECOMPILE_CONSTCODE(NEG_S, XMMINFO_WRITED | XMMINFO_READS);
//------------------------------------------------------------------
//------------------------------------------------------------------
// SUB XMM
//------------------------------------------------------------------
// Emits regd - regt via FPU_SUB(), clamping both operands with fpuFloat2()
// first when CHECK_FPU_EXTRA_OVERFLOW is enabled.
void recSUBhelper(int regd, int regt)
{
	const bool clampOperands = CHECK_FPU_EXTRA_OVERFLOW /*&& !CHECK_FPUCLAMPHACK*/;
	if (clampOperands)
	{
		fpuFloat2(regd);
		fpuFloat2(regt);
	}

	FPU_SUB(regd, regt);
}
// Emits the shared body of SUB.S / SUBA.S: regd = Fs - Ft.
//
// info - PROCESS_EE_S / PROCESS_EE_T say whether Fs / Ft are already held in
//        EEREC_S / EEREC_T; operands that are not are loaded from fpuRegs.
// regd - XMM register that receives the result.
//
// The result is always passed through ClampValues().
void recSUBop(int info, int regd)
{
	int t0reg = _allocTempXMMreg(XMMT_FPS);

	// Clearing of the O and U flags is currently disabled:
	//xAND(ptr32[&fpuRegs.fprc[31]], ~(FPUflagO|FPUflagU)); // Clear O and U flags

	const int cachedOperands = info & (PROCESS_EE_S | PROCESS_EE_T);
	const bool bothCached = cachedOperands == (PROCESS_EE_S | PROCESS_EE_T);

	if (bothCached || cachedOperands == PROCESS_EE_T)
	{
		// Ft is register-cached. If the destination aliases it, save Ft in the
		// temporary before regd is overwritten with Fs.
		int subtrahend = EEREC_T;
		if (regd == EEREC_T)
		{
			xMOVSS(xRegisterSSE(t0reg), xRegisterSSE(EEREC_T));
			subtrahend = t0reg;
		}

		if (bothCached)
			xMOVSS(xRegisterSSE(regd), xRegisterSSE(EEREC_S));
		else
			xMOVSSZX(xRegisterSSE(regd), ptr[&fpuRegs.fpr[_Fs_]]);

		recSUBhelper(regd, subtrahend);
	}
	else if (cachedOperands == PROCESS_EE_S)
	{
		// Only Fs is register-cached; Ft is loaded into the temporary.
		if (regd != EEREC_S)
			xMOVSS(xRegisterSSE(regd), xRegisterSSE(EEREC_S));
		xMOVSSZX(xRegisterSSE(t0reg), ptr[&fpuRegs.fpr[_Ft_]]);
		recSUBhelper(regd, t0reg);
	}
	else
	{
		// Neither operand is register-cached.
		Console.Warning("FPU: SUB case 4");
		xMOVSSZX(xRegisterSSE(t0reg), ptr[&fpuRegs.fpr[_Ft_]]);
		xMOVSSZX(xRegisterSSE(regd), ptr[&fpuRegs.fpr[_Fs_]]);
		recSUBhelper(regd, t0reg);
	}

	ClampValues(regd);
	_freeXMMreg(t0reg);
}
// SUB.S: records the op with the profiler, then emits the subtraction via
// recSUBop() with EEREC_D as the destination register.
void recSUB_S_xmm(int info)
{
	EE::Profiler.EmitOp(eeOpcode::SUB_F);

	const int destReg = EEREC_D;
	recSUBop(info, destReg);
}

FPURECOMPILE_CONSTCODE(SUB_S, XMMINFO_WRITED | XMMINFO_READS | XMMINFO_READT);
// SUBA.S: records the op with the profiler, then emits the subtraction via
// recSUBop() with EEREC_ACC as the destination register.
void recSUBA_S_xmm(int info)
{
	EE::Profiler.EmitOp(eeOpcode::SUBA_F);

	const int accReg = EEREC_ACC;
	recSUBop(info, accReg);
}

FPURECOMPILE_CONSTCODE(SUBA_S, XMMINFO_WRITEACC | XMMINFO_READS | XMMINFO_READT);
//------------------------------------------------------------------
//------------------------------------------------------------------
// SQRT XMM
//------------------------------------------------------------------
void recSQRT_S_xmm(int info)
{
EE::Profiler.EmitOp(eeOpcode::SQRT_F);
bool roundmodeFlag = false;
//Console.WriteLn("FPU: SQRT");
if (EmuConfig.Cpu.FPUFPCR.GetRoundMode() != FPRoundMode::Nearest)
{
// Set roundmode to nearest if it isn't already
//Console.WriteLn("sqrt to nearest");
roundmode_nearest = EmuConfig.Cpu.FPUFPCR;
roundmode_nearest.SetRoundMode(FPRoundMode::Nearest);
xLDMXCSR(ptr32[&roundmode_nearest.bitmask]);
roundmodeFlag = true;
}
if (info & PROCESS_EE_T)
xMOVSS(xRegisterSSE(EEREC_D), xRegisterSSE(EEREC_T));
else
xMOVSSZX(xRegisterSSE(EEREC_D), ptr[&fpuRegs.fpr[_Ft_]]);
if (CHECK_FPU_EXTRA_FLAGS)
{
xAND(ptr32[&fpuRegs.fprc[31]], ~(FPUflagI | FPUflagD)); // Clear I and D flags
/*--- Check for negative SQRT ---*/
xMOVMSKPS(eax, xRegisterSSE(EEREC_D));
xAND(eax, 1); //Check sign
u8* pjmp = JZ8(0); //Skip if none are
xOR(ptr32[&fpuRegs.fprc[31]], FPUflagI | FPUflagSI); // Set I and SI flags
xAND.PS(xRegisterSSE(EEREC_D), ptr[&s_pos[0]]); // Make EEREC_D Positive
x86SetJ8(pjmp);
}
else
xAND.PS(xRegisterSSE(EEREC_D), ptr[&s_pos[0]]); // Make EEREC_D Positive
if (CHECK_FPU_OVERFLOW) // Only need to do positive clamp, since EEREC_D is positive
xMIN.SS(xRegisterSSE(EEREC_D), ptr[&g_maxvals[0]]);
xSQRT.SS(xRegisterSSE(EEREC_D), xRegisterSSE(EEREC_D));
if (CHECK_FPU_EXTRA_OVERFLOW) // Shouldn't need to clamp again since SQRT of a number will always be smaller than the original number, doing it just incase :/
ClampValues(EEREC_D);
if (roundmodeFlag)
xLDMXCSR(ptr32[&EmuConfig.Cpu.FPUFPCR.bitmask]);
}
FPURECOMPILE_CONSTCODE(SQRT_S, XMMINFO_WRITED | XMMINFO_READT);
//------------------------------------------------------------------
//------------------------------------------------------------------
// RSQRT XMM
//------------------------------------------------------------------
void recRSQRThelper1(int regd, int t0reg) // Preforms the RSQRT function when regd <- Fs and t0reg <- Ft (Sets correct flags)
{
u8 *pjmp1, *pjmp2;
u32 *pjmp32;
u8 *qjmp1, *qjmp2;
int t1reg = _allocTempXMMreg(XMMT_FPS);
xAND(ptr32[&fpuRegs.fprc[31]], ~(FPUflagI | FPUflagD)); // Clear I and D flags
/*--- (first) Check for negative SQRT ---*/
xMOVMSKPS(eax, xRegisterSSE(t0reg));
xAND(eax, 1); //Check sign
pjmp2 = JZ8(0); //Skip if not set
xOR(ptr32[&fpuRegs.fprc[31]], FPUflagI | FPUflagSI); // Set I and SI flags
xAND.PS(xRegisterSSE(t0reg), ptr[&s_pos[0]]); // Make t0reg Positive
x86SetJ8(pjmp2);
/*--- Check for zero ---*/
xXOR.PS(xRegisterSSE(t1reg), xRegisterSSE(t1reg));
xCMPEQ.SS(xRegisterSSE(t1reg), xRegisterSSE(t0reg));
xMOVMSKPS(eax, xRegisterSSE(t1reg));
xAND(eax, 1); //Check sign (if t0reg == zero, sign will be set)
pjmp1 = JZ8(0); //Skip if not set
/*--- Check for 0/0 ---*/
xXOR.PS(xRegisterSSE(t1reg), xRegisterSSE(t1reg));
xCMPEQ.SS(xRegisterSSE(t1reg), xRegisterSSE(regd));
xMOVMSKPS(eax, xRegisterSSE(t1reg));
xAND(eax, 1); //Check sign (if regd == zero, sign will be set)
qjmp1 = JZ8(0); //Skip if not set
xOR(ptr32[&fpuRegs.fprc[31]], FPUflagI | FPUflagSI); // Set I and SI flags ( 0/0 )
qjmp2 = JMP8(0);
x86SetJ8(qjmp1); //x/0 but not 0/0
xOR(ptr32[&fpuRegs.fprc[31]], FPUflagD | FPUflagSD); // Set D and SD flags ( x/0 )
x86SetJ8(qjmp2);
/*--- Make regd +/- Maximum ---*/
xAND.PS(xRegisterSSE(regd), ptr[&s_neg[0]]); // Get the sign bit
xOR.PS(xRegisterSSE(regd), ptr[&g_maxvals[0]]); // regd = +/- Maximum
pjmp32 = JMP32(0);
x86SetJ8(pjmp1);
if (CHECK_FPU_EXTRA_OVERFLOW)
{
xMIN.SS(xRegisterSSE(t0reg), ptr[&g_maxvals[0]]); // Only need to do positive clamp, since t0reg is positive
fpuFloat2(regd);
}
xSQRT.SS(xRegisterSSE(t0reg), xRegisterSSE(t0reg));
xDIV.SS(xRegisterSSE(regd), xRegisterSSE(t0reg));
ClampValues(regd);
x86SetJ32(pjmp32);
_freeXMMreg(t1reg);
}
void recRSQRThelper2(int regd, int t0reg) // Preforms the RSQRT function when regd <- Fs and t0reg <- Ft (Doesn't set flags)
{
xAND.PS(xRegisterSSE(t0reg), ptr[&s_pos[0]]); // Make t0reg Positive
if (CHECK_FPU_EXTRA_OVERFLOW)
{
xMIN.SS(xRegisterSSE(t0reg), ptr[&g_maxvals[0]]); // Only need to do positive clamp, since t0reg is positive
fpuFloat2(regd);
}
xSQRT.SS(xRegisterSSE(t0reg), xRegisterSSE(t0reg));
xDIV.SS(xRegisterSSE(regd), xRegisterSSE(t0reg));
ClampValues(regd);
}
// RSQRT.S: places Fs in EEREC_D and Ft in a temporary register, then hands
// both to recRSQRThelper1() (with flag updates) or recRSQRThelper2() (without),
// depending on CHECK_FPU_EXTRA_FLAGS.
//
// No MXCSR rounding-mode switch is emitted here.
// NOTE(review): the original comment attributes this to RSQRTSS ignoring the
// MXCSR rounding mode, but the helpers emit SQRTSS/DIVSS - confirm the rationale.
void recRSQRT_S_xmm(int info)
{
	EE::Profiler.EmitOp(eeOpcode::RSQRT_F);

	const int t0reg = _allocTempXMMreg(XMMT_FPS);
	//Console.WriteLn("FPU: RSQRT");

	// Stage the operands. When Ft is register-cached it is copied to the
	// temporary before EEREC_D is written.
	switch (info & (PROCESS_EE_S | PROCESS_EE_T))
	{
		case PROCESS_EE_S:
			//Console.WriteLn("FPU: RSQRT case 1");
			xMOVSS(xRegisterSSE(EEREC_D), xRegisterSSE(EEREC_S));
			xMOVSSZX(xRegisterSSE(t0reg), ptr[&fpuRegs.fpr[_Ft_]]);
			break;

		case PROCESS_EE_T:
			//Console.WriteLn("FPU: RSQRT case 2");
			xMOVSS(xRegisterSSE(t0reg), xRegisterSSE(EEREC_T));
			xMOVSSZX(xRegisterSSE(EEREC_D), ptr[&fpuRegs.fpr[_Fs_]]);
			break;

		case (PROCESS_EE_S | PROCESS_EE_T):
			//Console.WriteLn("FPU: RSQRT case 3");
			xMOVSS(xRegisterSSE(t0reg), xRegisterSSE(EEREC_T));
			xMOVSS(xRegisterSSE(EEREC_D), xRegisterSSE(EEREC_S));
			break;

		default:
			//Console.WriteLn("FPU: RSQRT case 4");
			xMOVSSZX(xRegisterSSE(t0reg), ptr[&fpuRegs.fpr[_Ft_]]);
			xMOVSSZX(xRegisterSSE(EEREC_D), ptr[&fpuRegs.fpr[_Fs_]]);
			break;
	}

	if (CHECK_FPU_EXTRA_FLAGS)
		recRSQRThelper1(EEREC_D, t0reg);
	else
		recRSQRThelper2(EEREC_D, t0reg);

	_freeXMMreg(t0reg);
}

FPURECOMPILE_CONSTCODE(RSQRT_S, XMMINFO_WRITED | XMMINFO_READS | XMMINFO_READT);
#endif // FPU_RECOMPILE
} // namespace COP1
} // namespace OpcodeImpl
} // namespace Dynarec
} // namespace R5900