Jit64: Make DoubleToSingle a common asm routine
This commit is contained in:
parent
9136abf07e
commit
c4799e5977
|
@ -236,6 +236,8 @@ void Jit64AsmRoutineManager::GenerateCommon()
|
||||||
GenFres();
|
GenFres();
|
||||||
mfcr = AlignCode4();
|
mfcr = AlignCode4();
|
||||||
GenMfcr();
|
GenMfcr();
|
||||||
|
cdts = AlignCode4();
|
||||||
|
GenConvertDoubleToSingle();
|
||||||
|
|
||||||
GenQuantizedLoads();
|
GenQuantizedLoads();
|
||||||
GenQuantizedSingleLoads();
|
GenQuantizedSingleLoads();
|
||||||
|
|
|
@ -115,7 +115,8 @@ void Jit64::stfXXX(UGeckoInstruction inst)
|
||||||
{
|
{
|
||||||
RCX64Reg Rs = fpr.Bind(s, RCMode::Read);
|
RCX64Reg Rs = fpr.Bind(s, RCMode::Read);
|
||||||
RegCache::Realize(Rs);
|
RegCache::Realize(Rs);
|
||||||
ConvertDoubleToSingle(XMM0, Rs);
|
MOVAPD(XMM0, Rs);
|
||||||
|
CALL(asm_routines.cdts);
|
||||||
}
|
}
|
||||||
MOVD_xmm(R(RSCRATCH), XMM0);
|
MOVD_xmm(R(RSCRATCH), XMM0);
|
||||||
}
|
}
|
||||||
|
|
|
@ -868,89 +868,9 @@ void EmuCodeBlock::Force25BitPrecision(X64Reg output, const OpArg& input, X64Reg
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Since the following float conversion functions are used in non-arithmetic PPC float
|
|
||||||
// instructions, they must convert floats bitexact and never flush denormals to zero or turn SNaNs
|
|
||||||
// into QNaNs. This means we can't use CVTSS2SD/CVTSD2SS. The x87 FPU doesn't even support
|
|
||||||
// flush-to-zero so we can use FLD+FSTP even on denormals.
|
|
||||||
// If the number is a NaN, make sure to set the QNaN bit back to its original value.
|
|
||||||
|
|
||||||
// Another problem is that officially, converting doubles to single format results in undefined
|
|
||||||
// behavior. Relying on undefined behavior is a bug so no software should ever do this.
|
|
||||||
// Super Mario 64 (on Wii VC) accidentally relies on this behavior. See issue #11173
|
|
||||||
|
|
||||||
alignas(16) static const __m128i double_exponent = _mm_set_epi64x(0, 0x7ff0000000000000);
|
|
||||||
alignas(16) static const __m128i double_fraction = _mm_set_epi64x(0, 0x000fffffffffffff);
|
|
||||||
alignas(16) static const __m128i double_sign_bit = _mm_set_epi64x(0, 0x8000000000000000);
|
|
||||||
alignas(16) static const __m128i double_explicit_top_bit = _mm_set_epi64x(0, 0x0010000000000000);
|
|
||||||
alignas(16) static const __m128i double_top_two_bits = _mm_set_epi64x(0, 0xc000000000000000);
|
|
||||||
alignas(16) static const __m128i double_bottom_bits = _mm_set_epi64x(0, 0x07ffffffe0000000);
|
|
||||||
alignas(16) static const __m128i double_qnan_bit = _mm_set_epi64x(0xffffffffffffffff,
|
alignas(16) static const __m128i double_qnan_bit = _mm_set_epi64x(0xffffffffffffffff,
|
||||||
0xfff7ffffffffffff);
|
0xfff7ffffffffffff);
|
||||||
|
|
||||||
// This is the same algorithm used in the interpreter (and actual hardware)
|
|
||||||
// The documentation states that the conversion of a double with an exponent outside the
|
|
||||||
// valid range for a single (or a single denormal) is undefined.
|
|
||||||
// But testing on actual hardware shows it always picks bits 0..1 and 5..34
|
|
||||||
// unless the exponent is in the range of 874 to 896.
|
|
||||||
void EmuCodeBlock::ConvertDoubleToSingle(X64Reg dst, X64Reg src)
|
|
||||||
{
|
|
||||||
MOVAPD(XMM1, R(src));
|
|
||||||
|
|
||||||
// Grab the 11-bit exponent into RSCRATCH (this routine uses XMM0, XMM1 and RSCRATCH as scratch)
|
|
||||||
PAND(XMM1, MConst(double_exponent));
|
|
||||||
PSRLQ(XMM1, 52);
|
|
||||||
MOVD_xmm(R(RSCRATCH), XMM1);
|
|
||||||
|
|
||||||
// Check whether the exponent is in 874..896, i.e. the result is a single-precision denormal
|
|
||||||
SUB(16, R(RSCRATCH), Imm16(874));
|
|
||||||
CMP(16, R(RSCRATCH), Imm16(896 - 874));
|
|
||||||
FixupBranch NoDenormalize = J_CC(CC_A);  // unsigned (Exponent - 874) > 22: not a denormal
|
|
||||||
|
|
||||||
// Denormalize
|
|
||||||
|
|
||||||
// shift = (905 - Exponent) plus the 21 bit double to single shift; XMM1 still holds Exponent
|
|
||||||
MOV(16, R(RSCRATCH), Imm16(905 + 21));
|
|
||||||
MOVD_xmm(XMM0, R(RSCRATCH));
|
|
||||||
PSUBQ(XMM0, R(XMM1));
|
|
||||||
|
|
||||||
// XMM1 = fraction | 0x0010000000000000 (fraction with the implicit leading one made explicit)
|
|
||||||
MOVAPD(XMM1, R(src));
|
|
||||||
PAND(XMM1, MConst(double_fraction));
|
|
||||||
POR(XMM1, MConst(double_explicit_top_bit));
|
|
||||||
|
|
||||||
// fraction >> shift (shift count is taken from XMM0)
|
|
||||||
PSRLQ(XMM1, R(XMM0));
|
|
||||||
|
|
||||||
// OR the sign bit in: move it from bit 63 of the double down to bit 31 of the single.
|
|
||||||
MOVAPD(XMM0, R(src));
|
|
||||||
PAND(XMM0, MConst(double_sign_bit));
|
|
||||||
PSRLQ(XMM0, 32);
|
|
||||||
POR(XMM1, R(XMM0));
|
|
||||||
|
|
||||||
FixupBranch end = J(false); // Denormal path done; skip over the non-denormal path
|
|
||||||
|
|
||||||
SetJumpTarget(NoDenormalize);
|
|
||||||
|
|
||||||
// Don't denormalize: build the single by picking bits straight out of the double
|
|
||||||
|
|
||||||
// We want bits 0 and 1 (counting from the most significant bit), moved to bits 31..30
|
|
||||||
MOVAPD(XMM1, R(src));
|
|
||||||
PAND(XMM1, MConst(double_top_two_bits));
|
|
||||||
PSRLQ(XMM1, 32);
|
|
||||||
|
|
||||||
// And bits 5 through 34 (counting from the most significant bit), moved to bits 29..0
|
|
||||||
MOVAPD(XMM0, R(src));
|
|
||||||
PAND(XMM0, MConst(double_bottom_bits));
|
|
||||||
PSRLQ(XMM0, 29);
|
|
||||||
|
|
||||||
// OR them together
|
|
||||||
POR(XMM1, R(XMM0));
|
|
||||||
|
|
||||||
// End: both paths leave the result in the low bits of XMM1
|
|
||||||
SetJumpTarget(end);
|
|
||||||
MOVDDUP(dst, R(XMM1));  // duplicate the low 64 bits of XMM1 into both halves of dst
|
|
||||||
}
|
|
||||||
|
|
||||||
// Converting single->double is a bit easier because all single denormals are double normals.
|
// Converting single->double is a bit easier because all single denormals are double normals.
|
||||||
void EmuCodeBlock::ConvertSingleToDouble(X64Reg dst, X64Reg src, bool src_is_gpr)
|
void EmuCodeBlock::ConvertSingleToDouble(X64Reg dst, X64Reg src, bool src_is_gpr)
|
||||||
{
|
{
|
||||||
|
|
|
@ -9,6 +9,7 @@
|
||||||
#include "Common/CPUDetect.h"
|
#include "Common/CPUDetect.h"
|
||||||
#include "Common/CommonTypes.h"
|
#include "Common/CommonTypes.h"
|
||||||
#include "Common/FloatUtils.h"
|
#include "Common/FloatUtils.h"
|
||||||
|
#include "Common/Intrinsics.h"
|
||||||
#include "Common/JitRegister.h"
|
#include "Common/JitRegister.h"
|
||||||
#include "Common/x64ABI.h"
|
#include "Common/x64ABI.h"
|
||||||
#include "Common/x64Emitter.h"
|
#include "Common/x64Emitter.h"
|
||||||
|
@ -25,6 +26,87 @@
|
||||||
|
|
||||||
using namespace Gen;
|
using namespace Gen;
|
||||||
|
|
||||||
|
alignas(16) static const __m128i double_fraction = _mm_set_epi64x(0, 0x000fffffffffffff);
|
||||||
|
alignas(16) static const __m128i double_sign_bit = _mm_set_epi64x(0, 0x8000000000000000);
|
||||||
|
alignas(16) static const __m128i double_explicit_top_bit = _mm_set_epi64x(0, 0x0010000000000000);
|
||||||
|
alignas(16) static const __m128i double_top_two_bits = _mm_set_epi64x(0, 0xc000000000000000);
|
||||||
|
alignas(16) static const __m128i double_bottom_bits = _mm_set_epi64x(0, 0x07ffffffe0000000);
|
||||||
|
|
||||||
|
// Since the following float conversion functions are used in non-arithmetic PPC float
|
||||||
|
// instructions, they must convert floats bitexact and never flush denormals to zero or turn SNaNs
|
||||||
|
// into QNaNs. This means we can't use CVTSS2SD/CVTSD2SS. The x87 FPU doesn't even support
|
||||||
|
// flush-to-zero so we can use FLD+FSTP even on denormals.
|
||||||
|
// If the number is a NaN, make sure to set the QNaN bit back to its original value.
|
||||||
|
|
||||||
|
// Another problem is that officially, converting doubles to single format results in undefined
|
||||||
|
// behavior. Relying on undefined behavior is a bug so no software should ever do this.
|
||||||
|
// Super Mario 64 (on Wii VC) accidentally relies on this behavior. See issue #11173
|
||||||
|
|
||||||
|
// This is the same algorithm used in the interpreter (and actual hardware)
|
||||||
|
// The documentation states that the conversion of a double with an exponent outside the
|
||||||
|
// valid range for a single (or a single denormal) is undefined.
|
||||||
|
// But testing on actual hardware shows it always picks bits 0..1 and 5..34
|
||||||
|
// unless the exponent is in the range of 874 to 896.
|
||||||
|
|
||||||
|
void CommonAsmRoutines::GenConvertDoubleToSingle()
|
||||||
|
{
|
||||||
|
// Input in XMM0 (double), output to XMM0 (single-precision bit pattern in the low 32 bits)
|
||||||
|
// Clobbers RSCRATCH/RSCRATCH2/XMM1. The routine ends in RET, so it must be entered via CALL.
|
||||||
|
|
||||||
|
const void* start = GetCodePtr();
|
||||||
|
|
||||||
|
// Grab the exponent into RSCRATCH; keep a copy of the raw double bits in RSCRATCH2 for the sign
|
||||||
|
MOVQ_xmm(R(RSCRATCH), XMM0);
|
||||||
|
MOV(64, R(RSCRATCH2), R(RSCRATCH));
|
||||||
|
SHR(64, R(RSCRATCH), Imm8(52));
|
||||||
|
AND(16, R(RSCRATCH), Imm16(0x7ff));  // drop the sign bit, leaving the 11-bit exponent
|
||||||
|
|
||||||
|
// Check whether the exponent is in 874..896, i.e. the result is a single-precision denormal
|
||||||
|
SUB(16, R(RSCRATCH), Imm16(874));
|
||||||
|
CMP(16, R(RSCRATCH), Imm16(896 - 874));
|
||||||
|
FixupBranch Denormalize = J_CC(CC_NA);  // unsigned (Exponent - 874) <= 22: denormal path
|
||||||
|
|
||||||
|
// Don't denormalize: build the single by picking bits straight out of the double
|
||||||
|
|
||||||
|
// We want bits 0 and 1 (counting from the most significant bit), moved to bits 31..30
|
||||||
|
MOVAPD(XMM1, R(XMM0));
|
||||||
|
PAND(XMM1, MConst(double_top_two_bits));
|
||||||
|
PSRLQ(XMM1, 32);
|
||||||
|
|
||||||
|
// And bits 5 through 34 (counting from the most significant bit), moved to bits 29..0
|
||||||
|
PAND(XMM0, MConst(double_bottom_bits));
|
||||||
|
PSRLQ(XMM0, 29);
|
||||||
|
|
||||||
|
// OR them together
|
||||||
|
POR(XMM0, R(XMM1));
|
||||||
|
RET();  // non-denormal path returns here with the result in XMM0
|
||||||
|
|
||||||
|
// Denormalize
|
||||||
|
SetJumpTarget(Denormalize);
|
||||||
|
|
||||||
|
// shift = (905 - Exponent) + 21 bit double to single shift; RSCRATCH holds (Exponent - 874)
|
||||||
|
NEG(16, R(RSCRATCH));
|
||||||
|
ADD(16, R(RSCRATCH), Imm16((905 + 21) - 874));
|
||||||
|
MOVQ_xmm(XMM1, R(RSCRATCH));
|
||||||
|
|
||||||
|
// XMM0 = fraction | 0x0010000000000000 (fraction with the implicit leading one made explicit)
|
||||||
|
PAND(XMM0, MConst(double_fraction));
|
||||||
|
POR(XMM0, MConst(double_explicit_top_bit));
|
||||||
|
|
||||||
|
// fraction >> shift (shift count is taken from XMM1)
|
||||||
|
PSRLQ(XMM0, R(XMM1));
|
||||||
|
|
||||||
|
// OR the sign bit in: take it from the saved raw bits and move it from bit 63 to bit 31.
|
||||||
|
SHR(64, R(RSCRATCH2), Imm8(32));
|
||||||
|
AND(32, R(RSCRATCH2), Imm32(0x80000000));
|
||||||
|
MOVQ_xmm(XMM1, R(RSCRATCH2));
|
||||||
|
|
||||||
|
POR(XMM0, R(XMM1));
|
||||||
|
RET();  // denormal path returns here with the result in XMM0
|
||||||
|
|
||||||
|
JitRegister::Register(start, GetCodePtr(), "JIT_cdts");  // name the emitted range "JIT_cdts"
|
||||||
|
}
|
||||||
|
|
||||||
void CommonAsmRoutines::GenFrsqrte()
|
void CommonAsmRoutines::GenFrsqrte()
|
||||||
{
|
{
|
||||||
const void* start = GetCodePtr();
|
const void* start = GetCodePtr();
|
||||||
|
|
|
@ -31,6 +31,7 @@ public:
|
||||||
void GenMfcr();
|
void GenMfcr();
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
|
void GenConvertDoubleToSingle();
|
||||||
const u8* GenQuantizedLoadRuntime(bool single, EQuantizeType type);
|
const u8* GenQuantizedLoadRuntime(bool single, EQuantizeType type);
|
||||||
const u8* GenQuantizedStoreRuntime(bool single, EQuantizeType type);
|
const u8* GenQuantizedStoreRuntime(bool single, EQuantizeType type);
|
||||||
void GenQuantizedLoads();
|
void GenQuantizedLoads();
|
||||||
|
|
|
@ -25,6 +25,7 @@ struct CommonAsmRoutinesBase
|
||||||
const u8* frsqrte;
|
const u8* frsqrte;
|
||||||
const u8* fres;
|
const u8* fres;
|
||||||
const u8* mfcr;
|
const u8* mfcr;
|
||||||
|
const u8* cdts;
|
||||||
|
|
||||||
// In: array index: GQR to use.
|
// In: array index: GQR to use.
|
||||||
// In: ECX: Address to read from.
|
// In: ECX: Address to read from.
|
||||||
|
|
Loading…
Reference in New Issue