Jit64AsmCommon: Reduce branches in fast-path and inline most behavior

The only behavior requires a function call is denormal behaviour. We also fix fpscr exception raising.
This commit is contained in:
MerryMage 2018-09-23 20:11:09 +01:00
parent 826bcad3a5
commit 89a0b0de32
1 changed files with 43 additions and 17 deletions

View File

@ -32,17 +32,14 @@ void CommonAsmRoutines::GenFrsqrte()
// This function clobbers all three RSCRATCH.
MOVQ_xmm(R(RSCRATCH), XMM0);
// Negative and zero inputs set an exception and take the complex path.
TEST(64, R(RSCRATCH), R(RSCRATCH));
FixupBranch zero = J_CC(CC_Z, true);
FixupBranch negative = J_CC(CC_S, true);
// Extract exponent
MOV(64, R(RSCRATCH_EXTRA), R(RSCRATCH));
SHR(64, R(RSCRATCH_EXTRA), Imm8(52));
// Zero and max exponents (non-normal floats) take the complex path.
FixupBranch complex1 = J_CC(CC_Z, true);
CMP(32, R(RSCRATCH_EXTRA), Imm32(0x7FF));
FixupBranch complex2 = J_CC(CC_E, true);
// Negatives, zeros, denormals, infinities and NaNs take the complex path.
LEA(32, RSCRATCH2, MDisp(RSCRATCH_EXTRA, -1));
CMP(32, R(RSCRATCH2), Imm32(0x7FE));
FixupBranch complex = J_CC(CC_AE, true);
SUB(32, R(RSCRATCH_EXTRA), Imm32(0x3FD));
SAR(32, R(RSCRATCH_EXTRA), Imm8(1));
@ -75,24 +72,53 @@ void CommonAsmRoutines::GenFrsqrte()
MOVQ_xmm(XMM0, R(RSCRATCH2));
RET();
// Exception flags for zero input.
SetJumpTarget(zero);
SetJumpTarget(complex);
AND(32, R(RSCRATCH_EXTRA), Imm32(0x7FF));
CMP(32, R(RSCRATCH_EXTRA), Imm32(0x7FF));
FixupBranch nan_or_inf = J_CC(CC_E);
MOV(64, R(RSCRATCH2), R(RSCRATCH));
SHL(64, R(RSCRATCH2), Imm8(1));
FixupBranch nonzero = J_CC(CC_NZ);
// +0.0 or -0.0
TEST(32, PPCSTATE(fpscr), Imm32(FPSCR_ZX));
FixupBranch skip_set_fx1 = J_CC(CC_NZ);
OR(32, PPCSTATE(fpscr), Imm32(FPSCR_FX | FPSCR_ZX));
FixupBranch complex3 = J();
SetJumpTarget(skip_set_fx1);
MOV(64, R(RSCRATCH2), Imm64(0x7FF0'0000'0000'0000));
OR(64, R(RSCRATCH2), R(RSCRATCH));
MOVQ_xmm(XMM0, R(RSCRATCH2));
RET();
// Exception flags for negative input.
// SNaN or QNaN or +Inf or -Inf
SetJumpTarget(nan_or_inf);
MOV(64, R(RSCRATCH2), R(RSCRATCH));
SHL(64, R(RSCRATCH2), Imm8(12));
FixupBranch inf = J_CC(CC_Z);
BTS(64, R(RSCRATCH), Imm8(51));
MOVQ_xmm(XMM0, R(RSCRATCH));
RET();
SetJumpTarget(inf);
BT(64, R(RSCRATCH), Imm8(63));
FixupBranch negative = J_CC(CC_C);
XORPD(XMM0, R(XMM0));
RET();
SetJumpTarget(nonzero);
FixupBranch denormal = J_CC(CC_NC);
// Negative sign
SetJumpTarget(negative);
TEST(32, PPCSTATE(fpscr), Imm32(FPSCR_VXSQRT));
FixupBranch skip_set_fx2 = J_CC(CC_NZ);
OR(32, PPCSTATE(fpscr), Imm32(FPSCR_FX | FPSCR_VXSQRT));
SetJumpTarget(skip_set_fx1);
SetJumpTarget(skip_set_fx2);
SetJumpTarget(complex1);
SetJumpTarget(complex2);
SetJumpTarget(complex3);
MOV(64, R(RSCRATCH2), Imm64(0x7FF8'0000'0000'0000));
MOVQ_xmm(XMM0, R(RSCRATCH2));
RET();
SetJumpTarget(denormal);
ABI_PushRegistersAndAdjustStack(QUANTIZED_REGS_TO_SAVE, 8);
ABI_CallFunction(Common::ApproximateReciprocalSquareRoot);
ABI_PopRegistersAndAdjustStack(QUANTIZED_REGS_TO_SAVE, 8);