Merge pull request #1958 from Sonicadvance1/Rearchitect_asmcommon
Rearchitect a bit of our AsmCommon routines.
This commit is contained in:
commit
b1fc18cbaa
|
@ -199,6 +199,7 @@ if(_M_X86)
|
||||||
PowerPC/Jit64/Jit_Paired.cpp
|
PowerPC/Jit64/Jit_Paired.cpp
|
||||||
PowerPC/Jit64/JitRegCache.cpp
|
PowerPC/Jit64/JitRegCache.cpp
|
||||||
PowerPC/Jit64/Jit_SystemRegisters.cpp
|
PowerPC/Jit64/Jit_SystemRegisters.cpp
|
||||||
|
PowerPC/Jit64Common/Jit64AsmCommon.cpp
|
||||||
PowerPC/JitCommon/JitBackpatch.cpp
|
PowerPC/JitCommon/JitBackpatch.cpp
|
||||||
PowerPC/JitCommon/Jit_Util.cpp
|
PowerPC/JitCommon/Jit_Util.cpp
|
||||||
PowerPC/JitCommon/TrampolineCache.cpp)
|
PowerPC/JitCommon/TrampolineCache.cpp)
|
||||||
|
|
|
@ -235,6 +235,7 @@
|
||||||
<ClCompile Include="PowerPC\Jit64\Jit_LoadStorePaired.cpp" />
|
<ClCompile Include="PowerPC\Jit64\Jit_LoadStorePaired.cpp" />
|
||||||
<ClCompile Include="PowerPC\Jit64\Jit_Paired.cpp" />
|
<ClCompile Include="PowerPC\Jit64\Jit_Paired.cpp" />
|
||||||
<ClCompile Include="PowerPC\Jit64\Jit_SystemRegisters.cpp" />
|
<ClCompile Include="PowerPC\Jit64\Jit_SystemRegisters.cpp" />
|
||||||
|
<ClCompile Include="PowerPC\Jit64Common\Jit64AsmCommon.cpp" />
|
||||||
<ClCompile Include="PowerPC\JitCommon\JitAsmCommon.cpp" />
|
<ClCompile Include="PowerPC\JitCommon\JitAsmCommon.cpp" />
|
||||||
<ClCompile Include="PowerPC\JitCommon\JitBackpatch.cpp" />
|
<ClCompile Include="PowerPC\JitCommon\JitBackpatch.cpp" />
|
||||||
<ClCompile Include="PowerPC\JitCommon\JitBase.cpp" />
|
<ClCompile Include="PowerPC\JitCommon\JitBase.cpp" />
|
||||||
|
@ -417,6 +418,7 @@
|
||||||
<ClInclude Include="PowerPC\Jit64\JitRegCache.h" />
|
<ClInclude Include="PowerPC\Jit64\JitRegCache.h" />
|
||||||
<ClInclude Include="PowerPC\JitILCommon\IR.h" />
|
<ClInclude Include="PowerPC\JitILCommon\IR.h" />
|
||||||
<ClInclude Include="PowerPC\JitILCommon\JitILBase.h" />
|
<ClInclude Include="PowerPC\JitILCommon\JitILBase.h" />
|
||||||
|
<ClInclude Include="PowerPC\Jit64Common\Jit64AsmCommon.h" />
|
||||||
<ClInclude Include="PowerPC\JitCommon\JitAsmCommon.h" />
|
<ClInclude Include="PowerPC\JitCommon\JitAsmCommon.h" />
|
||||||
<ClInclude Include="PowerPC\JitCommon\JitBase.h" />
|
<ClInclude Include="PowerPC\JitCommon\JitBase.h" />
|
||||||
<ClInclude Include="PowerPC\JitCommon\JitCache.h" />
|
<ClInclude Include="PowerPC\JitCommon\JitCache.h" />
|
||||||
|
|
|
@ -631,6 +631,9 @@
|
||||||
<ClCompile Include="PowerPC\SignatureDB.cpp">
|
<ClCompile Include="PowerPC\SignatureDB.cpp">
|
||||||
<Filter>PowerPC</Filter>
|
<Filter>PowerPC</Filter>
|
||||||
</ClCompile>
|
</ClCompile>
|
||||||
|
<ClCompile Include="PowerPC\Jit64Common\Jit64AsmCommon.cpp">
|
||||||
|
<Filter>PowerPC\Jit64Common</Filter>
|
||||||
|
</ClCompile>
|
||||||
<ClCompile Include="PowerPC\JitCommon\Jit_Util.cpp">
|
<ClCompile Include="PowerPC\JitCommon\Jit_Util.cpp">
|
||||||
<Filter>PowerPC\JitCommon</Filter>
|
<Filter>PowerPC\JitCommon</Filter>
|
||||||
</ClCompile>
|
</ClCompile>
|
||||||
|
@ -1184,6 +1187,9 @@
|
||||||
<ClInclude Include="PowerPC\SignatureDB.h">
|
<ClInclude Include="PowerPC\SignatureDB.h">
|
||||||
<Filter>PowerPC</Filter>
|
<Filter>PowerPC</Filter>
|
||||||
</ClInclude>
|
</ClInclude>
|
||||||
|
<ClInclude Include="PowerPC\Jit64Common\Jit64AsmCommon.h">
|
||||||
|
<Filter>PowerPC\Jit64Common</Filter>
|
||||||
|
</ClInclude>
|
||||||
<ClInclude Include="PowerPC\JitCommon\Jit_Util.h">
|
<ClInclude Include="PowerPC\JitCommon\Jit_Util.h">
|
||||||
<Filter>PowerPC\JitCommon</Filter>
|
<Filter>PowerPC\JitCommon</Filter>
|
||||||
</ClInclude>
|
</ClInclude>
|
||||||
|
@ -1229,4 +1235,4 @@
|
||||||
<ItemGroup>
|
<ItemGroup>
|
||||||
<Text Include="CMakeLists.txt" />
|
<Text Include="CMakeLists.txt" />
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
</Project>
|
</Project>
|
||||||
|
|
|
@ -4,7 +4,7 @@
|
||||||
|
|
||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
#include "Core/PowerPC/JitCommon/JitAsmCommon.h"
|
#include "Core/PowerPC/Jit64Common/Jit64AsmCommon.h"
|
||||||
|
|
||||||
// In Dolphin, we don't use inline assembly. Instead, we generate all machine-near
|
// In Dolphin, we don't use inline assembly. Instead, we generate all machine-near
|
||||||
// code at runtime. In the case of fixed code like this, after writing it, we write
|
// code at runtime. In the case of fixed code like this, after writing it, we write
|
||||||
|
|
|
@ -0,0 +1,601 @@
|
||||||
|
// Copyright 2013 Dolphin Emulator Project
|
||||||
|
// Licensed under GPLv2
|
||||||
|
// Refer to the license.txt file included.
|
||||||
|
|
||||||
|
#include "Common/MathUtil.h"
|
||||||
|
#include "Common/x64ABI.h"
|
||||||
|
#include "Common/x64Emitter.h"
|
||||||
|
|
||||||
|
#include "Core/PowerPC/Jit64Common/Jit64AsmCommon.h"
|
||||||
|
#include "Core/PowerPC/JitCommon/JitBase.h"
|
||||||
|
|
||||||
|
// Register set handed to the safe load/store helpers and the ABI push/pop calls below:
// every caller-saved register except the three scratch GPRs and XMM0/XMM1.
// NOTE(review): XMM registers are written as XMMn+16, presumably their bit position
// inside the BitSet32 -- confirm against the ABI header.
#define QUANTIZED_REGS_TO_SAVE \
|
||||||
|
(ABI_ALL_CALLER_SAVED & ~BitSet32 { \
|
||||||
|
RSCRATCH, RSCRATCH2, RSCRATCH_EXTRA, XMM0+16, XMM1+16 \
|
||||||
|
})
|
||||||
|
|
||||||
|
// Same set with RSCRATCH2 added back. The quantized integer loads pass this set because
// they still read RSCRATCH2 (the dequantize table offset) after the safe load returns.
#define QUANTIZED_REGS_TO_SAVE_LOAD (QUANTIZED_REGS_TO_SAVE | BitSet32 { RSCRATCH2 })
|
||||||
|
|
||||||
|
using namespace Gen;
|
||||||
|
|
||||||
|
void CommonAsmRoutines::GenFifoWrite(int size)
|
||||||
|
{
|
||||||
|
// Assume value in RSCRATCH
|
||||||
|
u32 gather_pipe = (u32)(u64)GPFifo::m_gatherPipe;
|
||||||
|
_assert_msg_(DYNA_REC, gather_pipe <= 0x7FFFFFFF, "Gather pipe not in low 2GB of memory!");
|
||||||
|
MOV(32, R(RSCRATCH2), M(&GPFifo::m_gatherPipeCount));
|
||||||
|
SwapAndStore(size, MDisp(RSCRATCH2, gather_pipe), RSCRATCH);
|
||||||
|
ADD(32, R(RSCRATCH2), Imm8(size >> 3));
|
||||||
|
MOV(32, M(&GPFifo::m_gatherPipeCount), R(RSCRATCH2));
|
||||||
|
RET();
|
||||||
|
}
|
||||||
|
|
||||||
|
void CommonAsmRoutines::GenFrsqrte()
|
||||||
|
{
|
||||||
|
// Assume input in XMM0.
|
||||||
|
// This function clobbers all three RSCRATCH.
|
||||||
|
MOVQ_xmm(R(RSCRATCH), XMM0);
|
||||||
|
|
||||||
|
// Negative and zero inputs set an exception and take the complex path.
|
||||||
|
TEST(64, R(RSCRATCH), R(RSCRATCH));
|
||||||
|
FixupBranch zero = J_CC(CC_Z, true);
|
||||||
|
FixupBranch negative = J_CC(CC_S, true);
|
||||||
|
MOV(64, R(RSCRATCH_EXTRA), R(RSCRATCH));
|
||||||
|
SHR(64, R(RSCRATCH_EXTRA), Imm8(52));
|
||||||
|
|
||||||
|
// Zero and max exponents (non-normal floats) take the complex path.
|
||||||
|
FixupBranch complex1 = J_CC(CC_Z, true);
|
||||||
|
CMP(32, R(RSCRATCH_EXTRA), Imm32(0x7FF));
|
||||||
|
FixupBranch complex2 = J_CC(CC_E, true);
|
||||||
|
|
||||||
|
SUB(32, R(RSCRATCH_EXTRA), Imm32(0x3FD));
|
||||||
|
SAR(32, R(RSCRATCH_EXTRA), Imm8(1));
|
||||||
|
MOV(32, R(RSCRATCH2), Imm32(0x3FF));
|
||||||
|
SUB(32, R(RSCRATCH2), R(RSCRATCH_EXTRA));
|
||||||
|
SHL(64, R(RSCRATCH2), Imm8(52)); // exponent = ((0x3FFLL << 52) - ((exponent - (0x3FELL << 52)) / 2)) & (0x7FFLL << 52);
|
||||||
|
|
||||||
|
MOV(64, R(RSCRATCH_EXTRA), R(RSCRATCH));
|
||||||
|
SHR(64, R(RSCRATCH_EXTRA), Imm8(48));
|
||||||
|
AND(32, R(RSCRATCH_EXTRA), Imm8(0x1F));
|
||||||
|
XOR(32, R(RSCRATCH_EXTRA), Imm8(0x10)); // int index = i / 2048 + (odd_exponent ? 16 : 0);
|
||||||
|
|
||||||
|
SHR(64, R(RSCRATCH), Imm8(37));
|
||||||
|
AND(32, R(RSCRATCH), Imm32(0x7FF));
|
||||||
|
IMUL(32, RSCRATCH, MScaled(RSCRATCH_EXTRA, SCALE_4, (u32)(u64)MathUtil::frsqrte_expected_dec));
|
||||||
|
MOV(32, R(RSCRATCH_EXTRA), MScaled(RSCRATCH_EXTRA, SCALE_4, (u32)(u64)MathUtil::frsqrte_expected_base));
|
||||||
|
SUB(32, R(RSCRATCH_EXTRA), R(RSCRATCH));
|
||||||
|
SHL(64, R(RSCRATCH_EXTRA), Imm8(26));
|
||||||
|
OR(64, R(RSCRATCH2), R(RSCRATCH_EXTRA)); // vali |= (s64)(frsqrte_expected_base[index] - frsqrte_expected_dec[index] * (i % 2048)) << 26;
|
||||||
|
MOVQ_xmm(XMM0, R(RSCRATCH2));
|
||||||
|
RET();
|
||||||
|
|
||||||
|
// Exception flags for zero input.
|
||||||
|
SetJumpTarget(zero);
|
||||||
|
TEST(32, PPCSTATE(fpscr), Imm32(FPSCR_ZX));
|
||||||
|
FixupBranch skip_set_fx1 = J_CC(CC_NZ);
|
||||||
|
OR(32, PPCSTATE(fpscr), Imm32(FPSCR_FX | FPSCR_ZX));
|
||||||
|
FixupBranch complex3 = J();
|
||||||
|
|
||||||
|
// Exception flags for negative input.
|
||||||
|
SetJumpTarget(negative);
|
||||||
|
TEST(32, PPCSTATE(fpscr), Imm32(FPSCR_VXSQRT));
|
||||||
|
FixupBranch skip_set_fx2 = J_CC(CC_NZ);
|
||||||
|
OR(32, PPCSTATE(fpscr), Imm32(FPSCR_FX | FPSCR_VXSQRT));
|
||||||
|
|
||||||
|
SetJumpTarget(skip_set_fx1);
|
||||||
|
SetJumpTarget(skip_set_fx2);
|
||||||
|
SetJumpTarget(complex1);
|
||||||
|
SetJumpTarget(complex2);
|
||||||
|
SetJumpTarget(complex3);
|
||||||
|
ABI_PushRegistersAndAdjustStack(QUANTIZED_REGS_TO_SAVE, 8);
|
||||||
|
ABI_CallFunction((void *)&MathUtil::ApproximateReciprocalSquareRoot);
|
||||||
|
ABI_PopRegistersAndAdjustStack(QUANTIZED_REGS_TO_SAVE, 8);
|
||||||
|
RET();
|
||||||
|
}
|
||||||
|
|
||||||
|
void CommonAsmRoutines::GenFres()
|
||||||
|
{
|
||||||
|
// Assume input in XMM0.
|
||||||
|
// This function clobbers all three RSCRATCH.
|
||||||
|
MOVQ_xmm(R(RSCRATCH), XMM0);
|
||||||
|
|
||||||
|
// Zero inputs set an exception and take the complex path.
|
||||||
|
TEST(64, R(RSCRATCH), R(RSCRATCH));
|
||||||
|
FixupBranch zero = J_CC(CC_Z);
|
||||||
|
|
||||||
|
MOV(64, R(RSCRATCH_EXTRA), R(RSCRATCH));
|
||||||
|
SHR(64, R(RSCRATCH_EXTRA), Imm8(52));
|
||||||
|
MOV(32, R(RSCRATCH2), R(RSCRATCH_EXTRA));
|
||||||
|
AND(32, R(RSCRATCH_EXTRA), Imm32(0x7FF)); // exp
|
||||||
|
AND(32, R(RSCRATCH2), Imm32(0x800)); // sign
|
||||||
|
CMP(32, R(RSCRATCH_EXTRA), Imm32(895));
|
||||||
|
// Take the complex path for very large/small exponents.
|
||||||
|
FixupBranch complex1 = J_CC(CC_L);
|
||||||
|
CMP(32, R(RSCRATCH_EXTRA), Imm32(1149));
|
||||||
|
FixupBranch complex2 = J_CC(CC_GE);
|
||||||
|
|
||||||
|
SUB(32, R(RSCRATCH_EXTRA), Imm32(0x7FD));
|
||||||
|
NEG(32, R(RSCRATCH_EXTRA));
|
||||||
|
OR(32, R(RSCRATCH_EXTRA), R(RSCRATCH2));
|
||||||
|
SHL(64, R(RSCRATCH_EXTRA), Imm8(52)); // vali = sign | exponent
|
||||||
|
|
||||||
|
MOV(64, R(RSCRATCH2), R(RSCRATCH));
|
||||||
|
SHR(64, R(RSCRATCH), Imm8(37));
|
||||||
|
SHR(64, R(RSCRATCH2), Imm8(47));
|
||||||
|
AND(32, R(RSCRATCH), Imm32(0x3FF)); // i % 1024
|
||||||
|
AND(32, R(RSCRATCH2), Imm8(0x1F)); // i / 1024
|
||||||
|
|
||||||
|
IMUL(32, RSCRATCH, MScaled(RSCRATCH2, SCALE_4, (u32)(u64)MathUtil::fres_expected_dec));
|
||||||
|
ADD(32, R(RSCRATCH), Imm8(1));
|
||||||
|
SHR(32, R(RSCRATCH), Imm8(1));
|
||||||
|
|
||||||
|
MOV(32, R(RSCRATCH2), MScaled(RSCRATCH2, SCALE_4, (u32)(u64)MathUtil::fres_expected_base));
|
||||||
|
SUB(32, R(RSCRATCH2), R(RSCRATCH));
|
||||||
|
SHL(64, R(RSCRATCH2), Imm8(29));
|
||||||
|
OR(64, R(RSCRATCH2), R(RSCRATCH_EXTRA)); // vali |= (s64)(fres_expected_base[i / 1024] - (fres_expected_dec[i / 1024] * (i % 1024) + 1) / 2) << 29
|
||||||
|
MOVQ_xmm(XMM0, R(RSCRATCH2));
|
||||||
|
RET();
|
||||||
|
|
||||||
|
// Exception flags for zero input.
|
||||||
|
SetJumpTarget(zero);
|
||||||
|
TEST(32, PPCSTATE(fpscr), Imm32(FPSCR_ZX));
|
||||||
|
FixupBranch skip_set_fx1 = J_CC(CC_NZ);
|
||||||
|
OR(32, PPCSTATE(fpscr), Imm32(FPSCR_FX | FPSCR_ZX));
|
||||||
|
SetJumpTarget(skip_set_fx1);
|
||||||
|
|
||||||
|
SetJumpTarget(complex1);
|
||||||
|
SetJumpTarget(complex2);
|
||||||
|
ABI_PushRegistersAndAdjustStack(QUANTIZED_REGS_TO_SAVE, 8);
|
||||||
|
ABI_CallFunction((void *)&MathUtil::ApproximateReciprocal);
|
||||||
|
ABI_PopRegistersAndAdjustStack(QUANTIZED_REGS_TO_SAVE, 8);
|
||||||
|
RET();
|
||||||
|
}
|
||||||
|
|
||||||
|
void CommonAsmRoutines::GenMfcr()
|
||||||
|
{
|
||||||
|
// Input: none
|
||||||
|
// Output: RSCRATCH
|
||||||
|
// This function clobbers all three RSCRATCH.
|
||||||
|
X64Reg dst = RSCRATCH;
|
||||||
|
X64Reg tmp = RSCRATCH2;
|
||||||
|
X64Reg cr_val = RSCRATCH_EXTRA;
|
||||||
|
XOR(32, R(dst), R(dst));
|
||||||
|
// we only need to zero the high bits of tmp once
|
||||||
|
XOR(32, R(tmp), R(tmp));
|
||||||
|
for (int i = 0; i < 8; i++)
|
||||||
|
{
|
||||||
|
static const u32 m_flagTable[8] = { 0x0, 0x1, 0x8, 0x9, 0x0, 0x1, 0x8, 0x9 };
|
||||||
|
if (i != 0)
|
||||||
|
SHL(32, R(dst), Imm8(4));
|
||||||
|
|
||||||
|
MOV(64, R(cr_val), PPCSTATE(cr_val[i]));
|
||||||
|
|
||||||
|
// EQ: Bits 31-0 == 0; set flag bit 1
|
||||||
|
TEST(32, R(cr_val), R(cr_val));
|
||||||
|
// FIXME: is there a better way to do this without the partial register merging?
|
||||||
|
SETcc(CC_Z, R(tmp));
|
||||||
|
LEA(32, dst, MComplex(dst, tmp, SCALE_2, 0));
|
||||||
|
|
||||||
|
// GT: Value > 0; set flag bit 2
|
||||||
|
TEST(64, R(cr_val), R(cr_val));
|
||||||
|
SETcc(CC_G, R(tmp));
|
||||||
|
LEA(32, dst, MComplex(dst, tmp, SCALE_4, 0));
|
||||||
|
|
||||||
|
// SO: Bit 61 set; set flag bit 0
|
||||||
|
// LT: Bit 62 set; set flag bit 3
|
||||||
|
SHR(64, R(cr_val), Imm8(61));
|
||||||
|
OR(32, R(dst), MScaled(cr_val, SCALE_4, (u32)(u64)m_flagTable));
|
||||||
|
}
|
||||||
|
RET();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Safe + Fast Quantizers, originally from JITIL by magumagu
|
||||||
|
// Clamp bounds used as memory operands by the quantized store routines below.
// m_65535 holds four lanes because it is also used with the packed MINPS instruction.
static const float GC_ALIGNED16(m_65535[4]) = {65535.0f, 65535.0f, 65535.0f, 65535.0f};
|
||||||
|
// Upper and lower bounds for signed 16-bit single stores.
static const float GC_ALIGNED16(m_32767) = 32767.0f;
|
||||||
|
static const float GC_ALIGNED16(m_m32768) = -32768.0f;
|
||||||
|
// Upper bound for unsigned 8-bit single stores.
static const float GC_ALIGNED16(m_255) = 255.0f;
|
||||||
|
// Upper and lower bounds for signed 8-bit single stores.
static const float GC_ALIGNED16(m_127) = 127.0f;
|
||||||
|
static const float GC_ALIGNED16(m_m128) = -128.0f;
|
||||||
|
|
||||||
|
#define QUANTIZE_OVERFLOW_SAFE
|
||||||
|
|
||||||
|
// according to Intel Docs CVTPS2DQ writes 0x80000000 if the source floating point value is out of int32 range
|
||||||
|
// while it's OK for large negatives, it isn't for positives
|
||||||
|
// I don't know whether the overflow actually happens in any games
|
||||||
|
// but it potentially can cause problems, so we need some clamping
|
||||||
|
|
||||||
|
// See comment in header for in/outs.
|
||||||
|
void CommonAsmRoutines::GenQuantizedStores()
|
||||||
|
{
|
||||||
|
const u8* storePairedIllegal = AlignCode4();
|
||||||
|
UD2();
|
||||||
|
|
||||||
|
const u8* storePairedFloat = AlignCode4();
|
||||||
|
if (cpu_info.bSSSE3)
|
||||||
|
{
|
||||||
|
PSHUFB(XMM0, M((void *)pbswapShuffle2x4));
|
||||||
|
MOVQ_xmm(R(RSCRATCH), XMM0);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
MOVQ_xmm(R(RSCRATCH), XMM0);
|
||||||
|
ROL(64, R(RSCRATCH), Imm8(32));
|
||||||
|
BSWAP(64, RSCRATCH);
|
||||||
|
}
|
||||||
|
SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 64, 0, QUANTIZED_REGS_TO_SAVE, SAFE_LOADSTORE_NO_SWAP | SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);
|
||||||
|
|
||||||
|
RET();
|
||||||
|
|
||||||
|
const u8* storePairedU8 = AlignCode4();
|
||||||
|
SHR(32, R(RSCRATCH2), Imm8(5));
|
||||||
|
MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
|
||||||
|
MULPS(XMM0, R(XMM1));
|
||||||
|
#ifdef QUANTIZE_OVERFLOW_SAFE
|
||||||
|
MINPS(XMM0, M(m_65535));
|
||||||
|
#endif
|
||||||
|
CVTTPS2DQ(XMM0, R(XMM0));
|
||||||
|
PACKSSDW(XMM0, R(XMM0));
|
||||||
|
PACKUSWB(XMM0, R(XMM0));
|
||||||
|
MOVD_xmm(R(RSCRATCH), XMM0);
|
||||||
|
SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 16, 0, QUANTIZED_REGS_TO_SAVE, SAFE_LOADSTORE_NO_SWAP | SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);
|
||||||
|
|
||||||
|
RET();
|
||||||
|
|
||||||
|
const u8* storePairedS8 = AlignCode4();
|
||||||
|
SHR(32, R(RSCRATCH2), Imm8(5));
|
||||||
|
MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
|
||||||
|
MULPS(XMM0, R(XMM1));
|
||||||
|
#ifdef QUANTIZE_OVERFLOW_SAFE
|
||||||
|
MINPS(XMM0, M(m_65535));
|
||||||
|
#endif
|
||||||
|
CVTTPS2DQ(XMM0, R(XMM0));
|
||||||
|
PACKSSDW(XMM0, R(XMM0));
|
||||||
|
PACKSSWB(XMM0, R(XMM0));
|
||||||
|
MOVD_xmm(R(RSCRATCH), XMM0);
|
||||||
|
|
||||||
|
SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 16, 0, QUANTIZED_REGS_TO_SAVE, SAFE_LOADSTORE_NO_SWAP | SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);
|
||||||
|
|
||||||
|
RET();
|
||||||
|
|
||||||
|
const u8* storePairedU16 = AlignCode4();
|
||||||
|
SHR(32, R(RSCRATCH2), Imm8(5));
|
||||||
|
MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
|
||||||
|
MULPS(XMM0, R(XMM1));
|
||||||
|
|
||||||
|
if (cpu_info.bSSE4_1)
|
||||||
|
{
|
||||||
|
#ifdef QUANTIZE_OVERFLOW_SAFE
|
||||||
|
MINPS(XMM0, M(m_65535));
|
||||||
|
#endif
|
||||||
|
CVTTPS2DQ(XMM0, R(XMM0));
|
||||||
|
PACKUSDW(XMM0, R(XMM0));
|
||||||
|
MOVD_xmm(R(RSCRATCH), XMM0);
|
||||||
|
BSWAP(32, RSCRATCH);
|
||||||
|
ROL(32, R(RSCRATCH), Imm8(16));
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
XORPS(XMM1, R(XMM1));
|
||||||
|
MAXPS(XMM0, R(XMM1));
|
||||||
|
MINPS(XMM0, M(m_65535));
|
||||||
|
|
||||||
|
CVTTPS2DQ(XMM0, R(XMM0));
|
||||||
|
PSHUFLW(XMM0, R(XMM0), 2); // AABBCCDD -> CCAA____
|
||||||
|
MOVD_xmm(R(RSCRATCH), XMM0);
|
||||||
|
BSWAP(32, RSCRATCH);
|
||||||
|
}
|
||||||
|
|
||||||
|
SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 32, 0, QUANTIZED_REGS_TO_SAVE, SAFE_LOADSTORE_NO_SWAP | SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);
|
||||||
|
|
||||||
|
RET();
|
||||||
|
|
||||||
|
const u8* storePairedS16 = AlignCode4();
|
||||||
|
SHR(32, R(RSCRATCH2), Imm8(5));
|
||||||
|
MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
|
||||||
|
MULPS(XMM0, R(XMM1));
|
||||||
|
#ifdef QUANTIZE_OVERFLOW_SAFE
|
||||||
|
MINPS(XMM0, M(m_65535));
|
||||||
|
#endif
|
||||||
|
CVTTPS2DQ(XMM0, R(XMM0));
|
||||||
|
PACKSSDW(XMM0, R(XMM0));
|
||||||
|
MOVD_xmm(R(RSCRATCH), XMM0);
|
||||||
|
BSWAP(32, RSCRATCH);
|
||||||
|
ROL(32, R(RSCRATCH), Imm8(16));
|
||||||
|
SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 32, 0, QUANTIZED_REGS_TO_SAVE, SAFE_LOADSTORE_NO_SWAP | SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);
|
||||||
|
|
||||||
|
RET();
|
||||||
|
|
||||||
|
pairedStoreQuantized = reinterpret_cast<const u8**>(const_cast<u8*>(AlignCode16()));
|
||||||
|
ReserveCodeSpace(8 * sizeof(u8*));
|
||||||
|
|
||||||
|
pairedStoreQuantized[0] = storePairedFloat;
|
||||||
|
pairedStoreQuantized[1] = storePairedIllegal;
|
||||||
|
pairedStoreQuantized[2] = storePairedIllegal;
|
||||||
|
pairedStoreQuantized[3] = storePairedIllegal;
|
||||||
|
pairedStoreQuantized[4] = storePairedU8;
|
||||||
|
pairedStoreQuantized[5] = storePairedU16;
|
||||||
|
pairedStoreQuantized[6] = storePairedS8;
|
||||||
|
pairedStoreQuantized[7] = storePairedS16;
|
||||||
|
}
|
||||||
|
|
||||||
|
// See comment in header for in/outs.
|
||||||
|
void CommonAsmRoutines::GenQuantizedSingleStores()
|
||||||
|
{
|
||||||
|
const u8* storeSingleIllegal = AlignCode4();
|
||||||
|
UD2();
|
||||||
|
|
||||||
|
// Easy!
|
||||||
|
const u8* storeSingleFloat = AlignCode4();
|
||||||
|
MOVD_xmm(R(RSCRATCH), XMM0);
|
||||||
|
SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 32, 0, QUANTIZED_REGS_TO_SAVE, SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);
|
||||||
|
RET();
|
||||||
|
|
||||||
|
const u8* storeSingleU8 = AlignCode4(); // Used by MKWii
|
||||||
|
SHR(32, R(RSCRATCH2), Imm8(5));
|
||||||
|
MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
|
||||||
|
XORPS(XMM1, R(XMM1));
|
||||||
|
MAXSS(XMM0, R(XMM1));
|
||||||
|
MINSS(XMM0, M(&m_255));
|
||||||
|
CVTTSS2SI(RSCRATCH, R(XMM0));
|
||||||
|
SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 8, 0, QUANTIZED_REGS_TO_SAVE, SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);
|
||||||
|
RET();
|
||||||
|
|
||||||
|
const u8* storeSingleS8 = AlignCode4();
|
||||||
|
SHR(32, R(RSCRATCH2), Imm8(5));
|
||||||
|
MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
|
||||||
|
MAXSS(XMM0, M(&m_m128));
|
||||||
|
MINSS(XMM0, M(&m_127));
|
||||||
|
CVTTSS2SI(RSCRATCH, R(XMM0));
|
||||||
|
SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 8, 0, QUANTIZED_REGS_TO_SAVE, SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);
|
||||||
|
RET();
|
||||||
|
|
||||||
|
const u8* storeSingleU16 = AlignCode4(); // Used by MKWii
|
||||||
|
SHR(32, R(RSCRATCH2), Imm8(5));
|
||||||
|
MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
|
||||||
|
XORPS(XMM1, R(XMM1));
|
||||||
|
MAXSS(XMM0, R(XMM1));
|
||||||
|
MINSS(XMM0, M(m_65535));
|
||||||
|
CVTTSS2SI(RSCRATCH, R(XMM0));
|
||||||
|
SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 16, 0, QUANTIZED_REGS_TO_SAVE, SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);
|
||||||
|
RET();
|
||||||
|
|
||||||
|
const u8* storeSingleS16 = AlignCode4();
|
||||||
|
SHR(32, R(RSCRATCH2), Imm8(5));
|
||||||
|
MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
|
||||||
|
MAXSS(XMM0, M(&m_m32768));
|
||||||
|
MINSS(XMM0, M(&m_32767));
|
||||||
|
CVTTSS2SI(RSCRATCH, R(XMM0));
|
||||||
|
SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 16, 0, QUANTIZED_REGS_TO_SAVE, SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);
|
||||||
|
RET();
|
||||||
|
|
||||||
|
singleStoreQuantized = reinterpret_cast<const u8**>(const_cast<u8*>(AlignCode16()));
|
||||||
|
ReserveCodeSpace(8 * sizeof(u8*));
|
||||||
|
|
||||||
|
singleStoreQuantized[0] = storeSingleFloat;
|
||||||
|
singleStoreQuantized[1] = storeSingleIllegal;
|
||||||
|
singleStoreQuantized[2] = storeSingleIllegal;
|
||||||
|
singleStoreQuantized[3] = storeSingleIllegal;
|
||||||
|
singleStoreQuantized[4] = storeSingleU8;
|
||||||
|
singleStoreQuantized[5] = storeSingleU16;
|
||||||
|
singleStoreQuantized[6] = storeSingleS8;
|
||||||
|
singleStoreQuantized[7] = storeSingleS16;
|
||||||
|
}
|
||||||
|
|
||||||
|
void CommonAsmRoutines::GenQuantizedLoads()
|
||||||
|
{
|
||||||
|
const u8* loadPairedIllegal = AlignCode4();
|
||||||
|
UD2();
|
||||||
|
|
||||||
|
// FIXME? This code (in non-MMU mode) assumes all accesses are directly to RAM, i.e.
|
||||||
|
// don't need hardware access handling. This will definitely crash if paired loads occur
|
||||||
|
// from non-RAM areas, but as far as I know, this never happens. I don't know if this is
|
||||||
|
// for a good reason, or merely because no game does this.
|
||||||
|
// If we find something that actually does do this, maybe this should be changed. How
|
||||||
|
// much of a performance hit would it be?
|
||||||
|
const u8* loadPairedFloatTwo = AlignCode4();
|
||||||
|
if (jit->js.memcheck)
|
||||||
|
{
|
||||||
|
SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 64, 0, QUANTIZED_REGS_TO_SAVE, false, SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG);
|
||||||
|
ROL(64, R(RSCRATCH_EXTRA), Imm8(32));
|
||||||
|
MOVQ_xmm(XMM0, R(RSCRATCH_EXTRA));
|
||||||
|
}
|
||||||
|
else if (cpu_info.bSSSE3)
|
||||||
|
{
|
||||||
|
MOVQ_xmm(XMM0, MComplex(RMEM, RSCRATCH_EXTRA, 1, 0));
|
||||||
|
PSHUFB(XMM0, M(pbswapShuffle2x4));
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
LoadAndSwap(64, RSCRATCH_EXTRA, MComplex(RMEM, RSCRATCH_EXTRA, 1, 0));
|
||||||
|
ROL(64, R(RSCRATCH_EXTRA), Imm8(32));
|
||||||
|
MOVQ_xmm(XMM0, R(RSCRATCH_EXTRA));
|
||||||
|
}
|
||||||
|
RET();
|
||||||
|
|
||||||
|
const u8* loadPairedFloatOne = AlignCode4();
|
||||||
|
if (jit->js.memcheck)
|
||||||
|
{
|
||||||
|
SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 32, 0, QUANTIZED_REGS_TO_SAVE, false, SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG);
|
||||||
|
MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
|
||||||
|
UNPCKLPS(XMM0, M(m_one));
|
||||||
|
}
|
||||||
|
else if (cpu_info.bSSSE3)
|
||||||
|
{
|
||||||
|
MOVD_xmm(XMM0, MComplex(RMEM, RSCRATCH_EXTRA, 1, 0));
|
||||||
|
PSHUFB(XMM0, M(pbswapShuffle1x4));
|
||||||
|
UNPCKLPS(XMM0, M(m_one));
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
LoadAndSwap(32, RSCRATCH_EXTRA, MComplex(RMEM, RSCRATCH_EXTRA, 1, 0));
|
||||||
|
MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
|
||||||
|
UNPCKLPS(XMM0, M(m_one));
|
||||||
|
}
|
||||||
|
RET();
|
||||||
|
|
||||||
|
const u8* loadPairedU8Two = AlignCode4();
|
||||||
|
if (jit->js.memcheck)
|
||||||
|
{
|
||||||
|
// TODO: Support not swapping in safeLoadToReg to avoid bswapping twice
|
||||||
|
SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 16, 0, QUANTIZED_REGS_TO_SAVE_LOAD, false, SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG);
|
||||||
|
ROR(16, R(RSCRATCH_EXTRA), Imm8(8));
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
UnsafeLoadRegToRegNoSwap(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 16, 0);
|
||||||
|
}
|
||||||
|
MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
|
||||||
|
if (cpu_info.bSSE4_1)
|
||||||
|
{
|
||||||
|
PMOVZXBD(XMM0, R(XMM0));
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
PXOR(XMM1, R(XMM1));
|
||||||
|
PUNPCKLBW(XMM0, R(XMM1));
|
||||||
|
PUNPCKLWD(XMM0, R(XMM1));
|
||||||
|
}
|
||||||
|
CVTDQ2PS(XMM0, R(XMM0));
|
||||||
|
SHR(32, R(RSCRATCH2), Imm8(5));
|
||||||
|
MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
|
||||||
|
MULPS(XMM0, R(XMM1));
|
||||||
|
RET();
|
||||||
|
|
||||||
|
const u8* loadPairedU8One = AlignCode4();
|
||||||
|
if (jit->js.memcheck)
|
||||||
|
SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 8, 0, QUANTIZED_REGS_TO_SAVE_LOAD, false, SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG);
|
||||||
|
else
|
||||||
|
UnsafeLoadRegToRegNoSwap(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 8, 0); // RSCRATCH_EXTRA = 0x000000xx
|
||||||
|
CVTSI2SS(XMM0, R(RSCRATCH_EXTRA));
|
||||||
|
SHR(32, R(RSCRATCH2), Imm8(5));
|
||||||
|
MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
|
||||||
|
UNPCKLPS(XMM0, M(m_one));
|
||||||
|
RET();
|
||||||
|
|
||||||
|
const u8* loadPairedS8Two = AlignCode4();
|
||||||
|
if (jit->js.memcheck)
|
||||||
|
{
|
||||||
|
// TODO: Support not swapping in safeLoadToReg to avoid bswapping twice
|
||||||
|
SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 16, 0, QUANTIZED_REGS_TO_SAVE_LOAD, false, SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG);
|
||||||
|
ROR(16, R(RSCRATCH_EXTRA), Imm8(8));
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
UnsafeLoadRegToRegNoSwap(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 16, 0);
|
||||||
|
}
|
||||||
|
MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
|
||||||
|
if (cpu_info.bSSE4_1)
|
||||||
|
{
|
||||||
|
PMOVSXBD(XMM0, R(XMM0));
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
PUNPCKLBW(XMM0, R(XMM0));
|
||||||
|
PUNPCKLWD(XMM0, R(XMM0));
|
||||||
|
PSRAD(XMM0, 24);
|
||||||
|
}
|
||||||
|
CVTDQ2PS(XMM0, R(XMM0));
|
||||||
|
SHR(32, R(RSCRATCH2), Imm8(5));
|
||||||
|
MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
|
||||||
|
MULPS(XMM0, R(XMM1));
|
||||||
|
RET();
|
||||||
|
|
||||||
|
const u8* loadPairedS8One = AlignCode4();
|
||||||
|
if (jit->js.memcheck)
|
||||||
|
SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 8, 0, QUANTIZED_REGS_TO_SAVE_LOAD, true, SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG);
|
||||||
|
else
|
||||||
|
UnsafeLoadRegToRegNoSwap(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 8, 0, true);
|
||||||
|
CVTSI2SS(XMM0, R(RSCRATCH_EXTRA));
|
||||||
|
SHR(32, R(RSCRATCH2), Imm8(5));
|
||||||
|
MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
|
||||||
|
UNPCKLPS(XMM0, M(m_one));
|
||||||
|
RET();
|
||||||
|
|
||||||
|
const u8* loadPairedU16Two = AlignCode4();
|
||||||
|
// TODO: Support not swapping in (un)safeLoadToReg to avoid bswapping twice
|
||||||
|
if (jit->js.memcheck)
|
||||||
|
SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 32, 0, QUANTIZED_REGS_TO_SAVE_LOAD, false, SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG);
|
||||||
|
else
|
||||||
|
UnsafeLoadRegToReg(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 32, 0, false);
|
||||||
|
ROL(32, R(RSCRATCH_EXTRA), Imm8(16));
|
||||||
|
MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
|
||||||
|
if (cpu_info.bSSE4_1)
|
||||||
|
{
|
||||||
|
PMOVZXWD(XMM0, R(XMM0));
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
PXOR(XMM1, R(XMM1));
|
||||||
|
PUNPCKLWD(XMM0, R(XMM1));
|
||||||
|
}
|
||||||
|
CVTDQ2PS(XMM0, R(XMM0));
|
||||||
|
SHR(32, R(RSCRATCH2), Imm8(5));
|
||||||
|
MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
|
||||||
|
MULPS(XMM0, R(XMM1));
|
||||||
|
RET();
|
||||||
|
|
||||||
|
const u8* loadPairedU16One = AlignCode4();
|
||||||
|
if (jit->js.memcheck)
|
||||||
|
SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 16, 0, QUANTIZED_REGS_TO_SAVE_LOAD, false, SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG);
|
||||||
|
else
|
||||||
|
UnsafeLoadRegToReg(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 16, 0, false);
|
||||||
|
CVTSI2SS(XMM0, R(RSCRATCH_EXTRA));
|
||||||
|
SHR(32, R(RSCRATCH2), Imm8(5));
|
||||||
|
MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
|
||||||
|
UNPCKLPS(XMM0, M(m_one));
|
||||||
|
RET();
|
||||||
|
|
||||||
|
const u8* loadPairedS16Two = AlignCode4();
|
||||||
|
if (jit->js.memcheck)
|
||||||
|
SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 32, 0, QUANTIZED_REGS_TO_SAVE_LOAD, false, SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG);
|
||||||
|
else
|
||||||
|
UnsafeLoadRegToReg(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 32, 0, false);
|
||||||
|
ROL(32, R(RSCRATCH_EXTRA), Imm8(16));
|
||||||
|
MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
|
||||||
|
if (cpu_info.bSSE4_1)
|
||||||
|
{
|
||||||
|
PMOVSXWD(XMM0, R(XMM0));
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
PUNPCKLWD(XMM0, R(XMM0));
|
||||||
|
PSRAD(XMM0, 16);
|
||||||
|
}
|
||||||
|
CVTDQ2PS(XMM0, R(XMM0));
|
||||||
|
SHR(32, R(RSCRATCH2), Imm8(5));
|
||||||
|
MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
|
||||||
|
MULPS(XMM0, R(XMM1));
|
||||||
|
RET();
|
||||||
|
|
||||||
|
const u8* loadPairedS16One = AlignCode4();
|
||||||
|
if (jit->js.memcheck)
|
||||||
|
SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 16, 0, QUANTIZED_REGS_TO_SAVE_LOAD, true, SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG);
|
||||||
|
else
|
||||||
|
UnsafeLoadRegToReg(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 16, 0, true);
|
||||||
|
CVTSI2SS(XMM0, R(RSCRATCH_EXTRA));
|
||||||
|
SHR(32, R(RSCRATCH2), Imm8(5));
|
||||||
|
MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
|
||||||
|
UNPCKLPS(XMM0, M(m_one));
|
||||||
|
RET();
|
||||||
|
|
||||||
|
pairedLoadQuantized = reinterpret_cast<const u8**>(const_cast<u8*>(AlignCode16()));
|
||||||
|
ReserveCodeSpace(16 * sizeof(u8*));
|
||||||
|
|
||||||
|
pairedLoadQuantized[0] = loadPairedFloatTwo;
|
||||||
|
pairedLoadQuantized[1] = loadPairedIllegal;
|
||||||
|
pairedLoadQuantized[2] = loadPairedIllegal;
|
||||||
|
pairedLoadQuantized[3] = loadPairedIllegal;
|
||||||
|
pairedLoadQuantized[4] = loadPairedU8Two;
|
||||||
|
pairedLoadQuantized[5] = loadPairedU16Two;
|
||||||
|
pairedLoadQuantized[6] = loadPairedS8Two;
|
||||||
|
pairedLoadQuantized[7] = loadPairedS16Two;
|
||||||
|
|
||||||
|
pairedLoadQuantized[8] = loadPairedFloatOne;
|
||||||
|
pairedLoadQuantized[9] = loadPairedIllegal;
|
||||||
|
pairedLoadQuantized[10] = loadPairedIllegal;
|
||||||
|
pairedLoadQuantized[11] = loadPairedIllegal;
|
||||||
|
pairedLoadQuantized[12] = loadPairedU8One;
|
||||||
|
pairedLoadQuantized[13] = loadPairedU16One;
|
||||||
|
pairedLoadQuantized[14] = loadPairedS8One;
|
||||||
|
pairedLoadQuantized[15] = loadPairedS16One;
|
||||||
|
}
|
|
@ -0,0 +1,22 @@
|
||||||
|
// Copyright 2013 Dolphin Emulator Project
|
||||||
|
// Licensed under GPLv2
|
||||||
|
// Refer to the license.txt file included.
|
||||||
|
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include "Core/PowerPC/JitCommon/Jit_Util.h"
|
||||||
|
#include "Core/PowerPC/JitCommon/JitAsmCommon.h"
|
||||||
|
|
||||||
|
class CommonAsmRoutines : public CommonAsmRoutinesBase, public EmuCodeBlock
|
||||||
|
{
|
||||||
|
protected:
|
||||||
|
void GenQuantizedLoads();
|
||||||
|
void GenQuantizedStores();
|
||||||
|
void GenQuantizedSingleStores();
|
||||||
|
|
||||||
|
public:
|
||||||
|
void GenFifoWrite(int size);
|
||||||
|
void GenFrsqrte();
|
||||||
|
void GenFres();
|
||||||
|
void GenMfcr();
|
||||||
|
};
|
|
@ -2,194 +2,7 @@
|
||||||
// Licensed under GPLv2
|
// Licensed under GPLv2
|
||||||
// Refer to the license.txt file included.
|
// Refer to the license.txt file included.
|
||||||
|
|
||||||
#include "Common/CPUDetect.h"
|
|
||||||
#include "Common/MathUtil.h"
|
|
||||||
#include "Common/MemoryUtil.h"
|
|
||||||
|
|
||||||
#include "Core/PowerPC/JitCommon/JitAsmCommon.h"
|
#include "Core/PowerPC/JitCommon/JitAsmCommon.h"
|
||||||
#include "Core/PowerPC/JitCommon/JitBase.h"
|
|
||||||
|
|
||||||
// Register mask saved around calls made from the routines in this file: every
// caller-saved register except the three scratch GPRs and XMM0/XMM1.
// NOTE(review): the "+16" presumably selects the XMM half of the BitSet32 -
// confirm against the ABI push/pop helpers.
#define QUANTIZED_REGS_TO_SAVE \
	(ABI_ALL_CALLER_SAVED & ~BitSet32 { \
		RSCRATCH, RSCRATCH2, RSCRATCH_EXTRA, XMM0+16, XMM1+16 \
	})

// Same mask, but RSCRATCH2 is preserved as well (the quantized loads still
// read RSCRATCH2 after the memory access).
#define QUANTIZED_REGS_TO_SAVE_LOAD (QUANTIZED_REGS_TO_SAVE | BitSet32 { RSCRATCH2 })
|
|
||||||
|
|
||||||
using namespace Gen;
|
|
||||||
|
|
||||||
void CommonAsmRoutines::GenFifoWrite(int size)
|
|
||||||
{
|
|
||||||
// Assume value in RSCRATCH
|
|
||||||
u32 gather_pipe = (u32)(u64)GPFifo::m_gatherPipe;
|
|
||||||
_assert_msg_(DYNA_REC, gather_pipe <= 0x7FFFFFFF, "Gather pipe not in low 2GB of memory!");
|
|
||||||
MOV(32, R(RSCRATCH2), M(&GPFifo::m_gatherPipeCount));
|
|
||||||
SwapAndStore(size, MDisp(RSCRATCH2, gather_pipe), RSCRATCH);
|
|
||||||
ADD(32, R(RSCRATCH2), Imm8(size >> 3));
|
|
||||||
MOV(32, M(&GPFifo::m_gatherPipeCount), R(RSCRATCH2));
|
|
||||||
RET();
|
|
||||||
}
|
|
||||||
|
|
||||||
void CommonAsmRoutines::GenFrsqrte()
|
|
||||||
{
|
|
||||||
// Assume input in XMM0.
|
|
||||||
// This function clobbers all three RSCRATCH.
|
|
||||||
MOVQ_xmm(R(RSCRATCH), XMM0);
|
|
||||||
|
|
||||||
// Negative and zero inputs set an exception and take the complex path.
|
|
||||||
TEST(64, R(RSCRATCH), R(RSCRATCH));
|
|
||||||
FixupBranch zero = J_CC(CC_Z, true);
|
|
||||||
FixupBranch negative = J_CC(CC_S, true);
|
|
||||||
MOV(64, R(RSCRATCH_EXTRA), R(RSCRATCH));
|
|
||||||
SHR(64, R(RSCRATCH_EXTRA), Imm8(52));
|
|
||||||
|
|
||||||
// Zero and max exponents (non-normal floats) take the complex path.
|
|
||||||
FixupBranch complex1 = J_CC(CC_Z, true);
|
|
||||||
CMP(32, R(RSCRATCH_EXTRA), Imm32(0x7FF));
|
|
||||||
FixupBranch complex2 = J_CC(CC_E, true);
|
|
||||||
|
|
||||||
SUB(32, R(RSCRATCH_EXTRA), Imm32(0x3FD));
|
|
||||||
SAR(32, R(RSCRATCH_EXTRA), Imm8(1));
|
|
||||||
MOV(32, R(RSCRATCH2), Imm32(0x3FF));
|
|
||||||
SUB(32, R(RSCRATCH2), R(RSCRATCH_EXTRA));
|
|
||||||
SHL(64, R(RSCRATCH2), Imm8(52)); // exponent = ((0x3FFLL << 52) - ((exponent - (0x3FELL << 52)) / 2)) & (0x7FFLL << 52);
|
|
||||||
|
|
||||||
MOV(64, R(RSCRATCH_EXTRA), R(RSCRATCH));
|
|
||||||
SHR(64, R(RSCRATCH_EXTRA), Imm8(48));
|
|
||||||
AND(32, R(RSCRATCH_EXTRA), Imm8(0x1F));
|
|
||||||
XOR(32, R(RSCRATCH_EXTRA), Imm8(0x10)); // int index = i / 2048 + (odd_exponent ? 16 : 0);
|
|
||||||
|
|
||||||
SHR(64, R(RSCRATCH), Imm8(37));
|
|
||||||
AND(32, R(RSCRATCH), Imm32(0x7FF));
|
|
||||||
IMUL(32, RSCRATCH, MScaled(RSCRATCH_EXTRA, SCALE_4, (u32)(u64)MathUtil::frsqrte_expected_dec));
|
|
||||||
MOV(32, R(RSCRATCH_EXTRA), MScaled(RSCRATCH_EXTRA, SCALE_4, (u32)(u64)MathUtil::frsqrte_expected_base));
|
|
||||||
SUB(32, R(RSCRATCH_EXTRA), R(RSCRATCH));
|
|
||||||
SHL(64, R(RSCRATCH_EXTRA), Imm8(26));
|
|
||||||
OR(64, R(RSCRATCH2), R(RSCRATCH_EXTRA)); // vali |= (s64)(frsqrte_expected_base[index] - frsqrte_expected_dec[index] * (i % 2048)) << 26;
|
|
||||||
MOVQ_xmm(XMM0, R(RSCRATCH2));
|
|
||||||
RET();
|
|
||||||
|
|
||||||
// Exception flags for zero input.
|
|
||||||
SetJumpTarget(zero);
|
|
||||||
TEST(32, PPCSTATE(fpscr), Imm32(FPSCR_ZX));
|
|
||||||
FixupBranch skip_set_fx1 = J_CC(CC_NZ);
|
|
||||||
OR(32, PPCSTATE(fpscr), Imm32(FPSCR_FX | FPSCR_ZX));
|
|
||||||
FixupBranch complex3 = J();
|
|
||||||
|
|
||||||
// Exception flags for negative input.
|
|
||||||
SetJumpTarget(negative);
|
|
||||||
TEST(32, PPCSTATE(fpscr), Imm32(FPSCR_VXSQRT));
|
|
||||||
FixupBranch skip_set_fx2 = J_CC(CC_NZ);
|
|
||||||
OR(32, PPCSTATE(fpscr), Imm32(FPSCR_FX | FPSCR_VXSQRT));
|
|
||||||
|
|
||||||
SetJumpTarget(skip_set_fx1);
|
|
||||||
SetJumpTarget(skip_set_fx2);
|
|
||||||
SetJumpTarget(complex1);
|
|
||||||
SetJumpTarget(complex2);
|
|
||||||
SetJumpTarget(complex3);
|
|
||||||
ABI_PushRegistersAndAdjustStack(QUANTIZED_REGS_TO_SAVE, 8);
|
|
||||||
ABI_CallFunction((void *)&MathUtil::ApproximateReciprocalSquareRoot);
|
|
||||||
ABI_PopRegistersAndAdjustStack(QUANTIZED_REGS_TO_SAVE, 8);
|
|
||||||
RET();
|
|
||||||
}
|
|
||||||
|
|
||||||
void CommonAsmRoutines::GenFres()
|
|
||||||
{
|
|
||||||
// Assume input in XMM0.
|
|
||||||
// This function clobbers all three RSCRATCH.
|
|
||||||
MOVQ_xmm(R(RSCRATCH), XMM0);
|
|
||||||
|
|
||||||
// Zero inputs set an exception and take the complex path.
|
|
||||||
TEST(64, R(RSCRATCH), R(RSCRATCH));
|
|
||||||
FixupBranch zero = J_CC(CC_Z);
|
|
||||||
|
|
||||||
MOV(64, R(RSCRATCH_EXTRA), R(RSCRATCH));
|
|
||||||
SHR(64, R(RSCRATCH_EXTRA), Imm8(52));
|
|
||||||
MOV(32, R(RSCRATCH2), R(RSCRATCH_EXTRA));
|
|
||||||
AND(32, R(RSCRATCH_EXTRA), Imm32(0x7FF)); // exp
|
|
||||||
AND(32, R(RSCRATCH2), Imm32(0x800)); // sign
|
|
||||||
CMP(32, R(RSCRATCH_EXTRA), Imm32(895));
|
|
||||||
// Take the complex path for very large/small exponents.
|
|
||||||
FixupBranch complex1 = J_CC(CC_L);
|
|
||||||
CMP(32, R(RSCRATCH_EXTRA), Imm32(1149));
|
|
||||||
FixupBranch complex2 = J_CC(CC_GE);
|
|
||||||
|
|
||||||
SUB(32, R(RSCRATCH_EXTRA), Imm32(0x7FD));
|
|
||||||
NEG(32, R(RSCRATCH_EXTRA));
|
|
||||||
OR(32, R(RSCRATCH_EXTRA), R(RSCRATCH2));
|
|
||||||
SHL(64, R(RSCRATCH_EXTRA), Imm8(52)); // vali = sign | exponent
|
|
||||||
|
|
||||||
MOV(64, R(RSCRATCH2), R(RSCRATCH));
|
|
||||||
SHR(64, R(RSCRATCH), Imm8(37));
|
|
||||||
SHR(64, R(RSCRATCH2), Imm8(47));
|
|
||||||
AND(32, R(RSCRATCH), Imm32(0x3FF)); // i % 1024
|
|
||||||
AND(32, R(RSCRATCH2), Imm8(0x1F)); // i / 1024
|
|
||||||
|
|
||||||
IMUL(32, RSCRATCH, MScaled(RSCRATCH2, SCALE_4, (u32)(u64)MathUtil::fres_expected_dec));
|
|
||||||
ADD(32, R(RSCRATCH), Imm8(1));
|
|
||||||
SHR(32, R(RSCRATCH), Imm8(1));
|
|
||||||
|
|
||||||
MOV(32, R(RSCRATCH2), MScaled(RSCRATCH2, SCALE_4, (u32)(u64)MathUtil::fres_expected_base));
|
|
||||||
SUB(32, R(RSCRATCH2), R(RSCRATCH));
|
|
||||||
SHL(64, R(RSCRATCH2), Imm8(29));
|
|
||||||
OR(64, R(RSCRATCH2), R(RSCRATCH_EXTRA)); // vali |= (s64)(fres_expected_base[i / 1024] - (fres_expected_dec[i / 1024] * (i % 1024) + 1) / 2) << 29
|
|
||||||
MOVQ_xmm(XMM0, R(RSCRATCH2));
|
|
||||||
RET();
|
|
||||||
|
|
||||||
// Exception flags for zero input.
|
|
||||||
SetJumpTarget(zero);
|
|
||||||
TEST(32, PPCSTATE(fpscr), Imm32(FPSCR_ZX));
|
|
||||||
FixupBranch skip_set_fx1 = J_CC(CC_NZ);
|
|
||||||
OR(32, PPCSTATE(fpscr), Imm32(FPSCR_FX | FPSCR_ZX));
|
|
||||||
SetJumpTarget(skip_set_fx1);
|
|
||||||
|
|
||||||
SetJumpTarget(complex1);
|
|
||||||
SetJumpTarget(complex2);
|
|
||||||
ABI_PushRegistersAndAdjustStack(QUANTIZED_REGS_TO_SAVE, 8);
|
|
||||||
ABI_CallFunction((void *)&MathUtil::ApproximateReciprocal);
|
|
||||||
ABI_PopRegistersAndAdjustStack(QUANTIZED_REGS_TO_SAVE, 8);
|
|
||||||
RET();
|
|
||||||
}
|
|
||||||
|
|
||||||
void CommonAsmRoutines::GenMfcr()
|
|
||||||
{
|
|
||||||
// Input: none
|
|
||||||
// Output: RSCRATCH
|
|
||||||
// This function clobbers all three RSCRATCH.
|
|
||||||
X64Reg dst = RSCRATCH;
|
|
||||||
X64Reg tmp = RSCRATCH2;
|
|
||||||
X64Reg cr_val = RSCRATCH_EXTRA;
|
|
||||||
XOR(32, R(dst), R(dst));
|
|
||||||
// we only need to zero the high bits of tmp once
|
|
||||||
XOR(32, R(tmp), R(tmp));
|
|
||||||
for (int i = 0; i < 8; i++)
|
|
||||||
{
|
|
||||||
static const u32 m_flagTable[8] = { 0x0, 0x1, 0x8, 0x9, 0x0, 0x1, 0x8, 0x9 };
|
|
||||||
if (i != 0)
|
|
||||||
SHL(32, R(dst), Imm8(4));
|
|
||||||
|
|
||||||
MOV(64, R(cr_val), PPCSTATE(cr_val[i]));
|
|
||||||
|
|
||||||
// EQ: Bits 31-0 == 0; set flag bit 1
|
|
||||||
TEST(32, R(cr_val), R(cr_val));
|
|
||||||
// FIXME: is there a better way to do this without the partial register merging?
|
|
||||||
SETcc(CC_Z, R(tmp));
|
|
||||||
LEA(32, dst, MComplex(dst, tmp, SCALE_2, 0));
|
|
||||||
|
|
||||||
// GT: Value > 0; set flag bit 2
|
|
||||||
TEST(64, R(cr_val), R(cr_val));
|
|
||||||
SETcc(CC_G, R(tmp));
|
|
||||||
LEA(32, dst, MComplex(dst, tmp, SCALE_4, 0));
|
|
||||||
|
|
||||||
// SO: Bit 61 set; set flag bit 0
|
|
||||||
// LT: Bit 62 set; set flag bit 3
|
|
||||||
SHR(64, R(cr_val), Imm8(61));
|
|
||||||
OR(32, R(dst), MScaled(cr_val, SCALE_4, (u32)(u64)m_flagTable));
|
|
||||||
}
|
|
||||||
RET();
|
|
||||||
}
|
|
||||||
|
|
||||||
// Safe + Fast Quantizers, originally from JITIL by magumagu
|
|
||||||
|
|
||||||
const u8 GC_ALIGNED16(pbswapShuffle1x4[16]) = { 3, 2, 1, 0, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
|
const u8 GC_ALIGNED16(pbswapShuffle1x4[16]) = { 3, 2, 1, 0, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
|
||||||
const u8 GC_ALIGNED16(pbswapShuffle2x4[16]) = { 3, 2, 1, 0, 7, 6, 5, 4, 8, 9, 10, 11, 12, 13, 14, 15 };
|
const u8 GC_ALIGNED16(pbswapShuffle2x4[16]) = { 3, 2, 1, 0, 7, 6, 5, 4, 8, 9, 10, 11, 12, 13, 14, 15 };
|
||||||
|
@ -250,414 +63,4 @@ const float GC_ALIGNED16(m_dequantizeTableS[]) =
|
||||||
(1ULL << 4), (1ULL << 4), (1ULL << 3), (1ULL << 3), (1ULL << 2), (1ULL << 2), (1ULL << 1), (1ULL << 1),
|
(1ULL << 4), (1ULL << 4), (1ULL << 3), (1ULL << 3), (1ULL << 2), (1ULL << 2), (1ULL << 1), (1ULL << 1),
|
||||||
};
|
};
|
||||||
|
|
||||||
static const float GC_ALIGNED16(m_65535[4]) = {65535.0f, 65535.0f, 65535.0f, 65535.0f};
|
|
||||||
static const float GC_ALIGNED16(m_32767) = 32767.0f;
|
|
||||||
static const float GC_ALIGNED16(m_m32768) = -32768.0f;
|
|
||||||
static const float GC_ALIGNED16(m_255) = 255.0f;
|
|
||||||
static const float GC_ALIGNED16(m_127) = 127.0f;
|
|
||||||
static const float GC_ALIGNED16(m_m128) = -128.0f;
|
|
||||||
|
|
||||||
const float GC_ALIGNED16(m_one[]) = { 1.0f, 0.0f, 0.0f, 0.0f };
|
const float GC_ALIGNED16(m_one[]) = { 1.0f, 0.0f, 0.0f, 0.0f };
|
||||||
|
|
||||||
#define QUANTIZE_OVERFLOW_SAFE
|
|
||||||
|
|
||||||
// according to Intel Docs CVTPS2DQ writes 0x80000000 if the source floating point value is out of int32 range
|
|
||||||
// while it's OK for large negatives, it isn't for positives
|
|
||||||
// I don't know whether the overflow actually happens in any games
|
|
||||||
// but it potentially can cause problems, so we need some clamping
|
|
||||||
|
|
||||||
// See comment in header for in/outs.
|
|
||||||
void CommonAsmRoutines::GenQuantizedStores()
|
|
||||||
{
|
|
||||||
const u8* storePairedIllegal = AlignCode4();
|
|
||||||
UD2();
|
|
||||||
|
|
||||||
const u8* storePairedFloat = AlignCode4();
|
|
||||||
if (cpu_info.bSSSE3)
|
|
||||||
{
|
|
||||||
PSHUFB(XMM0, M((void *)pbswapShuffle2x4));
|
|
||||||
MOVQ_xmm(R(RSCRATCH), XMM0);
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
MOVQ_xmm(R(RSCRATCH), XMM0);
|
|
||||||
ROL(64, R(RSCRATCH), Imm8(32));
|
|
||||||
BSWAP(64, RSCRATCH);
|
|
||||||
}
|
|
||||||
SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 64, 0, QUANTIZED_REGS_TO_SAVE, SAFE_LOADSTORE_NO_SWAP | SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);
|
|
||||||
|
|
||||||
RET();
|
|
||||||
|
|
||||||
const u8* storePairedU8 = AlignCode4();
|
|
||||||
SHR(32, R(RSCRATCH2), Imm8(5));
|
|
||||||
MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
|
|
||||||
MULPS(XMM0, R(XMM1));
|
|
||||||
#ifdef QUANTIZE_OVERFLOW_SAFE
|
|
||||||
MINPS(XMM0, M(m_65535));
|
|
||||||
#endif
|
|
||||||
CVTTPS2DQ(XMM0, R(XMM0));
|
|
||||||
PACKSSDW(XMM0, R(XMM0));
|
|
||||||
PACKUSWB(XMM0, R(XMM0));
|
|
||||||
MOVD_xmm(R(RSCRATCH), XMM0);
|
|
||||||
SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 16, 0, QUANTIZED_REGS_TO_SAVE, SAFE_LOADSTORE_NO_SWAP | SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);
|
|
||||||
|
|
||||||
RET();
|
|
||||||
|
|
||||||
const u8* storePairedS8 = AlignCode4();
|
|
||||||
SHR(32, R(RSCRATCH2), Imm8(5));
|
|
||||||
MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
|
|
||||||
MULPS(XMM0, R(XMM1));
|
|
||||||
#ifdef QUANTIZE_OVERFLOW_SAFE
|
|
||||||
MINPS(XMM0, M(m_65535));
|
|
||||||
#endif
|
|
||||||
CVTTPS2DQ(XMM0, R(XMM0));
|
|
||||||
PACKSSDW(XMM0, R(XMM0));
|
|
||||||
PACKSSWB(XMM0, R(XMM0));
|
|
||||||
MOVD_xmm(R(RSCRATCH), XMM0);
|
|
||||||
|
|
||||||
SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 16, 0, QUANTIZED_REGS_TO_SAVE, SAFE_LOADSTORE_NO_SWAP | SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);
|
|
||||||
|
|
||||||
RET();
|
|
||||||
|
|
||||||
const u8* storePairedU16 = AlignCode4();
|
|
||||||
SHR(32, R(RSCRATCH2), Imm8(5));
|
|
||||||
MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
|
|
||||||
MULPS(XMM0, R(XMM1));
|
|
||||||
|
|
||||||
if (cpu_info.bSSE4_1)
|
|
||||||
{
|
|
||||||
#ifdef QUANTIZE_OVERFLOW_SAFE
|
|
||||||
MINPS(XMM0, M(m_65535));
|
|
||||||
#endif
|
|
||||||
CVTTPS2DQ(XMM0, R(XMM0));
|
|
||||||
PACKUSDW(XMM0, R(XMM0));
|
|
||||||
MOVD_xmm(R(RSCRATCH), XMM0);
|
|
||||||
BSWAP(32, RSCRATCH);
|
|
||||||
ROL(32, R(RSCRATCH), Imm8(16));
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
XORPS(XMM1, R(XMM1));
|
|
||||||
MAXPS(XMM0, R(XMM1));
|
|
||||||
MINPS(XMM0, M(m_65535));
|
|
||||||
|
|
||||||
CVTTPS2DQ(XMM0, R(XMM0));
|
|
||||||
PSHUFLW(XMM0, R(XMM0), 2); // AABBCCDD -> CCAA____
|
|
||||||
MOVD_xmm(R(RSCRATCH), XMM0);
|
|
||||||
BSWAP(32, RSCRATCH);
|
|
||||||
}
|
|
||||||
|
|
||||||
SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 32, 0, QUANTIZED_REGS_TO_SAVE, SAFE_LOADSTORE_NO_SWAP | SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);
|
|
||||||
|
|
||||||
RET();
|
|
||||||
|
|
||||||
const u8* storePairedS16 = AlignCode4();
|
|
||||||
SHR(32, R(RSCRATCH2), Imm8(5));
|
|
||||||
MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
|
|
||||||
MULPS(XMM0, R(XMM1));
|
|
||||||
#ifdef QUANTIZE_OVERFLOW_SAFE
|
|
||||||
MINPS(XMM0, M(m_65535));
|
|
||||||
#endif
|
|
||||||
CVTTPS2DQ(XMM0, R(XMM0));
|
|
||||||
PACKSSDW(XMM0, R(XMM0));
|
|
||||||
MOVD_xmm(R(RSCRATCH), XMM0);
|
|
||||||
BSWAP(32, RSCRATCH);
|
|
||||||
ROL(32, R(RSCRATCH), Imm8(16));
|
|
||||||
SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 32, 0, QUANTIZED_REGS_TO_SAVE, SAFE_LOADSTORE_NO_SWAP | SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);
|
|
||||||
|
|
||||||
RET();
|
|
||||||
|
|
||||||
pairedStoreQuantized = reinterpret_cast<const u8**>(const_cast<u8*>(AlignCode16()));
|
|
||||||
ReserveCodeSpace(8 * sizeof(u8*));
|
|
||||||
|
|
||||||
pairedStoreQuantized[0] = storePairedFloat;
|
|
||||||
pairedStoreQuantized[1] = storePairedIllegal;
|
|
||||||
pairedStoreQuantized[2] = storePairedIllegal;
|
|
||||||
pairedStoreQuantized[3] = storePairedIllegal;
|
|
||||||
pairedStoreQuantized[4] = storePairedU8;
|
|
||||||
pairedStoreQuantized[5] = storePairedU16;
|
|
||||||
pairedStoreQuantized[6] = storePairedS8;
|
|
||||||
pairedStoreQuantized[7] = storePairedS16;
|
|
||||||
}
|
|
||||||
|
|
||||||
// See comment in header for in/outs.
|
|
||||||
void CommonAsmRoutines::GenQuantizedSingleStores()
|
|
||||||
{
|
|
||||||
const u8* storeSingleIllegal = AlignCode4();
|
|
||||||
UD2();
|
|
||||||
|
|
||||||
// Easy!
|
|
||||||
const u8* storeSingleFloat = AlignCode4();
|
|
||||||
MOVD_xmm(R(RSCRATCH), XMM0);
|
|
||||||
SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 32, 0, QUANTIZED_REGS_TO_SAVE, SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);
|
|
||||||
RET();
|
|
||||||
|
|
||||||
const u8* storeSingleU8 = AlignCode4(); // Used by MKWii
|
|
||||||
SHR(32, R(RSCRATCH2), Imm8(5));
|
|
||||||
MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
|
|
||||||
XORPS(XMM1, R(XMM1));
|
|
||||||
MAXSS(XMM0, R(XMM1));
|
|
||||||
MINSS(XMM0, M(&m_255));
|
|
||||||
CVTTSS2SI(RSCRATCH, R(XMM0));
|
|
||||||
SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 8, 0, QUANTIZED_REGS_TO_SAVE, SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);
|
|
||||||
RET();
|
|
||||||
|
|
||||||
const u8* storeSingleS8 = AlignCode4();
|
|
||||||
SHR(32, R(RSCRATCH2), Imm8(5));
|
|
||||||
MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
|
|
||||||
MAXSS(XMM0, M(&m_m128));
|
|
||||||
MINSS(XMM0, M(&m_127));
|
|
||||||
CVTTSS2SI(RSCRATCH, R(XMM0));
|
|
||||||
SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 8, 0, QUANTIZED_REGS_TO_SAVE, SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);
|
|
||||||
RET();
|
|
||||||
|
|
||||||
const u8* storeSingleU16 = AlignCode4(); // Used by MKWii
|
|
||||||
SHR(32, R(RSCRATCH2), Imm8(5));
|
|
||||||
MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
|
|
||||||
XORPS(XMM1, R(XMM1));
|
|
||||||
MAXSS(XMM0, R(XMM1));
|
|
||||||
MINSS(XMM0, M(m_65535));
|
|
||||||
CVTTSS2SI(RSCRATCH, R(XMM0));
|
|
||||||
SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 16, 0, QUANTIZED_REGS_TO_SAVE, SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);
|
|
||||||
RET();
|
|
||||||
|
|
||||||
const u8* storeSingleS16 = AlignCode4();
|
|
||||||
SHR(32, R(RSCRATCH2), Imm8(5));
|
|
||||||
MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_quantizeTableS));
|
|
||||||
MAXSS(XMM0, M(&m_m32768));
|
|
||||||
MINSS(XMM0, M(&m_32767));
|
|
||||||
CVTTSS2SI(RSCRATCH, R(XMM0));
|
|
||||||
SafeWriteRegToReg(RSCRATCH, RSCRATCH_EXTRA, 16, 0, QUANTIZED_REGS_TO_SAVE, SAFE_LOADSTORE_NO_PROLOG | SAFE_LOADSTORE_NO_FASTMEM);
|
|
||||||
RET();
|
|
||||||
|
|
||||||
singleStoreQuantized = reinterpret_cast<const u8**>(const_cast<u8*>(AlignCode16()));
|
|
||||||
ReserveCodeSpace(8 * sizeof(u8*));
|
|
||||||
|
|
||||||
singleStoreQuantized[0] = storeSingleFloat;
|
|
||||||
singleStoreQuantized[1] = storeSingleIllegal;
|
|
||||||
singleStoreQuantized[2] = storeSingleIllegal;
|
|
||||||
singleStoreQuantized[3] = storeSingleIllegal;
|
|
||||||
singleStoreQuantized[4] = storeSingleU8;
|
|
||||||
singleStoreQuantized[5] = storeSingleU16;
|
|
||||||
singleStoreQuantized[6] = storeSingleS8;
|
|
||||||
singleStoreQuantized[7] = storeSingleS16;
|
|
||||||
}
|
|
||||||
|
|
||||||
void CommonAsmRoutines::GenQuantizedLoads()
|
|
||||||
{
|
|
||||||
const u8* loadPairedIllegal = AlignCode4();
|
|
||||||
UD2();
|
|
||||||
|
|
||||||
// FIXME? This code (in non-MMU mode) assumes all accesses are directly to RAM, i.e.
|
|
||||||
// don't need hardware access handling. This will definitely crash if paired loads occur
|
|
||||||
// from non-RAM areas, but as far as I know, this never happens. I don't know if this is
|
|
||||||
// for a good reason, or merely because no game does this.
|
|
||||||
// If we find something that actually does do this, maybe this should be changed. How
|
|
||||||
// much of a performance hit would it be?
|
|
||||||
const u8* loadPairedFloatTwo = AlignCode4();
|
|
||||||
if (jit->js.memcheck)
|
|
||||||
{
|
|
||||||
SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 64, 0, QUANTIZED_REGS_TO_SAVE, false, SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG);
|
|
||||||
ROL(64, R(RSCRATCH_EXTRA), Imm8(32));
|
|
||||||
MOVQ_xmm(XMM0, R(RSCRATCH_EXTRA));
|
|
||||||
}
|
|
||||||
else if (cpu_info.bSSSE3)
|
|
||||||
{
|
|
||||||
MOVQ_xmm(XMM0, MComplex(RMEM, RSCRATCH_EXTRA, 1, 0));
|
|
||||||
PSHUFB(XMM0, M(pbswapShuffle2x4));
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
LoadAndSwap(64, RSCRATCH_EXTRA, MComplex(RMEM, RSCRATCH_EXTRA, 1, 0));
|
|
||||||
ROL(64, R(RSCRATCH_EXTRA), Imm8(32));
|
|
||||||
MOVQ_xmm(XMM0, R(RSCRATCH_EXTRA));
|
|
||||||
}
|
|
||||||
RET();
|
|
||||||
|
|
||||||
const u8* loadPairedFloatOne = AlignCode4();
|
|
||||||
if (jit->js.memcheck)
|
|
||||||
{
|
|
||||||
SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 32, 0, QUANTIZED_REGS_TO_SAVE, false, SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG);
|
|
||||||
MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
|
|
||||||
UNPCKLPS(XMM0, M(m_one));
|
|
||||||
}
|
|
||||||
else if (cpu_info.bSSSE3)
|
|
||||||
{
|
|
||||||
MOVD_xmm(XMM0, MComplex(RMEM, RSCRATCH_EXTRA, 1, 0));
|
|
||||||
PSHUFB(XMM0, M(pbswapShuffle1x4));
|
|
||||||
UNPCKLPS(XMM0, M(m_one));
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
LoadAndSwap(32, RSCRATCH_EXTRA, MComplex(RMEM, RSCRATCH_EXTRA, 1, 0));
|
|
||||||
MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
|
|
||||||
UNPCKLPS(XMM0, M(m_one));
|
|
||||||
}
|
|
||||||
RET();
|
|
||||||
|
|
||||||
const u8* loadPairedU8Two = AlignCode4();
|
|
||||||
if (jit->js.memcheck)
|
|
||||||
{
|
|
||||||
// TODO: Support not swapping in safeLoadToReg to avoid bswapping twice
|
|
||||||
SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 16, 0, QUANTIZED_REGS_TO_SAVE_LOAD, false, SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG);
|
|
||||||
ROR(16, R(RSCRATCH_EXTRA), Imm8(8));
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
UnsafeLoadRegToRegNoSwap(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 16, 0);
|
|
||||||
}
|
|
||||||
MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
|
|
||||||
if (cpu_info.bSSE4_1)
|
|
||||||
{
|
|
||||||
PMOVZXBD(XMM0, R(XMM0));
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
PXOR(XMM1, R(XMM1));
|
|
||||||
PUNPCKLBW(XMM0, R(XMM1));
|
|
||||||
PUNPCKLWD(XMM0, R(XMM1));
|
|
||||||
}
|
|
||||||
CVTDQ2PS(XMM0, R(XMM0));
|
|
||||||
SHR(32, R(RSCRATCH2), Imm8(5));
|
|
||||||
MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
|
|
||||||
MULPS(XMM0, R(XMM1));
|
|
||||||
RET();
|
|
||||||
|
|
||||||
const u8* loadPairedU8One = AlignCode4();
|
|
||||||
if (jit->js.memcheck)
|
|
||||||
SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 8, 0, QUANTIZED_REGS_TO_SAVE_LOAD, false, SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG);
|
|
||||||
else
|
|
||||||
UnsafeLoadRegToRegNoSwap(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 8, 0); // RSCRATCH_EXTRA = 0x000000xx
|
|
||||||
CVTSI2SS(XMM0, R(RSCRATCH_EXTRA));
|
|
||||||
SHR(32, R(RSCRATCH2), Imm8(5));
|
|
||||||
MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
|
|
||||||
UNPCKLPS(XMM0, M(m_one));
|
|
||||||
RET();
|
|
||||||
|
|
||||||
const u8* loadPairedS8Two = AlignCode4();
|
|
||||||
if (jit->js.memcheck)
|
|
||||||
{
|
|
||||||
// TODO: Support not swapping in safeLoadToReg to avoid bswapping twice
|
|
||||||
SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 16, 0, QUANTIZED_REGS_TO_SAVE_LOAD, false, SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG);
|
|
||||||
ROR(16, R(RSCRATCH_EXTRA), Imm8(8));
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
UnsafeLoadRegToRegNoSwap(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 16, 0);
|
|
||||||
}
|
|
||||||
MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
|
|
||||||
if (cpu_info.bSSE4_1)
|
|
||||||
{
|
|
||||||
PMOVSXBD(XMM0, R(XMM0));
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
PUNPCKLBW(XMM0, R(XMM0));
|
|
||||||
PUNPCKLWD(XMM0, R(XMM0));
|
|
||||||
PSRAD(XMM0, 24);
|
|
||||||
}
|
|
||||||
CVTDQ2PS(XMM0, R(XMM0));
|
|
||||||
SHR(32, R(RSCRATCH2), Imm8(5));
|
|
||||||
MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
|
|
||||||
MULPS(XMM0, R(XMM1));
|
|
||||||
RET();
|
|
||||||
|
|
||||||
const u8* loadPairedS8One = AlignCode4();
|
|
||||||
if (jit->js.memcheck)
|
|
||||||
SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 8, 0, QUANTIZED_REGS_TO_SAVE_LOAD, true, SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG);
|
|
||||||
else
|
|
||||||
UnsafeLoadRegToRegNoSwap(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 8, 0, true);
|
|
||||||
CVTSI2SS(XMM0, R(RSCRATCH_EXTRA));
|
|
||||||
SHR(32, R(RSCRATCH2), Imm8(5));
|
|
||||||
MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
|
|
||||||
UNPCKLPS(XMM0, M(m_one));
|
|
||||||
RET();
|
|
||||||
|
|
||||||
const u8* loadPairedU16Two = AlignCode4();
|
|
||||||
// TODO: Support not swapping in (un)safeLoadToReg to avoid bswapping twice
|
|
||||||
if (jit->js.memcheck)
|
|
||||||
SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 32, 0, QUANTIZED_REGS_TO_SAVE_LOAD, false, SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG);
|
|
||||||
else
|
|
||||||
UnsafeLoadRegToReg(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 32, 0, false);
|
|
||||||
ROL(32, R(RSCRATCH_EXTRA), Imm8(16));
|
|
||||||
MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
|
|
||||||
if (cpu_info.bSSE4_1)
|
|
||||||
{
|
|
||||||
PMOVZXWD(XMM0, R(XMM0));
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
PXOR(XMM1, R(XMM1));
|
|
||||||
PUNPCKLWD(XMM0, R(XMM1));
|
|
||||||
}
|
|
||||||
CVTDQ2PS(XMM0, R(XMM0));
|
|
||||||
SHR(32, R(RSCRATCH2), Imm8(5));
|
|
||||||
MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
|
|
||||||
MULPS(XMM0, R(XMM1));
|
|
||||||
RET();
|
|
||||||
|
|
||||||
const u8* loadPairedU16One = AlignCode4();
|
|
||||||
if (jit->js.memcheck)
|
|
||||||
SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 16, 0, QUANTIZED_REGS_TO_SAVE_LOAD, false, SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG);
|
|
||||||
else
|
|
||||||
UnsafeLoadRegToReg(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 16, 0, false);
|
|
||||||
CVTSI2SS(XMM0, R(RSCRATCH_EXTRA));
|
|
||||||
SHR(32, R(RSCRATCH2), Imm8(5));
|
|
||||||
MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
|
|
||||||
UNPCKLPS(XMM0, M(m_one));
|
|
||||||
RET();
|
|
||||||
|
|
||||||
const u8* loadPairedS16Two = AlignCode4();
|
|
||||||
if (jit->js.memcheck)
|
|
||||||
SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 32, 0, QUANTIZED_REGS_TO_SAVE_LOAD, false, SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG);
|
|
||||||
else
|
|
||||||
UnsafeLoadRegToReg(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 32, 0, false);
|
|
||||||
ROL(32, R(RSCRATCH_EXTRA), Imm8(16));
|
|
||||||
MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
|
|
||||||
if (cpu_info.bSSE4_1)
|
|
||||||
{
|
|
||||||
PMOVSXWD(XMM0, R(XMM0));
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
PUNPCKLWD(XMM0, R(XMM0));
|
|
||||||
PSRAD(XMM0, 16);
|
|
||||||
}
|
|
||||||
CVTDQ2PS(XMM0, R(XMM0));
|
|
||||||
SHR(32, R(RSCRATCH2), Imm8(5));
|
|
||||||
MOVQ_xmm(XMM1, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
|
|
||||||
MULPS(XMM0, R(XMM1));
|
|
||||||
RET();
|
|
||||||
|
|
||||||
const u8* loadPairedS16One = AlignCode4();
|
|
||||||
if (jit->js.memcheck)
|
|
||||||
SafeLoadToReg(RSCRATCH_EXTRA, R(RSCRATCH_EXTRA), 16, 0, QUANTIZED_REGS_TO_SAVE_LOAD, true, SAFE_LOADSTORE_NO_FASTMEM | SAFE_LOADSTORE_NO_PROLOG);
|
|
||||||
else
|
|
||||||
UnsafeLoadRegToReg(RSCRATCH_EXTRA, RSCRATCH_EXTRA, 16, 0, true);
|
|
||||||
CVTSI2SS(XMM0, R(RSCRATCH_EXTRA));
|
|
||||||
SHR(32, R(RSCRATCH2), Imm8(5));
|
|
||||||
MULSS(XMM0, MDisp(RSCRATCH2, (u32)(u64)m_dequantizeTableS));
|
|
||||||
UNPCKLPS(XMM0, M(m_one));
|
|
||||||
RET();
|
|
||||||
|
|
||||||
pairedLoadQuantized = reinterpret_cast<const u8**>(const_cast<u8*>(AlignCode16()));
|
|
||||||
ReserveCodeSpace(16 * sizeof(u8*));
|
|
||||||
|
|
||||||
pairedLoadQuantized[0] = loadPairedFloatTwo;
|
|
||||||
pairedLoadQuantized[1] = loadPairedIllegal;
|
|
||||||
pairedLoadQuantized[2] = loadPairedIllegal;
|
|
||||||
pairedLoadQuantized[3] = loadPairedIllegal;
|
|
||||||
pairedLoadQuantized[4] = loadPairedU8Two;
|
|
||||||
pairedLoadQuantized[5] = loadPairedU16Two;
|
|
||||||
pairedLoadQuantized[6] = loadPairedS8Two;
|
|
||||||
pairedLoadQuantized[7] = loadPairedS16Two;
|
|
||||||
|
|
||||||
pairedLoadQuantized[8] = loadPairedFloatOne;
|
|
||||||
pairedLoadQuantized[9] = loadPairedIllegal;
|
|
||||||
pairedLoadQuantized[10] = loadPairedIllegal;
|
|
||||||
pairedLoadQuantized[11] = loadPairedIllegal;
|
|
||||||
pairedLoadQuantized[12] = loadPairedU8One;
|
|
||||||
pairedLoadQuantized[13] = loadPairedU16One;
|
|
||||||
pairedLoadQuantized[14] = loadPairedS8One;
|
|
||||||
pairedLoadQuantized[15] = loadPairedS16One;
|
|
||||||
}
|
|
||||||
|
|
|
@ -4,7 +4,7 @@
|
||||||
|
|
||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
#include "Core/PowerPC/JitCommon/Jit_Util.h"
|
#include "Common/CommonTypes.h"
|
||||||
|
|
||||||
extern const u8 GC_ALIGNED16(pbswapShuffle1x4[16]);
|
extern const u8 GC_ALIGNED16(pbswapShuffle1x4[16]);
|
||||||
extern const u8 GC_ALIGNED16(pbswapShuffle2x4[16]);
|
extern const u8 GC_ALIGNED16(pbswapShuffle2x4[16]);
|
||||||
|
@ -15,7 +15,6 @@ extern const float GC_ALIGNED16(m_dequantizeTableS[]);
|
||||||
class CommonAsmRoutinesBase
|
class CommonAsmRoutinesBase
|
||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
|
|
||||||
const u8 *fifoDirectWrite8;
|
const u8 *fifoDirectWrite8;
|
||||||
const u8 *fifoDirectWrite16;
|
const u8 *fifoDirectWrite16;
|
||||||
const u8 *fifoDirectWrite32;
|
const u8 *fifoDirectWrite32;
|
||||||
|
@ -51,19 +50,5 @@ public:
|
||||||
// In: ECX: Address to write to.
|
// In: ECX: Address to write to.
|
||||||
// In: XMM0: Bottom 32-bit slot holds the float to be written.
|
// In: XMM0: Bottom 32-bit slot holds the float to be written.
|
||||||
const u8 **singleStoreQuantized;
|
const u8 **singleStoreQuantized;
|
||||||
|
|
||||||
};
|
};
|
||||||
|
|
||||||
class CommonAsmRoutines : public CommonAsmRoutinesBase, public EmuCodeBlock
|
|
||||||
{
|
|
||||||
protected:
|
|
||||||
void GenQuantizedLoads();
|
|
||||||
void GenQuantizedStores();
|
|
||||||
void GenQuantizedSingleStores();
|
|
||||||
|
|
||||||
public:
|
|
||||||
void GenFifoWrite(int size);
|
|
||||||
void GenFrsqrte();
|
|
||||||
void GenFres();
|
|
||||||
void GenMfcr();
|
|
||||||
};
|
|
||||||
|
|
|
@ -23,8 +23,8 @@
|
||||||
#include "Core/PowerPC/PowerPC.h"
|
#include "Core/PowerPC/PowerPC.h"
|
||||||
#include "Core/PowerPC/PPCAnalyst.h"
|
#include "Core/PowerPC/PPCAnalyst.h"
|
||||||
#include "Core/PowerPC/PPCTables.h"
|
#include "Core/PowerPC/PPCTables.h"
|
||||||
|
#include "Core/PowerPC/Jit64Common/Jit64AsmCommon.h"
|
||||||
#include "Core/PowerPC/JitCommon/Jit_Util.h"
|
#include "Core/PowerPC/JitCommon/Jit_Util.h"
|
||||||
#include "Core/PowerPC/JitCommon/JitAsmCommon.h"
|
|
||||||
#include "Core/PowerPC/JitCommon/JitCache.h"
|
#include "Core/PowerPC/JitCommon/JitCache.h"
|
||||||
#include "Core/PowerPC/JitCommon/TrampolineCache.h"
|
#include "Core/PowerPC/JitCommon/TrampolineCache.h"
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue