Merge pull request #5110 from MerryMage/const-pool
Jit64: Implement a constant pool
This commit is contained in:
commit
d2690568f9
|
@ -42,7 +42,7 @@ public:
|
||||||
}
|
}
|
||||||
|
|
||||||
// Call this before you generate any code.
|
// Call this before you generate any code.
|
||||||
void AllocCodeSpace(size_t size, bool need_low = true)
|
virtual void AllocCodeSpace(size_t size, bool need_low = true)
|
||||||
{
|
{
|
||||||
region_size = size;
|
region_size = size;
|
||||||
region = static_cast<u8*>(Common::AllocateExecutableMemory(region_size, need_low));
|
region = static_cast<u8*>(Common::AllocateExecutableMemory(region_size, need_low));
|
||||||
|
@ -51,7 +51,7 @@ public:
|
||||||
|
|
||||||
// Always clear code space with breakpoints, so that if someone accidentally executes
|
// Always clear code space with breakpoints, so that if someone accidentally executes
|
||||||
// uninitialized, it just breaks into the debugger.
|
// uninitialized, it just breaks into the debugger.
|
||||||
void ClearCodeSpace()
|
virtual void ClearCodeSpace()
|
||||||
{
|
{
|
||||||
PoisonMemory();
|
PoisonMemory();
|
||||||
ResetCodePtr();
|
ResetCodePtr();
|
||||||
|
|
|
@ -245,6 +245,7 @@ if(_M_X86)
|
||||||
PowerPC/Jit64/JitRegCache.cpp
|
PowerPC/Jit64/JitRegCache.cpp
|
||||||
PowerPC/Jit64/Jit_SystemRegisters.cpp
|
PowerPC/Jit64/Jit_SystemRegisters.cpp
|
||||||
PowerPC/Jit64Common/BlockCache.cpp
|
PowerPC/Jit64Common/BlockCache.cpp
|
||||||
|
PowerPC/Jit64Common/ConstantPool.cpp
|
||||||
PowerPC/Jit64Common/EmuCodeBlock.cpp
|
PowerPC/Jit64Common/EmuCodeBlock.cpp
|
||||||
PowerPC/Jit64Common/FarCodeCache.cpp
|
PowerPC/Jit64Common/FarCodeCache.cpp
|
||||||
PowerPC/Jit64Common/Jit64AsmCommon.cpp
|
PowerPC/Jit64Common/Jit64AsmCommon.cpp
|
||||||
|
|
|
@ -244,6 +244,7 @@
|
||||||
<ClCompile Include="PowerPC\Interpreter\Interpreter_Paired.cpp" />
|
<ClCompile Include="PowerPC\Interpreter\Interpreter_Paired.cpp" />
|
||||||
<ClCompile Include="PowerPC\Interpreter\Interpreter_SystemRegisters.cpp" />
|
<ClCompile Include="PowerPC\Interpreter\Interpreter_SystemRegisters.cpp" />
|
||||||
<ClCompile Include="PowerPC\Interpreter\Interpreter_Tables.cpp" />
|
<ClCompile Include="PowerPC\Interpreter\Interpreter_Tables.cpp" />
|
||||||
|
<ClCompile Include="PowerPC\Jit64Common\ConstantPool.cpp" />
|
||||||
<ClCompile Include="PowerPC\JitILCommon\IR.cpp" />
|
<ClCompile Include="PowerPC\JitILCommon\IR.cpp" />
|
||||||
<ClCompile Include="PowerPC\JitILCommon\JitILBase_Branch.cpp" />
|
<ClCompile Include="PowerPC\JitILCommon\JitILBase_Branch.cpp" />
|
||||||
<ClCompile Include="PowerPC\JitILCommon\JitILBase_FloatingPoint.cpp" />
|
<ClCompile Include="PowerPC\JitILCommon\JitILBase_FloatingPoint.cpp" />
|
||||||
|
@ -486,6 +487,7 @@
|
||||||
<ClInclude Include="PowerPC\CachedInterpreter\InterpreterBlockCache.h" />
|
<ClInclude Include="PowerPC\CachedInterpreter\InterpreterBlockCache.h" />
|
||||||
<ClInclude Include="PowerPC\Interpreter\Interpreter.h" />
|
<ClInclude Include="PowerPC\Interpreter\Interpreter.h" />
|
||||||
<ClInclude Include="PowerPC\Interpreter\Interpreter_FPUtils.h" />
|
<ClInclude Include="PowerPC\Interpreter\Interpreter_FPUtils.h" />
|
||||||
|
<ClInclude Include="PowerPC\Jit64Common\ConstantPool.h" />
|
||||||
<ClInclude Include="PowerPC\Jit64IL\JitIL.h" />
|
<ClInclude Include="PowerPC\Jit64IL\JitIL.h" />
|
||||||
<ClInclude Include="PowerPC\Jit64\FPURegCache.h" />
|
<ClInclude Include="PowerPC\Jit64\FPURegCache.h" />
|
||||||
<ClInclude Include="PowerPC\Jit64\GPRRegCache.h" />
|
<ClInclude Include="PowerPC\Jit64\GPRRegCache.h" />
|
||||||
|
|
|
@ -870,6 +870,9 @@
|
||||||
<ClCompile Include="IOS\USB\Bluetooth\WiimoteHIDAttr.cpp">
|
<ClCompile Include="IOS\USB\Bluetooth\WiimoteHIDAttr.cpp">
|
||||||
<Filter>IOS\USB\Bluetooth</Filter>
|
<Filter>IOS\USB\Bluetooth</Filter>
|
||||||
</ClCompile>
|
</ClCompile>
|
||||||
|
<ClCompile Include="PowerPC\Jit64Common\ConstantPool.cpp">
|
||||||
|
<Filter>PowerPC\Jit64Common</Filter>
|
||||||
|
</ClCompile>
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
<ItemGroup>
|
<ItemGroup>
|
||||||
<ClInclude Include="BootManager.h" />
|
<ClInclude Include="BootManager.h" />
|
||||||
|
@ -1493,6 +1496,9 @@
|
||||||
<ClInclude Include="IOS\MIOS.h">
|
<ClInclude Include="IOS\MIOS.h">
|
||||||
<Filter>IOS</Filter>
|
<Filter>IOS</Filter>
|
||||||
</ClInclude>
|
</ClInclude>
|
||||||
|
<ClInclude Include="PowerPC\Jit64Common\ConstantPool.h">
|
||||||
|
<Filter>PowerPC\Jit64Common</Filter>
|
||||||
|
</ClInclude>
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
<ItemGroup>
|
<ItemGroup>
|
||||||
<Text Include="CMakeLists.txt" />
|
<Text Include="CMakeLists.txt" />
|
||||||
|
|
|
@ -108,7 +108,7 @@ void Jit64::HandleNaNs(UGeckoInstruction inst, X64Reg xmm_out, X64Reg xmm, X64Re
|
||||||
UCOMISD(xmm, R(xmm));
|
UCOMISD(xmm, R(xmm));
|
||||||
fixups.push_back(J_CC(CC_P));
|
fixups.push_back(J_CC(CC_P));
|
||||||
}
|
}
|
||||||
MOVDDUP(xmm, M(psGeneratedQNaN));
|
MOVDDUP(xmm, MConst(psGeneratedQNaN));
|
||||||
for (FixupBranch fixup : fixups)
|
for (FixupBranch fixup : fixups)
|
||||||
SetJumpTarget(fixup);
|
SetJumpTarget(fixup);
|
||||||
FixupBranch done = J(true);
|
FixupBranch done = J(true);
|
||||||
|
@ -127,7 +127,7 @@ void Jit64::HandleNaNs(UGeckoInstruction inst, X64Reg xmm_out, X64Reg xmm, X64Re
|
||||||
SwitchToFarCode();
|
SwitchToFarCode();
|
||||||
SetJumpTarget(handle_nan);
|
SetJumpTarget(handle_nan);
|
||||||
_assert_msg_(DYNA_REC, clobber == XMM0, "BLENDVPD implicitly uses XMM0");
|
_assert_msg_(DYNA_REC, clobber == XMM0, "BLENDVPD implicitly uses XMM0");
|
||||||
BLENDVPD(xmm, M(psGeneratedQNaN));
|
BLENDVPD(xmm, MConst(psGeneratedQNaN));
|
||||||
for (u32 x : inputs)
|
for (u32 x : inputs)
|
||||||
{
|
{
|
||||||
avx_op(&XEmitter::VCMPPD, &XEmitter::CMPPD, clobber, fpr.R(x), fpr.R(x), CMP_UNORD);
|
avx_op(&XEmitter::VCMPPD, &XEmitter::CMPPD, clobber, fpr.R(x), fpr.R(x), CMP_UNORD);
|
||||||
|
@ -151,7 +151,7 @@ void Jit64::HandleNaNs(UGeckoInstruction inst, X64Reg xmm_out, X64Reg xmm, X64Re
|
||||||
SetJumpTarget(handle_nan);
|
SetJumpTarget(handle_nan);
|
||||||
MOVAPD(tmp, R(clobber));
|
MOVAPD(tmp, R(clobber));
|
||||||
ANDNPD(clobber, R(xmm));
|
ANDNPD(clobber, R(xmm));
|
||||||
ANDPD(tmp, M(psGeneratedQNaN));
|
ANDPD(tmp, MConst(psGeneratedQNaN));
|
||||||
ORPD(tmp, R(clobber));
|
ORPD(tmp, R(clobber));
|
||||||
MOVAPD(xmm, R(tmp));
|
MOVAPD(xmm, R(tmp));
|
||||||
for (u32 x : inputs)
|
for (u32 x : inputs)
|
||||||
|
@ -350,7 +350,7 @@ void Jit64::fmaddXX(UGeckoInstruction inst)
|
||||||
ADDSD(XMM1, fpr.R(b));
|
ADDSD(XMM1, fpr.R(b));
|
||||||
}
|
}
|
||||||
if (inst.SUBOP5 == 31) // nmadd
|
if (inst.SUBOP5 == 31) // nmadd
|
||||||
XORPD(XMM1, M(packed ? psSignBits2 : psSignBits));
|
XORPD(XMM1, MConst(packed ? psSignBits2 : psSignBits));
|
||||||
}
|
}
|
||||||
fpr.BindToRegister(d, !single);
|
fpr.BindToRegister(d, !single);
|
||||||
if (single)
|
if (single)
|
||||||
|
@ -385,15 +385,15 @@ void Jit64::fsign(UGeckoInstruction inst)
|
||||||
{
|
{
|
||||||
case 40: // neg
|
case 40: // neg
|
||||||
avx_op(&XEmitter::VXORPD, &XEmitter::XORPD, fpr.RX(d), src,
|
avx_op(&XEmitter::VXORPD, &XEmitter::XORPD, fpr.RX(d), src,
|
||||||
M(packed ? psSignBits2 : psSignBits), packed);
|
MConst(packed ? psSignBits2 : psSignBits), packed);
|
||||||
break;
|
break;
|
||||||
case 136: // nabs
|
case 136: // nabs
|
||||||
avx_op(&XEmitter::VORPD, &XEmitter::ORPD, fpr.RX(d), src, M(packed ? psSignBits2 : psSignBits),
|
avx_op(&XEmitter::VORPD, &XEmitter::ORPD, fpr.RX(d), src,
|
||||||
packed);
|
MConst(packed ? psSignBits2 : psSignBits), packed);
|
||||||
break;
|
break;
|
||||||
case 264: // abs
|
case 264: // abs
|
||||||
avx_op(&XEmitter::VANDPD, &XEmitter::ANDPD, fpr.RX(d), src, M(packed ? psAbsMask2 : psAbsMask),
|
avx_op(&XEmitter::VANDPD, &XEmitter::ANDPD, fpr.RX(d), src,
|
||||||
packed);
|
MConst(packed ? psAbsMask2 : psAbsMask), packed);
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
PanicAlert("fsign bleh");
|
PanicAlert("fsign bleh");
|
||||||
|
@ -608,7 +608,7 @@ void Jit64::fctiwx(UGeckoInstruction inst)
|
||||||
// The upper 32 bits of the result are set to 0xfff80000,
|
// The upper 32 bits of the result are set to 0xfff80000,
|
||||||
// except for -0.0 where they are set to 0xfff80001 (TODO).
|
// except for -0.0 where they are set to 0xfff80001 (TODO).
|
||||||
|
|
||||||
MOVAPD(XMM0, M(half_qnan_and_s32_max));
|
MOVAPD(XMM0, MConst(half_qnan_and_s32_max));
|
||||||
MINSD(XMM0, fpr.R(b));
|
MINSD(XMM0, fpr.R(b));
|
||||||
switch (inst.SUBOP10)
|
switch (inst.SUBOP10)
|
||||||
{
|
{
|
||||||
|
|
|
@ -623,7 +623,7 @@ void Jit64::mcrfs(UGeckoInstruction inst)
|
||||||
}
|
}
|
||||||
AND(32, R(RSCRATCH), Imm32(mask));
|
AND(32, R(RSCRATCH), Imm32(mask));
|
||||||
MOV(32, PPCSTATE(fpscr), R(RSCRATCH));
|
MOV(32, PPCSTATE(fpscr), R(RSCRATCH));
|
||||||
LEA(64, RSCRATCH, M(m_crTable.data()));
|
LEA(64, RSCRATCH, MConst(m_crTable));
|
||||||
MOV(64, R(RSCRATCH), MComplex(RSCRATCH, RSCRATCH2, SCALE_8, 0));
|
MOV(64, R(RSCRATCH), MComplex(RSCRATCH, RSCRATCH2, SCALE_8, 0));
|
||||||
MOV(64, PPCSTATE(cr_val[inst.CRFD]), R(RSCRATCH));
|
MOV(64, PPCSTATE(cr_val[inst.CRFD]), R(RSCRATCH));
|
||||||
}
|
}
|
||||||
|
@ -657,14 +657,14 @@ void Jit64::mffsx(UGeckoInstruction inst)
|
||||||
}
|
}
|
||||||
|
|
||||||
// MXCSR = s_fpscr_to_mxcsr[FPSCR & 7]
|
// MXCSR = s_fpscr_to_mxcsr[FPSCR & 7]
|
||||||
static const u32 s_fpscr_to_mxcsr[] = {
|
static const u32 s_fpscr_to_mxcsr[8] = {
|
||||||
0x1F80, 0x7F80, 0x5F80, 0x3F80, 0x9F80, 0xFF80, 0xDF80, 0xBF80,
|
0x1F80, 0x7F80, 0x5F80, 0x3F80, 0x9F80, 0xFF80, 0xDF80, 0xBF80,
|
||||||
};
|
};
|
||||||
|
|
||||||
// Needs value of FPSCR in RSCRATCH.
|
// Needs value of FPSCR in RSCRATCH.
|
||||||
void Jit64::UpdateMXCSR()
|
void Jit64::UpdateMXCSR()
|
||||||
{
|
{
|
||||||
LEA(64, RSCRATCH2, M(&s_fpscr_to_mxcsr));
|
LEA(64, RSCRATCH2, MConst(s_fpscr_to_mxcsr));
|
||||||
AND(32, R(RSCRATCH), Imm32(7));
|
AND(32, R(RSCRATCH), Imm32(7));
|
||||||
LDMXCSR(MComplex(RSCRATCH2, RSCRATCH, SCALE_4, 0));
|
LDMXCSR(MComplex(RSCRATCH2, RSCRATCH, SCALE_4, 0));
|
||||||
}
|
}
|
||||||
|
@ -730,7 +730,7 @@ void Jit64::mtfsfix(UGeckoInstruction inst)
|
||||||
|
|
||||||
// Field 7 contains NI and RN.
|
// Field 7 contains NI and RN.
|
||||||
if (inst.CRFD == 7)
|
if (inst.CRFD == 7)
|
||||||
LDMXCSR(M(&s_fpscr_to_mxcsr[imm & 7]));
|
LDMXCSR(MConst(s_fpscr_to_mxcsr, imm & 7));
|
||||||
}
|
}
|
||||||
|
|
||||||
void Jit64::mtfsfx(UGeckoInstruction inst)
|
void Jit64::mtfsfx(UGeckoInstruction inst)
|
||||||
|
|
|
@ -0,0 +1,67 @@
|
||||||
|
// Copyright 2017 Dolphin Emulator Project
|
||||||
|
// Licensed under GPLv2+
|
||||||
|
// Refer to the license.txt file included.
|
||||||
|
|
||||||
|
#include <cstring>
|
||||||
|
#include <memory>
|
||||||
|
#include <utility>
|
||||||
|
|
||||||
|
#include "Common/Assert.h"
|
||||||
|
#include "Common/x64Emitter.h"
|
||||||
|
#include "Core/PowerPC/Jit64Common/ConstantPool.h"
|
||||||
|
|
||||||
|
ConstantPool::ConstantPool(Gen::X64CodeBlock* parent) : m_parent(parent)
|
||||||
|
{
|
||||||
|
}
|
||||||
|
|
||||||
|
// Defaulted out of line; there is no custom teardown logic.
ConstantPool::~ConstantPool() = default;
|
||||||
|
|
||||||
|
void ConstantPool::AllocCodeSpace()
|
||||||
|
{
|
||||||
|
_assert_(!m_current_ptr);
|
||||||
|
Init();
|
||||||
|
}
|
||||||
|
|
||||||
|
void ConstantPool::ClearCodeSpace()
|
||||||
|
{
|
||||||
|
Init();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Returns a memory operand that addresses element `index` of the constant
// stored at `value`.
//
//   value        - address of the constant; also the lookup key (constants are
//                  identified by pointer equality, not by content).
//   element_size - size in bytes of one element of the constant.
//   num_elements - number of elements the constant consists of.
//   index        - element to address; must be less than num_elements.
//
// The first request for a given address copies element_size * num_elements
// bytes into the pool at an ALIGNMENT-aligned position. Later requests for the
// same address reuse that copy and must describe the same total size.
Gen::OpArg ConstantPool::GetConstantOpArg(const void* value, size_t element_size,
                                          size_t num_elements, size_t index)
{
  _assert_msg_(DYNA_REC, index < num_elements, "Constant index is out of range.");

  const size_t value_size = element_size * num_elements;
  auto iter = m_const_info.find(value);

  if (iter == m_const_info.end())
  {
    void* ptr = std::align(ALIGNMENT, value_size, m_current_ptr, m_remaining_size);
    _assert_msg_(DYNA_REC, ptr, "Constant pool has run out of space.");
    if (!ptr)
    {
      // std::align leaves the pointer and the remaining size untouched on
      // failure, so the pool state is still consistent. Do not copy through a
      // null pointer or underflow m_remaining_size; address the caller's own
      // constant instead.
      // NOTE(review): that address may not be RIP-reachable - confirm how the
      // emitter should handle it.
      return Gen::M(static_cast<const u8*>(value) + element_size * index);
    }

    // std::align moved m_current_ptr up to `ptr`; step past the new constant.
    m_current_ptr = static_cast<u8*>(ptr) + value_size;
    m_remaining_size -= value_size;

    std::memcpy(ptr, value, value_size);
    iter = m_const_info.emplace(value, ConstantInfo{ptr, value_size}).first;
  }

  const ConstantInfo& info = iter->second;
  _assert_msg_(DYNA_REC, info.m_size == value_size,
               "Constant has incorrect size in constant pool.");
  u8* location = static_cast<u8*>(info.m_location);
  return Gen::M(location + element_size * index);
}
|
||||||
|
|
||||||
|
void ConstantPool::Init()
|
||||||
|
{
|
||||||
|
// If execution happens to run to the start of the constant pool, halt.
|
||||||
|
m_parent->INT3();
|
||||||
|
m_parent->AlignCode16();
|
||||||
|
|
||||||
|
// Reserve a block of memory CONST_POOL_SIZE in size.
|
||||||
|
m_current_ptr = m_parent->GetWritableCodePtr();
|
||||||
|
m_parent->ReserveCodeSpace(CONST_POOL_SIZE);
|
||||||
|
|
||||||
|
m_remaining_size = CONST_POOL_SIZE;
|
||||||
|
m_const_info.clear();
|
||||||
|
}
|
|
@ -0,0 +1,52 @@
|
||||||
|
// Copyright 2017 Dolphin Emulator Project
|
||||||
|
// Licensed under GPLv2+
|
||||||
|
// Refer to the license.txt file included.
|
||||||
|
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include <cstddef>
|
||||||
|
#include <map>
|
||||||
|
|
||||||
|
namespace Gen
|
||||||
|
{
|
||||||
|
struct OpArg;
|
||||||
|
class X64CodeBlock;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Constants are copied into this pool so that they live at a memory location
|
||||||
|
// that is close to the code that references it. This ensures that the 32-bit
|
||||||
|
// limitation on RIP addressing is not an issue.
|
||||||
|
class ConstantPool
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
static constexpr size_t CONST_POOL_SIZE = 1024 * 32;
|
||||||
|
static constexpr size_t ALIGNMENT = 16;
|
||||||
|
|
||||||
|
explicit ConstantPool(Gen::X64CodeBlock* parent);
|
||||||
|
~ConstantPool();
|
||||||
|
|
||||||
|
// ConstantPool reserves CONST_POOL_SIZE bytes from parent, and uses
|
||||||
|
// that space to store its constants.
|
||||||
|
void AllocCodeSpace();
|
||||||
|
void ClearCodeSpace();
|
||||||
|
|
||||||
|
// Copies the value into the pool if it doesn't exist. Returns a pointer
|
||||||
|
// to existing values if they were already copied. Pointer equality is
|
||||||
|
// used to determine if two constants are the same.
|
||||||
|
Gen::OpArg GetConstantOpArg(const void* value, size_t element_size, size_t num_elements,
|
||||||
|
size_t index);
|
||||||
|
|
||||||
|
private:
|
||||||
|
void Init();
|
||||||
|
|
||||||
|
struct ConstantInfo
|
||||||
|
{
|
||||||
|
void* m_location;
|
||||||
|
size_t m_size;
|
||||||
|
};
|
||||||
|
|
||||||
|
Gen::X64CodeBlock* m_parent;
|
||||||
|
void* m_current_ptr = nullptr;
|
||||||
|
size_t m_remaining_size = CONST_POOL_SIZE;
|
||||||
|
std::map<const void*, ConstantInfo> m_const_info;
|
||||||
|
};
|
|
@ -40,6 +40,18 @@ OpArg FixImmediate(int access_size, OpArg arg)
|
||||||
}
|
}
|
||||||
} // Anonymous namespace
|
} // Anonymous namespace
|
||||||
|
|
||||||
|
void EmuCodeBlock::ClearCodeSpace()
|
||||||
|
{
|
||||||
|
X64CodeBlock::ClearCodeSpace();
|
||||||
|
m_const_pool.ClearCodeSpace();
|
||||||
|
}
|
||||||
|
|
||||||
|
void EmuCodeBlock::AllocCodeSpace(size_t size, bool need_low)
|
||||||
|
{
|
||||||
|
X64CodeBlock::AllocCodeSpace(size + ConstantPool::CONST_POOL_SIZE, need_low);
|
||||||
|
m_const_pool.AllocCodeSpace();
|
||||||
|
}
|
||||||
|
|
||||||
void EmuCodeBlock::MemoryExceptionCheck()
|
void EmuCodeBlock::MemoryExceptionCheck()
|
||||||
{
|
{
|
||||||
// TODO: We really should untangle the trampolines, exception handlers and
|
// TODO: We really should untangle the trampolines, exception handlers and
|
||||||
|
@ -836,16 +848,16 @@ void EmuCodeBlock::Force25BitPrecision(X64Reg output, const OpArg& input, X64Reg
|
||||||
// mantissa = (mantissa & ~0xFFFFFFF) + ((mantissa & (1ULL << 27)) << 1);
|
// mantissa = (mantissa & ~0xFFFFFFF) + ((mantissa & (1ULL << 27)) << 1);
|
||||||
if (input.IsSimpleReg() && cpu_info.bAVX)
|
if (input.IsSimpleReg() && cpu_info.bAVX)
|
||||||
{
|
{
|
||||||
VPAND(tmp, input.GetSimpleReg(), M(psRoundBit));
|
VPAND(tmp, input.GetSimpleReg(), MConst(psRoundBit));
|
||||||
VPAND(output, input.GetSimpleReg(), M(psMantissaTruncate));
|
VPAND(output, input.GetSimpleReg(), MConst(psMantissaTruncate));
|
||||||
PADDQ(output, R(tmp));
|
PADDQ(output, R(tmp));
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
if (!input.IsSimpleReg(output))
|
if (!input.IsSimpleReg(output))
|
||||||
MOVAPD(output, input);
|
MOVAPD(output, input);
|
||||||
avx_op(&XEmitter::VPAND, &XEmitter::PAND, tmp, R(output), M(psRoundBit), true, true);
|
avx_op(&XEmitter::VPAND, &XEmitter::PAND, tmp, R(output), MConst(psRoundBit), true, true);
|
||||||
PAND(output, M(psMantissaTruncate));
|
PAND(output, MConst(psMantissaTruncate));
|
||||||
PADDQ(output, R(tmp));
|
PADDQ(output, R(tmp));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -890,7 +902,7 @@ void EmuCodeBlock::ConvertDoubleToSingle(X64Reg dst, X64Reg src)
|
||||||
MOVSD(XMM1, R(src));
|
MOVSD(XMM1, R(src));
|
||||||
|
|
||||||
// Grab Exponent
|
// Grab Exponent
|
||||||
PAND(XMM1, M(&double_exponent));
|
PAND(XMM1, MConst(double_exponent));
|
||||||
PSRLQ(XMM1, 52);
|
PSRLQ(XMM1, 52);
|
||||||
MOVD_xmm(R(RSCRATCH), XMM1);
|
MOVD_xmm(R(RSCRATCH), XMM1);
|
||||||
|
|
||||||
|
@ -909,15 +921,15 @@ void EmuCodeBlock::ConvertDoubleToSingle(X64Reg dst, X64Reg src)
|
||||||
|
|
||||||
// xmm1 = fraction | 0x0010000000000000
|
// xmm1 = fraction | 0x0010000000000000
|
||||||
MOVSD(XMM1, R(src));
|
MOVSD(XMM1, R(src));
|
||||||
PAND(XMM1, M(&double_fraction));
|
PAND(XMM1, MConst(double_fraction));
|
||||||
POR(XMM1, M(&double_explicit_top_bit));
|
POR(XMM1, MConst(double_explicit_top_bit));
|
||||||
|
|
||||||
// fraction >> shift
|
// fraction >> shift
|
||||||
PSRLQ(XMM1, R(XMM0));
|
PSRLQ(XMM1, R(XMM0));
|
||||||
|
|
||||||
// OR the sign bit in.
|
// OR the sign bit in.
|
||||||
MOVSD(XMM0, R(src));
|
MOVSD(XMM0, R(src));
|
||||||
PAND(XMM0, M(&double_sign_bit));
|
PAND(XMM0, MConst(double_sign_bit));
|
||||||
PSRLQ(XMM0, 32);
|
PSRLQ(XMM0, 32);
|
||||||
POR(XMM1, R(XMM0));
|
POR(XMM1, R(XMM0));
|
||||||
|
|
||||||
|
@ -930,12 +942,12 @@ void EmuCodeBlock::ConvertDoubleToSingle(X64Reg dst, X64Reg src)
|
||||||
|
|
||||||
// We want bits 0, 1
|
// We want bits 0, 1
|
||||||
MOVSD(XMM1, R(src));
|
MOVSD(XMM1, R(src));
|
||||||
PAND(XMM1, M(&double_top_two_bits));
|
PAND(XMM1, MConst(double_top_two_bits));
|
||||||
PSRLQ(XMM1, 32);
|
PSRLQ(XMM1, 32);
|
||||||
|
|
||||||
// And 5 through to 34
|
// And 5 through to 34
|
||||||
MOVSD(XMM0, R(src));
|
MOVSD(XMM0, R(src));
|
||||||
PAND(XMM0, M(&double_bottom_bits));
|
PAND(XMM0, MConst(double_bottom_bits));
|
||||||
PSRLQ(XMM0, 29);
|
PSRLQ(XMM0, 29);
|
||||||
|
|
||||||
// OR them together
|
// OR them together
|
||||||
|
@ -967,8 +979,8 @@ void EmuCodeBlock::ConvertDoubleToSingle(X64Reg dst, X64Reg src)
|
||||||
// Here, check to see if the source is small enough that it will result in a denormal, and pass it
|
// Here, check to see if the source is small enough that it will result in a denormal, and pass it
|
||||||
// to the x87 unit
|
// to the x87 unit
|
||||||
// if it is.
|
// if it is.
|
||||||
avx_op(&XEmitter::VPAND, &XEmitter::PAND, XMM0, R(src), M(&double_sign_bit), true, true);
|
avx_op(&XEmitter::VPAND, &XEmitter::PAND, XMM0, R(src), MConst(double_sign_bit), true, true);
|
||||||
UCOMISD(XMM0, M(&min_norm_single));
|
UCOMISD(XMM0, MConst(min_norm_single));
|
||||||
FixupBranch nanConversion = J_CC(CC_P, true);
|
FixupBranch nanConversion = J_CC(CC_P, true);
|
||||||
FixupBranch denormalConversion = J_CC(CC_B, true);
|
FixupBranch denormalConversion = J_CC(CC_B, true);
|
||||||
CVTSD2SS(dst, R(src));
|
CVTSD2SS(dst, R(src));
|
||||||
|
@ -982,7 +994,7 @@ void EmuCodeBlock::ConvertDoubleToSingle(X64Reg dst, X64Reg src)
|
||||||
FixupBranch continue1 = J_CC(CC_C, true);
|
FixupBranch continue1 = J_CC(CC_C, true);
|
||||||
// Clear the quiet bit of the SNaN, which was 0 (signalling) but got set to 1 (quiet) by
|
// Clear the quiet bit of the SNaN, which was 0 (signalling) but got set to 1 (quiet) by
|
||||||
// conversion.
|
// conversion.
|
||||||
ANDPS(dst, M(&single_qnan_bit));
|
ANDPS(dst, MConst(single_qnan_bit));
|
||||||
FixupBranch continue2 = J(true);
|
FixupBranch continue2 = J(true);
|
||||||
|
|
||||||
SetJumpTarget(denormalConversion);
|
SetJumpTarget(denormalConversion);
|
||||||
|
@ -1025,7 +1037,7 @@ void EmuCodeBlock::ConvertSingleToDouble(X64Reg dst, X64Reg src, bool src_is_gpr
|
||||||
SetJumpTarget(nanConversion);
|
SetJumpTarget(nanConversion);
|
||||||
TEST(32, R(gprsrc), Imm32(0x00400000));
|
TEST(32, R(gprsrc), Imm32(0x00400000));
|
||||||
FixupBranch continue1 = J_CC(CC_NZ, true);
|
FixupBranch continue1 = J_CC(CC_NZ, true);
|
||||||
ANDPD(dst, M(&double_qnan_bit));
|
ANDPD(dst, MConst(double_qnan_bit));
|
||||||
FixupBranch continue2 = J(true);
|
FixupBranch continue2 = J(true);
|
||||||
SwitchToNearCode();
|
SwitchToNearCode();
|
||||||
|
|
||||||
|
@ -1057,7 +1069,7 @@ void EmuCodeBlock::SetFPRF(Gen::X64Reg xmm)
|
||||||
{
|
{
|
||||||
MOVQ_xmm(R(RSCRATCH), xmm);
|
MOVQ_xmm(R(RSCRATCH), xmm);
|
||||||
SHR(64, R(RSCRATCH), Imm8(63)); // Get the sign bit; almost all the branches need it.
|
SHR(64, R(RSCRATCH), Imm8(63)); // Get the sign bit; almost all the branches need it.
|
||||||
PTEST(xmm, M(psDoubleExp));
|
PTEST(xmm, MConst(psDoubleExp));
|
||||||
FixupBranch maxExponent = J_CC(CC_C);
|
FixupBranch maxExponent = J_CC(CC_C);
|
||||||
FixupBranch zeroExponent = J_CC(CC_Z);
|
FixupBranch zeroExponent = J_CC(CC_Z);
|
||||||
|
|
||||||
|
@ -1067,7 +1079,7 @@ void EmuCodeBlock::SetFPRF(Gen::X64Reg xmm)
|
||||||
continue1 = J();
|
continue1 = J();
|
||||||
|
|
||||||
SetJumpTarget(maxExponent);
|
SetJumpTarget(maxExponent);
|
||||||
PTEST(xmm, M(psDoubleFrac));
|
PTEST(xmm, MConst(psDoubleFrac));
|
||||||
FixupBranch notNAN = J_CC(CC_Z);
|
FixupBranch notNAN = J_CC(CC_Z);
|
||||||
|
|
||||||
// Max exponent + mantissa: PPC_FPCLASS_QNAN
|
// Max exponent + mantissa: PPC_FPCLASS_QNAN
|
||||||
|
@ -1097,10 +1109,10 @@ void EmuCodeBlock::SetFPRF(Gen::X64Reg xmm)
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
MOVQ_xmm(R(RSCRATCH), xmm);
|
MOVQ_xmm(R(RSCRATCH), xmm);
|
||||||
TEST(64, R(RSCRATCH), M(psDoubleExp));
|
TEST(64, R(RSCRATCH), MConst(psDoubleExp));
|
||||||
FixupBranch zeroExponent = J_CC(CC_Z);
|
FixupBranch zeroExponent = J_CC(CC_Z);
|
||||||
AND(64, R(RSCRATCH), M(psDoubleNoSign));
|
AND(64, R(RSCRATCH), MConst(psDoubleNoSign));
|
||||||
CMP(64, R(RSCRATCH), M(psDoubleExp));
|
CMP(64, R(RSCRATCH), MConst(psDoubleExp));
|
||||||
FixupBranch nan =
|
FixupBranch nan =
|
||||||
J_CC(CC_G); // This works because if the sign bit is set, RSCRATCH is negative
|
J_CC(CC_G); // This works because if the sign bit is set, RSCRATCH is negative
|
||||||
FixupBranch infinity = J_CC(CC_E);
|
FixupBranch infinity = J_CC(CC_E);
|
||||||
|
|
|
@ -10,6 +10,7 @@
|
||||||
#include "Common/CommonTypes.h"
|
#include "Common/CommonTypes.h"
|
||||||
#include "Common/x64Emitter.h"
|
#include "Common/x64Emitter.h"
|
||||||
|
|
||||||
|
#include "Core/PowerPC/Jit64Common/ConstantPool.h"
|
||||||
#include "Core/PowerPC/Jit64Common/FarCodeCache.h"
|
#include "Core/PowerPC/Jit64Common/FarCodeCache.h"
|
||||||
#include "Core/PowerPC/Jit64Common/TrampolineInfo.h"
|
#include "Core/PowerPC/Jit64Common/TrampolineInfo.h"
|
||||||
|
|
||||||
|
@ -22,12 +23,27 @@ class Mapping;
|
||||||
class EmuCodeBlock : public Gen::X64CodeBlock
|
class EmuCodeBlock : public Gen::X64CodeBlock
|
||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
|
void ClearCodeSpace() override;
|
||||||
|
void AllocCodeSpace(size_t size, bool need_low = true) override;
|
||||||
|
|
||||||
void MemoryExceptionCheck();
|
void MemoryExceptionCheck();
|
||||||
|
|
||||||
// Simple functions to switch between near and far code emitting
|
// Simple functions to switch between near and far code emitting
|
||||||
void SwitchToFarCode();
|
void SwitchToFarCode();
|
||||||
void SwitchToNearCode();
|
void SwitchToNearCode();
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
|
Gen::OpArg MConst(const T& value)
|
||||||
|
{
|
||||||
|
return m_const_pool.GetConstantOpArg(&value, sizeof(T), 1, 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T, size_t N>
|
||||||
|
Gen::OpArg MConst(const T (&value)[N], size_t index = 0)
|
||||||
|
{
|
||||||
|
return m_const_pool.GetConstantOpArg(&value, sizeof(T), N, index);
|
||||||
|
}
|
||||||
|
|
||||||
Gen::FixupBranch CheckIfSafeAddress(const Gen::OpArg& reg_value, Gen::X64Reg reg_addr,
|
Gen::FixupBranch CheckIfSafeAddress(const Gen::OpArg& reg_value, Gen::X64Reg reg_addr,
|
||||||
BitSet32 registers_in_use);
|
BitSet32 registers_in_use);
|
||||||
void UnsafeLoadRegToReg(Gen::X64Reg reg_addr, Gen::X64Reg reg_value, int accessSize,
|
void UnsafeLoadRegToReg(Gen::X64Reg reg_addr, Gen::X64Reg reg_value, int accessSize,
|
||||||
|
@ -105,6 +121,7 @@ public:
|
||||||
void Clear();
|
void Clear();
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
|
ConstantPool m_const_pool{this};
|
||||||
FarCodeCache m_far_code;
|
FarCodeCache m_far_code;
|
||||||
u8* m_near_code; // Backed up when we switch to far code.
|
u8* m_near_code; // Backed up when we switch to far code.
|
||||||
|
|
||||||
|
|
|
@ -180,8 +180,6 @@ void CommonAsmRoutines::GenMfcr()
|
||||||
X64Reg tmp = RSCRATCH2;
|
X64Reg tmp = RSCRATCH2;
|
||||||
X64Reg cr_val = RSCRATCH_EXTRA;
|
X64Reg cr_val = RSCRATCH_EXTRA;
|
||||||
XOR(32, R(dst), R(dst));
|
XOR(32, R(dst), R(dst));
|
||||||
// we only need to zero the high bits of tmp once
|
|
||||||
XOR(32, R(tmp), R(tmp));
|
|
||||||
for (int i = 0; i < 8; i++)
|
for (int i = 0; i < 8; i++)
|
||||||
{
|
{
|
||||||
static const u32 m_flagTable[8] = {0x0, 0x1, 0x8, 0x9, 0x0, 0x1, 0x8, 0x9};
|
static const u32 m_flagTable[8] = {0x0, 0x1, 0x8, 0x9, 0x0, 0x1, 0x8, 0x9};
|
||||||
|
@ -190,9 +188,13 @@ void CommonAsmRoutines::GenMfcr()
|
||||||
|
|
||||||
MOV(64, R(cr_val), PPCSTATE(cr_val[i]));
|
MOV(64, R(cr_val), PPCSTATE(cr_val[i]));
|
||||||
|
|
||||||
|
// Upper bits of tmp need to be zeroed.
|
||||||
|
// Note: tmp is used later for address calculations and thus
|
||||||
|
// can't be zero-ed once. This also prevents partial
|
||||||
|
// register stalls due to SETcc.
|
||||||
|
XOR(32, R(tmp), R(tmp));
|
||||||
// EQ: Bits 31-0 == 0; set flag bit 1
|
// EQ: Bits 31-0 == 0; set flag bit 1
|
||||||
TEST(32, R(cr_val), R(cr_val));
|
TEST(32, R(cr_val), R(cr_val));
|
||||||
// FIXME: is there a better way to do this without the partial register merging?
|
|
||||||
SETcc(CC_Z, R(tmp));
|
SETcc(CC_Z, R(tmp));
|
||||||
LEA(32, dst, MComplex(dst, tmp, SCALE_2, 0));
|
LEA(32, dst, MComplex(dst, tmp, SCALE_2, 0));
|
||||||
|
|
||||||
|
@ -204,7 +206,8 @@ void CommonAsmRoutines::GenMfcr()
|
||||||
// SO: Bit 61 set; set flag bit 0
|
// SO: Bit 61 set; set flag bit 0
|
||||||
// LT: Bit 62 set; set flag bit 3
|
// LT: Bit 62 set; set flag bit 3
|
||||||
SHR(64, R(cr_val), Imm8(61));
|
SHR(64, R(cr_val), Imm8(61));
|
||||||
OR(32, R(dst), MScaled(cr_val, SCALE_4, PtrOffset(m_flagTable)));
|
LEA(64, tmp, MConst(m_flagTable));
|
||||||
|
OR(32, R(dst), MComplex(tmp, cr_val, SCALE_4, 0));
|
||||||
}
|
}
|
||||||
RET();
|
RET();
|
||||||
|
|
||||||
|
@ -297,11 +300,12 @@ void QuantizedMemoryRoutines::GenQuantizedStore(bool single, EQuantizeType type,
|
||||||
if (quantize == -1)
|
if (quantize == -1)
|
||||||
{
|
{
|
||||||
SHR(32, R(RSCRATCH2), Imm8(5));
|
SHR(32, R(RSCRATCH2), Imm8(5));
|
||||||
MULSS(XMM0, MDisp(RSCRATCH2, PtrOffset(m_quantizeTableS)));
|
LEA(64, RSCRATCH, MConst(m_quantizeTableS));
|
||||||
|
MULSS(XMM0, MRegSum(RSCRATCH2, RSCRATCH));
|
||||||
}
|
}
|
||||||
else if (quantize > 0)
|
else if (quantize > 0)
|
||||||
{
|
{
|
||||||
MULSS(XMM0, M(&m_quantizeTableS[quantize * 2]));
|
MULSS(XMM0, MConst(m_quantizeTableS, quantize * 2));
|
||||||
}
|
}
|
||||||
|
|
||||||
switch (type)
|
switch (type)
|
||||||
|
@ -309,20 +313,20 @@ void QuantizedMemoryRoutines::GenQuantizedStore(bool single, EQuantizeType type,
|
||||||
case QUANTIZE_U8:
|
case QUANTIZE_U8:
|
||||||
XORPS(XMM1, R(XMM1));
|
XORPS(XMM1, R(XMM1));
|
||||||
MAXSS(XMM0, R(XMM1));
|
MAXSS(XMM0, R(XMM1));
|
||||||
MINSS(XMM0, M(&m_255));
|
MINSS(XMM0, MConst(m_255));
|
||||||
break;
|
break;
|
||||||
case QUANTIZE_S8:
|
case QUANTIZE_S8:
|
||||||
MAXSS(XMM0, M(&m_m128));
|
MAXSS(XMM0, MConst(m_m128));
|
||||||
MINSS(XMM0, M(&m_127));
|
MINSS(XMM0, MConst(m_127));
|
||||||
break;
|
break;
|
||||||
case QUANTIZE_U16:
|
case QUANTIZE_U16:
|
||||||
XORPS(XMM1, R(XMM1));
|
XORPS(XMM1, R(XMM1));
|
||||||
MAXSS(XMM0, R(XMM1));
|
MAXSS(XMM0, R(XMM1));
|
||||||
MINSS(XMM0, M(m_65535));
|
MINSS(XMM0, MConst(m_65535));
|
||||||
break;
|
break;
|
||||||
case QUANTIZE_S16:
|
case QUANTIZE_S16:
|
||||||
MAXSS(XMM0, M(&m_m32768));
|
MAXSS(XMM0, MConst(m_m32768));
|
||||||
MINSS(XMM0, M(&m_32767));
|
MINSS(XMM0, MConst(m_32767));
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
break;
|
break;
|
||||||
|
@ -335,12 +339,13 @@ void QuantizedMemoryRoutines::GenQuantizedStore(bool single, EQuantizeType type,
|
||||||
if (quantize == -1)
|
if (quantize == -1)
|
||||||
{
|
{
|
||||||
SHR(32, R(RSCRATCH2), Imm8(5));
|
SHR(32, R(RSCRATCH2), Imm8(5));
|
||||||
MOVQ_xmm(XMM1, MDisp(RSCRATCH2, PtrOffset(m_quantizeTableS)));
|
LEA(64, RSCRATCH, MConst(m_quantizeTableS));
|
||||||
|
MOVQ_xmm(XMM1, MRegSum(RSCRATCH2, RSCRATCH));
|
||||||
MULPS(XMM0, R(XMM1));
|
MULPS(XMM0, R(XMM1));
|
||||||
}
|
}
|
||||||
else if (quantize > 0)
|
else if (quantize > 0)
|
||||||
{
|
{
|
||||||
MOVQ_xmm(XMM1, M(&m_quantizeTableS[quantize * 2]));
|
MOVQ_xmm(XMM1, MConst(m_quantizeTableS, quantize * 2));
|
||||||
MULPS(XMM0, R(XMM1));
|
MULPS(XMM0, R(XMM1));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -358,7 +363,7 @@ void QuantizedMemoryRoutines::GenQuantizedStore(bool single, EQuantizeType type,
|
||||||
// is out of int32 range while it's OK for large negatives, it isn't for positives
|
// is out of int32 range while it's OK for large negatives, it isn't for positives
|
||||||
// I don't know whether the overflow actually happens in any games but it potentially can
|
// I don't know whether the overflow actually happens in any games but it potentially can
|
||||||
// cause problems, so we need some clamping
|
// cause problems, so we need some clamping
|
||||||
MINPS(XMM0, M(m_65535));
|
MINPS(XMM0, MConst(m_65535));
|
||||||
CVTTPS2DQ(XMM0, R(XMM0));
|
CVTTPS2DQ(XMM0, R(XMM0));
|
||||||
|
|
||||||
switch (type)
|
switch (type)
|
||||||
|
@ -419,7 +424,7 @@ void QuantizedMemoryRoutines::GenQuantizedStoreFloat(bool single, bool isInline)
|
||||||
{
|
{
|
||||||
if (cpu_info.bSSSE3)
|
if (cpu_info.bSSSE3)
|
||||||
{
|
{
|
||||||
PSHUFB(XMM0, M(pbswapShuffle2x4));
|
PSHUFB(XMM0, MConst(pbswapShuffle2x4));
|
||||||
MOVQ_xmm(R(RSCRATCH), XMM0);
|
MOVQ_xmm(R(RSCRATCH), XMM0);
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
|
@ -492,13 +497,14 @@ void QuantizedMemoryRoutines::GenQuantizedLoad(bool single, EQuantizeType type,
|
||||||
if (quantize == -1)
|
if (quantize == -1)
|
||||||
{
|
{
|
||||||
SHR(32, R(RSCRATCH2), Imm8(5));
|
SHR(32, R(RSCRATCH2), Imm8(5));
|
||||||
MULSS(XMM0, MDisp(RSCRATCH2, PtrOffset(m_dequantizeTableS)));
|
LEA(64, RSCRATCH, MConst(m_dequantizeTableS));
|
||||||
|
MULSS(XMM0, MRegSum(RSCRATCH2, RSCRATCH));
|
||||||
}
|
}
|
||||||
else if (quantize > 0)
|
else if (quantize > 0)
|
||||||
{
|
{
|
||||||
MULSS(XMM0, M(&m_dequantizeTableS[quantize * 2]));
|
MULSS(XMM0, MConst(m_dequantizeTableS, quantize * 2));
|
||||||
}
|
}
|
||||||
UNPCKLPS(XMM0, M(m_one));
|
UNPCKLPS(XMM0, MConst(m_one));
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
|
@ -564,12 +570,13 @@ void QuantizedMemoryRoutines::GenQuantizedLoad(bool single, EQuantizeType type,
|
||||||
if (quantize == -1)
|
if (quantize == -1)
|
||||||
{
|
{
|
||||||
SHR(32, R(RSCRATCH2), Imm8(5));
|
SHR(32, R(RSCRATCH2), Imm8(5));
|
||||||
MOVQ_xmm(XMM1, MDisp(RSCRATCH2, PtrOffset(m_dequantizeTableS)));
|
LEA(64, RSCRATCH, MConst(m_dequantizeTableS));
|
||||||
|
MOVQ_xmm(XMM1, MRegSum(RSCRATCH2, RSCRATCH));
|
||||||
MULPS(XMM0, R(XMM1));
|
MULPS(XMM0, R(XMM1));
|
||||||
}
|
}
|
||||||
else if (quantize > 0)
|
else if (quantize > 0)
|
||||||
{
|
{
|
||||||
MOVQ_xmm(XMM1, M(&m_dequantizeTableS[quantize * 2]));
|
MOVQ_xmm(XMM1, MConst(m_dequantizeTableS, quantize * 2));
|
||||||
MULPS(XMM0, R(XMM1));
|
MULPS(XMM0, R(XMM1));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -597,7 +604,7 @@ void QuantizedMemoryRoutines::GenQuantizedLoadFloat(bool single, bool isInline)
|
||||||
else if (cpu_info.bSSSE3)
|
else if (cpu_info.bSSSE3)
|
||||||
{
|
{
|
||||||
MOVD_xmm(XMM0, MRegSum(RMEM, RSCRATCH_EXTRA));
|
MOVD_xmm(XMM0, MRegSum(RMEM, RSCRATCH_EXTRA));
|
||||||
PSHUFB(XMM0, M(pbswapShuffle1x4));
|
PSHUFB(XMM0, MConst(pbswapShuffle1x4));
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
|
@ -605,7 +612,7 @@ void QuantizedMemoryRoutines::GenQuantizedLoadFloat(bool single, bool isInline)
|
||||||
MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
|
MOVD_xmm(XMM0, R(RSCRATCH_EXTRA));
|
||||||
}
|
}
|
||||||
|
|
||||||
UNPCKLPS(XMM0, M(m_one));
|
UNPCKLPS(XMM0, MConst(m_one));
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
|
@ -623,7 +630,7 @@ void QuantizedMemoryRoutines::GenQuantizedLoadFloat(bool single, bool isInline)
|
||||||
else if (cpu_info.bSSSE3)
|
else if (cpu_info.bSSSE3)
|
||||||
{
|
{
|
||||||
MOVQ_xmm(XMM0, MRegSum(RMEM, RSCRATCH_EXTRA));
|
MOVQ_xmm(XMM0, MRegSum(RMEM, RSCRATCH_EXTRA));
|
||||||
PSHUFB(XMM0, M(pbswapShuffle2x4));
|
PSHUFB(XMM0, MConst(pbswapShuffle2x4));
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
|
|
|
@ -1753,7 +1753,7 @@ static void DoWriteCode(IRBuilder* ibuild, JitIL* Jit, u32 exitAddress)
|
||||||
|
|
||||||
X64Reg reg = fregURegWithMov(RI, I);
|
X64Reg reg = fregURegWithMov(RI, I);
|
||||||
alignas(16) static const u32 ssSignBits[4] = {0x80000000};
|
alignas(16) static const u32 ssSignBits[4] = {0x80000000};
|
||||||
Jit->PXOR(reg, M(ssSignBits));
|
Jit->PXOR(reg, Jit->MConst(ssSignBits));
|
||||||
RI.fregs[reg] = I;
|
RI.fregs[reg] = I;
|
||||||
fregNormalRegClear(RI, I);
|
fregNormalRegClear(RI, I);
|
||||||
break;
|
break;
|
||||||
|
@ -1765,7 +1765,7 @@ static void DoWriteCode(IRBuilder* ibuild, JitIL* Jit, u32 exitAddress)
|
||||||
|
|
||||||
X64Reg reg = fregURegWithMov(RI, I);
|
X64Reg reg = fregURegWithMov(RI, I);
|
||||||
alignas(16) static const u64 sdSignBits[2] = {0x8000000000000000ULL};
|
alignas(16) static const u64 sdSignBits[2] = {0x8000000000000000ULL};
|
||||||
Jit->PXOR(reg, M(sdSignBits));
|
Jit->PXOR(reg, Jit->MConst(sdSignBits));
|
||||||
RI.fregs[reg] = I;
|
RI.fregs[reg] = I;
|
||||||
fregNormalRegClear(RI, I);
|
fregNormalRegClear(RI, I);
|
||||||
break;
|
break;
|
||||||
|
@ -1777,7 +1777,7 @@ static void DoWriteCode(IRBuilder* ibuild, JitIL* Jit, u32 exitAddress)
|
||||||
|
|
||||||
X64Reg reg = fregURegWithMov(RI, I);
|
X64Reg reg = fregURegWithMov(RI, I);
|
||||||
alignas(16) static const u32 psSignBits[4] = {0x80000000, 0x80000000};
|
alignas(16) static const u32 psSignBits[4] = {0x80000000, 0x80000000};
|
||||||
Jit->PXOR(reg, M(psSignBits));
|
Jit->PXOR(reg, Jit->MConst(psSignBits));
|
||||||
RI.fregs[reg] = I;
|
RI.fregs[reg] = I;
|
||||||
fregNormalRegClear(RI, I);
|
fregNormalRegClear(RI, I);
|
||||||
break;
|
break;
|
||||||
|
|
|
@ -7,7 +7,7 @@
|
||||||
alignas(16) const u8 pbswapShuffle1x4[16] = {3, 2, 1, 0, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
|
alignas(16) const u8 pbswapShuffle1x4[16] = {3, 2, 1, 0, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
|
||||||
alignas(16) const u8 pbswapShuffle2x4[16] = {3, 2, 1, 0, 7, 6, 5, 4, 8, 9, 10, 11, 12, 13, 14, 15};
|
alignas(16) const u8 pbswapShuffle2x4[16] = {3, 2, 1, 0, 7, 6, 5, 4, 8, 9, 10, 11, 12, 13, 14, 15};
|
||||||
|
|
||||||
alignas(16) const float m_quantizeTableS[] = {
|
alignas(16) const float m_quantizeTableS[128] = {
|
||||||
(1ULL << 0), (1ULL << 0), (1ULL << 1), (1ULL << 1),
|
(1ULL << 0), (1ULL << 0), (1ULL << 1), (1ULL << 1),
|
||||||
(1ULL << 2), (1ULL << 2), (1ULL << 3), (1ULL << 3),
|
(1ULL << 2), (1ULL << 2), (1ULL << 3), (1ULL << 3),
|
||||||
(1ULL << 4), (1ULL << 4), (1ULL << 5), (1ULL << 5),
|
(1ULL << 4), (1ULL << 4), (1ULL << 5), (1ULL << 5),
|
||||||
|
@ -42,7 +42,7 @@ alignas(16) const float m_quantizeTableS[] = {
|
||||||
1.0 / (1ULL << 2), 1.0 / (1ULL << 2), 1.0 / (1ULL << 1), 1.0 / (1ULL << 1),
|
1.0 / (1ULL << 2), 1.0 / (1ULL << 2), 1.0 / (1ULL << 1), 1.0 / (1ULL << 1),
|
||||||
};
|
};
|
||||||
|
|
||||||
alignas(16) const float m_dequantizeTableS[] = {
|
alignas(16) const float m_dequantizeTableS[128] = {
|
||||||
1.0 / (1ULL << 0), 1.0 / (1ULL << 0), 1.0 / (1ULL << 1), 1.0 / (1ULL << 1),
|
1.0 / (1ULL << 0), 1.0 / (1ULL << 0), 1.0 / (1ULL << 1), 1.0 / (1ULL << 1),
|
||||||
1.0 / (1ULL << 2), 1.0 / (1ULL << 2), 1.0 / (1ULL << 3), 1.0 / (1ULL << 3),
|
1.0 / (1ULL << 2), 1.0 / (1ULL << 2), 1.0 / (1ULL << 3), 1.0 / (1ULL << 3),
|
||||||
1.0 / (1ULL << 4), 1.0 / (1ULL << 4), 1.0 / (1ULL << 5), 1.0 / (1ULL << 5),
|
1.0 / (1ULL << 4), 1.0 / (1ULL << 4), 1.0 / (1ULL << 5), 1.0 / (1ULL << 5),
|
||||||
|
@ -77,4 +77,4 @@ alignas(16) const float m_dequantizeTableS[] = {
|
||||||
(1ULL << 2), (1ULL << 2), (1ULL << 1), (1ULL << 1),
|
(1ULL << 2), (1ULL << 2), (1ULL << 1), (1ULL << 1),
|
||||||
};
|
};
|
||||||
|
|
||||||
alignas(16) const float m_one[] = {1.0f, 0.0f, 0.0f, 0.0f};
|
alignas(16) const float m_one[4] = {1.0f, 0.0f, 0.0f, 0.0f};
|
||||||
|
|
|
@ -8,9 +8,9 @@
|
||||||
|
|
||||||
alignas(16) extern const u8 pbswapShuffle1x4[16];
|
alignas(16) extern const u8 pbswapShuffle1x4[16];
|
||||||
alignas(16) extern const u8 pbswapShuffle2x4[16];
|
alignas(16) extern const u8 pbswapShuffle2x4[16];
|
||||||
alignas(16) extern const float m_one[];
|
alignas(16) extern const float m_one[4];
|
||||||
alignas(16) extern const float m_quantizeTableS[];
|
alignas(16) extern const float m_quantizeTableS[128];
|
||||||
alignas(16) extern const float m_dequantizeTableS[];
|
alignas(16) extern const float m_dequantizeTableS[128];
|
||||||
|
|
||||||
class CommonAsmRoutinesBase
|
class CommonAsmRoutinesBase
|
||||||
{
|
{
|
||||||
|
|
Loading…
Reference in New Issue