VertexLoaderX64: support SSE2 as a fallback

With suggestions by Fiora and magumagu.
This commit is contained in:
Tillmann Karras 2015-03-18 11:26:21 +01:00
parent 9793fed742
commit 7030542546
2 changed files with 99 additions and 40 deletions

View File

@ -1,3 +1,4 @@
#include "Common/BitSet.h"
#include "Common/CPUDetect.h" #include "Common/CPUDetect.h"
#include "Common/Intrinsics.h" #include "Common/Intrinsics.h"
#include "Common/JitRegister.h" #include "Common/JitRegister.h"
@ -6,8 +7,6 @@
using namespace Gen; using namespace Gen;
#define VERTEX_LOADER_REGS {XMM0+16}
static const X64Reg src_reg = ABI_PARAM1; static const X64Reg src_reg = ABI_PARAM1;
static const X64Reg dst_reg = ABI_PARAM2; static const X64Reg dst_reg = ABI_PARAM2;
static const X64Reg scratch1 = RAX; static const X64Reg scratch1 = RAX;
@ -98,47 +97,110 @@ int VertexLoaderX64::ReadVertex(OpArg data, u64 attribute, int format, int count
int elem_size = 1 << (format / 2); int elem_size = 1 << (format / 2);
int load_bytes = elem_size * count_in; int load_bytes = elem_size * count_in;
if (load_bytes > 8)
MOVDQU(coords, data);
else if (load_bytes > 4)
MOVQ_xmm(coords, data);
else
MOVD_xmm(coords, data);
PSHUFB(coords, M(&shuffle_lut[format][count_in - 1]));
if (format != FORMAT_FLOAT)
{
// Sign extend
if (format == FORMAT_BYTE)
PSRAD(coords, 24);
if (format == FORMAT_SHORT)
PSRAD(coords, 16);
CVTDQ2PS(coords, R(coords));
if (dequantize && scaling_exponent)
MULPS(coords, M(&scale_factors[scaling_exponent]));
}
OpArg dest = MDisp(dst_reg, m_dst_ofs); OpArg dest = MDisp(dst_reg, m_dst_ofs);
switch (count_out)
{
case 1: MOVSS(dest, coords); break;
case 2: MOVLPS(dest, coords); break;
case 3: MOVUPS(dest, coords); break;
}
native_format->components = count_out; native_format->components = count_out;
native_format->enable = true; native_format->enable = true;
native_format->offset = m_dst_ofs; native_format->offset = m_dst_ofs;
native_format->type = VAR_FLOAT; native_format->type = VAR_FLOAT;
native_format->integer = false; native_format->integer = false;
m_dst_ofs += sizeof(float) * count_out; m_dst_ofs += sizeof(float) * count_out;
if (attribute == DIRECT) if (attribute == DIRECT)
m_src_ofs += load_bytes; m_src_ofs += load_bytes;
if (cpu_info.bSSSE3)
{
if (load_bytes > 8)
MOVDQU(coords, data);
else if (load_bytes > 4)
MOVQ_xmm(coords, data);
else
MOVD_xmm(coords, data);
PSHUFB(coords, M(&shuffle_lut[format][count_in - 1]));
// Sign-extend.
if (format == FORMAT_BYTE)
PSRAD(coords, 24);
if (format == FORMAT_SHORT)
PSRAD(coords, 16);
}
else
{
// SSE2
X64Reg temp = XMM1;
switch (format)
{
case FORMAT_UBYTE:
MOVD_xmm(coords, data);
PXOR(temp, R(temp));
PUNPCKLBW(coords, R(temp));
PUNPCKLWD(coords, R(temp));
break;
case FORMAT_BYTE:
MOVD_xmm(coords, data);
PUNPCKLBW(coords, R(coords));
PUNPCKLWD(coords, R(coords));
PSRAD(coords, 24);
break;
case FORMAT_USHORT:
case FORMAT_SHORT:
switch (count_in)
{
case 1:
LoadAndSwap(32, scratch3, data);
MOVD_xmm(coords, R(scratch3)); // ......X.
break;
case 2:
LoadAndSwap(32, scratch3, data);
MOVD_xmm(coords, R(scratch3)); // ......XY
PSHUFLW(coords, R(coords), 0x24); // ....Y.X.
break;
case 3:
LoadAndSwap(64, scratch3, data);
MOVQ_xmm(coords, R(scratch3)); // ....XYZ.
PUNPCKLQDQ(coords, R(coords)); // ..Z.XYZ.
PSHUFLW(coords, R(coords), 0xAC); // ..Z.Y.X.
break;
}
if (format == FORMAT_SHORT)
PSRAD(coords, 16);
else
PSRLD(coords, 16);
break;
case FORMAT_FLOAT:
// Floats don't need to be scaled or converted,
// so we can just load/swap/store them directly
// and return early.
// (In SSSE3 we still need to store them.)
for (int i = 0; i < count_in; i++)
{
LoadAndSwap(32, scratch3, data);
MOV(32, dest, R(scratch3));
data.offset += sizeof(float);
dest.offset += sizeof(float);
}
return load_bytes;
}
}
if (format != FORMAT_FLOAT)
{
CVTDQ2PS(coords, R(coords));
if (dequantize && scaling_exponent)
MULPS(coords, M(&scale_factors[scaling_exponent]));
}
switch (count_out)
{
case 1: MOVSS(dest, coords); break;
case 2: MOVLPS(dest, coords); break;
case 3: MOVUPS(dest, coords); break;
}
return load_bytes; return load_bytes;
} }
@ -290,7 +352,10 @@ void VertexLoaderX64::ReadColor(OpArg data, u64 attribute, int format)
void VertexLoaderX64::GenerateVertexLoader() void VertexLoaderX64::GenerateVertexLoader()
{ {
ABI_PushRegistersAndAdjustStack(VERTEX_LOADER_REGS, 8); BitSet32 xmm_regs;
xmm_regs[XMM0+16] = true;
xmm_regs[XMM1+16] = !cpu_info.bSSSE3;
ABI_PushRegistersAndAdjustStack(xmm_regs, 8);
// Backup count since we're going to count it down. // Backup count since we're going to count it down.
PUSH(32, R(ABI_PARAM3)); PUSH(32, R(ABI_PARAM3));
@ -427,7 +492,7 @@ void VertexLoaderX64::GenerateVertexLoader()
// Get the original count. // Get the original count.
POP(32, R(ABI_RETURN)); POP(32, R(ABI_RETURN));
ABI_PopRegistersAndAdjustStack(VERTEX_LOADER_REGS, 8); ABI_PopRegistersAndAdjustStack(xmm_regs, 8);
if (m_VtxDesc.Position & MASK_INDEXED) if (m_VtxDesc.Position & MASK_INDEXED)
{ {
@ -447,12 +512,6 @@ void VertexLoaderX64::GenerateVertexLoader()
m_native_vtx_decl.stride = m_dst_ofs; m_native_vtx_decl.stride = m_dst_ofs;
} }
bool VertexLoaderX64::IsInitialized()
{
	// The generated loader code relies on the PSHUFB instruction,
	// so this backend is only usable on CPUs with SSSE3 support.
	const bool has_ssse3 = cpu_info.bSSSE3;
	return has_ssse3;
}
int VertexLoaderX64::RunVertices(DataReader src, DataReader dst, int count, int primitive) int VertexLoaderX64::RunVertices(DataReader src, DataReader dst, int count, int primitive)
{ {
m_numLoadedVertices += count; m_numLoadedVertices += count;

View File

@ -8,7 +8,7 @@ public:
protected: protected:
std::string GetName() const override { return "VertexLoaderX64"; } std::string GetName() const override { return "VertexLoaderX64"; }
bool IsInitialized() override; bool IsInitialized() override { return true; }
int RunVertices(DataReader src, DataReader dst, int count, int primitive) override; int RunVertices(DataReader src, DataReader dst, int count, int primitive) override;
private: private: