VertexLoaderX64: support SSE2 as a fallback
With suggestions by Fiora and magumagu.
This commit is contained in:
parent
9793fed742
commit
7030542546
|
@ -1,3 +1,4 @@
|
||||||
|
#include "Common/BitSet.h"
|
||||||
#include "Common/CPUDetect.h"
|
#include "Common/CPUDetect.h"
|
||||||
#include "Common/Intrinsics.h"
|
#include "Common/Intrinsics.h"
|
||||||
#include "Common/JitRegister.h"
|
#include "Common/JitRegister.h"
|
||||||
|
@ -6,8 +7,6 @@
|
||||||
|
|
||||||
using namespace Gen;
|
using namespace Gen;
|
||||||
|
|
||||||
#define VERTEX_LOADER_REGS {XMM0+16}
|
|
||||||
|
|
||||||
static const X64Reg src_reg = ABI_PARAM1;
|
static const X64Reg src_reg = ABI_PARAM1;
|
||||||
static const X64Reg dst_reg = ABI_PARAM2;
|
static const X64Reg dst_reg = ABI_PARAM2;
|
||||||
static const X64Reg scratch1 = RAX;
|
static const X64Reg scratch1 = RAX;
|
||||||
|
@ -98,6 +97,21 @@ int VertexLoaderX64::ReadVertex(OpArg data, u64 attribute, int format, int count
|
||||||
|
|
||||||
int elem_size = 1 << (format / 2);
|
int elem_size = 1 << (format / 2);
|
||||||
int load_bytes = elem_size * count_in;
|
int load_bytes = elem_size * count_in;
|
||||||
|
OpArg dest = MDisp(dst_reg, m_dst_ofs);
|
||||||
|
|
||||||
|
native_format->components = count_out;
|
||||||
|
native_format->enable = true;
|
||||||
|
native_format->offset = m_dst_ofs;
|
||||||
|
native_format->type = VAR_FLOAT;
|
||||||
|
native_format->integer = false;
|
||||||
|
|
||||||
|
m_dst_ofs += sizeof(float) * count_out;
|
||||||
|
|
||||||
|
if (attribute == DIRECT)
|
||||||
|
m_src_ofs += load_bytes;
|
||||||
|
|
||||||
|
if (cpu_info.bSSSE3)
|
||||||
|
{
|
||||||
if (load_bytes > 8)
|
if (load_bytes > 8)
|
||||||
MOVDQU(coords, data);
|
MOVDQU(coords, data);
|
||||||
else if (load_bytes > 4)
|
else if (load_bytes > 4)
|
||||||
|
@ -107,21 +121,79 @@ int VertexLoaderX64::ReadVertex(OpArg data, u64 attribute, int format, int count
|
||||||
|
|
||||||
PSHUFB(coords, M(&shuffle_lut[format][count_in - 1]));
|
PSHUFB(coords, M(&shuffle_lut[format][count_in - 1]));
|
||||||
|
|
||||||
if (format != FORMAT_FLOAT)
|
// Sign-extend.
|
||||||
{
|
|
||||||
// Sign extend
|
|
||||||
if (format == FORMAT_BYTE)
|
if (format == FORMAT_BYTE)
|
||||||
PSRAD(coords, 24);
|
PSRAD(coords, 24);
|
||||||
if (format == FORMAT_SHORT)
|
if (format == FORMAT_SHORT)
|
||||||
PSRAD(coords, 16);
|
PSRAD(coords, 16);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
// SSE2
|
||||||
|
X64Reg temp = XMM1;
|
||||||
|
switch (format)
|
||||||
|
{
|
||||||
|
case FORMAT_UBYTE:
|
||||||
|
MOVD_xmm(coords, data);
|
||||||
|
PXOR(temp, R(temp));
|
||||||
|
PUNPCKLBW(coords, R(temp));
|
||||||
|
PUNPCKLWD(coords, R(temp));
|
||||||
|
break;
|
||||||
|
case FORMAT_BYTE:
|
||||||
|
MOVD_xmm(coords, data);
|
||||||
|
PUNPCKLBW(coords, R(coords));
|
||||||
|
PUNPCKLWD(coords, R(coords));
|
||||||
|
PSRAD(coords, 24);
|
||||||
|
break;
|
||||||
|
case FORMAT_USHORT:
|
||||||
|
case FORMAT_SHORT:
|
||||||
|
switch (count_in)
|
||||||
|
{
|
||||||
|
case 1:
|
||||||
|
LoadAndSwap(32, scratch3, data);
|
||||||
|
MOVD_xmm(coords, R(scratch3)); // ......X.
|
||||||
|
break;
|
||||||
|
case 2:
|
||||||
|
LoadAndSwap(32, scratch3, data);
|
||||||
|
MOVD_xmm(coords, R(scratch3)); // ......XY
|
||||||
|
PSHUFLW(coords, R(coords), 0x24); // ....Y.X.
|
||||||
|
break;
|
||||||
|
case 3:
|
||||||
|
LoadAndSwap(64, scratch3, data);
|
||||||
|
MOVQ_xmm(coords, R(scratch3)); // ....XYZ.
|
||||||
|
PUNPCKLQDQ(coords, R(coords)); // ..Z.XYZ.
|
||||||
|
PSHUFLW(coords, R(coords), 0xAC); // ..Z.Y.X.
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
if (format == FORMAT_SHORT)
|
||||||
|
PSRAD(coords, 16);
|
||||||
|
else
|
||||||
|
PSRLD(coords, 16);
|
||||||
|
break;
|
||||||
|
case FORMAT_FLOAT:
|
||||||
|
// Floats don't need to be scaled or converted,
|
||||||
|
// so we can just load/swap/store them directly
|
||||||
|
// and return early.
|
||||||
|
// (In SSSE3 we still need to store them.)
|
||||||
|
for (int i = 0; i < count_in; i++)
|
||||||
|
{
|
||||||
|
LoadAndSwap(32, scratch3, data);
|
||||||
|
MOV(32, dest, R(scratch3));
|
||||||
|
data.offset += sizeof(float);
|
||||||
|
dest.offset += sizeof(float);
|
||||||
|
}
|
||||||
|
return load_bytes;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (format != FORMAT_FLOAT)
|
||||||
|
{
|
||||||
CVTDQ2PS(coords, R(coords));
|
CVTDQ2PS(coords, R(coords));
|
||||||
|
|
||||||
if (dequantize && scaling_exponent)
|
if (dequantize && scaling_exponent)
|
||||||
MULPS(coords, M(&scale_factors[scaling_exponent]));
|
MULPS(coords, M(&scale_factors[scaling_exponent]));
|
||||||
}
|
}
|
||||||
|
|
||||||
OpArg dest = MDisp(dst_reg, m_dst_ofs);
|
|
||||||
switch (count_out)
|
switch (count_out)
|
||||||
{
|
{
|
||||||
case 1: MOVSS(dest, coords); break;
|
case 1: MOVSS(dest, coords); break;
|
||||||
|
@ -129,16 +201,6 @@ int VertexLoaderX64::ReadVertex(OpArg data, u64 attribute, int format, int count
|
||||||
case 3: MOVUPS(dest, coords); break;
|
case 3: MOVUPS(dest, coords); break;
|
||||||
}
|
}
|
||||||
|
|
||||||
native_format->components = count_out;
|
|
||||||
native_format->enable = true;
|
|
||||||
native_format->offset = m_dst_ofs;
|
|
||||||
native_format->type = VAR_FLOAT;
|
|
||||||
native_format->integer = false;
|
|
||||||
m_dst_ofs += sizeof(float) * count_out;
|
|
||||||
|
|
||||||
if (attribute == DIRECT)
|
|
||||||
m_src_ofs += load_bytes;
|
|
||||||
|
|
||||||
return load_bytes;
|
return load_bytes;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -290,7 +352,10 @@ void VertexLoaderX64::ReadColor(OpArg data, u64 attribute, int format)
|
||||||
|
|
||||||
void VertexLoaderX64::GenerateVertexLoader()
|
void VertexLoaderX64::GenerateVertexLoader()
|
||||||
{
|
{
|
||||||
ABI_PushRegistersAndAdjustStack(VERTEX_LOADER_REGS, 8);
|
BitSet32 xmm_regs;
|
||||||
|
xmm_regs[XMM0+16] = true;
|
||||||
|
xmm_regs[XMM1+16] = !cpu_info.bSSSE3;
|
||||||
|
ABI_PushRegistersAndAdjustStack(xmm_regs, 8);
|
||||||
|
|
||||||
// Backup count since we're going to count it down.
|
// Backup count since we're going to count it down.
|
||||||
PUSH(32, R(ABI_PARAM3));
|
PUSH(32, R(ABI_PARAM3));
|
||||||
|
@ -427,7 +492,7 @@ void VertexLoaderX64::GenerateVertexLoader()
|
||||||
// Get the original count.
|
// Get the original count.
|
||||||
POP(32, R(ABI_RETURN));
|
POP(32, R(ABI_RETURN));
|
||||||
|
|
||||||
ABI_PopRegistersAndAdjustStack(VERTEX_LOADER_REGS, 8);
|
ABI_PopRegistersAndAdjustStack(xmm_regs, 8);
|
||||||
|
|
||||||
if (m_VtxDesc.Position & MASK_INDEXED)
|
if (m_VtxDesc.Position & MASK_INDEXED)
|
||||||
{
|
{
|
||||||
|
@ -447,12 +512,6 @@ void VertexLoaderX64::GenerateVertexLoader()
|
||||||
m_native_vtx_decl.stride = m_dst_ofs;
|
m_native_vtx_decl.stride = m_dst_ofs;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool VertexLoaderX64::IsInitialized()
|
|
||||||
{
|
|
||||||
// Uses PSHUFB.
|
|
||||||
return cpu_info.bSSSE3;
|
|
||||||
}
|
|
||||||
|
|
||||||
int VertexLoaderX64::RunVertices(DataReader src, DataReader dst, int count, int primitive)
|
int VertexLoaderX64::RunVertices(DataReader src, DataReader dst, int count, int primitive)
|
||||||
{
|
{
|
||||||
m_numLoadedVertices += count;
|
m_numLoadedVertices += count;
|
||||||
|
|
|
@ -8,7 +8,7 @@ public:
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
std::string GetName() const override { return "VertexLoaderX64"; }
|
std::string GetName() const override { return "VertexLoaderX64"; }
|
||||||
bool IsInitialized() override;
|
bool IsInitialized() override { return true; }
|
||||||
int RunVertices(DataReader src, DataReader dst, int count, int primitive) override;
|
int RunVertices(DataReader src, DataReader dst, int count, int primitive) override;
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
|
Loading…
Reference in New Issue