gsdx sw: port code to the new constant object

This commit is contained in:
Gregory Hainaut 2016-11-24 10:16:35 +01:00
parent 3b5bc9c38d
commit 8431299b92
17 changed files with 82 additions and 211 deletions

View File

@ -131,12 +131,10 @@ EXPORT_C_(int) GSinit()
GSBlock::InitVectors();
GSClut::InitVectors();
GSDrawScanlineCodeGenerator::InitVectors();
#ifdef ENABLE_OPENCL
GSRendererCL::InitVectors();
#endif
GSRendererSW::InitVectors();
GSSetupPrimCodeGenerator::InitVectors();
GSVector4i::InitVectors();
GSVector4::InitVectors();
#if _M_SSE >= 0x500
@ -147,6 +145,9 @@ EXPORT_C_(int) GSinit()
#endif
GSVertexTrace::InitVectors();
if (g_const == nullptr)
return -1;
#ifdef _WIN32
s_hr = ::CoInitializeEx(NULL, COINIT_MULTITHREADED);
@ -165,8 +166,7 @@ EXPORT_C GSshutdown()
gsopen_done = false;
delete s_gs;
s_gs = NULL;
s_gs = nullptr;
s_renderer = GSRendererType::Undefined;

View File

@ -117,7 +117,7 @@ void GSDrawScanline::SetupPrim(const GSVertexSW* vertex, const uint32* index, co
#if _M_SSE >= 0x501
const GSVector8* shift = GSSetupPrimCodeGenerator::m_shift;
const GSVector8* shift = (GSVector8*)g_const->m_shift_256b;
if(has_z || has_f)
{
@ -271,7 +271,7 @@ void GSDrawScanline::SetupPrim(const GSVertexSW* vertex, const uint32* index, co
#else
const GSVector4* shift = GSSetupPrimCodeGenerator::m_shift;
const GSVector4* shift = (GSVector4*)g_const->m_shift_128b;
if(has_z || has_f)
{
@ -441,7 +441,7 @@ void GSDrawScanline::DrawScanline(int pixels, int left, int top, const GSVertexS
skip = left & 7;
steps = pixels + skip - 8;
left -= skip;
test = GSVector8i::i8to32c(GSDrawScanlineCodeGenerator::m_test[skip]) | GSVector8i::i8to32c(GSDrawScanlineCodeGenerator::m_test[15 + (steps & (steps >> 31))]);
test = GSVector8i::i8to32c(g_const->m_test_256b[skip]) | GSVector8i::i8to32c(g_const->m_test_256b[15 + (steps & (steps >> 31))]);
}
else
{
@ -1532,12 +1532,13 @@ void GSDrawScanline::DrawScanline(int pixels, int left, int top, const GSVertexS
if(!sel.notest)
{
test = GSVector8i::i8to32c(GSDrawScanlineCodeGenerator::m_test[15 + (steps & (steps >> 31))]);
test = GSVector8i::i8to32c(g_const->m_test_256b[15 + (steps & (steps >> 31))]);
}
}
#else
const GSVector4i* const_test = (GSVector4i*)g_const->m_test_128b;
GSVector4i test;
GSVector4 zo;
GSVector4i f;
@ -1555,7 +1556,7 @@ void GSDrawScanline::DrawScanline(int pixels, int left, int top, const GSVertexS
skip = left & 3;
steps = pixels + skip - 4;
left -= skip;
test = GSDrawScanlineCodeGenerator::m_test[skip] | GSDrawScanlineCodeGenerator::m_test[7 + (steps & (steps >> 31))];
test = const_test[skip] | const_test[7 + (steps & (steps >> 31))];
}
else
{
@ -2625,7 +2626,7 @@ void GSDrawScanline::DrawScanline(int pixels, int left, int top, const GSVertexS
if(!sel.notest)
{
test = GSDrawScanlineCodeGenerator::m_test[7 + (steps & (steps >> 31))];
test = const_test[7 + (steps & (steps >> 31))];
}
}

View File

@ -33,78 +33,6 @@ void GSDrawScanlineCodeGenerator::Generate()
}
#endif
#if _M_SSE >= 0x501
// Byte-per-lane mask table: 16 rows of 8 bytes, each byte either 0xff (lane flagged) or 0x00.
//   rows 0..7  : the first `row` bytes are 0xff, the remainder 0x00
//   rows 8..15 : the last `15 - row` bytes are 0xff, the remainder 0x00 (row 15 is all zero)
// NOTE(review): consumers appear to widen each byte to a 32-bit lane and OR a
// "head" row (0..7) with a "tail" row (8..15) - confirm against the scanline code.
alignas(8) const uint8 GSDrawScanlineCodeGenerator::m_test[16][8] =
{
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
{0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
{0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
{0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00},
{0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00},
{0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00},
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00},
{0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00},
{0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
{0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff},
{0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff},
{0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff},
{0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff},
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff},
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff},
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
};
// Storage for the four log2 coefficient vectors; the values are written by InitVectors().
GSVector8 GSDrawScanlineCodeGenerator::m_log2_coef[4];
#else
// In this configuration both tables are plain static storage that InitVectors() fills at runtime.
GSVector4i GSDrawScanlineCodeGenerator::m_test[8];
GSVector4 GSDrawScanlineCodeGenerator::m_log2_coef[4];
#endif
void GSDrawScanlineCodeGenerator::InitVectors()
{
#if _M_SSE >= 0x501
GSVector8 log2_coef[4] =
{
GSVector8(0.204446009836232697516f),
GSVector8(-1.04913055217340124191f),
GSVector8(2.28330284476918490682f),
GSVector8(1.0f),
};
for (size_t n = 0; n < countof(log2_coef); ++n)
m_log2_coef[n] = log2_coef[n];
#else
GSVector4i test[8] =
{
GSVector4i::zero(),
GSVector4i(0xffffffff, 0x00000000, 0x00000000, 0x00000000),
GSVector4i(0xffffffff, 0xffffffff, 0x00000000, 0x00000000),
GSVector4i(0xffffffff, 0xffffffff, 0xffffffff, 0x00000000),
GSVector4i(0x00000000, 0xffffffff, 0xffffffff, 0xffffffff),
GSVector4i(0x00000000, 0x00000000, 0xffffffff, 0xffffffff),
GSVector4i(0x00000000, 0x00000000, 0x00000000, 0xffffffff),
GSVector4i::zero(),
};
GSVector4 log2_coef[4] =
{
GSVector4(0.204446009836232697516f),
GSVector4(-1.04913055217340124191f),
GSVector4(2.28330284476918490682f),
GSVector4(1.0f),
};
for (size_t n = 0; n < countof(test); ++n)
m_test[n] = test[n];
for (size_t n = 0; n < countof(log2_coef); ++n)
m_log2_coef[n] = log2_coef[n];
#endif
}
GSDrawScanlineCodeGenerator::GSDrawScanlineCodeGenerator(void* param, uint64 key, void* code, size_t maxsize)
: GSCodeGenerator(code, maxsize)
, m_local(*(GSScanlineLocalData*)param)

View File

@ -143,14 +143,4 @@ class GSDrawScanlineCodeGenerator : public GSCodeGenerator
public:
GSDrawScanlineCodeGenerator(void* param, uint64 key, void* code, size_t maxsize);
#if _M_SSE >= 0x501
alignas(8) static const uint8 m_test[16][8];
static GSVector8 m_log2_coef[4];
#else
static GSVector4i m_test[8];
static GSVector4 m_log2_coef[4];
#endif
static void InitVectors();
};

View File

@ -96,7 +96,7 @@ void GSDrawScanlineCodeGenerator::Generate_AVX()
}
#endif
mov(r10, (size_t)&m_test[0]);
mov(r10, (size_t)g_const->m_test_128b[0]);
if (!m_rip)
{
mov(_m_local, (size_t)&m_local);

View File

@ -103,7 +103,7 @@ void GSDrawScanlineCodeGenerator::Generate()
}
#endif
mov(r10, (size_t)&m_test[0]);
mov(r10, (size_t)g_const->m_test_256b[0]);
if (!m_rip)
{
mov(_m_local, (size_t)&m_local);
@ -363,8 +363,8 @@ void GSDrawScanlineCodeGenerator::Init()
sar(eax, 31);
and(eax, ecx);
vpmovsxbd(ymm7, ptr[edx * 8 + (size_t)&m_test[0]]);
vpmovsxbd(ymm0, ptr[eax * 8 + (size_t)&m_test[15]]);
vpmovsxbd(ymm7, ptr[edx * 8 + (size_t)g_const->m_test_256b[0]]);
vpmovsxbd(ymm0, ptr[eax * 8 + (size_t)g_const->m_test_256b[15]]);
vpor(ymm7, ymm0);
shl(edx, 5);
@ -683,7 +683,7 @@ void GSDrawScanlineCodeGenerator::Step()
sar(edx, 31);
and(edx, ecx);
vpmovsxbd(ymm7, ptr[edx * 8 + (size_t)&m_test[15]]);
vpmovsxbd(ymm7, ptr[edx * 8 + (size_t)g_const->m_test_256b[15]]);
}
}
@ -1255,31 +1255,31 @@ void GSDrawScanlineCodeGenerator::SampleTextureLOD()
vpslld(ymm0, ymm4, 1);
vpsrld(ymm0, ymm0, 24);
vpsubd(ymm0, ymm1);
vcvtdq2ps(ymm0, ymm0);
vcvtdq2ps(ymm0, ymm0);
// ymm0 = (float)(exp(q) - 127)
vpslld(ymm4, ymm4, 9);
vpsrld(ymm4, ymm4, 9);
vorps(ymm4, ptr[&GSDrawScanlineCodeGenerator::m_log2_coef[3]]);
vorps(ymm4, ptr[g_const->m_log2_coef_256b[3]]);
// ymm4 = mant(q) | 1.0f
if(m_cpu.has(util::Cpu::tFMA))
{
vmovaps(ymm5, ptr[&GSDrawScanlineCodeGenerator::m_log2_coef[0]]); // c0
vfmadd213ps(ymm5, ymm4, ptr[&GSDrawScanlineCodeGenerator::m_log2_coef[1]]); // c0 * ymm4 + c1
vfmadd213ps(ymm5, ymm4, ptr[&GSDrawScanlineCodeGenerator::m_log2_coef[2]]); // (c0 * ymm4 + c1) * ymm4 + c2
vsubps(ymm4, ptr[&GSDrawScanlineCodeGenerator::m_log2_coef[3]]); // ymm4 - 1.0f
vmovaps(ymm5, ptr[g_const->m_log2_coef_256b[0]]); // c0
vfmadd213ps(ymm5, ymm4, ptr[g_const->m_log2_coef_256b[1]]); // c0 * ymm4 + c1
vfmadd213ps(ymm5, ymm4, ptr[g_const->m_log2_coef_256b[2]]); // (c0 * ymm4 + c1) * ymm4 + c2
vsubps(ymm4, ptr[g_const->m_log2_coef_256b[3]]); // ymm4 - 1.0f
vfmadd213ps(ymm4, ymm5, ymm0); // ((c0 * ymm4 + c1) * ymm4 + c2) * (ymm4 - 1.0f) + ymm0
}
else
{
vmulps(ymm5, ymm4, ptr[&GSDrawScanlineCodeGenerator::m_log2_coef[0]]);
vaddps(ymm5, ptr[&GSDrawScanlineCodeGenerator::m_log2_coef[1]]);
vmulps(ymm5, ymm4, ptr[g_const->m_log2_coef_256b[0]]);
vaddps(ymm5, ptr[g_const->m_log2_coef_256b[1]]);
vmulps(ymm5, ymm4);
vsubps(ymm4, ptr[&GSDrawScanlineCodeGenerator::m_log2_coef[3]]);
vaddps(ymm5, ptr[&GSDrawScanlineCodeGenerator::m_log2_coef[2]]);
vsubps(ymm4, ptr[g_const->m_log2_coef_256b[3]]);
vaddps(ymm5, ptr[g_const->m_log2_coef_256b[2]]);
vmulps(ymm4, ymm5);
vaddps(ymm4, ymm0);
}
@ -1289,7 +1289,7 @@ void GSDrawScanlineCodeGenerator::SampleTextureLOD()
if(m_cpu.has(util::Cpu::tFMA))
{
vmovaps(ymm5, ptr[&m_local.gd->l]);
vfmadd213ps(ymm4, ymm5, ptr[&m_local.gd->k]);
vfmadd213ps(ymm4, ymm5, ptr[&m_local.gd->k]);
}
else
{

View File

@ -269,14 +269,14 @@ void GSDrawScanlineCodeGenerator::Init_AVX()
shl(edx, 4);
vmovdqa(xmm7, ptr[edx + (size_t)&m_test[0]]);
vmovdqa(xmm7, ptr[edx + (size_t)g_const->m_test_128b[0]]);
mov(eax, ecx);
sar(eax, 31);
and(eax, ecx);
shl(eax, 4);
vpor(xmm7, ptr[eax + (size_t)&m_test[7]]);
vpor(xmm7, ptr[eax + (size_t)g_const->m_test_128b[7]]);
}
else
{
@ -591,7 +591,7 @@ void GSDrawScanlineCodeGenerator::Step_AVX()
and(edx, ecx);
shl(edx, 4);
vmovdqa(xmm7, ptr[edx + (size_t)&m_test[7]]);
vmovdqa(xmm7, ptr[edx + (size_t)g_const->m_test_128b[7]]);
}
}
@ -1141,31 +1141,31 @@ void GSDrawScanlineCodeGenerator::SampleTextureLOD_AVX()
vpslld(xmm0, xmm4, 1);
vpsrld(xmm0, xmm0, 24);
vpsubd(xmm0, xmm1);
vcvtdq2ps(xmm0, xmm0);
vcvtdq2ps(xmm0, xmm0);
// xmm0 = (float)(exp(q) - 127)
vpslld(xmm4, xmm4, 9);
vpsrld(xmm4, xmm4, 9);
vorps(xmm4, ptr[&GSDrawScanlineCodeGenerator::m_log2_coef[3]]);
vorps(xmm4, ptr[g_const->m_log2_coef_128b[3]]);
// xmm4 = mant(q) | 1.0f
if(m_cpu.has(util::Cpu::tFMA))
{
vmovaps(xmm5, ptr[&GSDrawScanlineCodeGenerator::m_log2_coef[0]]); // c0
vfmadd213ps(xmm5, xmm4, ptr[&GSDrawScanlineCodeGenerator::m_log2_coef[1]]); // c0 * xmm4 + c1
vfmadd213ps(xmm5, xmm4, ptr[&GSDrawScanlineCodeGenerator::m_log2_coef[2]]); // (c0 * xmm4 + c1) * xmm4 + c2
vsubps(xmm4, ptr[&GSDrawScanlineCodeGenerator::m_log2_coef[3]]); // xmm4 - 1.0f
vmovaps(xmm5, ptr[g_const->m_log2_coef_128b[0]]); // c0
vfmadd213ps(xmm5, xmm4, ptr[g_const->m_log2_coef_128b[1]]); // c0 * xmm4 + c1
vfmadd213ps(xmm5, xmm4, ptr[g_const->m_log2_coef_128b[2]]); // (c0 * xmm4 + c1) * xmm4 + c2
vsubps(xmm4, ptr[g_const->m_log2_coef_128b[3]]); // xmm4 - 1.0f
vfmadd213ps(xmm4, xmm5, xmm0); // ((c0 * xmm4 + c1) * xmm4 + c2) * (xmm4 - 1.0f) + xmm0
}
else
{
vmulps(xmm5, xmm4, ptr[&GSDrawScanlineCodeGenerator::m_log2_coef[0]]);
vaddps(xmm5, ptr[&GSDrawScanlineCodeGenerator::m_log2_coef[1]]);
vmulps(xmm5, xmm4, ptr[g_const->m_log2_coef_128b[0]]);
vaddps(xmm5, ptr[g_const->m_log2_coef_128b[1]]);
vmulps(xmm5, xmm4);
vsubps(xmm4, ptr[&GSDrawScanlineCodeGenerator::m_log2_coef[3]]);
vaddps(xmm5, ptr[&GSDrawScanlineCodeGenerator::m_log2_coef[2]]);
vsubps(xmm4, ptr[g_const->m_log2_coef_128b[3]]);
vaddps(xmm5, ptr[g_const->m_log2_coef_128b[2]]);
vmulps(xmm4, xmm5);
vaddps(xmm4, xmm0);
}
@ -1175,7 +1175,7 @@ void GSDrawScanlineCodeGenerator::SampleTextureLOD_AVX()
if(m_cpu.has(util::Cpu::tFMA))
{
vmovaps(xmm5, ptr[&m_local.gd->l]);
vfmadd213ps(xmm4, xmm5, ptr[&m_local.gd->k]);
vfmadd213ps(xmm4, xmm5, ptr[&m_local.gd->k]);
}
else
{

View File

@ -268,13 +268,13 @@ void GSDrawScanlineCodeGenerator::Init()
sub(ebx, edx);
// GSVector4i test = m_test[skip] | m_test[15 + (steps & (steps >> 31))];
mov(eax, ecx);
sar(eax, 31);
and(eax, ecx);
vpmovsxbd(ymm7, ptr[edx * 8 + (size_t)&m_test[0]]);
vpmovsxbd(ymm0, ptr[eax * 8 + (size_t)&m_test[15]]);
vpmovsxbd(ymm7, ptr[edx * 8 + (size_t)g_const->m_test_256b[0]]);
vpmovsxbd(ymm0, ptr[eax * 8 + (size_t)g_const->m_test_256b[15]]);
vpor(ymm7, ymm0);
shl(edx, 5);
@ -593,7 +593,7 @@ void GSDrawScanlineCodeGenerator::Step()
sar(edx, 31);
and(edx, ecx);
vpmovsxbd(ymm7, ptr[edx * 8 + (size_t)&m_test[15]]);
vpmovsxbd(ymm7, ptr[edx * 8 + (size_t)g_const->m_test_256b[15]]);
}
}
@ -1157,31 +1157,31 @@ void GSDrawScanlineCodeGenerator::SampleTextureLOD()
vpslld(ymm0, ymm4, 1);
vpsrld(ymm0, ymm0, 24);
vpsubd(ymm0, ymm1);
vcvtdq2ps(ymm0, ymm0);
vcvtdq2ps(ymm0, ymm0);
// ymm0 = (float)(exp(q) - 127)
vpslld(ymm4, ymm4, 9);
vpsrld(ymm4, ymm4, 9);
vorps(ymm4, ptr[&GSDrawScanlineCodeGenerator::m_log2_coef[3]]);
vorps(ymm4, ptr[g_const->m_log2_coef_256b[3]]);
// ymm4 = mant(q) | 1.0f
if(m_cpu.has(util::Cpu::tFMA))
{
vmovaps(ymm5, ptr[&GSDrawScanlineCodeGenerator::m_log2_coef[0]]); // c0
vfmadd213ps(ymm5, ymm4, ptr[&GSDrawScanlineCodeGenerator::m_log2_coef[1]]); // c0 * ymm4 + c1
vfmadd213ps(ymm5, ymm4, ptr[&GSDrawScanlineCodeGenerator::m_log2_coef[2]]); // (c0 * ymm4 + c1) * ymm4 + c2
vsubps(ymm4, ptr[&GSDrawScanlineCodeGenerator::m_log2_coef[3]]); // ymm4 - 1.0f
vmovaps(ymm5, ptr[g_const->m_log2_coef_256b[0]]); // c0
vfmadd213ps(ymm5, ymm4, ptr[g_const->m_log2_coef_256b[1]]); // c0 * ymm4 + c1
vfmadd213ps(ymm5, ymm4, ptr[g_const->m_log2_coef_256b[2]]); // (c0 * ymm4 + c1) * ymm4 + c2
vsubps(ymm4, ptr[g_const->m_log2_coef_256b[3]]); // ymm4 - 1.0f
vfmadd213ps(ymm4, ymm5, ymm0); // ((c0 * ymm4 + c1) * ymm4 + c2) * (ymm4 - 1.0f) + ymm0
}
else
{
vmulps(ymm5, ymm4, ptr[&GSDrawScanlineCodeGenerator::m_log2_coef[0]]);
vaddps(ymm5, ptr[&GSDrawScanlineCodeGenerator::m_log2_coef[1]]);
vmulps(ymm5, ymm4, ptr[g_const->m_log2_coef_256b[0]]);
vaddps(ymm5, ptr[g_const->m_log2_coef_256b[1]]);
vmulps(ymm5, ymm4);
vsubps(ymm4, ptr[&GSDrawScanlineCodeGenerator::m_log2_coef[3]]);
vaddps(ymm5, ptr[&GSDrawScanlineCodeGenerator::m_log2_coef[2]]);
vsubps(ymm4, ptr[g_const->m_log2_coef_256b[3]]);
vaddps(ymm5, ptr[g_const->m_log2_coef_256b[2]]);
vmulps(ymm4, ymm5);
vaddps(ymm4, ymm0);
}
@ -1191,7 +1191,7 @@ void GSDrawScanlineCodeGenerator::SampleTextureLOD()
if(m_cpu.has(util::Cpu::tFMA))
{
vmovaps(ymm5, ptr[&m_local.gd->l]);
vfmadd213ps(ymm4, ymm5, ptr[&m_local.gd->k]);
vfmadd213ps(ymm4, ymm5, ptr[&m_local.gd->k]);
}
else
{

View File

@ -269,14 +269,14 @@ void GSDrawScanlineCodeGenerator::Init_SSE()
shl(edx, 4);
movdqa(xmm7, ptr[edx + (size_t)&m_test[0]]);
movdqa(xmm7, ptr[edx + (size_t)g_const->m_test_128b[0]]);
mov(eax, ecx);
sar(eax, 31);
and(eax, ecx);
shl(eax, 4);
por(xmm7, ptr[eax + (size_t)&m_test[7]]);
por(xmm7, ptr[eax + (size_t)g_const->m_test_128b[7]]);
}
else
{
@ -596,7 +596,7 @@ void GSDrawScanlineCodeGenerator::Step_SSE()
and(edx, ecx);
shl(edx, 4);
movdqa(xmm7, ptr[edx + (size_t)&m_test[7]]);
movdqa(xmm7, ptr[edx + (size_t)g_const->m_test_128b[7]]);
}
}
@ -1183,22 +1183,22 @@ void GSDrawScanlineCodeGenerator::SampleTextureLOD_SSE()
pslld(xmm0, 1);
psrld(xmm0, 24);
psubd(xmm0, xmm1);
cvtdq2ps(xmm0, xmm0);
cvtdq2ps(xmm0, xmm0);
// xmm0 = (float)(exp(q) - 127)
pslld(xmm4, 9);
psrld(xmm4, 9);
orps(xmm4, ptr[&GSDrawScanlineCodeGenerator::m_log2_coef[3]]);
orps(xmm4, ptr[g_const->m_log2_coef_128b[3]]);
// xmm4 = mant(q) | 1.0f
movdqa(xmm5, xmm4);
mulps(xmm5, ptr[&GSDrawScanlineCodeGenerator::m_log2_coef[0]]);
addps(xmm5, ptr[&GSDrawScanlineCodeGenerator::m_log2_coef[1]]);
mulps(xmm5, ptr[g_const->m_log2_coef_128b[0]]);
addps(xmm5, ptr[g_const->m_log2_coef_128b[1]]);
mulps(xmm5, xmm4);
subps(xmm4, ptr[&GSDrawScanlineCodeGenerator::m_log2_coef[3]]);
addps(xmm5, ptr[&GSDrawScanlineCodeGenerator::m_log2_coef[2]]);
subps(xmm4, ptr[g_const->m_log2_coef_128b[3]]);
addps(xmm5, ptr[g_const->m_log2_coef_128b[2]]);
mulps(xmm4, xmm5);
addps(xmm4, xmm0);

View File

@ -24,46 +24,6 @@
using namespace Xbyak;
// Static storage for the per-lane shift table; the element type and row count
// depend on the _M_SSE level (9 rows of GSVector8, otherwise 5 rows of GSVector4).
// The contents are written by InitVectors().
#if _M_SSE >= 0x501
GSVector8 GSSetupPrimCodeGenerator::m_shift[9];
#else
GSVector4 GSSetupPrimCodeGenerator::m_shift[5];
#endif
// Fills the static m_shift table.
// Row 0 holds the lane count in every lane (8 or 4); row r (r >= 1) holds
// the lane indices 0..N-1 offset by -(r - 1).
void GSSetupPrimCodeGenerator::InitVectors()
{
#if _M_SSE >= 0x501
	m_shift[0] = GSVector8(8.0f, 8.0f, 8.0f, 8.0f, 8.0f, 8.0f, 8.0f, 8.0f);
	m_shift[1] = GSVector8(0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f);
	m_shift[2] = GSVector8(-1.0f, 0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f);
	m_shift[3] = GSVector8(-2.0f, -1.0f, 0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f);
	m_shift[4] = GSVector8(-3.0f, -2.0f, -1.0f, 0.0f, 1.0f, 2.0f, 3.0f, 4.0f);
	m_shift[5] = GSVector8(-4.0f, -3.0f, -2.0f, -1.0f, 0.0f, 1.0f, 2.0f, 3.0f);
	m_shift[6] = GSVector8(-5.0f, -4.0f, -3.0f, -2.0f, -1.0f, 0.0f, 1.0f, 2.0f);
	m_shift[7] = GSVector8(-6.0f, -5.0f, -4.0f, -3.0f, -2.0f, -1.0f, 0.0f, 1.0f);
	m_shift[8] = GSVector8(-7.0f, -6.0f, -5.0f, -4.0f, -3.0f, -2.0f, -1.0f, 0.0f);
#else
	m_shift[0] = GSVector4(4.0f, 4.0f, 4.0f, 4.0f);
	m_shift[1] = GSVector4(0.0f, 1.0f, 2.0f, 3.0f);
	m_shift[2] = GSVector4(-1.0f, 0.0f, 1.0f, 2.0f);
	m_shift[3] = GSVector4(-2.0f, -1.0f, 0.0f, 1.0f);
	m_shift[4] = GSVector4(-3.0f, -2.0f, -1.0f, 0.0f);
#endif
}
GSSetupPrimCodeGenerator::GSSetupPrimCodeGenerator(void* param, uint64 key, void* code, size_t maxsize)
: GSCodeGenerator(code, maxsize)
, m_local(*(GSScanlineLocalData*)param)

View File

@ -54,12 +54,4 @@ class GSSetupPrimCodeGenerator : public GSCodeGenerator
public:
GSSetupPrimCodeGenerator(void* param, uint64 key, void* code, size_t maxsize);
#if _M_SSE >= 0x501
static GSVector8 m_shift[9];
#else
static GSVector4 m_shift[5];
#endif
static void InitVectors();
};

View File

@ -48,7 +48,7 @@ void GSSetupPrimCodeGenerator::Generate_AVX()
if((m_en.z || m_en.f) && m_sel.prim != GS_SPRITE_CLASS || m_en.t || m_en.c && m_sel.iip)
{
mov(rax, (size_t)&m_shift[0]);
mov(rax, (size_t)g_const->m_shift_128b);
for(int i = 0; i < (m_sel.notest ? 2 : 5); i++)
{

View File

@ -53,7 +53,7 @@ void GSSetupPrimCodeGenerator::Generate_AVX2()
if((m_en.z || m_en.f) && m_sel.prim != GS_SPRITE_CLASS || m_en.t || m_en.c && m_sel.iip)
{
mov(rax, (size_t)&m_shift[0]);
mov(rax, (size_t)g_const->m_shift_256b);
for(int i = 0; i < (m_sel.notest ? 2 : 9); i++)
{

View File

@ -40,7 +40,7 @@ void GSSetupPrimCodeGenerator::Generate_SSE()
if((m_en.z || m_en.f) && m_sel.prim != GS_SPRITE_CLASS || m_en.t || m_en.c && m_sel.iip)
{
mov(rax, (size_t)&m_shift[0]);
mov(rax, (size_t)g_const->m_shift_128b[0]);
for(int i = 0; i < (m_sel.notest ? 2 : 5); i++)
{

View File

@ -40,7 +40,7 @@ void GSSetupPrimCodeGenerator::Generate_AVX()
for(int i = 0; i < (m_sel.notest ? 2 : 5); i++)
{
vmovaps(Xmm(3 + i), ptr[&m_shift[i]]);
vmovaps(Xmm(3 + i), ptr[g_const->m_shift_128b[i]]);
}
}

View File

@ -40,7 +40,7 @@ void GSSetupPrimCodeGenerator::Generate_AVX2()
for(int i = 0; i < (m_sel.notest ? 2 : 5); i++)
{
vmovaps(Ymm(3 + i), ptr[&m_shift[i]]);
vmovaps(Ymm(3 + i), ptr[g_const->m_shift_256b[i]]);
}
}
@ -104,7 +104,7 @@ void GSSetupPrimCodeGenerator::Depth_AVX2()
// m_local.d[i].z = dz * shift[1 + i];
if(i < 4) vmulps(ymm0, ymm2, Ymm(4 + i));
else vmulps(ymm0, ymm2, ptr[&m_shift[i + 1]]);
else vmulps(ymm0, ymm2, ptr[g_const->m_shift_256b[i + 1]]);
vmovaps(ptr[&m_local.d[i].z], ymm0);
}
@ -113,7 +113,7 @@ void GSSetupPrimCodeGenerator::Depth_AVX2()
// m_local.d[i].f = GSVector8i(df * m_shift[i]).xxzzlh();
if(i < 4) vmulps(ymm0, ymm1, Ymm(4 + i));
else vmulps(ymm0, ymm1, ptr[&m_shift[i + 1]]);
else vmulps(ymm0, ymm1, ptr[g_const->m_shift_256b[i + 1]]);
vcvttps2dq(ymm0, ymm0);
vpshuflw(ymm0, ymm0, _MM_SHUFFLE(2, 2, 0, 0));
vpshufhw(ymm0, ymm0, _MM_SHUFFLE(2, 2, 0, 0));
@ -190,7 +190,7 @@ void GSSetupPrimCodeGenerator::Texture_AVX2()
// GSVector8 v = dstq * shift[1 + i];
if(i < 4) vmulps(ymm2, ymm1, Ymm(4 + i));
else vmulps(ymm2, ymm1, ptr[&m_shift[i + 1]]);
else vmulps(ymm2, ymm1, ptr[g_const->m_shift_256b[i + 1]]);
if(m_sel.fst)
{
@ -253,14 +253,14 @@ void GSSetupPrimCodeGenerator::Color_AVX2()
// GSVector8i r = GSVector8i(dr * shift[1 + i]).ps32();
if(i < 4) vmulps(ymm0, ymm2, Ymm(4 + i));
else vmulps(ymm0, ymm2, ptr[&m_shift[i + 1]]);
else vmulps(ymm0, ymm2, ptr[g_const->m_shift_256b[i + 1]]);
vcvttps2dq(ymm0, ymm0);
vpackssdw(ymm0, ymm0);
// GSVector4i b = GSVector8i(db * shift[1 + i]).ps32();
if(i < 4) vmulps(ymm1, ymm3, Ymm(4 + i));
else vmulps(ymm1, ymm3, ptr[&m_shift[i + 1]]);
else vmulps(ymm1, ymm3, ptr[g_const->m_shift_256b[i + 1]]);
vcvttps2dq(ymm1, ymm1);
vpackssdw(ymm1, ymm1);
@ -285,14 +285,14 @@ void GSSetupPrimCodeGenerator::Color_AVX2()
// GSVector8i g = GSVector8i(dg * shift[1 + i]).ps32();
if(i < 4) vmulps(ymm0, ymm2, Ymm(4 + i));
else vmulps(ymm0, ymm2, ptr[&m_shift[i + 1]]);
else vmulps(ymm0, ymm2, ptr[g_const->m_shift_256b[i + 1]]);
vcvttps2dq(ymm0, ymm0);
vpackssdw(ymm0, ymm0);
// GSVector8i a = GSVector8i(da * shift[1 + i]).ps32();
if(i < 4) vmulps(ymm1, ymm3, Ymm(4 + i));
else vmulps(ymm1, ymm3, ptr[&m_shift[i + 1]]);
else vmulps(ymm1, ymm3, ptr[g_const->m_shift_256b[i + 1]]);
vcvttps2dq(ymm1, ymm1);
vpackssdw(ymm1, ymm1);

View File

@ -40,7 +40,7 @@ void GSSetupPrimCodeGenerator::Generate_SSE()
for(int i = 0; i < (m_sel.notest ? 2 : 5); i++)
{
movaps(Xmm(3 + i), ptr[&m_shift[i]]);
movaps(Xmm(3 + i), ptr[g_const->m_shift_128b[i]]);
}
}