gsdx: Use constexpr to initialize avx vectors without avx instructions

This commit is contained in:
TellowKrinkle 2021-01-11 23:25:27 -06:00 committed by tellowkrinkle
parent 8117df5644
commit 862518e7da
14 changed files with 382 additions and 374 deletions

View File

@ -125,18 +125,6 @@ EXPORT_C_(int) GSinit()
theApp.Init();
GSUtil::Init();
GSBlock::InitVectors();
GSClut::InitVectors();
GSRendererSW::InitVectors();
GSVector4i::InitVectors();
GSVector4::InitVectors();
#if _M_SSE >= 0x500
GSVector8::InitVectors();
#endif
#if _M_SSE >= 0x501
GSVector8i::InitVectors();
#endif
GSVertexTrace::InitVectors();
if (g_const == nullptr)
return -1;

View File

@ -23,54 +23,26 @@
#include "GSBlock.h"
#if _M_SSE >= 0x501
GSVector8i GSBlock::m_r16mask;
const GSVector8i GSBlock::m_r16mask(0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15, 0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15);
#else
GSVector4i GSBlock::m_r16mask;
const GSVector4i GSBlock::m_r16mask(0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15);
#endif
GSVector4i GSBlock::m_r8mask;
GSVector4i GSBlock::m_r4mask;
const GSVector4i GSBlock::m_r8mask(0, 4, 2, 6, 8, 12, 10, 14, 1, 5, 3, 7, 9, 13, 11, 15);
const GSVector4i GSBlock::m_r4mask(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15);
#if _M_SSE >= 0x501
GSVector8i GSBlock::m_xxxa;
GSVector8i GSBlock::m_xxbx;
GSVector8i GSBlock::m_xgxx;
GSVector8i GSBlock::m_rxxx;
const GSVector8i GSBlock::m_xxxa(GSVector8i::cxpr_set1_epi32(0x00008000));
const GSVector8i GSBlock::m_xxbx(GSVector8i::cxpr_set1_epi32(0x00007c00));
const GSVector8i GSBlock::m_xgxx(GSVector8i::cxpr_set1_epi32(0x000003e0));
const GSVector8i GSBlock::m_rxxx(GSVector8i::cxpr_set1_epi32(0x0000001f));
#else
GSVector4i GSBlock::m_xxxa;
GSVector4i GSBlock::m_xxbx;
GSVector4i GSBlock::m_xgxx;
GSVector4i GSBlock::m_rxxx;
const GSVector4i GSBlock::m_xxxa(GSVector4i::cxpr_set1_epi32(0x00008000));
const GSVector4i GSBlock::m_xxbx(GSVector4i::cxpr_set1_epi32(0x00007c00));
const GSVector4i GSBlock::m_xgxx(GSVector4i::cxpr_set1_epi32(0x000003e0));
const GSVector4i GSBlock::m_rxxx(GSVector4i::cxpr_set1_epi32(0x0000001f));
#endif
GSVector4i GSBlock::m_uw8hmask0;
GSVector4i GSBlock::m_uw8hmask1;
GSVector4i GSBlock::m_uw8hmask2;
GSVector4i GSBlock::m_uw8hmask3;
void GSBlock::InitVectors()
{
#if _M_SSE >= 0x501
m_r16mask = GSVector8i(0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15, 0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15);
#else
m_r16mask = GSVector4i(0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15);
#endif
m_r8mask = GSVector4i(0, 4, 2, 6, 8, 12, 10, 14, 1, 5, 3, 7, 9, 13, 11, 15);
m_r4mask = GSVector4i(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15);
#if _M_SSE >= 0x501
m_xxxa = GSVector8i(0x00008000);
m_xxbx = GSVector8i(0x00007c00);
m_xgxx = GSVector8i(0x000003e0);
m_rxxx = GSVector8i(0x0000001f);
#else
m_xxxa = GSVector4i(0x00008000);
m_xxbx = GSVector4i(0x00007c00);
m_xgxx = GSVector4i(0x000003e0);
m_rxxx = GSVector4i(0x0000001f);
#endif
m_uw8hmask0 = GSVector4i(0, 0, 0, 0, 1, 1, 1, 1, 8, 8, 8, 8, 9, 9, 9, 9);
m_uw8hmask1 = GSVector4i(2, 2, 2, 2, 3, 3, 3, 3, 10, 10, 10, 10, 11, 11, 11, 11);
m_uw8hmask2 = GSVector4i(4, 4, 4, 4, 5, 5, 5, 5, 12, 12, 12, 12, 13, 13, 13, 13);
m_uw8hmask3 = GSVector4i(6, 6, 6, 6, 7, 7, 7, 7, 14, 14, 14, 14, 15, 15, 15, 15);
}
const GSVector4i GSBlock::m_uw8hmask0(0, 0, 0, 0, 1, 1, 1, 1, 8, 8, 8, 8, 9, 9, 9, 9);
const GSVector4i GSBlock::m_uw8hmask1(2, 2, 2, 2, 3, 3, 3, 3, 10, 10, 10, 10, 11, 11, 11, 11);
const GSVector4i GSBlock::m_uw8hmask2(4, 4, 4, 4, 5, 5, 5, 5, 12, 12, 12, 12, 13, 13, 13, 13);
const GSVector4i GSBlock::m_uw8hmask3(6, 6, 6, 6, 7, 7, 7, 7, 14, 14, 14, 14, 15, 15, 15, 15);

View File

@ -28,33 +28,31 @@
class GSBlock
{
#if _M_SSE >= 0x501
static GSVector8i m_r16mask;
static const GSVector8i m_r16mask;
#else
static GSVector4i m_r16mask;
static const GSVector4i m_r16mask;
#endif
static GSVector4i m_r8mask;
static GSVector4i m_r4mask;
static const GSVector4i m_r8mask;
static const GSVector4i m_r4mask;
#if _M_SSE >= 0x501
static GSVector8i m_xxxa;
static GSVector8i m_xxbx;
static GSVector8i m_xgxx;
static GSVector8i m_rxxx;
static const GSVector8i m_xxxa;
static const GSVector8i m_xxbx;
static const GSVector8i m_xgxx;
static const GSVector8i m_rxxx;
#else
static GSVector4i m_xxxa;
static GSVector4i m_xxbx;
static GSVector4i m_xgxx;
static GSVector4i m_rxxx;
static const GSVector4i m_xxxa;
static const GSVector4i m_xxbx;
static const GSVector4i m_xgxx;
static const GSVector4i m_rxxx;
#endif
static GSVector4i m_uw8hmask0;
static GSVector4i m_uw8hmask1;
static GSVector4i m_uw8hmask2;
static GSVector4i m_uw8hmask3;
static const GSVector4i m_uw8hmask0;
static const GSVector4i m_uw8hmask1;
static const GSVector4i m_uw8hmask2;
static const GSVector4i m_uw8hmask3;
public:
static void InitVectors();
template<int i, int alignment, uint32 mask> __forceinline static void WriteColumn32(uint8* RESTRICT dst, const uint8* RESTRICT src, int srcpitch)
{
const uint8* RESTRICT s0 = &src[srcpitch * 0];

View File

@ -731,16 +731,9 @@ __forceinline void GSClut::ExpandCLUT64_T16(const GSVector4i& hi, const GSVector
// TODO
GSVector4i GSClut::m_bm;
GSVector4i GSClut::m_gm;
GSVector4i GSClut::m_rm;
void GSClut::InitVectors()
{
m_bm = GSVector4i(0x00007c00);
m_gm = GSVector4i(0x000003e0);
m_rm = GSVector4i(0x0000001f);
}
const GSVector4i GSClut::m_bm(GSVector4i::cxpr_set1_epi32(0x00007c00));
const GSVector4i GSClut::m_gm(GSVector4i::cxpr_set1_epi32(0x000003e0));
const GSVector4i GSClut::m_rm(GSVector4i::cxpr_set1_epi32(0x0000001f));
void GSClut::Expand16(const uint16* RESTRICT src, uint32* RESTRICT dst, int w, const GIFRegTEXA& TEXA)
{

View File

@ -30,9 +30,9 @@ class GSLocalMemory;
class alignas(32) GSClut : public GSAlignedClass<32>
{
static GSVector4i m_bm;
static GSVector4i m_gm;
static GSVector4i m_rm;
static const GSVector4i m_bm;
static const GSVector4i m_gm;
static const GSVector4i m_rm;
GSLocalMemory* m_mem;
@ -97,8 +97,6 @@ class alignas(32) GSClut : public GSAlignedClass<32>
static void Expand16(const uint16* RESTRICT src, uint32* RESTRICT dst, int w, const GIFRegTEXA& TEXA);
public:
static void InitVectors();
GSClut(GSLocalMemory* mem);
virtual ~GSClut();

View File

@ -22,196 +22,146 @@
#include "stdafx.h"
#include "GSVector.h"
GSVector4i GSVector4i::m_xff[17];
GSVector4i GSVector4i::m_x0f[17];
void GSVector4i::InitVectors()
const GSVector4i GSVector4i::m_xff[17] =
{
GSVector4i xff[17] =
{
GSVector4i(0x00000000, 0x00000000, 0x00000000, 0x00000000),
GSVector4i(0x000000ff, 0x00000000, 0x00000000, 0x00000000),
GSVector4i(0x0000ffff, 0x00000000, 0x00000000, 0x00000000),
GSVector4i(0x00ffffff, 0x00000000, 0x00000000, 0x00000000),
GSVector4i(0xffffffff, 0x00000000, 0x00000000, 0x00000000),
GSVector4i(0xffffffff, 0x000000ff, 0x00000000, 0x00000000),
GSVector4i(0xffffffff, 0x0000ffff, 0x00000000, 0x00000000),
GSVector4i(0xffffffff, 0x00ffffff, 0x00000000, 0x00000000),
GSVector4i(0xffffffff, 0xffffffff, 0x00000000, 0x00000000),
GSVector4i(0xffffffff, 0xffffffff, 0x000000ff, 0x00000000),
GSVector4i(0xffffffff, 0xffffffff, 0x0000ffff, 0x00000000),
GSVector4i(0xffffffff, 0xffffffff, 0x00ffffff, 0x00000000),
GSVector4i(0xffffffff, 0xffffffff, 0xffffffff, 0x00000000),
GSVector4i(0xffffffff, 0xffffffff, 0xffffffff, 0x000000ff),
GSVector4i(0xffffffff, 0xffffffff, 0xffffffff, 0x0000ffff),
GSVector4i(0xffffffff, 0xffffffff, 0xffffffff, 0x00ffffff),
GSVector4i(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff),
};
GSVector4i(0x00000000, 0x00000000, 0x00000000, 0x00000000),
GSVector4i(0x000000ff, 0x00000000, 0x00000000, 0x00000000),
GSVector4i(0x0000ffff, 0x00000000, 0x00000000, 0x00000000),
GSVector4i(0x00ffffff, 0x00000000, 0x00000000, 0x00000000),
GSVector4i(0xffffffff, 0x00000000, 0x00000000, 0x00000000),
GSVector4i(0xffffffff, 0x000000ff, 0x00000000, 0x00000000),
GSVector4i(0xffffffff, 0x0000ffff, 0x00000000, 0x00000000),
GSVector4i(0xffffffff, 0x00ffffff, 0x00000000, 0x00000000),
GSVector4i(0xffffffff, 0xffffffff, 0x00000000, 0x00000000),
GSVector4i(0xffffffff, 0xffffffff, 0x000000ff, 0x00000000),
GSVector4i(0xffffffff, 0xffffffff, 0x0000ffff, 0x00000000),
GSVector4i(0xffffffff, 0xffffffff, 0x00ffffff, 0x00000000),
GSVector4i(0xffffffff, 0xffffffff, 0xffffffff, 0x00000000),
GSVector4i(0xffffffff, 0xffffffff, 0xffffffff, 0x000000ff),
GSVector4i(0xffffffff, 0xffffffff, 0xffffffff, 0x0000ffff),
GSVector4i(0xffffffff, 0xffffffff, 0xffffffff, 0x00ffffff),
GSVector4i(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff),
};
GSVector4i x0f[17] =
{
GSVector4i(0x00000000, 0x00000000, 0x00000000, 0x00000000),
GSVector4i(0x0000000f, 0x00000000, 0x00000000, 0x00000000),
GSVector4i(0x00000f0f, 0x00000000, 0x00000000, 0x00000000),
GSVector4i(0x000f0f0f, 0x00000000, 0x00000000, 0x00000000),
GSVector4i(0x0f0f0f0f, 0x00000000, 0x00000000, 0x00000000),
GSVector4i(0x0f0f0f0f, 0x0000000f, 0x00000000, 0x00000000),
GSVector4i(0x0f0f0f0f, 0x00000f0f, 0x00000000, 0x00000000),
GSVector4i(0x0f0f0f0f, 0x000f0f0f, 0x00000000, 0x00000000),
GSVector4i(0x0f0f0f0f, 0x0f0f0f0f, 0x00000000, 0x00000000),
GSVector4i(0x0f0f0f0f, 0x0f0f0f0f, 0x0000000f, 0x00000000),
GSVector4i(0x0f0f0f0f, 0x0f0f0f0f, 0x00000f0f, 0x00000000),
GSVector4i(0x0f0f0f0f, 0x0f0f0f0f, 0x000f0f0f, 0x00000000),
GSVector4i(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x00000000),
GSVector4i(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0000000f),
GSVector4i(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x00000f0f),
GSVector4i(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x000f0f0f),
GSVector4i(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f),
};
for (size_t n = 0; n < countof(xff); ++n)
m_xff[n] = xff[n];
for (size_t n = 0; n < countof(x0f); ++n)
m_x0f[n] = x0f[n];
}
GSVector4 GSVector4::m_ps0123;
GSVector4 GSVector4::m_ps4567;
GSVector4 GSVector4::m_half;
GSVector4 GSVector4::m_one;
GSVector4 GSVector4::m_two;
GSVector4 GSVector4::m_four;
GSVector4 GSVector4::m_x4b000000;
GSVector4 GSVector4::m_x4f800000;
GSVector4 GSVector4::m_max;
GSVector4 GSVector4::m_min;
void GSVector4::InitVectors()
const GSVector4i GSVector4i::m_x0f[17] =
{
m_ps0123 = GSVector4(0.0f, 1.0f, 2.0f, 3.0f);
m_ps4567 = GSVector4(4.0f, 5.0f, 6.0f, 7.0f);
m_half = GSVector4(0.5f);
m_one = GSVector4(1.0f);
m_two = GSVector4(2.0f);
m_four = GSVector4(4.0f);
m_x4b000000 = GSVector4(_mm_castsi128_ps(_mm_set1_epi32(0x4b000000)));
m_x4f800000 = GSVector4(_mm_castsi128_ps(_mm_set1_epi32(0x4f800000)));
m_max = GSVector4(FLT_MAX);
m_min = GSVector4(FLT_MIN);
}
GSVector4i(0x00000000, 0x00000000, 0x00000000, 0x00000000),
GSVector4i(0x0000000f, 0x00000000, 0x00000000, 0x00000000),
GSVector4i(0x00000f0f, 0x00000000, 0x00000000, 0x00000000),
GSVector4i(0x000f0f0f, 0x00000000, 0x00000000, 0x00000000),
GSVector4i(0x0f0f0f0f, 0x00000000, 0x00000000, 0x00000000),
GSVector4i(0x0f0f0f0f, 0x0000000f, 0x00000000, 0x00000000),
GSVector4i(0x0f0f0f0f, 0x00000f0f, 0x00000000, 0x00000000),
GSVector4i(0x0f0f0f0f, 0x000f0f0f, 0x00000000, 0x00000000),
GSVector4i(0x0f0f0f0f, 0x0f0f0f0f, 0x00000000, 0x00000000),
GSVector4i(0x0f0f0f0f, 0x0f0f0f0f, 0x0000000f, 0x00000000),
GSVector4i(0x0f0f0f0f, 0x0f0f0f0f, 0x00000f0f, 0x00000000),
GSVector4i(0x0f0f0f0f, 0x0f0f0f0f, 0x000f0f0f, 0x00000000),
GSVector4i(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x00000000),
GSVector4i(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0000000f),
GSVector4i(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x00000f0f),
GSVector4i(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x000f0f0f),
GSVector4i(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f),
};
const GSVector4 GSVector4::m_ps0123(cxpr_setr_ps(0.0f, 1.0f, 2.0f, 3.0f));
const GSVector4 GSVector4::m_ps4567(cxpr_setr_ps(4.0f, 5.0f, 6.0f, 7.0f));
const GSVector4 GSVector4::m_half(cxpr_set1_ps(0.5f));
const GSVector4 GSVector4::m_one(cxpr_set1_ps(1.0f));
const GSVector4 GSVector4::m_two(cxpr_set1_ps(2.0f));
const GSVector4 GSVector4::m_four(cxpr_set1_ps(4.0f));
const GSVector4 GSVector4::m_x4b000000(cxpr_set1_epi32(0x4b000000));
const GSVector4 GSVector4::m_x4f800000(cxpr_set1_epi32(0x4f800000));
const GSVector4 GSVector4::m_max(cxpr_set1_ps(FLT_MAX));
const GSVector4 GSVector4::m_min(cxpr_set1_ps(FLT_MIN));
#if _M_SSE >= 0x500
GSVector8 GSVector8::m_half;
GSVector8 GSVector8::m_one;
GSVector8 GSVector8::m_x7fffffff;
GSVector8 GSVector8::m_x80000000;
GSVector8 GSVector8::m_x4b000000;
GSVector8 GSVector8::m_x4f800000;
GSVector8 GSVector8::m_max;
GSVector8 GSVector8::m_min;
void GSVector8::InitVectors()
{
m_half = GSVector8(0.5f);
m_one = GSVector8(1.0f);
m_x7fffffff = GSVector8(_mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff)));
m_x80000000 = GSVector8(_mm256_castsi256_ps(_mm256_set1_epi32(0x80000000)));
m_x4b000000 = GSVector8(_mm256_castsi256_ps(_mm256_set1_epi32(0x4b000000)));
m_x4f800000 = GSVector8(_mm256_castsi256_ps(_mm256_set1_epi32(0x4f800000)));
m_max = GSVector8(FLT_MAX);
m_min = GSVector8(FLT_MIN);
}
// GSVector8 broadcast constants, built with the constexpr helpers so every lane
// holds the same value.
const GSVector8 GSVector8::m_half(cxpr_set1_ps(0.5f));
const GSVector8 GSVector8::m_one(cxpr_set1_ps(1.0f));
// The following four are raw 32-bit patterns replicated into every lane.
const GSVector8 GSVector8::m_x7fffffff(cxpr_set1_epi32(0x7fffffff));
const GSVector8 GSVector8::m_x80000000(cxpr_set1_epi32(0x80000000));
const GSVector8 GSVector8::m_x4b000000(cxpr_set1_epi32(0x4b000000));
const GSVector8 GSVector8::m_x4f800000(cxpr_set1_epi32(0x4f800000));
const GSVector8 GSVector8::m_max(cxpr_set1_ps(FLT_MAX));
// m_min must hold FLT_MIN (matching GSVector4::m_min); using FLT_MAX here would
// make m_min identical to m_max.
const GSVector8 GSVector8::m_min(cxpr_set1_ps(FLT_MIN));
#endif
#if _M_SSE >= 0x501
GSVector8i GSVector8i::m_xff[33];
GSVector8i GSVector8i::m_x0f[33];
void GSVector8i::InitVectors()
const GSVector8i GSVector8i::m_xff[33] =
{
GSVector8i xff[33] =
{
GSVector8i(0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000),
GSVector8i(0x000000ff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000),
GSVector8i(0x0000ffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000),
GSVector8i(0x00ffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000),
GSVector8i(0xffffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000),
GSVector8i(0xffffffff, 0x000000ff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000),
GSVector8i(0xffffffff, 0x0000ffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000),
GSVector8i(0xffffffff, 0x00ffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000),
GSVector8i(0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000),
GSVector8i(0xffffffff, 0xffffffff, 0x000000ff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000),
GSVector8i(0xffffffff, 0xffffffff, 0x0000ffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000),
GSVector8i(0xffffffff, 0xffffffff, 0x00ffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000),
GSVector8i(0xffffffff, 0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000),
GSVector8i(0xffffffff, 0xffffffff, 0xffffffff, 0x000000ff, 0x00000000, 0x00000000, 0x00000000, 0x00000000),
GSVector8i(0xffffffff, 0xffffffff, 0xffffffff, 0x0000ffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000),
GSVector8i(0xffffffff, 0xffffffff, 0xffffffff, 0x00ffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000),
GSVector8i(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000),
GSVector8i(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x000000ff, 0x00000000, 0x00000000, 0x00000000),
GSVector8i(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x0000ffff, 0x00000000, 0x00000000, 0x00000000),
GSVector8i(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00ffffff, 0x00000000, 0x00000000, 0x00000000),
GSVector8i(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0x00000000),
GSVector8i(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x000000ff, 0x00000000, 0x00000000),
GSVector8i(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x0000ffff, 0x00000000, 0x00000000),
GSVector8i(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00ffffff, 0x00000000, 0x00000000),
GSVector8i(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000, 0x00000000),
GSVector8i(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x000000ff, 0x00000000),
GSVector8i(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x0000ffff, 0x00000000),
GSVector8i(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00ffffff, 0x00000000),
GSVector8i(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000),
GSVector8i(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x000000ff),
GSVector8i(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x0000ffff),
GSVector8i(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00ffffff),
GSVector8i(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff),
};
GSVector8i(0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000),
GSVector8i(0x000000ff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000),
GSVector8i(0x0000ffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000),
GSVector8i(0x00ffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000),
GSVector8i(0xffffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000),
GSVector8i(0xffffffff, 0x000000ff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000),
GSVector8i(0xffffffff, 0x0000ffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000),
GSVector8i(0xffffffff, 0x00ffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000),
GSVector8i(0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000),
GSVector8i(0xffffffff, 0xffffffff, 0x000000ff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000),
GSVector8i(0xffffffff, 0xffffffff, 0x0000ffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000),
GSVector8i(0xffffffff, 0xffffffff, 0x00ffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000),
GSVector8i(0xffffffff, 0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000),
GSVector8i(0xffffffff, 0xffffffff, 0xffffffff, 0x000000ff, 0x00000000, 0x00000000, 0x00000000, 0x00000000),
GSVector8i(0xffffffff, 0xffffffff, 0xffffffff, 0x0000ffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000),
GSVector8i(0xffffffff, 0xffffffff, 0xffffffff, 0x00ffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000),
GSVector8i(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000),
GSVector8i(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x000000ff, 0x00000000, 0x00000000, 0x00000000),
GSVector8i(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x0000ffff, 0x00000000, 0x00000000, 0x00000000),
GSVector8i(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00ffffff, 0x00000000, 0x00000000, 0x00000000),
GSVector8i(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0x00000000),
GSVector8i(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x000000ff, 0x00000000, 0x00000000),
GSVector8i(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x0000ffff, 0x00000000, 0x00000000),
GSVector8i(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00ffffff, 0x00000000, 0x00000000),
GSVector8i(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000, 0x00000000),
GSVector8i(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x000000ff, 0x00000000),
GSVector8i(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x0000ffff, 0x00000000),
GSVector8i(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00ffffff, 0x00000000),
GSVector8i(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000),
GSVector8i(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x000000ff),
GSVector8i(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x0000ffff),
GSVector8i(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00ffffff),
GSVector8i(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff),
};
GSVector8i x0f[33] =
{
GSVector8i(0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000),
GSVector8i(0x0000000f, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000),
GSVector8i(0x00000f0f, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000),
GSVector8i(0x000f0f0f, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000),
GSVector8i(0x0f0f0f0f, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000),
GSVector8i(0x0f0f0f0f, 0x0000000f, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000),
GSVector8i(0x0f0f0f0f, 0x00000f0f, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000),
GSVector8i(0x0f0f0f0f, 0x000f0f0f, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000),
GSVector8i(0x0f0f0f0f, 0x0f0f0f0f, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000),
GSVector8i(0x0f0f0f0f, 0x0f0f0f0f, 0x0000000f, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000),
GSVector8i(0x0f0f0f0f, 0x0f0f0f0f, 0x00000f0f, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000),
GSVector8i(0x0f0f0f0f, 0x0f0f0f0f, 0x000f0f0f, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000),
GSVector8i(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000),
GSVector8i(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0000000f, 0x00000000, 0x00000000, 0x00000000, 0x00000000),
GSVector8i(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x00000f0f, 0x00000000, 0x00000000, 0x00000000, 0x00000000),
GSVector8i(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x000f0f0f, 0x00000000, 0x00000000, 0x00000000, 0x00000000),
GSVector8i(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x00000000, 0x00000000, 0x00000000, 0x00000000),
GSVector8i(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0000000f, 0x00000000, 0x00000000, 0x00000000),
GSVector8i(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x00000f0f, 0x00000000, 0x00000000, 0x00000000),
GSVector8i(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x000f0f0f, 0x00000000, 0x00000000, 0x00000000),
GSVector8i(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x00000000, 0x00000000, 0x00000000),
GSVector8i(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0000000f, 0x00000000, 0x00000000),
GSVector8i(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x00000f0f, 0x00000000, 0x00000000),
GSVector8i(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x000f0f0f, 0x00000000, 0x00000000),
GSVector8i(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x00000000, 0x00000000),
GSVector8i(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0000000f, 0x00000000),
GSVector8i(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x00000f0f, 0x00000000),
GSVector8i(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x000f0f0f, 0x00000000),
GSVector8i(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x00000000),
GSVector8i(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0000000f),
GSVector8i(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x00000f0f),
GSVector8i(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x000f0f0f),
GSVector8i(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f),
};
for (size_t n = 0; n < countof(xff); ++n)
m_xff[n] = xff[n];
for (size_t n = 0; n < countof(x0f); ++n)
m_x0f[n] = x0f[n];
}
const GSVector8i GSVector8i::m_x0f[33] =
{
GSVector8i(0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000),
GSVector8i(0x0000000f, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000),
GSVector8i(0x00000f0f, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000),
GSVector8i(0x000f0f0f, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000),
GSVector8i(0x0f0f0f0f, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000),
GSVector8i(0x0f0f0f0f, 0x0000000f, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000),
GSVector8i(0x0f0f0f0f, 0x00000f0f, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000),
GSVector8i(0x0f0f0f0f, 0x000f0f0f, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000),
GSVector8i(0x0f0f0f0f, 0x0f0f0f0f, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000),
GSVector8i(0x0f0f0f0f, 0x0f0f0f0f, 0x0000000f, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000),
GSVector8i(0x0f0f0f0f, 0x0f0f0f0f, 0x00000f0f, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000),
GSVector8i(0x0f0f0f0f, 0x0f0f0f0f, 0x000f0f0f, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000),
GSVector8i(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000),
GSVector8i(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0000000f, 0x00000000, 0x00000000, 0x00000000, 0x00000000),
GSVector8i(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x00000f0f, 0x00000000, 0x00000000, 0x00000000, 0x00000000),
GSVector8i(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x000f0f0f, 0x00000000, 0x00000000, 0x00000000, 0x00000000),
GSVector8i(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x00000000, 0x00000000, 0x00000000, 0x00000000),
GSVector8i(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0000000f, 0x00000000, 0x00000000, 0x00000000),
GSVector8i(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x00000f0f, 0x00000000, 0x00000000, 0x00000000),
GSVector8i(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x000f0f0f, 0x00000000, 0x00000000, 0x00000000),
GSVector8i(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x00000000, 0x00000000, 0x00000000),
GSVector8i(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0000000f, 0x00000000, 0x00000000),
GSVector8i(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x00000f0f, 0x00000000, 0x00000000),
GSVector8i(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x000f0f0f, 0x00000000, 0x00000000),
GSVector8i(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x00000000, 0x00000000),
GSVector8i(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0000000f, 0x00000000),
GSVector8i(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x00000f0f, 0x00000000),
GSVector8i(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x000f0f0f, 0x00000000),
GSVector8i(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x00000000),
GSVector8i(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0000000f),
GSVector8i(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x00000f0f),
GSVector8i(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x000f0f0f),
GSVector8i(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f),
};
#endif
GSVector4i GSVector4i::fit(int arx, int ary) const

View File

@ -22,6 +22,42 @@
class alignas(16) GSVector4
{
public:
// Builds an __m128 from four floats in a constexpr context.
// Arguments are given in element order: x goes to element 0, w to element 3.
constexpr static __m128 cxpr_setr_ps(float x, float y, float z, float w)
{
#ifdef __GNUC__
// GNU-style compilers: __m128 is initialized directly with a vector literal.
return __m128{x, y, z, w};
#else
// Other compilers (presumably MSVC, whose __m128 exposes m128_f32):
// zero-initialize, then fill the float elements one by one.
__m128 m = {};
m.m128_f32[0] = x;
m.m128_f32[1] = y;
m.m128_f32[2] = z;
m.m128_f32[3] = w;
return m;
#endif
}
// Builds an __m128 with the same float x in all four elements (constexpr).
constexpr static __m128 cxpr_set1_ps(float x)
{
return cxpr_setr_ps(x, x, x, x);
}
// Builds an __m128 whose four 32-bit elements are given as unsigned integers,
// in element order (x is element 0, w is element 3). The result type is the
// float vector __m128; the arguments are stored as 32-bit unsigned values.
constexpr static __m128 cxpr_setr_epi32(uint32 x, uint32 y, uint32 z, uint32 w)
{
#ifdef __GNUC__
// GNU-style compilers: build an unsigned-int vector and cast it to __m128.
return (__m128)(__v4su{x, y, z, w});
#else
// Other compilers (presumably MSVC): write through the m128_u32 view of __m128.
__m128 m = {};
m.m128_u32[0] = x;
m.m128_u32[1] = y;
m.m128_u32[2] = z;
m.m128_u32[3] = w;
return m;
#endif
}
// Builds an __m128 with the same 32-bit unsigned value x in all four elements (constexpr).
constexpr static __m128 cxpr_set1_epi32(uint32 x)
{
return cxpr_setr_epi32(x, x, x, x);
}
union
{
struct {float x, y, z, w;};
@ -40,28 +76,24 @@ public:
__m128 m;
};
static GSVector4 m_ps0123;
static GSVector4 m_ps4567;
static GSVector4 m_half;
static GSVector4 m_one;
static GSVector4 m_two;
static GSVector4 m_four;
static GSVector4 m_x4b000000;
static GSVector4 m_x4f800000;
static GSVector4 m_max;
static GSVector4 m_min;
static const GSVector4 m_ps0123;
static const GSVector4 m_ps4567;
static const GSVector4 m_half;
static const GSVector4 m_one;
static const GSVector4 m_two;
static const GSVector4 m_four;
static const GSVector4 m_x4b000000;
static const GSVector4 m_x4f800000;
static const GSVector4 m_max;
static const GSVector4 m_min;
static void InitVectors();
__forceinline GSVector4()
{
}
GSVector4() = default;
constexpr GSVector4(const GSVector4&) = default;
__forceinline GSVector4(float x, float y, float z, float w)
constexpr GSVector4(float x, float y, float z, float w)
: m(cxpr_setr_ps(x, y, z, w))
{
m = _mm_set_ps(w, z, y, x);
}
__forceinline GSVector4(float x, float y)
@ -97,9 +129,9 @@ public:
m = _mm_cvtepi32_ps(_mm_loadl_epi64((__m128i*)&v));
}
__forceinline explicit GSVector4(__m128 m)
constexpr explicit GSVector4(__m128 m)
: m(m)
{
this->m = m;
}
__forceinline explicit GSVector4(float f)

View File

@ -21,10 +21,41 @@
class alignas(16) GSVector4i
{
static GSVector4i m_xff[17];
static GSVector4i m_x0f[17];
static const GSVector4i m_xff[17];
static const GSVector4i m_x0f[17];
public:
// Builds an __m128i from four 32-bit values in a constexpr context.
// Arguments are given in element order: x goes to element 0, w to element 3.
constexpr static __m128i cxpr_setr_epi32(uint32 x, uint32 y, uint32 z, uint32 w)
{
#ifdef __GNUC__
// GNU-style compilers: build an unsigned-int vector and cast it to __m128i.
return (__m128i)(__v4su{x, y, z, w});
#else
// Other compilers (presumably MSVC): write through the m128i_u32 view of __m128i.
__m128i m = {};
m.m128i_u32[0] = x;
m.m128i_u32[1] = y;
m.m128i_u32[2] = z;
m.m128i_u32[3] = w;
return m;
#endif
}
// Builds an __m128i with the same 32-bit value x in all four elements (constexpr).
constexpr static __m128i cxpr_set1_epi32(uint32 x)
{
return cxpr_setr_epi32(x, x, x, x);
}
// Builds an __m128i from sixteen bytes in a constexpr context.
// Arguments are given in element order: b0 goes to byte 0, b15 to byte 15.
constexpr static __m128i cxpr_setr_epi8(uint8 b0, uint8 b1, uint8 b2, uint8 b3, uint8 b4, uint8 b5, uint8 b6, uint8 b7, uint8 b8, uint8 b9, uint8 b10, uint8 b11, uint8 b12, uint8 b13, uint8 b14, uint8 b15)
{
#ifdef __GNUC__
// GNU-style compilers: build an unsigned-char vector and cast it to __m128i.
return (__m128i)__v16qu{b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15};
#else
// Other compilers (presumably MSVC): write through the m128i_u8 view of __m128i.
__m128i m = {};
m.m128i_u8[0] = b0; m.m128i_u8[1] = b1; m.m128i_u8[2] = b2; m.m128i_u8[3] = b3;
m.m128i_u8[4] = b4; m.m128i_u8[5] = b5; m.m128i_u8[6] = b6; m.m128i_u8[7] = b7;
m.m128i_u8[8] = b8; m.m128i_u8[9] = b9; m.m128i_u8[10] = b10; m.m128i_u8[11] = b11;
m.m128i_u8[12] = b12; m.m128i_u8[13] = b13; m.m128i_u8[14] = b14; m.m128i_u8[15] = b15;
return m;
#endif
}
union
{
struct {int x, y, z, w;};
@ -43,28 +74,13 @@ public:
__m128i m;
};
static void InitVectors();
__forceinline GSVector4i()
constexpr GSVector4i(): m(cxpr_set1_epi32(0))
{
x = 0;
y = 0;
z = 0;
w = 0;
}
__forceinline GSVector4i(int x, int y, int z, int w)
constexpr GSVector4i(int x, int y, int z, int w)
: m(cxpr_setr_epi32(x, y, z, w))
{
// 4 gprs
// m = _mm_set_epi32(w, z, y, x);
// 2 gprs
GSVector4i xz = load(x).upl32(load(z));
GSVector4i yw = load(y).upl32(load(w));
*this = xz.upl32(yw);
}
__forceinline GSVector4i(int x, int y)
@ -77,9 +93,9 @@ public:
m = _mm_set_epi16(s7, s6, s5, s4, s3, s2, s1, s0);
}
__forceinline GSVector4i(char b0, char b1, char b2, char b3, char b4, char b5, char b6, char b7, char b8, char b9, char b10, char b11, char b12, char b13, char b14, char b15)
constexpr GSVector4i(char b0, char b1, char b2, char b3, char b4, char b5, char b6, char b7, char b8, char b9, char b10, char b11, char b12, char b13, char b14, char b15)
: m(cxpr_setr_epi8(b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15))
{
m = _mm_set_epi8(b15, b14, b13, b12, b11, b10, b9, b8, b7, b6, b5, b4, b3, b2, b1, b0);
}
__forceinline GSVector4i(const GSVector4i& v)
@ -92,14 +108,15 @@ public:
m = _mm_loadl_epi64((__m128i*)&v);
}
// MSVC generates poor code for the constexpr version when it is applied to non-constexpr arguments (https://godbolt.org/z/h8qbn7), so the non-constexpr version is kept as the default
__forceinline explicit GSVector4i(int i)
{
*this = i;
}
__forceinline explicit GSVector4i(__m128i m)
constexpr explicit GSVector4i(__m128i m)
: m(m)
{
this->m = m;
}
__forceinline explicit GSVector4i(const GSVector4& v, bool truncate = true);

View File

@ -24,6 +24,50 @@
class alignas(32) GSVector8
{
public:
// Packs eight floats into a __m256 usable in constant expressions.
// Arguments are given in element order: x0 lands in element 0, w1 in element 7.
constexpr static __m256 cxpr_setr_ps(float x0, float y0, float z0, float w0, float x1, float y1, float z1, float w1)
{
#ifdef __GNUC__
	// __m256 can be brace-initialised directly on this path.
	return __m256{
		x0, y0, z0, w0,
		x1, y1, z1, w1};
#else
	// Start from a zeroed value and fill the float elements one by one.
	__m256 packed = {};
	packed.m256_f32[0] = x0;
	packed.m256_f32[1] = y0;
	packed.m256_f32[2] = z0;
	packed.m256_f32[3] = w0;
	packed.m256_f32[4] = x1;
	packed.m256_f32[5] = y1;
	packed.m256_f32[6] = z1;
	packed.m256_f32[7] = w1;
	return packed;
#endif
}
// Replicates a single float into all eight elements of a __m256.
constexpr static __m256 cxpr_set1_ps(float x)
{
	const float lane = x;
	return cxpr_setr_ps(lane, lane, lane, lane, lane, lane, lane, lane);
}
// Builds a __m256 whose eight 32-bit elements carry the given integer bit patterns.
// The integers are reinterpreted as float storage, not numerically converted.
// Arguments are given in element order: x0 lands in element 0, w1 in element 7.
constexpr static __m256 cxpr_setr_epi32(uint32 x0, uint32 y0, uint32 z0, uint32 w0, uint32 x1, uint32 y1, uint32 z1, uint32 w1)
{
#ifdef __GNUC__
	// Build an unsigned-integer vector literal and reinterpret it as __m256.
	return (__m256)__v8su{
		x0, y0, z0, w0,
		x1, y1, z1, w1};
#else
	// Write the integers through one union member and read the vector back through the other.
	// NOTE(review): this relies on the non-GNU compiler accepting a read of a union member
	// other than the one last written inside a constexpr function -- confirm.
	union { __m256 m; uint32 u[8]; } pun = {};
	pun.u[0] = x0;
	pun.u[1] = y0;
	pun.u[2] = z0;
	pun.u[3] = w0;
	pun.u[4] = x1;
	pun.u[5] = y1;
	pun.u[6] = z1;
	pun.u[7] = w1;
	return pun.m;
#endif
}
// Replicates a single 32-bit integer bit pattern into all eight elements of a __m256.
constexpr static __m256 cxpr_set1_epi32(uint32 x)
{
	const uint32 lane = x;
	return cxpr_setr_epi32(lane, lane, lane, lane, lane, lane, lane, lane);
}
union
{
struct {float x0, y0, z0, w0, x1, y1, z1, w1;};
@ -42,26 +86,22 @@ public:
__m128 m0, m1;
};
static GSVector8 m_half;
static GSVector8 m_one;
static GSVector8 m_x7fffffff;
static GSVector8 m_x80000000;
static GSVector8 m_x4b000000;
static GSVector8 m_x4f800000;
static GSVector8 m_max;
static GSVector8 m_min;
static const GSVector8 m_half;
static const GSVector8 m_one;
static const GSVector8 m_x7fffffff;
static const GSVector8 m_x80000000;
static const GSVector8 m_x4b000000;
static const GSVector8 m_x4f800000;
static const GSVector8 m_max;
static const GSVector8 m_min;
static void InitVectors();
GSVector8() = default;
__forceinline GSVector8()
constexpr GSVector8(float x0, float y0, float z0, float w0, float x1, float y1, float z1, float w1)
: m(cxpr_setr_ps(x0, y0, z0, w0, x1, y1, z1, w1))
{
}
__forceinline GSVector8(float x0, float y0, float z0, float w0, float x1, float y1, float z1, float w1)
{
m = _mm256_set_ps(w1, z1, y1, x1, w0, z0, y0, x0);
}
__forceinline GSVector8(int x0, int y0, int z0, int w0, int x1, int y1, int z1, int w1)
{
m = _mm256_cvtepi32_ps(_mm256_set_epi32(w1, z1, y1, x1, w0, z0, y0, x0));
@ -80,10 +120,7 @@ public:
#endif
}
__forceinline GSVector8(const GSVector8& v)
{
m = v.m;
}
constexpr GSVector8(const GSVector8& v) = default;
__forceinline explicit GSVector8(float f)
{
@ -110,9 +147,9 @@ public:
*this = m;
}
__forceinline explicit GSVector8(__m256 m)
constexpr explicit GSVector8(__m256 m)
: m(m)
{
this->m = m;
}
#if _M_SSE >= 0x501

View File

@ -23,10 +23,57 @@
class alignas(32) GSVector8i
{
static GSVector8i m_xff[33];
static GSVector8i m_x0f[33];
static const GSVector8i m_xff[33];
static const GSVector8i m_x0f[33];
public:
// Packs eight 32-bit values into a __m256i usable in constant expressions.
// Arguments are given in element order: x0 lands in element 0, w1 in element 7.
constexpr static __m256i cxpr_setr_epi32(uint32 x0, uint32 y0, uint32 z0, uint32 w0, uint32 x1, uint32 y1, uint32 z1, uint32 w1)
{
#ifdef __GNUC__
	// Build the value from a vector literal and reinterpret it as __m256i.
	return (__m256i)__v8su{
		x0, y0, z0, w0,
		x1, y1, z1, w1};
#else
	// Start from a zeroed value and fill the 32-bit elements one by one.
	__m256i packed = {};
	packed.m256i_u32[0] = x0;
	packed.m256i_u32[1] = y0;
	packed.m256i_u32[2] = z0;
	packed.m256i_u32[3] = w0;
	packed.m256i_u32[4] = x1;
	packed.m256i_u32[5] = y1;
	packed.m256i_u32[6] = z1;
	packed.m256i_u32[7] = w1;
	return packed;
#endif
}
// Replicates a single 32-bit value into all eight elements of a __m256i.
constexpr static __m256i cxpr_set1_epi32(uint32 x)
{
	const uint32 lane = x;
	return cxpr_setr_epi32(lane, lane, lane, lane, lane, lane, lane, lane);
}
// Packs thirty-two bytes into a __m256i usable in constant expressions.
// Arguments are given in element order: b0 lands in byte 0, b31 in byte 31.
constexpr static __m256i cxpr_setr_epi8(
	uint8 b0, uint8 b1, uint8 b2, uint8 b3, uint8 b4, uint8 b5, uint8 b6, uint8 b7,
	uint8 b8, uint8 b9, uint8 b10, uint8 b11, uint8 b12, uint8 b13, uint8 b14, uint8 b15,
	uint8 b16, uint8 b17, uint8 b18, uint8 b19, uint8 b20, uint8 b21, uint8 b22, uint8 b23,
	uint8 b24, uint8 b25, uint8 b26, uint8 b27, uint8 b28, uint8 b29, uint8 b30, uint8 b31)
{
#ifdef __GNUC__
	// Build the value from a vector literal and reinterpret it as __m256i.
	return (__m256i)__v32qu{
		b0, b1, b2, b3, b4, b5, b6, b7,
		b8, b9, b10, b11, b12, b13, b14, b15,
		b16, b17, b18, b19, b20, b21, b22, b23,
		b24, b25, b26, b27, b28, b29, b30, b31,
	};
#else
	// Start from a zeroed value and fill the byte elements one by one.
	__m256i packed = {};
	packed.m256i_u8[0] = b0;
	packed.m256i_u8[1] = b1;
	packed.m256i_u8[2] = b2;
	packed.m256i_u8[3] = b3;
	packed.m256i_u8[4] = b4;
	packed.m256i_u8[5] = b5;
	packed.m256i_u8[6] = b6;
	packed.m256i_u8[7] = b7;
	packed.m256i_u8[8] = b8;
	packed.m256i_u8[9] = b9;
	packed.m256i_u8[10] = b10;
	packed.m256i_u8[11] = b11;
	packed.m256i_u8[12] = b12;
	packed.m256i_u8[13] = b13;
	packed.m256i_u8[14] = b14;
	packed.m256i_u8[15] = b15;
	packed.m256i_u8[16] = b16;
	packed.m256i_u8[17] = b17;
	packed.m256i_u8[18] = b18;
	packed.m256i_u8[19] = b19;
	packed.m256i_u8[20] = b20;
	packed.m256i_u8[21] = b21;
	packed.m256i_u8[22] = b22;
	packed.m256i_u8[23] = b23;
	packed.m256i_u8[24] = b24;
	packed.m256i_u8[25] = b25;
	packed.m256i_u8[26] = b26;
	packed.m256i_u8[27] = b27;
	packed.m256i_u8[28] = b28;
	packed.m256i_u8[29] = b29;
	packed.m256i_u8[30] = b30;
	packed.m256i_u8[31] = b31;
	return packed;
#endif
}
union
{
struct {int x0, y0, z0, w0, x1, y1, z1, w1;};
@ -45,9 +92,7 @@ public:
__m128i m0, m1;
};
static void InitVectors();
__forceinline GSVector8i() {}
GSVector8i() = default;
__forceinline explicit GSVector8i(const GSVector8& v, bool truncate = true);
@ -55,9 +100,9 @@ public:
__forceinline static GSVector8i cast(const GSVector4& v);
__forceinline static GSVector8i cast(const GSVector4i& v);
__forceinline GSVector8i(int x0, int y0, int z0, int w0, int x1, int y1, int z1, int w1)
constexpr GSVector8i(int x0, int y0, int z0, int w0, int x1, int y1, int z1, int w1)
: m(cxpr_setr_epi32(x0, y0, z0, w0, x1, y1, z1, w1))
{
m = _mm256_set_epi32(w1, z1, y1, x1, w0, z0, y0, x0);
}
__forceinline GSVector8i(
@ -67,16 +112,15 @@ public:
m = _mm256_set_epi16(s15, s14, s13, s12, s11, s10, s9, s8, s7, s6, s5, s4, s3, s2, s1, s0);
}
__forceinline GSVector8i(
constexpr GSVector8i(
char b0, char b1, char b2, char b3, char b4, char b5, char b6, char b7,
char b8, char b9, char b10, char b11, char b12, char b13, char b14, char b15,
char b16, char b17, char b18, char b19, char b20, char b21, char b22, char b23,
char b24, char b25, char b26, char b27, char b28, char b29, char b30, char b31
)
char b24, char b25, char b26, char b27, char b28, char b29, char b30, char b31)
: m(cxpr_setr_epi8(
b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15,
b16, b17, b18, b19, b20, b21, b22, b23, b24, b25, b26, b27, b28, b29, b30, b31))
{
m = _mm256_set_epi8(
b31, b30, b29, b28, b27, b26, b25, b24, b23, b22, b21, b20, b19, b18, b17, b16,
b15, b14, b13, b12, b11, b10, b9, b8, b7, b6, b5, b4, b3, b2, b1, b0);
}
__forceinline GSVector8i(__m128i m0, __m128i m1)
@ -92,10 +136,7 @@ public:
#endif
}
__forceinline GSVector8i(const GSVector8i& v)
{
m = v.m;
}
GSVector8i(const GSVector8i& v) = default;
__forceinline explicit GSVector8i(int i)
{
@ -107,9 +148,9 @@ public:
*this = m;
}
__forceinline explicit GSVector8i(__m256i m)
constexpr explicit GSVector8i(__m256i m)
: m(m)
{
this->m = m;
}
__forceinline void operator = (const GSVector8i& v)

View File

@ -24,12 +24,7 @@
#include "GSUtil.h"
#include "GSState.h"
GSVector4 GSVertexTrace::s_minmax;
void GSVertexTrace::InitVectors()
{
s_minmax = GSVector4(FLT_MAX, -FLT_MAX);
}
const GSVector4 GSVertexTrace::s_minmax(FLT_MAX, -FLT_MAX, 0.f, 0.f);
GSVertexTrace::GSVertexTrace(const GSState* state)
: m_accurate_stq(false), m_state(state), m_primclass(GS_INVALID_CLASS)

View File

@ -41,7 +41,7 @@ public:
protected:
const GSState* m_state;
static GSVector4 s_minmax;
static const GSVector4 s_minmax;
typedef void (GSVertexTrace::*FindMinMaxPtr)(const void* vertex, const uint32* index, int count);
@ -72,8 +72,6 @@ public:
GSVector2 m_lod; // x = min, y = max
public:
static void InitVectors();
GSVertexTrace(const GSState* state);
virtual ~GSVertexTrace() {}

View File

@ -26,20 +26,11 @@
static FILE* s_fp = LOG ? fopen("c:\\temp1\\_.txt", "w") : NULL;
GSVector4 GSRendererSW::m_pos_scale;
const GSVector4 GSRendererSW::m_pos_scale(1.0f / 16, 1.0f / 16, 1.0f, 128.0f);
#if _M_SSE >= 0x501
GSVector8 GSRendererSW::m_pos_scale2;
const GSVector8 GSRendererSW::m_pos_scale2(1.0f / 16, 1.0f / 16, 1.0f, 128.0f, 1.0f / 16, 1.0f / 16, 1.0f, 128.0f);
#endif
void GSRendererSW::InitVectors()
{
m_pos_scale = GSVector4(1.0f / 16, 1.0f / 16, 1.0f, 128.0f);
#if _M_SSE >= 0x501
m_pos_scale2 = GSVector8(1.0f / 16, 1.0f / 16, 1.0f, 128.0f, 1.0f / 16, 1.0f / 16, 1.0f, 128.0f);
#endif
}
GSRendererSW::GSRendererSW(int threads)
: m_fzb(NULL)
{

View File

@ -26,9 +26,9 @@
class GSRendererSW : public GSRenderer
{
static GSVector4 m_pos_scale;
static const GSVector4 m_pos_scale;
#if _M_SSE >= 0x501
static GSVector8 m_pos_scale2;
static const GSVector8 m_pos_scale2;
#endif
class SharedData : public GSDrawScanline::SharedData
@ -100,8 +100,6 @@ protected:
bool GetScanlineGlobalData(SharedData* data);
public:
static void InitVectors();
GSRendererSW(int threads);
virtual ~GSRendererSW();
};