GSdx-ogl: LINUX only. sync from trunk (5068:5090)

git-svn-id: http://pcsx2.googlecode.com/svn/branches/gsdx-ogl@5091 96395faa-99c1-11dd-bbfe-3dabce05a288
This commit is contained in:
gregory.hainaut 2012-02-09 21:40:39 +00:00
commit 4a00648d9f
68 changed files with 3717 additions and 4212 deletions

View File

@ -57,7 +57,11 @@ include(SelectPcsx2Plugins)
# add additional project-wide include directories
include_directories(${PROJECT_SOURCE_DIR}/common/include
${PROJECT_SOURCE_DIR}/common/include/Utilities
${PROJECT_SOURCE_DIR}/common/include/x86emitter)
${PROJECT_SOURCE_DIR}/common/include/x86emitter
# WORKAROUND Some issue with multiarch on Debian/Ubuntu
/usr/include/i386-linux-gnu
/usr/include/x86_64-linux-gnu
)
# make the translation
if(EXISTS "${PROJECT_SOURCE_DIR}/locales")

View File

@ -5649,6 +5649,7 @@ Serial = SLUS-20911
Name = Shin Megami Tensei - Nocturne
Region = NTSC-U
Compat = 5
eeRoundMode = 0 // Ladder glitch in "Assembly of Nihilo B11" level
---------------------------------------------
Serial = SLUS-20912
Name = Superbikes TT
@ -10338,6 +10339,7 @@ Region = NTSC-U
Serial = SLUS-28045
Name = Shin Megami Tensei - Nocturne [Trade Demo]
Region = NTSC-U
eeRoundMode = 0 // Ladder glitch in "Assembly of Nihilo B11" level
---------------------------------------------
Serial = SLUS-28046
Name = Guilty Gear Isuka [Trade Demo]
@ -13611,6 +13613,7 @@ Region = NTSC-K
Serial = SLKA-25160
Name = Shin Megami Tensei III - Nocturne Maniax
Region = NTSC-K
eeRoundMode = 0 // Ladder glitch in "Assembly of Nihilo B11" level
---------------------------------------------
Serial = SLKA-25165
Name = Mobile Suit Gundam - Seed Destiny - Rengou vs. Z.A.F.T. II Plus
@ -17250,10 +17253,12 @@ Region = NTSC-J
Serial = SLPM-65241
Name = Shin Megami Tensei 3 - Nocturne [Limited Edition]
Region = NTSC-J
eeRoundMode = 0 // Ladder glitch in "Assembly of Nihilo B11" level
---------------------------------------------
Serial = SLPM-65242
Name = Shin Megami Tensei 3 - Nocturne
Region = NTSC-J
eeRoundMode = 0 // Ladder glitch in "Assembly of Nihilo B11" level
---------------------------------------------
Serial = SLPM-65243
Name = Densha de Go! Professional 2
@ -18019,11 +18024,13 @@ Region = NTSC-J
Serial = SLPM-65461
Name = Shin Megami Tensei 3 - Nocturne - Maniacs
Region = NTSC-J
eeRoundMode = 0 // Ladder glitch in "Assembly of Nihilo B11" level
---------------------------------------------
Serial = SLPM-65462
Name = Shin Megami Tensei 3 - Nocturne - Maniacs
Region = NTSC-J
Compat = 5
eeRoundMode = 0 // Ladder glitch in "Assembly of Nihilo B11" level
---------------------------------------------
Serial = SLPM-65463
Name = Rocky
@ -23700,6 +23707,7 @@ Region = NTSC-J
Serial = SLPM-74205
Name = Shin Megami Tensei III - Nocturne [PlayStation 2 The Best]
Region = NTSC-J
eeRoundMode = 0 // Ladder glitch in "Assembly of Nihilo B11" level
---------------------------------------------
Serial = SLPM-74206
Name = Onimusha [PlayStation 2 The Best]

View File

@ -169,6 +169,11 @@ static wxLanguage i18n_FallbackToAnotherLang( wxLanguage wxLangId )
case wxLANGUAGE_CHINESE_SINGAPORE : return wxLANGUAGE_CHINESE_SIMPLIFIED;
case wxLANGUAGE_SAMI :
// The correct fallback for Sami would be
// however, currently wxWidgets (2.9.3) only supports wxLANGUAGE_SAMI.
// case: wxLANGUAGE_SAMI_LULE_SWEDEN :
// case: wxLANGUAGE_SAMI_NORTHERN_SWEDEN :
// case: wxLANGUAGE_SAMI_SOUTHERN_SWEDEN :
case wxLANGUAGE_SWEDISH_FINLAND : return wxLANGUAGE_SWEDISH;
case wxLANGUAGE_PORTUGUESE : return wxLANGUAGE_PORTUGUESE_BRAZILIAN;
@ -178,8 +183,30 @@ static wxLanguage i18n_FallbackToAnotherLang( wxLanguage wxLangId )
case wxLANGUAGE_GERMAN_BELGIUM :
case wxLANGUAGE_GERMAN_LIECHTENSTEIN :
case wxLANGUAGE_GERMAN_LUXEMBOURG :
// Currently wxWidgets (2.9.3) doesn't support Sorbian.
// case wxLANGUAGE_LOWER_SORBIAN :
// case wxLANGUAGE_UPPER_SORBIAN :
case wxLANGUAGE_GERMAN_SWISS : return wxLANGUAGE_GERMAN;
case wxLANGUAGE_SPANISH_ARGENTINA:
case wxLANGUAGE_SPANISH_BOLIVIA:
case wxLANGUAGE_SPANISH_CHILE:
case wxLANGUAGE_SPANISH_COLOMBIA:
case wxLANGUAGE_SPANISH_COSTA_RICA:
case wxLANGUAGE_SPANISH_DOMINICAN_REPUBLIC:
case wxLANGUAGE_SPANISH_ECUADOR:
case wxLANGUAGE_SPANISH_EL_SALVADOR:
case wxLANGUAGE_SPANISH_GUATEMALA:
case wxLANGUAGE_SPANISH_HONDURAS:
case wxLANGUAGE_SPANISH_MEXICAN:
case wxLANGUAGE_SPANISH_NICARAGUA:
case wxLANGUAGE_SPANISH_PANAMA:
case wxLANGUAGE_SPANISH_PARAGUAY:
case wxLANGUAGE_SPANISH_PERU:
case wxLANGUAGE_SPANISH_PUERTO_RICO:
case wxLANGUAGE_SPANISH_URUGUAY:
case wxLANGUAGE_SPANISH_VENEZUELA: return wxLANGUAGE_SPANISH_MODERN;
case wxLANGUAGE_ITALIAN_SWISS : return wxLANGUAGE_ITALIAN;
default : break;

View File

@ -107,10 +107,6 @@ set(GSdxSources
GSUtil.cpp
GSVector.cpp
GSVertexTrace.cpp
GSVertexTrace.x64.avx.cpp
GSVertexTrace.x86.cpp
GSVertexTrace.x86.avx.cpp
GSVertexTrace.x64.cpp
GSWnd.cpp
GSdx.cpp
stdafx.cpp

View File

@ -28,8 +28,8 @@ const GSVector4i GPULocalMemory::m_xxbx(0x00007c00);
const GSVector4i GPULocalMemory::m_xgxx(0x000003e0);
const GSVector4i GPULocalMemory::m_rxxx(0x0000001f);
#define VM_SIZE ((1 << (12 + 11)) * sizeof(uint16))
#define VM_ALLOC_SIZE (VM_SIZE * 2)
#define VM_REAL_SIZE ((1 << (12 + 11)) * sizeof(uint16))
#define VM_ALLOC_SIZE (VM_REAL_SIZE * 2)
#define TEX_ALLOC_SIZE (256 * 256 * (1 + 1 + 4) * 32)
GPULocalMemory::GPULocalMemory()
@ -39,7 +39,7 @@ GPULocalMemory::GPULocalMemory()
//
int size = VM_SIZE;
int size = VM_REAL_SIZE;
m_vm = (uint16*)vmalloc(VM_ALLOC_SIZE, false);

View File

@ -214,7 +214,7 @@ static int _GSopen(void** dsp, char* title, int renderer, int threads = -1)
s_gs = NULL;
}
if(renderer == 12)
if(renderer == 15)
{
#ifdef _WINDOWS
@ -225,12 +225,11 @@ static int _GSopen(void** dsp, char* title, int renderer, int threads = -1)
return -1;
}
if(s_gs == NULL)
{
s_gs = new GSRendererCS();
delete s_gs;
s_renderer = renderer;
}
s_gs = new GSRendererCS();
s_renderer = renderer;
#endif
}

View File

@ -90,6 +90,12 @@ enum GIF_REG
GIF_REG_NOP = 0x0f,
};
// Identifiers for composite GIF register groups.
// NOTE(review): the names suggest an ST + RGBAQ + XYZF2 group (0x00) and an
// ST + RGBAQ + XYZ2 group (0x01) — confirm against the code that consumes them.
enum GIF_REG_COMPLEX
{
GIF_REG_STQRGBAXYZF2 = 0x00,
GIF_REG_STQRGBAXYZ2 = 0x01,
};
enum GIF_A_D_REG
{
GIF_A_D_REG_PRIM = 0x00,
@ -821,7 +827,16 @@ union
};
};
REG_END2
__forceinline bool IsRepeating() {return (1 << TW) > (int)(TBW << 6) || (PSM == PSM_PSMT8 || PSM == PSM_PSMT4) && TBW == 1;}
__forceinline bool IsRepeating()
{
	// Decides, from the TBW / TW / TH / PSM fields, whether the texture is
	// treated as "repeating".
	// NOTE(review): presumably TBW is the buffer width in units of 64 texels
	// and TW / TH are log2 texture dimensions — confirm against the register
	// definition.

	// With TBW of 0 or 1 the two indexed formats use fixed size thresholds
	// instead of the generic width comparison.
	if(TBW < 2 && PSM == PSM_PSMT8)
	{
		return TW > 7 || TH > 6;
	}

	if(TBW < 2 && PSM == PSM_PSMT4)
	{
		return TW > 7 || TH > 7;
	}

	// Generic case: true when (1 << TW) exceeds TBW * 64.
	return (1u << TW) > (TBW << 6);
}
REG_END2
REG64_(GIFReg, TEX1)
@ -1090,21 +1105,77 @@ REG_SET_END
__aligned(struct, 32) GIFPath
{
GIFTag tag;
uint32 reg;
uint32 nreg;
uint32 nloop;
uint32 adonly;
uint32 nreg;
uint32 reg;
uint32 type;
GSVector4i regs;
void SetTag(const void* mem)
enum {TYPE_UNKNOWN, TYPE_ADONLY, TYPE_STQRGBAXYZF2, TYPE_STQRGBAXYZ2};
__forceinline void SetTag(const void* mem)
{
GSVector4i v = GSVector4i::load<false>(mem);
GSVector4i::store<true>(&tag, v);
const GIFTag* RESTRICT src = (const GIFTag*)mem;
// the compiler has a hard time not reloading every time a field of src is accessed
uint32 a = src->u32[0];
uint32 b = src->u32[1];
tag.u32[0] = a;
tag.u32[1] = b;
nloop = a & 0x7fff;
if(nloop == 0) return;
GSVector4i v = GSVector4i::loadl(&src->REGS); // REGS not stored to tag.REGS, only into this->regs, restored before saving the state though
nreg = (b & 0xf0000000) ? (b >> 28) : 16; // src->NREG
regs = v.upl8(v >> 4) & GSVector4i::x0f(nreg);
reg = 0;
regs = v.uph8(v >> 4) & 0x0f0f0f0f;
nreg = tag.NREG ? tag.NREG : 16;
nloop = tag.NLOOP;
adonly = regs.eq8(GSVector4i(0x0e0e0e0e)).mask() == (1 << nreg) - 1;
type = TYPE_UNKNOWN;
if(tag.FLG == GIF_FLG_PACKED)
{
if(regs.eq8(GSVector4i(0x0e0e0e0e)).mask() == (1 << nreg) - 1)
{
type = TYPE_ADONLY;
}
else
{
switch(nreg)
{
case 1: break;
case 2: break;
case 3:
if(regs.u32[0] == 0x00040102) type = TYPE_STQRGBAXYZF2; // many games, TODO: formats mixed with NOPs (xeno2: 040f010f02, 04010f020f, mgs3: 04010f0f02, 0401020f0f, 04010f020f)
if(regs.u32[0] == 0x00050102) type = TYPE_STQRGBAXYZ2; // GoW (has other crazy formats, like ...030503050103)
// TODO: common types with UV instead
break;
case 4: break;
case 5: break;
case 6: break;
case 7: break;
case 8: break;
case 9:
if(regs.u32[0] == 0x02040102 && regs.u32[1] == 0x01020401 && regs.u32[2] == 0x00000004) {type = TYPE_STQRGBAXYZF2; nreg = 3; nloop *= 3;} // ffx
break;
case 10: break;
case 11: break;
case 12:
if(regs.u32[0] == 0x02040102 && regs.u32[1] == 0x01020401 && regs.u32[2] == 0x04010204) {type = TYPE_STQRGBAXYZF2; nreg = 3; nloop *= 4;} // dq8 (not many, mostly 040102)
break;
case 13: break;
case 14: break;
case 15: break;
case 16: break;
default:
__assume(0);
}
}
}
}
__forceinline uint8 GetReg()

View File

@ -884,7 +884,7 @@ public:
}
}
static void ExpandBlock16(const uint16* RESTRICT src, uint8* RESTRICT dst, int dstpitch, const GIFRegTEXA& TEXA) // do not inline, uses too many xmm regs
template<bool AEM> static void ExpandBlock16(const uint16* RESTRICT src, uint8* RESTRICT dst, int dstpitch, const GIFRegTEXA& TEXA) // do not inline, uses too many xmm regs
{
const GSVector4i* s = (const GSVector4i*)src;
@ -895,44 +895,36 @@ public:
GSVector4i bm = m_xxbx;
GSVector4i l, h;
if(TEXA.AEM)
for(int i = 0; i < 8; i++, dst += dstpitch)
{
for(int i = 0; i < 8; i++, dst += dstpitch)
GSVector4i v0 = s[i * 2 + 0];
l = v0.upl16(v0);
h = v0.uph16(v0);
if(AEM)
{
GSVector4i v0 = s[i * 2 + 0];
l = v0.upl16(v0);
h = v0.uph16(v0);
((GSVector4i*)dst)[0] = ((l & rm) << 3) | ((l & gm) << 6) | ((l & bm) << 9) | TA0.blend8(TA1, l.sra16(15)).andnot(l == GSVector4i::zero());
((GSVector4i*)dst)[1] = ((h & rm) << 3) | ((h & gm) << 6) | ((h & bm) << 9) | TA0.blend8(TA1, h.sra16(15)).andnot(h == GSVector4i::zero());
}
else
{
((GSVector4i*)dst)[0] = ((l & rm) << 3) | ((l & gm) << 6) | ((l & bm) << 9) | TA0.blend(TA1, l.sra16(15));
((GSVector4i*)dst)[1] = ((h & rm) << 3) | ((h & gm) << 6) | ((h & bm) << 9) | TA0.blend(TA1, h.sra16(15));
}
GSVector4i v1 = s[i * 2 + 1];
GSVector4i v1 = s[i * 2 + 1];
l = v1.upl16(v1);
h = v1.uph16(v1);
l = v1.upl16(v1);
h = v1.uph16(v1);
if(AEM)
{
((GSVector4i*)dst)[2] = ((l & rm) << 3) | ((l & gm) << 6) | ((l & bm) << 9) | TA0.blend8(TA1, l.sra16(15)).andnot(l == GSVector4i::zero());
((GSVector4i*)dst)[3] = ((h & rm) << 3) | ((h & gm) << 6) | ((h & bm) << 9) | TA0.blend8(TA1, h.sra16(15)).andnot(h == GSVector4i::zero());
}
}
else
{
for(int i = 0; i < 8; i++, dst += dstpitch)
else
{
GSVector4i v0 = s[i * 2 + 0];
l = v0.upl16(v0);
h = v0.uph16(v0);
((GSVector4i*)dst)[0] = ((l & rm) << 3) | ((l & gm) << 6) | ((l & bm) << 9) | TA0.blend(TA1, l.sra16(15));
((GSVector4i*)dst)[1] = ((h & rm) << 3) | ((h & gm) << 6) | ((h & bm) << 9) | TA0.blend(TA1, h.sra16(15));
GSVector4i v1 = s[i * 2 + 1];
l = v1.upl16(v1);
h = v1.uph16(v1);
((GSVector4i*)dst)[2] = ((l & rm) << 3) | ((l & gm) << 6) | ((l & bm) << 9) | TA0.blend(TA1, l.sra16(15));
((GSVector4i*)dst)[3] = ((h & rm) << 3) | ((h & gm) << 6) | ((h & bm) << 9) | TA0.blend(TA1, h.sra16(15));
}
@ -1432,6 +1424,56 @@ public:
}
}
}
template<bool AEM> __forceinline static GSVector4i Expand16to32(const GSVector4i& c, const GSVector4i& TA0, const GSVector4i& TA1)
{
	// Builds a 32-bit-per-pixel vector from c:
	//  - the bits selected by m_rxxx / m_xgxx / m_xxbx are shifted left by
	//    3 / 6 / 9 respectively and merged,
	//  - the remaining part is picked from TA0 or TA1 using c.sra16(15) as
	//    the selector.
	// When AEM is set, lanes where c compares equal to zero get that TA
	// contribution cleared (andnot with the c == 0 mask).
	// NOTE(review): presumably c holds 16-bit pixels duplicated into 32-bit
	// lanes by the caller (upl16/uph16) — confirm.

	GSVector4i rgb = ((c & m_rxxx) << 3) | ((c & m_xgxx) << 6) | ((c & m_xxbx) << 9);

	GSVector4i sel = c.sra16(15);

	GSVector4i a;

	if(AEM)
	{
		a = TA0.blend8(TA1, sel).andnot(c == GSVector4i::zero());
	}
	else
	{
		a = TA0.blend(TA1, sel);
	}

	return rgb | a;
}
// Reads one 16-bit block from src and writes it expanded to dst, honouring
// the AEM template flag and the TA0 / TA1 values in TEXA.
// Two implementations are present; only the #else branch is compiled.
template<bool AEM> __forceinline static void ReadAndExpandBlock16(const uint8* RESTRICT src, uint8* RESTRICT dst, int dstpitch, const GIFRegTEXA& TEXA)
{
// Disabled variant: swizzles the block in registers and expands it directly
// with Expand16to32, two destination rows per iteration. Kept for reference;
// the original author measured it as not faster.
#if 0 // not faster
const GSVector4i* s = (const GSVector4i*)src;
GSVector4i TA0(TEXA.TA0 << 24);
GSVector4i TA1(TEXA.TA1 << 24);
for(int i = 0; i < 4; i++, dst += dstpitch * 2)
{
GSVector4i v0 = s[i * 4 + 0];
GSVector4i v1 = s[i * 4 + 1];
GSVector4i v2 = s[i * 4 + 2];
GSVector4i v3 = s[i * 4 + 3];
GSVector4i::sw16(v0, v1, v2, v3);
GSVector4i::sw32(v0, v1, v2, v3);
GSVector4i::sw16(v0, v2, v1, v3);
GSVector4i* d0 = (GSVector4i*)&dst[dstpitch * 0];
d0[0] = Expand16to32<AEM>(v0.upl16(v0), TA0, TA1);
d0[1] = Expand16to32<AEM>(v0.uph16(v0), TA0, TA1);
d0[2] = Expand16to32<AEM>(v1.upl16(v1), TA0, TA1);
d0[3] = Expand16to32<AEM>(v1.uph16(v1), TA0, TA1);
GSVector4i* d1 = (GSVector4i*)&dst[dstpitch * 1];
d1[0] = Expand16to32<AEM>(v2.upl16(v2), TA0, TA1);
d1[1] = Expand16to32<AEM>(v2.uph16(v2), TA0, TA1);
d1[2] = Expand16to32<AEM>(v3.upl16(v3), TA0, TA1);
d1[3] = Expand16to32<AEM>(v3.uph16(v3), TA0, TA1);
}
#else
// Active variant: read into an aligned temporary of 16 * 8 uint16 values,
// then expand from that temporary into dst.
__aligned(uint16, 32) block[16 * 8];
// The pitch passed is sizeof(block) / 8, i.e. the byte size of one of 8 rows.
ReadBlock16<true>(src, (uint8*)block, sizeof(block) / 8);
ExpandBlock16<AEM>(block, dst, dstpitch, TEXA);
#endif
}
__forceinline static void ReadAndExpandBlock8_32(const uint8* RESTRICT src, uint8* RESTRICT dst, int dstpitch, const uint32* RESTRICT pal)
{

View File

@ -389,6 +389,8 @@ void GSClut::GetAlphaMinMax32(int& amin, int& amax)
void GSClut::WriteCLUT_T32_I8_CSM1(const uint32* RESTRICT src, uint16* RESTRICT clut)
{
// 4 blocks
for(int i = 0; i < 64; i += 16)
{
WriteCLUT_T32_I4_CSM1(&src[i + 0], &clut[i * 2 + 0]);
@ -400,6 +402,8 @@ void GSClut::WriteCLUT_T32_I8_CSM1(const uint32* RESTRICT src, uint16* RESTRICT
__forceinline void GSClut::WriteCLUT_T32_I4_CSM1(const uint32* RESTRICT src, uint16* RESTRICT clut)
{
// 1 block
GSVector4i* s = (GSVector4i*)src;
GSVector4i* d = (GSVector4i*)clut;
@ -420,6 +424,8 @@ __forceinline void GSClut::WriteCLUT_T32_I4_CSM1(const uint32* RESTRICT src, uin
void GSClut::WriteCLUT_T16_I8_CSM1(const uint16* RESTRICT src, uint16* RESTRICT clut)
{
// 2 blocks
GSVector4i* s = (GSVector4i*)src;
GSVector4i* d = (GSVector4i*)clut;
@ -443,6 +449,8 @@ void GSClut::WriteCLUT_T16_I8_CSM1(const uint16* RESTRICT src, uint16* RESTRICT
__forceinline void GSClut::WriteCLUT_T16_I4_CSM1(const uint16* RESTRICT src, uint16* RESTRICT clut)
{
// 1 block (half)
for(int i = 0; i < 16; i++)
{
clut[i] = src[clutTableT16I4[i]];

View File

@ -103,6 +103,7 @@ public:
virtual void BeginScene() {}
virtual void DrawPrimitive() {};
virtual void DrawIndexedPrimitive() {}
virtual void DrawIndexedPrimitive(int offset, int count) {}
virtual void EndScene();
virtual void ClearRenderTarget(GSTexture* t, const GSVector4& c) {}

View File

@ -98,8 +98,6 @@ bool GSDevice11::Create(GSWnd* wnd)
hr = D3D11CreateDeviceAndSwapChain(NULL, D3D_DRIVER_TYPE_HARDWARE, NULL, flags, levels, countof(levels), D3D11_SDK_VERSION, &scd, &m_swapchain, &m_dev, &level, &m_ctx);
// hr = D3D11CreateDeviceAndSwapChain(NULL, D3D_DRIVER_TYPE_REFERENCE, NULL, flags, NULL, 0, D3D11_SDK_VERSION, &scd, &m_swapchain, &m_dev, &level, &m_ctx);
//return false;
if(FAILED(hr)) return false;
if(!SetFeatureLevel(level, true))
@ -360,6 +358,13 @@ void GSDevice11::DrawIndexedPrimitive()
m_ctx->DrawIndexed(m_index.count, m_index.start, m_vertex.start);
}
void GSDevice11::DrawIndexedPrimitive(int offset, int count)
{
	// Draws `count` indices starting `offset` entries past m_index.start,
	// using m_vertex.start as the base vertex.

	// The requested sub-range must not run past the index count recorded in m_index.
	ASSERT(m_index.count >= offset + count);

	m_ctx->DrawIndexed(count, m_index.start + offset, m_vertex.start);
}
void GSDevice11::Dispatch(uint32 x, uint32 y, uint32 z)
{
m_ctx->Dispatch(x, y, z);
@ -720,6 +725,18 @@ void GSDevice11::SetupDATE(GSTexture* rt, GSTexture* ds, const GSVertexPT1* vert
}
void GSDevice11::IASetVertexBuffer(const void* vertex, size_t stride, size_t count)
{
	// Convenience wrapper: map the vertex buffer, copy count * stride bytes
	// from `vertex` into it, then unmap. Does nothing if the map fails.

	void* dst = NULL;

	if(!IAMapVertexBuffer(&dst, stride, count))
	{
		return;
	}

	GSVector4i::storent(dst, vertex, count * stride);

	IAUnmapVertexBuffer();
}
bool GSDevice11::IAMapVertexBuffer(void** vertex, size_t stride, size_t count)
{
ASSERT(m_vertex.count == 0);
@ -729,7 +746,6 @@ void GSDevice11::IASetVertexBuffer(const void* vertex, size_t stride, size_t cou
m_vb = NULL;
m_vertex.start = 0;
m_vertex.count = 0;
m_vertex.limit = std::max<int>(count * 3 / 2, 11000);
}
@ -748,7 +764,7 @@ void GSDevice11::IASetVertexBuffer(const void* vertex, size_t stride, size_t cou
hr = m_dev->CreateBuffer(&bd, NULL, &m_vb);
if(FAILED(hr)) return;
if(FAILED(hr)) return false;
}
D3D11_MAP type = D3D11_MAP_WRITE_NO_OVERWRITE;
@ -762,17 +778,24 @@ void GSDevice11::IASetVertexBuffer(const void* vertex, size_t stride, size_t cou
D3D11_MAPPED_SUBRESOURCE m;
if(SUCCEEDED(m_ctx->Map(m_vb, 0, type, 0, &m)))
if(FAILED(m_ctx->Map(m_vb, 0, type, 0, &m)))
{
GSVector4i::storent((uint8*)m.pData + m_vertex.start * stride, vertex, count * stride);
m_ctx->Unmap(m_vb, 0);
return false;
}
*vertex = (uint8*)m.pData + m_vertex.start * stride;
m_vertex.count = count;
m_vertex.stride = stride;
IASetVertexBuffer(m_vb, stride);
return true;
}
// Unmaps m_vb and then binds it through IASetVertexBuffer, using the stride
// stored in m_vertex.stride.
void GSDevice11::IAUnmapVertexBuffer()
{
m_ctx->Unmap(m_vb, 0);
IASetVertexBuffer(m_vb, m_vertex.stride);
}
void GSDevice11::IASetVertexBuffer(ID3D11Buffer* vb, size_t stride)
@ -798,7 +821,7 @@ void GSDevice11::IASetIndexBuffer(const void* index, size_t count)
m_ib_old = m_ib;
m_ib = NULL;
m_index.count = 0;
m_index.start = 0;
m_index.limit = std::max<int>(count * 3 / 2, 11000);
}
@ -904,7 +927,11 @@ void GSDevice11::PSSetShaderResources(GSTexture* sr0, GSTexture* sr1)
{
PSSetShaderResource(0, sr0);
PSSetShaderResource(1, sr1);
PSSetShaderResource(2, NULL);
for(int i = 2; i < countof(m_state.ps_srv); i++)
{
PSSetShaderResource(i, NULL);
}
}
void GSDevice11::PSSetShaderResource(int i, GSTexture* sr)
@ -913,6 +940,13 @@ void GSDevice11::PSSetShaderResource(int i, GSTexture* sr)
if(sr) srv = *(GSTexture11*)sr;
PSSetShaderResourceView(i, srv);
}
void GSDevice11::PSSetShaderResourceView(int i, ID3D11ShaderResourceView* srv)
{
ASSERT(i < countof(m_state.ps_srv));
if(m_state.ps_srv[i] != srv)
{
m_state.ps_srv[i] = srv;
@ -944,14 +978,14 @@ void GSDevice11::PSSetShader(ID3D11PixelShader* ps, ID3D11Buffer* ps_cb)
if(m_srv_changed)
{
m_ctx->PSSetShaderResources(0, 3, m_state.ps_srv);
m_ctx->PSSetShaderResources(0, countof(m_state.ps_srv), m_state.ps_srv);
m_srv_changed = false;
}
if(m_ss_changed)
{
m_ctx->PSSetSamplers(0, 3, m_state.ps_ss);
m_ctx->PSSetSamplers(0, countof(m_state.ps_ss), m_state.ps_ss);
m_ss_changed = false;
}
@ -966,9 +1000,9 @@ void GSDevice11::PSSetShader(ID3D11PixelShader* ps, ID3D11Buffer* ps_cb)
void GSDevice11::CSSetShaderSRV(int i, ID3D11ShaderResourceView* srv)
{
// TODO: if(m_state.cs_srv[i] != srv)
if(m_state.cs_srv[i] != srv)
{
// TODO: m_state.cs_srv[i] = srv;
m_state.cs_srv[i] = srv;
m_ctx->CSSetShaderResources(i, 1, &srv);
}
@ -976,17 +1010,14 @@ void GSDevice11::CSSetShaderSRV(int i, ID3D11ShaderResourceView* srv)
void GSDevice11::CSSetShaderUAV(int i, ID3D11UnorderedAccessView* uav)
{
// TODO: if(m_state.cs_uav[i] != uav)
{
// TODO: m_state.cs_uav[i] = uav;
uint32 counters[8];
memset(counters, 0, sizeof(counters));
// uint32 count[] = {-1};
m_ctx->CSSetUnorderedAccessViews(i, 1, &uav, NULL);
}
m_ctx->CSSetUnorderedAccessViews(i, 1, &uav, counters);
}
void GSDevice11::CSSetShader(ID3D11ComputeShader* cs)
void GSDevice11::CSSetShader(ID3D11ComputeShader* cs, ID3D11Buffer* cs_cb)
{
if(m_state.cs != cs)
{
@ -994,6 +1025,13 @@ void GSDevice11::CSSetShader(ID3D11ComputeShader* cs)
m_ctx->CSSetShader(cs, NULL, 0);
}
if(m_state.cs_cb != cs_cb)
{
m_state.cs_cb = cs_cb;
m_ctx->CSSetConstantBuffers(0, 1, &cs_cb);
}
}
void GSDevice11::OMSetDepthStencilState(ID3D11DepthStencilState* dss, uint8 sref)
@ -1064,6 +1102,41 @@ void GSDevice11::OMSetRenderTargets(GSTexture* rt, GSTexture* ds, const GSVector
}
}
void GSDevice11::OMSetRenderTargets(const GSVector2i& rtsize, int count, ID3D11UnorderedAccessView** uav, uint32* counters, const GSVector4i* scissor)
{
m_ctx->OMSetRenderTargetsAndUnorderedAccessViews(0, NULL, NULL, 0, count, uav, counters);
m_state.rtv = NULL;
m_state.dsv = NULL;
if(m_state.viewport != rtsize)
{
m_state.viewport = rtsize;
D3D11_VIEWPORT vp;
memset(&vp, 0, sizeof(vp));
vp.TopLeftX = 0;
vp.TopLeftY = 0;
vp.Width = (float)rtsize.x;
vp.Height = (float)rtsize.y;
vp.MinDepth = 0.0f;
vp.MaxDepth = 1.0f;
m_ctx->RSSetViewports(1, &vp);
}
GSVector4i r = scissor ? *scissor : GSVector4i(rtsize).zwxy();
if(!m_state.scissor.eq(r))
{
m_state.scissor = r;
m_ctx->RSSetScissorRects(1, r);
}
}
HRESULT GSDevice11::CompileShader(uint32 id, const char* entry, D3D11_SHADER_MACRO* macro, ID3D11VertexShader** vs, D3D11_INPUT_ELEMENT_DESC* layout, int count, ID3D11InputLayout** il)
{
HRESULT hr;
@ -1135,6 +1208,38 @@ HRESULT GSDevice11::CompileShader(uint32 id, const char* entry, D3D11_SHADER_MAC
return hr;
}
HRESULT GSDevice11::CompileShader(uint32 id, const char* entry, D3D11_SHADER_MACRO* macro, ID3D11GeometryShader** gs, D3D11_SO_DECLARATION_ENTRY* layout, int count)
{
	// Compiles the shader resource `id` (entry point `entry`) with the profile
	// in m_shader.gs and creates a geometry shader with stream output from it.
	// `layout` / `count` are the stream-output declaration entries. No buffer
	// strides are supplied and D3D11_SO_NO_RASTERIZED_STREAM is passed as the
	// rasterized stream.
	// Returns the HRESULT of the first failing step, or of the creation call.

	vector<D3D11_SHADER_MACRO> defines;

	PrepareShaderMacro(defines, macro);

	CComPtr<ID3D11Blob> bytecode, errors;

	HRESULT hr = D3DX11CompileFromResource(theApp.GetModuleHandle(), MAKEINTRESOURCE(id), NULL, &defines[0], NULL, entry, m_shader.gs.c_str(), 0, 0, NULL, &bytecode, &errors, NULL);

	// Compiler output is printed whether or not compilation succeeded.
	if(errors)
	{
		printf("%s\n", (const char*)errors->GetBufferPointer());
	}

	if(SUCCEEDED(hr))
	{
		hr = m_dev->CreateGeometryShaderWithStreamOutput((void*)bytecode->GetBufferPointer(), bytecode->GetBufferSize(), layout, count, NULL, 0, D3D11_SO_NO_RASTERIZED_STREAM, NULL, gs);
	}

	return hr;
}
HRESULT GSDevice11::CompileShader(uint32 id, const char* entry, D3D11_SHADER_MACRO* macro, ID3D11PixelShader** ps)
{
HRESULT hr;
@ -1177,7 +1282,7 @@ HRESULT GSDevice11::CompileShader(uint32 id, const char* entry, D3D11_SHADER_MAC
CComPtr<ID3D11Blob> shader, error;
hr = D3DX11CompileFromResource(theApp.GetModuleHandle(), MAKEINTRESOURCE(id), NULL, &m[0], NULL, entry, m_shader.ps.c_str(), 0, 0, NULL, &shader, &error, NULL);
hr = D3DX11CompileFromResource(theApp.GetModuleHandle(), MAKEINTRESOURCE(id), NULL, &m[0], NULL, entry, m_shader.cs.c_str(), 0, 0, NULL, &shader, &error, NULL);
if(error)
{

View File

@ -60,11 +60,13 @@ class GSDevice11 : public GSDeviceDX
ID3D11VertexShader* vs;
ID3D11Buffer* vs_cb;
ID3D11GeometryShader* gs;
ID3D11ShaderResourceView* ps_srv[3];
ID3D11ShaderResourceView* ps_srv[16];
ID3D11PixelShader* ps;
ID3D11Buffer* ps_cb;
ID3D11SamplerState* ps_ss[3];
ID3D11ShaderResourceView* cs_srv[16];
ID3D11ComputeShader* cs;
ID3D11Buffer* cs_cb;
GSVector2i viewport;
GSVector4i scissor;
ID3D11DepthStencilState* dss;
@ -146,6 +148,7 @@ public:
void DrawPrimitive();
void DrawIndexedPrimitive();
void DrawIndexedPrimitive(int offset, int count);
void Dispatch(uint32 x, uint32 y, uint32 z);
void ClearRenderTarget(GSTexture* t, const GSVector4& c);
@ -169,6 +172,8 @@ public:
void StretchRect(GSTexture* st, const GSVector4& sr, GSTexture* dt, const GSVector4& dr, ID3D11PixelShader* ps, ID3D11Buffer* ps_cb, ID3D11BlendState* bs, bool linear = true);
void IASetVertexBuffer(const void* vertex, size_t stride, size_t count);
bool IAMapVertexBuffer(void** vertex, size_t stride, size_t count);
void IAUnmapVertexBuffer();
void IASetVertexBuffer(ID3D11Buffer* vb, size_t stride);
void IASetIndexBuffer(const void* index, size_t count);
void IASetIndexBuffer(ID3D11Buffer* ib);
@ -178,16 +183,17 @@ public:
void GSSetShader(ID3D11GeometryShader* gs);
void PSSetShaderResources(GSTexture* sr0, GSTexture* sr1);
void PSSetShaderResource(int i, GSTexture* sr);
void PSSetShaderResourceView(int i, ID3D11ShaderResourceView* srv);
void PSSetShader(ID3D11PixelShader* ps, ID3D11Buffer* ps_cb);
void PSSetSamplerState(ID3D11SamplerState* ss0, ID3D11SamplerState* ss1, ID3D11SamplerState* ss2 = NULL);
void CSSetShaderSRV(int i, ID3D11ShaderResourceView* srv);
void CSSetShaderUAV(int i, ID3D11UnorderedAccessView* uav);
void CSSetShader(ID3D11ComputeShader* cs);
void CSSetShader(ID3D11ComputeShader* cs, ID3D11Buffer* cs_cb);
void OMSetDepthStencilState(ID3D11DepthStencilState* dss, uint8 sref);
void OMSetBlendState(ID3D11BlendState* bs, float bf);
void OMSetRenderTargets(GSTexture* rt, GSTexture* ds, const GSVector4i* scissor = NULL);
void OMSetRenderTargets(const GSVector2i& rtsize, int count, ID3D11UnorderedAccessView** uav, uint32* counters, const GSVector4i* scissor = NULL);
void SetupIA(const void* vertex, int vertex_count, const uint32* index, int index_count, int prim);
void SetupVS(VSSelector sel, const VSConstantBuffer* cb);
void SetupGS(GSSelector sel);
void SetupPS(PSSelector sel, const PSConstantBuffer* cb, PSSamplerSelector ssel);
@ -202,6 +208,7 @@ public:
HRESULT CompileShader(uint32 id, const char* entry, D3D11_SHADER_MACRO* macro, ID3D11VertexShader** vs, D3D11_INPUT_ELEMENT_DESC* layout, int count, ID3D11InputLayout** il);
HRESULT CompileShader(uint32 id, const char* entry, D3D11_SHADER_MACRO* macro, ID3D11GeometryShader** gs);
HRESULT CompileShader(uint32 id, const char* entry, D3D11_SHADER_MACRO* macro, ID3D11GeometryShader** gs, D3D11_SO_DECLARATION_ENTRY* layout, int count);
HRESULT CompileShader(uint32 id, const char* entry, D3D11_SHADER_MACRO* macro, ID3D11PixelShader** ps);
HRESULT CompileShader(uint32 id, const char* entry, D3D11_SHADER_MACRO* macro, ID3D11ComputeShader** cs);
HRESULT CompileShader(const char* fn, const char* entry, D3D11_SHADER_MACRO* macro, ID3D11ComputeShader** cs);

View File

@ -911,6 +911,18 @@ void GSDevice9::SetupDATE(GSTexture* rt, GSTexture* ds, const GSVertexPT1* verti
}
void GSDevice9::IASetVertexBuffer(const void* vertex, size_t stride, size_t count)
{
	// Convenience wrapper: map the vertex buffer, copy count * stride bytes
	// from `vertex` into it, then unmap. Does nothing if the map fails.

	void* dst = NULL;

	if(!IAMapVertexBuffer(&dst, stride, count))
	{
		return;
	}

	GSVector4i::storent(dst, vertex, count * stride);

	IAUnmapVertexBuffer();
}
bool GSDevice9::IAMapVertexBuffer(void** vertex, size_t stride, size_t count)
{
ASSERT(m_vertex.count == 0);
@ -930,7 +942,7 @@ void GSDevice9::IASetVertexBuffer(const void* vertex, size_t stride, size_t coun
hr = m_dev->CreateVertexBuffer(m_vertex.limit * stride, D3DUSAGE_DYNAMIC | D3DUSAGE_WRITEONLY, 0, D3DPOOL_DEFAULT, &m_vb, NULL);
if(FAILED(hr)) return;
if(FAILED(hr)) return false;
}
uint32 flags = D3DLOCK_NOOVERWRITE;
@ -942,19 +954,22 @@ void GSDevice9::IASetVertexBuffer(const void* vertex, size_t stride, size_t coun
flags = D3DLOCK_DISCARD;
}
void* ptr = NULL;
if(SUCCEEDED(m_vb->Lock(m_vertex.start * stride, count * stride, &ptr, flags)))
if(FAILED(m_vb->Lock(m_vertex.start * stride, count * stride, vertex, flags)))
{
GSVector4i::storent(ptr, vertex, count * stride);
m_vb->Unlock();
return false;
}
m_vertex.count = count;
m_vertex.stride = stride;
IASetVertexBuffer(m_vb, stride);
return true;
}
// Unlocks m_vb and then binds it through IASetVertexBuffer, using the stride
// stored in m_vertex.stride.
void GSDevice9::IAUnmapVertexBuffer()
{
m_vb->Unlock();
IASetVertexBuffer(m_vb, m_vertex.stride);
}
void GSDevice9::IASetVertexBuffer(IDirect3DVertexBuffer9* vb, size_t stride)

View File

@ -196,6 +196,8 @@ public:
void StretchRect(GSTexture* st, const GSVector4& sr, GSTexture* dt, const GSVector4& dr, IDirect3DPixelShader9* ps, const float* ps_cb, int ps_cb_len, Direct3DBlendState9* bs, bool linear = true);
void IASetVertexBuffer(const void* vertex, size_t stride, size_t count);
bool IAMapVertexBuffer(void** vertex, size_t stride, size_t count);
void IAUnmapVertexBuffer();
void IASetVertexBuffer(IDirect3DVertexBuffer9* vb, size_t stride);
void IASetIndexBuffer(const void* index, size_t count);
void IASetIndexBuffer(IDirect3DIndexBuffer9* ib);
@ -216,7 +218,6 @@ public:
HRESULT CompileShader(uint32 id, const string& entry, const D3DXMACRO* macro, IDirect3DVertexShader9** vs, const D3DVERTEXELEMENT9* layout, int count, IDirect3DVertexDeclaration9** il);
HRESULT CompileShader(uint32 id, const string& entry, const D3DXMACRO* macro, IDirect3DPixelShader9** ps);
void SetupIA(const void* vertex, int vertex_count, const uint32* index, int index_count, int prim);
void SetupVS(VSSelector sel, const VSConstantBuffer* cb);
void SetupGS(GSSelector sel) {}
void SetupPS(PSSelector sel, const PSConstantBuffer* cb, PSSamplerSelector ssel);

View File

@ -279,7 +279,6 @@ public:
bool SetFeatureLevel(D3D_FEATURE_LEVEL level, bool compat_mode);
void GetFeatureLevel(D3D_FEATURE_LEVEL& level) const {level = m_shader.level;}
virtual void SetupIA(const void* vertex, int vertex_count, const uint32* index, int index_count, int prim) = 0;
virtual void SetupVS(VSSelector sel, const VSConstantBuffer* cb) = 0;
virtual void SetupGS(GSSelector sel) = 0;
virtual void SetupPS(PSSelector sel, const PSConstantBuffer* cb, PSSamplerSelector ssel) = 0;

View File

@ -91,6 +91,7 @@ void GSDrawScanline::BeginDraw(const GSRasterizerData* data)
sel.fb = m_global.sel.fb;
sel.zb = m_global.sel.zb;
sel.zoverflow = m_global.sel.zoverflow;
sel.notest = m_global.sel.notest;
m_sp = m_sp_map[sel];
}
@ -272,17 +273,24 @@ void GSDrawScanline::DrawScanline(int pixels, int left, int top, const GSVertexS
// Init
int skip = left & 3;
int skip, steps;
left -= skip;
int steps = pixels + skip - 4;
if(!sel.notest)
{
skip = left & 3;
steps = pixels + skip - 4;
left -= skip;
test = GSDrawScanlineCodeGenerator::m_test[skip] | GSDrawScanlineCodeGenerator::m_test[7 + (steps & (steps >> 31))];
}
else
{
skip = 0;
steps = pixels - 4;
}
const GSVector2i* fza_base = &m_global.fzbr[top];
const GSVector2i* fza_offset = &m_global.fzbc[left >> 2];
test = GSDrawScanlineCodeGenerator::m_test[skip] | GSDrawScanlineCodeGenerator::m_test[7 + (steps & (steps >> 31))];
if(sel.prim != GS_SPRITE_CLASS)
{
if(sel.fwrite && sel.fge)
@ -318,7 +326,7 @@ void GSDrawScanline::DrawScanline(int pixels, int left, int top, const GSVertexS
}
else if(sel.ltf)
{
vf = v.xxzzlh().srl16(1);
vf = v.xxzzlh().srl16(16 - GS_BILINEAR_PRECISION).sll16(15 - GS_BILINEAR_PRECISION);
}
s = GSVector4::cast(u);
@ -508,8 +516,8 @@ void GSDrawScanline::DrawScanline(int pixels, int left, int top, const GSVertexS
u -= 0x8000;
v -= 0x8000;
uf = u.xxzzlh().srl16(1);
vf = v.xxzzlh().srl16(1);
uf = u.xxzzlh().srl16(16 - GS_BILINEAR_PRECISION).sll16(15 - GS_BILINEAR_PRECISION);
vf = v.xxzzlh().srl16(16 - GS_BILINEAR_PRECISION).sll16(15 - GS_BILINEAR_PRECISION);
}
GSVector4i uv0 = u.sra32(16).ps32(v.sra32(16));
@ -629,8 +637,8 @@ void GSDrawScanline::DrawScanline(int pixels, int left, int top, const GSVertexS
u -= 0x8000;
v -= 0x8000;
uf = u.xxzzlh().srl16(1);
vf = v.xxzzlh().srl16(1);
uf = u.xxzzlh().srl16(16 - GS_BILINEAR_PRECISION).sll16(15 - GS_BILINEAR_PRECISION);
vf = v.xxzzlh().srl16(16 - GS_BILINEAR_PRECISION).sll16(15 - GS_BILINEAR_PRECISION);
}
GSVector4i uv0 = u.sra32(16).ps32(v.sra32(16));
@ -764,11 +772,11 @@ void GSDrawScanline::DrawScanline(int pixels, int left, int top, const GSVertexS
if(sel.ltf)
{
uf = u.xxzzlh().srl16(1);
uf = u.xxzzlh().srl16(16 - GS_BILINEAR_PRECISION).sll16(15 - GS_BILINEAR_PRECISION);
if(sel.prim != GS_SPRITE_CLASS)
{
vf = v.xxzzlh().srl16(1);
vf = v.xxzzlh().srl16(16 - GS_BILINEAR_PRECISION).sll16(15 - GS_BILINEAR_PRECISION);
}
}
@ -1000,27 +1008,30 @@ void GSDrawScanline::DrawScanline(int pixels, int left, int top, const GSVertexS
int fzm = 0;
if(sel.fwrite)
if(!sel.notest)
{
fm |= test;
}
if(sel.fwrite)
{
fm |= test;
}
if(sel.zwrite)
{
zm |= test;
}
if(sel.zwrite)
{
zm |= test;
}
if(sel.fwrite && sel.zwrite)
{
fzm = ~(fm == GSVector4i::xffffffff()).ps32(zm == GSVector4i::xffffffff()).mask();
}
else if(sel.fwrite)
{
fzm = ~(fm == GSVector4i::xffffffff()).ps32().mask();
}
else if(sel.zwrite)
{
fzm = ~(zm == GSVector4i::xffffffff()).ps32().mask();
if(sel.fwrite && sel.zwrite)
{
fzm = ~(fm == GSVector4i::xffffffff()).ps32(zm == GSVector4i::xffffffff()).mask();
}
else if(sel.fwrite)
{
fzm = ~(fm == GSVector4i::xffffffff()).ps32().mask();
}
else if(sel.zwrite)
{
fzm = ~(zm == GSVector4i::xffffffff()).ps32().mask();
}
}
// WriteZBuf
@ -1030,16 +1041,39 @@ void GSDrawScanline::DrawScanline(int pixels, int left, int top, const GSVertexS
if(sel.ztest && sel.zpsm < 2)
{
zs = zs.blend8(zd, zm);
}
if(fzm & 0x0f00) GSVector4i::storel((uint8*)m_global.vm + za * 2, zs);
if(fzm & 0xf000) GSVector4i::storeh((uint8*)m_global.vm + za * 2 + 16, zs);
bool fast = sel.ztest ? sel.zpsm < 2 : sel.zpsm == 0 && sel.notest;
if(sel.notest)
{
if(fast)
{
GSVector4i::storel((uint8*)m_global.vm + za * 2, zs);
GSVector4i::storeh((uint8*)m_global.vm + za * 2 + 16, zs);
}
else
{
WritePixel(zs, za, 0, sel.zpsm);
WritePixel(zs, za, 1, sel.zpsm);
WritePixel(zs, za, 2, sel.zpsm);
WritePixel(zs, za, 3, sel.zpsm);
}
}
else
{
if(fzm & 0x0300) WritePixel(zs, za, 0, sel.zpsm);
if(fzm & 0x0c00) WritePixel(zs, za, 1, sel.zpsm);
if(fzm & 0x3000) WritePixel(zs, za, 2, sel.zpsm);
if(fzm & 0xc000) WritePixel(zs, za, 3, sel.zpsm);
if(fast)
{
if(fzm & 0x0f00) GSVector4i::storel((uint8*)m_global.vm + za * 2, zs);
if(fzm & 0xf000) GSVector4i::storeh((uint8*)m_global.vm + za * 2 + 16, zs);
}
else
{
if(fzm & 0x0300) WritePixel(zs, za, 0, sel.zpsm);
if(fzm & 0x0c00) WritePixel(zs, za, 1, sel.zpsm);
if(fzm & 0x3000) WritePixel(zs, za, 2, sel.zpsm);
if(fzm & 0xc000) WritePixel(zs, za, 3, sel.zpsm);
}
}
}
@ -1197,17 +1231,37 @@ void GSDrawScanline::DrawScanline(int pixels, int left, int top, const GSVertexS
fs = fs.blend(fd, fm);
}
if(sel.rfb && sel.fpsm < 2)
bool fast = sel.rfb ? sel.fpsm < 2 : sel.fpsm == 0 && sel.notest;
if(sel.notest)
{
if(fzm & 0x000f) GSVector4i::storel((uint8*)m_global.vm + fa * 2, fs);
if(fzm & 0x00f0) GSVector4i::storeh((uint8*)m_global.vm + fa * 2 + 16, fs);
if(fast)
{
GSVector4i::storel((uint8*)m_global.vm + fa * 2, fs);
GSVector4i::storeh((uint8*)m_global.vm + fa * 2 + 16, fs);
}
else
{
WritePixel(fs, fa, 0, sel.fpsm);
WritePixel(fs, fa, 1, sel.fpsm);
WritePixel(fs, fa, 2, sel.fpsm);
WritePixel(fs, fa, 3, sel.fpsm);
}
}
else
{
if(fzm & 0x0003) WritePixel(fs, fa, 0, sel.fpsm);
if(fzm & 0x000c) WritePixel(fs, fa, 1, sel.fpsm);
if(fzm & 0x0030) WritePixel(fs, fa, 2, sel.fpsm);
if(fzm & 0x00c0) WritePixel(fs, fa, 3, sel.fpsm);
if(fast)
{
if(fzm & 0x000f) GSVector4i::storel((uint8*)m_global.vm + fa * 2, fs);
if(fzm & 0x00f0) GSVector4i::storeh((uint8*)m_global.vm + fa * 2 + 16, fs);
}
else
{
if(fzm & 0x0003) WritePixel(fs, fa, 0, sel.fpsm);
if(fzm & 0x000c) WritePixel(fs, fa, 1, sel.fpsm);
if(fzm & 0x0030) WritePixel(fs, fa, 2, sel.fpsm);
if(fzm & 0x00c0) WritePixel(fs, fa, 3, sel.fpsm);
}
}
}
}
@ -1273,7 +1327,10 @@ void GSDrawScanline::DrawScanline(int pixels, int left, int top, const GSVertexS
}
}
test = GSDrawScanlineCodeGenerator::m_test[7 + (steps & (steps >> 31))];
if(!sel.notest)
{
test = GSDrawScanlineCodeGenerator::m_test[7 + (steps & (steps >> 31))];
}
}
}
@ -1492,6 +1549,7 @@ void GSDrawScanline::DrawRectT(const int* RESTRICT row, const int* RESTRICT col,
if(masked) ASSERT(mask.u32[0] != 0);
color = color.andnot(mask);
c = color.extract32<0>();
GSVector4i br = r.ralign<Align_Inside>(GSVector2i(8 * 4 / sizeof(T), 8));

View File

@ -250,31 +250,40 @@ L("exit");
void GSDrawScanlineCodeGenerator::Init()
{
// int skip = left & 3;
if(!m_sel.notest)
{
// int skip = left & 3;
mov(ebx, edx);
and(edx, 3);
mov(ebx, edx);
and(edx, 3);
// left -= skip;
// int steps = pixels + skip - 4;
sub(ebx, edx);
lea(ecx, ptr[ecx + edx - 4]);
// int steps = pixels + skip - 4;
// left -= skip;
lea(ecx, ptr[ecx + edx - 4]);
sub(ebx, edx);
// GSVector4i test = m_test[skip] | m_test[7 + (steps & (steps >> 31))];
// GSVector4i test = m_test[skip] | m_test[7 + (steps & (steps >> 31))];
shl(edx, 4);
shl(edx, 4);
vmovdqa(xmm7, ptr[edx + (size_t)&m_test[0]]);
vmovdqa(xmm7, ptr[edx + (size_t)&m_test[0]]);
mov(eax, ecx);
sar(eax, 31);
and(eax, ecx);
shl(eax, 4);
mov(eax, ecx);
sar(eax, 31);
and(eax, ecx);
shl(eax, 4);
vpor(xmm7, ptr[eax + (size_t)&m_test[7]]);
vpor(xmm7, ptr[eax + (size_t)&m_test[7]]);
}
else
{
mov(ebx, edx); // left
xor(edx, edx); // skip
lea(ecx, ptr[ecx - 4]); // steps
}
// GSVector2i* fza_base = &m_local.gd->fzbr[top];
@ -380,7 +389,8 @@ void GSDrawScanlineCodeGenerator::Init()
{
vpshuflw(xmm6, xmm3, _MM_SHUFFLE(2, 2, 0, 0));
vpshufhw(xmm6, xmm6, _MM_SHUFFLE(2, 2, 0, 0));
vpsrlw(xmm6, 1);
vpsrlw(xmm6, 16 - GS_BILINEAR_PRECISION);
if(GS_BILINEAR_PRECISION < 15) vpsllw(xmm6, 15 - GS_BILINEAR_PRECISION);
vmovdqa(ptr[&m_local.temp.vf], xmm6);
}
}
@ -573,14 +583,17 @@ void GSDrawScanlineCodeGenerator::Step()
}
}
// test = m_test[7 + (steps & (steps >> 31))];
if(!m_sel.notest)
{
// test = m_test[7 + (steps & (steps >> 31))];
mov(edx, ecx);
sar(edx, 31);
and(edx, ecx);
shl(edx, 4);
mov(edx, ecx);
sar(edx, 31);
and(edx, ecx);
shl(edx, 4);
vmovdqa(xmm7, ptr[edx + (size_t)&m_test[7]]);
vmovdqa(xmm7, ptr[edx + (size_t)&m_test[7]]);
}
}
void GSDrawScanlineCodeGenerator::TestZ(const Xmm& temp1, const Xmm& temp2)
@ -730,7 +743,8 @@ void GSDrawScanlineCodeGenerator::SampleTexture()
vpshuflw(xmm0, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
vpshufhw(xmm0, xmm0, _MM_SHUFFLE(2, 2, 0, 0));
vpsrlw(xmm0, 1);
vpsrlw(xmm0, 16 - GS_BILINEAR_PRECISION);
if(GS_BILINEAR_PRECISION < 15) vpsllw(xmm0, 15 - GS_BILINEAR_PRECISION);
vmovdqa(ptr[&m_local.temp.uf], xmm0);
if(m_sel.prim != GS_SPRITE_CLASS)
@ -739,7 +753,8 @@ void GSDrawScanlineCodeGenerator::SampleTexture()
vpshuflw(xmm0, xmm3, _MM_SHUFFLE(2, 2, 0, 0));
vpshufhw(xmm0, xmm0, _MM_SHUFFLE(2, 2, 0, 0));
vpsrlw(xmm0, 1);
vpsrlw(xmm0, 16 - GS_BILINEAR_PRECISION);
if(GS_BILINEAR_PRECISION < 15) vpsllw(xmm0, 15 - GS_BILINEAR_PRECISION);
vmovdqa(ptr[&m_local.temp.vf], xmm0);
}
}
@ -1283,14 +1298,16 @@ return;
vpshuflw(xmm0, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
vpshufhw(xmm0, xmm0, _MM_SHUFFLE(2, 2, 0, 0));
vpsrlw(xmm0, 1);
vpsrlw(xmm0, 16 - GS_BILINEAR_PRECISION);
if(GS_BILINEAR_PRECISION < 15) vpsllw(xmm0, 15 - GS_BILINEAR_PRECISION);
vmovdqa(ptr[&m_local.temp.uf], xmm0);
// GSVector4i vf = v.xxzzlh().srl16(1);
vpshuflw(xmm0, xmm3, _MM_SHUFFLE(2, 2, 0, 0));
vpshufhw(xmm0, xmm0, _MM_SHUFFLE(2, 2, 0, 0));
vpsrlw(xmm0, 1);
vpsrlw(xmm0, 16 - GS_BILINEAR_PRECISION);
if(GS_BILINEAR_PRECISION < 15) vpsllw(xmm0, 15 - GS_BILINEAR_PRECISION);
vmovdqa(ptr[&m_local.temp.vf], xmm0);
}
@ -1524,14 +1541,16 @@ return;
vpshuflw(xmm0, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
vpshufhw(xmm0, xmm0, _MM_SHUFFLE(2, 2, 0, 0));
vpsrlw(xmm0, 1);
vpsrlw(xmm0, 16 - GS_BILINEAR_PRECISION);
if(GS_BILINEAR_PRECISION < 15) vpsllw(xmm0, 15 - GS_BILINEAR_PRECISION);
vmovdqa(ptr[&m_local.temp.uf], xmm0);
// GSVector4i vf = v.xxzzlh().srl16(1);
vpshuflw(xmm0, xmm3, _MM_SHUFFLE(2, 2, 0, 0));
vpshufhw(xmm0, xmm0, _MM_SHUFFLE(2, 2, 0, 0));
vpsrlw(xmm0, 1);
vpsrlw(xmm0, 16 - GS_BILINEAR_PRECISION);
if(GS_BILINEAR_PRECISION < 15) vpsllw(xmm0, 15 - GS_BILINEAR_PRECISION);
vmovdqa(ptr[&m_local.temp.vf], xmm0);
}
@ -2302,6 +2321,11 @@ void GSDrawScanlineCodeGenerator::TestDestAlpha()
void GSDrawScanlineCodeGenerator::WriteMask()
{
if(m_sel.notest)
{
return;
}
// fm |= test;
// zm |= test;
@ -2348,17 +2372,17 @@ void GSDrawScanlineCodeGenerator::WriteZBuf()
return;
}
bool fast = m_sel.ztest && m_sel.zpsm < 2;
vmovdqa(xmm1, ptr[m_sel.prim != GS_SPRITE_CLASS ? &m_local.temp.zs : &m_local.p.z]);
if(fast)
if(m_sel.ztest && m_sel.zpsm < 2)
{
// zs = zs.blend8(zd, zm);
vpblendvb(xmm1, ptr[&m_local.temp.zd], xmm4);
}
bool fast = m_sel.ztest ? m_sel.zpsm < 2 : m_sel.zpsm == 0 && m_sel.notest;
WritePixel(xmm1, ebp, dh, fast, m_sel.zpsm, 1);
}
@ -2664,7 +2688,7 @@ void GSDrawScanlineCodeGenerator::WriteFrame()
blend(xmm5, xmm2, xmm3); // TODO: could be skipped in certain cases, depending on fpsm and fm
}
bool fast = m_sel.rfb && m_sel.fpsm < 2;
bool fast = m_sel.rfb ? m_sel.fpsm < 2 : m_sel.fpsm == 0 && m_sel.notest;
WritePixel(xmm5, ebx, dl, fast, m_sel.fpsm, 0);
}
@ -2677,49 +2701,67 @@ void GSDrawScanlineCodeGenerator::ReadPixel(const Xmm& dst, const Reg32& addr)
void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg32& addr, const Reg8& mask, bool fast, int psm, int fz)
{
if(fast)
if(m_sel.notest)
{
// if(fzm & 0x0f) GSVector4i::storel(&vm16[addr + 0], fs);
// if(fzm & 0xf0) GSVector4i::storeh(&vm16[addr + 8], fs);
test(mask, 0x0f);
je("@f");
vmovq(qword[addr * 2 + (size_t)m_local.gd->vm], src);
L("@@");
test(mask, 0xf0);
je("@f");
vmovhps(qword[addr * 2 + (size_t)m_local.gd->vm + 8 * 2], src);
L("@@");
// vmaskmovps?
if(fast)
{
vmovq(qword[addr * 2 + (size_t)m_local.gd->vm], src);
vmovhps(qword[addr * 2 + (size_t)m_local.gd->vm + 8 * 2], src);
}
else
{
WritePixel(src, addr, 0, psm);
WritePixel(src, addr, 1, psm);
WritePixel(src, addr, 2, psm);
WritePixel(src, addr, 3, psm);
}
}
else
{
// if(fzm & 0x03) WritePixel(fpsm, &vm16[addr + 0], fs.extract32<0>());
// if(fzm & 0x0c) WritePixel(fpsm, &vm16[addr + 2], fs.extract32<1>());
// if(fzm & 0x30) WritePixel(fpsm, &vm16[addr + 8], fs.extract32<2>());
// if(fzm & 0xc0) WritePixel(fpsm, &vm16[addr + 10], fs.extract32<3>());
if(fast)
{
// if(fzm & 0x0f) GSVector4i::storel(&vm16[addr + 0], fs);
// if(fzm & 0xf0) GSVector4i::storeh(&vm16[addr + 8], fs);
test(mask, 0x03);
je("@f");
WritePixel(src, addr, 0, psm);
L("@@");
test(mask, 0x0f);
je("@f");
vmovq(qword[addr * 2 + (size_t)m_local.gd->vm], src);
L("@@");
test(mask, 0x0c);
je("@f");
WritePixel(src, addr, 1, psm);
L("@@");
test(mask, 0xf0);
je("@f");
vmovhps(qword[addr * 2 + (size_t)m_local.gd->vm + 8 * 2], src);
L("@@");
test(mask, 0x30);
je("@f");
WritePixel(src, addr, 2, psm);
L("@@");
// vmaskmovps?
}
else
{
// if(fzm & 0x03) WritePixel(fpsm, &vm16[addr + 0], fs.extract32<0>());
// if(fzm & 0x0c) WritePixel(fpsm, &vm16[addr + 2], fs.extract32<1>());
// if(fzm & 0x30) WritePixel(fpsm, &vm16[addr + 8], fs.extract32<2>());
// if(fzm & 0xc0) WritePixel(fpsm, &vm16[addr + 10], fs.extract32<3>());
test(mask, 0xc0);
je("@f");
WritePixel(src, addr, 3, psm);
L("@@");
test(mask, 0x03);
je("@f");
WritePixel(src, addr, 0, psm);
L("@@");
test(mask, 0x0c);
je("@f");
WritePixel(src, addr, 1, psm);
L("@@");
test(mask, 0x30);
je("@f");
WritePixel(src, addr, 2, psm);
L("@@");
test(mask, 0xc0);
je("@f");
WritePixel(src, addr, 3, psm);
L("@@");
}
}
}

View File

@ -250,31 +250,40 @@ L("exit");
void GSDrawScanlineCodeGenerator::Init()
{
// int skip = left & 3;
if(!m_sel.notest)
{
// int skip = left & 3;
mov(ebx, edx);
and(edx, 3);
mov(ebx, edx);
and(edx, 3);
// left -= skip;
// int steps = pixels + skip - 4;
sub(ebx, edx);
lea(ecx, ptr[ecx + edx - 4]);
// int steps = pixels + skip - 4;
// left -= skip;
lea(ecx, ptr[ecx + edx - 4]);
sub(ebx, edx);
// GSVector4i test = m_test[skip] | m_test[7 + (steps & (steps >> 31))];
// GSVector4i test = m_test[skip] | m_test[7 + (steps & (steps >> 31))];
shl(edx, 4);
shl(edx, 4);
movdqa(xmm7, ptr[edx + (size_t)&m_test[0]]);
movdqa(xmm7, ptr[edx + (size_t)&m_test[0]]);
mov(eax, ecx);
sar(eax, 31);
and(eax, ecx);
shl(eax, 4);
mov(eax, ecx);
sar(eax, 31);
and(eax, ecx);
shl(eax, 4);
por(xmm7, ptr[eax + (size_t)&m_test[7]]);
por(xmm7, ptr[eax + (size_t)&m_test[7]]);
}
else
{
mov(ebx, edx); // left
xor(edx, edx); // skip
lea(ecx, ptr[ecx - 4]); // steps
}
// GSVector2i* fza_base = &m_local.gd->fzbr[top];
@ -380,7 +389,8 @@ void GSDrawScanlineCodeGenerator::Init()
{
pshuflw(xmm6, xmm3, _MM_SHUFFLE(2, 2, 0, 0));
pshufhw(xmm6, xmm6, _MM_SHUFFLE(2, 2, 0, 0));
psrlw(xmm6, 1);
psrlw(xmm6, 16 - GS_BILINEAR_PRECISION);
if(GS_BILINEAR_PRECISION < 15) psllw(xmm6, 15 - GS_BILINEAR_PRECISION);
movdqa(ptr[&m_local.temp.vf], xmm6);
}
}
@ -578,14 +588,17 @@ void GSDrawScanlineCodeGenerator::Step()
}
}
// test = m_test[7 + (steps & (steps >> 31))];
if(!m_sel.notest)
{
// test = m_test[7 + (steps & (steps >> 31))];
mov(edx, ecx);
sar(edx, 31);
and(edx, ecx);
shl(edx, 4);
mov(edx, ecx);
sar(edx, 31);
and(edx, ecx);
shl(edx, 4);
movdqa(xmm7, ptr[edx + (size_t)&m_test[7]]);
movdqa(xmm7, ptr[edx + (size_t)&m_test[7]]);
}
}
void GSDrawScanlineCodeGenerator::TestZ(const Xmm& temp1, const Xmm& temp2)
@ -735,7 +748,8 @@ void GSDrawScanlineCodeGenerator::SampleTexture()
pshuflw(xmm0, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
pshufhw(xmm0, xmm0, _MM_SHUFFLE(2, 2, 0, 0));
psrlw(xmm0, 1);
psrlw(xmm0, 16 - GS_BILINEAR_PRECISION);
if(GS_BILINEAR_PRECISION < 15) psllw(xmm0, 15 - GS_BILINEAR_PRECISION);
movdqa(ptr[&m_local.temp.uf], xmm0);
if(m_sel.prim != GS_SPRITE_CLASS)
@ -744,7 +758,8 @@ void GSDrawScanlineCodeGenerator::SampleTexture()
pshuflw(xmm0, xmm3, _MM_SHUFFLE(2, 2, 0, 0));
pshufhw(xmm0, xmm0, _MM_SHUFFLE(2, 2, 0, 0));
psrlw(xmm0, 1);
psrlw(xmm0, 16 - GS_BILINEAR_PRECISION);
if(GS_BILINEAR_PRECISION < 15) psllw(xmm0, 15 - GS_BILINEAR_PRECISION);
movdqa(ptr[&m_local.temp.vf], xmm0);
}
}
@ -1338,14 +1353,16 @@ void GSDrawScanlineCodeGenerator::SampleTextureLOD()
pshuflw(xmm0, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
pshufhw(xmm0, xmm0, _MM_SHUFFLE(2, 2, 0, 0));
psrlw(xmm0, 1);
psrlw(xmm0, 16 - GS_BILINEAR_PRECISION);
if(GS_BILINEAR_PRECISION < 15) psllw(xmm0, 15 - GS_BILINEAR_PRECISION);
movdqa(ptr[&m_local.temp.uf], xmm0);
// GSVector4i vf = v.xxzzlh().srl16(1);
pshuflw(xmm0, xmm3, _MM_SHUFFLE(2, 2, 0, 0));
pshufhw(xmm0, xmm0, _MM_SHUFFLE(2, 2, 0, 0));
psrlw(xmm0, 1);
psrlw(xmm0, 16 - GS_BILINEAR_PRECISION);
if(GS_BILINEAR_PRECISION < 15) psllw(xmm0, 15 - GS_BILINEAR_PRECISION);
movdqa(ptr[&m_local.temp.vf], xmm0);
}
@ -1591,14 +1608,16 @@ void GSDrawScanlineCodeGenerator::SampleTextureLOD()
pshuflw(xmm0, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
pshufhw(xmm0, xmm0, _MM_SHUFFLE(2, 2, 0, 0));
psrlw(xmm0, 1);
psrlw(xmm0, 16 - GS_BILINEAR_PRECISION);
if(GS_BILINEAR_PRECISION < 15) psllw(xmm0, 15 - GS_BILINEAR_PRECISION);
movdqa(ptr[&m_local.temp.uf], xmm0);
// GSVector4i vf = v.xxzzlh().srl16(1);
pshuflw(xmm0, xmm3, _MM_SHUFFLE(2, 2, 0, 0));
pshufhw(xmm0, xmm0, _MM_SHUFFLE(2, 2, 0, 0));
psrlw(xmm0, 1);
psrlw(xmm0, 16 - GS_BILINEAR_PRECISION);
if(GS_BILINEAR_PRECISION < 15) psllw(xmm0, 15 - GS_BILINEAR_PRECISION);
movdqa(ptr[&m_local.temp.vf], xmm0);
}
@ -2415,6 +2434,11 @@ void GSDrawScanlineCodeGenerator::TestDestAlpha()
void GSDrawScanlineCodeGenerator::WriteMask()
{
if(m_sel.notest)
{
return;
}
// fm |= test;
// zm |= test;
@ -2462,11 +2486,9 @@ void GSDrawScanlineCodeGenerator::WriteZBuf()
return;
}
bool fast = m_sel.ztest && m_sel.zpsm < 2;
movdqa(xmm1, ptr[m_sel.prim != GS_SPRITE_CLASS ? &m_local.temp.zs : &m_local.p.z]);
if(fast)
if(m_sel.ztest && m_sel.zpsm < 2)
{
// zs = zs.blend8(zd, zm);
@ -2475,6 +2497,8 @@ void GSDrawScanlineCodeGenerator::WriteZBuf()
blend8(xmm1, xmm7);
}
bool fast = m_sel.ztest ? m_sel.zpsm < 2 : m_sel.zpsm == 0 && m_sel.notest;
WritePixel(xmm1, ebp, dh, fast, m_sel.zpsm, 1);
}
@ -2804,7 +2828,7 @@ void GSDrawScanlineCodeGenerator::WriteFrame()
blend(xmm5, xmm2, xmm3); // TODO: could be skipped in certain cases, depending on fpsm and fm
}
bool fast = m_sel.rfb && m_sel.fpsm < 2;
bool fast = m_sel.rfb ? m_sel.fpsm < 2 : m_sel.fpsm == 0 && m_sel.notest;
WritePixel(xmm5, ebx, dl, fast, m_sel.fpsm, 0);
}
@ -2817,47 +2841,65 @@ void GSDrawScanlineCodeGenerator::ReadPixel(const Xmm& dst, const Reg32& addr)
void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg32& addr, const Reg8& mask, bool fast, int psm, int fz)
{
if(fast)
if(m_sel.notest)
{
// if(fzm & 0x0f) GSVector4i::storel(&vm16[addr + 0], fs);
// if(fzm & 0xf0) GSVector4i::storeh(&vm16[addr + 8], fs);
test(mask, 0x0f);
je("@f");
movq(qword[addr * 2 + (size_t)m_local.gd->vm], src);
L("@@");
test(mask, 0xf0);
je("@f");
movhps(qword[addr * 2 + (size_t)m_local.gd->vm + 8 * 2], src);
L("@@");
if(fast)
{
movq(qword[addr * 2 + (size_t)m_local.gd->vm], src);
movhps(qword[addr * 2 + (size_t)m_local.gd->vm + 8 * 2], src);
}
else
{
WritePixel(src, addr, 0, psm);
WritePixel(src, addr, 1, psm);
WritePixel(src, addr, 2, psm);
WritePixel(src, addr, 3, psm);
}
}
else
{
// if(fzm & 0x03) WritePixel(fpsm, &vm16[addr + 0], fs.extract32<0>());
// if(fzm & 0x0c) WritePixel(fpsm, &vm16[addr + 2], fs.extract32<1>());
// if(fzm & 0x30) WritePixel(fpsm, &vm16[addr + 8], fs.extract32<2>());
// if(fzm & 0xc0) WritePixel(fpsm, &vm16[addr + 10], fs.extract32<3>());
if(fast)
{
// if(fzm & 0x0f) GSVector4i::storel(&vm16[addr + 0], fs);
// if(fzm & 0xf0) GSVector4i::storeh(&vm16[addr + 8], fs);
test(mask, 0x03);
je("@f");
WritePixel(src, addr, 0, psm);
L("@@");
test(mask, 0x0f);
je("@f");
movq(qword[addr * 2 + (size_t)m_local.gd->vm], src);
L("@@");
test(mask, 0x0c);
je("@f");
WritePixel(src, addr, 1, psm);
L("@@");
test(mask, 0xf0);
je("@f");
movhps(qword[addr * 2 + (size_t)m_local.gd->vm + 8 * 2], src);
L("@@");
}
else
{
// if(fzm & 0x03) WritePixel(fpsm, &vm16[addr + 0], fs.extract32<0>());
// if(fzm & 0x0c) WritePixel(fpsm, &vm16[addr + 2], fs.extract32<1>());
// if(fzm & 0x30) WritePixel(fpsm, &vm16[addr + 8], fs.extract32<2>());
// if(fzm & 0xc0) WritePixel(fpsm, &vm16[addr + 10], fs.extract32<3>());
test(mask, 0x30);
je("@f");
WritePixel(src, addr, 2, psm);
L("@@");
test(mask, 0x03);
je("@f");
WritePixel(src, addr, 0, psm);
L("@@");
test(mask, 0xc0);
je("@f");
WritePixel(src, addr, 3, psm);
L("@@");
test(mask, 0x0c);
je("@f");
WritePixel(src, addr, 1, psm);
L("@@");
test(mask, 0x30);
je("@f");
WritePixel(src, addr, 2, psm);
L("@@");
test(mask, 0xc0);
je("@f");
WritePixel(src, addr, 3, psm);
L("@@");
}
}
}

View File

@ -53,7 +53,8 @@ public:
GSOffset* fb;
GSOffset* zb;
GSOffset* tex;
GSPixelOffset4* fzb;
GSPixelOffset* fzb;
GSPixelOffset4* fzb4;
} offset;
GSDrawingContext()

View File

@ -342,55 +342,55 @@ GSLocalMemory::GSLocalMemory()
m_psm[PSM_PSMCT24].rtx = &GSLocalMemory::ReadTexture24;
m_psm[PSM_PSMCT16].rtx = &GSLocalMemory::ReadTexture16;
m_psm[PSM_PSMCT16S].rtx = &GSLocalMemory::ReadTexture16S;
m_psm[PSM_PSMCT16S].rtx = &GSLocalMemory::ReadTexture16;
m_psm[PSM_PSMT8].rtx = &GSLocalMemory::ReadTexture8;
m_psm[PSM_PSMT4].rtx = &GSLocalMemory::ReadTexture4;
m_psm[PSM_PSMT8H].rtx = &GSLocalMemory::ReadTexture8H;
m_psm[PSM_PSMT4HL].rtx = &GSLocalMemory::ReadTexture4HL;
m_psm[PSM_PSMT4HH].rtx = &GSLocalMemory::ReadTexture4HH;
m_psm[PSM_PSMZ32].rtx = &GSLocalMemory::ReadTexture32Z;
m_psm[PSM_PSMZ24].rtx = &GSLocalMemory::ReadTexture24Z;
m_psm[PSM_PSMZ16].rtx = &GSLocalMemory::ReadTexture16Z;
m_psm[PSM_PSMZ16S].rtx = &GSLocalMemory::ReadTexture16SZ;
m_psm[PSM_PSMZ32].rtx = &GSLocalMemory::ReadTexture32;
m_psm[PSM_PSMZ24].rtx = &GSLocalMemory::ReadTexture24;
m_psm[PSM_PSMZ16].rtx = &GSLocalMemory::ReadTexture16;
m_psm[PSM_PSMZ16S].rtx = &GSLocalMemory::ReadTexture16;
m_psm[PSM_PSMCT24].rtxP = &GSLocalMemory::ReadTexture24;
m_psm[PSM_PSMCT16].rtxP = &GSLocalMemory::ReadTexture16;
m_psm[PSM_PSMCT16S].rtxP = &GSLocalMemory::ReadTexture16S;
m_psm[PSM_PSMCT16S].rtxP = &GSLocalMemory::ReadTexture16;
m_psm[PSM_PSMT8].rtxP = &GSLocalMemory::ReadTexture8P;
m_psm[PSM_PSMT4].rtxP = &GSLocalMemory::ReadTexture4P;
m_psm[PSM_PSMT8H].rtxP = &GSLocalMemory::ReadTexture8HP;
m_psm[PSM_PSMT4HL].rtxP = &GSLocalMemory::ReadTexture4HLP;
m_psm[PSM_PSMT4HH].rtxP = &GSLocalMemory::ReadTexture4HHP;
m_psm[PSM_PSMZ32].rtxP = &GSLocalMemory::ReadTexture32Z;
m_psm[PSM_PSMZ24].rtxP = &GSLocalMemory::ReadTexture24Z;
m_psm[PSM_PSMZ16].rtxP = &GSLocalMemory::ReadTexture16Z;
m_psm[PSM_PSMZ16S].rtxP = &GSLocalMemory::ReadTexture16SZ;
m_psm[PSM_PSMZ32].rtxP = &GSLocalMemory::ReadTexture32;
m_psm[PSM_PSMZ24].rtxP = &GSLocalMemory::ReadTexture24;
m_psm[PSM_PSMZ16].rtxP = &GSLocalMemory::ReadTexture16;
m_psm[PSM_PSMZ16S].rtxP = &GSLocalMemory::ReadTexture16;
m_psm[PSM_PSMCT24].rtxb = &GSLocalMemory::ReadTextureBlock24;
m_psm[PSM_PSMCT16].rtxb = &GSLocalMemory::ReadTextureBlock16;
m_psm[PSM_PSMCT16S].rtxb = &GSLocalMemory::ReadTextureBlock16S;
m_psm[PSM_PSMCT16S].rtxb = &GSLocalMemory::ReadTextureBlock16;
m_psm[PSM_PSMT8].rtxb = &GSLocalMemory::ReadTextureBlock8;
m_psm[PSM_PSMT4].rtxb = &GSLocalMemory::ReadTextureBlock4;
m_psm[PSM_PSMT8H].rtxb = &GSLocalMemory::ReadTextureBlock8H;
m_psm[PSM_PSMT4HL].rtxb = &GSLocalMemory::ReadTextureBlock4HL;
m_psm[PSM_PSMT4HH].rtxb = &GSLocalMemory::ReadTextureBlock4HH;
m_psm[PSM_PSMZ32].rtxb = &GSLocalMemory::ReadTextureBlock32Z;
m_psm[PSM_PSMZ24].rtxb = &GSLocalMemory::ReadTextureBlock24Z;
m_psm[PSM_PSMZ16].rtxb = &GSLocalMemory::ReadTextureBlock16Z;
m_psm[PSM_PSMZ16S].rtxb = &GSLocalMemory::ReadTextureBlock16SZ;
m_psm[PSM_PSMZ32].rtxb = &GSLocalMemory::ReadTextureBlock32;
m_psm[PSM_PSMZ24].rtxb = &GSLocalMemory::ReadTextureBlock24;
m_psm[PSM_PSMZ16].rtxb = &GSLocalMemory::ReadTextureBlock16;
m_psm[PSM_PSMZ16S].rtxb = &GSLocalMemory::ReadTextureBlock16;
m_psm[PSM_PSMCT24].rtxbP = &GSLocalMemory::ReadTextureBlock24;
m_psm[PSM_PSMCT16].rtxbP = &GSLocalMemory::ReadTextureBlock16;
m_psm[PSM_PSMCT16S].rtxbP = &GSLocalMemory::ReadTextureBlock16S;
m_psm[PSM_PSMCT16S].rtxbP = &GSLocalMemory::ReadTextureBlock16;
m_psm[PSM_PSMT8].rtxbP = &GSLocalMemory::ReadTextureBlock8P;
m_psm[PSM_PSMT4].rtxbP = &GSLocalMemory::ReadTextureBlock4P;
m_psm[PSM_PSMT8H].rtxbP = &GSLocalMemory::ReadTextureBlock8HP;
m_psm[PSM_PSMT4HL].rtxbP = &GSLocalMemory::ReadTextureBlock4HLP;
m_psm[PSM_PSMT4HH].rtxbP = &GSLocalMemory::ReadTextureBlock4HHP;
m_psm[PSM_PSMZ32].rtxbP = &GSLocalMemory::ReadTextureBlock32Z;
m_psm[PSM_PSMZ24].rtxbP = &GSLocalMemory::ReadTextureBlock24Z;
m_psm[PSM_PSMZ16].rtxbP = &GSLocalMemory::ReadTextureBlock16Z;
m_psm[PSM_PSMZ16S].rtxbP = &GSLocalMemory::ReadTextureBlock16SZ;
m_psm[PSM_PSMZ32].rtxbP = &GSLocalMemory::ReadTextureBlock32;
m_psm[PSM_PSMZ24].rtxbP = &GSLocalMemory::ReadTextureBlock24;
m_psm[PSM_PSMZ16].rtxbP = &GSLocalMemory::ReadTextureBlock16;
m_psm[PSM_PSMZ16S].rtxbP = &GSLocalMemory::ReadTextureBlock16;
m_psm[PSM_PSMCT16].bpp = m_psm[PSM_PSMCT16S].bpp = 16;
m_psm[PSM_PSMT8].bpp = 8;
@ -473,6 +473,62 @@ GSOffset* GSLocalMemory::GetOffset(uint32 bp, uint32 bw, uint32 psm)
return o;
}
// Looks up, and on first use builds, the combined frame/z buffer offset table for a FRAME/ZBUF pair.
//
// Tables are cached in m_pomap under a 32 bit key packed from FRAME.FBP, ZBUF.ZBP, FRAME.FBW and a
// folded form of both pixel storage formats, so a repeated call with the same setup returns the
// pointer that was stored the first time. The pointer is kept in m_pomap; this function never frees it.
GSPixelOffset* GSLocalMemory::GetPixelOffset(const GIFRegFRAME& FRAME, const GIFRegZBUF& ZBUF)
{
	const uint32 fbp = FRAME.Block();
	const uint32 zbp = ZBUF.Block();
	const uint32 fpsm = FRAME.PSM;
	const uint32 zpsm = ZBUF.PSM;
	const uint32 bw = FRAME.FBW;

	ASSERT(m_psm[fpsm].trbpp > 8 || m_psm[zpsm].trbpp > 8);

	// "(psm & 0x0f) ^ ((psm & 0x30) >> 2)" creates 4 bit unique identifiers for render target formats (only)

	const uint32 fkey = (fpsm & 0x0f) ^ ((fpsm & 0x30) >> 2);
	const uint32 zkey = (zpsm & 0x0f) ^ ((zpsm & 0x30) >> 2);

	// key layout: FBP from bit 0, ZBP from bit 9, bw from bit 18, frame format from bit 24, z format from bit 28
	// NOTE(review): assumes FBP/ZBP fit in 9 bits and FBW in 6 bits so the fields do not overlap - confirm against the register definitions

	const uint32 hash = (FRAME.FBP << 0) | (ZBUF.ZBP << 9) | (bw << 18) | (fkey << 24) | (zkey << 28);

	hash_map<uint32, GSPixelOffset*>::iterator cached = m_pomap.find(hash);

	if(cached != m_pomap.end())
	{
		return cached->second;
	}

	// not cached yet, allocate a 32 byte aligned table and remember what it was built from

	GSPixelOffset* po = (GSPixelOffset*)_aligned_malloc(sizeof(GSPixelOffset), 32);

	po->hash = hash;
	po->fbp = fbp;
	po->zbp = zbp;
	po->fpsm = fpsm;
	po->zpsm = zpsm;
	po->bw = bw;

	pixelAddress fpa = m_psm[fpsm].pa;
	pixelAddress zpa = m_psm[zpsm].pa;

	// bpp >> 5 is 1 for a 32 bpp format and 0 for anything narrower; the shift presumably turns
	// pixel addresses into the 16 bit units mentioned on GSPixelOffset - TODO confirm

	const int fs = m_psm[fpsm].bpp >> 5;
	const int zs = m_psm[zpsm].bpp >> 5;

	for(int n = 0; n < 2048; n++)
	{
		// x holds the frame buffer offset, y holds the z buffer offset

		po->row[n].x = (int)fpa(0, n, fbp, bw) << fs;
		po->row[n].y = (int)zpa(0, n, zbp, bw) << zs;

		po->col[n].x = m_psm[fpsm].rowOffset[0][n] << fs;
		po->col[n].y = m_psm[zpsm].rowOffset[0][n] << zs;
	}

	m_pomap[hash] = po;

	return po;
}
GSPixelOffset4* GSLocalMemory::GetPixelOffset4(const GIFRegFRAME& FRAME, const GIFRegZBUF& ZBUF)
{
uint32 fbp = FRAME.Block();
@ -1550,28 +1606,22 @@ void GSLocalMemory::ReadTexture24(const GSOffset* RESTRICT o, const GSVector4i&
void GSLocalMemory::ReadTexture16(const GSOffset* RESTRICT o, const GSVector4i& r, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA)
{
__aligned(uint16, 32) block[16 * 8];
FOREACH_BLOCK_START(r, 16, 8, 32)
if(TEXA.AEM)
{
ReadBlock16<true>(src, (uint8*)block, sizeof(block) / 8);
ExpandBlock16(block, dst, dstpitch, TEXA);
FOREACH_BLOCK_START(r, 16, 8, 32)
{
ReadAndExpandBlock16<true>(src, dst, dstpitch, TEXA);
}
FOREACH_BLOCK_END
}
FOREACH_BLOCK_END
}
void GSLocalMemory::ReadTexture16S(const GSOffset* RESTRICT o, const GSVector4i& r, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA)
{
__aligned(uint16, 32) block[16 * 8];
FOREACH_BLOCK_START(r, 16, 8, 32)
else
{
ReadBlock16<true>(src, (uint8*)block, sizeof(block) / 8);
ExpandBlock16(block, dst, dstpitch, TEXA);
FOREACH_BLOCK_START(r, 16, 8, 32)
{
ReadAndExpandBlock16<false>(src, dst, dstpitch, TEXA);
}
FOREACH_BLOCK_END
}
FOREACH_BLOCK_END
}
void GSLocalMemory::ReadTexture8(const GSOffset* RESTRICT o, const GSVector4i& r, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA)
@ -1629,61 +1679,6 @@ void GSLocalMemory::ReadTexture4HH(const GSOffset* RESTRICT o, const GSVector4i&
FOREACH_BLOCK_END
}
void GSLocalMemory::ReadTexture32Z(const GSOffset* RESTRICT o, const GSVector4i& r, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA)
{
FOREACH_BLOCK_START(r, 8, 8, 32)
{
ReadBlock32<true>(src, dst, dstpitch);
}
FOREACH_BLOCK_END
}
void GSLocalMemory::ReadTexture24Z(const GSOffset* RESTRICT o, const GSVector4i& r, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA)
{
if(TEXA.AEM)
{
FOREACH_BLOCK_START(r, 8, 8, 32)
{
ReadAndExpandBlock24<true>(src, dst, dstpitch, TEXA);
}
FOREACH_BLOCK_END
}
else
{
FOREACH_BLOCK_START(r, 8, 8, 32)
{
ReadAndExpandBlock24<false>(src, dst, dstpitch, TEXA);
}
FOREACH_BLOCK_END
}
}
void GSLocalMemory::ReadTexture16Z(const GSOffset* RESTRICT o, const GSVector4i& r, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA)
{
__aligned(uint16, 32) block[16 * 8];
FOREACH_BLOCK_START(r, 16, 8, 32)
{
ReadBlock16<true>(src, (uint8*)block, sizeof(block) / 8);
ExpandBlock16(block, dst, dstpitch, TEXA);
}
FOREACH_BLOCK_END
}
void GSLocalMemory::ReadTexture16SZ(const GSOffset* RESTRICT o, const GSVector4i& r, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA)
{
__aligned(uint16, 32) block[16 * 8];
FOREACH_BLOCK_START(r, 16, 8, 32)
{
ReadBlock16<true>(src, (uint8*)block, sizeof(block) / 8);
ExpandBlock16(block, dst, dstpitch, TEXA);
}
FOREACH_BLOCK_END
}
///////////////////
void GSLocalMemory::ReadTextureBlock32(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const
@ -1709,20 +1704,16 @@ void GSLocalMemory::ReadTextureBlock24(uint32 bp, uint8* dst, int dstpitch, cons
void GSLocalMemory::ReadTextureBlock16(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const
{
__aligned(uint16, 32) block[16 * 8];
ALIGN_STACK(32);
ReadBlock16<true>(BlockPtr(bp), (uint8*)block, sizeof(block) / 8);
ExpandBlock16(block, dst, dstpitch, TEXA);
}
void GSLocalMemory::ReadTextureBlock16S(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const
{
__aligned(uint16, 32) block[16 * 8];
ReadBlock16<true>(BlockPtr(bp), (uint8*)block, sizeof(block) / 8);
ExpandBlock16(block, dst, dstpitch, TEXA);
if(TEXA.AEM)
{
ReadAndExpandBlock16<true>(BlockPtr(bp), dst, dstpitch, TEXA);
}
else
{
ReadAndExpandBlock16<false>(BlockPtr(bp), dst, dstpitch, TEXA);
}
}
void GSLocalMemory::ReadTextureBlock8(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const
@ -1760,45 +1751,6 @@ void GSLocalMemory::ReadTextureBlock4HH(uint32 bp, uint8* dst, int dstpitch, con
ReadAndExpandBlock4HH_32(BlockPtr(bp), dst, dstpitch, m_clut);
}
void GSLocalMemory::ReadTextureBlock32Z(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const
{
ALIGN_STACK(32);
ReadBlock32<true>(BlockPtr(bp), dst, dstpitch);
}
void GSLocalMemory::ReadTextureBlock24Z(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const
{
ALIGN_STACK(32);
if(TEXA.AEM)
{
ReadAndExpandBlock24<true>(BlockPtr(bp), dst, dstpitch, TEXA);
}
else
{
ReadAndExpandBlock24<false>(BlockPtr(bp), dst, dstpitch, TEXA);
}
}
void GSLocalMemory::ReadTextureBlock16Z(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const
{
__aligned(uint16, 32) block[16 * 8];
ReadBlock16<true>(BlockPtr(bp), (uint8*)block, sizeof(block) / 8);
ExpandBlock16(block, dst, dstpitch, TEXA);
}
void GSLocalMemory::ReadTextureBlock16SZ(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const
{
__aligned(uint16, 32) block[16 * 8];
ReadBlock16<true>(BlockPtr(bp), (uint8*)block, sizeof(block) / 8);
ExpandBlock16(block, dst, dstpitch, TEXA);
}
///////////////////
void GSLocalMemory::ReadTexture(const GSOffset* RESTRICT o, const GSVector4i& r, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA)

View File

@ -56,6 +56,16 @@ public:
uint32* GetPages(const GSVector4i& rect, uint32* pages = NULL, GSVector4i* bbox = NULL);
};
// Offset table for one frame buffer / z buffer combination.
// GSLocalMemory::GetPixelOffset fills it in and caches it in m_pomap, keyed by "hash".
// In every GSVector2i entry, x is the frame buffer value and y is the z buffer value.
struct GSPixelOffset
{
// 16 bit offsets (m_vm16[...])
// row[y]: pixel address of (0, y) for each buffer, shifted left by (bpp >> 5)
GSVector2i row[2048]; // f yn | z yn
// col[x]: rowOffset[0][x] of each buffer's format, shifted left by (bpp >> 5)
GSVector2i col[2048]; // f xn | z xn
// cache key this table is stored under in m_pomap
uint32 hash;
// the parameters the table was built from: block addresses, pixel storage formats and FRAME.FBW
uint32 fbp, zbp, fpsm, zpsm, bw;
};
struct GSPixelOffset4
{
// 16 bit offsets (m_vm16[...])
@ -158,6 +168,7 @@ protected:
//
hash_map<uint32, GSOffset*> m_omap;
hash_map<uint32, GSPixelOffset*> m_pomap;
hash_map<uint32, GSPixelOffset4*> m_po4map;
hash_map<uint64, vector<GSVector2i>*> m_p2tmap;
@ -166,6 +177,7 @@ public:
virtual ~GSLocalMemory();
GSOffset* GetOffset(uint32 bp, uint32 bw, uint32 psm);
GSPixelOffset* GetPixelOffset(const GIFRegFRAME& FRAME, const GIFRegZBUF& ZBUF);
GSPixelOffset4* GetPixelOffset4(const GIFRegFRAME& FRAME, const GIFRegZBUF& ZBUF);
vector<GSVector2i>* GetPage2TileMap(const GIFRegTEX0& TEX0);
@ -863,32 +875,22 @@ public:
void ReadTexture32(const GSOffset* RESTRICT o, const GSVector4i& r, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA);
void ReadTexture24(const GSOffset* RESTRICT o, const GSVector4i& r, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA);
void ReadTexture16(const GSOffset* RESTRICT o, const GSVector4i& r, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA);
void ReadTexture16S(const GSOffset* RESTRICT o, const GSVector4i& r, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA);
void ReadTexture8(const GSOffset* RESTRICT o, const GSVector4i& r, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA);
void ReadTexture4(const GSOffset* RESTRICT o, const GSVector4i& r, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA);
void ReadTexture8H(const GSOffset* RESTRICT o, const GSVector4i& r, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA);
void ReadTexture4HL(const GSOffset* RESTRICT o, const GSVector4i& r, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA);
void ReadTexture4HH(const GSOffset* RESTRICT o, const GSVector4i& r, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA);
void ReadTexture32Z(const GSOffset* RESTRICT o, const GSVector4i& r, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA);
void ReadTexture24Z(const GSOffset* RESTRICT o, const GSVector4i& r, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA);
void ReadTexture16Z(const GSOffset* RESTRICT o, const GSVector4i& r, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA);
void ReadTexture16SZ(const GSOffset* RESTRICT o, const GSVector4i& r, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA);
void ReadTexture(const GSOffset* RESTRICT o, const GSVector4i& r, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA);
void ReadTextureBlock32(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const;
void ReadTextureBlock24(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const;
void ReadTextureBlock16(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const;
void ReadTextureBlock16S(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const;
void ReadTextureBlock8(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const;
void ReadTextureBlock4(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const;
void ReadTextureBlock8H(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const;
void ReadTextureBlock4HL(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const;
void ReadTextureBlock4HH(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const;
void ReadTextureBlock32Z(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const;
void ReadTextureBlock24Z(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const;
void ReadTextureBlock16Z(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const;
void ReadTextureBlock16SZ(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const;
// pal ? 8 : 32

View File

@ -35,7 +35,7 @@ public:
enum counter_t
{
Frame, Prim, Draw, Swizzle, Unswizzle, Fillrate, Quad,
Frame, Prim, Draw, Swizzle, Unswizzle, Fillrate, Quad, SyncPoint,
CounterLast,
};

View File

@ -30,6 +30,8 @@
#define THREAD_HEIGHT 4
int GSRasterizerData::s_counter = 0;
GSRasterizer::GSRasterizer(IDrawScanline* ds, int id, int threads, GSPerfMon* perfmon)
: m_ds(ds)
, m_id(id)
@ -40,7 +42,7 @@ GSRasterizer::GSRasterizer(IDrawScanline* ds, int id, int threads, GSPerfMon* pe
m_edge.buff = (GSVertexSW*)vmalloc(sizeof(GSVertexSW) * 2048, false);
m_edge.count = 0;
m_myscanline = (uint8*)_aligned_malloc((2048 >> THREAD_HEIGHT) + 16, 64);
m_scanline = (uint8*)_aligned_malloc((2048 >> THREAD_HEIGHT) + 16, 64);
int row = 0;
@ -48,14 +50,14 @@ GSRasterizer::GSRasterizer(IDrawScanline* ds, int id, int threads, GSPerfMon* pe
{
for(int i = 0; i < threads; i++, row++)
{
m_myscanline[row] = i == id ? 1 : 0;
m_scanline[row] = i == id ? 1 : 0;
}
}
}
GSRasterizer::~GSRasterizer()
{
_aligned_free(m_myscanline);
_aligned_free(m_scanline);
if(m_edge.buff != NULL) vmfree(m_edge.buff, sizeof(GSVertexSW) * 2048);
@ -66,7 +68,7 @@ bool GSRasterizer::IsOneOfMyScanlines(int top) const
{
ASSERT(top >= 0 && top < 2048);
return m_myscanline[top >> THREAD_HEIGHT] != 0;
return m_scanline[top >> THREAD_HEIGHT] != 0;
}
bool GSRasterizer::IsOneOfMyScanlines(int top, int bottom) const
@ -78,7 +80,7 @@ bool GSRasterizer::IsOneOfMyScanlines(int top, int bottom) const
while(top < bottom)
{
if(m_myscanline[top++])
if(m_scanline[top++])
{
return true;
}
@ -91,9 +93,9 @@ int GSRasterizer::FindMyNextScanline(int top) const
{
int i = top >> THREAD_HEIGHT;
if(m_myscanline[i] == 0)
if(m_scanline[i] == 0)
{
while(m_myscanline[++i] == 0);
while(m_scanline[++i] == 0);
top = i << THREAD_HEIGHT;
}
@ -124,6 +126,8 @@ void GSRasterizer::Draw(GSRasterizerData* data)
if(data->vertex != NULL && data->vertex_count == 0 || data->index != NULL && data->index_count == 0) return;
data->start = __rdtsc();
m_ds->BeginDraw(data);
const GSVertexSW* vertex = data->vertex;
@ -140,8 +144,6 @@ void GSRasterizer::Draw(GSRasterizerData* data)
m_fscissor_x = GSVector4(data->scissor).xzxz();
m_fscissor_y = GSVector4(data->scissor).ywyw();
uint64 start = __rdtsc();
switch(data->primclass)
{
case GS_POINT_CLASS:
@ -206,7 +208,9 @@ void GSRasterizer::Draw(GSRasterizerData* data)
__assume(0);
}
uint64 ticks = __rdtsc() - start;
data->pixels = m_pixels;
uint64 ticks = __rdtsc() - data->start;
m_ds->EndDraw(data->frame, ticks, m_pixels);
}
@ -444,28 +448,18 @@ void GSRasterizer::DrawTriangle(const GSVertexSW* vertex, const uint32* index)
GSVector4 dxy01c = dxy01 * cross;
GSVector4 _z = dxy01c * dv[1].p.zzzz(dv[0].p); // dx0 * z1, dy0 * z1, dx1 * z0, dy1 * z0
GSVector4 _f = dxy01c * dv[1].p.wwww(dv[0].p); // dx0 * f1, dy0 * f1, dx1 * f0, dy1 * f0
/*
dscan = dv[1] * dxy01c.yyyy() - dv[0] * dxy01c.wwww();
dedge = dv[0] * dxy01c.zzzz() - dv[1] * dxy01c.xxxx();
*/
GSVector4 _zf = _z.ywyw(_f).hsub(_z.zxzx(_f)); // dy0 * z1 - dy1 * z0, dy0 * f1 - dy1 * f0, dx1 * z0 - dx0 * z1, dx1 * f0 - dx0 * f1
dscan.p = dv[1].p * dxy01c.yyyy() - dv[0].p * dxy01c.wwww();
dscan.t = dv[1].t * dxy01c.yyyy() - dv[0].t * dxy01c.wwww();
dscan.c = dv[1].c * dxy01c.yyyy() - dv[0].c * dxy01c.wwww();
dscan.p = _zf.zwxy(); // dy0 * z1 - dy1 * z0, dy0 * f1 - dy1 * f0
dedge.p = _zf; // dx1 * z0 - dx0 * z1, dx1 * f0 - dx0 * f1
GSVector4 _s = dxy01c * dv[1].t.xxxx(dv[0].t); // dx0 * s1, dy0 * s1, dx1 * s0, dy1 * s0
GSVector4 _t = dxy01c * dv[1].t.yyyy(dv[0].t); // dx0 * t1, dy0 * t1, dx1 * t0, dy1 * t0
GSVector4 _q = dxy01c * dv[1].t.zzzz(dv[0].t); // dx0 * q1, dy0 * q1, dx1 * q0, dy1 * q0
dscan.t = _s.ywyw(_t).hsub(_q.ywyw()); // dy0 * s1 - dy1 * s0, dy0 * t1 - dy1 * t0, dy0 * q1 - dy1 * q0
dedge.t = _s.zxzx(_t).hsub(_q.zxzx()); // dx1 * s0 - dx0 * s1, dx1 * t0 - dx0 * t1, dx1 * q0 - dx0 * q1
GSVector4 _r = dxy01c * dv[1].c.xxxx(dv[0].c); // dx0 * r1, dy0 * r1, dx1 * r0, dy1 * r0
GSVector4 _g = dxy01c * dv[1].c.yyyy(dv[0].c); // dx0 * g1, dy0 * g1, dx1 * g0, dy1 * g0
GSVector4 _b = dxy01c * dv[1].c.zzzz(dv[0].c); // dx0 * b1, dy0 * b1, dx1 * b0, dy1 * b0
GSVector4 _a = dxy01c * dv[1].c.wwww(dv[0].c); // dx0 * a1, dy0 * a1, dx1 * a0, dy1 * a0
dscan.c = _r.ywyw(_g).hsub(_b.ywyw(_a)); // dy0 * r1 - dy1 * r0, dy0 * g1 - dy1 * g0, dy0 * b1 - dy1 * b0, dy0 * a1 - dy1 * a0
dedge.c = _r.zxzx(_g).hsub(_b.zxzx(_a)); // dx1 * r0 - dx0 * r1, dx1 * g0 - dx0 * g1, dx1 * b0 - dx0 * b1, dx1 * a0 - dx0 * a1
dedge.p = dv[0].p * dxy01c.zzzz() - dv[1].p * dxy01c.xxxx();
dedge.t = dv[0].t * dxy01c.zzzz() - dv[1].t * dxy01c.xxxx();
dedge.c = dv[0].c * dxy01c.zzzz() - dv[1].c * dxy01c.xxxx();
if(m1 & 1)
{
@ -555,7 +549,13 @@ void GSRasterizer::DrawTriangleSection(int top, int bottom, GSVertexSW& edge, co
scan.t = edge.t + dedge.t * dy;
scan.c = edge.c + dedge.c * dy;
AddScanline(e++, pixels, left, top, scan + dscan * (l - p0).xxxx());
GSVector4 prestep = (l - p0).xxxx();
scan.p += dscan.p * prestep;
scan.t += dscan.t * prestep;
scan.c += dscan.c * prestep;
AddScanline(e++, pixels, left, top, scan);
}
top++;
@ -904,11 +904,20 @@ void GSRasterizer::Flush(const GSVertexSW* vertex, const uint32* index, const GS
//
GSRasterizerList::GSRasterizerList()
: GSJobQueue<shared_ptr<GSRasterizerData> >()
, m_sync_count(0)
, m_syncpoint_count(0)
GSRasterizerList::GSRasterizerList(int threads, GSPerfMon* perfmon)
: m_perfmon(perfmon)
{
m_scanline = (uint8*)_aligned_malloc((2048 >> THREAD_HEIGHT) + 16, 64);
int row = 0;
while(row < (2048 >> THREAD_HEIGHT))
{
for(int i = 0; i < threads; i++, row++)
{
m_scanline[row] = i;
}
}
}
GSRasterizerList::~GSRasterizerList()
@ -917,31 +926,49 @@ GSRasterizerList::~GSRasterizerList()
{
delete *i;
}
_aligned_free(m_scanline);
}
void GSRasterizerList::Queue(shared_ptr<GSRasterizerData> data)
{
// disable dispatcher thread for now and pass-through directly,
// would only be relevant if data->syncpoint was utilized more,
// it would hide the syncing latency from the main gs thread
GSVector4i r = data->bbox.rintersect(data->scissor);
// Push(data);
ASSERT(r.top >= 0 && r.top < 2048 && r.bottom >= 0 && r.bottom < 2048);
Process(data); m_count++;
int top = r.top >> THREAD_HEIGHT;
int bottom = std::min<int>((r.bottom + (1 << THREAD_HEIGHT) - 1) >> THREAD_HEIGHT, top + m_workers.size());
while(top < bottom)
{
m_workers[m_scanline[top++]]->Push(data);
}
}
void GSRasterizerList::Sync()
{
if(GetCount() == 0) return;
if(!IsSynced())
{
for(size_t i = 0; i < m_workers.size(); i++)
{
m_workers[i]->Wait();
}
Wait(); // first dispatch all items to workers
m_perfmon->Put(GSPerfMon::SyncPoint, 1);
}
}
bool GSRasterizerList::IsSynced() const
{
for(size_t i = 0; i < m_workers.size(); i++)
{
m_workers[i]->Wait(); // then wait all workers to finish their jobs
if(!m_workers[i]->IsEmpty())
{
return false;
}
}
m_sync_count++;
return true;
}
int GSRasterizerList::GetPixels(bool reset)
@ -956,24 +983,6 @@ int GSRasterizerList::GetPixels(bool reset)
return pixels;
}
void GSRasterizerList::Process(shared_ptr<GSRasterizerData>& item)
{
if(item->syncpoint)
{
for(size_t i = 0; i < m_workers.size(); i++)
{
m_workers[i]->Wait();
}
m_syncpoint_count++;
}
for(size_t i = 0; i < m_workers.size(); i++)
{
m_workers[i]->Push(item);
}
}
// GSRasterizerList::GSWorker
GSRasterizerList::GSWorker::GSWorker(GSRasterizer* r)
@ -994,16 +1003,6 @@ int GSRasterizerList::GSWorker::GetPixels(bool reset)
return m_r->GetPixels(reset);
}
void GSRasterizerList::GSWorker::Push(const shared_ptr<GSRasterizerData>& item)
{
GSVector4i r = item->bbox.rintersect(item->scissor);
if(m_r->IsOneOfMyScanlines(r.top, r.bottom))
{
GSJobQueue<shared_ptr<GSRasterizerData> >::Push(item);
}
}
void GSRasterizerList::GSWorker::Process(shared_ptr<GSRasterizerData>& item)
{
m_r->Draw(item.get());

View File

@ -30,6 +30,8 @@
__aligned(class, 32) GSRasterizerData : public GSAlignedClass<32>
{
static int s_counter;
public:
GSVector4i scissor;
GSVector4i bbox;
@ -39,8 +41,10 @@ public:
int vertex_count;
uint32* index;
int index_count;
bool syncpoint;
uint64 frame;
uint64 start;
int pixels;
int counter;
GSRasterizerData()
: scissor(GSVector4i::zero())
@ -51,9 +55,11 @@ public:
, vertex_count(0)
, index(NULL)
, index_count(0)
, syncpoint(false)
, frame(0)
, start(0)
, pixels(0)
{
counter = s_counter++;
}
virtual ~GSRasterizerData()
@ -109,6 +115,7 @@ public:
virtual void Queue(shared_ptr<GSRasterizerData> data) = 0;
virtual void Sync() = 0;
virtual bool IsSynced() const = 0;
virtual int GetPixels(bool reset = true) = 0;
};
@ -119,7 +126,7 @@ protected:
IDrawScanline* m_ds;
int m_id;
int m_threads;
uint8* m_myscanline;
uint8* m_scanline;
GSVector4i m_scissor;
GSVector4 m_fscissor_x;
GSVector4 m_fscissor_y;
@ -155,12 +162,12 @@ public:
void Queue(shared_ptr<GSRasterizerData> data);
void Sync() {}
bool IsSynced() const {return true;}
int GetPixels(bool reset);
};
class GSRasterizerList
: public IRasterizer
, private GSJobQueue<shared_ptr<GSRasterizerData> >
{
protected:
class GSWorker : public GSJobQueue<shared_ptr<GSRasterizerData> >
@ -175,17 +182,14 @@ protected:
// GSJobQueue
void Push(const shared_ptr<GSRasterizerData>& item);
void Process(shared_ptr<GSRasterizerData>& item);
};
GSPerfMon* m_perfmon;
vector<GSWorker*> m_workers;
uint8* m_scanline;
GSRasterizerList();
// GSJobQueue
void Process(shared_ptr<GSRasterizerData>& item);
GSRasterizerList(int threads, GSPerfMon* perfmon);
public:
virtual ~GSRasterizerList();
@ -200,7 +204,7 @@ public:
}
else
{
GSRasterizerList* rl = new GSRasterizerList();
GSRasterizerList* rl = new GSRasterizerList(threads, perfmon);
for(int i = 0; i < threads; i++)
{
@ -211,12 +215,10 @@ public:
}
}
int m_sync_count;
int m_syncpoint_count;
// IRasterizer
void Queue(shared_ptr<GSRasterizerData> data);
void Sync();
bool IsSynced() const;
int GetPixels(bool reset);
};

View File

@ -22,9 +22,8 @@
#include "stdafx.h"
#include "GSRenderer.h"
GSRenderer::GSRenderer(GSVertexTrace* vt, size_t vertex_stride)
: GSState(vt, vertex_stride)
, m_dev(NULL)
GSRenderer::GSRenderer()
: m_dev(NULL)
, m_shader(0)
, m_shift_key(false)
, m_control_key(false)
@ -38,12 +37,6 @@ GSRenderer::GSRenderer(GSVertexTrace* vt, size_t vertex_stride)
m_aa1 = !!theApp.GetConfig("aa1", 0);
m_mipmap = !!theApp.GetConfig("mipmap", 1);
m_fxaa = !!theApp.GetConfig("fxaa", 0);
s_n = 0;
s_dump = !!theApp.GetConfig("dump", 0);
s_save = !!theApp.GetConfig("save", 0);
s_savez = !!theApp.GetConfig("savez", 0);
s_saven = theApp.GetConfig("saven", 0);
}
GSRenderer::~GSRenderer()
@ -259,7 +252,7 @@ bool GSRenderer::Merge(int field)
{
int field2 = 1 - ((m_interlace - 1) & 1);
int mode = (m_interlace - 1) >> 1;
m_dev->Interlace(ds, field ^ field2, mode, tex[1] ? tex[1]->GetScale().y : tex[0]->GetScale().y);
}
@ -306,6 +299,8 @@ void GSRenderer::VSync(int field)
ResetDevice();
}
m_dev->AgePool();
// osd
if((m_perfmon.GetFrame() & 0x1f) == 0)
@ -334,7 +329,7 @@ void GSRenderer::VSync(int field)
s2.c_str(),
theApp.m_gs_interlace[m_interlace].name.c_str(),
theApp.m_gs_aspectratio[m_aspectratio].name.c_str(),
(int)m_perfmon.Get(GSPerfMon::Quad),
(int)m_perfmon.Get(GSPerfMon::SyncPoint),
(int)m_perfmon.Get(GSPerfMon::Prim),
(int)m_perfmon.Get(GSPerfMon::Draw),
m_perfmon.CPU(),

View File

@ -55,14 +55,8 @@ public:
GSWnd m_wnd;
GSDevice* m_dev;
int s_n;
bool s_dump;
bool s_save;
bool s_savez;
int s_saven;
public:
GSRenderer(GSVertexTrace* vt, size_t vertex_stride);
GSRenderer();
virtual ~GSRenderer();
virtual bool CreateWnd(const string& title, int w, int h);

View File

@ -22,18 +22,28 @@
#include "stdafx.h"
#include "GSRendererCS.h"
#define PS_BATCH_SIZE 512
GSRendererCS::GSRendererCS()
: GSRenderer(new GSVertexTraceCS(this), sizeof(GSVertex))
: GSRenderer()
{
m_nativeres = true;
InitConvertVertex(GSRendererCS);
memset(m_vm_valid, 0, sizeof(m_vm_valid));
memset(m_texture, 0, sizeof(m_texture));
m_output = (uint8*)_aligned_malloc(1024 * 1024 * sizeof(uint32), 32);
}
GSRendererCS::~GSRendererCS()
{
for(int i = 0; i < countof(m_texture); i++)
{
delete m_texture[i];
}
_aligned_free(m_output);
}
bool GSRendererCS::CreateDevice(GSDevice* dev_unk)
@ -41,27 +51,157 @@ bool GSRendererCS::CreateDevice(GSDevice* dev_unk)
if(!__super::CreateDevice(dev_unk))
return false;
HRESULT hr;
D3D11_DEPTH_STENCIL_DESC dsd;
D3D11_BLEND_DESC bsd;
D3D11_SAMPLER_DESC sd;
D3D11_BUFFER_DESC bd;
D3D11_TEXTURE2D_DESC td;
D3D11_UNORDERED_ACCESS_VIEW_DESC uavd;
D3D11_SHADER_RESOURCE_VIEW_DESC srvd;
D3D_FEATURE_LEVEL level;
((GSDeviceDX*)dev_unk)->GetFeatureLevel(level);
if(level < D3D_FEATURE_LEVEL_10_0)
if(level < D3D_FEATURE_LEVEL_11_0)
return false;
HRESULT hr;
GSDevice11* dev = (GSDevice11*)dev_unk;
D3D11_BUFFER_DESC bd;
D3D11_UNORDERED_ACCESS_VIEW_DESC uavd;
D3D11_SHADER_RESOURCE_VIEW_DESC srvd;
ID3D11DeviceContext* ctx = *dev;
// empty depth stencil state
memset(&dsd, 0, sizeof(dsd));
dsd.StencilEnable = false;
dsd.DepthEnable = false;
hr = (*dev)->CreateDepthStencilState(&dsd, &m_dss);
if(FAILED(hr)) return false;
// empty blend state
memset(&bsd, 0, sizeof(bsd));
bsd.RenderTarget[0].BlendEnable = false;
hr = (*dev)->CreateBlendState(&bsd, &m_bs);
if(FAILED(hr)) return false;
// point sampler
memset(&sd, 0, sizeof(sd));
sd.Filter = D3D11_FILTER_MIN_MAG_MIP_POINT;
sd.AddressU = D3D11_TEXTURE_ADDRESS_CLAMP;
sd.AddressV = D3D11_TEXTURE_ADDRESS_CLAMP;
sd.AddressW = D3D11_TEXTURE_ADDRESS_CLAMP;
sd.MaxLOD = FLT_MAX;
sd.MaxAnisotropy = 16;
sd.ComparisonFunc = D3D11_COMPARISON_NEVER;
hr = (*dev)->CreateSamplerState(&sd, &m_ss);
if(FAILED(hr)) return false;
// link buffer
memset(&bd, 0, sizeof(bd));
bd.ByteWidth = 256 << 20; // 256 MB w00t
bd.StructureByteStride = sizeof(uint32) * 4; // c, z, id, next
bd.Usage = D3D11_USAGE_DEFAULT;
bd.BindFlags = D3D11_BIND_UNORDERED_ACCESS | D3D11_BIND_SHADER_RESOURCE;
bd.MiscFlags = D3D11_RESOURCE_MISC_BUFFER_STRUCTURED;
hr = (*dev)->CreateBuffer(&bd, NULL, &m_lb);
{
uint32 data[] = {0, 0, 0xffffffff, 0};
D3D11_BOX box;
memset(&box, 0, sizeof(box));
box.right = sizeof(data);
box.bottom = 1;
box.back = 1;
ctx->UpdateSubresource(m_lb, 0, &box, data, 0, 0);
}
if(FAILED(hr)) return false;
memset(&uavd, 0, sizeof(uavd));
uavd.Format = DXGI_FORMAT_UNKNOWN;
uavd.Buffer.NumElements = bd.ByteWidth / bd.StructureByteStride;
uavd.Buffer.Flags = D3D11_BUFFER_UAV_FLAG_COUNTER;
uavd.ViewDimension = D3D11_UAV_DIMENSION_BUFFER;
hr = (*dev)->CreateUnorderedAccessView(m_lb, &uavd, &m_lb_uav);
if(FAILED(hr)) return false;
memset(&srvd, 0, sizeof(srvd));
srvd.Format = DXGI_FORMAT_UNKNOWN;
srvd.Buffer.NumElements = bd.ByteWidth / bd.StructureByteStride;
srvd.ViewDimension = D3D11_SRV_DIMENSION_BUFFER;
hr = (*dev)->CreateShaderResourceView(m_lb, &srvd, &m_lb_srv);
if(FAILED(hr)) return false;
// start offset buffer
memset(&bd, 0, sizeof(bd));
bd.ByteWidth = sizeof(uint32) * 2048 * 2048; // index
bd.Usage = D3D11_USAGE_DEFAULT;
bd.BindFlags = D3D11_BIND_UNORDERED_ACCESS | D3D11_BIND_SHADER_RESOURCE;
bd.MiscFlags = D3D11_RESOURCE_MISC_BUFFER_ALLOW_RAW_VIEWS;
hr = (*dev)->CreateBuffer(&bd, NULL, &m_sob);
if(FAILED(hr)) return false;
memset(&uavd, 0, sizeof(uavd));
uavd.Format = DXGI_FORMAT_R32_TYPELESS;
uavd.Buffer.NumElements = bd.ByteWidth / sizeof(uint32);
uavd.Buffer.Flags = D3D11_BUFFER_UAV_FLAG_RAW;
uavd.ViewDimension = D3D11_UAV_DIMENSION_BUFFER;
hr = (*dev)->CreateUnorderedAccessView(m_sob, &uavd, &m_sob_uav);
if(FAILED(hr)) return false;
memset(&srvd, 0, sizeof(srvd));
srvd.Format = DXGI_FORMAT_R32_TYPELESS;
srvd.BufferEx.NumElements = bd.ByteWidth / sizeof(uint32);
srvd.BufferEx.Flags = D3D11_BUFFEREX_SRV_FLAG_RAW;
srvd.ViewDimension = D3D11_SRV_DIMENSION_BUFFEREX;
hr = (*dev)->CreateShaderResourceView(m_sob, &srvd, &m_sob_srv);
if(FAILED(hr)) return false;
const uint32 tmp = 0;
ctx->ClearUnorderedAccessViewUint(m_sob_uav, &tmp); // initial clear, next time Draw should restore it in Step 2
// video memory (4MB)
memset(&bd, 0, sizeof(bd));
bd.ByteWidth = 4 * 1024 * 1024;
bd.StructureByteStride = 4;
bd.Usage = D3D11_USAGE_DEFAULT;
bd.BindFlags = D3D11_BIND_UNORDERED_ACCESS;
bd.MiscFlags = D3D11_RESOURCE_MISC_BUFFER_ALLOW_RAW_VIEWS;
@ -81,35 +221,32 @@ bool GSRendererCS::CreateDevice(GSDevice* dev_unk)
hr = (*dev)->CreateUnorderedAccessView(m_vm, &uavd, &m_vm_uav);
if(FAILED(hr)) return false;
/*
memset(&td, 0, sizeof(td));
// vertex buffer
td.Width = PAGE_SIZE;
td.Height = MAX_PAGES;
td.Format = DXGI_FORMAT_R8_UINT;
td.MipLevels = 1;
td.ArraySize = 1;
td.SampleDesc.Count = 1;
td.SampleDesc.Quality = 0;
td.Usage = D3D11_USAGE_DEFAULT;
td.BindFlags = D3D11_BIND_UNORDERED_ACCESS;
memset(&bd, 0, sizeof(bd));
bd.ByteWidth = sizeof(GSVertex) * 10000;
bd.StructureByteStride = sizeof(GSVertex);
bd.Usage = D3D11_USAGE_DYNAMIC;
bd.CPUAccessFlags = D3D11_CPU_ACCESS_WRITE;
bd.BindFlags = D3D11_BIND_SHADER_RESOURCE;
bd.MiscFlags = D3D11_RESOURCE_MISC_BUFFER_STRUCTURED;
hr = (*dev)->CreateBuffer(&bd, NULL, &m_vb);
hr = (*dev)->CreateTexture2D(&td, NULL, &m_vm);
if(FAILED(hr)) return false;
// index buffer
memset(&uavd, 0, sizeof(uavd));
memset(&bd, 0, sizeof(bd));
uavd.Format = DXGI_FORMAT_R8_UINT;
uavd.ViewDimension = D3D11_UAV_DIMENSION_TEXTURE2D;
bd.ByteWidth = sizeof(uint32) * 10000 * 3;
bd.Usage = D3D11_USAGE_DYNAMIC;
bd.CPUAccessFlags = D3D11_CPU_ACCESS_WRITE;
bd.BindFlags = D3D11_BIND_SHADER_RESOURCE;
hr = (*dev)->CreateBuffer(&bd, NULL, &m_ib);
hr = (*dev)->CreateUnorderedAccessView(m_vm, &uavd, &m_vm_uav);
if(FAILED(hr)) return false;
*/
// one page, for copying between cpu<->gpu
memset(&bd, 0, sizeof(bd));
@ -121,219 +258,429 @@ bool GSRendererCS::CreateDevice(GSDevice* dev_unk)
hr = (*dev)->CreateBuffer(&bd, NULL, &m_pb);
if(FAILED(hr)) return false;
/*
memset(&td, 0, sizeof(td));
td.Width = PAGE_SIZE;
td.Height = 1;
td.Format = DXGI_FORMAT_R8_UINT;
td.MipLevels = 1;
td.ArraySize = 1;
td.SampleDesc.Count = 1;
td.SampleDesc.Quality = 0;
td.Usage = D3D11_USAGE_STAGING;
td.CPUAccessFlags = D3D11_CPU_ACCESS_READ | D3D11_CPU_ACCESS_WRITE;
hr = (*dev)->CreateTexture2D(&td, NULL, &m_pb);
if(FAILED(hr)) return false;
*/
// VSConstantBuffer
memset(&bd, 0, sizeof(bd));
bd.ByteWidth = sizeof(VSConstantBuffer);
bd.Usage = D3D11_USAGE_DEFAULT;
bd.BindFlags = D3D11_BIND_CONSTANT_BUFFER;
hr = (*dev)->CreateBuffer(&bd, NULL, &m_vs_cb);
if(FAILED(hr)) return false;
// PS
D3D11_SHADER_MACRO macro[] =
{
{NULL, NULL},
};
hr = dev->CompileShader(IDR_CS_FX, "ps_main0", macro, &m_ps0);
if(FAILED(hr)) return false;
// PSConstantBuffer
memset(&bd, 0, sizeof(bd));
bd.ByteWidth = sizeof(PSConstantBuffer);
bd.Usage = D3D11_USAGE_DEFAULT;
bd.BindFlags = D3D11_BIND_CONSTANT_BUFFER;
hr = (*dev)->CreateBuffer(&bd, NULL, &m_ps_cb);
if(FAILED(hr)) return false;
//
return true;
}
void GSRendererCS::ResetDevice()
{
for(int i = 0; i < countof(m_texture); i++)
{
delete m_texture[i];
m_texture[i] = NULL;
}
}
void GSRendererCS::VSync(int field)
{
__super::VSync(field);
//printf("%lld\n", m_perfmon.GetFrame());
}
GSTexture* GSRendererCS::GetOutput(int i)
{
// TODO: create a compute shader which unswizzles the frame from m_vm to the output texture
return NULL;
}
const GSRegDISPFB& DISPFB = m_regs->DISP[i].DISPFB;
template<uint32 prim, uint32 tme, uint32 fst>
void GSRendererCS::ConvertVertex(size_t dst_index, size_t src_index)
{
// TODO: vertex format more fitting as the input for the compute shader
int w = DISPFB.FBW * 64;
int h = GetFrameRect(i).bottom;
if(src_index != dst_index)
// TODO: round up bottom
if(m_dev->ResizeTexture(&m_texture[i], w, h))
{
GSVertex v = ((GSVertex*)m_vertex.buff)[src_index];
const GSLocalMemory::psm_t& psm = GSLocalMemory::m_psm[DISPFB.PSM];
((GSVertex*)m_vertex.buff)[dst_index] = v;
GSVector4i r(0, 0, w, h);
GSVector4i r2 = r.ralign<Align_Outside>(psm.bs);
GSOffset* o = m_mem.GetOffset(DISPFB.Block(), DISPFB.FBW, DISPFB.PSM);
Read(o, r2, false);
(m_mem.*psm.rtx)(o, r2, m_output, 1024 * 4, m_env.TEXA);
m_texture[i]->Update(r, m_output, 1024 * 4);
if(s_dump)
{
if(s_save && s_n >= s_saven)
{
m_texture[i]->Save(format("c:\\temp1\\_%05d_f%lld_fr%d_%05x_%d.bmp", s_n, m_perfmon.GetFrame(), i, (int)DISPFB.Block(), (int)DISPFB.PSM));
}
s_n++;
}
}
return m_texture[i];
}
void GSRendererCS::Draw()
{
HRESULT hr;
GSDrawingEnvironment& env = m_env;
GSDrawingContext* context = m_context;
GSVector2i rtsize(2048, 2048);
GSVector4i scissor = GSVector4i(context->scissor.in).rintersect(GSVector4i(rtsize).zwxy());
GSVector4i bbox = GSVector4i(m_vt.m_min.p.floor().xyxy(m_vt.m_max.p.ceil()));
GSVector4i r = bbox.rintersect(scissor);
uint32 fm = context->FRAME.FBMSK;
uint32 zm = context->ZBUF.ZMSK || context->TEST.ZTE == 0 ? 0xffffffff : 0;
if(fm != 0xffffffff)
{
Write(context->offset.fb, r);
// TODO: m_tc->InvalidateVideoMem(context->offset.fb, r, false);
}
if(zm != 0xffffffff)
{
Write(context->offset.zb, r);
// TODO: m_tc->InvalidateVideoMem(context->offset.zb, r, false);
}
// TODO: if(24-bit) fm/zm |= 0xff000000;
if(PRIM->TME)
{
m_mem.m_clut.Read32(context->TEX0, env.TEXA);
GSVector4i r;
GetTextureMinMax(r, context->TEX0, context->CLAMP, m_vt.IsLinear());
// TODO: unswizzle pages of r to a texture, check m_vm_valid, bit not set cpu->gpu, set gpu->gpu
// TODO: Write transfer should directly write to m_vm, then Read/Write syncing won't be necessary, clut must be updated with the gpu also
// TODO: tex = m_tc->LookupSource(context->TEX0, env.TEXA, r);
// if(!tex) return;
}
//
GSDevice11* dev = (GSDevice11*)m_dev;
ID3D11DeviceContext* ctx = *dev;
D3D11_BUFFER_DESC bd;
D3D11_UNORDERED_ACCESS_VIEW_DESC uavd;
D3D11_SHADER_RESOURCE_VIEW_DESC srvd;
D3D11_MAPPED_SUBRESOURCE map;
CComPtr<ID3D11ShaderResourceView> vb_srv;
CComPtr<ID3D11ShaderResourceView> ib_srv;
// TODO: cache these in hash_maps
CComPtr<ID3D11Buffer> fbr, fbc, zbr, zbc;
CComPtr<ID3D11ShaderResourceView> fbr_srv, fbc_srv, zbr_srv, zbc_srv;
// TODO: grow m_vb, m_ib if needed
if(m_vertex.next > 10000) return;
if(m_index.tail > 30000) return;
// TODO: fill/advance/discardwhenfull, as in GSDevice11::IASetVertexBuffer/IASetIndexBuffer
hr = ctx->Map(m_vb, 0, D3D11_MAP_WRITE_DISCARD, 0, &map); // discarding, until properly advancing the start pointer around
if(FAILED(hr)) return;
memcpy(map.pData, m_vertex.buff, sizeof(GSVertex) * m_vertex.next);
ctx->Unmap(m_vb, 0);
//
hr = ctx->Map(m_ib, 0, D3D11_MAP_WRITE_DISCARD, 0, &map); // discarding, until properly advancing the start pointer around
dev->BeginScene();
if(FAILED(hr)) return;
// SetupOM
memcpy(map.pData, m_index.buff, sizeof(uint32) * m_index.tail);
ctx->Unmap(m_ib, 0);
// TODO: UpdateResource might be faster, based on my experience with the real vertex buffer, write-no-overwrite/discarded dynamic buffer + map is better
//
memset(&srvd, 0, sizeof(srvd));
srvd.Format = DXGI_FORMAT_UNKNOWN;
srvd.ViewDimension = D3D11_SRV_DIMENSION_BUFFER;
srvd.Buffer.FirstElement = 0;
srvd.Buffer.NumElements = m_vertex.next;
hr = (*dev)->CreateShaderResourceView(m_vb, &srvd, &vb_srv); // TODO: have to create this dynamically in Draw() or pass the start/count in a const reg
memset(&srvd, 0, sizeof(srvd));
srvd.Format = DXGI_FORMAT_R32_UINT;
srvd.ViewDimension = D3D11_SRV_DIMENSION_BUFFER;
srvd.Buffer.FirstElement = 0;
srvd.Buffer.NumElements = m_index.tail;
hr = (*dev)->CreateShaderResourceView(m_ib, &srvd, &ib_srv); // TODO: have to create this dynamically in Draw() or pass the start/count in a const reg
// fzb offsets
memset(&bd, 0, sizeof(bd));
bd.ByteWidth = sizeof(int) * 4096;
bd.StructureByteStride = sizeof(int);
bd.Usage = D3D11_USAGE_IMMUTABLE;
bd.BindFlags = D3D11_BIND_SHADER_RESOURCE;
D3D11_SUBRESOURCE_DATA data;
memset(&data, 0, sizeof(data));
data.pSysMem = m_context->offset.fb->pixel.row;
hr = (*dev)->CreateBuffer(&bd, &data, &fbr);
data.pSysMem = m_context->offset.fb->pixel.col[0]; // same column layout for every line in case of frame and zbuffer formats
dev->OMSetDepthStencilState(m_dss, 0);
dev->OMSetBlendState(m_bs, 0);
hr = (*dev)->CreateBuffer(&bd, &data, &fbc);
ID3D11UnorderedAccessView* uavs[] = {m_vm_uav, m_lb_uav, m_sob_uav};
uint32 counters[] = {1, 0, 0};
data.pSysMem = m_context->offset.zb->pixel.row;
hr = (*dev)->CreateBuffer(&bd, &data, &zbr);
dev->OMSetRenderTargets(rtsize, countof(uavs), uavs, counters, &scissor);
data.pSysMem = m_context->offset.zb->pixel.col[0]; // same column layout for every line in case of frame and zbuffer formats
hr = (*dev)->CreateBuffer(&bd, &data, &zbc);
// SetupIA
// TODO: D3D10_SHADER_MACRO (primclass, less frequently changing drawing attribs, etc.)
D3D11_PRIMITIVE_TOPOLOGY topology;
uint32 sel = 0; // TODO
hash_map<uint32, CComPtr<ID3D11ComputeShader> >::iterator i = m_cs.find(sel);
CComPtr<ID3D11ComputeShader> cs;
if(i == m_cs.end())
switch(m_vt.m_primclass)
{
// hr = dev->CompileShader(IDR_CS_FX, "cs_main", NULL, &cs);
hr = dev->CompileShader("E:\\Progs\\pcsx2\\plugins\\GSdx\\res\\cs.fx", "cs_main", NULL, &cs);
case GS_POINT_CLASS:
topology = D3D11_PRIMITIVE_TOPOLOGY_POINTLIST;
break;
case GS_LINE_CLASS:
case GS_SPRITE_CLASS:
topology = D3D11_PRIMITIVE_TOPOLOGY_LINELIST;
break;
case GS_TRIANGLE_CLASS:
topology = D3D11_PRIMITIVE_TOPOLOGY_TRIANGLELIST;
break;
default:
__assume(0);
}
if(FAILED(hr)) return;
GSVector4i r2 = bbox.add32(GSVector4i(-1, -1, 1, 1)).rintersect(scissor);
m_cs[sel] = cs;
m_vertex.buff[m_vertex.next + 0].XYZ.X = context->XYOFFSET.OFX + (r2.left << 4);
m_vertex.buff[m_vertex.next + 0].XYZ.Y = context->XYOFFSET.OFY + (r2.top << 4);
m_vertex.buff[m_vertex.next + 1].XYZ.X = context->XYOFFSET.OFX + (r2.right << 4);
m_vertex.buff[m_vertex.next + 1].XYZ.Y = context->XYOFFSET.OFY + (r2.bottom << 4);
m_index.buff[m_index.tail + 0] = m_vertex.next + 0;
m_index.buff[m_index.tail + 1] = m_vertex.next + 1;
dev->IASetVertexBuffer(m_vertex.buff, sizeof(GSVertex), m_vertex.next + 2);
dev->IASetIndexBuffer(m_index.buff, m_index.tail + 2);
// SetupVS
VSSelector vs_sel;
vs_sel.tme = PRIM->TME;
vs_sel.fst = PRIM->FST;
VSConstantBuffer vs_cb;
float sx = 2.0f / (rtsize.x << 4);
float sy = 2.0f / (rtsize.y << 4);
//float sx = 1.0f / 16;
//float sy = 1.0f / 16;
float ox = (float)(int)context->XYOFFSET.OFX;
float oy = (float)(int)context->XYOFFSET.OFY;
vs_cb.VertexScale = GSVector4(sx, -sy, 0.0f, 0.0f);
vs_cb.VertexOffset = GSVector4(ox * sx + 1, -(oy * sy + 1), 0.0f, -1.0f);
//vs_cb.VertexScale = GSVector4(sx, sy, 0.0f, 0.0f);
//vs_cb.VertexOffset = GSVector4(ox * sx, oy * sy, 0.0f, -1.0f);
{
GSVertexShader11 vs;
hash_map<uint32, GSVertexShader11>::const_iterator i = m_vs.find(vs_sel);
if(i != m_vs.end())
{
vs = i->second;
}
else
{
string str[2];
str[0] = format("%d", vs_sel.tme);
str[1] = format("%d", vs_sel.fst);
D3D11_SHADER_MACRO macro[] =
{
{"VS_TME", str[0].c_str()},
{"VS_FST", str[1].c_str()},
{NULL, NULL},
};
D3D11_INPUT_ELEMENT_DESC layout[] =
{
{"TEXCOORD", 0, DXGI_FORMAT_R32G32_FLOAT, 0, 0, D3D11_INPUT_PER_VERTEX_DATA, 0},
{"COLOR", 0, DXGI_FORMAT_R8G8B8A8_UNORM, 0, 8, D3D11_INPUT_PER_VERTEX_DATA, 0},
{"TEXCOORD", 1, DXGI_FORMAT_R32_FLOAT, 0, 12, D3D11_INPUT_PER_VERTEX_DATA, 0},
{"POSITION", 0, DXGI_FORMAT_R16G16_UINT, 0, 16, D3D11_INPUT_PER_VERTEX_DATA, 0},
{"POSITION", 1, DXGI_FORMAT_R32_UINT, 0, 20, D3D11_INPUT_PER_VERTEX_DATA, 0},
{"TEXCOORD", 2, DXGI_FORMAT_R16G16_UINT, 0, 24, D3D11_INPUT_PER_VERTEX_DATA, 0},
{"COLOR", 1, DXGI_FORMAT_R8G8B8A8_UNORM, 0, 28, D3D11_INPUT_PER_VERTEX_DATA, 0},
};
dev->CompileShader(IDR_CS_FX, "vs_main", macro, &vs.vs, layout, countof(layout), &vs.il);
m_vs[vs_sel] = vs;
}
ctx->UpdateSubresource(m_vs_cb, 0, NULL, &vs_cb, 0, 0); // TODO: only update if changed
dev->VSSetShader(vs.vs, m_vs_cb);
dev->IASetInputLayout(vs.il);
}
// SetupGS
GSSelector gs_sel;
gs_sel.iip = PRIM->IIP;
CComPtr<ID3D11GeometryShader> gs[2];
for(int j = 0; j < 2; j++)
{
gs_sel.prim = j == 0 ? m_vt.m_primclass : GS_SPRITE_CLASS;
hash_map<uint32, CComPtr<ID3D11GeometryShader> >::const_iterator i = m_gs.find(gs_sel);
if(i != m_gs.end())
{
gs[j] = i->second;
}
else
{
string str[2];
str[0] = format("%d", gs_sel.iip);
str[1] = format("%d", j == 0 ? gs_sel.prim : GS_SPRITE_CLASS);
D3D11_SHADER_MACRO macro[] =
{
{"GS_IIP", str[0].c_str()},
{"GS_PRIM", str[1].c_str()},
{NULL, NULL},
};
dev->CompileShader(IDR_CS_FX, "gs_main", macro, &gs[j]);
m_gs[gs_sel] = gs[j];
}
}
// SetupPS
dev->PSSetSamplerState(m_ss, NULL, NULL);
PSSelector ps_sel;
ps_sel.fpsm = context->FRAME.PSM;
ps_sel.zpsm = context->ZBUF.PSM;
CComPtr<ID3D11PixelShader> ps[2] = {m_ps0, NULL};
hash_map<uint32, CComPtr<ID3D11PixelShader> >::const_iterator i = m_ps1.find(ps_sel);
if(i != m_ps1.end())
{
ps[1] = i->second;
}
else
{
cs = i->second;
string str[15];
str[0] = format("%d", PS_BATCH_SIZE);
str[1] = format("%d", context->FRAME.PSM);
str[2] = format("%d", context->ZBUF.PSM);
D3D11_SHADER_MACRO macro[] =
{
{"PS_BATCH_SIZE", str[0].c_str()},
{"PS_FPSM", str[1].c_str()},
{"PS_ZPSM", str[2].c_str()},
{NULL, NULL},
};
dev->CompileShader(IDR_CS_FX, "ps_main1", macro, &ps[1]);
m_ps1[ps_sel] = ps[1];
}
PSConstantBuffer ps_cb;
ps_cb.fm = fm;
ps_cb.zm = zm;
ctx->UpdateSubresource(m_ps_cb, 0, NULL, &ps_cb, 0, 0); // TODO: only update if changed
OffsetBuffer* fzbo = NULL;
//
GetOffsetBuffer(&fzbo);
dev->CSSetShaderUAV(0, m_vm_uav);
dev->CSSetShaderSRV(0, vb_srv);
dev->CSSetShaderSRV(1, ib_srv);
dev->CSSetShaderSRV(2, fbr_srv);
dev->CSSetShaderSRV(3, fbc_srv);
dev->CSSetShaderSRV(4, zbr_srv);
dev->CSSetShaderSRV(5, zbc_srv);
dev->CSSetShader(cs);
dev->PSSetShaderResourceView(0, fzbo->row_srv);
dev->PSSetShaderResourceView(1, fzbo->col_srv);
// TODO: palette, texture
GSVector4i bbox = GSVector4i(0, 0, 640, 512); // TODO: vertex trace
int step = PS_BATCH_SIZE * GSUtil::GetVertexCount(PRIM->PRIM);
GSVector4i r = bbox.ralign<Align_Outside>(GSVector2i(16, 8));
for(int i = 0; i < m_index.tail; i += step)
{
dev->IASetPrimitiveTopology(topology);
dev->GSSetShader(gs[0]);
dev->PSSetShader(ps[0], m_ps_cb);
dev->DrawIndexedPrimitive(i, std::min<int>(m_index.tail - i, step));
bool fb = true; // TODO: frame buffer used
bool zb = true; // TODO: z-buffer used
dev->IASetPrimitiveTopology(D3D11_PRIMITIVE_TOPOLOGY_LINELIST);
dev->GSSetShader(gs[1]);
dev->PSSetShader(ps[1], m_ps_cb);
dev->DrawIndexedPrimitive(m_index.tail, 2);
if(fb) Write(m_context->offset.fb, r);
if(zb) Write(m_context->offset.zb, r);
//printf("%d/%d, %d %d %d %d\n", i, m_index.tail, r2.x, r2.y, r2.z, r2.w);
}
// TODO: constant buffer (frequently changing drawing attribs)
// TODO: texture (implement texture cache)
// TODO: clut to a palette texture (should be texture1d, not simply buffer, it is random accessed)
// TODO: CSSetShaderSRV(6 7 8 ..., texture level 0 1 2 ...) or use Texture3D?
// TODO: invalidate texture cache
dev->EndScene();
/*
CComPtr<ID3D11Query> q;
if(0)
{
std::string s;
/*
s = format("c:\\temp1\\_%05d_f%lld_fb0_%05x_%d.bmp", s_n, m_perfmon.GetFrame(), 0, 0);
m_mem.SaveBMP(s, 0, 16, PSM_PSMCT32, 1024, 1024);
Read(m_mem.GetOffset(0, 16, PSM_PSMCT32), GSVector4i(0, 0, 1024, 1024), false);
*/
//
if(fm != 0xffffffff) Read(context->offset.fb, r, false);
//
if(zm != 0xffffffff) Read(context->offset.zb, r, false);
D3D11_QUERY_DESC qd;
memset(&qd, 0, sizeof(qd));
qd.Query = D3D11_QUERY_EVENT;
s = format("c:\\temp1\\_%05d_f%lld_rt1_%05x_%d.bmp", s_n, m_perfmon.GetFrame(), m_context->FRAME.Block(), m_context->FRAME.PSM);
m_mem.SaveBMP(s, m_context->FRAME.Block(), m_context->FRAME.FBW, m_context->FRAME.PSM, GetFrameRect().width(), 512);
hr = (*dev)->CreateQuery(&qd, &q);
s = format("c:\\temp1\\_%05d_f%lld_zt1_%05x_%d.bmp", s_n, m_perfmon.GetFrame(), m_context->ZBUF.Block(), m_context->ZBUF.PSM);
m_mem.SaveBMP(s, m_context->ZBUF.Block(), m_context->FRAME.FBW, m_context->ZBUF.PSM, GetFrameRect().width(), 512);
ctx->Begin(q);
*/
printf("[%lld] dispatch %05x %d %05x %d %05x %d %dx%d | %d %d %d\n",
__rdtsc(),
m_context->FRAME.Block(), m_context->FRAME.PSM,
m_context->ZBUF.Block(), m_context->ZBUF.PSM,
PRIM->TME ? m_context->TEX0.TBP0 : 0xfffff, m_context->TEX0.PSM, (int)m_context->TEX0.TW, (int)m_context->TEX0.TH,
PRIM->PRIM, m_vertex.next, m_index.tail);
/*
s = format("c:\\temp1\\_%05d_f%lld_fb1_%05x_%d.bmp", s_n, m_perfmon.GetFrame(), 0, 0);
m_mem.SaveBMP(s, 0, 16, PSM_PSMCT32, 1024, 1024);
*/
GSVector4i rsize = r.rsize();
dev->Dispatch(rsize.z >> 4, rsize.w >> 3, 1); // TODO: pass upper-left corner offset (r.xy) in a const buffer
/*
ctx->End(q);
uint64 t0 = __rdtsc();
BOOL b;
while(S_OK != ctx->GetData(q, &b, sizeof(BOOL), 0)) {}
printf("%lld\n", __rdtsc() - t0);
*/
s_n++;
}
}
void GSRendererCS::InvalidateVideoMem(const GIFRegBITBLTBUF& BITBLTBUF, const GSVector4i& r)
{
GSOffset* o = m_mem.GetOffset(BITBLTBUF.DBP, BITBLTBUF.DBW, BITBLTBUF.DPSM);
Read(o, r, true); // TODO: fully overwritten pages are not needed to be read, only invalidated
Read(o, r, true); // TODO: fully overwritten pages are not needed to be read, only invalidated (important)
// TODO: false deps, 8H/4HL/4HH texture sharing pages with 24-bit target
// TODO: invalidate texture cache
@ -356,6 +703,10 @@ void GSRendererCS::Write(GSOffset* o, const GSVector4i& r)
memset(&box, 0, sizeof(box));
box.right = 1;
box.bottom = 1;
box.back = 1;
uint32* pages = o->GetPages(r);
for(size_t i = 0; pages[i] != GSOffset::EOP; i++)
@ -370,10 +721,20 @@ void GSRendererCS::Write(GSOffset* o, const GSVector4i& r)
m_vm_valid[row] |= col;
box.left = page * PAGE_SIZE;
box.right = box.left + PAGE_SIZE;
box.right = (page + 1) * PAGE_SIZE;
ctx->UpdateSubresource(m_vm, 0, &box, m_mem.m_vm8 + box.left, 0, 0);
ctx->UpdateSubresource(m_vm, 0, &box, m_mem.m_vm8 + page * PAGE_SIZE, 0, 0);
/*
// m_vm texture row is 2k in bytes, one page is 8k => starting row: addr / 4k, number of rows: 8k / 2k = 4
box.left = 0;
box.right = PAGE_SIZE;
box.top = page;
box.bottom = box.top + 1;
ctx->UpdateSubresource(m_vm, 0, &box, m_mem.m_vm8 + page * PAGE_SIZE, 0, 0);
*/
if(0)
printf("[%lld] write %05x %d %d (%d)\n", __rdtsc(), o->bp, o->bw, o->psm, page);
}
}
@ -391,6 +752,10 @@ void GSRendererCS::Read(GSOffset* o, const GSVector4i& r, bool invalidate)
memset(&box, 0, sizeof(box));
box.right = 1;
box.bottom = 1;
box.back = 1;
uint32* pages = o->GetPages(r);
for(size_t i = 0; pages[i] != GSOffset::EOP; i++)
@ -402,21 +767,34 @@ void GSRendererCS::Read(GSOffset* o, const GSVector4i& r, bool invalidate)
if(m_vm_valid[row] & col)
{
if(invalidate) m_vm_valid[row] ^= col;
if(invalidate)
{
m_vm_valid[row] ^= col;
}
box.left = page * PAGE_SIZE;
box.right = box.left + PAGE_SIZE;
box.right = (page + 1) * PAGE_SIZE;
ctx->CopySubresourceRegion(m_pb, 0, 0, 0, 0, m_vm, 0, &box);
/*
// m_vm texture row is 2k in bytes, one page is 8k => starting row: addr / 4k, number of rows: 8k / 2k = 4
box.left = 0;
box.right = PAGE_SIZE;
box.top = page;
box.bottom = box.top + 1;
ctx->CopySubresourceRegion(m_pb, 0, 0, 0, 0, m_vm, 0, &box);
*/
D3D11_MAPPED_SUBRESOURCE map;
if(SUCCEEDED(ctx->Map(m_pb, 0, D3D11_MAP_READ_WRITE, 0, &map)))
if(SUCCEEDED(ctx->Map(m_pb, 0, D3D11_MAP_READ, 0, &map)))
{
memcpy(m_mem.m_vm8 + box.left, map.pData, PAGE_SIZE);
memcpy(m_mem.m_vm8 + page * PAGE_SIZE, map.pData, PAGE_SIZE);
ctx->Unmap(m_pb, 0);
if(0)
printf("[%lld] read %05x %d %d (%d)\n", __rdtsc(), o->bp, o->bw, o->psm, page);
}
}
@ -424,3 +802,64 @@ void GSRendererCS::Read(GSOffset* o, const GSVector4i& r, bool invalidate)
delete [] pages;
}
// Returns the cached GPU offset buffers for the current context's fzb offset
// table, creating and caching them on first use. Entries in m_offset are keyed
// by the table's hash.
//
// fzbo: receives a pointer to the entry stored in m_offset.
// Returns false if any D3D11 buffer or shader resource view cannot be created;
// in that case nothing is added to m_offset and *fzbo is not written.
bool GSRendererCS::GetOffsetBuffer(OffsetBuffer** fzbo)
{
	GSDevice11* dev = (GSDevice11*)m_dev;

	const uint32 hash = m_context->offset.fzb->hash;

	hash_map<uint32, OffsetBuffer>::iterator it = m_offset.find(hash);

	if(it != m_offset.end())
	{
		// already built for this offset table
		*fzbo = &it->second;

		return true;
	}

	OffsetBuffer entry;

	// the row and col tables use the same immutable, shader-readable description
	D3D11_BUFFER_DESC desc;

	memset(&desc, 0, sizeof(desc));

	desc.ByteWidth = sizeof(GSVector2i) * 2048;
	desc.Usage = D3D11_USAGE_IMMUTABLE;
	desc.BindFlags = D3D11_BIND_SHADER_RESOURCE;

	D3D11_SUBRESOURCE_DATA init;

	memset(&init, 0, sizeof(init));

	init.pSysMem = m_context->offset.fzb->row;

	if(FAILED((*dev)->CreateBuffer(&desc, &init, &entry.row))) return false;

	init.pSysMem = m_context->offset.fzb->col;

	if(FAILED((*dev)->CreateBuffer(&desc, &init, &entry.col))) return false;

	// one view description serves both buffers: 2048 elements of two signed 32-bit ints
	D3D11_SHADER_RESOURCE_VIEW_DESC view;

	memset(&view, 0, sizeof(view));

	view.Format = DXGI_FORMAT_R32G32_SINT;
	view.ViewDimension = D3D11_SRV_DIMENSION_BUFFER;
	view.Buffer.FirstElement = 0;
	view.Buffer.NumElements = 2048;

	if(FAILED((*dev)->CreateShaderResourceView(entry.row, &view, &entry.row_srv))) return false;

	if(FAILED((*dev)->CreateShaderResourceView(entry.col, &view, &entry.col_srv))) return false;

	// publish only after every resource was created successfully
	OffsetBuffer& slot = m_offset[hash];

	slot = entry;

	*fzbo = &slot;

	return true;
}

View File

@ -26,28 +26,114 @@
class GSRendererCS : public GSRenderer
{
class GSVertexTraceCS : public GSVertexTrace
struct VSSelector
{
public:
GSVertexTraceCS(const GSState* state) : GSVertexTrace(state) {}
union
{
struct
{
uint32 tme:1;
uint32 fst:1;
};
uint32 key;
};
operator uint32() {return key & 0x3;}
VSSelector() : key(0) {}
};
// Two GSVector4 constants (scale and offset) declared through the project's
// __aligned macro.
// NOTE(review): presumably the CPU-side layout of the m_vs_cb constant buffer and
// must match the vertex shader's cbuffer declaration — confirm.
__aligned(struct, 32) VSConstantBuffer
{
GSVector4 VertexScale;
GSVector4 VertexOffset;
};
// Packs the geometry-stage options into a small integer key.
// NOTE(review): presumably the lookup key for the m_gs shader map — confirm.
struct GSSelector
{
union
{
struct
{
uint32 iip:1; // 1-bit flag
uint32 prim:2; // 2-bit value
};
uint32 key; // the bit-fields above viewed as one integer
};
// Only the three declared bits (iip + prim) contribute to the key.
operator uint32() {return key & 0x7;}
// Start with every option cleared.
GSSelector() : key(0) {}
};
// Packs the pixel-stage options (two 6-bit format codes) into an integer key.
// NOTE(review): presumably the lookup key for the m_ps1 shader map — confirm.
struct PSSelector
{
	union
	{
		struct
		{
			uint32 fpsm:6; // 6-bit format code
			uint32 zpsm:6; // 6-bit format code
		};

		uint32 key; // the bit-fields above viewed as one integer
	};

	// The fields occupy 12 bits in total, so the mask must be 0xfff. The previous
	// 0x3ff mask kept only 10 bits and dropped the top two bits of zpsm, letting
	// selectors that differ only in those bits convert to the same key.
	operator uint32() {return key & 0xfff;}

	// Start with every option cleared.
	PSSelector() : key(0) {}
};
// Two 32-bit constants declared through the project's __aligned macro.
// NOTE(review): presumably the CPU-side layout of the m_ps_cb constant buffer, with
// fm / zm being the frame-buffer and z-buffer masks — confirm against the shader.
__aligned(struct, 32) PSConstantBuffer
{
uint32 fm;
uint32 zm;
};
CComPtr<ID3D11DepthStencilState> m_dss;
CComPtr<ID3D11BlendState> m_bs;
CComPtr<ID3D11SamplerState> m_ss;
CComPtr<ID3D11Buffer> m_lb;
CComPtr<ID3D11UnorderedAccessView> m_lb_uav;
CComPtr<ID3D11ShaderResourceView> m_lb_srv;
CComPtr<ID3D11Buffer> m_sob;
CComPtr<ID3D11UnorderedAccessView> m_sob_uav;
CComPtr<ID3D11ShaderResourceView> m_sob_srv;
CComPtr<ID3D11Buffer> m_vm;
//CComPtr<ID3D11Texture2D> m_vm;
CComPtr<ID3D11UnorderedAccessView> m_vm_uav;
CComPtr<ID3D11Buffer> m_vb;
CComPtr<ID3D11Buffer> m_ib;
CComPtr<ID3D11Buffer> m_pb;
hash_map<uint32, CComPtr<ID3D11ComputeShader> > m_cs;
uint32 m_vm_valid[16];
CComPtr<ID3D11Buffer> m_pb;
//CComPtr<ID3D11Texture2D> m_pb;
hash_map<uint32, GSVertexShader11 > m_vs;
CComPtr<ID3D11Buffer> m_vs_cb;
hash_map<uint32, CComPtr<ID3D11GeometryShader> > m_gs;
CComPtr<ID3D11PixelShader> m_ps0;
hash_map<uint32, CComPtr<ID3D11PixelShader> > m_ps1;
CComPtr<ID3D11Buffer> m_ps_cb;
void Write(GSOffset* o, const GSVector4i& r);
void Read(GSOffset* o, const GSVector4i& r, bool invalidate);
// GPU resources for one offset table: a row buffer and a col buffer, each with
// a shader resource view. Entries are created and cached by GetOffsetBuffer().
struct OffsetBuffer
{
CComPtr<ID3D11Buffer> row, col;
CComPtr<ID3D11ShaderResourceView> row_srv, col_srv;
};
hash_map<uint32, OffsetBuffer> m_offset;
bool GetOffsetBuffer(OffsetBuffer** fzbo);
protected:
template<uint32 prim, uint32 tme, uint32 fst>
void ConvertVertex(size_t dst_index, size_t src_index);
GSTexture* m_texture[2];
uint8* m_output;
bool CreateDevice(GSDevice* dev);
void ResetDevice();
void VSync(int field);
GSTexture* GetOutput(int i);
void Draw();
void InvalidateVideoMem(const GIFRegBITBLTBUF& BITBLTBUF, const GSVector4i& r);

View File

@ -23,10 +23,9 @@
#include "GSRendererDX.h"
#include "GSDeviceDX.h"
GSRendererDX::GSRendererDX(GSVertexTrace* vt, size_t vertex_stride, GSTextureCache* tc, const GSVector2& pixelcenter)
: GSRendererHW(vt, vertex_stride, tc)
GSRendererDX::GSRendererDX(GSTextureCache* tc, const GSVector2& pixelcenter)
: GSRendererHW(tc)
, m_pixelcenter(pixelcenter)
, m_topology(-1)
{
m_logz = !!theApp.GetConfig("logz", 0);
m_fba = !!theApp.GetConfig("fba", 1);
@ -61,7 +60,7 @@ void GSRendererDX::DrawPrims(GSTexture* rt, GSTexture* ds, GSTextureCache::Sourc
GSVector4 s = GSVector4(rtscale.x / rtsize.x, rtscale.y / rtsize.y);
GSVector4 o = GSVector4(-1.0f, 1.0f);
GSVector4 src = ((m_vt->m_min.p.xyxy(m_vt->m_max.p) + o.xxyy()) * s.xyxy()).sat(o.zzyy());
GSVector4 src = ((m_vt.m_min.p.xyxy(m_vt.m_max.p) + o.xxyy()) * s.xyxy()).sat(o.zzyy());
GSVector4 dst = src * 2.0f + o.xxxx();
GSVertexPT1 vertices[] =
@ -111,7 +110,7 @@ void GSRendererDX::DrawPrims(GSTexture* rt, GSTexture* ds, GSTextureCache::Sourc
if(!IsOpaque())
{
om_bsel.abe = PRIM->ABE || PRIM->AA1 && m_vt->m_primclass == GS_LINE_CLASS;
om_bsel.abe = PRIM->ABE || PRIM->AA1 && m_vt.m_primclass == GS_LINE_CLASS;
om_bsel.a = context->ALPHA.A;
om_bsel.b = context->ALPHA.B;
@ -154,11 +153,11 @@ void GSRendererDX::DrawPrims(GSTexture* rt, GSTexture* ds, GSTextureCache::Sourc
{
if(context->ZBUF.PSM == PSM_PSMZ24)
{
if(m_vt->m_max.p.z > 0xffffff)
if(m_vt.m_max.p.z > 0xffffff)
{
ASSERT(m_vt->m_min.p.z > 0xffffff);
ASSERT(m_vt.m_min.p.z > 0xffffff);
// Fixme :Following conditional fixes some dialog frame in Wild Arms 3, but may not be what was intended.
if (m_vt->m_min.p.z > 0xffffff)
if (m_vt.m_min.p.z > 0xffffff)
{
vs_sel.bppz = 1;
om_dssel.ztst = ZTST_ALWAYS;
@ -167,11 +166,11 @@ void GSRendererDX::DrawPrims(GSTexture* rt, GSTexture* ds, GSTextureCache::Sourc
}
else if(context->ZBUF.PSM == PSM_PSMZ16 || context->ZBUF.PSM == PSM_PSMZ16S)
{
if(m_vt->m_max.p.z > 0xffff)
if(m_vt.m_max.p.z > 0xffff)
{
ASSERT(m_vt->m_min.p.z > 0xffff); // sfex capcom logo
ASSERT(m_vt.m_min.p.z > 0xffff); // sfex capcom logo
// Fixme : Same as above, I guess.
if (m_vt->m_min.p.z > 0xffff)
if (m_vt.m_min.p.z > 0xffff)
{
vs_sel.bppz = 2;
om_dssel.ztst = ZTST_ALWAYS;
@ -213,7 +212,7 @@ void GSRendererDX::DrawPrims(GSTexture* rt, GSTexture* ds, GSTextureCache::Sourc
GSDeviceDX::GSSelector gs_sel;
gs_sel.iip = PRIM->IIP;
gs_sel.prim = m_vt->m_primclass;
gs_sel.prim = m_vt.m_primclass;
// ps
@ -233,7 +232,7 @@ void GSRendererDX::DrawPrims(GSTexture* rt, GSTexture* ds, GSTextureCache::Sourc
}
}
if (env.COLCLAMP.CLAMP == 0 && /* hack */ !tex && PRIM->PRIM != GS_POINTLIST)
if(env.COLCLAMP.CLAMP == 0 && /* hack */ !tex && PRIM->PRIM != GS_POINTLIST)
{
ps_sel.colclip = 1;
}
@ -281,7 +280,7 @@ void GSRendererDX::DrawPrims(GSTexture* rt, GSTexture* ds, GSTextureCache::Sourc
ps_sel.aem = env.TEXA.AEM;
ps_sel.tfx = context->TEX0.TFX;
ps_sel.tcc = context->TEX0.TCC;
ps_sel.ltf = m_filter == 2 ? m_vt->IsLinear() : m_filter;
ps_sel.ltf = m_filter == 2 ? m_vt.IsLinear() : m_filter;
ps_sel.rt = tex->m_target;
int w = tex->m_texture->GetWidth();
@ -330,8 +329,9 @@ void GSRendererDX::DrawPrims(GSTexture* rt, GSTexture* ds, GSTextureCache::Sourc
uint8 afix = context->ALPHA.FIX;
SetupIA();
dev->SetupOM(om_dssel, om_bsel, afix);
dev->SetupIA(m_vertex.buff, m_vertex.next, m_index.buff, m_index.tail, m_topology);
dev->SetupVS(vs_sel, &vs_cb);
dev->SetupGS(gs_sel);
dev->SetupPS(ps_sel, &ps_cb, ps_ssel);

View File

@ -32,13 +32,12 @@ class GSRendererDX : public GSRendererHW
bool UserHacks_AlphaHack;
protected:
int m_topology;
virtual void DrawPrims(GSTexture* rt, GSTexture* ds, GSTextureCache::Source* tex);
virtual void SetupIA() = 0;
virtual void UpdateFBA(GSTexture* rt) {}
public:
GSRendererDX(GSVertexTrace* vt, size_t vertex_stride, GSTextureCache* tc, const GSVector2& pixelcenter = GSVector2(0, 0));
GSRendererDX(GSTextureCache* tc, const GSVector2& pixelcenter = GSVector2(0, 0));
virtual ~GSRendererDX();
};

View File

@ -25,9 +25,8 @@
#include "resource.h"
GSRendererDX11::GSRendererDX11()
: GSRendererDX(new GSVertexTraceDX11(this), sizeof(GSVertexHW11), new GSTextureCache11(this), GSVector2(-0.5f, -0.5f))
: GSRendererDX(new GSTextureCache11(this), GSVector2(-0.5f, -0.5f))
{
InitConvertVertex(GSRendererDX11);
}
bool GSRendererDX11::CreateDevice(GSDevice* dev)
@ -38,43 +37,38 @@ bool GSRendererDX11::CreateDevice(GSDevice* dev)
return true;
}
template<uint32 prim, uint32 tme, uint32 fst>
void GSRendererDX11::ConvertVertex(size_t dst_index, size_t src_index)
void GSRendererDX11::SetupIA()
{
GSVertex* s = (GSVertex*)((GSVertexHW11*)m_vertex.buff + src_index);
GSVertexHW11* d = (GSVertexHW11*)m_vertex.buff + dst_index;
GSDevice11* dev = (GSDevice11*)m_dev;
GSVector4i v0 = ((GSVector4i*)s)[0];
GSVector4i v1 = ((GSVector4i*)s)[1];
void* ptr = NULL;
if(tme && fst)
if(dev->IAMapVertexBuffer(&ptr, sizeof(GSVertex), m_vertex.next))
{
// TODO: modify VertexTrace and the shaders to read uv from v1.u16[0], v1.u16[1], then this step is not needed
GSVector4i::storent(ptr, m_vertex.buff, sizeof(GSVertex) * m_vertex.next);
v0 = GSVector4i::cast(GSVector4(v1.uph16()).xyzw(GSVector4::cast(v0))); // uv => st
dev->IAUnmapVertexBuffer();
}
((GSVector4i*)d)[0] = v0;
((GSVector4i*)d)[1] = v1;
}
dev->IASetIndexBuffer(m_index.buff, m_index.tail);
void GSRendererDX11::DrawPrims(GSTexture* rt, GSTexture* ds, GSTextureCache::Source* tex)
{
switch(m_vt->m_primclass)
D3D11_PRIMITIVE_TOPOLOGY t;
switch(m_vt.m_primclass)
{
case GS_POINT_CLASS:
m_topology = D3D11_PRIMITIVE_TOPOLOGY_POINTLIST;
t = D3D11_PRIMITIVE_TOPOLOGY_POINTLIST;
break;
case GS_LINE_CLASS:
case GS_SPRITE_CLASS:
m_topology = D3D11_PRIMITIVE_TOPOLOGY_LINELIST;
t = D3D11_PRIMITIVE_TOPOLOGY_LINELIST;
break;
case GS_TRIANGLE_CLASS:
m_topology = D3D11_PRIMITIVE_TOPOLOGY_TRIANGLELIST;
t = D3D11_PRIMITIVE_TOPOLOGY_TRIANGLELIST;
break;
default:
__assume(0);
}
__super::DrawPrims(rt, ds, tex);
dev->IASetPrimitiveTopology(t);
}

View File

@ -28,14 +28,7 @@
class GSRendererDX11 : public GSRendererDX
{
protected:
template<uint32 prim, uint32 tme, uint32 fst>
void ConvertVertex(size_t dst_index, size_t src_index);
void DrawPrims(GSTexture* rt, GSTexture* ds, GSTextureCache::Source* tex);
int GetPosX(const void* vertex) const {return (int)((const GSVertexHW11*)vertex)->p.x;}
int GetPosY(const void* vertex) const {return (int)((const GSVertexHW11*)vertex)->p.y;}
uint32 GetColor(const void* vertex) const {return ((const GSVertexHW11*)vertex)->c0;}
void SetColor(void* vertex, uint32 c) const {((GSVertexHW11*)vertex)->c0 = c;}
void SetupIA();
public:
GSRendererDX11();

View File

@ -25,9 +25,8 @@
#include "resource.h"
GSRendererDX9::GSRendererDX9()
: GSRendererDX(new GSVertexTraceDX9(this), sizeof(GSVertexHW9), new GSTextureCache9(this))
: GSRendererDX(new GSTextureCache9(this))
{
InitConvertVertex(GSRendererDX9);
}
bool GSRendererDX9::CreateDevice(GSDevice* dev)
@ -57,56 +56,21 @@ bool GSRendererDX9::CreateDevice(GSDevice* dev)
return true;
}
template<uint32 prim, uint32 tme, uint32 fst>
void GSRendererDX9::ConvertVertex(size_t dst_index, size_t src_index)
void GSRendererDX9::SetupIA()
{
GSVertex* s = (GSVertex*)((GSVertexHW9*)m_vertex.buff + src_index);
GSVertexHW9* d = (GSVertexHW9*)m_vertex.buff + dst_index;
D3DPRIMITIVETYPE topology;
GSVector4 p = GSVector4(GSVector4i::load(s->XYZ.u32[0]).upl16());
if(tme && !fst)
{
p = p.xyxy(GSVector4((float)s->XYZ.Z, s->RGBAQ.Q));
}
else
{
p = p.xyxy(GSVector4::load((float)s->XYZ.Z));
}
GSVector4 t = GSVector4::zero();
if(tme)
{
if(fst)
{
t = GSVector4(GSVector4i::load(s->UV).upl16());
}
else
{
t = GSVector4::loadl(&s->ST);
}
}
t = t.xyxy(GSVector4::cast(GSVector4i(s->RGBAQ.u32[0], s->FOG)));
d->p = p;
d->t = t;
}
void GSRendererDX9::DrawPrims(GSTexture* rt, GSTexture* ds, GSTextureCache::Source* tex)
{
switch(m_vt->m_primclass)
switch(m_vt.m_primclass)
{
case GS_POINT_CLASS:
m_topology = D3DPT_POINTLIST;
topology = D3DPT_POINTLIST;
break;
case GS_LINE_CLASS:
m_topology = D3DPT_LINELIST;
topology = D3DPT_LINELIST;
if(PRIM->IIP == 0)
{
@ -122,7 +86,7 @@ void GSRendererDX9::DrawPrims(GSTexture* rt, GSTexture* ds, GSTextureCache::Sour
case GS_TRIANGLE_CLASS:
m_topology = D3DPT_TRIANGLELIST;
topology = D3DPT_TRIANGLELIST;
if(PRIM->IIP == 0)
{
@ -138,7 +102,7 @@ void GSRendererDX9::DrawPrims(GSTexture* rt, GSTexture* ds, GSTextureCache::Sour
case GS_SPRITE_CLASS:
m_topology = D3DPT_TRIANGLELIST;
topology = D3DPT_TRIANGLELIST;
// each sprite converted to quad needs twice the space
@ -154,29 +118,35 @@ void GSRendererDX9::DrawPrims(GSTexture* rt, GSTexture* ds, GSTextureCache::Sour
size_t count = m_vertex.next;
int i = (int)count * 2 - 4;
GSVertexHW9* s = (GSVertexHW9*)&m_vertex.buff[sizeof(GSVertexHW9) * count] - 2;
GSVertexHW9* q = (GSVertexHW9*)&m_vertex.buff[sizeof(GSVertexHW9) * (count * 2)] - 4;
uint32* RESTRICT index = &m_index.buff[count * 3] - 6;
GSVertex* s = &m_vertex.buff[count - 2];
GSVertex* q = &m_vertex.buff[count * 2 - 4];
uint32* RESTRICT index = &m_index.buff[count * 3 - 6];
for(; i >= 0; i -= 4, s -= 2, q -= 4, index -= 6)
{
GSVertexHW9 v0 = s[0];
GSVertexHW9 v1 = s[1];
GSVertex v0 = s[0];
GSVertex v1 = s[1];
v0.p = v0.p.xyzw(v1.p); // z, q
v0.t = v0.t.xyzw(v1.t); // c, f
v0.RGBAQ = v1.RGBAQ;
v0.XYZ.Z = v1.XYZ.Z;
v0.FOG = v1.FOG;
q[0] = v0;
q[3] = v1;
// swap x, s
// swap x, s, u
GSVector4 p = v0.p.insert<0, 0>(v1.p);
GSVector4 t = v0.t.insert<0, 0>(v1.t);
v1.p = v1.p.insert<0, 0>(v0.p);
v1.t = v1.t.insert<0, 0>(v0.t);
v0.p = p;
v0.t = t;
uint16 x = v0.XYZ.X;
v0.XYZ.X = v1.XYZ.X;
v1.XYZ.X = x;
float s = v0.ST.S;
v0.ST.S = v1.ST.S;
v1.ST.S = s;
uint16 u = v0.U;
v0.U = v1.U;
v1.U = u;
q[1] = v0;
q[2] = v1;
@ -199,9 +169,56 @@ void GSRendererDX9::DrawPrims(GSTexture* rt, GSTexture* ds, GSTextureCache::Sour
__assume(0);
}
(*(GSDevice9*)m_dev)->SetRenderState(D3DRS_SHADEMODE, PRIM->IIP ? D3DSHADE_GOURAUD : D3DSHADE_FLAT); // TODO
GSDevice9* dev = (GSDevice9*)m_dev;
__super::DrawPrims(rt, ds, tex);
(*dev)->SetRenderState(D3DRS_SHADEMODE, PRIM->IIP ? D3DSHADE_GOURAUD : D3DSHADE_FLAT); // TODO
void* ptr = NULL;
if(dev->IAMapVertexBuffer(&ptr, sizeof(GSVertexHW9), m_vertex.next))
{
GSVertex* RESTRICT s = (GSVertex*)m_vertex.buff;
GSVertexHW9* RESTRICT d = (GSVertexHW9*)ptr;
for(int i = 0; i < m_vertex.next; i++, s++, d++)
{
GSVector4 p = GSVector4(GSVector4i::load(s->XYZ.u32[0]).upl16());
if(PRIM->TME && !PRIM->FST)
{
p = p.xyxy(GSVector4((float)s->XYZ.Z, s->RGBAQ.Q));
}
else
{
p = p.xyxy(GSVector4::load((float)s->XYZ.Z));
}
GSVector4 t = GSVector4::zero();
if(PRIM->TME)
{
if(PRIM->FST)
{
t = GSVector4(GSVector4i::load(s->UV).upl16());
}
else
{
t = GSVector4::loadl(&s->ST);
}
}
t = t.xyxy(GSVector4::cast(GSVector4i(s->RGBAQ.u32[0], s->FOG)));
d->p = p;
d->t = t;
}
dev->IAUnmapVertexBuffer();
}
dev->IASetIndexBuffer(m_index.buff, m_index.tail);
dev->IASetPrimitiveTopology(topology);
}
void GSRendererDX9::UpdateFBA(GSTexture* rt)
@ -220,7 +237,7 @@ void GSRendererDX9::UpdateFBA(GSTexture* rt)
GSVector4 s = GSVector4(rt->GetScale().x / rt->GetWidth(), rt->GetScale().y / rt->GetHeight());
GSVector4 o = GSVector4(-1.0f, 1.0f);
GSVector4 src = ((m_vt->m_min.p.xyxy(m_vt->m_max.p) + o.xxyy()) * s.xyxy()).sat(o.zzyy());
GSVector4 src = ((m_vt.m_min.p.xyxy(m_vt.m_max.p) + o.xxyy()) * s.xyxy()).sat(o.zzyy());
GSVector4 dst = src * 2.0f + o.xxxx();
GSVertexPT1 vertices[] =

View File

@ -34,17 +34,9 @@ protected:
Direct3DBlendState9 bs;
} m_fba;
template<uint32 prim, uint32 tme, uint32 fst>
void ConvertVertex(size_t dst_index, size_t src_index);
void DrawPrims(GSTexture* rt, GSTexture* ds, GSTextureCache::Source* tex);
void SetupIA();
void UpdateFBA(GSTexture* rt);
int GetPosX(const void* vertex) const {return (int)((const GSVertexHW9*)vertex)->p.x;}
int GetPosY(const void* vertex) const {return (int)((const GSVertexHW9*)vertex)->p.y;}
uint32 GetColor(const void* vertex) const {return ((const GSVertexHW9*)vertex)->t.u32[2];}
void SetColor(void* vertex, uint32 c) const {((GSVertexHW9*)vertex)->t.u32[2] = c;}
public:
GSRendererDX9();
virtual ~GSRendererDX9() {}

View File

@ -22,9 +22,8 @@
#include "stdafx.h"
#include "GSRendererHW.h"
GSRendererHW::GSRendererHW(GSVertexTrace* vt, size_t vertex_stride, GSTextureCache* tc)
: GSRenderer(vt, vertex_stride)
, m_tc(tc)
GSRendererHW::GSRendererHW(GSTextureCache* tc)
: m_tc(tc)
, m_width(1024)
, m_height(1024)
, m_skip(0)
@ -101,19 +100,18 @@ void GSRendererHW::Reset()
void GSRendererHW::VSync(int field)
{
GSRenderer::VSync(field);
m_tc->IncAge();
m_dev->AgePool();
m_skip = 0;
if(m_reset)
{
m_tc->RemoveAll();
m_reset = false;
}
GSRenderer::VSync(field);
m_tc->IncAge();
m_skip = 0;
}
void GSRendererHW::ResetDevice()
@ -212,7 +210,7 @@ void GSRendererHW::Draw()
GSVector4i r;
GetTextureMinMax(r, context->TEX0, context->CLAMP, m_vt->IsLinear());
GetTextureMinMax(r, context->TEX0, context->CLAMP, m_vt.IsLinear());
tex = m_tc->LookupSource(context->TEX0, env.TEXA, r);
@ -299,7 +297,7 @@ void GSRendererHW::Draw()
//
GSVector4i r = GSVector4i(m_vt->m_min.p.xyxy(m_vt->m_max.p)).rintersect(GSVector4i(context->scissor.in));
GSVector4i r = GSVector4i(m_vt.m_min.p.xyxy(m_vt.m_max.p)).rintersect(GSVector4i(context->scissor.in));
if(fm != 0xffffffff)
{
@ -411,14 +409,14 @@ bool GSRendererHW::OI_FFXII(GSTexture* rt, GSTexture* ds, GSTextureCache::Source
if(lines == 0)
{
if(m_vt->m_primclass == GS_LINE_CLASS && (m_vertex.next == 448 * 2 || m_vertex.next == 512 * 2))
if(m_vt.m_primclass == GS_LINE_CLASS && (m_vertex.next == 448 * 2 || m_vertex.next == 512 * 2))
{
lines = m_vertex.next / 2;
}
}
else
{
if(m_vt->m_primclass == GS_POINT_CLASS)
if(m_vt.m_primclass == GS_POINT_CLASS)
{
if(m_vertex.next >= 16 * 512)
{
@ -429,14 +427,14 @@ bool GSRendererHW::OI_FFXII(GSTexture* rt, GSTexture* ds, GSTextureCache::Source
int ox = m_context->XYOFFSET.OFX;
int oy = m_context->XYOFFSET.OFY;
const uint8* RESTRICT v = m_vertex.buff;
const GSVertex* RESTRICT v = m_vertex.buff;
for(int i = (int)m_vertex.next; i >= 0; i--, v += m_vertex.stride)
for(int i = (int)m_vertex.next; i >= 0; i--, v++)
{
int x = (GetPosX(v) - ox) >> 4;
int y = (GetPosY(v) - oy) >> 4;
int x = (v->XYZ.X - ox) >> 4;
int y = (v->XYZ.Y - oy) >> 4;
video[(y << 8) + (y << 7) + (y << 6) + x] = GetColor(v);
video[(y << 8) + (y << 7) + (y << 6) + x] = v->RGBAQ.u32[0];
}
return false;
@ -446,7 +444,7 @@ bool GSRendererHW::OI_FFXII(GSTexture* rt, GSTexture* ds, GSTextureCache::Source
lines = 0;
}
}
else if(m_vt->m_primclass == GS_LINE_CLASS)
else if(m_vt.m_primclass == GS_LINE_CLASS)
{
if(m_vertex.next == lines * 2)
{
@ -459,10 +457,8 @@ bool GSRendererHW::OI_FFXII(GSTexture* rt, GSTexture* ds, GSTextureCache::Source
t->m_texture->Update(GSVector4i(0, 0, 448, lines), video, 448 * 4);
size_t stride = m_vertex.stride;
memcpy(&m_vertex.buff[stride * 2], &m_vertex.buff[stride * (m_vertex.next - 2)], stride);
memcpy(&m_vertex.buff[stride * 3], &m_vertex.buff[stride * (m_vertex.next - 1)], stride);
m_vertex.buff[2] = m_vertex.buff[m_vertex.next - 2];
m_vertex.buff[3] = m_vertex.buff[m_vertex.next - 1];
m_index.buff[0] = 0;
m_index.buff[1] = 1;
@ -474,7 +470,7 @@ bool GSRendererHW::OI_FFXII(GSTexture* rt, GSTexture* ds, GSTextureCache::Source
m_vertex.head = m_vertex.tail = m_vertex.next = 4;
m_index.tail = 6;
m_vt->Update(m_vertex.buff, m_index.buff, m_index.tail, GS_TRIANGLE_CLASS);
m_vt.Update(m_vertex.buff, m_index.buff, m_index.tail, GS_TRIANGLE_CLASS);
}
else
{
@ -506,11 +502,11 @@ bool GSRendererHW::OI_MetalSlug6(GSTexture* rt, GSTexture* ds, GSTextureCache::S
{
// missing red channel fix (looks alright in pcsx2 r5000+)
uint8* RESTRICT v = m_vertex.buff;
GSVertex* RESTRICT v = m_vertex.buff;
for(int i = (int)m_vertex.next; i >= 0; i--, v += m_vertex.stride)
for(int i = (int)m_vertex.next; i >= 0; i--, v++)
{
uint32 c = GetColor(v);
uint32 c = v->RGBAQ.u32[0];
uint32 r = (c >> 0) & 0xff;
uint32 g = (c >> 8) & 0xff;
@ -518,11 +514,11 @@ bool GSRendererHW::OI_MetalSlug6(GSTexture* rt, GSTexture* ds, GSTextureCache::S
if(r == 0 && g != 0 && b != 0)
{
SetColor(v, (c & 0xffffff00) | ((g + b + 1) >> 1));
v->RGBAQ.u32[0] = (c & 0xffffff00) | ((g + b + 1) >> 1);
}
}
m_vt->Update(m_vertex.buff, m_index.buff, m_index.tail, m_vt->m_primclass);
m_vt.Update(m_vertex.buff, m_index.buff, m_index.tail, m_vt.m_primclass);
return true;
}
@ -702,7 +698,7 @@ bool GSRendererHW::OI_StarWarsForceUnleashed(GSTexture* rt, GSTexture* ds, GSTex
}
else if(PRIM->TME)
{
if((FBP == 0x0 || FBP == 0x01180) && FPSM == PSM_PSMCT32 && (m_vt->m_max.p.z == m_vt->m_min.p.z && m_vt->m_max.p.z == 0))
if((FBP == 0x0 || FBP == 0x01180) && FPSM == PSM_PSMCT32 && (m_vt.m_eq.z && m_vt.m_max.p.z == 0))
{
m_dev->ClearDepth(ds, 0);
}
@ -758,7 +754,7 @@ bool GSRendererHW::OI_SpyroNewBeginning(GSTexture* rt, GSTexture* ds, GSTextureC
}
else if(PRIM->TME)
{
if((FBP == 0x0 || FBP == 0x01180) && FPSM == PSM_PSMCT32 && (m_vt->m_max.p.z == m_vt->m_min.p.z && m_vt->m_min.p.z == 0x0))
if((FBP == 0x0 || FBP == 0x01180) && FPSM == PSM_PSMCT32 && (m_vt.m_eq.z && m_vt.m_min.p.z == 0))
{
m_dev->ClearDepth(ds, 0);
}
@ -784,7 +780,7 @@ bool GSRendererHW::OI_SpyroEternalNight(GSTexture* rt, GSTexture* ds, GSTextureC
}
else if(PRIM->TME)
{
if((FBP == 0x0 || FBP == 0x01180) && FPSM == PSM_PSMCT32 && (m_vt->m_max.p.z == m_vt->m_min.p.z && m_vt->m_min.p.z == 0x0))
if((FBP == 0x0 || FBP == 0x01180) && FPSM == PSM_PSMCT32 && (m_vt.m_eq.z && m_vt.m_min.p.z == 0))
{
m_dev->ClearDepth(ds, 0);
}
@ -798,7 +794,7 @@ bool GSRendererHW::OI_TalesOfLegendia(GSTexture* rt, GSTexture* ds, GSTextureCac
uint32 FBP = m_context->FRAME.Block();
uint32 FPSM = m_context->FRAME.PSM;
if (FPSM == PSM_PSMCT32 && FBP == 0x01c00 && !m_context->TEST.ATE && m_vt->m_max.p.z == m_vt->m_min.p.z)
if (FPSM == PSM_PSMCT32 && FBP == 0x01c00 && !m_context->TEST.ATE && m_vt.m_eq.z)
{
m_context->TEST.ZTST = ZTST_ALWAYS;
//m_dev->ClearDepth(ds, 0);
@ -810,7 +806,7 @@ bool GSRendererHW::OI_TalesOfLegendia(GSTexture* rt, GSTexture* ds, GSTextureCac
bool GSRendererHW::OI_PointListPalette(GSTexture* rt, GSTexture* ds, GSTextureCache::Source* t)
{
if(m_vt->m_primclass == GS_POINT_CLASS && !PRIM->TME)
if(m_vt.m_primclass == GS_POINT_CLASS && !PRIM->TME)
{
uint32 FBP = m_context->FRAME.Block();
uint32 FBW = m_context->FRAME.FBW;
@ -819,16 +815,16 @@ bool GSRendererHW::OI_PointListPalette(GSTexture* rt, GSTexture* ds, GSTextureCa
{
if(m_vertex.next == 16)
{
uint8* RESTRICT v = m_vertex.buff;
GSVertex* RESTRICT v = m_vertex.buff;
for(int i = 0; i < 16; i++, v += m_vertex.stride)
for(int i = 0; i < 16; i++, v++)
{
uint32 c = GetColor(v);
uint32 c = v->RGBAQ.u32[0];
uint32 a = c >> 24;
c = (a >= 0x80 ? 0xff000000 : (a << 25)) | (c & 0x00ffffff);
SetColor(v, c);
v->RGBAQ.u32[0] = c;
m_mem.WritePixel32(i & 7, i >> 3, c, FBP, FBW);
}
@ -839,16 +835,16 @@ bool GSRendererHW::OI_PointListPalette(GSTexture* rt, GSTexture* ds, GSTextureCa
}
else if(m_vertex.next == 256)
{
uint8* RESTRICT v = m_vertex.buff;
GSVertex* RESTRICT v = m_vertex.buff;
for(int i = 0; i < 256; i++, v += m_vertex.stride)
for(int i = 0; i < 256; i++, v++)
{
uint32 c = GetColor(v);
uint32 c = v->RGBAQ.u32[0];
uint32 a = c >> 24;
c = (a >= 0x80 ? 0xff000000 : (a << 25)) | (c & 0x00ffffff);
SetColor(v, c);
v->RGBAQ.u32[0] = c;
m_mem.WritePixel32(i & 15, i >> 4, c, FBP, FBW);
}

View File

@ -126,11 +126,6 @@ private:
} m_hacks;
virtual int GetPosX(const void* vertex) const = 0;
virtual int GetPosY(const void* vertex) const = 0;
virtual uint32 GetColor(const void* vertex) const = 0;
virtual void SetColor(void* vertex, uint32 c) const = 0;
#pragma endregion
protected:
@ -139,7 +134,7 @@ protected:
virtual void DrawPrims(GSTexture* rt, GSTexture* ds, GSTextureCache::Source* tex) = 0;
public:
GSRendererHW(GSVertexTrace* vt, size_t vertex_stride, GSTextureCache* tc);
GSRendererHW(GSTextureCache* tc);
virtual ~GSRendererHW();
void SetGameCRC(uint32 crc, int options);

View File

@ -32,11 +32,6 @@ class GSRendererNull : public GSRenderer
};
protected:
template<uint32 prim, uint32 tme, uint32 fst>
void ConvertVertex(size_t dst_index, size_t src_index)
{
}
void Draw()
{
}
@ -48,8 +43,7 @@ protected:
public:
GSRendererNull()
: GSRenderer(new GSVertexTraceNull(this), sizeof(GSVertex))
: GSRenderer()
{
InitConvertVertex(GSRendererNull);
}
};

File diff suppressed because it is too large Load Diff

View File

@ -29,27 +29,48 @@ class GSRendererSW : public GSRenderer
{
class SharedData : public GSDrawScanline::SharedData
{
__aligned(struct, 16) TextureLevel
{
GSVector4i r;
GSTextureCacheSW::Texture* t;
};
public:
GSRendererSW* m_parent;
const uint32* m_fb_pages;
const uint32* m_zb_pages;
const uint32* m_tex_pages[7 + 1]; // NULL terminated
int m_fpsm;
int m_zpsm;
bool m_using_pages;
TextureLevel m_tex[7 + 1]; // NULL terminated
enum {SyncNone, SyncSource, SyncTarget} m_syncpoint;
public:
SharedData(GSRendererSW* parent);
virtual ~SharedData();
void UseTargetPages(const uint32* fb_pages, const uint32* zb_pages);
void UseSourcePages(GSTextureCacheSW::Texture* t, int level);
void UsePages(const uint32* fb_pages, int fpsm, const uint32* zb_pages, int zpsm);
void ReleasePages();
void SetSource(GSTextureCacheSW::Texture* t, const GSVector4i& r, int level);
void UpdateSource();
};
typedef void (GSRendererSW::*ConvertVertexBufferPtr)(GSVertexSW* RESTRICT dst, const GSVertex* RESTRICT src, size_t count);
ConvertVertexBufferPtr m_cvb[4][2][2];
template<uint32 primclass, uint32 tme, uint32 fst>
void ConvertVertexBuffer(GSVertexSW* RESTRICT dst, const GSVertex* RESTRICT src, size_t count);
protected:
IRasterizer* m_rl;
GSTextureCacheSW* m_tc;
GSTexture* m_texture[2];
uint8* m_output;
bool m_reset;
GSPixelOffset4* m_fzb;
GSVector4i m_fzb_bbox;
uint32 m_fzb_cur_pages[16];
uint32 m_fzb_pages[512]; // uint16 frame/zbuf pages interleaved
uint16 m_tex_pages[512];
uint32 m_tmp_pages[512 + 1];
@ -60,19 +81,19 @@ protected:
GSTexture* GetOutput(int i);
void Draw();
void Queue(shared_ptr<GSRasterizerData>& item);
void Sync(int reason);
void InvalidateVideoMem(const GIFRegBITBLTBUF& BITBLTBUF, const GSVector4i& r);
void InvalidateLocalMem(const GIFRegBITBLTBUF& BITBLTBUF, const GSVector4i& r, bool clut = false);
void UsePages(const uint32* pages, int type);
void ReleasePages(const uint32* pages, int type);
template<uint32 mask> bool CheckTargetPages(const uint32* pages);
bool CheckTargetPages(const uint32* fb_pages, const uint32* zb_pages, const GSVector4i& r);
bool CheckSourcePages(SharedData* sd);
bool GetScanlineGlobalData(SharedData* data);
template<uint32 prim, uint32 tme, uint32 fst>
void ConvertVertex(size_t dst_index, size_t src_index);
public:
GSRendererSW(int threads);
virtual ~GSRendererSW();

View File

@ -24,6 +24,8 @@
#include "GSLocalMemory.h"
#include "GSVector.h"
#define GS_BILINEAR_PRECISION 4 // max precision 15, but several games like okami, rogue galaxy, dq8 break above 4
union GSScanlineSelector
{
struct
@ -65,8 +67,9 @@ union GSScanlineSelector
uint32 edge:1; // 48
uint32 tw:3; // 49 (encodes values between 3 -> 10, texture cache makes sure it is at least 3)
uint32 lcm:1; // 50
uint32 mmin:2; // 51
uint32 lcm:1; // 52
uint32 mmin:2; // 53
uint32 notest:1; // 54 (no ztest, no atest, no date, no scissor test, and horizontally aligned to 4 pixels)
};
struct

View File

@ -315,7 +315,7 @@ void GSSettingsDlg::UpdateControls()
EnableWindow(GetDlgItem(m_hWnd, IDC_NATIVERES), hw);
EnableWindow(GetDlgItem(m_hWnd, IDC_FILTER), hw);
EnableWindow(GetDlgItem(m_hWnd, IDC_PALTEX), hw);
EnableWindow(GetDlgItem(m_hWnd, IDC_LOGZ), dx9 && hw && GSDevice9::GetMaxDepth(m_lastValidMsaa) < 32);
EnableWindow(GetDlgItem(m_hWnd, IDC_LOGZ), dx9 && hw);
EnableWindow(GetDlgItem(m_hWnd, IDC_FBA), dx9 && hw);
//EnableWindow(GetDlgItem(m_hWnd, IDC_AA1), sw); // Let users set software params regardless of renderer used
//EnableWindow(GetDlgItem(m_hWnd, IDC_SWTHREADS_EDIT), sw);

View File

@ -38,7 +38,7 @@ void GSSetupPrimCodeGenerator::Generate()
{
mov(edx, dword[esp + _dscan]);
for(int i = 0; i < 5; i++)
for(int i = 0; i < (m_sel.notest ? 2 : 5); i++)
{
vmovaps(Xmm(3 + i), ptr[&m_shift[i]]);
}
@ -80,7 +80,7 @@ void GSSetupPrimCodeGenerator::Depth()
vpshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
vmovdqa(ptr[&m_local.d4.f], xmm2);
for(int i = 0; i < 4; i++)
for(int i = 0; i < (m_sel.notest ? 1 : 4); i++)
{
// m_local.d[i].f = GSVector4i(df * m_shift[i]).xxzzlh();
@ -103,7 +103,7 @@ void GSSetupPrimCodeGenerator::Depth()
vmulps(xmm1, xmm0, xmm3);
vmovdqa(ptr[&m_local.d4.z], xmm1);
for(int i = 0; i < 4; i++)
for(int i = 0; i < (m_sel.notest ? 1 : 4); i++)
{
// m_local.d[i].z = dz * m_shift[i];
@ -139,36 +139,6 @@ void GSSetupPrimCodeGenerator::Depth()
vmovdqa(xmm0, ptr[ecx + offsetof(GSVertexSW, t)]);
vpshufd(xmm0, xmm0, _MM_SHUFFLE(3, 3, 3, 3));
/*
// GSVector4 z = p.zzzz();
vshufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
if(m_sel.zoverflow)
{
// m_local.p.z = (GSVector4i(z * 0.5f) << 1) | (GSVector4i(z) & GSVector4i::x00000001());
vbroadcastss(xmm1, ptr[&GSVector4::m_half]);
vmulps(xmm1, xmm0);
vcvttps2dq(xmm1, xmm1);
vpslld(xmm1, 1);
vcvttps2dq(xmm0, xmm0);
vpcmpeqd(xmm2, xmm2);
vpsrld(xmm2, 31);
vpand(xmm0, xmm2);
vpor(xmm0, xmm1);
}
else
{
// m_local.p.z = GSVector4i(z);
vcvttps2dq(xmm0, xmm0);
}
*/
vmovdqa(ptr[&m_local.p.z], xmm0);
}
}
@ -210,7 +180,7 @@ void GSSetupPrimCodeGenerator::Texture()
vshufps(xmm1, xmm0, xmm0, (uint8)_MM_SHUFFLE(j, j, j, j));
for(int i = 0; i < 4; i++)
for(int i = 0; i < (m_sel.notest ? 1 : 4); i++)
{
// GSVector4 v = ds/dt * m_shift[i];
@ -272,7 +242,7 @@ void GSSetupPrimCodeGenerator::Color()
vshufps(xmm2, xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
vshufps(xmm3, xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
for(int i = 0; i < 4; i++)
for(int i = 0; i < (m_sel.notest ? 1 : 4); i++)
{
// GSVector4i r = GSVector4i(dr * m_shift[i]).ps32();
@ -302,7 +272,7 @@ void GSSetupPrimCodeGenerator::Color()
vshufps(xmm2, xmm0, xmm0, _MM_SHUFFLE(1, 1, 1, 1));
vshufps(xmm3, xmm0, xmm0, _MM_SHUFFLE(3, 3, 3, 3));
for(int i = 0; i < 4; i++)
for(int i = 0; i < (m_sel.notest ? 1 : 4); i++)
{
// GSVector4i g = GSVector4i(dg * m_shift[i]).ps32();

View File

@ -38,7 +38,7 @@ void GSSetupPrimCodeGenerator::Generate()
{
mov(edx, dword[esp + _dscan]);
for(int i = 0; i < 5; i++)
for(int i = 0; i < (m_sel.notest ? 2 : 5); i++)
{
movaps(Xmm(3 + i), ptr[&m_shift[i]]);
}
@ -82,7 +82,7 @@ void GSSetupPrimCodeGenerator::Depth()
pshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0));
movdqa(ptr[&m_local.d4.f], xmm2);
for(int i = 0; i < 4; i++)
for(int i = 0; i < (m_sel.notest ? 1 : 4); i++)
{
// m_local.d[i].f = GSVector4i(df * m_shift[i]).xxzzlh();
@ -107,7 +107,7 @@ void GSSetupPrimCodeGenerator::Depth()
mulps(xmm1, xmm3);
movdqa(ptr[&m_local.d4.z], xmm1);
for(int i = 0; i < 4; i++)
for(int i = 0; i < (m_sel.notest ? 1 : 4); i++)
{
// m_local.d[i].z = dz * m_shift[i];
@ -144,36 +144,6 @@ void GSSetupPrimCodeGenerator::Depth()
movdqa(xmm0, ptr[ecx + offsetof(GSVertexSW, t)]);
pshufd(xmm0, xmm0, _MM_SHUFFLE(3, 3, 3, 3));
/*
// GSVector4 z = p.zzzz();
shufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
if(m_sel.zoverflow)
{
// m_local.p.z = (GSVector4i(z * 0.5f) << 1) | (GSVector4i(z) & GSVector4i::x00000001());
movaps(xmm1, ptr[&GSVector4::m_half]);
mulps(xmm1, xmm0);
cvttps2dq(xmm1, xmm1);
pslld(xmm1, 1);
cvttps2dq(xmm0, xmm0);
pcmpeqd(xmm2, xmm2);
psrld(xmm2, 31);
pand(xmm0, xmm2);
por(xmm0, xmm1);
}
else
{
// m_local.p.z = GSVector4i(z);
cvttps2dq(xmm0, xmm0);
}
*/
movdqa(ptr[&m_local.p.z], xmm0);
}
}
@ -217,7 +187,7 @@ void GSSetupPrimCodeGenerator::Texture()
movaps(xmm1, xmm0);
shufps(xmm1, xmm1, (uint8)_MM_SHUFFLE(j, j, j, j));
for(int i = 0; i < 4; i++)
for(int i = 0; i < (m_sel.notest ? 1 : 4); i++)
{
// GSVector4 v = ds/dt * m_shift[i];
@ -282,7 +252,7 @@ void GSSetupPrimCodeGenerator::Color()
shufps(xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
shufps(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
for(int i = 0; i < 4; i++)
for(int i = 0; i < (m_sel.notest ? 1 : 4); i++)
{
// GSVector4i r = GSVector4i(dr * m_shift[i]).ps32();
@ -315,7 +285,7 @@ void GSSetupPrimCodeGenerator::Color()
shufps(xmm0, xmm0, _MM_SHUFFLE(1, 1, 1, 1));
shufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3));
for(int i = 0; i < 4; i++)
for(int i = 0; i < (m_sel.notest ? 1 : 4); i++)
{
// GSVector4i g = GSVector4i(dg * m_shift[i]).ps32();

View File

@ -26,7 +26,7 @@
//#define Offset_ST // Fixes Persona3 mini map alignment which is off even in software rendering
//#define Offset_UV // Fixes / breaks various titles
GSState::GSState(GSVertexTrace* vt, size_t vertex_stride)
GSState::GSState()
: m_version(6)
, m_mt(false)
, m_irq(NULL)
@ -35,24 +35,20 @@ GSState::GSState(GSVertexTrace* vt, size_t vertex_stride)
, m_crc(0)
, m_options(0)
, m_frameskip(0)
, m_vt(vt)
, m_vt(this)
, m_q(1.0f)
, m_texflush(true)
{
m_nativeres = !!theApp.GetConfig("nativeres", 0);
memset(&m_v, 0, sizeof(m_v));
m_q = 1.0f;
memset(&m_vertex, 0, sizeof(m_vertex));
memset(&m_index, 0, sizeof(m_index));
ASSERT(vertex_stride >= sizeof(GSVertex));
m_vertex.stride = vertex_stride;
m_vertex.tmp = (uint8*)_aligned_malloc(m_vertex.stride * 2, 32);
m_v.RGBAQ.Q = 1.0f;
GrowVertexBuffer();
memset(m_cv, 0, sizeof(m_cv));
m_sssize = 0;
m_sssize += sizeof(m_version);
@ -110,12 +106,16 @@ GSState::GSState(GSVertexTrace* vt, size_t vertex_stride)
Reset();
ResetHandlers();
s_n = 0;
s_dump = !!theApp.GetConfig("dump", 0);
s_save = !!theApp.GetConfig("save", 0);
s_savez = !!theApp.GetConfig("savez", 0);
s_saven = theApp.GetConfig("saven", 0);
}
GSState::~GSState()
{
_aligned_free(m_vertex.tmp);
if(m_vertex.buff) _aligned_free(m_vertex.buff);
if(m_index.buff) _aligned_free(m_index.buff);
}
@ -165,50 +165,28 @@ void GSState::SetFrameSkip(int skip)
{
m_fpGIFPackedRegHandlers[GIF_REG_XYZF2] = &GSState::GIFPackedRegHandlerNOP;
m_fpGIFPackedRegHandlers[GIF_REG_XYZ2] = &GSState::GIFPackedRegHandlerNOP;
m_fpGIFPackedRegHandlers[GIF_REG_CLAMP_1] = &GSState::GIFPackedRegHandlerNOP;
m_fpGIFPackedRegHandlers[GIF_REG_CLAMP_2] = &GSState::GIFPackedRegHandlerNOP;
m_fpGIFPackedRegHandlers[GIF_REG_FOG] = &GSState::GIFPackedRegHandlerNOP;
m_fpGIFPackedRegHandlers[GIF_REG_XYZF3] = &GSState::GIFPackedRegHandlerNOP;
m_fpGIFPackedRegHandlers[GIF_REG_XYZ3] = &GSState::GIFPackedRegHandlerNOP;
m_fpGIFRegHandlers[GIF_A_D_REG_PRIM] = &GSState::GIFRegHandlerNOP;
m_fpGIFRegHandlers[GIF_A_D_REG_RGBAQ] = &GSState::GIFRegHandlerNOP;
m_fpGIFRegHandlers[GIF_A_D_REG_ST] = &GSState::GIFRegHandlerNOP;
m_fpGIFRegHandlers[GIF_A_D_REG_UV] = &GSState::GIFRegHandlerNOP;
m_fpGIFRegHandlers[GIF_A_D_REG_XYZF2] = &GSState::GIFRegHandlerNOP;
m_fpGIFRegHandlers[GIF_A_D_REG_XYZ2] = &GSState::GIFRegHandlerNOP;
m_fpGIFRegHandlers[GIF_A_D_REG_XYZF3] = &GSState::GIFRegHandlerNOP;
m_fpGIFRegHandlers[GIF_A_D_REG_XYZ3] = &GSState::GIFRegHandlerNOP;
m_fpGIFRegHandlers[GIF_A_D_REG_PRMODECONT] = &GSState::GIFRegHandlerNOP;
m_fpGIFRegHandlers[GIF_A_D_REG_PRMODE] = &GSState::GIFRegHandlerNOP;
m_fpGIFPackedRegHandlersC[GIF_REG_STQRGBAXYZF2] = &GSState::GIFPackedRegHandlerNOP;
m_fpGIFPackedRegHandlersC[GIF_REG_STQRGBAXYZ2] = &GSState::GIFPackedRegHandlerNOP;
}
else
{
m_fpGIFPackedRegHandlers[GIF_REG_XYZF2] = &GSState::GIFPackedRegHandlerXYZF2<GS_INVALID, 0>;
m_fpGIFPackedRegHandlers[GIF_REG_XYZ2] = &GSState::GIFPackedRegHandlerXYZ2<GS_INVALID, 0>;
m_fpGIFPackedRegHandlers[GIF_REG_XYZF3] = &GSState::GIFPackedRegHandlerXYZF2<GS_INVALID, 1>;
m_fpGIFPackedRegHandlers[GIF_REG_XYZ3] = &GSState::GIFPackedRegHandlerXYZ2<GS_INVALID, 1>;
m_fpGIFPackedRegHandlers[GIF_REG_CLAMP_1] = (GIFPackedRegHandler)(GIFRegHandler)&GSState::GIFRegHandlerCLAMP<0>;
m_fpGIFPackedRegHandlers[GIF_REG_CLAMP_2] = (GIFPackedRegHandler)(GIFRegHandler)&GSState::GIFRegHandlerCLAMP<1>;
m_fpGIFPackedRegHandlers[GIF_REG_FOG] = &GSState::GIFPackedRegHandlerFOG;
m_fpGIFRegHandlers[GIF_A_D_REG_PRIM] = &GSState::GIFRegHandlerPRIM;
m_fpGIFRegHandlers[GIF_A_D_REG_RGBAQ] = &GSState::GIFRegHandlerRGBAQ;
m_fpGIFRegHandlers[GIF_A_D_REG_ST] = &GSState::GIFRegHandlerST;
m_fpGIFRegHandlers[GIF_A_D_REG_UV] = &GSState::GIFRegHandlerUV;
m_fpGIFRegHandlers[GIF_A_D_REG_XYZF2] = &GSState::GIFRegHandlerXYZF2<GS_INVALID, 0>;
m_fpGIFRegHandlers[GIF_A_D_REG_XYZ2] = &GSState::GIFRegHandlerXYZ2<GS_INVALID, 0>;
m_fpGIFRegHandlers[GIF_A_D_REG_XYZF3] = &GSState::GIFRegHandlerXYZF2<GS_INVALID, 1>;
m_fpGIFRegHandlers[GIF_A_D_REG_XYZ3] = &GSState::GIFRegHandlerXYZ2<GS_INVALID, 1>;
m_fpGIFRegHandlers[GIF_A_D_REG_PRMODECONT] = &GSState::GIFRegHandlerPRMODECONT;
m_fpGIFRegHandlers[GIF_A_D_REG_PRMODE] = &GSState::GIFRegHandlerPRMODE;
UpdateVertexKick();
}
}
void GSState::Reset()
{
printf("GS reset\n");
// FIXME: memset(m_mem.m_vm8, 0, m_mem.m_vmsize); // bios logo not shown cut in half after reset, missing graphics in GoW after first FMV
memset(&m_path[0], 0, sizeof(m_path[0]) * countof(m_path));
memset(&m_v, 0, sizeof(m_v));
@ -223,6 +201,8 @@ void GSState::Reset()
m_vertex.tail = 0;
m_vertex.next = 0;
m_index.tail = 0;
m_texflush = true;
}
void GSState::ResetHandlers()
@ -253,6 +233,8 @@ void GSState::ResetHandlers()
m_fpGIFRegHandlerXYZ[P][1] = &GSState::GIFRegHandlerXYZF2<P, 1>; \
m_fpGIFRegHandlerXYZ[P][2] = &GSState::GIFRegHandlerXYZ2<P, 0>; \
m_fpGIFRegHandlerXYZ[P][3] = &GSState::GIFRegHandlerXYZ2<P, 1>; \
m_fpGIFPackedRegHandlerSTQRGBAXYZF2[P] = &GSState::GIFPackedRegHandlerSTQRGBAXYZF2<P>; \
m_fpGIFPackedRegHandlerSTQRGBAXYZ2[P] = &GSState::GIFPackedRegHandlerSTQRGBAXYZ2<P>; \
SetHandlerXYZ(GS_POINTLIST);
SetHandlerXYZ(GS_LINELIST);
@ -334,6 +316,8 @@ GSVector4i GSState::GetDisplayRect(int i)
return r;
}
// There's a problem when games expand/shrink and relocate the visible area since GSdx doesn't support
// moving the output area. (Disgaea 2 intro FMV when upscaling is used, also those games hackfixed below.)
GSVector4i GSState::GetFrameRect(int i)
{
if(i < 0) i = IsEnabled(1) ? 1 : 0;
@ -356,12 +340,20 @@ GSVector4i GSState::GetFrameRect(int i)
r.top = m_regs->DISP[i].DISPFB.DBY;
r.right = r.left + w;
r.bottom = r.top + h;
//printf("%d %d %d %d %d %d\n",w,h,r.left,r.top,r.right,r.bottom);
/*static GSVector4i old_r = (GSVector4i) 0;
if ((old_r.left != r.left) || (old_r.right != r.right) || (old_r.top != r.top) || (old_r.right != r.right)){
printf("w %d h %d left %d top %d right %d bottom %d\n",w,h,r.left,r.top,r.right,r.bottom);
}
old_r = r;*/
return r;
}
GSVector2i GSState::GetDeviceSize(int i)
{
// TODO: return (m_regs->SMODE1.CMOD & 1) ? GSVector2i(640, 576) : GSVector2i(640, 480);
// TODO: other params of SMODE1 should affect the true device display size
// TODO2: pal games at 60Hz
@ -439,19 +431,12 @@ void GSState::GIFPackedRegHandlerRGBA(const GIFPackedReg* RESTRICT r)
m_v.RGBAQ.u32[0] = (uint32)GSVector4i::store(v);
#elif _M_SSE >= 0x200
#else
GSVector4i v = GSVector4i::load<false>(r) & GSVector4i::x000000ff();
m_v.RGBAQ.u32[0] = v.rgba32();
#else
m_v.RGBAQ.R = r->RGBA.R;
m_v.RGBAQ.G = r->RGBA.G;
m_v.RGBAQ.B = r->RGBA.B;
m_v.RGBAQ.A = r->RGBA.A;
#endif
m_v.RGBAQ.Q = m_q;
@ -463,16 +448,11 @@ void GSState::GIFPackedRegHandlerSTQ(const GIFPackedReg* RESTRICT r)
m_v.ST.u64 = r->u64[0];
#elif _M_SSE >= 0x200
#else
GSVector4i v = GSVector4i::loadl(r);
GSVector4i::storel(&m_v.ST.u64, v);
#else
m_v.ST.S = r->STQ.S;
m_v.ST.T = r->STQ.T;
#endif
m_q = r->STQ.Q;
@ -546,6 +526,69 @@ void GSState::GIFPackedRegHandlerNOP(const GIFPackedReg* RESTRICT r)
{
}
// Batched handler for packed GIF data laid out as repeating three-register
// groups (by its name: STQ, RGBA, XYZF2). `r` points at `size` packed
// registers; every loop iteration consumes three of them, rebuilds
// m_v.m[0] and m_v.m[1] and kicks one vertex through VertexKick<prim>.
// `prim` is forwarded unchanged as the VertexKick template argument.
template<uint32 prim>
void GSState::GIFPackedRegHandlerSTQRGBAXYZF2(const GIFPackedReg* RESTRICT r, uint32 size)
{
// size must be a non-zero multiple of 3. This is only enforced by ASSERT:
// with size == 0 the loop is skipped and the r[-3] read after it would
// address memory in front of the caller's buffer.
ASSERT(size > 0 && size % 3 == 0);
const GIFPackedReg* RESTRICT r_end = r + size;
while(r < r_end)
{
// r[0]: low 64 bits go to `st`, high 64 bits to `q`.
GSVector4i st = GSVector4i::loadl(&r[0].u64[0]);
GSVector4i q = GSVector4i::loadl(&r[0].u64[1]);
// r[1]: keep the low byte of every 32-bit lane, then narrow twice
// (presumably packing the four colour components into one 32-bit value -- confirm against GSVector4i).
GSVector4i rgba = (GSVector4i::load<false>(&r[1]) & GSVector4i::x000000ff()).ps32().pu16();
m_v.m[0] = st.upl64(rgba.upl32(q)); // TODO: only store the last one
// r[2]: low 64 bits hold the X/Y data, high 64 bits the Z/F data.
GSVector4i xy = GSVector4i::loadl(&r[2].u64[0]);
GSVector4i zf = GSVector4i::loadl(&r[2].u64[1]);
// The current m_v.UV value is read back and interleaved next to the
// repacked coordinates, so the rebuilt m_v.m[1] carries it along.
xy = xy.upl16(xy.srl<4>()).upl32(GSVector4i::loadl(&m_v.UV));
// Both lanes are shifted right by 4; the first is then masked with
// 0x00ffffff and the second with 0x000000ff.
zf = zf.srl32(4) & GSVector4i::x00ffffff().upl32(GSVector4i::x000000ff());
m_v.m[1] = xy.upl32(zf); // TODO: only store the last one
VertexKick<prim>(r[2].XYZF2.Skip());
r += 3;
}
// r now equals r_end, so r[-3] is the first register of the last group.
m_q = r[-3].STQ.Q; // remember the last one, STQ outputs this to the temp Q each time
}
// Batched handler for packed GIF data laid out as repeating three-register
// groups (by its name: STQ, RGBA, XYZ2). `r` points at `size` packed
// registers; every loop iteration consumes three of them, rebuilds
// m_v.m[0] and m_v.m[1] and kicks one vertex through VertexKick<prim>.
// Differs from the XYZF2 variant only in how r[2] is repacked.
template<uint32 prim>
void GSState::GIFPackedRegHandlerSTQRGBAXYZ2(const GIFPackedReg* RESTRICT r, uint32 size)
{
// size must be a non-zero multiple of 3. This is only enforced by ASSERT:
// with size == 0 the loop is skipped and the r[-3] read after it would
// address memory in front of the caller's buffer.
ASSERT(size > 0 && size % 3 == 0);
const GIFPackedReg* RESTRICT r_end = r + size;
while(r < r_end)
{
// r[0]: low 64 bits go to `st`, high 64 bits to `q`.
GSVector4i st = GSVector4i::loadl(&r[0].u64[0]);
GSVector4i q = GSVector4i::loadl(&r[0].u64[1]);
// r[1]: keep the low byte of every 32-bit lane, then narrow twice
// (presumably packing the four colour components into one 32-bit value -- confirm against GSVector4i).
GSVector4i rgba = (GSVector4i::load<false>(&r[1]) & GSVector4i::x000000ff()).ps32().pu16();
m_v.m[0] = st.upl64(rgba.upl32(q)); // TODO: only store the last one
// r[2]: low 64 bits hold the X/Y data, high 64 bits the Z data.
GSVector4i xy = GSVector4i::loadl(&r[2].u64[0]);
GSVector4i z = GSVector4i::loadl(&r[2].u64[1]);
// Unlike the XYZF2 variant, z is used as loaded: no shift and no mask.
GSVector4i xyz = xy.upl16(xy.srl<4>()).upl32(z);
// The current m_v.UV value is read back and placed in the upper half of
// the rebuilt m_v.m[1].
m_v.m[1] = xyz.upl64(GSVector4i::loadl(&m_v.UV)); // TODO: only store the last one
VertexKick<prim>(r[2].XYZ2.Skip());
r += 3;
}
// r now equals r_end, so r[-3] is the first register of the last group.
m_q = r[-3].STQ.Q; // remember the last one, STQ outputs this to the temp Q each time
}
// Batched (pointer + count) no-op handler: ignores both arguments and does
// nothing. SetFrameSkip stores it in the m_fpGIFPackedRegHandlersC slots for
// GIF_REG_STQRGBAXYZF2 and GIF_REG_STQRGBAXYZ2.
void GSState::GIFPackedRegHandlerNOP(const GIFPackedReg* RESTRICT r, uint32 size)
{
}
// GIFRegHandler*
void GSState::GIFRegHandlerNull(const GIFReg* RESTRICT r)
@ -553,13 +596,13 @@ void GSState::GIFRegHandlerNull(const GIFReg* RESTRICT r)
// ASSERT(0);
}
__forceinline void GSState::ApplyPRIM(const GIFRegPRIM& prim)
__forceinline void GSState::ApplyPRIM(uint32 prim)
{
// ASSERT(r->PRIM.PRIM < 7);
if(GSUtil::GetPrimClass(m_env.PRIM.PRIM) == GSUtil::GetPrimClass(prim.PRIM)) // NOTE: assume strips/fans are converted to lists
if(GSUtil::GetPrimClass(m_env.PRIM.PRIM) == GSUtil::GetPrimClass(prim & 7)) // NOTE: assume strips/fans are converted to lists
{
if((m_env.PRIM.u32[0] ^ prim.u32[0]) & 0x7f8) // all fields except PRIM
if((m_env.PRIM.u32[0] ^ prim) & 0x7f8) // all fields except PRIM
{
Flush();
}
@ -569,8 +612,8 @@ __forceinline void GSState::ApplyPRIM(const GIFRegPRIM& prim)
Flush();
}
m_env.PRIM = (GSVector4i)prim;
m_env.PRMODE._PRIM = prim.PRIM;
m_env.PRIM.u32[0] = prim;
m_env.PRMODE._PRIM = prim;
UpdateContext();
@ -590,7 +633,7 @@ void GSState::GIFRegHandlerPRIM(const GIFReg* RESTRICT r)
{
ALIGN_STACK(32);
ApplyPRIM(r->PRIM);
ApplyPRIM(r->PRIM.u32[0]);
}
void GSState::GIFRegHandlerRGBAQ(const GIFReg* RESTRICT r)
@ -681,17 +724,49 @@ template<int i> void GSState::ApplyTEX0(GIFRegTEX0& TEX0)
if(wt)
{
GIFRegBITBLTBUF BITBLTBUF;
BITBLTBUF.SBP = TEX0.CBP;
BITBLTBUF.SBW = 1;
BITBLTBUF.SPSM = TEX0.CSM;
GSVector4i r;
GSVector4i r = GSVector4i::zero();
if(TEX0.CSM == 0)
{
BITBLTBUF.SBP = TEX0.CBP;
BITBLTBUF.SBW = 1;
BITBLTBUF.SPSM = TEX0.CSM;
r.right = GSLocalMemory::m_psm[TEX0.CPSM].pgs.x;
r.bottom = GSLocalMemory::m_psm[TEX0.CPSM].pgs.y;
r.left = 0;
r.top = 0;
r.right = GSLocalMemory::m_psm[TEX0.CPSM].bs.x;
r.bottom = GSLocalMemory::m_psm[TEX0.CPSM].bs.y;
int blocks = 4;
if(GSLocalMemory::m_psm[TEX0.CPSM].bpp == 16)
{
blocks >>= 1;
}
if(GSLocalMemory::m_psm[TEX0.PSM].bpp == 4)
{
blocks >>= 1;
}
InvalidateLocalMem(BITBLTBUF, r, true);
for(int j = 0; j < blocks; j++, BITBLTBUF.SBP++)
{
InvalidateLocalMem(BITBLTBUF, r, true);
}
}
else
{
BITBLTBUF.SBP = TEX0.CBP;
BITBLTBUF.SBW = m_env.TEXCLUT.CBW;
BITBLTBUF.SPSM = TEX0.CSM;
r.left = m_env.TEXCLUT.COU;
r.top = m_env.TEXCLUT.COV;
r.right = r.left + GSLocalMemory::m_psm[TEX0.CPSM].pal;
r.bottom = r.top + 1;
InvalidateLocalMem(BITBLTBUF, r, true);
}
m_mem.m_clut.Write(m_env.CTXT[i].TEX0, m_env.TEXCLUT);
}
@ -701,8 +776,13 @@ template<int i> void GSState::GIFRegHandlerTEX0(const GIFReg* RESTRICT r)
{
GIFRegTEX0 TEX0 = r->TEX0;
if(TEX0.TW > 10) TEX0.TW = 10;
if(TEX0.TH > 10) TEX0.TH = 10;
// Tokyo Xtreme Racer Drift 2, TW/TH == 0, PRIM->FST == 1
// Just setting the max texture size to make the texture cache allocate some surface.
// The vertex trace will narrow the updated area down to the minimum, upper-left 8x8
// for a single letter, but it may address the whole thing if it wants to.
if(TEX0.TW > 10 || TEX0.TW == 0) TEX0.TW = 10;
if(TEX0.TH > 10 || TEX0.TH == 0) TEX0.TH = 10;
if((TEX0.TBW & 1) && (TEX0.PSM == PSM_PSMT8 || TEX0.PSM == PSM_PSMT4))
{
@ -915,7 +995,7 @@ void GSState::GIFRegHandlerFOGCOL(const GIFReg* RESTRICT r)
// Handler for the TEXFLUSH register. The register payload `r` is not read;
// the handler only raises m_texflush. The same flag is also set to true by
// the constructor's initialiser list and by Reset().
void GSState::GIFRegHandlerTEXFLUSH(const GIFReg* RESTRICT r)
{
// TRACE(_T("TEXFLUSH\n"));
m_texflush = true;
}
template<int i> void GSState::GIFRegHandlerSCISSOR(const GIFReg* RESTRICT r)
@ -1037,7 +1117,8 @@ template<int i> void GSState::GIFRegHandlerFRAME(const GIFReg* RESTRICT r)
{
m_env.CTXT[i].offset.fb = m_mem.GetOffset(r->FRAME.Block(), r->FRAME.FBW, r->FRAME.PSM);
m_env.CTXT[i].offset.zb = m_mem.GetOffset(m_env.CTXT[i].ZBUF.Block(), r->FRAME.FBW, m_env.CTXT[i].ZBUF.PSM);
m_env.CTXT[i].offset.fzb = m_mem.GetPixelOffset4(r->FRAME, m_env.CTXT[i].ZBUF);
m_env.CTXT[i].offset.fzb = m_mem.GetPixelOffset(r->FRAME, m_env.CTXT[i].ZBUF);
m_env.CTXT[i].offset.fzb4 = m_mem.GetPixelOffset4(r->FRAME, m_env.CTXT[i].ZBUF);
}
m_env.CTXT[i].FRAME = (GSVector4i)r->FRAME;
@ -1075,7 +1156,8 @@ template<int i> void GSState::GIFRegHandlerZBUF(const GIFReg* RESTRICT r)
if((m_env.CTXT[i].ZBUF.u32[0] ^ ZBUF.u32[0]) & 0x3f0001ff) // ZBP PSM
{
m_env.CTXT[i].offset.zb = m_mem.GetOffset(ZBUF.Block(), m_env.CTXT[i].FRAME.FBW, ZBUF.PSM);
m_env.CTXT[i].offset.fzb = m_mem.GetPixelOffset4(m_env.CTXT[i].FRAME, ZBUF);
m_env.CTXT[i].offset.fzb = m_mem.GetPixelOffset(m_env.CTXT[i].FRAME, ZBUF);
m_env.CTXT[i].offset.fzb4 = m_mem.GetPixelOffset4(m_env.CTXT[i].FRAME, ZBUF);
}
m_env.CTXT[i].ZBUF = (GSVector4i)ZBUF;
@ -1230,40 +1312,8 @@ void GSState::FlushPrim()
{
if(m_index.tail > 0)
{
if(0)
{
uint8* buff = new uint8[m_vertex.next];
GSVertex buff[2];
memset(buff, 0, m_vertex.next);
for(size_t i = 0; i < m_index.tail; i++)
{
ASSERT(m_index.buff[i] < m_vertex.next);
buff[m_index.buff[i]] = 1;
}
size_t count = 0;
for(size_t i = 0; i < m_vertex.next; i++)
{
if(buff[i] == 0)
{
count++;
}
}
if(count > 0)
{
printf("unref %lld %d/%d\n", m_perfmon.GetFrame(), count, m_vertex.next);
}
delete [] buff;
}
uint8* buff = m_vertex.tmp;
size_t stride = m_vertex.stride;
size_t head = m_vertex.head;
size_t tail = m_vertex.tail;
size_t next = m_vertex.next;
@ -1282,11 +1332,11 @@ void GSState::FlushPrim()
case GS_TRIANGLELIST:
case GS_TRIANGLESTRIP:
unused = tail - head;
memcpy(buff, &m_vertex.buff[stride * head], stride * unused);
memcpy(buff, &m_vertex.buff[head], sizeof(GSVertex) * unused);
break;
case GS_TRIANGLEFAN:
memcpy(buff, &m_vertex.buff[stride * head], stride); unused = 1;
if(tail - 1 > head) {memcpy(&buff[stride], &m_vertex.buff[stride * (tail - 1)], stride); unused = 2;}
buff[0] = m_vertex.buff[head]; unused = 1;
if(tail - 1 > head) {buff[1] = m_vertex.buff[tail - 1]; unused = 2;}
break;
case GS_INVALID:
break;
@ -1301,7 +1351,7 @@ void GSState::FlushPrim()
{
// FIXME: berserk fpsm = 27 (8H)
m_vt->Update(m_vertex.buff, m_index.buff, m_index.tail, GSUtil::GetPrimClass(PRIM->PRIM));
m_vt.Update(m_vertex.buff, m_index.buff, m_index.tail, GSUtil::GetPrimClass(PRIM->PRIM));
Draw();
@ -1315,7 +1365,7 @@ void GSState::FlushPrim()
if(unused > 0)
{
memcpy(m_vertex.buff, buff, stride * unused);
memcpy(m_vertex.buff, buff, sizeof(GSVertex) * unused);
m_vertex.tail = unused;
m_vertex.next = next > head ? next - head : 0;
@ -1641,7 +1691,7 @@ void GSState::SoftReset(uint32 mask)
m_env.TRXDIR.XDIR = 3; //-1 ; set it to invalid value
m_q = 1;
m_q = 1.0f;
}
void GSState::ReadFIFO(uint8* mem, int size)
@ -1665,6 +1715,8 @@ template void GSState::Transfer<1>(const uint8* mem, uint32 size);
template void GSState::Transfer<2>(const uint8* mem, uint32 size);
template void GSState::Transfer<3>(const uint8* mem, uint32 size);
static hash_map<uint64, uint64> s_tags;
template<int index> void GSState::Transfer(const uint8* mem, uint32 size)
{
GSPerfMonAutoTimer pmat(&m_perfmon);
@ -1679,6 +1731,16 @@ template<int index> void GSState::Transfer(const uint8* mem, uint32 size)
{
path.SetTag(mem);
if(0)
{
GIFTag* t = (GIFTag*)mem;
uint64 hash;
if(t->NREG < 8) hash = t->u32[2] & ((1 << t->NREG * 4) - 1);
else if(t->NREG < 16) {hash = t->u32[2]; ((uint32*)&hash)[1] = t->u32[3] & ((1 << (t->NREG - 8) * 4) - 1);}
else hash = t->u64[1];
s_tags[hash] += path.nloop * path.nreg;
}
mem += sizeof(GIFTag);
size--;
@ -1690,9 +1752,7 @@ template<int index> void GSState::Transfer(const uint8* mem, uint32 size)
if(path.tag.PRE && path.tag.FLG == GIF_FLG_PACKED)
{
GIFRegPRIM r;
r.u64 = path.tag.PRIM;
ApplyPRIM(r);
ApplyPRIM(path.tag.PRIM);
}
}
}
@ -1726,8 +1786,28 @@ template<int index> void GSState::Transfer(const uint8* mem, uint32 size)
{
size -= total;
if(path.adonly)
switch(path.type)
{
case GIFPath::TYPE_UNKNOWN:
{
uint32 reg = 0;
do
{
(this->*m_fpGIFPackedRegHandlers[path.GetReg(reg++)])((GIFPackedReg*)mem);
mem += sizeof(GIFPackedReg);
reg = reg & ((int)(reg - path.nreg) >> 31); // resets reg back to 0 when it becomes equal to path.nreg
}
while(--total > 0);
}
break;
case GIFPath::TYPE_ADONLY: // very common
do
{
(this->*m_fpGIFRegHandlers[((GIFPackedReg*)mem)->A_D.ADDR])(&((GIFPackedReg*)mem)->r);
@ -1735,20 +1815,28 @@ template<int index> void GSState::Transfer(const uint8* mem, uint32 size)
mem += sizeof(GIFPackedReg);
}
while(--total > 0);
}
else
{
uint32 reg = 0;
do
{
(this->*m_fpGIFPackedRegHandlers[path.GetReg(reg++)])((GIFPackedReg*)mem);
break;
case GIFPath::TYPE_STQRGBAXYZF2: // majority of the vertices are formatted like this
mem += sizeof(GIFPackedReg);
(this->*m_fpGIFPackedRegHandlersC[GIF_REG_STQRGBAXYZF2])((GIFPackedReg*)mem, total);
reg = reg & ((int)(reg - path.nreg) >> 31); // resets reg back to 0 when it becomes equal to path.nreg
}
while(--total > 0);
mem += total * sizeof(GIFPackedReg);
break;
case GIFPath::TYPE_STQRGBAXYZ2:
(this->*m_fpGIFPackedRegHandlersC[GIF_REG_STQRGBAXYZ2])((GIFPackedReg*)mem, total);
mem += total * sizeof(GIFPackedReg);
break;
default:
__assume(0);
}
path.nloop = 0;
@ -1952,6 +2040,12 @@ int GSState::Freeze(GSFreezeData* fd, bool sizeonly)
{
m_path[i].tag.NREG = m_path[i].nreg;
m_path[i].tag.NLOOP = m_path[i].nloop;
m_path[i].tag.REGS = 0;
for(size_t j = 0; j < countof(m_path[i].regs.u8); j++)
{
m_path[i].tag.u32[2 + (j >> 3)] |= m_path[i].regs.u8[j] << ((j & 7) << 2);
}
WriteState(data, &m_path[i].tag);
WriteState(data, &m_path[i].reg);
@ -2070,7 +2164,8 @@ int GSState::Defrost(const GSFreezeData* fd)
m_env.CTXT[i].offset.fb = m_mem.GetOffset(m_env.CTXT[i].FRAME.Block(), m_env.CTXT[i].FRAME.FBW, m_env.CTXT[i].FRAME.PSM);
m_env.CTXT[i].offset.zb = m_mem.GetOffset(m_env.CTXT[i].ZBUF.Block(), m_env.CTXT[i].FRAME.FBW, m_env.CTXT[i].ZBUF.PSM);
m_env.CTXT[i].offset.tex = m_mem.GetOffset(m_env.CTXT[i].TEX0.TBP0, m_env.CTXT[i].TEX0.TBW, m_env.CTXT[i].TEX0.PSM);
m_env.CTXT[i].offset.fzb = m_mem.GetPixelOffset4(m_env.CTXT[i].FRAME, m_env.CTXT[i].ZBUF);
m_env.CTXT[i].offset.fzb = m_mem.GetPixelOffset(m_env.CTXT[i].FRAME, m_env.CTXT[i].ZBUF);
m_env.CTXT[i].offset.fzb4 = m_mem.GetPixelOffset4(m_env.CTXT[i].FRAME, m_env.CTXT[i].ZBUF);
}
UpdateScissor();
@ -2104,6 +2199,8 @@ void GSState::UpdateScissor()
void GSState::UpdateVertexKick()
{
if(m_frameskip) return;
uint32 prim = PRIM->PRIM;
m_fpGIFPackedRegHandlers[GIF_REG_XYZF2] = m_fpGIFPackedRegHandlerXYZ[prim][0];
@ -2116,19 +2213,20 @@ void GSState::UpdateVertexKick()
m_fpGIFRegHandlers[GIF_A_D_REG_XYZ2] = m_fpGIFRegHandlerXYZ[prim][2];
m_fpGIFRegHandlers[GIF_A_D_REG_XYZ3] = m_fpGIFRegHandlerXYZ[prim][3];
m_cvf = m_cv[prim][PRIM->TME][PRIM->FST];
m_fpGIFPackedRegHandlersC[GIF_REG_STQRGBAXYZF2] = m_fpGIFPackedRegHandlerSTQRGBAXYZF2[prim];
m_fpGIFPackedRegHandlersC[GIF_REG_STQRGBAXYZ2] = m_fpGIFPackedRegHandlerSTQRGBAXYZ2[prim];
}
void GSState::GrowVertexBuffer()
{
int maxcount = std::max<int>(m_vertex.maxcount * 3 / 2, 10000);
uint8* vertex = (uint8*)_aligned_malloc(m_vertex.stride * maxcount, 16);
GSVertex* vertex = (GSVertex*)_aligned_malloc(sizeof(GSVertex) * maxcount, 16);
uint32* index = (uint32*)_aligned_malloc(sizeof(uint32) * maxcount * 3, 16); // worst case is slightly less than vertex number * 3
if(m_vertex.buff != NULL)
{
memcpy(vertex, m_vertex.buff, m_vertex.stride * m_vertex.tail);
memcpy(vertex, m_vertex.buff, sizeof(GSVertex) * m_vertex.tail);
_aligned_free(m_vertex.buff);
}
@ -2160,17 +2258,13 @@ __forceinline void GSState::VertexKick(uint32 skip)
GSVector4i v0(m_v.m[0]);
GSVector4i v1(m_v.m[1]);
GSVector4i* RESTRICT tailptr = (GSVector4i*)&m_vertex.buff[m_vertex.stride * tail];
GSVector4i* RESTRICT tailptr = (GSVector4i*)&m_vertex.buff[tail];
tailptr[0] = v0;
tailptr[1] = v1;
m_vertex.xy[xy_tail & 3] = GSVector4(v1.upl32(v1.sub16(GSVector4i::load(m_ofxy)).sra16(4)).upl16()); // zw not sign extended, only useful for eq tests
#ifdef _DEBUG
memset(&tailptr[2], 0, m_vertex.stride - sizeof(GSVertex));
#endif
m_vertex.tail = ++tail;
m_vertex.xy_tail = ++xy_tail;
@ -2286,8 +2380,6 @@ __forceinline void GSState::VertexKick(uint32 skip)
uint32* RESTRICT buff = &m_index.buff[m_index.tail];
size_t src_index = head;
switch(prim)
{
case GS_POINTLIST:
@ -2295,7 +2387,6 @@ __forceinline void GSState::VertexKick(uint32 skip)
m_vertex.head = head + 1;
m_vertex.next = head + 1;
m_index.tail += 1;
(this->*m_cvf)(head, head);
break;
case GS_LINELIST:
buff[0] = head + 0;
@ -2303,18 +2394,20 @@ __forceinline void GSState::VertexKick(uint32 skip)
m_vertex.head = head + 2;
m_vertex.next = head + 2;
m_index.tail += 2;
(this->*m_cvf)(head + 0, head + 0);
(this->*m_cvf)(head + 1, head + 1);
break;
case GS_LINESTRIP:
if(next < head) {head = next; m_vertex.tail = next + 2;}
if(next < head)
{
m_vertex.buff[next + 0] = m_vertex.buff[head + 0];
m_vertex.buff[next + 1] = m_vertex.buff[head + 1];
head = next;
m_vertex.tail = next + 2;
}
buff[0] = head + 0;
buff[1] = head + 1;
m_vertex.head = head + 1;
m_vertex.next = head + 2;
m_index.tail += 2;
if(head + 0 >= next) (this->*m_cvf)(head + 0, src_index + 0);
/*if(head + 1 >= next)*/ (this->*m_cvf)(head + 1, src_index + 1); // this is always a new vertex
break;
case GS_TRIANGLELIST:
buff[0] = head + 0;
@ -2323,21 +2416,22 @@ __forceinline void GSState::VertexKick(uint32 skip)
m_vertex.head = head + 3;
m_vertex.next = head + 3;
m_index.tail += 3;
(this->*m_cvf)(head + 0, head + 0);
(this->*m_cvf)(head + 1, head + 1);
(this->*m_cvf)(head + 2, head + 2);
break;
case GS_TRIANGLESTRIP:
if(next < head) {head = next; m_vertex.tail = next + 3;}
if(next < head)
{
m_vertex.buff[next + 0] = m_vertex.buff[head + 0];
m_vertex.buff[next + 1] = m_vertex.buff[head + 1];
m_vertex.buff[next + 2] = m_vertex.buff[head + 2];
head = next;
m_vertex.tail = next + 3;
}
buff[0] = head + 0;
buff[1] = head + 1;
buff[2] = head + 2;
m_vertex.head = head + 1;
m_vertex.next = head + 3;
m_index.tail += 3;
if(src_index + 0 >= next) (this->*m_cvf)(head + 0, src_index + 0);
if(src_index + 1 >= next) (this->*m_cvf)(head + 1, src_index + 1);
/*if(src_index + 2 >= next)*/ (this->*m_cvf)(head + 2, src_index + 2); // this is always a new vertex
break;
case GS_TRIANGLEFAN:
// TODO: remove gaps, next == head && head < tail - 3 || next > head && next < tail - 2 (very rare)
@ -2346,9 +2440,6 @@ __forceinline void GSState::VertexKick(uint32 skip)
buff[2] = tail - 1;
m_vertex.next = tail;
m_index.tail += 3;
if(head >= next) (this->*m_cvf)(head, head);
if(tail - 2 >= next) (this->*m_cvf)(tail - 2, tail - 2);
/*if(tail - 1 >= next)*/ (this->*m_cvf)(tail - 1, tail - 1); // this is always a new vertex
break;
case GS_SPRITE:
buff[0] = head + 0;
@ -2356,10 +2447,8 @@ __forceinline void GSState::VertexKick(uint32 skip)
m_vertex.head = head + 2;
m_vertex.next = head + 2;
m_index.tail += 2;
(this->*m_cvf)(head + 0, head + 0);
(this->*m_cvf)(head + 1, head + 1);
break;
case GS_INVALID:
case GS_INVALID:
m_vertex.tail = head;
break;
default:
@ -2425,7 +2514,7 @@ void GSState::GetTextureMinMax(GSVector4i& r, const GIFRegTEX0& TEX0, const GIFR
if(wms + wmt < 6)
{
GSVector4 st = m_vt->m_min.t.xyxy(m_vt->m_max.t);
GSVector4 st = m_vt.m_min.t.xyxy(m_vt.m_max.t);
if(linear)
{
@ -2503,7 +2592,7 @@ void GSState::GetTextureMinMax(GSVector4i& r, const GIFRegTEX0& TEX0, const GIFR
void GSState::GetAlphaMinMax()
{
if(m_vt->m_alpha.valid)
if(m_vt.m_alpha.valid)
{
return;
}
@ -2511,7 +2600,7 @@ void GSState::GetAlphaMinMax()
const GSDrawingEnvironment& env = m_env;
const GSDrawingContext* context = m_context;
GSVector4i a = m_vt->m_min.c.uph32(m_vt->m_max.c).zzww();
GSVector4i a = m_vt.m_min.c.uph32(m_vt.m_max.c).zzww();
if(PRIM->TME && context->TEX0.TCC)
{
@ -2563,9 +2652,9 @@ void GSState::GetAlphaMinMax()
}
}
m_vt->m_alpha.min = a.x;
m_vt->m_alpha.max = a.z;
m_vt->m_alpha.valid = true;
m_vt.m_alpha.min = a.x;
m_vt.m_alpha.max = a.z;
m_vt.m_alpha.valid = true;
}
bool GSState::TryAlphaTest(uint32& fm, uint32& zm)
@ -2582,8 +2671,8 @@ bool GSState::TryAlphaTest(uint32& fm, uint32& zm)
{
GetAlphaMinMax();
int amin = m_vt->m_alpha.min;
int amax = m_vt->m_alpha.max;
int amin = m_vt.m_alpha.min;
int amax = m_vt.m_alpha.max;
int aref = context->TEST.AREF;
@ -2667,8 +2756,8 @@ bool GSState::IsOpaque()
{
GetAlphaMinMax();
amin = m_vt->m_alpha.min;
amax = m_vt->m_alpha.max;
amin = m_vt.m_alpha.min;
amax = m_vt.m_alpha.max;
}
else if(context->ALPHA.C == 1)
{

View File

@ -59,8 +59,18 @@ class GSState : public GSAlignedClass<32>
GIFRegHandler m_fpGIFRegHandlers[256];
GIFRegHandler m_fpGIFRegHandlerXYZ[8][4];
typedef void (GSState::*GIFPackedRegHandlerC)(const GIFPackedReg* RESTRICT r, uint32 size);
GIFPackedRegHandlerC m_fpGIFPackedRegHandlersC[2];
GIFPackedRegHandlerC m_fpGIFPackedRegHandlerSTQRGBAXYZF2[8];
GIFPackedRegHandlerC m_fpGIFPackedRegHandlerSTQRGBAXYZ2[8];
template<uint32 prim> void GIFPackedRegHandlerSTQRGBAXYZF2(const GIFPackedReg* RESTRICT r, uint32 size);
template<uint32 prim> void GIFPackedRegHandlerSTQRGBAXYZ2(const GIFPackedReg* RESTRICT r, uint32 size);
void GIFPackedRegHandlerNOP(const GIFPackedReg* RESTRICT r, uint32 size);
template<int i> void ApplyTEX0(GIFRegTEX0& TEX0);
void ApplyPRIM(const GIFRegPRIM& PRIM);
void ApplyPRIM(uint32 prim);
void GIFRegHandlerNull(const GIFReg* RESTRICT r);
void GIFRegHandlerPRIM(const GIFReg* RESTRICT r);
@ -133,15 +143,14 @@ protected:
float m_q;
GSVector4 m_scissor;
uint32 m_ofxy;
bool m_texflush;
struct
{
uint8* buff;
size_t stride;
GSVertex* buff;
size_t head, tail, next, maxcount; // head: first vertex, tail: last vertex + 1, next: last indexed + 1
GSVector4 xy[4];
size_t xy_tail;
uint8* tmp;
} m_vertex;
struct
@ -150,26 +159,6 @@ protected:
size_t tail;
} m_index;
typedef void (GSState::*ConvertVertexPtr)(size_t dst_index, size_t src_index);
ConvertVertexPtr m_cv[8][2][2], m_cvf; // [PRIM][TME][FST]
#define InitConvertVertex2(T, P) \
m_cv[P][0][0] = (ConvertVertexPtr)&T::ConvertVertex<P, 0, 0>; \
m_cv[P][0][1] = (ConvertVertexPtr)&T::ConvertVertex<P, 0, 1>; \
m_cv[P][1][0] = (ConvertVertexPtr)&T::ConvertVertex<P, 1, 0>; \
m_cv[P][1][1] = (ConvertVertexPtr)&T::ConvertVertex<P, 1, 1>; \
#define InitConvertVertex(T) \
InitConvertVertex2(T, GS_POINTLIST) \
InitConvertVertex2(T, GS_LINELIST) \
InitConvertVertex2(T, GS_LINESTRIP) \
InitConvertVertex2(T, GS_TRIANGLELIST) \
InitConvertVertex2(T, GS_TRIANGLESTRIP) \
InitConvertVertex2(T, GS_TRIANGLEFAN) \
InitConvertVertex2(T, GS_SPRITE) \
InitConvertVertex2(T, GS_INVALID) \
void UpdateContext();
void UpdateScissor();
@ -182,7 +171,7 @@ protected:
// following functions need m_vt to be initialized
GSVertexTrace* m_vt;
GSVertexTrace m_vt;
void GetTextureMinMax(GSVector4i& r, const GIFRegTEX0& TEX0, const GIFRegCLAMP& CLAMP, bool linear);
void GetAlphaMinMax();
@ -205,8 +194,14 @@ public:
GSDump m_dump;
bool m_nativeres;
int s_n;
bool s_dump;
bool s_save;
bool s_savez;
int s_saven;
public:
GSState(GSVertexTrace* vt, size_t vertex_stride);
GSState();
virtual ~GSState();
void ResetHandlers();

View File

@ -167,6 +167,18 @@ GSTexture11::operator ID3D11ShaderResourceView*()
return m_srv;
}
// Conversion to the texture's unordered access view. The view is created on
// first use and cached in m_uav; later calls return the cached pointer.
// Returns whatever m_uav holds (possibly NULL) when the view cannot be
// created because the device or the texture is missing.
GSTexture11::operator ID3D11UnorderedAccessView*()
{
	// Nothing to do when the view already exists, or when there is no
	// device / texture to create it from.
	if(m_uav || !m_dev || !m_texture)
	{
		return m_uav;
	}

	// Same precondition the original checks before creating the view.
	ASSERT(!m_msaa);

	// The HRESULT is deliberately left unchecked, exactly as before; on
	// failure m_uav simply stays unset.
	m_dev->CreateUnorderedAccessView(m_texture, NULL, &m_uav);

	return m_uav;
}
GSTexture11::operator ID3D11RenderTargetView*()
{
ASSERT(m_dev);

View File

@ -30,6 +30,7 @@ class GSTexture11 : public GSTexture
CComPtr<ID3D11Texture2D> m_texture;
D3D11_TEXTURE2D_DESC m_desc;
CComPtr<ID3D11ShaderResourceView> m_srv;
CComPtr<ID3D11UnorderedAccessView> m_uav;
CComPtr<ID3D11RenderTargetView> m_rtv;
CComPtr<ID3D11DepthStencilView> m_dsv;
@ -43,6 +44,7 @@ public:
operator ID3D11Texture2D*();
operator ID3D11ShaderResourceView*();
operator ID3D11UnorderedAccessView*();
operator ID3D11RenderTargetView*();
operator ID3D11DepthStencilView*();
};

View File

@ -281,6 +281,8 @@ GSTextureCache::Target* GSTextureCache::LookupTarget(const GIFRegTEX0& TEX0, int
{
return NULL;
}
m_renderer->m_dev->ClearRenderTarget(dst->m_texture, 0); // new frame buffers after reset should be cleared, don't display memory garbage
}
else
{

View File

@ -178,6 +178,11 @@ GSTextureCacheSW::Texture::Texture(GSState* state, uint32 tw0, const GIFRegTEX0&
m_TEX0 = TEX0;
m_TEXA = TEXA;
if(m_tw == 0)
{
m_tw = std::max<int>(m_TEX0.TW, GSLocalMemory::m_psm[m_TEX0.PSM].pal == 0 ? 3 : 5); // makes one row 32 bytes at least, matches the smallest block size that is allocated for m_buff
}
memset(m_valid, 0, sizeof(m_valid));
memset(m_pages.bm, 0, sizeof(m_pages.bm));
@ -239,17 +244,6 @@ bool GSTextureCacheSW::Texture::Update(const GSVector4i& rect)
if(m_buff == NULL)
{
uint32 tw0 = std::max<int>(m_TEX0.TW, 5 - shift); // makes one row 32 bytes at least, matches the smallest block size that is allocated for m_buff
if(m_tw == 0)
{
m_tw = tw0;
}
else
{
ASSERT(m_tw >= tw0);
}
uint32 pitch = (1 << m_tw) << shift;
m_buff = _aligned_malloc(pitch * th * 4, 32);

View File

@ -82,13 +82,6 @@ bool GSDevice11::CreateTextureFX()
return true;
}
void GSDevice11::SetupIA(const void* vertex, int vertex_count, const uint32* index, int index_count, int prim)
{
IASetVertexBuffer(vertex, sizeof(GSVertexHW11), vertex_count);
IASetIndexBuffer(index, index_count);
IASetPrimitiveTopology((D3D11_PRIMITIVE_TOPOLOGY)prim);
}
void GSDevice11::SetupVS(VSSelector sel, const VSConstantBuffer* cb)
{
hash_map<uint32, GSVertexShader11 >::const_iterator i = m_vs.find(sel);
@ -118,6 +111,7 @@ void GSDevice11::SetupVS(VSSelector sel, const VSConstantBuffer* cb)
{"TEXCOORD", 1, DXGI_FORMAT_R32_FLOAT, 0, 12, D3D11_INPUT_PER_VERTEX_DATA, 0},
{"POSITION", 0, DXGI_FORMAT_R16G16_UINT, 0, 16, D3D11_INPUT_PER_VERTEX_DATA, 0},
{"POSITION", 1, DXGI_FORMAT_R32_UINT, 0, 20, D3D11_INPUT_PER_VERTEX_DATA, 0},
{"TEXCOORD", 2, DXGI_FORMAT_R16G16_UINT, 0, 24, D3D11_INPUT_PER_VERTEX_DATA, 0},
{"COLOR", 1, DXGI_FORMAT_R8G8B8A8_UNORM, 0, 28, D3D11_INPUT_PER_VERTEX_DATA, 0},
};

View File

@ -61,13 +61,6 @@ GSTexture* GSDevice9::CreateMskFix(uint32 size, uint32 msk, uint32 fix)
return t;
}
void GSDevice9::SetupIA(const void* vertex, int vertex_count, const uint32* index, int index_count, int prim)
{
IASetVertexBuffer(vertex, sizeof(GSVertexHW9), vertex_count);
IASetIndexBuffer(index, index_count);
IASetPrimitiveTopology((D3DPRIMITIVETYPE)prim);
}
void GSDevice9::SetupVS(VSSelector sel, const VSConstantBuffer* cb)
{
hash_map<uint32, GSVertexShader9>::const_iterator i = m_vs.find(sel);

View File

@ -28,9 +28,13 @@ InitializeConditionVariablePtr pInitializeConditionVariable;
WakeConditionVariablePtr pWakeConditionVariable;
WakeAllConditionVariablePtr pWakeAllConditionVariable;
SleepConditionVariableSRWPtr pSleepConditionVariableSRW;
InitializeSRWLockPtr pInitializeSRWLock;;
InitializeSRWLockPtr pInitializeSRWLock;
AcquireSRWLockExclusivePtr pAcquireSRWLockExclusive;
TryAcquireSRWLockExclusivePtr pTryAcquireSRWLockExclusive;
ReleaseSRWLockExclusivePtr pReleaseSRWLockExclusive;
AcquireSRWLockSharedPtr pAcquireSRWLockShared;
TryAcquireSRWLockSharedPtr pTryAcquireSRWLockShared;
ReleaseSRWLockSharedPtr pReleaseSRWLockShared;
class InitCondVar
{
@ -47,7 +51,11 @@ public:
pSleepConditionVariableSRW = (SleepConditionVariableSRWPtr)GetProcAddress(m_kernel32, "SleepConditionVariableSRW");
pInitializeSRWLock = (InitializeSRWLockPtr)GetProcAddress(m_kernel32, "InitializeSRWLock");
pAcquireSRWLockExclusive = (AcquireSRWLockExclusivePtr)GetProcAddress(m_kernel32, "AcquireSRWLockExclusive");
pTryAcquireSRWLockExclusive = (TryAcquireSRWLockExclusivePtr)GetProcAddress(m_kernel32, "TryAcquireSRWLockExclusive");
pReleaseSRWLockExclusive = (ReleaseSRWLockExclusivePtr)GetProcAddress(m_kernel32, "ReleaseSRWLockExclusive");
pAcquireSRWLockShared = (AcquireSRWLockSharedPtr)GetProcAddress(m_kernel32, "AcquireSRWLockShared");
pTryAcquireSRWLockShared = (TryAcquireSRWLockSharedPtr)GetProcAddress(m_kernel32, "TryAcquireSRWLockShared");
pReleaseSRWLockShared = (ReleaseSRWLockSharedPtr)GetProcAddress(m_kernel32, "ReleaseSRWLockShared");
}
virtual ~InitCondVar()

View File

@ -21,25 +21,56 @@
#pragma once
#include "GSdx.h"
// Interface for an object that supplies a thread body.
// Derived classes implement ThreadProc(); it is protected, so it is only
// reachable from the class hierarchy itself, not from outside callers.
class IGSThread
{
protected:
// Entry point of the thread's work; must be overridden by the concrete thread class.
virtual void ThreadProc() = 0;
};
// Abstract lock interface shared by the critical-section and
// condition-variable lock implementations in this header, so that code such
// as GSAutoLock and GSJobQueue can hold any of them through an IGSLock*.
class IGSLock
{
public:
	// Virtual because GSJobQueue owns its lock as an IGSLock* and deletes it
	// through that pointer; without this the derived destructor would not run.
	virtual ~IGSLock() {}

	// Acquires the lock.
	virtual void Lock() = 0;

	// Attempts to acquire the lock; the implementations in this header
	// return true when the acquisition succeeded.
	virtual bool TryLock() = 0;

	// Releases the lock.
	virtual void Unlock() = 0;
};
// Abstract signalling interface implemented by both the event-based and the
// condition-variable-based primitives in this header.
class IGSEvent
{
public:
	// Virtual because GSJobQueue owns its events as IGSEvent* and deletes them
	// through that pointer; without this the derived destructor would not run.
	virtual ~IGSEvent() {}

	// Signals the event.
	virtual void Set() = 0;

	// Blocks until the event is signalled. 'l' is the lock associated with the
	// wait; how it is used (and whether it may be NULL) is up to the
	// implementation. Returns true when the underlying wait reported success.
	virtual bool Wait(IGSLock* l) = 0;
};
#ifdef _WINDOWS
typedef void (WINAPI * InitializeConditionVariablePtr)(CONDITION_VARIABLE* ConditionVariable);
typedef void (WINAPI * WakeConditionVariablePtr)(CONDITION_VARIABLE* ConditionVariable);
typedef void (WINAPI * WakeAllConditionVariablePtr)(CONDITION_VARIABLE* ConditionVariable);
typedef void (WINAPI * SleepConditionVariableSRWPtr)(CONDITION_VARIABLE* ConditionVariable, SRWLOCK* SRWLock, DWORD dwMilliseconds, ULONG Flags);
typedef BOOL (WINAPI * SleepConditionVariableSRWPtr)(CONDITION_VARIABLE* ConditionVariable, SRWLOCK* SRWLock, DWORD dwMilliseconds, ULONG Flags);
typedef void (WINAPI * InitializeSRWLockPtr)(SRWLOCK* SRWLock);
typedef void (WINAPI * AcquireSRWLockExclusivePtr)(SRWLOCK* SRWLock);
typedef BOOLEAN (WINAPI * TryAcquireSRWLockExclusivePtr)(SRWLOCK* SRWLock);
typedef void (WINAPI * ReleaseSRWLockExclusivePtr)(SRWLOCK* SRWLock);
typedef void (WINAPI * AcquireSRWLockSharedPtr)(SRWLOCK* SRWLock);
typedef BOOLEAN (WINAPI * TryAcquireSRWLockSharedPtr)(SRWLOCK* SRWLock);
typedef void (WINAPI * ReleaseSRWLockSharedPtr)(SRWLOCK* SRWLock);
extern InitializeConditionVariablePtr pInitializeConditionVariable;
extern WakeConditionVariablePtr pWakeConditionVariable;
extern WakeAllConditionVariablePtr pWakeAllConditionVariable;
extern SleepConditionVariableSRWPtr pSleepConditionVariableSRW;
extern InitializeSRWLockPtr pInitializeSRWLock;;
extern InitializeSRWLockPtr pInitializeSRWLock;
extern AcquireSRWLockExclusivePtr pAcquireSRWLockExclusive;
extern TryAcquireSRWLockExclusivePtr pTryAcquireSRWLockExclusive;
extern ReleaseSRWLockExclusivePtr pReleaseSRWLockExclusive;
extern AcquireSRWLockSharedPtr pAcquireSRWLockShared;
extern TryAcquireSRWLockSharedPtr pTryAcquireSRWLockShared;
extern ReleaseSRWLockSharedPtr pReleaseSRWLockShared;
class GSThread
class GSThread : public IGSThread
{
DWORD m_ThreadId;
HANDLE m_hThread;
@ -47,8 +78,6 @@ class GSThread
static DWORD WINAPI StaticThreadProc(void* lpParam);
protected:
virtual void ThreadProc() = 0;
void CreateThread();
void CloseThread();
@ -57,7 +86,7 @@ public:
virtual ~GSThread();
};
class GSCritSec
class GSCritSec : public IGSLock
{
CRITICAL_SECTION m_cs;
@ -65,26 +94,25 @@ public:
GSCritSec() {InitializeCriticalSection(&m_cs);}
~GSCritSec() {DeleteCriticalSection(&m_cs);}
void Lock() {EnterCriticalSection(&m_cs);}
bool TryLock() {return TryEnterCriticalSection(&m_cs) == TRUE;}
void Unlock() {LeaveCriticalSection(&m_cs);}
void Lock() {EnterCriticalSection(&m_cs);}
bool TryLock() {return TryEnterCriticalSection(&m_cs) == TRUE;}
void Unlock() {LeaveCriticalSection(&m_cs);}
};
class GSEvent
class GSEvent : public IGSEvent
{
protected:
HANDLE m_hEvent;
public:
GSEvent(bool manual = false, bool initial = false) {m_hEvent = CreateEvent(NULL, manual, initial, NULL);}
GSEvent() {m_hEvent = CreateEvent(NULL, FALSE, FALSE, NULL);}
~GSEvent() {CloseHandle(m_hEvent);}
void Set() {SetEvent(m_hEvent);}
void Reset() {ResetEvent(m_hEvent);}
bool Wait() {return WaitForSingleObject(m_hEvent, INFINITE) == WAIT_OBJECT_0;}
bool Wait(IGSLock* l) {if(l) l->Unlock(); bool b = WaitForSingleObject(m_hEvent, INFINITE) == WAIT_OBJECT_0; if(l) l->Lock(); return b;}
};
class GSCondVarLock
class GSCondVarLock : public IGSLock
{
SRWLOCK m_lock;
@ -92,12 +120,13 @@ public:
GSCondVarLock() {pInitializeSRWLock(&m_lock);}
void Lock() {pAcquireSRWLockExclusive(&m_lock);}
// Non-blocking exclusive acquire. The API returns a BOOLEAN that is only
// specified as "nonzero" on success, so test against zero rather than TRUE.
bool TryLock() {return pTryAcquireSRWLockExclusive(&m_lock) != 0;}
void Unlock() {pReleaseSRWLockExclusive(&m_lock);}
operator SRWLOCK* () {return &m_lock;}
};
class GSCondVar
class GSCondVar : public IGSEvent
{
CONDITION_VARIABLE m_cv;
@ -105,7 +134,7 @@ public:
GSCondVar() {pInitializeConditionVariable(&m_cv);}
void Set() {pWakeConditionVariable(&m_cv);}
void Wait(GSCondVarLock& lock) {pSleepConditionVariableSRW(&m_cv, lock, INFINITE, 0);}
bool Wait(IGSLock* l) {return pSleepConditionVariableSRW(&m_cv, *(GSCondVarLock*)l, INFINITE, 0) != 0;}
operator CONDITION_VARIABLE* () {return &m_cv;}
};
@ -114,9 +143,8 @@ public:
#include <pthread.h>
#include <semaphore.h>
#include "GSdx.h"
class GSThread
class GSThread : public IGSThread
{
pthread_attr_t m_thread_attr;
pthread_t m_thread;
@ -124,8 +152,6 @@ class GSThread
static void* StaticThreadProc(void* param);
protected:
virtual void ThreadProc() = 0;
void CreateThread();
void CloseThread();
@ -134,16 +160,16 @@ public:
virtual ~GSThread();
};
class GSCritSec
class GSCritSec : public IGSLock
{
pthread_mutexattr_t m_mutex_attr;
pthread_mutex_t m_mutex;
public:
GSCritSec()
GSCritSec(bool recursive = true)
{
pthread_mutexattr_init(&m_mutex_attr);
pthread_mutexattr_settype(&m_mutex_attr, PTHREAD_MUTEX_RECURSIVE);
pthread_mutexattr_settype(&m_mutex_attr, recursive ? PTHREAD_MUTEX_RECURSIVE : PTHREAD_MUTEX_NORMAL);
pthread_mutex_init(&m_mutex, &m_mutex_attr);
}
@ -158,7 +184,7 @@ public:
void Unlock() {pthread_mutex_unlock(&m_mutex);}
};
class GSEvent
class GSEvent : public IGSEvent
{
protected:
sem_t m_sem;
@ -168,35 +194,18 @@ public:
~GSEvent() {sem_destroy(&m_sem);}
void Set() {sem_post(&m_sem);}
bool Wait() {return sem_wait(&m_sem) == 0;}
bool Wait(IGSLock* l) {if(l) l->Unlock(); bool b = sem_wait(&m_sem) == 0; if(l) l->Lock(); return b;}
};
// Note except the mutex attribute the code is same as GSCritSec object
class GSCondVarLock
class GSCondVarLock : public GSCritSec
{
pthread_mutexattr_t m_mutex_attr;
pthread_mutex_t m_mutex;
public:
GSCondVarLock()
GSCondVarLock() : GSCritSec(false)
{
pthread_mutexattr_init(&m_mutex_attr);
pthread_mutexattr_settype(&m_mutex_attr, PTHREAD_MUTEX_NORMAL);
pthread_mutex_init(&m_mutex, &m_mutex_attr);
}
virtual ~GSCondVarLock()
{
pthread_mutex_destroy(&m_mutex);
pthread_mutexattr_destroy(&m_mutex_attr);
}
void Lock() {pthread_mutex_lock(&m_mutex);}
void Unlock() {pthread_mutex_unlock(&m_mutex);}
operator pthread_mutex_t* () {return &m_mutex;}
};
class GSCondVar
class GSCondVar : public IGSEvent
{
pthread_cond_t m_cv;
pthread_condattr_t m_cv_attr;
@ -207,6 +216,7 @@ public:
pthread_condattr_init(&m_cv_attr);
pthread_cond_init(&m_cv, &m_cv_attr);
}
virtual ~GSCondVar()
{
pthread_condattr_destroy(&m_cv_attr);
@ -214,7 +224,7 @@ public:
}
void Set() {pthread_cond_signal(&m_cv);}
void Wait(GSCondVarLock& lock) {pthread_cond_wait(&m_cv, lock);}
// Waits on the condition variable using the mutex behind 'l'.
// 'l' is cast unchecked to GSCondVarLock and dereferenced, so it must be a
// non-NULL GSCondVarLock. Returns true when pthread_cond_wait returns 0.
bool Wait(IGSLock* l) {return pthread_cond_wait(&m_cv, *(GSCondVarLock*)l) == 0;}
operator pthread_cond_t* () {return &m_cv;}
};
@ -223,102 +233,49 @@ public:
class GSAutoLock
{
protected:
GSCritSec* m_cs;
IGSLock* m_lock;
public:
GSAutoLock(GSCritSec* cs) {m_cs = cs; m_cs->Lock();}
~GSAutoLock() {m_cs->Unlock();}
};
class GSEventSpin
{
protected:
volatile long m_sync;
volatile bool m_manual;
public:
GSEventSpin(bool manual = false, bool initial = false) {m_sync = initial ? 1 : 0; m_manual = manual;}
~GSEventSpin() {}
void Set() {_interlockedbittestandset(&m_sync, 0);}
void Reset() {_interlockedbittestandreset(&m_sync, 0);}
bool Wait()
{
if(m_manual) while(!m_sync) _mm_pause();
else while(!_interlockedbittestandreset(&m_sync, 0)) _mm_pause();
return true;
}
GSAutoLock(IGSLock* l) {(m_lock = l)->Lock();}
~GSAutoLock() {m_lock->Unlock();}
};
template<class T> class GSJobQueue : private GSThread
{
protected:
int m_count;
queue<T> m_queue;
volatile long m_count; // NOTE: it is the safest to have our own counter because m_queue.pop() might decrement its own before the last item runs out of its scope and gets destroyed (implementation dependent)
volatile bool m_exit;
struct {GSCritSec lock; GSEvent notempty; volatile long count;} m_ev;
struct {GSCondVar notempty, empty; GSCondVarLock lock; bool available;} m_cv;
IGSEvent* m_notempty;
IGSEvent* m_empty;
IGSLock* m_lock;
void ThreadProc()
{
if(m_cv.available)
m_lock->Lock();
while(true)
{
m_cv.lock.Lock();
while(true)
while(m_queue.empty())
{
while(m_queue.empty())
{
m_cv.notempty.Wait(m_cv.lock);
m_notempty->Wait(m_lock);
if(m_exit) {m_cv.lock.Unlock(); return;}
}
T& item = m_queue.front();
m_cv.lock.Unlock();
Process(item);
m_cv.lock.Lock();
m_queue.pop();
if(m_queue.empty())
{
m_cv.empty.Set();
}
if(m_exit) {m_lock->Unlock(); return;}
}
}
else
{
m_ev.lock.Lock();
while(true)
T& item = m_queue.front();
m_lock->Unlock();
Process(item);
m_lock->Lock();
m_queue.pop();
if(--m_count == 0)
{
while(m_queue.empty())
{
m_ev.lock.Unlock();
m_ev.notempty.Wait();
if(m_exit) {return;}
m_ev.lock.Lock();
}
T& item = m_queue.front();
m_ev.lock.Unlock();
Process(item);
m_ev.lock.Lock();
m_queue.pop();
_InterlockedDecrement(&m_ev.count);
m_empty->Set();
}
}
}
@ -328,19 +285,30 @@ public:
: m_count(0)
, m_exit(false)
{
m_ev.count = 0;
bool condvar = !!theApp.GetConfig("condvar", 1);
#ifdef _WINDOWS
m_cv.available = pInitializeConditionVariable != NULL;
#elif defined(_LINUX)
//m_cv.available = true;
m_cv.available = !!theApp.GetConfig("condvar", 1);
if(pInitializeConditionVariable == NULL)
{
condvar = false;
}
#endif
if(condvar)
{
m_notempty = new GSCondVar();
m_empty = new GSCondVar();
m_lock = new GSCondVarLock();
}
else
{
m_notempty = new GSEvent();
m_empty = new GSEvent();
m_lock = new GSCritSec();
}
CreateThread();
}
@ -348,68 +316,51 @@ public:
{
m_exit = true;
if(m_cv.available)
{
m_cv.notempty.Set();
}
else
{
m_ev.notempty.Set();
}
m_notempty->Set();
CloseThread();
delete m_notempty;
delete m_empty;
delete m_lock;
}
int GetCount() const
bool IsEmpty() const
{
return m_count;
ASSERT(m_count >= 0);
return m_count == 0;
}
virtual void Push(const T& item)
void Push(const T& item)
{
if(m_cv.available)
{
m_cv.lock.Lock();
m_lock->Lock();
m_queue.push(item);
m_queue.push(item);
m_cv.lock.Unlock();
m_cv.notempty.Set();
}
else
if(m_count++ == 0)
{
GSAutoLock l(&m_ev.lock);
m_queue.push(item);
_InterlockedIncrement(&m_ev.count);
m_ev.notempty.Set();
m_notempty->Set();
}
m_count++;
m_lock->Unlock();
}
virtual void Wait()
void Wait()
{
if(m_cv.available)
if(m_count > 0)
{
m_cv.lock.Lock();
m_lock->Lock();
while(!m_queue.empty())
while(m_count != 0)
{
m_cv.empty.Wait(m_cv.lock);
m_empty->Wait(m_lock);
}
m_cv.lock.Unlock();
ASSERT(m_queue.empty());
m_lock->Unlock();
}
else
{
// NOTE: it is the safest to have our own counter because m_queue.pop() might decrement its own before the last item runs out of its scope and gets destroyed (implementation dependent)
while(m_ev.count > 0) _mm_pause();
}
m_count++;
}
virtual void Process(T& item) = 0;

View File

@ -22,6 +22,48 @@
#include "stdafx.h"
#include "GSVector.h"
// Byte-mask lookup table indexed by a byte count n in [0, 16]:
// entry n has n bytes set to 0xff, filling the first constructor argument
// from its low byte upward, then the second, third and fourth arguments.
// NOTE(review): presumably the first constructor argument is the lowest
// 32-bit lane of the vector - confirm against the GSVector4i constructor.
const GSVector4i GSVector4i::m_xff[17] =
{
GSVector4i(0x00000000, 0x00000000, 0x00000000, 0x00000000),
GSVector4i(0x000000ff, 0x00000000, 0x00000000, 0x00000000),
GSVector4i(0x0000ffff, 0x00000000, 0x00000000, 0x00000000),
GSVector4i(0x00ffffff, 0x00000000, 0x00000000, 0x00000000),
GSVector4i(0xffffffff, 0x00000000, 0x00000000, 0x00000000),
GSVector4i(0xffffffff, 0x000000ff, 0x00000000, 0x00000000),
GSVector4i(0xffffffff, 0x0000ffff, 0x00000000, 0x00000000),
GSVector4i(0xffffffff, 0x00ffffff, 0x00000000, 0x00000000),
GSVector4i(0xffffffff, 0xffffffff, 0x00000000, 0x00000000),
GSVector4i(0xffffffff, 0xffffffff, 0x000000ff, 0x00000000),
GSVector4i(0xffffffff, 0xffffffff, 0x0000ffff, 0x00000000),
GSVector4i(0xffffffff, 0xffffffff, 0x00ffffff, 0x00000000),
GSVector4i(0xffffffff, 0xffffffff, 0xffffffff, 0x00000000),
GSVector4i(0xffffffff, 0xffffffff, 0xffffffff, 0x000000ff),
GSVector4i(0xffffffff, 0xffffffff, 0xffffffff, 0x0000ffff),
GSVector4i(0xffffffff, 0xffffffff, 0xffffffff, 0x00ffffff),
GSVector4i(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff),
};
// Nibble-mask lookup table indexed by a byte count n in [0, 16]:
// same layout as m_xff, but each selected byte is 0x0f (low nibble only)
// instead of 0xff.
// NOTE(review): presumably the first constructor argument is the lowest
// 32-bit lane of the vector - confirm against the GSVector4i constructor.
const GSVector4i GSVector4i::m_x0f[17] =
{
GSVector4i(0x00000000, 0x00000000, 0x00000000, 0x00000000),
GSVector4i(0x0000000f, 0x00000000, 0x00000000, 0x00000000),
GSVector4i(0x00000f0f, 0x00000000, 0x00000000, 0x00000000),
GSVector4i(0x000f0f0f, 0x00000000, 0x00000000, 0x00000000),
GSVector4i(0x0f0f0f0f, 0x00000000, 0x00000000, 0x00000000),
GSVector4i(0x0f0f0f0f, 0x0000000f, 0x00000000, 0x00000000),
GSVector4i(0x0f0f0f0f, 0x00000f0f, 0x00000000, 0x00000000),
GSVector4i(0x0f0f0f0f, 0x000f0f0f, 0x00000000, 0x00000000),
GSVector4i(0x0f0f0f0f, 0x0f0f0f0f, 0x00000000, 0x00000000),
GSVector4i(0x0f0f0f0f, 0x0f0f0f0f, 0x0000000f, 0x00000000),
GSVector4i(0x0f0f0f0f, 0x0f0f0f0f, 0x00000f0f, 0x00000000),
GSVector4i(0x0f0f0f0f, 0x0f0f0f0f, 0x000f0f0f, 0x00000000),
GSVector4i(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x00000000),
GSVector4i(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0000000f),
GSVector4i(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x00000f0f),
GSVector4i(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x000f0f0f),
GSVector4i(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f),
};
const GSVector4 GSVector4::m_ps0123(0.0f, 1.0f, 2.0f, 3.0f);
const GSVector4 GSVector4::m_ps4567(4.0f, 5.0f, 6.0f, 7.0f);
const GSVector4 GSVector4::m_half(0.5f);

View File

@ -79,6 +79,9 @@ class GSVector4;
__aligned(class, 16) GSVector4i
{
static const GSVector4i m_xff[17];
static const GSVector4i m_x0f[17];
public:
union
{
@ -2343,6 +2346,9 @@ public:
__forceinline static GSVector4i xfff8(const GSVector4i& v) {return xffffffff(v).sll16( 3);}
__forceinline static GSVector4i xfffc(const GSVector4i& v) {return xffffffff(v).sll16( 2);}
__forceinline static GSVector4i xfffe(const GSVector4i& v) {return xffffffff(v).sll16( 1);}
__forceinline static GSVector4i xff(int n) {return m_xff[n];}
__forceinline static GSVector4i x0f(int n) {return m_x0f[n];}
};
__aligned(class, 16) GSVector4
@ -2909,6 +2915,11 @@ public:
return GSVector4(aligned ? _mm_load_ps((const float*)p) : _mm_loadu_ps((const float*)p));
}
__forceinline static void storent(void* p, const GSVector4& v)
{
_mm_stream_ps((float*)p, v.m);
}
__forceinline static void storel(void* p, const GSVector4& v)
{
_mm_store_sd((double*)p, _mm_castps_pd(v.m));

View File

@ -37,7 +37,8 @@ __aligned(struct, 32) GSVertex
GIFRegST ST;
GIFRegRGBAQ RGBAQ;
GIFRegXYZ XYZ;
uint32 UV, FOG;
union {uint32 UV; struct {uint16 U, V;};};
uint32 FOG;
};
__m128i m[2];

View File

@ -37,45 +37,4 @@ __aligned(struct, 32) GSVertexHW9
GSVertexHW9& operator = (GSVertexHW9& v) {t = v.t; p = v.p; return *this;}
};
__aligned(union, 32) GSVertexHW11
{
struct
{
union
{
struct {float x, y;} t;
GIFRegST ST;
};
union
{
union {struct {uint8 r, g, b, a; float q;}; uint32 c0;};
GIFRegRGBAQ RGBAQ;
};
union
{
struct {union {struct {uint16 x, y;}; uint32 xy;}; uint32 z;} p;
GIFRegXYZ XYZ;
};
union
{
struct {uint32 _pad; union {struct {uint8 ta0, ta1, res, f;}; uint32 c1;};};
GIFRegFOG FOG;
};
};
GSVertexHW11& operator = (GSVertexHW11& v)
{
GSVector4i* RESTRICT src = (GSVector4i*)&v;
GSVector4i* RESTRICT dst = (GSVector4i*)this;
dst[0] = src[0];
dst[1] = src[1];
return *this;
}
};
#pragma pack(pop)

View File

@ -29,10 +29,38 @@ const GSVector4 GSVertexTrace::s_minmax(FLT_MAX, -FLT_MAX);
GSVertexTrace::GSVertexTrace(const GSState* state)
: m_state(state)
{
#define InitUpdate3(P, IIP, TME, FST, COLOR) \
m_fmm[COLOR][FST][TME][IIP][P] = &GSVertexTrace::FindMinMax<P, IIP, TME, FST, COLOR>;
#define InitUpdate2(P, IIP, TME) \
InitUpdate3(P, IIP, TME, 0, 0) \
InitUpdate3(P, IIP, TME, 0, 1) \
InitUpdate3(P, IIP, TME, 1, 0) \
InitUpdate3(P, IIP, TME, 1, 1) \
#define InitUpdate(P) \
InitUpdate2(P, 0, 0) \
InitUpdate2(P, 0, 1) \
InitUpdate2(P, 1, 0) \
InitUpdate2(P, 1, 1) \
InitUpdate(GS_POINT_CLASS);
InitUpdate(GS_LINE_CLASS);
InitUpdate(GS_TRIANGLE_CLASS);
InitUpdate(GS_SPRITE_CLASS);
}
void GSVertexTrace::Update(const void* vertex, const uint32* index, int count, GS_PRIM_CLASS primclass)
{
m_primclass = primclass;
uint32 iip = m_state->PRIM->IIP;
uint32 tme = m_state->PRIM->TME;
uint32 fst = m_state->PRIM->FST;
uint32 color = !(m_state->PRIM->TME && m_state->m_context->TEX0.TFX == TFX_DECAL && m_state->m_context->TEX0.TCC);
(this->*m_fmm[color][fst][tme][iip][primclass])(vertex, index, count);
m_eq.value = (m_min.c == m_max.c).mask() | ((m_min.p == m_max.p).mask() << 16) | ((m_min.t == m_max.t).mask() << 20);
m_alpha.valid = false;
@ -82,90 +110,350 @@ void GSVertexTrace::Update(const void* vertex, const uint32* index, int count, G
}
}
uint32 GSVertexTrace::Hash(GS_PRIM_CLASS primclass)
template<GS_PRIM_CLASS primclass, uint32 iip, uint32 tme, uint32 fst, uint32 color>
void GSVertexTrace::FindMinMax(const void* vertex, const uint32* index, int count)
{
m_primclass = primclass;
uint32 hash = m_primclass | (m_state->PRIM->IIP << 2) | (m_state->PRIM->TME << 3) | (m_state->PRIM->FST << 4);
if(!(m_state->PRIM->TME && m_state->m_context->TEX0.TFX == TFX_DECAL && m_state->m_context->TEX0.TCC))
{
hash |= 1 << 5;
}
return hash;
}
GSVertexTraceSW::GSVertexTraceSW(const GSState* state)
: GSVertexTrace(state)
, m_map("VertexTraceSW", NULL)
{
}
void GSVertexTraceSW::Update(const void* vertex, const uint32* index, int count, GS_PRIM_CLASS primclass)
{
m_map[Hash(primclass)](count, vertex, index, m_min, m_max);
GSVertexTrace::Update(vertex, index, count, primclass);
}
GSVertexTraceDX9::GSVertexTraceDX9(const GSState* state)
: GSVertexTrace(state)
, m_map("VertexTraceHW9", NULL)
{
}
void GSVertexTraceDX9::Update(const void* vertex, const uint32* index, int count, GS_PRIM_CLASS primclass)
{
m_map[Hash(primclass)](count, vertex, index, m_min, m_max);
const GSDrawingContext* context = m_state->m_context;
GSVector4 o(context->XYOFFSET);
GSVector4 s(1.0f / 16, 1.0f / 16, 1.0f, 1.0f);
int n = 1;
m_min.p = (m_min.p - o) * s;
m_max.p = (m_max.p - o) * s;
if(m_state->PRIM->TME)
switch(primclass)
{
if(m_state->PRIM->FST)
{
s = GSVector4(1 << (16 - 4), 1).xxyy();
}
else
{
s = GSVector4(0x10000 << context->TEX0.TW, 0x10000 << context->TEX0.TH, 1, 1);
}
m_min.t *= s;
m_max.t *= s;
case GS_POINT_CLASS:
n = 1;
break;
case GS_LINE_CLASS:
case GS_SPRITE_CLASS:
n = 2;
break;
case GS_TRIANGLE_CLASS:
n = 3;
break;
}
GSVertexTrace::Update(vertex, index, count, primclass);
}
GSVector4 tmin = s_minmax.xxxx();
GSVector4 tmax = s_minmax.yyyy();
GSVector4i cmin = GSVector4i::xffffffff();
GSVector4i cmax = GSVector4i::zero();
GSVertexTraceDX11::GSVertexTraceDX11(const GSState* state)
: GSVertexTrace(state)
, m_map("VertexTraceHW11", NULL)
{
}
#if _M_SSE >= 0x401
void GSVertexTraceDX11::Update(const void* vertex, const uint32* index, int count, GS_PRIM_CLASS primclass)
{
m_map[Hash(primclass)](count, vertex, index, m_min, m_max);
GSVector4i pmin = GSVector4i::xffffffff();
GSVector4i pmax = GSVector4i::zero();
const GSDrawingContext* context = m_state->m_context;
#else
GSVector4 pmin = s_minmax.xxxx();
GSVector4 pmax = s_minmax.yyyy();
#endif
const GSVertex* RESTRICT v = (GSVertex*)vertex;
for(int i = 0; i < count; i += n)
{
if(primclass == GS_POINT_CLASS)
{
GSVector4i c(v[index[i]].m[0]);
if(color)
{
cmin = cmin.min_u8(c);
cmax = cmax.max_u8(c);
}
if(tme)
{
if(!fst)
{
GSVector4 stq = GSVector4::cast(c);
GSVector4 q = stq.wwww();
stq = (stq.xyww() * q.rcpnr()).xyww(q);
tmin = tmin.min(stq);
tmax = tmax.max(stq);
}
else
{
GSVector4i uv(v[index[i]].m[1]);
GSVector4 st = GSVector4(uv.uph16()).xyxy();
tmin = tmin.min(st);
tmax = tmax.max(st);
}
}
GSVector4i xyzf(v[index[i]].m[1]);
GSVector4i xy = xyzf.upl16();
GSVector4i z = xyzf.yyyy();
#if _M_SSE >= 0x401
GSVector4i p = xy.blend16<0xf0>(z.uph32(xyzf));
pmin = pmin.min_u32(p);
pmax = pmax.max_u32(p);
#else
GSVector4 p = GSVector4(xy.upl64(z.srl32(1).upl32(xyzf.wwww())));
pmin = pmin.min(p);
pmax = pmax.max(p);
#endif
}
else if(primclass == GS_LINE_CLASS)
{
GSVector4i c0(v[index[i + 0]].m[0]);
GSVector4i c1(v[index[i + 1]].m[0]);
if(color)
{
if(iip)
{
cmin = cmin.min_u8(c0.min_u8(c1));
cmax = cmax.max_u8(c0.max_u8(c1));
}
else
{
cmin = cmin.min_u8(c1);
cmax = cmax.max_u8(c1);
}
}
if(tme)
{
if(!fst)
{
GSVector4 stq0 = GSVector4::cast(c0);
GSVector4 stq1 = GSVector4::cast(c1);
GSVector4 q = stq0.wwww(stq1).rcpnr();
stq0 = (stq0.xyww() * q.xxxx()).xyww(stq0);
stq1 = (stq1.xyww() * q.zzzz()).xyww(stq1);
tmin = tmin.min(stq0.min(stq1));
tmax = tmax.max(stq0.max(stq1));
}
else
{
GSVector4i uv0(v[index[i + 0]].m[1]);
GSVector4i uv1(v[index[i + 1]].m[1]);
GSVector4 st0 = GSVector4(uv0.uph16()).xyxy();
GSVector4 st1 = GSVector4(uv1.uph16()).xyxy();
tmin = tmin.min(st0.min(st1));
tmax = tmax.max(st0.max(st1));
}
}
GSVector4i xyzf0(v[index[i + 0]].m[1]);
GSVector4i xyzf1(v[index[i + 1]].m[1]);
GSVector4i xy0 = xyzf0.upl16();
GSVector4i z0 = xyzf0.yyyy();
GSVector4i xy1 = xyzf1.upl16();
GSVector4i z1 = xyzf1.yyyy();
#if _M_SSE >= 0x401
GSVector4i p0 = xy0.blend16<0xf0>(z0.uph32(xyzf0));
GSVector4i p1 = xy1.blend16<0xf0>(z1.uph32(xyzf1));
pmin = pmin.min_u32(p0.min_u32(p1));
pmax = pmax.max_u32(p0.max_u32(p1));
#else
GSVector4 p0 = GSVector4(xy0.upl64(z0.srl32(1).upl32(xyzf0.wwww())));
GSVector4 p1 = GSVector4(xy1.upl64(z1.srl32(1).upl32(xyzf1.wwww())));
pmin = pmin.min(p0.min(p1));
pmax = pmax.max(p0.max(p1));
#endif
}
else if(primclass == GS_TRIANGLE_CLASS)
{
GSVector4i c0(v[index[i + 0]].m[0]);
GSVector4i c1(v[index[i + 1]].m[0]);
GSVector4i c2(v[index[i + 2]].m[0]);
if(color)
{
if(iip)
{
cmin = cmin.min_u8(c2).min_u8(c0.min_u8(c1));
cmax = cmax.max_u8(c2).max_u8(c0.max_u8(c1));
}
else
{
cmin = cmin.min_u8(c2);
cmax = cmax.max_u8(c2);
}
}
if(tme)
{
if(!fst)
{
GSVector4 stq0 = GSVector4::cast(c0);
GSVector4 stq1 = GSVector4::cast(c1);
GSVector4 stq2 = GSVector4::cast(c2);
GSVector4 q = stq0.wwww(stq1).xzww(stq2).rcpnr();
stq0 = (stq0.xyww() * q.xxxx()).xyww(stq0);
stq1 = (stq1.xyww() * q.yyyy()).xyww(stq1);
stq2 = (stq2.xyww() * q.zzzz()).xyww(stq2);
tmin = tmin.min(stq2).min(stq0.min(stq1));
tmax = tmax.max(stq2).max(stq0.max(stq1));
}
else
{
GSVector4i uv0(v[index[i + 0]].m[1]);
GSVector4i uv1(v[index[i + 1]].m[1]);
GSVector4i uv2(v[index[i + 2]].m[1]);
GSVector4 st0 = GSVector4(uv0.uph16()).xyxy();
GSVector4 st1 = GSVector4(uv1.uph16()).xyxy();
GSVector4 st2 = GSVector4(uv2.uph16()).xyxy();
tmin = tmin.min(st2).min(st0.min(st1));
tmax = tmax.max(st2).max(st0.max(st1));
}
}
GSVector4i xyzf0(v[index[i + 0]].m[1]);
GSVector4i xyzf1(v[index[i + 1]].m[1]);
GSVector4i xyzf2(v[index[i + 2]].m[1]);
GSVector4i xy0 = xyzf0.upl16();
GSVector4i z0 = xyzf0.yyyy();
GSVector4i xy1 = xyzf1.upl16();
GSVector4i z1 = xyzf1.yyyy();
GSVector4i xy2 = xyzf2.upl16();
GSVector4i z2 = xyzf2.yyyy();
#if _M_SSE >= 0x401
GSVector4i p0 = xy0.blend16<0xf0>(z0.uph32(xyzf0));
GSVector4i p1 = xy1.blend16<0xf0>(z1.uph32(xyzf1));
GSVector4i p2 = xy2.blend16<0xf0>(z2.uph32(xyzf2));
pmin = pmin.min_u32(p2).min_u32(p0.min_u32(p1));
pmax = pmax.max_u32(p2).max_u32(p0.max_u32(p1));
#else
GSVector4 p0 = GSVector4(xy0.upl64(z0.srl32(1).upl32(xyzf0.wwww())));
GSVector4 p1 = GSVector4(xy1.upl64(z1.srl32(1).upl32(xyzf1.wwww())));
GSVector4 p2 = GSVector4(xy2.upl64(z2.srl32(1).upl32(xyzf2.wwww())));
pmin = pmin.min(p2).min(p0.min(p1));
pmax = pmax.max(p2).max(p0.max(p1));
#endif
}
else if(primclass == GS_SPRITE_CLASS)
{
GSVector4i c0(v[index[i + 0]].m[0]);
GSVector4i c1(v[index[i + 1]].m[0]);
if(color)
{
if(iip)
{
cmin = cmin.min_u8(c0.min_u8(c1));
cmax = cmax.max_u8(c0.max_u8(c1));
}
else
{
cmin = cmin.min_u8(c1);
cmax = cmax.max_u8(c1);
}
}
if(tme)
{
if(!fst)
{
GSVector4 stq0 = GSVector4::cast(c0);
GSVector4 stq1 = GSVector4::cast(c1);
GSVector4 q = stq1.wwww().rcpnr();
stq0 = (stq0.xyww() * q).xyww(stq1);
stq1 = (stq1.xyww() * q).xyww(stq1);
tmin = tmin.min(stq0.min(stq1));
tmax = tmax.max(stq0.max(stq1));
}
else
{
GSVector4i uv0(v[index[i + 0]].m[1]);
GSVector4i uv1(v[index[i + 1]].m[1]);
GSVector4 st0 = GSVector4(uv0.uph16()).xyxy();
GSVector4 st1 = GSVector4(uv1.uph16()).xyxy();
tmin = tmin.min(st0.min(st1));
tmax = tmax.max(st0.max(st1));
}
}
GSVector4i xyzf0(v[index[i + 0]].m[1]);
GSVector4i xyzf1(v[index[i + 1]].m[1]);
GSVector4i xy0 = xyzf0.upl16();
GSVector4i z0 = xyzf0.yyyy();
GSVector4i xy1 = xyzf1.upl16();
GSVector4i z1 = xyzf1.yyyy();
#if _M_SSE >= 0x401
GSVector4i p0 = xy0.blend16<0xf0>(z0.uph32(xyzf1));
GSVector4i p1 = xy1.blend16<0xf0>(z1.uph32(xyzf1));
pmin = pmin.min_u32(p0.min_u32(p1));
pmax = pmax.max_u32(p0.max_u32(p1));
#else
GSVector4 p0 = GSVector4(xy0.upl64(z0.srl32(1).upl32(xyzf1.wwww())));
GSVector4 p1 = GSVector4(xy1.upl64(z1.srl32(1).upl32(xyzf1.wwww())));
pmin = pmin.min(p0.min(p1));
pmax = pmax.max(p0.max(p1));
#endif
}
}
#if _M_SSE >= 0x401
pmin = pmin.blend16<0x30>(pmin.srl32(1));
pmax = pmax.blend16<0x30>(pmax.srl32(1));
#endif
GSVector4 o(context->XYOFFSET);
GSVector4 s(1.0f / 16, 1.0f / 16, 2.0f, 1.0f);
m_min.p = (m_min.p - o) * s;
m_max.p = (m_max.p - o) * s;
m_min.p = (GSVector4(pmin) - o) * s;
m_max.p = (GSVector4(pmax) - o) * s;
if(m_state->PRIM->TME)
if(tme)
{
if(m_state->PRIM->FST)
if(fst)
{
s = GSVector4(1 << (16 - 4), 1).xxyy();
}
@ -174,10 +462,23 @@ void GSVertexTraceDX11::Update(const void* vertex, const uint32* index, int coun
s = GSVector4(0x10000 << context->TEX0.TW, 0x10000 << context->TEX0.TH, 1, 1);
}
m_min.t *= s;
m_max.t *= s;
m_min.t = tmin * s;
m_max.t = tmax * s;
}
else
{
m_min.t = GSVector4::zero();
m_max.t = GSVector4::zero();
}
GSVertexTrace::Update(vertex, index, count, primclass);
if(color)
{
m_min.c = cmin.zzzz().u8to32();
m_max.c = cmax.zzzz().u8to32();
}
else
{
m_min.c = GSVector4i::zero();
m_max.c = GSVector4i::zero();
}
}

View File

@ -38,12 +38,15 @@ public:
protected:
const GSState* m_state;
uint32 Hash(GS_PRIM_CLASS primclass);
typedef void (*VertexTracePtr)(int count, const void* vertex, const uint32* index, Vertex& min, Vertex& max);
static const GSVector4 s_minmax;
typedef void (GSVertexTrace::*FindMinMaxPtr)(const void* vertex, const uint32* index, int count);
FindMinMaxPtr m_fmm[2][2][2][2][4];
template<GS_PRIM_CLASS primclass, uint32 iip, uint32 tme, uint32 fst, uint32 color>
void FindMinMax(const void* vertex, const uint32* index, int count);
public:
GS_PRIM_CLASS m_primclass;
@ -69,55 +72,7 @@ public:
GSVertexTrace(const GSState* state);
virtual ~GSVertexTrace() {}
virtual void Update(const void* vertex, const uint32* index, int count, GS_PRIM_CLASS primclass);
void Update(const void* vertex, const uint32* index, int count, GS_PRIM_CLASS primclass);
bool IsLinear() const {return m_filter.linear;}
};
__aligned(class, 32) GSVertexTraceSW : public GSVertexTrace
{
class CG : public GSCodeGenerator
{
public:
CG(const void* param, uint32 key, void* code, size_t maxsize);
};
GSCodeGeneratorFunctionMap<CG, uint32, VertexTracePtr> m_map;
public:
GSVertexTraceSW(const GSState* state);
void Update(const void* vertex, const uint32* index, int count, GS_PRIM_CLASS primclass);
};
__aligned(class, 32) GSVertexTraceDX9 : public GSVertexTrace
{
class CG : public GSCodeGenerator
{
public:
CG(const void* param, uint32 key, void* code, size_t maxsize);
};
GSCodeGeneratorFunctionMap<CG, uint32, VertexTracePtr> m_map;
public:
GSVertexTraceDX9(const GSState* state);
void Update(const void* vertex, const uint32* index, int count, GS_PRIM_CLASS primclass);
};
__aligned(class, 32) GSVertexTraceDX11 : public GSVertexTrace
{
class CG : public GSCodeGenerator
{
public:
CG(const void* param, uint32 key, void* code, size_t maxsize);
};
GSCodeGeneratorFunctionMap<CG, uint32, VertexTracePtr> m_map;
public:
GSVertexTraceDX11(const GSState* state);
void Update(const void* vertex, const uint32* index, int count, GS_PRIM_CLASS primclass);
};

View File

@ -1,496 +0,0 @@
/*
* Copyright (C) 2007-2009 Gabest
* http://www.gabest.org
*
* This Program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2, or (at your option)
* any later version.
*
* This Program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with GNU Make; see the file COPYING. If not, write to
* the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
* http://www.gnu.org/copyleft/gpl.html
*
*/
#include "stdafx.h"
#include "GSVertexTrace.h"
#if _M_SSE >= 0x500 && (defined(_M_AMD64) || defined(_WIN64))
using namespace Xbyak;
// Emits (Xbyak, AVX encodings, 64-bit) the min/max scan routine for GSVertexSW vertices.
//
// key layout: bits 0-1 primclass, bit 2 iip, bit 3 tme, bit 4 fst, bit 5 color.
// `param` is not used by this generator.
//
// Registers used by the generated routine:
//   ecx = remaining vertex count, rdx = pointer to the current vertex,
//   r8  = destination Vertex for minimums, r9 = destination Vertex for maximums
//   (presumably the Win64 argument registers - confirm against the caller),
//   xmm2/xmm3 = color min/max, xmm4/xmm5 = position min/max, xmm6/xmm7 = texcoord min/max.
GSVertexTrace::CGSW::CGSW(const void* param, uint32 key, void* code, size_t maxsize)
: GSCodeGenerator(code, maxsize)
{
uint32 primclass = (key >> 0) & 3;
uint32 iip = (key >> 2) & 1;
uint32 tme = (key >> 3) & 1;
uint32 fst = (key >> 4) & 1;
uint32 color = (key >> 5) & 1;
// n = number of vertices consumed per loop iteration (vertices per primitive)
int n = 1;
switch(primclass)
{
case GS_POINT_CLASS:
n = 1;
break;
case GS_LINE_CLASS:
case GS_SPRITE_CLASS:
n = 2;
break;
case GS_TRIANGLE_CLASS:
n = 3;
break;
}
// prologue: reserve stack and spill xmm6/xmm7, which are restored before ret()
// (the extra 8 bytes presumably re-align rsp to 16 for the aligned stores - confirm)
sub(rsp, 8 + 2 * 16);
vmovdqa(ptr[rsp + 0], xmm6);
vmovdqa(ptr[rsp + 16], xmm7);
// min.p = FLT_MAX;
// max.p = -FLT_MAX;
// (the two seeds are the floats at s_minmax + 0 and s_minmax + 4, broadcast to all lanes)
mov(rax, (size_t)&s_minmax);
vbroadcastss(xmm4, ptr[rax + 0]);
vbroadcastss(xmm5, ptr[rax + 4]);
if(color)
{
// min.c = FLT_MAX;
// max.c = -FLT_MAX;
vmovaps(xmm2, xmm4);
vmovaps(xmm3, xmm5);
}
if(tme)
{
// min.t = FLT_MAX;
// max.t = -FLT_MAX;
vmovaps(xmm6, xmm4);
vmovaps(xmm7, xmm5);
}
// for(int i = 0; i < count; i += step) {
align(16);
L("loop");
if(tme && !fst && primclass == GS_SPRITE_CLASS)
{
// sprites: take the divisor (lane 2 of t) once, from the second vertex, for both vertices
vmovaps(xmm1, ptr[rdx + 1 * sizeof(GSVertexSW) + offsetof(GSVertexSW, t)]);
vshufps(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
}
// the per-primitive body is unrolled n times at code-generation time
for(int j = 0; j < n; j++)
{
// without iip only the last vertex of the primitive contributes its color
if(color && (iip || j == n - 1))
{
// min.c = min.c.minv(v[i + j].c);
// max.c = max.c.maxv(v[i + j].c);
vmovaps(xmm0, ptr[rdx + j * sizeof(GSVertexSW) + offsetof(GSVertexSW, c)]);
vminps(xmm2, xmm0);
vmaxps(xmm3, xmm0);
}
// min.p = min.p.minv(v[i + j].p);
// max.p = max.p.maxv(v[i + j].p);
vmovaps(xmm0, ptr[rdx + j * sizeof(GSVertexSW) + offsetof(GSVertexSW, p)]);
vminps(xmm4, xmm0);
vmaxps(xmm5, xmm0);
if(tme)
{
// min.t = min.t.minv(v[i + j].t);
// max.t = max.t.maxv(v[i + j].t);
vmovaps(xmm0, ptr[rdx + j * sizeof(GSVertexSW) + offsetof(GSVertexSW, t)]);
if(!fst)
{
if(primclass != GS_SPRITE_CLASS)
{
// non-sprites: each vertex supplies its own divisor (lane 2 of t)
vshufps(xmm1, xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
}
// divide all lanes by the divisor, then put the divisor itself back into lanes 2 and 3
vdivps(xmm0, xmm1);
vshufps(xmm0, xmm1, _MM_SHUFFLE(3, 2, 1, 0));
}
vminps(xmm6, xmm0);
vmaxps(xmm7, xmm0);
}
}
// advance to the next primitive and loop while vertices remain (signed compare on ecx)
add(rdx, n * sizeof(GSVertexSW));
sub(ecx, n);
jg("loop");
// }
if(color)
{
// truncate the float colors to integers and shift right by 7 before storing
vcvttps2dq(xmm2, xmm2);
vpsrld(xmm2, 7);
vmovaps(ptr[r8 + offsetof(GSVertexTrace::Vertex, c)], xmm2);
vcvttps2dq(xmm3, xmm3);
vpsrld(xmm3, 7);
vmovaps(ptr[r9 + offsetof(GSVertexTrace::Vertex, c)], xmm3);
}
vmovaps(ptr[r8 + offsetof(GSVertexTrace::Vertex, p)], xmm4);
vmovaps(ptr[r9 + offsetof(GSVertexTrace::Vertex, p)], xmm5);
if(tme)
{
vmovaps(ptr[r8 + offsetof(GSVertexTrace::Vertex, t)], xmm6);
vmovaps(ptr[r9 + offsetof(GSVertexTrace::Vertex, t)], xmm7);
}
// epilogue: restore xmm6/xmm7 and release the stack reserved in the prologue
vmovdqa(xmm6, ptr[rsp + 0]);
vmovdqa(xmm7, ptr[rsp + 16]);
add(rsp, 8 + 2 * 16);
ret();
}
// Emits (Xbyak, AVX encodings, 64-bit) the min/max scan routine for GSVertexHW9 vertices.
//
// key layout: bits 0-1 primclass, bit 2 iip, bit 3 tme, bit 4 fst, bit 5 color.
// `param` is not used by this generator.
//
// Registers used by the generated routine:
//   ecx = remaining vertex count, rdx = pointer to the current vertex,
//   r8  = destination Vertex for minimums, r9 = destination Vertex for maximums
//   (presumably the Win64 argument registers - confirm against the caller),
//   xmm2/xmm3 = byte-wise color min/max, xmm4/xmm5 = position min/max,
//   xmm6/xmm7 = texcoord min/max.
GSVertexTrace::CGHW9::CGHW9(const void* param, uint32 key, void* code, size_t maxsize)
: GSCodeGenerator(code, maxsize)
{
uint32 primclass = (key >> 0) & 3;
uint32 iip = (key >> 2) & 1;
uint32 tme = (key >> 3) & 1;
uint32 fst = (key >> 4) & 1;
uint32 color = (key >> 5) & 1;
// n = number of vertices consumed per loop iteration; sprites use 6 vertices here
int n = 1;
switch(primclass)
{
case GS_POINT_CLASS:
n = 1;
break;
case GS_LINE_CLASS:
n = 2;
break;
case GS_TRIANGLE_CLASS:
n = 3;
break;
case GS_SPRITE_CLASS:
n = 6;
break;
}
// prologue: reserve stack and spill xmm6/xmm7, which are restored before ret()
// (the extra 8 bytes presumably re-align rsp to 16 for the aligned stores - confirm)
sub(rsp, 8 + 2 * 16);
vmovdqa(ptr[rsp + 0], xmm6);
vmovdqa(ptr[rsp + 16], xmm7);
// min.p = FLT_MAX;
// max.p = -FLT_MAX;
// (the two seeds are the floats at s_minmax + 0 and s_minmax + 4, broadcast to all lanes)
mov(rax, (size_t)&s_minmax);
vbroadcastss(xmm4, ptr[rax + 0]);
vbroadcastss(xmm5, ptr[rax + 4]);
if(color)
{
// min.c = 0xffffffff;
// max.c = 0;
vpcmpeqd(xmm2, xmm2);
vpxor(xmm3, xmm3);
}
if(tme)
{
// min.t = FLT_MAX;
// max.t = -FLT_MAX;
vmovaps(xmm6, xmm4);
vmovaps(xmm7, xmm5);
}
// for(int i = 0; i < count; i += step) {
align(16);
L("loop");
if(tme && !fst && primclass == GS_SPRITE_CLASS)
{
// sprites: take the divisor (p.w) once, from the last (6th) vertex of the sprite
vmovaps(xmm1, ptr[rdx + 5 * sizeof(GSVertexHW9) + offsetof(GSVertexHW9, p)]);
vshufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3));
}
// the per-primitive body is unrolled n times at code-generation time
for(int j = 0; j < n; j++)
{
// min.p = min.p.minv(v[i + j].p);
// max.p = max.p.maxv(v[i + j].p);
vmovaps(xmm0, ptr[rdx + j * sizeof(GSVertexHW9) + offsetof(GSVertexHW9, p)]);
vminps(xmm4, xmm0);
vmaxps(xmm5, xmm0);
if(tme && !fst && primclass != GS_SPRITE_CLASS)
{
// non-sprites: each vertex supplies its own divisor (p.w), taken while xmm0 still holds p
vshufps(xmm1, xmm0, xmm0, _MM_SHUFFLE(3, 3, 3, 3));
}
// one 16-byte load at offsetof(t) feeds both the color and the texcoord paths below
if(color && (iip || j == n - 1) || tme)
{
vmovaps(xmm0, ptr[rdx + j * sizeof(GSVertexHW9) + offsetof(GSVertexHW9, t)]);
}
// without iip only the last vertex of the primitive contributes its color
if(color && (iip || j == n - 1))
{
// min.c = min.c.min_u8(v[i + j].c);
// max.c = max.c.max_u8(v[i + j].c);
// (applied to all 16 bytes; only dword 2 is extracted after the loop)
vpminub(xmm2, xmm0);
vpmaxub(xmm3, xmm0);
}
if(tme)
{
vshufps(xmm0, xmm0, _MM_SHUFFLE(1, 0, 1, 0)); // avoid FP assist, high part is integral
if(!fst)
{
// t /= p.wwww();
vdivps(xmm0, xmm1);
}
// min.t = min.t.minv(v[i + j].t);
// max.t = max.t.maxv(v[i + j].t);
vminps(xmm6, xmm0);
vmaxps(xmm7, xmm0);
}
}
// advance to the next primitive and loop while vertices remain (signed compare on ecx)
add(rdx, n * sizeof(GSVertexHW9));
sub(ecx, n);
jg("loop");
// }
if(color)
{
// m_min.c = cmin.zzzz().u8to32();
// m_max.c = cmax.zzzz().u8to32();
vpshufd(xmm2, xmm2, _MM_SHUFFLE(2, 2, 2, 2));
vpmovzxbd(xmm2, xmm2);
vpshufd(xmm3, xmm3, _MM_SHUFFLE(2, 2, 2, 2));
vpmovzxbd(xmm3, xmm3);
vmovaps(ptr[r8 + offsetof(GSVertexTrace::Vertex, c)], xmm2);
vmovaps(ptr[r9 + offsetof(GSVertexTrace::Vertex, c)], xmm3);
}
// m_min.p = pmin;
// m_max.p = pmax;
vmovaps(ptr[r8 + offsetof(GSVertexTrace::Vertex, p)], xmm4);
vmovaps(ptr[r9 + offsetof(GSVertexTrace::Vertex, p)], xmm5);
if(tme)
{
// m_min.t = tmin.xyww(pmin);
// m_max.t = tmax.xyww(pmax);
// (lanes 0-1 keep t.x/t.y, lanes 2-3 both receive p.w)
vshufps(xmm6, xmm4, _MM_SHUFFLE(3, 3, 1, 0));
vshufps(xmm7, xmm5, _MM_SHUFFLE(3, 3, 1, 0));
vmovaps(ptr[r8 + offsetof(GSVertexTrace::Vertex, t)], xmm6);
vmovaps(ptr[r9 + offsetof(GSVertexTrace::Vertex, t)], xmm7);
}
// epilogue: restore xmm6/xmm7 and release the stack reserved in the prologue
vmovdqa(xmm6, ptr[rsp + 0]);
vmovdqa(xmm7, ptr[rsp + 16]);
add(rsp, 8 + 2 * 16);
ret();
}
// Emits (Xbyak, AVX encodings, 64-bit) the min/max scan routine for GSVertexHW11 vertices.
//
// key layout: bits 0-1 primclass, bit 2 iip, bit 3 tme, bit 4 fst, bit 5 color.
// `param` is not used by this generator.
//
// Each vertex is read as two 16-byte halves: the first (offset 0) carries the texcoord,
// a packed-byte color in dword 2 and q in dword 3; the second (offset 16) carries the
// integer position data.
//
// Registers used by the generated routine:
//   ecx = remaining vertex count, rdx = pointer to the current vertex,
//   r8  = destination Vertex for minimums, r9 = destination Vertex for maximums
//   (presumably the Win64 argument registers - confirm against the caller),
//   xmm2/xmm3 = byte-wise color min/max, xmm4/xmm5 = position min/max,
//   xmm6/xmm7 = texcoord min/max.
GSVertexTrace::CGHW11::CGHW11(const void* param, uint32 key, void* code, size_t maxsize)
: GSCodeGenerator(code, maxsize)
{
uint32 primclass = (key >> 0) & 3;
uint32 iip = (key >> 2) & 1;
uint32 tme = (key >> 3) & 1;
uint32 fst = (key >> 4) & 1;
uint32 color = (key >> 5) & 1;
// n = number of vertices consumed per loop iteration (vertices per primitive)
int n = 1;
switch(primclass)
{
case GS_POINT_CLASS:
n = 1;
break;
case GS_LINE_CLASS:
case GS_SPRITE_CLASS:
n = 2;
break;
case GS_TRIANGLE_CLASS:
n = 3;
break;
}
// prologue: reserve stack and spill xmm6/xmm7, which are restored before ret()
// (the extra 8 bytes presumably re-align rsp to 16 for the aligned stores - confirm)
sub(rsp, 8 + 2 * 16);
vmovdqa(ptr[rsp + 0], xmm6);
vmovdqa(ptr[rsp + 16], xmm7);
// min.p = FLT_MAX;
// max.p = -FLT_MAX;
// (the two seeds are the floats at s_minmax + 0 and s_minmax + 4, broadcast to all lanes)
mov(rax, (size_t)&s_minmax);
vbroadcastss(xmm4, ptr[rax + 0]);
vbroadcastss(xmm5, ptr[rax + 4]);
if(color)
{
// min.c = 0xffffffff;
// max.c = 0;
vpcmpeqd(xmm2, xmm2);
vpxor(xmm3, xmm3);
}
if(tme)
{
// min.t = FLT_MAX;
// max.t = -FLT_MAX;
vmovaps(xmm6, xmm4);
vmovaps(xmm7, xmm5);
}
// for(int i = 0; i < count; i += step) {
align(16);
L("loop");
// the per-primitive body is unrolled n times at code-generation time
for(int j = 0; j < n; j++)
{
// one 16-byte load of the first vertex half feeds both the color and the texcoord paths
if(color && (iip || j == n - 1) || tme)
{
vmovaps(xmm0, ptr[rdx + j * sizeof(GSVertexHW11)]);
}
// without iip only the last vertex of the primitive contributes its color
if(color && (iip || j == n - 1))
{
// byte-wise min/max over all 16 bytes; only dword 2 is extracted after the loop
vpminub(xmm2, xmm0);
vpmaxub(xmm3, xmm0);
}
if(tme)
{
if(!fst)
{
// keep an unshuffled copy so q (lane 3) can be broadcast below
vmovaps(xmm1, xmm0);
}
vshufps(xmm0, xmm0, _MM_SHUFFLE(3, 3, 1, 0)); // avoid FP assist, third dword is integral
if(!fst)
{
vshufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3));
vdivps(xmm0, xmm1);
vshufps(xmm0, xmm1, _MM_SHUFFLE(3, 3, 1, 0)); // restore q
}
vminps(xmm6, xmm0);
vmaxps(xmm7, xmm0);
}
// position: zero-extend the low four 16-bit words of the second half into xmm1,
// shift every dword of the original right by one, then combine so that
// xmm1 = { word0, word1, dword0 >> 1, dword1 >> 1 } and convert to float
vmovdqa(xmm0, ptr[rdx + j * sizeof(GSVertexHW11) + 16]);
vpmovzxwd(xmm1, xmm0);
vpsrld(xmm0, 1);
vpunpcklqdq(xmm1, xmm0);
vcvtdq2ps(xmm1, xmm1);
vminps(xmm4, xmm1);
vmaxps(xmm5, xmm1);
}
// advance to the next primitive and loop while vertices remain (signed compare on ecx)
add(rdx, n * sizeof(GSVertexHW11));
sub(ecx, n);
jg("loop");
// }
if(color)
{
// m_min.c = cmin.zzzz().u8to32();
// m_max.c = cmax.zzzz().u8to32();
vpshufd(xmm2, xmm2, _MM_SHUFFLE(2, 2, 2, 2));
vpmovzxbd(xmm2, xmm2);
vpshufd(xmm3, xmm3, _MM_SHUFFLE(2, 2, 2, 2));
vpmovzxbd(xmm3, xmm3);
vmovaps(ptr[r8 + offsetof(GSVertexTrace::Vertex, c)], xmm2);
vmovaps(ptr[r9 + offsetof(GSVertexTrace::Vertex, c)], xmm3);
}
// m_min.p = pmin.xyww();
// m_max.p = pmax.xyww();
vshufps(xmm4, xmm4, _MM_SHUFFLE(3, 3, 1, 0));
vshufps(xmm5, xmm5, _MM_SHUFFLE(3, 3, 1, 0));
vmovaps(ptr[r8 + offsetof(GSVertexTrace::Vertex, p)], xmm4);
vmovaps(ptr[r9 + offsetof(GSVertexTrace::Vertex, p)], xmm5);
if(tme)
{
// m_min.t = tmin;
// m_max.t = tmax;
vmovaps(ptr[r8 + offsetof(GSVertexTrace::Vertex, t)], xmm6);
vmovaps(ptr[r9 + offsetof(GSVertexTrace::Vertex, t)], xmm7);
}
// epilogue: restore xmm6/xmm7 and release the stack reserved in the prologue
vmovdqa(xmm6, ptr[rsp + 0]);
vmovdqa(xmm7, ptr[rsp + 16]);
add(rsp, 8 + 2 * 16);
ret();
}
#endif

View File

@ -1,543 +0,0 @@
/*
* Copyright (C) 2007-2009 Gabest
* http://www.gabest.org
*
* This Program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2, or (at your option)
* any later version.
*
* This Program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with GNU Make; see the file COPYING. If not, write to
* the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
* http://www.gnu.org/copyleft/gpl.html
*
*/
#include "stdafx.h"
#include "GSVertexTrace.h"
#if _M_SSE < 0x500 && (defined(_M_AMD64) || defined(_WIN64))
using namespace Xbyak;
// Emits (Xbyak, legacy SSE encodings, 64-bit) the min/max scan routine for GSVertexSW vertices.
//
// key layout: bits 0-1 primclass, bit 2 iip, bit 3 tme, bit 4 fst, bit 5 color.
// `param` is not used by this generator.
//
// Registers used by the generated routine:
//   ecx = remaining vertex count, rdx = pointer to the current vertex,
//   r8  = destination Vertex for minimums, r9 = destination Vertex for maximums
//   (presumably the Win64 argument registers - confirm against the caller),
//   xmm2/xmm3 = color min/max, xmm4/xmm5 = position min/max, xmm6/xmm7 = texcoord min/max.
GSVertexTrace::CGSW::CGSW(const void* param, uint32 key, void* code, size_t maxsize)
: GSCodeGenerator(code, maxsize)
{
uint32 primclass = (key >> 0) & 3;
uint32 iip = (key >> 2) & 1;
uint32 tme = (key >> 3) & 1;
uint32 fst = (key >> 4) & 1;
uint32 color = (key >> 5) & 1;
// n = number of vertices consumed per loop iteration (vertices per primitive)
int n = 1;
switch(primclass)
{
case GS_POINT_CLASS:
n = 1;
break;
case GS_LINE_CLASS:
case GS_SPRITE_CLASS:
n = 2;
break;
case GS_TRIANGLE_CLASS:
n = 3;
break;
}
// prologue: reserve stack and spill xmm6/xmm7, which are restored before ret()
// (the extra 8 bytes presumably re-align rsp to 16 for the aligned stores - confirm)
sub(rsp, 8 + 2 * 16);
movdqa(ptr[rsp + 0], xmm6);
movdqa(ptr[rsp + 16], xmm7);
// min.p = FLT_MAX;
// max.p = -FLT_MAX;
// (the two seeds are the floats at s_minmax + 0 and s_minmax + 4, splatted to all lanes)
mov(rax, (size_t)&s_minmax);
movss(xmm4, ptr[rax + 0]);
movss(xmm5, ptr[rax + 4]);
shufps(xmm4, xmm4, _MM_SHUFFLE(0, 0, 0, 0));
shufps(xmm5, xmm5, _MM_SHUFFLE(0, 0, 0, 0));
if(color)
{
// min.c = FLT_MAX;
// max.c = -FLT_MAX;
movaps(xmm2, xmm4);
movaps(xmm3, xmm5);
}
if(tme)
{
// min.t = FLT_MAX;
// max.t = -FLT_MAX;
movaps(xmm6, xmm4);
movaps(xmm7, xmm5);
}
// for(int i = 0; i < count; i += step) {
align(16);
L("loop");
if(tme && !fst && primclass == GS_SPRITE_CLASS)
{
// sprites: take the divisor (lane 2 of t) once, from the second vertex, for both vertices
movaps(xmm1, ptr[rdx + 1 * sizeof(GSVertexSW) + offsetof(GSVertexSW, t)]);
shufps(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
}
// the per-primitive body is unrolled n times at code-generation time
for(int j = 0; j < n; j++)
{
// without iip only the last vertex of the primitive contributes its color
if(color && (iip || j == n - 1))
{
// min.c = min.c.minv(v[i + j].c);
// max.c = max.c.maxv(v[i + j].c);
movaps(xmm0, ptr[rdx + j * sizeof(GSVertexSW) + offsetof(GSVertexSW, c)]);
minps(xmm2, xmm0);
maxps(xmm3, xmm0);
}
// min.p = min.p.minv(v[i + j].p);
// max.p = max.p.maxv(v[i + j].p);
movaps(xmm0, ptr[rdx + j * sizeof(GSVertexSW) + offsetof(GSVertexSW, p)]);
minps(xmm4, xmm0);
maxps(xmm5, xmm0);
if(tme)
{
// min.t = min.t.minv(v[i + j].t);
// max.t = max.t.maxv(v[i + j].t);
movaps(xmm0, ptr[rdx + j * sizeof(GSVertexSW) + offsetof(GSVertexSW, t)]);
if(!fst)
{
if(primclass != GS_SPRITE_CLASS)
{
// non-sprites: each vertex supplies its own divisor (lane 2 of t)
movaps(xmm1, xmm0);
shufps(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
}
// divide all lanes by the divisor, then put the divisor itself back into lanes 2 and 3
divps(xmm0, xmm1);
shufps(xmm0, xmm1, _MM_SHUFFLE(3, 2, 1, 0));
}
minps(xmm6, xmm0);
maxps(xmm7, xmm0);
}
}
// advance to the next primitive and loop while vertices remain.
// The counter is decremented as a 32-bit value (ecx), matching the other 64-bit
// generators: a 64-bit sub would also depend on the upper half of rcx, which is not
// defined for a 32-bit count argument.
add(rdx, n * sizeof(GSVertexSW));
sub(ecx, n);
jg("loop");
// }
if(color)
{
// truncate the float colors to integers and shift right by 7 before storing
cvttps2dq(xmm2, xmm2);
psrld(xmm2, 7);
movaps(ptr[r8 + offsetof(GSVertexTrace::Vertex, c)], xmm2);
cvttps2dq(xmm3, xmm3);
psrld(xmm3, 7);
movaps(ptr[r9 + offsetof(GSVertexTrace::Vertex, c)], xmm3);
}
movaps(ptr[r8 + offsetof(GSVertexTrace::Vertex, p)], xmm4);
movaps(ptr[r9 + offsetof(GSVertexTrace::Vertex, p)], xmm5);
if(tme)
{
movaps(ptr[r8 + offsetof(GSVertexTrace::Vertex, t)], xmm6);
movaps(ptr[r9 + offsetof(GSVertexTrace::Vertex, t)], xmm7);
}
// epilogue: restore xmm6/xmm7 and release the stack reserved in the prologue
movdqa(xmm6, ptr[rsp + 0]);
movdqa(xmm7, ptr[rsp + 16]);
add(rsp, 8 + 2 * 16);
ret();
}
// Emits (Xbyak, legacy SSE encodings, 64-bit) the min/max scan routine for GSVertexHW9 vertices.
//
// key layout: bits 0-1 primclass, bit 2 iip, bit 3 tme, bit 4 fst, bit 5 color.
// `param` is not used by this generator.
//
// Registers used by the generated routine:
//   ecx = remaining vertex count, rdx = pointer to the current vertex,
//   r8  = destination Vertex for minimums, r9 = destination Vertex for maximums
//   (presumably the Win64 argument registers - confirm against the caller),
//   xmm2/xmm3 = byte-wise color min/max, xmm4/xmm5 = position min/max,
//   xmm6/xmm7 = texcoord min/max.
GSVertexTrace::CGHW9::CGHW9(const void* param, uint32 key, void* code, size_t maxsize)
: GSCodeGenerator(code, maxsize)
{
uint32 primclass = (key >> 0) & 3;
uint32 iip = (key >> 2) & 1;
uint32 tme = (key >> 3) & 1;
uint32 fst = (key >> 4) & 1;
uint32 color = (key >> 5) & 1;
// n = number of vertices consumed per loop iteration; sprites use 6 vertices here
int n = 1;
switch(primclass)
{
case GS_POINT_CLASS:
n = 1;
break;
case GS_LINE_CLASS:
n = 2;
break;
case GS_TRIANGLE_CLASS:
n = 3;
break;
case GS_SPRITE_CLASS:
n = 6;
break;
}
// prologue: reserve stack and spill xmm6/xmm7, which are restored before ret()
// (the extra 8 bytes presumably re-align rsp to 16 for the aligned stores - confirm)
sub(rsp, 8 + 2 * 16);
movdqa(ptr[rsp + 0], xmm6);
movdqa(ptr[rsp + 16], xmm7);
// min.p = FLT_MAX;
// max.p = -FLT_MAX;
// The max seed is the second float of s_minmax (offset 4), as in the other generators;
// offset 16 would read past a single 16-byte vector.
mov(rax, (size_t)&s_minmax);
movss(xmm4, ptr[rax + 0]);
movss(xmm5, ptr[rax + 4]);
shufps(xmm4, xmm4, _MM_SHUFFLE(0, 0, 0, 0));
shufps(xmm5, xmm5, _MM_SHUFFLE(0, 0, 0, 0));
if(color)
{
// min.c = 0xffffffff;
// max.c = 0;
pcmpeqd(xmm2, xmm2);
pxor(xmm3, xmm3);
}
if(tme)
{
// min.t = FLT_MAX;
// max.t = -FLT_MAX;
movaps(xmm6, xmm4);
movaps(xmm7, xmm5);
}
// for(int i = 0; i < count; i += step) {
align(16);
L("loop");
if(tme && !fst && primclass == GS_SPRITE_CLASS)
{
// sprites: take the divisor (p.w) once, from the last (6th) vertex of the sprite
movaps(xmm1, ptr[rdx + 5 * sizeof(GSVertexHW9) + offsetof(GSVertexHW9, p)]);
shufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3));
}
// the per-primitive body is unrolled n times at code-generation time
for(int j = 0; j < n; j++)
{
// min.p = min.p.minv(v[i + j].p);
// max.p = max.p.maxv(v[i + j].p);
movaps(xmm0, ptr[rdx + j * sizeof(GSVertexHW9) + offsetof(GSVertexHW9, p)]);
minps(xmm4, xmm0);
maxps(xmm5, xmm0);
if(tme && !fst && primclass != GS_SPRITE_CLASS)
{
// non-sprites: each vertex supplies its own divisor (p.w), taken while xmm0 still holds p
movaps(xmm1, xmm0);
shufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3));
}
// one 16-byte load at offsetof(t) feeds both the color and the texcoord paths below
if(color && (iip || j == n - 1) || tme)
{
movaps(xmm0, ptr[rdx + j * sizeof(GSVertexHW9) + offsetof(GSVertexHW9, t)]);
}
// without iip only the last vertex of the primitive contributes its color
if(color && (iip || j == n - 1))
{
// min.c = min.c.min_u8(v[i + j].c);
// max.c = max.c.max_u8(v[i + j].c);
// (applied to all 16 bytes; only dword 2 is extracted after the loop)
pminub(xmm2, xmm0);
pmaxub(xmm3, xmm0);
}
if(tme)
{
shufps(xmm0, xmm0, _MM_SHUFFLE(1, 0, 1, 0)); // avoid FP assist, high part is integral
if(!fst)
{
// t /= p.wwww();
divps(xmm0, xmm1);
}
// min.t = min.t.minv(v[i + j].t);
// max.t = max.t.maxv(v[i + j].t);
minps(xmm6, xmm0);
maxps(xmm7, xmm0);
}
}
// advance to the next primitive and loop while vertices remain (signed compare on ecx)
add(rdx, n * sizeof(GSVertexHW9));
sub(ecx, n);
jg("loop");
// }
if(color)
{
// m_min.c = cmin.zzzz().u8to32();
// m_max.c = cmax.zzzz().u8to32();
if(m_cpu.has(util::Cpu::tSSE41))
{
pshufd(xmm2, xmm2, _MM_SHUFFLE(2, 2, 2, 2));
pmovzxbd(xmm2, xmm2);
pshufd(xmm3, xmm3, _MM_SHUFFLE(2, 2, 2, 2));
pmovzxbd(xmm3, xmm3);
}
else
{
// pre-SSE4.1 fallback: widen the high 8 bytes to words against zero, then the low
// 4 of those words to dwords, which leaves the zero-extended bytes of dword 2
pxor(xmm0, xmm0);
punpckhbw(xmm2, xmm0);
punpcklwd(xmm2, xmm0);
punpckhbw(xmm3, xmm0);
punpcklwd(xmm3, xmm0);
}
movaps(ptr[r8 + offsetof(GSVertexTrace::Vertex, c)], xmm2);
movaps(ptr[r9 + offsetof(GSVertexTrace::Vertex, c)], xmm3);
}
// m_min.p = pmin;
// m_max.p = pmax;
movaps(ptr[r8 + offsetof(GSVertexTrace::Vertex, p)], xmm4);
movaps(ptr[r9 + offsetof(GSVertexTrace::Vertex, p)], xmm5);
if(tme)
{
// m_min.t = tmin.xyww(pmin);
// m_max.t = tmax.xyww(pmax);
// (lanes 0-1 keep t.x/t.y, lanes 2-3 both receive p.w)
shufps(xmm6, xmm4, _MM_SHUFFLE(3, 3, 1, 0));
shufps(xmm7, xmm5, _MM_SHUFFLE(3, 3, 1, 0));
movaps(ptr[r8 + offsetof(GSVertexTrace::Vertex, t)], xmm6);
movaps(ptr[r9 + offsetof(GSVertexTrace::Vertex, t)], xmm7);
}
// epilogue: restore xmm6/xmm7 and release the stack reserved in the prologue
movdqa(xmm6, ptr[rsp + 0]);
movdqa(xmm7, ptr[rsp + 16]);
add(rsp, 8 + 2 * 16);
ret();
}
// Emits (Xbyak, legacy SSE encodings, 64-bit) the min/max scan routine for GSVertexHW11 vertices.
//
// key layout: bits 0-1 primclass, bit 2 iip, bit 3 tme, bit 4 fst, bit 5 color.
// `param` is not used by this generator.
//
// Each vertex is read as two 16-byte halves: the first (offset 0) carries the texcoord,
// a packed-byte color in dword 2 and q in dword 3; the second (offset 16) carries the
// integer position data.
//
// Registers used by the generated routine:
//   ecx = remaining vertex count, rdx = pointer to the current vertex,
//   r8  = destination Vertex for minimums, r9 = destination Vertex for maximums
//   (presumably the Win64 argument registers - confirm against the caller),
//   xmm2/xmm3 = byte-wise color min/max, xmm4/xmm5 = position min/max,
//   xmm6/xmm7 = texcoord min/max.
GSVertexTrace::CGHW11::CGHW11(const void* param, uint32 key, void* code, size_t maxsize)
: GSCodeGenerator(code, maxsize)
{
uint32 primclass = (key >> 0) & 3;
uint32 iip = (key >> 2) & 1;
uint32 tme = (key >> 3) & 1;
uint32 fst = (key >> 4) & 1;
uint32 color = (key >> 5) & 1;
// n = number of vertices consumed per loop iteration (vertices per primitive)
int n = 1;
switch(primclass)
{
case GS_POINT_CLASS:
n = 1;
break;
case GS_LINE_CLASS:
case GS_SPRITE_CLASS:
n = 2;
break;
case GS_TRIANGLE_CLASS:
n = 3;
break;
}
// prologue: reserve stack and spill xmm6/xmm7, which are restored before ret()
// (the extra 8 bytes presumably re-align rsp to 16 for the aligned stores - confirm)
sub(rsp, 8 + 2 * 16);
movdqa(ptr[rsp + 0], xmm6);
movdqa(ptr[rsp + 16], xmm7);
// min.p = FLT_MAX;
// max.p = -FLT_MAX;
// The max seed is the second float of s_minmax (offset 4), as in the other generators;
// offset 16 would read past a single 16-byte vector.
mov(rax, (size_t)&s_minmax);
movss(xmm4, ptr[rax + 0]);
movss(xmm5, ptr[rax + 4]);
shufps(xmm4, xmm4, _MM_SHUFFLE(0, 0, 0, 0));
shufps(xmm5, xmm5, _MM_SHUFFLE(0, 0, 0, 0));
if(color)
{
// min.c = 0xffffffff;
// max.c = 0;
pcmpeqd(xmm2, xmm2);
pxor(xmm3, xmm3);
}
if(tme)
{
// min.t = FLT_MAX;
// max.t = -FLT_MAX;
movaps(xmm6, xmm4);
movaps(xmm7, xmm5);
}
// for(int i = 0; i < count; i += step) {
align(16);
L("loop");
// the per-primitive body is unrolled n times at code-generation time
for(int j = 0; j < n; j++)
{
// one 16-byte load of the first vertex half feeds both the color and the texcoord paths
if(color && (iip || j == n - 1) || tme)
{
movaps(xmm0, ptr[rdx + j * sizeof(GSVertexHW11)]);
}
// without iip only the last vertex of the primitive contributes its color
if(color && (iip || j == n - 1))
{
// byte-wise min/max over all 16 bytes; only dword 2 is extracted after the loop
pminub(xmm2, xmm0);
pmaxub(xmm3, xmm0);
}
if(tme)
{
if(!fst)
{
// keep an unshuffled copy so q (lane 3) can be broadcast below
movaps(xmm1, xmm0);
}
shufps(xmm0, xmm0, _MM_SHUFFLE(3, 3, 1, 0)); // avoid FP assist, third dword is integral
if(!fst)
{
shufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3));
divps(xmm0, xmm1);
shufps(xmm0, xmm1, _MM_SHUFFLE(3, 3, 1, 0)); // restore q
}
minps(xmm6, xmm0);
maxps(xmm7, xmm0);
}
// position: zero-extend the low four 16-bit words of the second half into xmm1,
// shift every dword of the original right by one, then combine so that
// xmm1 = { word0, word1, dword0 >> 1, dword1 >> 1 } and convert to float
movdqa(xmm0, ptr[rdx + j * sizeof(GSVertexHW11) + 16]);
if(m_cpu.has(util::Cpu::tSSE41))
{
pmovzxwd(xmm1, xmm0);
}
else
{
// pre-SSE4.1 fallback: duplicate each low word into a dword, then shift the copy out
movdqa(xmm1, xmm0);
punpcklwd(xmm1, xmm1);
psrld(xmm1, 16);
}
psrld(xmm0, 1);
punpcklqdq(xmm1, xmm0);
cvtdq2ps(xmm1, xmm1);
minps(xmm4, xmm1);
maxps(xmm5, xmm1);
}
// advance to the next primitive and loop while vertices remain (signed compare on ecx)
add(rdx, n * sizeof(GSVertexHW11));
sub(ecx, n);
jg("loop");
// }
if(color)
{
// m_min.c = cmin.zzzz().u8to32();
// m_max.c = cmax.zzzz().u8to32();
if(m_cpu.has(util::Cpu::tSSE41))
{
pshufd(xmm2, xmm2, _MM_SHUFFLE(2, 2, 2, 2));
pmovzxbd(xmm2, xmm2);
pshufd(xmm3, xmm3, _MM_SHUFFLE(2, 2, 2, 2));
pmovzxbd(xmm3, xmm3);
}
else
{
// pre-SSE4.1 fallback: widen the high 8 bytes to words against zero, then the low
// 4 of those words to dwords, which leaves the zero-extended bytes of dword 2
pxor(xmm0, xmm0);
punpckhbw(xmm2, xmm0);
punpcklwd(xmm2, xmm0);
punpckhbw(xmm3, xmm0);
punpcklwd(xmm3, xmm0);
}
movaps(ptr[r8 + offsetof(GSVertexTrace::Vertex, c)], xmm2);
movaps(ptr[r9 + offsetof(GSVertexTrace::Vertex, c)], xmm3);
}
// m_min.p = pmin.xyww();
// m_max.p = pmax.xyww();
shufps(xmm4, xmm4, _MM_SHUFFLE(3, 3, 1, 0));
shufps(xmm5, xmm5, _MM_SHUFFLE(3, 3, 1, 0));
movaps(ptr[r8 + offsetof(GSVertexTrace::Vertex, p)], xmm4);
movaps(ptr[r9 + offsetof(GSVertexTrace::Vertex, p)], xmm5);
if(tme)
{
// m_min.t = tmin;
// m_max.t = tmax;
movaps(ptr[r8 + offsetof(GSVertexTrace::Vertex, t)], xmm6);
movaps(ptr[r9 + offsetof(GSVertexTrace::Vertex, t)], xmm7);
}
// epilogue: restore xmm6/xmm7 and release the stack reserved in the prologue
movdqa(xmm6, ptr[rsp + 0]);
movdqa(xmm7, ptr[rsp + 16]);
add(rsp, 8 + 2 * 16);
ret();
}
#endif

View File

@ -1,513 +0,0 @@
/*
* Copyright (C) 2007-2009 Gabest
* http://www.gabest.org
*
* This Program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2, or (at your option)
* any later version.
*
* This Program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with GNU Make; see the file COPYING. If not, write to
* the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
* http://www.gnu.org/copyleft/gpl.html
*
*/
#include "stdafx.h"
#include "GSVertexTrace.h"
#if _M_SSE >= 0x500 && !(defined(_M_AMD64) || defined(_WIN64))
using namespace Xbyak;
// Byte offsets from esp at which the generated 32-bit routines read their arguments.
// Every generator in this file emits push(ebx) before using them, so _args = 4
// presumably accounts for that saved register and the following +4 for the return
// address - confirm against the calling convention of the function pointer type.
// The trailing comments appear to name the corresponding x64 register/slot.
static const int _args = 4;
static const int _count = _args + 4; // rcx
static const int _vertex = _args + 8; // rdx
static const int _index = _args + 12; // r8
static const int _min = _args + 16; // r9
static const int _max = _args + 20; // _args + 4
// Emits (Xbyak, AVX encodings, 32-bit) the indexed min/max scan routine for GSVertexSW vertices.
//
// key layout: bits 0-1 primclass, bit 2 iip, bit 3 tme, bit 4 fst, bit 5 color.
// `param` is not used by this generator.
//
// Registers used by the generated routine:
//   ecx = remaining index count, ebx = pointer to the current uint32 index,
//   edx = vertex array base (reused as the max destination after the loop),
//   eax = byte offset of the current vertex (reused as the min destination after the loop),
//   xmm2/xmm3 = color min/max, xmm4/xmm5 = position min/max, xmm6/xmm7 = texcoord min/max.
GSVertexTraceSW::CG::CG(const void* param, uint32 key, void* code, size_t maxsize)
: GSCodeGenerator(code, maxsize)
{
uint32 primclass = (key >> 0) & 3;
uint32 iip = (key >> 2) & 1;
uint32 tme = (key >> 3) & 1;
uint32 fst = (key >> 4) & 1;
uint32 color = (key >> 5) & 1;
// n = number of indices consumed per loop iteration (vertices per primitive)
int n = 1;
switch(primclass)
{
case GS_POINT_CLASS:
n = 1;
break;
case GS_LINE_CLASS:
case GS_SPRITE_CLASS:
n = 2;
break;
case GS_TRIANGLE_CLASS:
n = 3;
break;
}
// ebx is used as the index pointer, so it is saved here and restored before ret()
push(ebx);
// min.p = FLT_MAX;
// max.p = -FLT_MAX;
vbroadcastss(xmm4, ptr[&s_minmax.x]);
vbroadcastss(xmm5, ptr[&s_minmax.y]);
if(color)
{
// min.c = FLT_MAX;
// max.c = -FLT_MAX;
vmovaps(xmm2, xmm4);
vmovaps(xmm3, xmm5);
}
if(tme)
{
// min.t = FLT_MAX;
// max.t = -FLT_MAX;
vmovaps(xmm6, xmm4);
vmovaps(xmm7, xmm5);
}
// for(int i = 0; i < count; i += step) {
mov(edx, dword[esp + _vertex]);
mov(ebx, dword[esp + _index]);
mov(ecx, dword[esp + _count]);
align(16);
L("loop");
if(tme && !fst && primclass == GS_SPRITE_CLASS)
{
// sprites: take the divisor (lane 2 of t) once, from the vertex named by the second index
mov(eax, ptr[ebx + 1 * sizeof(uint32)]);
shl(eax, 6); // * sizeof(GSVertexSW)
vmovaps(xmm1, ptr[edx + eax + offsetof(GSVertexSW, t)]);
vshufps(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
}
// the per-primitive body is unrolled n times at code-generation time
for(int j = 0; j < n; j++)
{
// eax = index[j] scaled to a byte offset into the vertex array
mov(eax, ptr[ebx + j * sizeof(uint32)]);
shl(eax, 6); // * sizeof(GSVertexSW)
// without iip only the last vertex of the primitive contributes its color
if(color && (iip || j == n - 1))
{
// min.c = min.c.minv(v[i + j].c);
// max.c = max.c.maxv(v[i + j].c);
vmovaps(xmm0, ptr[edx + eax + offsetof(GSVertexSW, c)]);
vminps(xmm2, xmm0);
vmaxps(xmm3, xmm0);
}
// min.p = min.p.minv(v[i + j].p);
// max.p = max.p.maxv(v[i + j].p);
vmovaps(xmm0, ptr[edx + eax + offsetof(GSVertexSW, p)]);
vminps(xmm4, xmm0);
vmaxps(xmm5, xmm0);
if(tme)
{
// min.t = min.t.minv(v[i + j].t);
// max.t = max.t.maxv(v[i + j].t);
vmovaps(xmm0, ptr[edx + eax + offsetof(GSVertexSW, t)]);
if(!fst)
{
if(primclass != GS_SPRITE_CLASS)
{
// non-sprites: each vertex supplies its own divisor (lane 2 of t)
vshufps(xmm1, xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
}
// divide all lanes by the divisor, then put the divisor itself back into lanes 2 and 3
vdivps(xmm0, xmm1);
vshufps(xmm0, xmm1, _MM_SHUFFLE(3, 2, 1, 0));
}
vminps(xmm6, xmm0);
vmaxps(xmm7, xmm0);
}
}
// advance the index pointer to the next primitive and loop while indices remain
add(ebx, n * sizeof(uint32));
sub(ecx, n);
jg("loop");
// }
// eax/edx now become the min/max destination pointers
mov(eax, dword[esp + _min]);
mov(edx, dword[esp + _max]);
if(color)
{
// truncate the float colors to integers and shift right by 7 before storing
vcvttps2dq(xmm2, xmm2);
vpsrld(xmm2, 7);
vmovaps(ptr[eax + offsetof(GSVertexTrace::Vertex, c)], xmm2);
vcvttps2dq(xmm3, xmm3);
vpsrld(xmm3, 7);
vmovaps(ptr[edx + offsetof(GSVertexTrace::Vertex, c)], xmm3);
}
vmovaps(ptr[eax + offsetof(GSVertexTrace::Vertex, p)], xmm4);
vmovaps(ptr[edx + offsetof(GSVertexTrace::Vertex, p)], xmm5);
if(tme)
{
vmovaps(ptr[eax + offsetof(GSVertexTrace::Vertex, t)], xmm6);
vmovaps(ptr[edx + offsetof(GSVertexTrace::Vertex, t)], xmm7);
}
pop(ebx);
ret();
}
// Emits (Xbyak, AVX encodings, 32-bit) the indexed min/max scan routine for GSVertexHW9 vertices.
//
// key layout: bits 0-1 primclass, bit 2 iip, bit 3 tme, bit 4 fst, bit 5 color.
// `param` is not used by this generator.
//
// Registers used by the generated routine:
//   ecx = remaining index count, ebx = pointer to the current uint32 index,
//   edx = vertex array base (reused as the max destination after the loop),
//   eax = byte offset of the current vertex (reused as the min destination after the loop),
//   xmm2/xmm3 = byte-wise color min/max, xmm4/xmm5 = position min/max,
//   xmm6/xmm7 = texcoord min/max.
GSVertexTraceDX9::CG::CG(const void* param, uint32 key, void* code, size_t maxsize)
: GSCodeGenerator(code, maxsize)
{
uint32 primclass = (key >> 0) & 3;
uint32 iip = (key >> 2) & 1;
uint32 tme = (key >> 3) & 1;
uint32 fst = (key >> 4) & 1;
uint32 color = (key >> 5) & 1;
// n = number of indices consumed per loop iteration (vertices per primitive)
// NOTE(review): sprites use n = 2 here, while the SSE 32-bit DX9 generator uses n = 6 -
// confirm which value matches the index layout actually produced for sprites.
int n = 1;
switch(primclass)
{
case GS_POINT_CLASS:
n = 1;
break;
case GS_SPRITE_CLASS:
case GS_LINE_CLASS:
n = 2;
break;
case GS_TRIANGLE_CLASS:
n = 3;
break;
}
// ebx is used as the index pointer, so it is saved here and restored before ret()
push(ebx);
// min.p = FLT_MAX;
// max.p = -FLT_MAX;
vbroadcastss(xmm4, ptr[&s_minmax.x]);
vbroadcastss(xmm5, ptr[&s_minmax.y]);
if(color)
{
// min.c = 0xffffffff;
// max.c = 0;
vpcmpeqd(xmm2, xmm2);
vpxor(xmm3, xmm3);
}
if(tme)
{
// min.t = FLT_MAX;
// max.t = -FLT_MAX;
vmovaps(xmm6, xmm4);
vmovaps(xmm7, xmm5);
}
// for(int i = 0; i < count; i += step) {
mov(edx, dword[esp + _vertex]);
mov(ebx, dword[esp + _index]);
mov(ecx, dword[esp + _count]);
align(16);
L("loop");
if(tme && !fst && primclass == GS_SPRITE_CLASS)
{
// sprites: take the divisor (p.w) once, from the vertex named by the second index
mov(eax, ptr[ebx + 1 * sizeof(uint32)]);
shl(eax, 5); // * sizeof(GSVertexHW9)
vmovaps(xmm1, ptr[edx + eax + offsetof(GSVertexHW9, p)]);
vshufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3));
}
// the per-primitive body is unrolled n times at code-generation time
for(int j = 0; j < n; j++)
{
// eax = index[j] scaled to a byte offset into the vertex array
mov(eax, ptr[ebx + j * sizeof(uint32)]);
shl(eax, 5); // * sizeof(GSVertexHW9)
// min.p = min.p.minv(v[i + j].p);
// max.p = max.p.maxv(v[i + j].p);
vmovaps(xmm0, ptr[edx + eax + offsetof(GSVertexHW9, p)]);
vminps(xmm4, xmm0);
vmaxps(xmm5, xmm0);
if(tme && !fst && primclass != GS_SPRITE_CLASS)
{
// non-sprites: each vertex supplies its own divisor (p.w), taken while xmm0 still holds p
vshufps(xmm1, xmm0, xmm0, _MM_SHUFFLE(3, 3, 3, 3));
}
// one 16-byte load at offsetof(t) feeds both the color and the texcoord paths below
if(color && (iip || j == n - 1) || tme)
{
vmovaps(xmm0, ptr[edx + eax + offsetof(GSVertexHW9, t)]);
}
// without iip only the last vertex of the primitive contributes its color
if(color && (iip || j == n - 1))
{
// min.c = min.c.min_u8(v[i + j].c);
// max.c = max.c.max_u8(v[i + j].c);
// (applied to all 16 bytes; only dword 2 is extracted after the loop)
vpminub(xmm2, xmm0);
vpmaxub(xmm3, xmm0);
}
if(tme)
{
vshufps(xmm0, xmm0, _MM_SHUFFLE(1, 0, 1, 0)); // avoid FP assist, high part is integral
if(!fst)
{
// t /= p.wwww();
vdivps(xmm0, xmm1);
}
// min.t = min.t.minv(v[i + j].t);
// max.t = max.t.maxv(v[i + j].t);
vminps(xmm6, xmm0);
vmaxps(xmm7, xmm0);
}
}
// advance the index pointer to the next primitive and loop while indices remain
add(ebx, n * sizeof(uint32));
sub(ecx, n);
jg("loop");
// }
// eax/edx now become the min/max destination pointers
mov(eax, dword[esp + _min]);
mov(edx, dword[esp + _max]);
if(color)
{
// m_min.c = cmin.zzzz().u8to32();
// m_max.c = cmax.zzzz().u8to32();
vpshufd(xmm2, xmm2, _MM_SHUFFLE(2, 2, 2, 2));
vpmovzxbd(xmm2, xmm2);
vpshufd(xmm3, xmm3, _MM_SHUFFLE(2, 2, 2, 2));
vpmovzxbd(xmm3, xmm3);
vmovaps(ptr[eax + offsetof(GSVertexTrace::Vertex, c)], xmm2);
vmovaps(ptr[edx + offsetof(GSVertexTrace::Vertex, c)], xmm3);
}
// m_min.p = pmin;
// m_max.p = pmax;
vmovaps(ptr[eax + offsetof(GSVertexTrace::Vertex, p)], xmm4);
vmovaps(ptr[edx + offsetof(GSVertexTrace::Vertex, p)], xmm5);
if(tme)
{
// m_min.t = tmin.xyww(pmin);
// m_max.t = tmax.xyww(pmax);
// (lanes 0-1 keep t.x/t.y, lanes 2-3 both receive p.w)
vshufps(xmm6, xmm4, _MM_SHUFFLE(3, 3, 1, 0));
vshufps(xmm7, xmm5, _MM_SHUFFLE(3, 3, 1, 0));
vmovaps(ptr[eax + offsetof(GSVertexTrace::Vertex, t)], xmm6);
vmovaps(ptr[edx + offsetof(GSVertexTrace::Vertex, t)], xmm7);
}
pop(ebx);
ret();
}
// Emits (Xbyak, AVX encodings, 32-bit) the indexed min/max scan routine for GSVertexHW11 vertices.
//
// key layout: bits 0-1 primclass, bit 2 iip, bit 3 tme, bit 4 fst, bit 5 color.
// `param` is not used by this generator.
//
// Each vertex is read as two 16-byte halves: the first (offset 0) carries the texcoord,
// a packed-byte color in dword 2 and q in dword 3; the second (offset 16) carries the
// integer position data.
//
// Registers used by the generated routine:
//   ecx = remaining index count, ebx = pointer to the current uint32 index,
//   edx = vertex array base (reused as the max destination after the loop),
//   eax = byte offset of the current vertex (reused as the min destination after the loop),
//   xmm2/xmm3 = byte-wise color min/max, xmm4/xmm5 = position min/max,
//   xmm6/xmm7 = texcoord min/max.
GSVertexTraceDX11::CG::CG(const void* param, uint32 key, void* code, size_t maxsize)
: GSCodeGenerator(code, maxsize)
{
uint32 primclass = (key >> 0) & 3;
uint32 iip = (key >> 2) & 1;
uint32 tme = (key >> 3) & 1;
uint32 fst = (key >> 4) & 1;
uint32 color = (key >> 5) & 1;
// n = number of indices consumed per loop iteration (vertices per primitive)
int n = 1;
switch(primclass)
{
case GS_POINT_CLASS:
n = 1;
break;
case GS_LINE_CLASS:
case GS_SPRITE_CLASS:
n = 2;
break;
case GS_TRIANGLE_CLASS:
n = 3;
break;
}
// ebx is used as the index pointer, so it is saved here and restored before ret()
push(ebx);
// min.p = FLT_MAX;
// max.p = -FLT_MAX;
vbroadcastss(xmm4, ptr[&s_minmax.x]);
vbroadcastss(xmm5, ptr[&s_minmax.y]);
if(color)
{
// min.c = 0xffffffff;
// max.c = 0;
vpcmpeqd(xmm2, xmm2);
vpxor(xmm3, xmm3);
}
if(tme)
{
// min.t = FLT_MAX;
// max.t = -FLT_MAX;
vmovaps(xmm6, xmm4);
vmovaps(xmm7, xmm5);
}
// for(int i = 0; i < count; i += step) {
mov(edx, dword[esp + _vertex]);
mov(ebx, dword[esp + _index]);
mov(ecx, dword[esp + _count]);
align(16);
L("loop");
// the per-primitive body is unrolled n times at code-generation time
for(int j = 0; j < n; j++)
{
// eax = index[j] scaled to a byte offset into the vertex array
mov(eax, ptr[ebx + j * sizeof(uint32)]);
shl(eax, 5); // * sizeof(GSVertexHW11)
// one 16-byte load of the first vertex half feeds both the color and the texcoord paths
if(color && (iip || j == n - 1) || tme)
{
vmovaps(xmm0, ptr[edx + eax]);
}
// without iip only the last vertex of the primitive contributes its color
if(color && (iip || j == n - 1))
{
// byte-wise min/max over all 16 bytes; only dword 2 is extracted after the loop
vpminub(xmm2, xmm0);
vpmaxub(xmm3, xmm0);
}
if(tme)
{
if(!fst)
{
// keep an unshuffled copy so q (lane 3) can be broadcast below
vmovaps(xmm1, xmm0);
}
vshufps(xmm0, xmm0, _MM_SHUFFLE(3, 3, 1, 0)); // avoid FP assist, third dword is integral
if(!fst)
{
vshufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3));
vdivps(xmm0, xmm1);
vshufps(xmm0, xmm1, _MM_SHUFFLE(3, 3, 1, 0)); // restore q
}
vminps(xmm6, xmm0);
vmaxps(xmm7, xmm0);
}
// position: zero-extend the low four 16-bit words of the second half into xmm1,
// shift every dword of the original right by one, then combine so that
// xmm1 = { word0, word1, dword0 >> 1, dword1 >> 1 } and convert to float
vmovdqa(xmm0, ptr[edx + eax + 16]);
vpmovzxwd(xmm1, xmm0);
vpsrld(xmm0, 1);
vpunpcklqdq(xmm1, xmm0);
vcvtdq2ps(xmm1, xmm1);
vminps(xmm4, xmm1);
vmaxps(xmm5, xmm1);
}
// advance the index pointer to the next primitive and loop while indices remain
add(ebx, n * sizeof(uint32));
sub(ecx, n);
jg("loop");
// }
// eax/edx now become the min/max destination pointers
mov(eax, dword[esp + _min]);
mov(edx, dword[esp + _max]);
if(color)
{
// m_min.c = cmin.zzzz().u8to32();
// m_max.c = cmax.zzzz().u8to32();
vpshufd(xmm2, xmm2, _MM_SHUFFLE(2, 2, 2, 2));
vpmovzxbd(xmm2, xmm2);
vpshufd(xmm3, xmm3, _MM_SHUFFLE(2, 2, 2, 2));
vpmovzxbd(xmm3, xmm3);
vmovaps(ptr[eax + offsetof(GSVertexTrace::Vertex, c)], xmm2);
vmovaps(ptr[edx + offsetof(GSVertexTrace::Vertex, c)], xmm3);
}
// m_min.p = pmin.xyww();
// m_max.p = pmax.xyww();
vshufps(xmm4, xmm4, _MM_SHUFFLE(3, 3, 1, 0));
vshufps(xmm5, xmm5, _MM_SHUFFLE(3, 3, 1, 0));
vmovaps(ptr[eax + offsetof(GSVertexTrace::Vertex, p)], xmm4);
vmovaps(ptr[edx + offsetof(GSVertexTrace::Vertex, p)], xmm5);
if(tme)
{
// m_min.t = tmin;
// m_max.t = tmax;
vmovaps(ptr[eax + offsetof(GSVertexTrace::Vertex, t)], xmm6);
vmovaps(ptr[edx + offsetof(GSVertexTrace::Vertex, t)], xmm7);
}
pop(ebx);
ret();
}
#endif

View File

@ -1,562 +0,0 @@
/*
* Copyright (C) 2007-2009 Gabest
* http://www.gabest.org
*
* This Program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2, or (at your option)
* any later version.
*
* This Program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with GNU Make; see the file COPYING. If not, write to
* the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
* http://www.gnu.org/copyleft/gpl.html
*
*/
#include "stdafx.h"
#include "GSVertexTrace.h"
#if _M_SSE < 0x500 && !(defined(_M_AMD64) || defined(_WIN64))
using namespace Xbyak;
// Byte offsets from esp at which the generated 32-bit routines read their arguments.
// Every generator in this file emits push(ebx) before using them, so _args = 4
// presumably accounts for that saved register and the following +4 for the return
// address - confirm against the calling convention of the function pointer type.
// The trailing comments appear to name the corresponding x64 register/slot.
static const int _args = 4;
static const int _count = _args + 4; // rcx
static const int _vertex = _args + 8; // rdx
static const int _index = _args + 12; // r8
static const int _min = _args + 16; // r9
static const int _max = _args + 20; // _args + 4
// Emits (Xbyak, legacy SSE encodings, 32-bit) the indexed min/max scan routine for GSVertexSW vertices.
//
// key layout: bits 0-1 primclass, bit 2 iip, bit 3 tme, bit 4 fst, bit 5 color.
// `param` is not used by this generator.
//
// Registers used by the generated routine:
//   ecx = remaining index count, ebx = pointer to the current uint32 index,
//   edx = vertex array base (reused as the max destination after the loop),
//   eax = byte offset of the current vertex (reused as the min destination after the loop),
//   xmm2/xmm3 = color min/max, xmm4/xmm5 = position min/max, xmm6/xmm7 = texcoord min/max.
GSVertexTraceSW::CG::CG(const void* param, uint32 key, void* code, size_t maxsize)
: GSCodeGenerator(code, maxsize)
{
uint32 primclass = (key >> 0) & 3;
uint32 iip = (key >> 2) & 1;
uint32 tme = (key >> 3) & 1;
uint32 fst = (key >> 4) & 1;
uint32 color = (key >> 5) & 1;
// n = number of indices consumed per loop iteration (vertices per primitive)
int n = 1;
switch(primclass)
{
case GS_POINT_CLASS:
n = 1;
break;
case GS_LINE_CLASS:
case GS_SPRITE_CLASS:
n = 2;
break;
case GS_TRIANGLE_CLASS:
n = 3;
break;
}
// ebx is used as the index pointer, so it is saved here and restored before ret()
push(ebx);
// min.p = FLT_MAX;
// max.p = -FLT_MAX;
// (load the scalar seed, then splat lane 0 to all four lanes)
movss(xmm4, ptr[&s_minmax.x]);
movss(xmm5, ptr[&s_minmax.y]);
shufps(xmm4, xmm4, _MM_SHUFFLE(0, 0, 0, 0));
shufps(xmm5, xmm5, _MM_SHUFFLE(0, 0, 0, 0));
if(color)
{
// min.c = FLT_MAX;
// max.c = -FLT_MAX;
movaps(xmm2, xmm4);
movaps(xmm3, xmm5);
}
if(tme)
{
// min.t = FLT_MAX;
// max.t = -FLT_MAX;
movaps(xmm6, xmm4);
movaps(xmm7, xmm5);
}
// for(int i = 0; i < count; i += step) {
mov(edx, dword[esp + _vertex]);
mov(ebx, dword[esp + _index]);
mov(ecx, dword[esp + _count]);
align(16);
L("loop");
if(tme && !fst && primclass == GS_SPRITE_CLASS)
{
// sprites: take the divisor (lane 2 of t) once, from the vertex named by the second index
mov(eax, ptr[ebx + 1 * sizeof(uint32)]);
shl(eax, 6); // * sizeof(GSVertexSW)
movaps(xmm1, ptr[edx + eax + offsetof(GSVertexSW, t)]);
shufps(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
}
// the per-primitive body is unrolled n times at code-generation time
for(int j = 0; j < n; j++)
{
// eax = index[j] scaled to a byte offset into the vertex array
mov(eax, ptr[ebx + j * sizeof(uint32)]);
shl(eax, 6); // * sizeof(GSVertexSW)
// without iip only the last vertex of the primitive contributes its color
if(color && (iip || j == n - 1))
{
// min.c = min.c.minv(v[i + j].c);
// max.c = max.c.maxv(v[i + j].c);
movaps(xmm0, ptr[edx + eax + offsetof(GSVertexSW, c)]);
minps(xmm2, xmm0);
maxps(xmm3, xmm0);
}
// min.p = min.p.minv(v[i + j].p);
// max.p = max.p.maxv(v[i + j].p);
movaps(xmm0, ptr[edx + eax + offsetof(GSVertexSW, p)]);
minps(xmm4, xmm0);
maxps(xmm5, xmm0);
if(tme)
{
// min.t = min.t.minv(v[i + j].t);
// max.t = max.t.maxv(v[i + j].t);
movaps(xmm0, ptr[edx + eax + offsetof(GSVertexSW, t)]);
if(!fst)
{
if(primclass != GS_SPRITE_CLASS)
{
// non-sprites: each vertex supplies its own divisor (lane 2 of t)
movaps(xmm1, xmm0);
shufps(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2));
}
// divide all lanes by the divisor, then put the divisor itself back into lanes 2 and 3
divps(xmm0, xmm1);
shufps(xmm0, xmm1, _MM_SHUFFLE(3, 2, 1, 0));
}
minps(xmm6, xmm0);
maxps(xmm7, xmm0);
}
}
// advance the index pointer to the next primitive and loop while indices remain
add(ebx, n * sizeof(uint32));
sub(ecx, n);
jg("loop");
// }
// eax/edx now become the min/max destination pointers
mov(eax, dword[esp + _min]);
mov(edx, dword[esp + _max]);
if(color)
{
// truncate the float colors to integers and shift right by 7 before storing
cvttps2dq(xmm2, xmm2);
psrld(xmm2, 7);
movaps(ptr[eax + offsetof(GSVertexTrace::Vertex, c)], xmm2);
cvttps2dq(xmm3, xmm3);
psrld(xmm3, 7);
movaps(ptr[edx + offsetof(GSVertexTrace::Vertex, c)], xmm3);
}
movaps(ptr[eax + offsetof(GSVertexTrace::Vertex, p)], xmm4);
movaps(ptr[edx + offsetof(GSVertexTrace::Vertex, p)], xmm5);
if(tme)
{
movaps(ptr[eax + offsetof(GSVertexTrace::Vertex, t)], xmm6);
movaps(ptr[edx + offsetof(GSVertexTrace::Vertex, t)], xmm7);
}
pop(ebx);
ret();
}
// Emits (Xbyak, legacy SSE encodings, 32-bit) the indexed min/max scan routine for GSVertexHW9 vertices.
//
// key layout: bits 0-1 primclass, bit 2 iip, bit 3 tme, bit 4 fst, bit 5 color.
// `param` is not used by this generator.
//
// Registers used by the generated routine:
//   ecx = remaining index count, ebx = pointer to the current uint32 index,
//   edx = vertex array base (reused as the max destination after the loop),
//   eax = byte offset of the current vertex (reused as the min destination after the loop),
//   xmm2/xmm3 = byte-wise color min/max, xmm4/xmm5 = position min/max,
//   xmm6/xmm7 = texcoord min/max.
GSVertexTraceDX9::CG::CG(const void* param, uint32 key, void* code, size_t maxsize)
: GSCodeGenerator(code, maxsize)
{
uint32 primclass = (key >> 0) & 3;
uint32 iip = (key >> 2) & 1;
uint32 tme = (key >> 3) & 1;
uint32 fst = (key >> 4) & 1;
uint32 color = (key >> 5) & 1;
// n = number of indices consumed per loop iteration (vertices per primitive)
// NOTE(review): sprites use n = 6 here, yet the sprite divisor below is read through
// index slot 1 and the AVX 32-bit DX9 generator uses n = 2 - confirm which is intended.
int n = 1;
switch(primclass)
{
case GS_POINT_CLASS:
n = 1;
break;
case GS_LINE_CLASS:
n = 2;
break;
case GS_TRIANGLE_CLASS:
n = 3;
break;
case GS_SPRITE_CLASS:
n = 6;
break;
}
// ebx is used as the index pointer, so it is saved here and restored before ret()
push(ebx);
// min.p = FLT_MAX;
// max.p = -FLT_MAX;
// (load the scalar seed, then splat lane 0 to all four lanes)
movss(xmm4, ptr[&s_minmax.x]);
movss(xmm5, ptr[&s_minmax.y]);
shufps(xmm4, xmm4, _MM_SHUFFLE(0, 0, 0, 0));
shufps(xmm5, xmm5, _MM_SHUFFLE(0, 0, 0, 0));
if(color)
{
// min.c = 0xffffffff;
// max.c = 0;
pcmpeqd(xmm2, xmm2);
pxor(xmm3, xmm3);
}
if(tme)
{
// min.t = FLT_MAX;
// max.t = -FLT_MAX;
movaps(xmm6, xmm4);
movaps(xmm7, xmm5);
}
// for(int i = 0; i < count; i += step) {
mov(edx, dword[esp + _vertex]);
mov(ebx, dword[esp + _index]);
mov(ecx, dword[esp + _count]);
align(16);
L("loop");
if(tme && !fst && primclass == GS_SPRITE_CLASS)
{
// sprites: take the divisor (p.w) once, from the vertex named by the second index
mov(eax, ptr[ebx + 1 * sizeof(uint32)]);
shl(eax, 5); // * sizeof(GSVertexHW9)
movaps(xmm1, ptr[edx + eax + offsetof(GSVertexHW9, p)]);
shufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3));
}
// the per-primitive body is unrolled n times at code-generation time
for(int j = 0; j < n; j++)
{
// eax = index[j] scaled to a byte offset into the vertex array
mov(eax, ptr[ebx + j * sizeof(uint32)]);
shl(eax, 5); // * sizeof(GSVertexHW9)
// min.p = min.p.minv(v[i + j].p);
// max.p = max.p.maxv(v[i + j].p);
movaps(xmm0, ptr[edx + eax + offsetof(GSVertexHW9, p)]);
minps(xmm4, xmm0);
maxps(xmm5, xmm0);
if(tme && !fst && primclass != GS_SPRITE_CLASS)
{
// non-sprites: each vertex supplies its own divisor (p.w), taken while xmm0 still holds p
movaps(xmm1, xmm0);
shufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3));
}
// one 16-byte load at offsetof(t) feeds both the color and the texcoord paths below
if(color && (iip || j == n - 1) || tme)
{
movaps(xmm0, ptr[edx + eax + offsetof(GSVertexHW9, t)]);
}
// without iip only the last vertex of the primitive contributes its color
if(color && (iip || j == n - 1))
{
// min.c = min.c.min_u8(v[i + j].c);
// max.c = max.c.max_u8(v[i + j].c);
// (applied to all 16 bytes; only dword 2 is extracted after the loop)
pminub(xmm2, xmm0);
pmaxub(xmm3, xmm0);
}
if(tme)
{
shufps(xmm0, xmm0, _MM_SHUFFLE(1, 0, 1, 0)); // avoid FP assist, high part is integral
if(!fst)
{
// t /= p.wwww();
divps(xmm0, xmm1);
}
// min.t = min.t.minv(v[i + j].t);
// max.t = max.t.maxv(v[i + j].t);
minps(xmm6, xmm0);
maxps(xmm7, xmm0);
}
}
// advance the index pointer to the next primitive and loop while indices remain
add(ebx, n * sizeof(uint32));
sub(ecx, n);
jg("loop");
// }
// eax/edx now become the min/max destination pointers
mov(eax, dword[esp + _min]);
mov(edx, dword[esp + _max]);
if(color)
{
// m_min.c = cmin.zzzz().u8to32();
// m_max.c = cmax.zzzz().u8to32();
if(m_cpu.has(util::Cpu::tSSE41))
{
pshufd(xmm2, xmm2, _MM_SHUFFLE(2, 2, 2, 2));
pmovzxbd(xmm2, xmm2);
pshufd(xmm3, xmm3, _MM_SHUFFLE(2, 2, 2, 2));
pmovzxbd(xmm3, xmm3);
}
else
{
// pre-SSE4.1 fallback: widen the high 8 bytes to words against zero, then the low
// 4 of those words to dwords, which leaves the zero-extended bytes of dword 2
pxor(xmm0, xmm0);
punpckhbw(xmm2, xmm0);
punpcklwd(xmm2, xmm0);
punpckhbw(xmm3, xmm0);
punpcklwd(xmm3, xmm0);
}
movaps(ptr[eax + offsetof(GSVertexTrace::Vertex, c)], xmm2);
movaps(ptr[edx + offsetof(GSVertexTrace::Vertex, c)], xmm3);
}
// m_min.p = pmin;
// m_max.p = pmax;
movaps(ptr[eax + offsetof(GSVertexTrace::Vertex, p)], xmm4);
movaps(ptr[edx + offsetof(GSVertexTrace::Vertex, p)], xmm5);
if(tme)
{
// m_min.t = tmin.xyww(pmin);
// m_max.t = tmax.xyww(pmax);
// (lanes 0-1 keep t.x/t.y, lanes 2-3 both receive p.w)
shufps(xmm6, xmm4, _MM_SHUFFLE(3, 3, 1, 0));
shufps(xmm7, xmm5, _MM_SHUFFLE(3, 3, 1, 0));
movaps(ptr[eax + offsetof(GSVertexTrace::Vertex, t)], xmm6);
movaps(ptr[edx + offsetof(GSVertexTrace::Vertex, t)], xmm7);
}
pop(ebx);
ret();
}
// Emits, through the GSCodeGenerator base, a routine that walks an index list,
// loads the referenced 32-byte vertices and accumulates component-wise
// minimum/maximum values for position and, depending on key, for color and
// texture coordinates. The results are stored through the pointers read from
// the _min and _max stack slots.
//
// key bits decoded below: 0-1 primclass, 2 iip, 3 tme, 4 fst, 5 color.
// param is not referenced in this body.
//
// Register roles in the emitted code (as set up and stored below):
//   xmm2/xmm3 = color min/max, xmm4/xmm5 = position min/max,
//   xmm6/xmm7 = texture coordinate min/max, xmm0/xmm1 = scratch.
//
// NOTE(review): _vertex, _index, _count, _min, _max and s_minmax are defined
// outside this block; presumably stack offsets of the generated function's
// arguments and a {FLT_MAX, -FLT_MAX} pair - confirm.
GSVertexTraceDX11::CG::CG(const void* param, uint32 key, void* code, size_t maxsize)
: GSCodeGenerator(code, maxsize)
{
uint32 primclass = (key >> 0) & 3;
uint32 iip = (key >> 2) & 1;
uint32 tme = (key >> 3) & 1;
uint32 fst = (key >> 4) & 1;
uint32 color = (key >> 5) & 1;
// n = number of indices consumed per loop iteration (vertices per primitive).
int n = 1;
switch(primclass)
{
case GS_POINT_CLASS:
n = 1;
break;
case GS_LINE_CLASS:
case GS_SPRITE_CLASS:
n = 2;
break;
case GS_TRIANGLE_CLASS:
n = 3;
break;
}
// ebx is used as the index pointer below, so it is saved here and restored before ret().
push(ebx);
// min.p = FLT_MAX;
// max.p = -FLT_MAX;
movss(xmm4, ptr[&s_minmax.x]);
movss(xmm5, ptr[&s_minmax.y]);
// Broadcast the scalar loaded above into all four lanes.
shufps(xmm4, xmm4, _MM_SHUFFLE(0, 0, 0, 0));
shufps(xmm5, xmm5, _MM_SHUFFLE(0, 0, 0, 0));
if(color)
{
// min.c = 0xffffffff;
// max.c = 0;
pcmpeqd(xmm2, xmm2);
pxor(xmm3, xmm3);
}
if(tme)
{
// min.t = FLT_MAX;
// max.t = -FLT_MAX;
movaps(xmm6, xmm4);
movaps(xmm7, xmm5);
}
// for(int i = 0; i < count; i += step) {
mov(edx, dword[esp + _vertex]);
mov(ebx, dword[esp + _index]);
mov(ecx, dword[esp + _count]);
align(16);
L("loop");
// The per-vertex body is unrolled n times at code-generation time.
for(int j = 0; j < n; j++)
{
// eax = byte offset of the j-th indexed vertex of the current primitive.
mov(eax, ptr[ebx + j * sizeof(uint32)]);
shl(eax, 5); // * sizeof(GSVertexHW11)
// The first 16 bytes of the vertex are needed for color and/or texture coordinates.
if(color && (iip || j == n - 1) || tme)
{
movaps(xmm0, ptr[edx + eax]);
}
// Color is accumulated for every vertex when iip is set, otherwise only for
// the last vertex of the primitive.
if(color && (iip || j == n - 1))
{
// Byte-wise unsigned min/max over the whole register; only dword 2 is
// extracted after the loop.
pminub(xmm2, xmm0);
pmaxub(xmm3, xmm0);
}
if(tme)
{
if(!fst)
{
// Keep an unshuffled copy so lane 3 can be used as the divisor below.
movaps(xmm1, xmm0);
}
// Lanes become [0, 1, 3, 3]: lane 2 is overwritten with lane 3.
shufps(xmm0, xmm0, _MM_SHUFFLE(3, 3, 1, 0)); // avoid FP assist, third dword is integral
if(!fst)
{
// Divide by the broadcast lane 3, then put the undivided lane 3 back
// into lanes 2 and 3.
shufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3));
divps(xmm0, xmm1);
shufps(xmm0, xmm1, _MM_SHUFFLE(3, 3, 1, 0)); // restore q
}
minps(xmm6, xmm0);
maxps(xmm7, xmm0);
}
// Second 16 bytes of the vertex: integer position data.
movdqa(xmm0, ptr[edx + eax + 16]);
// xmm1 = the low four 16-bit words zero-extended to dwords (both paths).
if(m_cpu.has(util::Cpu::tSSE41))
{
pmovzxwd(xmm1, xmm0);
}
else
{
movdqa(xmm1, xmm0);
punpcklwd(xmm1, xmm1);
psrld(xmm1, 16);
}
// Every dword is shifted right by one before the signed int->float conversion.
// NOTE(review): presumably so a full 32-bit z in dword 1 stays non-negative - confirm.
psrld(xmm0, 1);
// xmm1 = [word0, word1, dword0 >> 1, dword1 >> 1]
punpcklqdq(xmm1, xmm0);
cvtdq2ps(xmm1, xmm1);
minps(xmm4, xmm1);
maxps(xmm5, xmm1);
}
// Advance to the next primitive's indices and loop while the remaining count is positive.
add(ebx, n * sizeof(uint32));
sub(ecx, n);
jg("loop");
// }
// eax/edx now point at the output min/max structures.
mov(eax, dword[esp + _min]);
mov(edx, dword[esp + _max]);
if(color)
{
// m_min.c = cmin.zzzz().u8to32();
// m_max.c = cmax.zzzz().u8to32();
if(m_cpu.has(util::Cpu::tSSE41))
{
// Broadcast dword 2, then zero-extend its four bytes to four dwords.
pshufd(xmm2, xmm2, _MM_SHUFFLE(2, 2, 2, 2));
pmovzxbd(xmm2, xmm2);
pshufd(xmm3, xmm3, _MM_SHUFFLE(2, 2, 2, 2));
pmovzxbd(xmm3, xmm3);
}
else
{
// Same result without SSE4.1: interleave the high 8 bytes with zero,
// then the low 4 resulting words with zero, which expands bytes 8..11.
pxor(xmm0, xmm0);
punpckhbw(xmm2, xmm0);
punpcklwd(xmm2, xmm0);
punpckhbw(xmm3, xmm0);
punpcklwd(xmm3, xmm0);
}
movaps(ptr[eax + offsetof(GSVertexTrace::Vertex, c)], xmm2);
movaps(ptr[edx + offsetof(GSVertexTrace::Vertex, c)], xmm3);
}
// m_min.p = pmin.xyww();
// m_max.p = pmax.xyww();
// Lane 2 (dword0 >> 1) is discarded and replaced by lane 3.
shufps(xmm4, xmm4, _MM_SHUFFLE(3, 3, 1, 0));
shufps(xmm5, xmm5, _MM_SHUFFLE(3, 3, 1, 0));
movaps(ptr[eax + offsetof(GSVertexTrace::Vertex, p)], xmm4);
movaps(ptr[edx + offsetof(GSVertexTrace::Vertex, p)], xmm5);
if(tme)
{
// m_min.t = tmin;
// m_max.t = tmax;
movaps(ptr[eax + offsetof(GSVertexTrace::Vertex, t)], xmm6);
movaps(ptr[edx + offsetof(GSVertexTrace::Vertex, t)], xmm7);
}
pop(ebx);
ret();
}
#endif

View File

@ -618,62 +618,6 @@
<ClCompile Include="GSVertexList.cpp" />
<ClCompile Include="GSVertexSW.cpp" />
<ClCompile Include="GSVertexTrace.cpp" />
<ClCompile Include="GSVertexTrace.x64.avx.cpp">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSE2|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSE4|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSSE3|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSE2|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSE4|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSSE3|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug AVX|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSE2|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSE4|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSSE3|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release AVX|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSE2|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSE4|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSSE3|Win32'">true</ExcludedFromBuild>
</ClCompile>
<ClCompile Include="GSVertexTrace.x64.cpp">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug AVX|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSE2|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSE4|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSSE3|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release AVX|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSE2|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSE4|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSSE3|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug AVX|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release AVX|x64'">true</ExcludedFromBuild>
</ClCompile>
<ClCompile Include="GSVertexTrace.x86.avx.cpp">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug AVX|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSE2|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSE4|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSSE3|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release AVX|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSE2|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSE4|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSSE3|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSE2|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSE4|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSSE3|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSE2|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSE4|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSSE3|Win32'">true</ExcludedFromBuild>
</ClCompile>
<ClCompile Include="GSVertexTrace.x86.cpp">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug AVX|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSE2|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSE4|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug SSSE3|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release AVX|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSE2|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSE4|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release SSSE3|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug AVX|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release AVX|Win32'">true</ExcludedFromBuild>
</ClCompile>
<ClCompile Include="GSWnd.cpp" />
<ClCompile Include="stdafx.cpp">
<PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug SSE2|Win32'">Create</PrecompiledHeader>

View File

@ -288,18 +288,6 @@
<ClCompile Include="GSDeviceSDL.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="GSVertexTrace.x64.avx.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="GSVertexTrace.x64.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="GSVertexTrace.x86.avx.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="GSVertexTrace.x86.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="GSSetupPrimCodeGenerator.x64.avx.cpp">
<Filter>Source Files</Filter>
</ClCompile>

View File

@ -1024,6 +1024,10 @@
RelativePath=".\GSRenderer.cpp"
>
</File>
<File
RelativePath=".\GSRendererCS.cpp"
>
</File>
<File
RelativePath=".\GSRendererDX.cpp"
>
@ -1240,110 +1244,6 @@
RelativePath=".\GSVertexTrace.cpp"
>
</File>
<File
RelativePath=".\GSVertexTrace.x64.cpp"
>
<FileConfiguration
Name="Debug SSE2|Win32"
ExcludedFromBuild="true"
>
<Tool
Name="VCCLCompilerTool"
/>
</FileConfiguration>
<FileConfiguration
Name="Release SSE2|Win32"
ExcludedFromBuild="true"
>
<Tool
Name="VCCLCompilerTool"
/>
</FileConfiguration>
<FileConfiguration
Name="Release SSSE3|Win32"
ExcludedFromBuild="true"
>
<Tool
Name="VCCLCompilerTool"
/>
</FileConfiguration>
<FileConfiguration
Name="Debug SSSE3|Win32"
ExcludedFromBuild="true"
>
<Tool
Name="VCCLCompilerTool"
/>
</FileConfiguration>
<FileConfiguration
Name="Debug SSE4|Win32"
ExcludedFromBuild="true"
>
<Tool
Name="VCCLCompilerTool"
/>
</FileConfiguration>
<FileConfiguration
Name="Release SSE4|Win32"
ExcludedFromBuild="true"
>
<Tool
Name="VCCLCompilerTool"
/>
</FileConfiguration>
</File>
<File
RelativePath=".\GSVertexTrace.x86.cpp"
>
<FileConfiguration
Name="Debug SSE2|x64"
ExcludedFromBuild="true"
>
<Tool
Name="VCCLCompilerTool"
/>
</FileConfiguration>
<FileConfiguration
Name="Release SSE2|x64"
ExcludedFromBuild="true"
>
<Tool
Name="VCCLCompilerTool"
/>
</FileConfiguration>
<FileConfiguration
Name="Release SSSE3|x64"
ExcludedFromBuild="true"
>
<Tool
Name="VCCLCompilerTool"
/>
</FileConfiguration>
<FileConfiguration
Name="Debug SSSE3|x64"
ExcludedFromBuild="true"
>
<Tool
Name="VCCLCompilerTool"
/>
</FileConfiguration>
<FileConfiguration
Name="Debug SSE4|x64"
ExcludedFromBuild="true"
>
<Tool
Name="VCCLCompilerTool"
/>
</FileConfiguration>
<FileConfiguration
Name="Release SSE4|x64"
ExcludedFromBuild="true"
>
<Tool
Name="VCCLCompilerTool"
/>
</FileConfiguration>
</File>
<File
RelativePath=".\GSWnd.cpp"
>
@ -1630,6 +1530,10 @@
RelativePath=".\GSRenderer.h"
>
</File>
<File
RelativePath=".\GSRendererCS.h"
>
</File>
<File
RelativePath=".\GSRendererDX.h"
>

View File

@ -1,73 +1,383 @@
struct Vertex
#ifndef VS_TME
#define VS_TME 1
#define VS_FST 1
#endif
#ifndef GS_IIP
#define GS_IIP 0
#define GS_PRIM 2
#endif
#ifndef PS_BATCH_SIZE
#define PS_BATCH_SIZE 2048
#define PS_FPSM PSM_PSMCT32
#define PS_ZPSM PSM_PSMZ16
#endif
#define PSM_PSMCT32 0
#define PSM_PSMCT24 1
#define PSM_PSMCT16 2
#define PSM_PSMCT16S 10
#define PSM_PSMT8 19
#define PSM_PSMT4 20
#define PSM_PSMT8H 27
#define PSM_PSMT4HL 36
#define PSM_PSMT4HH 44
#define PSM_PSMZ32 48
#define PSM_PSMZ24 49
#define PSM_PSMZ16 50
#define PSM_PSMZ16S 58
struct VS_INPUT
{
float2 st;
uint c;
float q;
uint xy, z;
uint uv, f;
float2 st : TEXCOORD0;
float4 c : COLOR0;
float q : TEXCOORD1;
uint2 p : POSITION0;
uint z : POSITION1;
uint2 uv : TEXCOORD2;
float4 f : COLOR1;
};
struct VS_OUTPUT
{
float4 p : SV_Position;
float2 z : TEXCOORD0;
float4 t : TEXCOORD1;
float4 c : COLOR0;
};
struct GS_OUTPUT
{
float4 p : SV_Position;
float2 z : TEXCOORD0;
float4 t : TEXCOORD1;
float4 c : COLOR0;
uint id : SV_PrimitiveID;
};
cbuffer VSConstantBuffer : register(c0)
{
float4 VertexScale;
float4 VertexOffset;
};
cbuffer PSConstantBuffer : register(c0)
{
uint2 WriteMask;
};
struct FragmentLinkItem
{
uint c, z, id, next;
};
RWByteAddressBuffer VideoMemory : register(u0);
RWStructuredBuffer<FragmentLinkItem> FragmentLinkBuffer : register(u1);
RWByteAddressBuffer StartOffsetBuffer : register(u2);
//RWTexture2D<uint> VideoMemory : register(u2); // 8192 * 512 R8_UINT
StructuredBuffer<Vertex> VertexBuffer : register(t0);
Buffer<uint> IndexBuffer : register(t1);
Buffer<int2> FZRowOffset : register(t0);
Buffer<int2> FZColOffset : register(t1);
Texture2D<float4> Palette : register(t2);
Texture2D<float4> Texture : register(t3);
Buffer<int> FrameRowOffset : register(t2);
Buffer<int> FrameColOffset : register(t3);
Buffer<int> ZBufRowOffset : register(t4);
Buffer<int> ZBufColOffset : register(t5);
cbuffer DrawingEnvironment : register(c0)
VS_OUTPUT vs_main(VS_INPUT input)
{
// TODO
};
VS_OUTPUT output;
// one group is 16x8 pixels and one thread does 2 pixels, otherwise could not read-merge-write 16-bit targets safely
// neighbouring pixels are next to each other in memory, at least we don't have to calculate the address twice
output.p = float4(input.p, 0.0f, 0.0f) * VertexScale - VertexOffset;
output.z = float2(input.z & 0xffff, input.z >> 16); // TODO: min(input.z, 0xffffff00) ?
// TODO: they say groupshared memory is faster, try unswizzling the corresponding chunk of memory initially (how to do that once by only one thread?) then write-back when finished, unless it was untouched
[numthreads(8, 8, 1)]
void cs_main(uint3 gid : SV_GroupID, uint3 tid : SV_GroupThreadID)
{
uint count;
IndexBuffer.GetDimensions(count);
// #if GS_PRIM == 2 (triangle)
for(uint i = 0; i < count; i += 3)
if(VS_TME)
{
Vertex v0 = VertexBuffer[IndexBuffer[i + 0]];
Vertex v1 = VertexBuffer[IndexBuffer[i + 1]];
Vertex v2 = VertexBuffer[IndexBuffer[i + 2]];
uint x = gid.x + tid.x * 2;
uint y = gid.y + tid.y;
uint fa = FrameRowOffset[y] + FrameColOffset[x];
uint za = ZBufRowOffset[y] + ZBufColOffset[x];
// TODO: quickly reject if x, y is outside the triangle
// TODO: calculate interpolated values at x, y
// TODO: run the GS pipeline
// TODO: repeat for x+1, y
// TODO: output two pixels (might be better to process a single pixel, more threads, if there is no 16-bit target involved)
// testing...
uint4 c = VideoMemory.Load4(fa); // does this load 4*4 bytes? or 4 bytes each expanded uint?
c = (v0.c >> uint4(0, 8, 16, 24)) & 0xff; // => ushr r1.yzw, r1.xxxx, l(0, 8, 16, 24), v0.c auto-converted to uint4 and per-component shift in one instruction, SSE is embarrassed
VideoMemory.Store4(fa, c); // same question, 4*4 bytes or compressed to uint
if(VS_FST)
{
output.t.xy = input.uv;
output.t.w = 1.0f;
}
else
{
output.t.xy = input.st;
output.t.w = input.q;
}
}
else
{
output.t.xy = 0;
output.t.w = 1.0f;
}
// #endif
output.c = input.c;
output.t.z = input.f.r;
return output;
}
// TODO: DrawPoint (this is going to be a waste of resources)
// TODO: DrawLine (line hit-test, will it work?)
// TODO: DrawSprite (similar to DrawTriangle)
// TODO: if read-backs are too slow, implement GSState::Write/FlushWrite/Read/clut.Write in a compute shader
// TODO: unswizzle pages from VideoMemory to the texture cache (if they are marked as valid, otherwise upload from GSLocalMemory::m_vm8)
// Geometry shader entry point, selected at compile time by GS_PRIM.
// Every variant copies the vertex shader outputs into GS_OUTPUT and tags each
// emitted vertex with the primitive id. When GS_IIP == 0 the color of the last
// input vertex is used for all emitted vertices of the primitive.
#if GS_PRIM == 0
// Point input: pass the single vertex through unchanged, adding the primitive id.
[maxvertexcount(1)]
void gs_main(point VS_OUTPUT input[1], inout PointStream<GS_OUTPUT> stream, uint id : SV_PrimitiveID)
{
GS_OUTPUT output;
output.p = input[0].p;
output.z = input[0].z;
output.t = input[0].t;
output.c = input[0].c;
output.id = id;
stream.Append(output);
}
#elif GS_PRIM == 1
// Line input: emit both vertices.
[maxvertexcount(2)]
void gs_main(line VS_OUTPUT input[2], inout LineStream<GS_OUTPUT> stream, uint id : SV_PrimitiveID)
{
[unroll]
for(int i = 0; i < 2; i++)
{
GS_OUTPUT output;
output.p = input[i].p;
output.z = input[i].z;
output.t = input[i].t;
output.c = input[i].c;
output.id = id;
#if GS_IIP == 0
// Vertex 0 takes the color of vertex 1.
if(i != 1) output.c = input[1].c;
#endif
stream.Append(output);
}
}
#elif GS_PRIM == 2
// Triangle input: emit all three vertices.
[maxvertexcount(3)]
void gs_main(triangle VS_OUTPUT input[3], inout TriangleStream<GS_OUTPUT> stream, uint id : SV_PrimitiveID)
{
[unroll]
for(int i = 0; i < 3; i++)
{
GS_OUTPUT output;
output.p = input[i].p;
output.z = input[i].z;
output.t = input[i].t;
output.c = input[i].c;
output.id = id;
#if GS_IIP == 0
// Vertices 0 and 1 take the color of vertex 2.
if(i != 2) output.c = input[2].c;
#endif
stream.Append(output);
}
}
#elif GS_PRIM == 3
// Line input expanded to a four-vertex triangle strip: the two input vertices
// are treated as opposite corners (lt, rb) and the remaining two corners
// (lb, rt) are built by swapping the y components of position and texture
// coordinate.
[maxvertexcount(4)]
void gs_main(line VS_OUTPUT input[2], inout TriangleStream<GS_OUTPUT> stream, uint id : SV_PrimitiveID)
{
GS_OUTPUT lt, rb, lb, rt;
// lt: position and t.xy from vertex 0; z and t.zw are taken from vertex 1.
lt.p = input[0].p;
lt.z = input[1].z;
lt.t.xy = input[0].t.xy;
lt.t.zw = input[1].t.zw;
lt.c = input[0].c;
lt.id = id;
#if GS_IIP == 0
lt.c = input[1].c;
#endif
// rb: a straight copy of vertex 1.
rb.p = input[1].p;
rb.z = input[1].z;
rb.t = input[1].t;
rb.c = input[1].c;
rb.id = id;
// lb = lt with rb's y; rt = rb with lt's y.
lb = lt;
lb.p.y = rb.p.y;
lb.t.y = rb.t.y;
rt = rb;
rt.p.y = lt.p.y;
rt.t.y = lt.t.y;
stream.Append(lt);
stream.Append(lb);
stream.Append(rt);
stream.Append(rb);
}
#endif
// Packs a float4 color into a single uint, one byte per channel:
// r in bits 0-7, g in bits 8-15, b in bits 16-23, a in bits 24-31.
uint CompressColor32(float4 f)
{
	// Multiply every channel by 255 and convert to unsigned integers.
	uint4 bytes = (uint4)(f * 0xff);

	// Move each channel to its byte position and merge.
	uint packed = bytes.r;
	packed |= bytes.g << 8;
	packed |= bytes.b << 16;
	packed |= bytes.a << 24;

	return packed;
}
// Expands a 16-bit color (5 bits red, 5 bits green, 5 bits blue, 1 bit alpha,
// red in the low bits) into a 32-bit color with one byte per channel.
// Each 5-bit channel is placed in the top five bits of its byte and the alpha
// bit is placed in the top bit of the alpha byte (bit 31).
uint DecompressColor16(uint c)
{
	uint r = (c & 0x001f) << 3;  // bits 0-4   -> bits 3-7
	uint g = (c & 0x03e0) << 6;  // bits 5-9   -> bits 11-15
	uint b = (c & 0x7c00) << 9;  // bits 10-14 -> bits 19-23
	// Bit 15 must move to bit 31; a shift of 15 would leave it on bit 30.
	uint a = (c & 0x8000) << 16;

	return r | g | b | a;
}
// Loads a uint from VideoMemory at addr and shifts it right by 16 bits when
// bit 1 of addr is set (by 0 otherwise). The result is not masked.
uint ReadPixel(uint addr)
{
	// (addr & 2) is 0 or 2, so the shift amount is 0 or 16.
	uint shift = (addr & 2) << 3;
	uint word = VideoMemory.Load(addr);

	return word >> shift;
}
// Writes value to VideoMemory at addr according to the pixel format psm.
// - 32-bit and 24-bit color/z formats: the whole uint is stored (no masking
//   is applied for the 24-bit formats here).
// - 16-bit color/z formats: only the 16-bit half selected by bit 1 of addr is
//   replaced, using an atomic XOR so the other half is left as it is.
// Any other psm value writes nothing.
// NOTE(review): addr may have bit 1 set in the 16-bit path; presumably
// Load/InterlockedXor address the containing 4-byte word - confirm.
void WritePixel(uint addr, uint value, uint psm)
{
uint tmp;
switch(psm)
{
case PSM_PSMCT32:
case PSM_PSMZ32:
case PSM_PSMCT24:
case PSM_PSMZ24:
VideoMemory.Store(addr, value);
break;
case PSM_PSMCT16:
case PSM_PSMCT16S:
case PSM_PSMZ16:
case PSM_PSMZ16S:
// tmp = 0 or 16: bit offset of the addressed half inside the word.
tmp = (addr & 2) << 3;
// value = (new ^ current) restricted to the addressed half; XORing this into
// the current word turns that half into the new value.
value = ((value << tmp) ^ VideoMemory.Load(addr)) & (0x0000ffff << tmp);
// tmp is reused as the output argument receiving the original value; it is not read afterwards.
VideoMemory.InterlockedXor(addr, value, tmp);
break;
}
}
// First pixel-shader pass: appends the incoming fragment to a per-pixel linked
// list. A new slot is taken from FragmentLinkBuffer's counter, the list head
// for this pixel (StartOffsetBuffer, 4 bytes per pixel, row stride 2048
// pixels) is atomically replaced with that slot, and the previous head is
// stored as the new item's next link.
// NOTE(review): ps_main1 treats a head/next value of 0 as end of list, while
// the counter may hand out slot 0 - confirm how the counter is initialised.
void ps_main0(GS_OUTPUT input)
{
uint x = (uint)input.p.x;
uint y = (uint)input.p.y;
// tail = index of the newly allocated list item.
uint tail = FragmentLinkBuffer.IncrementCounter();
uint index = (y << 11) + x;
uint next = 0;
// Publish the new head; next receives the previous head.
StartOffsetBuffer.InterlockedExchange(index * 4, tail, next);
FragmentLinkItem item;
// TODO: preprocess color (tfx, alpha test), z-test
item.c = CompressColor32(input.c);
// Recombine z from its two components: z.y * 65536 + z.x.
item.z = (uint)(input.z.y * 0x10000 + input.z.x);
item.id = input.id;
item.next = next;
FragmentLinkBuffer[tail] = item;
}
// Second pixel-shader pass: resolves the per-pixel fragment list built by
// ps_main0. The list for this pixel is collected into a local array, ordered
// by primitive id with a bitonic-style sorting network, and then reduced to a
// single color/z pair that is written to VideoMemory.
// NOTE(review): count is not checked against PS_BATCH_SIZE while collecting,
// and log2(count) is evaluated even when count is 0 - confirm both are safe.
void ps_main1(GS_OUTPUT input)
{
uint2 pos = (uint2)input.p.xy;
// sort fragments
uint StartOffsetIndex = (pos.y << 11) + pos.x;
int index[PS_BATCH_SIZE];
int count = 0;
// Read the list head for this pixel and reset it to 0 (empty).
uint next = StartOffsetBuffer.Load(StartOffsetIndex * 4);
StartOffsetBuffer.Store(StartOffsetIndex * 4, 0);
// Walk the linked list, recording each item's index; 0 terminates the list.
[allow_uav_condition]
while(next != 0)
{
index[count++] = next;
next = FragmentLinkBuffer[next].next;
}
// N2 = count rounded up to a power of two, as required by the sorting network below.
int N2 = 1 << (int)(ceil(log2(count)));
// Pad the unused tail with item index 0.
// NOTE(review): padded entries are compared through FragmentLinkBuffer[0].id
// and may be sorted in among the real entries - confirm.
[allow_uav_condition]
for(int i = count; i < N2; i++)
{
index[i] = 0;
}
// Sorting network over index[0..N2), keyed on FragmentLinkBuffer[...].id:
// k = size of the sequences being merged, j = compare distance, i ^ j = partner of i.
[allow_uav_condition]
for(int k = 2; k <= N2; k = 2 * k)
{
[allow_uav_condition]
for(int j = k >> 1; j > 0 ; j = j >> 1)
{
[allow_uav_condition]
for(int i = 0; i < N2; i++)
{
uint i_id = FragmentLinkBuffer[index[i]].id;
int ixj = i ^ j;
// Handle each pair once, from its lower index.
if(ixj > i)
{
uint ixj_id = FragmentLinkBuffer[index[ixj]].id;
// Ascending order where bit k of i is clear...
if((i & k) == 0 && i_id > ixj_id)
{
int temp = index[i];
index[i] = index[ixj];
index[ixj] = temp;
}
// ...descending order where it is set.
if((i & k) != 0 && i_id < ixj_id)
{
int temp = index[i];
index[i] = index[ixj];
index[ixj] = temp;
}
}
}
}
}
// addr.x is used for the frame buffer and addr.y for the z buffer below.
uint2 addr = (uint2)(FZRowOffset[pos.y] + FZColOffset[pos.x]) << 1;
// Start from the values currently in memory.
uint dc = ReadPixel(addr.x);
uint dz = ReadPixel(addr.y);
uint sc = dc;
uint sz = dz;
// Visit index[count - 1] down to index[0], keeping the fragment with the largest z seen so far.
[allow_uav_condition]
while(--count >= 0)
{
FragmentLinkItem f = FragmentLinkBuffer[index[count]];
// TODO
if(sz < f.z)
{
sc = f.c;
sz = f.z;
}
}
// Write masks are not applied yet, and z is currently written as 0 rather than sz.
uint c = sc; // (dc & ~WriteMask.x) | (sc & WriteMask.x);
uint z = 0;//sz; //(dz & ~WriteMask.y) | (sz & WriteMask.y);
WritePixel(addr.x, c, PS_FPSM);
WritePixel(addr.y, z, PS_ZPSM);
}

View File

@ -40,11 +40,12 @@
struct VS_INPUT
{
float2 st : TEXCOORD0;
float4 c : COLOR0;
float q : TEXCOORD1;
uint2 p : POSITION0;
uint z : POSITION1;
float2 t : TEXCOORD0;
float q : TEXCOORD1;
float4 c : COLOR0;
uint2 uv : TEXCOORD2;
float4 f : COLOR1;
};
@ -602,12 +603,12 @@ VS_OUTPUT vs_main(VS_INPUT input)
{
if(VS_FST)
{
output.t.xy = input.t * TextureScale;
output.t.xy = input.uv * TextureScale;
output.t.w = 1.0f;
}
else
{
output.t.xy = input.t;
output.t.xy = input.st;
output.t.w = input.q;
}
}