diff --git a/CMakeLists.txt b/CMakeLists.txt index d9fd4f5f48..ae57e9c949 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -57,7 +57,11 @@ include(SelectPcsx2Plugins) # add additional project-wide include directories include_directories(${PROJECT_SOURCE_DIR}/common/include ${PROJECT_SOURCE_DIR}/common/include/Utilities - ${PROJECT_SOURCE_DIR}/common/include/x86emitter) + ${PROJECT_SOURCE_DIR}/common/include/x86emitter + # WORKAROUND Some issue with multiarch on Debian/Ubuntu + /usr/include/i386-linux-gnu + /usr/include/x86_64-linux-gnu + ) # make the translation if(EXISTS "${PROJECT_SOURCE_DIR}/locales") diff --git a/bin/GameIndex.dbf b/bin/GameIndex.dbf index 335e5beb1c..c86f951c85 100644 --- a/bin/GameIndex.dbf +++ b/bin/GameIndex.dbf @@ -5649,6 +5649,7 @@ Serial = SLUS-20911 Name = Shin Megami Tensei - Nocturne Region = NTSC-U Compat = 5 +eeRoundMode = 0 // Ladder glitch in "Assembly of Nihilo B11" level --------------------------------------------- Serial = SLUS-20912 Name = Superbikes TT @@ -10338,6 +10339,7 @@ Region = NTSC-U Serial = SLUS-28045 Name = Shin Megami Tensei - Nocturne [Trade Demo] Region = NTSC-U +eeRoundMode = 0 // Ladder glitch in "Assembly of Nihilo B11" level --------------------------------------------- Serial = SLUS-28046 Name = Guilty Gear Isuka [Trade Demo] @@ -13611,6 +13613,7 @@ Region = NTSC-K Serial = SLKA-25160 Name = Shin Megami Tensei III - Nocturne Maniax Region = NTSC-K +eeRoundMode = 0 // Ladder glitch in "Assembly of Nihilo B11" level --------------------------------------------- Serial = SLKA-25165 Name = Mobile Suit Gundam - Seed Destiny - Rengou vs. Z.A.F.T. II Plus @@ -17250,10 +17253,12 @@ Region = NTSC-J Serial = SLPM-65241 Name = Shin Megami Tensei 3 - Nocturne [Limited Edition] Region = NTSC-J +eeRoundMode = 0 // Ladder glitch in "Assembly of Nihilo B11" level --------------------------------------------- Serial = SLPM-65242 Name = Shin Megami Tensei 3 - Nocturne Region = NTSC-J +eeRoundMode = 0 // Ladder glitch in "Assembly of Nihilo B11" level --------------------------------------------- Serial = SLPM-65243 Name = Densha de Go! Professional 2 @@ -18019,11 +18024,13 @@ Region = NTSC-J Serial = SLPM-65461 Name = Shin Megami Tensei 3 - Nocturne - Maniacs Region = NTSC-J +eeRoundMode = 0 // Ladder glitch in "Assembly of Nihilo B11" level --------------------------------------------- Serial = SLPM-65462 Name = Shin Megami Tensei 3 - Nocturne - Maniacs Region = NTSC-J Compat = 5 +eeRoundMode = 0 // Ladder glitch in "Assembly of Nihilo B11" level --------------------------------------------- Serial = SLPM-65463 Name = Rocky @@ -23700,6 +23707,7 @@ Region = NTSC-J Serial = SLPM-74205 Name = Shin Megami Tensei III - Nocturne [PlayStation 2 The Best] Region = NTSC-J +eeRoundMode = 0 // Ladder glitch in "Assembly of Nihilo B11" level --------------------------------------------- Serial = SLPM-74206 Name = Onimusha [PlayStation 2 The Best] diff --git a/pcsx2/gui/i18n.cpp b/pcsx2/gui/i18n.cpp index 938f999782..88d9b00b97 100644 --- a/pcsx2/gui/i18n.cpp +++ b/pcsx2/gui/i18n.cpp @@ -169,6 +169,11 @@ static wxLanguage i18n_FallbackToAnotherLang( wxLanguage wxLangId ) case wxLANGUAGE_CHINESE_SINGAPORE : return wxLANGUAGE_CHINESE_SIMPLIFIED; case wxLANGUAGE_SAMI : + // The correct fallback for Sami would be + // however, currently wxWidgets (2.9.3) only supports wxLANGUAGE_SAMI. + // case: wxLANGUAGE_SAMI_LULE_SWEDEN : + // case: wxLANGUAGE_SAMI_NORTHERN_SWEDEN : + // case: wxLANGUAGE_SAMI_SOUTHERN_SWEDEN : case wxLANGUAGE_SWEDISH_FINLAND : return wxLANGUAGE_SWEDISH; case wxLANGUAGE_PORTUGUESE : return wxLANGUAGE_PORTUGUESE_BRAZILIAN; @@ -178,8 +183,30 @@ static wxLanguage i18n_FallbackToAnotherLang( wxLanguage wxLangId ) case wxLANGUAGE_GERMAN_BELGIUM : case wxLANGUAGE_GERMAN_LIECHTENSTEIN : case wxLANGUAGE_GERMAN_LUXEMBOURG : + // Currently wxWidgets (2.9.3) doesn't support Sorbian. + // case wxLANGUAGE_LOWER_SORBIAN : + // case wxLANGUAGE_UPPER_SORBIAN : case wxLANGUAGE_GERMAN_SWISS : return wxLANGUAGE_GERMAN; + case wxLANGUAGE_SPANISH_ARGENTINA: + case wxLANGUAGE_SPANISH_BOLIVIA: + case wxLANGUAGE_SPANISH_CHILE: + case wxLANGUAGE_SPANISH_COLOMBIA: + case wxLANGUAGE_SPANISH_COSTA_RICA: + case wxLANGUAGE_SPANISH_DOMINICAN_REPUBLIC: + case wxLANGUAGE_SPANISH_ECUADOR: + case wxLANGUAGE_SPANISH_EL_SALVADOR: + case wxLANGUAGE_SPANISH_GUATEMALA: + case wxLANGUAGE_SPANISH_HONDURAS: + case wxLANGUAGE_SPANISH_MEXICAN: + case wxLANGUAGE_SPANISH_NICARAGUA: + case wxLANGUAGE_SPANISH_PANAMA: + case wxLANGUAGE_SPANISH_PARAGUAY: + case wxLANGUAGE_SPANISH_PERU: + case wxLANGUAGE_SPANISH_PUERTO_RICO: + case wxLANGUAGE_SPANISH_URUGUAY: + case wxLANGUAGE_SPANISH_VENEZUELA: return wxLANGUAGE_SPANISH_MODERN; + case wxLANGUAGE_ITALIAN_SWISS : return wxLANGUAGE_ITALIAN; default : break; diff --git a/plugins/GSdx/CMakeLists.txt b/plugins/GSdx/CMakeLists.txt index d89ed12b76..5fa1f3b991 100644 --- a/plugins/GSdx/CMakeLists.txt +++ b/plugins/GSdx/CMakeLists.txt @@ -107,10 +107,6 @@ set(GSdxSources GSUtil.cpp GSVector.cpp GSVertexTrace.cpp - GSVertexTrace.x64.avx.cpp - GSVertexTrace.x86.cpp - GSVertexTrace.x86.avx.cpp - GSVertexTrace.x64.cpp GSWnd.cpp GSdx.cpp stdafx.cpp diff --git a/plugins/GSdx/GPULocalMemory.cpp b/plugins/GSdx/GPULocalMemory.cpp index 820b0534dd..c74a76c441 100644 --- a/plugins/GSdx/GPULocalMemory.cpp +++ b/plugins/GSdx/GPULocalMemory.cpp @@ -28,8 +28,8 @@ const GSVector4i GPULocalMemory::m_xxbx(0x00007c00); const GSVector4i GPULocalMemory::m_xgxx(0x000003e0); const GSVector4i GPULocalMemory::m_rxxx(0x0000001f); -#define VM_SIZE ((1 << (12 + 11)) * sizeof(uint16)) -#define VM_ALLOC_SIZE (VM_SIZE * 2) +#define VM_REAL_SIZE ((1 << (12 + 11)) * sizeof(uint16)) +#define VM_ALLOC_SIZE (VM_REAL_SIZE * 2) #define TEX_ALLOC_SIZE (256 * 256 * (1 + 1 + 4) * 32) GPULocalMemory::GPULocalMemory() @@ -39,7 +39,7 @@ GPULocalMemory::GPULocalMemory() // - int size = VM_SIZE; + int size = VM_REAL_SIZE; m_vm = (uint16*)vmalloc(VM_ALLOC_SIZE, false); diff --git a/plugins/GSdx/GS.cpp b/plugins/GSdx/GS.cpp index d16e1af9ef..582556ae8e 100644 --- a/plugins/GSdx/GS.cpp +++ b/plugins/GSdx/GS.cpp @@ -214,7 +214,7 @@ static int _GSopen(void** dsp, char* title, int renderer, int threads = -1) s_gs = NULL; } - if(renderer == 12) + if(renderer == 15) { #ifdef _WINDOWS @@ -225,12 +225,11 @@ static int _GSopen(void** dsp, char* title, int renderer, int threads = -1) return -1; } - if(s_gs == NULL) - { - s_gs = new GSRendererCS(); + delete s_gs; - s_renderer = renderer; - } + s_gs = new GSRendererCS(); + + s_renderer = renderer; #endif } diff --git a/plugins/GSdx/GS.h b/plugins/GSdx/GS.h index 9b10f47ac9..981322bcf3 100644 --- a/plugins/GSdx/GS.h +++ b/plugins/GSdx/GS.h @@ -90,6 +90,12 @@ enum GIF_REG GIF_REG_NOP = 0x0f, }; +enum GIF_REG_COMPLEX +{ + GIF_REG_STQRGBAXYZF2 = 0x00, + GIF_REG_STQRGBAXYZ2 = 0x01, +}; + enum GIF_A_D_REG { GIF_A_D_REG_PRIM = 0x00, @@ -821,7 +827,16 @@ union }; }; REG_END2 - __forceinline bool IsRepeating() {return (1 << TW) > (int)(TBW << 6) || (PSM == PSM_PSMT8 || PSM == PSM_PSMT4) && TBW == 1;} + __forceinline bool IsRepeating() + { + if(TBW < 2) + { + if(PSM == PSM_PSMT8) return TW > 7 || TH > 6; + if(PSM == PSM_PSMT4) return TW > 7 || TH > 7; + } + + return (TBW << 6) < (1u << TW); + } REG_END2 REG64_(GIFReg, TEX1) @@ -1090,21 +1105,77 @@ REG_SET_END __aligned(struct, 32) GIFPath { GIFTag tag; - uint32 reg; - uint32 nreg; uint32 nloop; - uint32 adonly; + uint32 nreg; + uint32 reg; + uint32 type; GSVector4i regs; - void SetTag(const void* mem) + enum {TYPE_UNKNOWN, TYPE_ADONLY, TYPE_STQRGBAXYZF2, TYPE_STQRGBAXYZ2}; + + __forceinline void SetTag(const void* mem) { - GSVector4i v = GSVector4i::load(mem); - GSVector4i::store(&tag, v); + const GIFTag* RESTRICT src = (const GIFTag*)mem; + + // the compiler has a hard time not reloading every time a field of src is accessed + + uint32 a = src->u32[0]; + uint32 b = src->u32[1]; + + tag.u32[0] = a; + tag.u32[1] = b; + + nloop = a & 0x7fff; + + if(nloop == 0) return; + + GSVector4i v = GSVector4i::loadl(&src->REGS); // REGS not stored to tag.REGS, only into this->regs, restored before saving the state though + + nreg = (b & 0xf0000000) ? (b >> 28) : 16; // src->NREG + regs = v.upl8(v >> 4) & GSVector4i::x0f(nreg); reg = 0; - regs = v.uph8(v >> 4) & 0x0f0f0f0f; - nreg = tag.NREG ? tag.NREG : 16; - nloop = tag.NLOOP; - adonly = regs.eq8(GSVector4i(0x0e0e0e0e)).mask() == (1 << nreg) - 1; + + type = TYPE_UNKNOWN; + + if(tag.FLG == GIF_FLG_PACKED) + { + if(regs.eq8(GSVector4i(0x0e0e0e0e)).mask() == (1 << nreg) - 1) + { + type = TYPE_ADONLY; + } + else + { + switch(nreg) + { + case 1: break; + case 2: break; + case 3: + if(regs.u32[0] == 0x00040102) type = TYPE_STQRGBAXYZF2; // many games, TODO: formats mixed with NOPs (xeno2: 040f010f02, 04010f020f, mgs3: 04010f0f02, 0401020f0f, 04010f020f) + if(regs.u32[0] == 0x00050102) type = TYPE_STQRGBAXYZ2; // GoW (has other crazy formats, like ...030503050103) + // TODO: common types with UV instead + break; + case 4: break; + case 5: break; + case 6: break; + case 7: break; + case 8: break; + case 9: + if(regs.u32[0] == 0x02040102 && regs.u32[1] == 0x01020401 && regs.u32[2] == 0x00000004) {type = TYPE_STQRGBAXYZF2; nreg = 3; nloop *= 3;} // ffx + break; + case 10: break; + case 11: break; + case 12: + if(regs.u32[0] == 0x02040102 && regs.u32[1] == 0x01020401 && regs.u32[2] == 0x04010204) {type = TYPE_STQRGBAXYZF2; nreg = 3; nloop *= 4;} // dq8 (not many, mostly 040102) + break; + case 13: break; + case 14: break; + case 15: break; + case 16: break; + default: + __assume(0); + } + } + } } __forceinline uint8 GetReg() diff --git a/plugins/GSdx/GSBlock.h b/plugins/GSdx/GSBlock.h index 0a835f04f3..ef5e4885a3 100644 --- a/plugins/GSdx/GSBlock.h +++ b/plugins/GSdx/GSBlock.h @@ -884,7 +884,7 @@ public: } } - static void ExpandBlock16(const uint16* RESTRICT src, uint8* RESTRICT dst, int dstpitch, const GIFRegTEXA& TEXA) // do not inline, uses too many xmm regs + template static void ExpandBlock16(const uint16* RESTRICT src, uint8* RESTRICT dst, int dstpitch, const GIFRegTEXA& TEXA) // do not inline, uses too many xmm regs { const GSVector4i* s = (const GSVector4i*)src; @@ -895,44 +895,36 @@ public: GSVector4i bm = m_xxbx; GSVector4i l, h; - if(TEXA.AEM) + for(int i = 0; i < 8; i++, dst += dstpitch) { - for(int i = 0; i < 8; i++, dst += dstpitch) + GSVector4i v0 = s[i * 2 + 0]; + + l = v0.upl16(v0); + h = v0.uph16(v0); + + if(AEM) { - GSVector4i v0 = s[i * 2 + 0]; - - l = v0.upl16(v0); - h = v0.uph16(v0); - ((GSVector4i*)dst)[0] = ((l & rm) << 3) | ((l & gm) << 6) | ((l & bm) << 9) | TA0.blend8(TA1, l.sra16(15)).andnot(l == GSVector4i::zero()); ((GSVector4i*)dst)[1] = ((h & rm) << 3) | ((h & gm) << 6) | ((h & bm) << 9) | TA0.blend8(TA1, h.sra16(15)).andnot(h == GSVector4i::zero()); + } + else + { + ((GSVector4i*)dst)[0] = ((l & rm) << 3) | ((l & gm) << 6) | ((l & bm) << 9) | TA0.blend(TA1, l.sra16(15)); + ((GSVector4i*)dst)[1] = ((h & rm) << 3) | ((h & gm) << 6) | ((h & bm) << 9) | TA0.blend(TA1, h.sra16(15)); + } - GSVector4i v1 = s[i * 2 + 1]; + GSVector4i v1 = s[i * 2 + 1]; - l = v1.upl16(v1); - h = v1.uph16(v1); + l = v1.upl16(v1); + h = v1.uph16(v1); + if(AEM) + { ((GSVector4i*)dst)[2] = ((l & rm) << 3) | ((l & gm) << 6) | ((l & bm) << 9) | TA0.blend8(TA1, l.sra16(15)).andnot(l == GSVector4i::zero()); ((GSVector4i*)dst)[3] = ((h & rm) << 3) | ((h & gm) << 6) | ((h & bm) << 9) | TA0.blend8(TA1, h.sra16(15)).andnot(h == GSVector4i::zero()); } - } - else - { - for(int i = 0; i < 8; i++, dst += dstpitch) + else { - GSVector4i v0 = s[i * 2 + 0]; - - l = v0.upl16(v0); - h = v0.uph16(v0); - - ((GSVector4i*)dst)[0] = ((l & rm) << 3) | ((l & gm) << 6) | ((l & bm) << 9) | TA0.blend(TA1, l.sra16(15)); - ((GSVector4i*)dst)[1] = ((h & rm) << 3) | ((h & gm) << 6) | ((h & bm) << 9) | TA0.blend(TA1, h.sra16(15)); - - GSVector4i v1 = s[i * 2 + 1]; - - l = v1.upl16(v1); - h = v1.uph16(v1); - ((GSVector4i*)dst)[2] = ((l & rm) << 3) | ((l & gm) << 6) | ((l & bm) << 9) | TA0.blend(TA1, l.sra16(15)); ((GSVector4i*)dst)[3] = ((h & rm) << 3) | ((h & gm) << 6) | ((h & bm) << 9) | TA0.blend(TA1, h.sra16(15)); } @@ -1432,6 +1424,56 @@ public: } } } + template __forceinline static GSVector4i Expand16to32(const GSVector4i& c, const GSVector4i& TA0, const GSVector4i& TA1) + { + return ((c & m_rxxx) << 3) | ((c & m_xgxx) << 6) | ((c & m_xxbx) << 9) | (AEM ? TA0.blend8(TA1, c.sra16(15)).andnot(c == GSVector4i::zero()) : TA0.blend(TA1, c.sra16(15))); + } + + template __forceinline static void ReadAndExpandBlock16(const uint8* RESTRICT src, uint8* RESTRICT dst, int dstpitch, const GIFRegTEXA& TEXA) + { + #if 0 // not faster + + const GSVector4i* s = (const GSVector4i*)src; + + GSVector4i TA0(TEXA.TA0 << 24); + GSVector4i TA1(TEXA.TA1 << 24); + + for(int i = 0; i < 4; i++, dst += dstpitch * 2) + { + GSVector4i v0 = s[i * 4 + 0]; + GSVector4i v1 = s[i * 4 + 1]; + GSVector4i v2 = s[i * 4 + 2]; + GSVector4i v3 = s[i * 4 + 3]; + + GSVector4i::sw16(v0, v1, v2, v3); + GSVector4i::sw32(v0, v1, v2, v3); + GSVector4i::sw16(v0, v2, v1, v3); + + GSVector4i* d0 = (GSVector4i*)&dst[dstpitch * 0]; + + d0[0] = Expand16to32(v0.upl16(v0), TA0, TA1); + d0[1] = Expand16to32(v0.uph16(v0), TA0, TA1); + d0[2] = Expand16to32(v1.upl16(v1), TA0, TA1); + d0[3] = Expand16to32(v1.uph16(v1), TA0, TA1); + + GSVector4i* d1 = (GSVector4i*)&dst[dstpitch * 1]; + + d1[0] = Expand16to32(v2.upl16(v2), TA0, TA1); + d1[1] = Expand16to32(v2.uph16(v2), TA0, TA1); + d1[2] = Expand16to32(v3.upl16(v3), TA0, TA1); + d1[3] = Expand16to32(v3.uph16(v3), TA0, TA1); + } + + #else + + __aligned(uint16, 32) block[16 * 8]; + + ReadBlock16(src, (uint8*)block, sizeof(block) / 8); + + ExpandBlock16(block, dst, dstpitch, TEXA); + + #endif + } __forceinline static void ReadAndExpandBlock8_32(const uint8* RESTRICT src, uint8* RESTRICT dst, int dstpitch, const uint32* RESTRICT pal) { diff --git a/plugins/GSdx/GSClut.cpp b/plugins/GSdx/GSClut.cpp index bb032f1ac5..582aff7af6 100644 --- a/plugins/GSdx/GSClut.cpp +++ b/plugins/GSdx/GSClut.cpp @@ -389,6 +389,8 @@ void GSClut::GetAlphaMinMax32(int& amin, int& amax) void GSClut::WriteCLUT_T32_I8_CSM1(const uint32* RESTRICT src, uint16* RESTRICT clut) { + // 4 blocks + for(int i = 0; i < 64; i += 16) { WriteCLUT_T32_I4_CSM1(&src[i + 0], &clut[i * 2 + 0]); @@ -400,6 +402,8 @@ void GSClut::WriteCLUT_T32_I8_CSM1(const uint32* RESTRICT src, uint16* RESTRICT __forceinline void GSClut::WriteCLUT_T32_I4_CSM1(const uint32* RESTRICT src, uint16* RESTRICT clut) { + // 1 block + GSVector4i* s = (GSVector4i*)src; GSVector4i* d = (GSVector4i*)clut; @@ -420,6 +424,8 @@ __forceinline void GSClut::WriteCLUT_T32_I4_CSM1(const uint32* RESTRICT src, uin void GSClut::WriteCLUT_T16_I8_CSM1(const uint16* RESTRICT src, uint16* RESTRICT clut) { + // 2 blocks + GSVector4i* s = (GSVector4i*)src; GSVector4i* d = (GSVector4i*)clut; @@ -443,6 +449,8 @@ void GSClut::WriteCLUT_T16_I8_CSM1(const uint16* RESTRICT src, uint16* RESTRICT __forceinline void GSClut::WriteCLUT_T16_I4_CSM1(const uint16* RESTRICT src, uint16* RESTRICT clut) { + // 1 block (half) + for(int i = 0; i < 16; i++) { clut[i] = src[clutTableT16I4[i]]; diff --git a/plugins/GSdx/GSDevice.h b/plugins/GSdx/GSDevice.h index 8ed3d884f0..3ce8770a2f 100644 --- a/plugins/GSdx/GSDevice.h +++ b/plugins/GSdx/GSDevice.h @@ -103,6 +103,7 @@ public: virtual void BeginScene() {} virtual void DrawPrimitive() {}; virtual void DrawIndexedPrimitive() {} + virtual void DrawIndexedPrimitive(int offset, int count) {} virtual void EndScene(); virtual void ClearRenderTarget(GSTexture* t, const GSVector4& c) {} diff --git a/plugins/GSdx/GSDevice11.cpp b/plugins/GSdx/GSDevice11.cpp index 7e253f04e3..f936be4079 100644 --- a/plugins/GSdx/GSDevice11.cpp +++ b/plugins/GSdx/GSDevice11.cpp @@ -98,8 +98,6 @@ bool GSDevice11::Create(GSWnd* wnd) hr = D3D11CreateDeviceAndSwapChain(NULL, D3D_DRIVER_TYPE_HARDWARE, NULL, flags, levels, countof(levels), D3D11_SDK_VERSION, &scd, &m_swapchain, &m_dev, &level, &m_ctx); // hr = D3D11CreateDeviceAndSwapChain(NULL, D3D_DRIVER_TYPE_REFERENCE, NULL, flags, NULL, 0, D3D11_SDK_VERSION, &scd, &m_swapchain, &m_dev, &level, &m_ctx); - //return false; - if(FAILED(hr)) return false; if(!SetFeatureLevel(level, true)) @@ -360,6 +358,13 @@ void GSDevice11::DrawIndexedPrimitive() m_ctx->DrawIndexed(m_index.count, m_index.start, m_vertex.start); } +void GSDevice11::DrawIndexedPrimitive(int offset, int count) +{ + ASSERT(offset + count <= m_index.count); + + m_ctx->DrawIndexed(count, m_index.start + offset, m_vertex.start); +} + void GSDevice11::Dispatch(uint32 x, uint32 y, uint32 z) { m_ctx->Dispatch(x, y, z); @@ -720,6 +725,18 @@ void GSDevice11::SetupDATE(GSTexture* rt, GSTexture* ds, const GSVertexPT1* vert } void GSDevice11::IASetVertexBuffer(const void* vertex, size_t stride, size_t count) +{ + void* ptr = NULL; + + if(IAMapVertexBuffer(&ptr, stride, count)) + { + GSVector4i::storent(ptr, vertex, count * stride); + + IAUnmapVertexBuffer(); + } +} + +bool GSDevice11::IAMapVertexBuffer(void** vertex, size_t stride, size_t count) { ASSERT(m_vertex.count == 0); @@ -729,7 +746,6 @@ void GSDevice11::IASetVertexBuffer(const void* vertex, size_t stride, size_t cou m_vb = NULL; m_vertex.start = 0; - m_vertex.count = 0; m_vertex.limit = std::max(count * 3 / 2, 11000); } @@ -748,7 +764,7 @@ void GSDevice11::IASetVertexBuffer(const void* vertex, size_t stride, size_t cou hr = m_dev->CreateBuffer(&bd, NULL, &m_vb); - if(FAILED(hr)) return; + if(FAILED(hr)) return false; } D3D11_MAP type = D3D11_MAP_WRITE_NO_OVERWRITE; @@ -762,17 +778,24 @@ void GSDevice11::IASetVertexBuffer(const void* vertex, size_t stride, size_t cou D3D11_MAPPED_SUBRESOURCE m; - if(SUCCEEDED(m_ctx->Map(m_vb, 0, type, 0, &m))) + if(FAILED(m_ctx->Map(m_vb, 0, type, 0, &m))) { - GSVector4i::storent((uint8*)m.pData + m_vertex.start * stride, vertex, count * stride); - - m_ctx->Unmap(m_vb, 0); + return false; } + *vertex = (uint8*)m.pData + m_vertex.start * stride; + m_vertex.count = count; m_vertex.stride = stride; - IASetVertexBuffer(m_vb, stride); + return true; +} + +void GSDevice11::IAUnmapVertexBuffer() +{ + m_ctx->Unmap(m_vb, 0); + + IASetVertexBuffer(m_vb, m_vertex.stride); } void GSDevice11::IASetVertexBuffer(ID3D11Buffer* vb, size_t stride) @@ -798,7 +821,7 @@ void GSDevice11::IASetIndexBuffer(const void* index, size_t count) m_ib_old = m_ib; m_ib = NULL; - m_index.count = 0; + m_index.start = 0; m_index.limit = std::max(count * 3 / 2, 11000); } @@ -904,7 +927,11 @@ void GSDevice11::PSSetShaderResources(GSTexture* sr0, GSTexture* sr1) { PSSetShaderResource(0, sr0); PSSetShaderResource(1, sr1); - PSSetShaderResource(2, NULL); + + for(int i = 2; i < countof(m_state.ps_srv); i++) + { + PSSetShaderResource(i, NULL); + } } void GSDevice11::PSSetShaderResource(int i, GSTexture* sr) @@ -913,6 +940,13 @@ void GSDevice11::PSSetShaderResource(int i, GSTexture* sr) if(sr) srv = *(GSTexture11*)sr; + PSSetShaderResourceView(i, srv); +} + +void GSDevice11::PSSetShaderResourceView(int i, ID3D11ShaderResourceView* srv) +{ + ASSERT(i < countof(m_state.ps_srv)); + if(m_state.ps_srv[i] != srv) { m_state.ps_srv[i] = srv; @@ -944,14 +978,14 @@ void GSDevice11::PSSetShader(ID3D11PixelShader* ps, ID3D11Buffer* ps_cb) if(m_srv_changed) { - m_ctx->PSSetShaderResources(0, 3, m_state.ps_srv); + m_ctx->PSSetShaderResources(0, countof(m_state.ps_srv), m_state.ps_srv); m_srv_changed = false; } if(m_ss_changed) { - m_ctx->PSSetSamplers(0, 3, m_state.ps_ss); + m_ctx->PSSetSamplers(0, countof(m_state.ps_ss), m_state.ps_ss); m_ss_changed = false; } @@ -966,9 +1000,9 @@ void GSDevice11::PSSetShader(ID3D11PixelShader* ps, ID3D11Buffer* ps_cb) void GSDevice11::CSSetShaderSRV(int i, ID3D11ShaderResourceView* srv) { - // TODO: if(m_state.cs_srv[i] != srv) + if(m_state.cs_srv[i] != srv) { - // TODO: m_state.cs_srv[i] = srv; + m_state.cs_srv[i] = srv; m_ctx->CSSetShaderResources(i, 1, &srv); } @@ -976,17 +1010,14 @@ void GSDevice11::CSSetShaderSRV(int i, ID3D11ShaderResourceView* srv) void GSDevice11::CSSetShaderUAV(int i, ID3D11UnorderedAccessView* uav) { - // TODO: if(m_state.cs_uav[i] != uav) - { - // TODO: m_state.cs_uav[i] = uav; + uint32 counters[8]; + + memset(counters, 0, sizeof(counters)); - // uint32 count[] = {-1}; - - m_ctx->CSSetUnorderedAccessViews(i, 1, &uav, NULL); - } + m_ctx->CSSetUnorderedAccessViews(i, 1, &uav, counters); } -void GSDevice11::CSSetShader(ID3D11ComputeShader* cs) +void GSDevice11::CSSetShader(ID3D11ComputeShader* cs, ID3D11Buffer* cs_cb) { if(m_state.cs != cs) { @@ -994,6 +1025,13 @@ void GSDevice11::CSSetShader(ID3D11ComputeShader* cs) m_ctx->CSSetShader(cs, NULL, 0); } + + if(m_state.cs_cb != cs_cb) + { + m_state.cs_cb = cs_cb; + + m_ctx->CSSetConstantBuffers(0, 1, &cs_cb); + } } void GSDevice11::OMSetDepthStencilState(ID3D11DepthStencilState* dss, uint8 sref) @@ -1064,6 +1102,41 @@ void GSDevice11::OMSetRenderTargets(GSTexture* rt, GSTexture* ds, const GSVector } } +void GSDevice11::OMSetRenderTargets(const GSVector2i& rtsize, int count, ID3D11UnorderedAccessView** uav, uint32* counters, const GSVector4i* scissor) +{ + m_ctx->OMSetRenderTargetsAndUnorderedAccessViews(0, NULL, NULL, 0, count, uav, counters); + + m_state.rtv = NULL; + m_state.dsv = NULL; + + if(m_state.viewport != rtsize) + { + m_state.viewport = rtsize; + + D3D11_VIEWPORT vp; + + memset(&vp, 0, sizeof(vp)); + + vp.TopLeftX = 0; + vp.TopLeftY = 0; + vp.Width = (float)rtsize.x; + vp.Height = (float)rtsize.y; + vp.MinDepth = 0.0f; + vp.MaxDepth = 1.0f; + + m_ctx->RSSetViewports(1, &vp); + } + + GSVector4i r = scissor ? *scissor : GSVector4i(rtsize).zwxy(); + + if(!m_state.scissor.eq(r)) + { + m_state.scissor = r; + + m_ctx->RSSetScissorRects(1, r); + } +} + HRESULT GSDevice11::CompileShader(uint32 id, const char* entry, D3D11_SHADER_MACRO* macro, ID3D11VertexShader** vs, D3D11_INPUT_ELEMENT_DESC* layout, int count, ID3D11InputLayout** il) { HRESULT hr; @@ -1135,6 +1208,38 @@ HRESULT GSDevice11::CompileShader(uint32 id, const char* entry, D3D11_SHADER_MAC return hr; } +HRESULT GSDevice11::CompileShader(uint32 id, const char* entry, D3D11_SHADER_MACRO* macro, ID3D11GeometryShader** gs, D3D11_SO_DECLARATION_ENTRY* layout, int count) +{ + HRESULT hr; + + vector m; + + PrepareShaderMacro(m, macro); + + CComPtr shader, error; + + hr = D3DX11CompileFromResource(theApp.GetModuleHandle(), MAKEINTRESOURCE(id), NULL, &m[0], NULL, entry, m_shader.gs.c_str(), 0, 0, NULL, &shader, &error, NULL); + + if(error) + { + printf("%s\n", (const char*)error->GetBufferPointer()); + } + + if(FAILED(hr)) + { + return hr; + } + + hr = m_dev->CreateGeometryShaderWithStreamOutput((void*)shader->GetBufferPointer(), shader->GetBufferSize(), layout, count, NULL, 0, D3D11_SO_NO_RASTERIZED_STREAM, NULL, gs); + + if(FAILED(hr)) + { + return hr; + } + + return hr; +} + HRESULT GSDevice11::CompileShader(uint32 id, const char* entry, D3D11_SHADER_MACRO* macro, ID3D11PixelShader** ps) { HRESULT hr; @@ -1177,7 +1282,7 @@ HRESULT GSDevice11::CompileShader(uint32 id, const char* entry, D3D11_SHADER_MAC CComPtr shader, error; - hr = D3DX11CompileFromResource(theApp.GetModuleHandle(), MAKEINTRESOURCE(id), NULL, &m[0], NULL, entry, m_shader.ps.c_str(), 0, 0, NULL, &shader, &error, NULL); + hr = D3DX11CompileFromResource(theApp.GetModuleHandle(), MAKEINTRESOURCE(id), NULL, &m[0], NULL, entry, m_shader.cs.c_str(), 0, 0, NULL, &shader, &error, NULL); if(error) { diff --git a/plugins/GSdx/GSDevice11.h b/plugins/GSdx/GSDevice11.h index 443e6f43d3..8e8b4afb9a 100644 --- a/plugins/GSdx/GSDevice11.h +++ b/plugins/GSdx/GSDevice11.h @@ -60,11 +60,13 @@ class GSDevice11 : public GSDeviceDX ID3D11VertexShader* vs; ID3D11Buffer* vs_cb; ID3D11GeometryShader* gs; - ID3D11ShaderResourceView* ps_srv[3]; + ID3D11ShaderResourceView* ps_srv[16]; ID3D11PixelShader* ps; ID3D11Buffer* ps_cb; ID3D11SamplerState* ps_ss[3]; + ID3D11ShaderResourceView* cs_srv[16]; ID3D11ComputeShader* cs; + ID3D11Buffer* cs_cb; GSVector2i viewport; GSVector4i scissor; ID3D11DepthStencilState* dss; @@ -146,6 +148,7 @@ public: void DrawPrimitive(); void DrawIndexedPrimitive(); + void DrawIndexedPrimitive(int offset, int count); void Dispatch(uint32 x, uint32 y, uint32 z); void ClearRenderTarget(GSTexture* t, const GSVector4& c); @@ -169,6 +172,8 @@ public: void StretchRect(GSTexture* st, const GSVector4& sr, GSTexture* dt, const GSVector4& dr, ID3D11PixelShader* ps, ID3D11Buffer* ps_cb, ID3D11BlendState* bs, bool linear = true); void IASetVertexBuffer(const void* vertex, size_t stride, size_t count); + bool IAMapVertexBuffer(void** vertex, size_t stride, size_t count); + void IAUnmapVertexBuffer(); void IASetVertexBuffer(ID3D11Buffer* vb, size_t stride); void IASetIndexBuffer(const void* index, size_t count); void IASetIndexBuffer(ID3D11Buffer* ib); @@ -178,16 +183,17 @@ public: void GSSetShader(ID3D11GeometryShader* gs); void PSSetShaderResources(GSTexture* sr0, GSTexture* sr1); void PSSetShaderResource(int i, GSTexture* sr); + void PSSetShaderResourceView(int i, ID3D11ShaderResourceView* srv); void PSSetShader(ID3D11PixelShader* ps, ID3D11Buffer* ps_cb); void PSSetSamplerState(ID3D11SamplerState* ss0, ID3D11SamplerState* ss1, ID3D11SamplerState* ss2 = NULL); void CSSetShaderSRV(int i, ID3D11ShaderResourceView* srv); void CSSetShaderUAV(int i, ID3D11UnorderedAccessView* uav); - void CSSetShader(ID3D11ComputeShader* cs); + void CSSetShader(ID3D11ComputeShader* cs, ID3D11Buffer* cs_cb); void OMSetDepthStencilState(ID3D11DepthStencilState* dss, uint8 sref); void OMSetBlendState(ID3D11BlendState* bs, float bf); void OMSetRenderTargets(GSTexture* rt, GSTexture* ds, const GSVector4i* scissor = NULL); + void OMSetRenderTargets(const GSVector2i& rtsize, int count, ID3D11UnorderedAccessView** uav, uint32* counters, const GSVector4i* scissor = NULL); - void SetupIA(const void* vertex, int vertex_count, const uint32* index, int index_count, int prim); void SetupVS(VSSelector sel, const VSConstantBuffer* cb); void SetupGS(GSSelector sel); void SetupPS(PSSelector sel, const PSConstantBuffer* cb, PSSamplerSelector ssel); @@ -202,6 +208,7 @@ public: HRESULT CompileShader(uint32 id, const char* entry, D3D11_SHADER_MACRO* macro, ID3D11VertexShader** vs, D3D11_INPUT_ELEMENT_DESC* layout, int count, ID3D11InputLayout** il); HRESULT CompileShader(uint32 id, const char* entry, D3D11_SHADER_MACRO* macro, ID3D11GeometryShader** gs); + HRESULT CompileShader(uint32 id, const char* entry, D3D11_SHADER_MACRO* macro, ID3D11GeometryShader** gs, D3D11_SO_DECLARATION_ENTRY* layout, int count); HRESULT CompileShader(uint32 id, const char* entry, D3D11_SHADER_MACRO* macro, ID3D11PixelShader** ps); HRESULT CompileShader(uint32 id, const char* entry, D3D11_SHADER_MACRO* macro, ID3D11ComputeShader** cs); HRESULT CompileShader(const char* fn, const char* entry, D3D11_SHADER_MACRO* macro, ID3D11ComputeShader** cs); diff --git a/plugins/GSdx/GSDevice9.cpp b/plugins/GSdx/GSDevice9.cpp index 3a8c875f50..b912b90198 100644 --- a/plugins/GSdx/GSDevice9.cpp +++ b/plugins/GSdx/GSDevice9.cpp @@ -911,6 +911,18 @@ void GSDevice9::SetupDATE(GSTexture* rt, GSTexture* ds, const GSVertexPT1* verti } void GSDevice9::IASetVertexBuffer(const void* vertex, size_t stride, size_t count) +{ + void* ptr = NULL; + + if(IAMapVertexBuffer(&ptr, stride, count)) + { + GSVector4i::storent(ptr, vertex, count * stride); + + IAUnmapVertexBuffer(); + } +} + +bool GSDevice9::IAMapVertexBuffer(void** vertex, size_t stride, size_t count) { ASSERT(m_vertex.count == 0); @@ -930,7 +942,7 @@ void GSDevice9::IASetVertexBuffer(const void* vertex, size_t stride, size_t coun hr = m_dev->CreateVertexBuffer(m_vertex.limit * stride, D3DUSAGE_DYNAMIC | D3DUSAGE_WRITEONLY, 0, D3DPOOL_DEFAULT, &m_vb, NULL); - if(FAILED(hr)) return; + if(FAILED(hr)) return false; } uint32 flags = D3DLOCK_NOOVERWRITE; @@ -942,19 +954,22 @@ void GSDevice9::IASetVertexBuffer(const void* vertex, size_t stride, size_t coun flags = D3DLOCK_DISCARD; } - void* ptr = NULL; - - if(SUCCEEDED(m_vb->Lock(m_vertex.start * stride, count * stride, &ptr, flags))) + if(FAILED(m_vb->Lock(m_vertex.start * stride, count * stride, vertex, flags))) { - GSVector4i::storent(ptr, vertex, count * stride); - - m_vb->Unlock(); + return false; } m_vertex.count = count; m_vertex.stride = stride; - IASetVertexBuffer(m_vb, stride); + return true; +} + +void GSDevice9::IAUnmapVertexBuffer() +{ + m_vb->Unlock(); + + IASetVertexBuffer(m_vb, m_vertex.stride); } void GSDevice9::IASetVertexBuffer(IDirect3DVertexBuffer9* vb, size_t stride) diff --git a/plugins/GSdx/GSDevice9.h b/plugins/GSdx/GSDevice9.h index 3e46c94214..319e063d0f 100644 --- a/plugins/GSdx/GSDevice9.h +++ b/plugins/GSdx/GSDevice9.h @@ -196,6 +196,8 @@ public: void StretchRect(GSTexture* st, const GSVector4& sr, GSTexture* dt, const GSVector4& dr, IDirect3DPixelShader9* ps, const float* ps_cb, int ps_cb_len, Direct3DBlendState9* bs, bool linear = true); void IASetVertexBuffer(const void* vertex, size_t stride, size_t count); + bool IAMapVertexBuffer(void** vertex, size_t stride, size_t count); + void IAUnmapVertexBuffer(); void IASetVertexBuffer(IDirect3DVertexBuffer9* vb, size_t stride); void IASetIndexBuffer(const void* index, size_t count); void IASetIndexBuffer(IDirect3DIndexBuffer9* ib); @@ -216,7 +218,6 @@ public: HRESULT CompileShader(uint32 id, const string& entry, const D3DXMACRO* macro, IDirect3DVertexShader9** vs, const D3DVERTEXELEMENT9* layout, int count, IDirect3DVertexDeclaration9** il); HRESULT CompileShader(uint32 id, const string& entry, const D3DXMACRO* macro, IDirect3DPixelShader9** ps); - void SetupIA(const void* vertex, int vertex_count, const uint32* index, int index_count, int prim); void SetupVS(VSSelector sel, const VSConstantBuffer* cb); void SetupGS(GSSelector sel) {} void SetupPS(PSSelector sel, const PSConstantBuffer* cb, PSSamplerSelector ssel); diff --git a/plugins/GSdx/GSDeviceDX.h b/plugins/GSdx/GSDeviceDX.h index 9d2f954472..4ce845665b 100644 --- a/plugins/GSdx/GSDeviceDX.h +++ b/plugins/GSdx/GSDeviceDX.h @@ -279,7 +279,6 @@ public: bool SetFeatureLevel(D3D_FEATURE_LEVEL level, bool compat_mode); void GetFeatureLevel(D3D_FEATURE_LEVEL& level) const {level = m_shader.level;} - virtual void SetupIA(const void* vertex, int vertex_count, const uint32* index, int index_count, int prim) = 0; virtual void SetupVS(VSSelector sel, const VSConstantBuffer* cb) = 0; virtual void SetupGS(GSSelector sel) = 0; virtual void SetupPS(PSSelector sel, const PSConstantBuffer* cb, PSSamplerSelector ssel) = 0; diff --git a/plugins/GSdx/GSDrawScanline.cpp b/plugins/GSdx/GSDrawScanline.cpp index 3678d2d687..af79af75f0 100644 --- a/plugins/GSdx/GSDrawScanline.cpp +++ b/plugins/GSdx/GSDrawScanline.cpp @@ -91,6 +91,7 @@ void GSDrawScanline::BeginDraw(const GSRasterizerData* data) sel.fb = m_global.sel.fb; sel.zb = m_global.sel.zb; sel.zoverflow = m_global.sel.zoverflow; + sel.notest = m_global.sel.notest; m_sp = m_sp_map[sel]; } @@ -272,17 +273,24 @@ void GSDrawScanline::DrawScanline(int pixels, int left, int top, const GSVertexS // Init - int skip = left & 3; + int skip, steps; - left -= skip; - - int steps = pixels + skip - 4; + if(!sel.notest) + { + skip = left & 3; + steps = pixels + skip - 4; + left -= skip; + test = GSDrawScanlineCodeGenerator::m_test[skip] | GSDrawScanlineCodeGenerator::m_test[7 + (steps & (steps >> 31))]; + } + else + { + skip = 0; + steps = pixels - 4; + } const GSVector2i* fza_base = &m_global.fzbr[top]; const GSVector2i* fza_offset = &m_global.fzbc[left >> 2]; - test = GSDrawScanlineCodeGenerator::m_test[skip] | GSDrawScanlineCodeGenerator::m_test[7 + (steps & (steps >> 31))]; - if(sel.prim != GS_SPRITE_CLASS) { if(sel.fwrite && sel.fge) @@ -318,7 +326,7 @@ void GSDrawScanline::DrawScanline(int pixels, int left, int top, const GSVertexS } else if(sel.ltf) { - vf = v.xxzzlh().srl16(1); + vf = v.xxzzlh().srl16(16 - GS_BILINEAR_PRECISION).sll16(15 - GS_BILINEAR_PRECISION); } s = GSVector4::cast(u); @@ -508,8 +516,8 @@ void GSDrawScanline::DrawScanline(int pixels, int left, int top, const GSVertexS u -= 0x8000; v -= 0x8000; - uf = u.xxzzlh().srl16(1); - vf = v.xxzzlh().srl16(1); + uf = u.xxzzlh().srl16(16 - GS_BILINEAR_PRECISION).sll16(15 - GS_BILINEAR_PRECISION); + vf = v.xxzzlh().srl16(16 - GS_BILINEAR_PRECISION).sll16(15 - GS_BILINEAR_PRECISION); } GSVector4i uv0 = u.sra32(16).ps32(v.sra32(16)); @@ -629,8 +637,8 @@ void GSDrawScanline::DrawScanline(int pixels, int left, int top, const GSVertexS u -= 0x8000; v -= 0x8000; - uf = u.xxzzlh().srl16(1); - vf = v.xxzzlh().srl16(1); + uf = u.xxzzlh().srl16(16 - GS_BILINEAR_PRECISION).sll16(15 - GS_BILINEAR_PRECISION); + vf = v.xxzzlh().srl16(16 - GS_BILINEAR_PRECISION).sll16(15 - GS_BILINEAR_PRECISION); } GSVector4i uv0 = u.sra32(16).ps32(v.sra32(16)); @@ -764,11 +772,11 @@ void GSDrawScanline::DrawScanline(int pixels, int left, int top, const GSVertexS if(sel.ltf) { - uf = u.xxzzlh().srl16(1); + uf = u.xxzzlh().srl16(16 - GS_BILINEAR_PRECISION).sll16(15 - GS_BILINEAR_PRECISION); if(sel.prim != GS_SPRITE_CLASS) { - vf = v.xxzzlh().srl16(1); + vf = v.xxzzlh().srl16(16 - GS_BILINEAR_PRECISION).sll16(15 - GS_BILINEAR_PRECISION); } } @@ -1000,27 +1008,30 @@ void GSDrawScanline::DrawScanline(int pixels, int left, int top, const GSVertexS int fzm = 0; - if(sel.fwrite) + if(!sel.notest) { - fm |= test; - } + if(sel.fwrite) + { + fm |= test; + } - if(sel.zwrite) - { - zm |= test; - } + if(sel.zwrite) + { + zm |= test; + } - if(sel.fwrite && sel.zwrite) - { - fzm = ~(fm == GSVector4i::xffffffff()).ps32(zm == GSVector4i::xffffffff()).mask(); - } - else if(sel.fwrite) - { - fzm = ~(fm == GSVector4i::xffffffff()).ps32().mask(); - } - else if(sel.zwrite) - { - fzm = ~(zm == GSVector4i::xffffffff()).ps32().mask(); + if(sel.fwrite && sel.zwrite) + { + fzm = ~(fm == GSVector4i::xffffffff()).ps32(zm == GSVector4i::xffffffff()).mask(); + } + else if(sel.fwrite) + { + fzm = ~(fm == GSVector4i::xffffffff()).ps32().mask(); + } + else if(sel.zwrite) + { + fzm = ~(zm == GSVector4i::xffffffff()).ps32().mask(); + } } // WriteZBuf @@ -1030,16 +1041,39 @@ void GSDrawScanline::DrawScanline(int pixels, int left, int top, const GSVertexS if(sel.ztest && sel.zpsm < 2) { zs = zs.blend8(zd, zm); + } - if(fzm & 0x0f00) GSVector4i::storel((uint8*)m_global.vm + za * 2, zs); - if(fzm & 0xf000) GSVector4i::storeh((uint8*)m_global.vm + za * 2 + 16, zs); + bool fast = sel.ztest ? sel.zpsm < 2 : sel.zpsm == 0 && sel.notest; + + if(sel.notest) + { + if(fast) + { + GSVector4i::storel((uint8*)m_global.vm + za * 2, zs); + GSVector4i::storeh((uint8*)m_global.vm + za * 2 + 16, zs); + } + else + { + WritePixel(zs, za, 0, sel.zpsm); + WritePixel(zs, za, 1, sel.zpsm); + WritePixel(zs, za, 2, sel.zpsm); + WritePixel(zs, za, 3, sel.zpsm); + } } else { - if(fzm & 0x0300) WritePixel(zs, za, 0, sel.zpsm); - if(fzm & 0x0c00) WritePixel(zs, za, 1, sel.zpsm); - if(fzm & 0x3000) WritePixel(zs, za, 2, sel.zpsm); - if(fzm & 0xc000) WritePixel(zs, za, 3, sel.zpsm); + if(fast) + { + if(fzm & 0x0f00) GSVector4i::storel((uint8*)m_global.vm + za * 2, zs); + if(fzm & 0xf000) GSVector4i::storeh((uint8*)m_global.vm + za * 2 + 16, zs); + } + else + { + if(fzm & 0x0300) WritePixel(zs, za, 0, sel.zpsm); + if(fzm & 0x0c00) WritePixel(zs, za, 1, sel.zpsm); + if(fzm & 0x3000) WritePixel(zs, za, 2, sel.zpsm); + if(fzm & 0xc000) WritePixel(zs, za, 3, sel.zpsm); + } } } @@ -1197,17 +1231,37 @@ void GSDrawScanline::DrawScanline(int pixels, int left, int top, const GSVertexS fs = fs.blend(fd, fm); } - if(sel.rfb && sel.fpsm < 2) + bool fast = sel.rfb ? sel.fpsm < 2 : sel.fpsm == 0 && sel.notest; + + if(sel.notest) { - if(fzm & 0x000f) GSVector4i::storel((uint8*)m_global.vm + fa * 2, fs); - if(fzm & 0x00f0) GSVector4i::storeh((uint8*)m_global.vm + fa * 2 + 16, fs); + if(fast) + { + GSVector4i::storel((uint8*)m_global.vm + fa * 2, fs); + GSVector4i::storeh((uint8*)m_global.vm + fa * 2 + 16, fs); + } + else + { + WritePixel(fs, fa, 0, sel.fpsm); + WritePixel(fs, fa, 1, sel.fpsm); + WritePixel(fs, fa, 2, sel.fpsm); + WritePixel(fs, fa, 3, sel.fpsm); + } } else { - if(fzm & 0x0003) WritePixel(fs, fa, 0, sel.fpsm); - if(fzm & 0x000c) WritePixel(fs, fa, 1, sel.fpsm); - if(fzm & 0x0030) WritePixel(fs, fa, 2, sel.fpsm); - if(fzm & 0x00c0) WritePixel(fs, fa, 3, sel.fpsm); + if(fast) + { + if(fzm & 0x000f) GSVector4i::storel((uint8*)m_global.vm + fa * 2, fs); + if(fzm & 0x00f0) GSVector4i::storeh((uint8*)m_global.vm + fa * 2 + 16, fs); + } + else + { + if(fzm & 0x0003) WritePixel(fs, fa, 0, sel.fpsm); + if(fzm & 0x000c) WritePixel(fs, fa, 1, sel.fpsm); + if(fzm & 0x0030) WritePixel(fs, fa, 2, sel.fpsm); + if(fzm & 0x00c0) WritePixel(fs, fa, 3, sel.fpsm); + } } } } @@ -1273,7 +1327,10 @@ void GSDrawScanline::DrawScanline(int pixels, int left, int top, const GSVertexS } } - test = GSDrawScanlineCodeGenerator::m_test[7 + (steps & (steps >> 31))]; + if(!sel.notest) + { + test = GSDrawScanlineCodeGenerator::m_test[7 + (steps & (steps >> 31))]; + } } } @@ -1492,6 +1549,7 @@ void GSDrawScanline::DrawRectT(const int* RESTRICT row, const int* RESTRICT col, if(masked) ASSERT(mask.u32[0] != 0); color = color.andnot(mask); + c = color.extract32<0>(); GSVector4i br = r.ralign(GSVector2i(8 * 4 / sizeof(T), 8)); diff --git a/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.avx.cpp b/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.avx.cpp index b67d9dfd30..c0d938f10c 100644 --- a/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.avx.cpp +++ b/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.avx.cpp @@ -250,31 +250,40 @@ L("exit"); void GSDrawScanlineCodeGenerator::Init() { - // int skip = left & 3; + if(!m_sel.notest) + { + // int skip = left & 3; - mov(ebx, edx); - and(edx, 3); + mov(ebx, edx); + and(edx, 3); - // left -= skip; + // int steps = pixels + skip - 4; - sub(ebx, edx); + lea(ecx, ptr[ecx + edx - 4]); - // int steps = pixels + skip - 4; + // left -= skip; - lea(ecx, ptr[ecx + edx - 4]); + sub(ebx, edx); - // GSVector4i test = m_test[skip] | m_test[7 + (steps & (steps >> 31))]; + // GSVector4i test = m_test[skip] | m_test[7 + (steps & (steps >> 31))]; - shl(edx, 4); + shl(edx, 4); - vmovdqa(xmm7, ptr[edx + (size_t)&m_test[0]]); + vmovdqa(xmm7, ptr[edx + (size_t)&m_test[0]]); - mov(eax, ecx); - sar(eax, 31); - and(eax, ecx); - shl(eax, 4); + mov(eax, ecx); + sar(eax, 31); + and(eax, ecx); + shl(eax, 4); - vpor(xmm7, ptr[eax + (size_t)&m_test[7]]); + vpor(xmm7, ptr[eax + (size_t)&m_test[7]]); + } + else + { + mov(ebx, edx); // left + xor(edx, edx); // skip + lea(ecx, ptr[ecx - 4]); // steps + } // GSVector2i* fza_base = &m_local.gd->fzbr[top]; @@ -380,7 +389,8 @@ void GSDrawScanlineCodeGenerator::Init() { vpshuflw(xmm6, xmm3, _MM_SHUFFLE(2, 2, 0, 0)); vpshufhw(xmm6, xmm6, _MM_SHUFFLE(2, 2, 0, 0)); - vpsrlw(xmm6, 1); + vpsrlw(xmm6, 16 - GS_BILINEAR_PRECISION); + if(GS_BILINEAR_PRECISION < 15) vpsllw(xmm6, 15 - GS_BILINEAR_PRECISION); vmovdqa(ptr[&m_local.temp.vf], xmm6); } } @@ -573,14 +583,17 @@ void GSDrawScanlineCodeGenerator::Step() } } - // test = m_test[7 + (steps & (steps >> 31))]; + if(!m_sel.notest) + { + // test = m_test[7 + (steps & (steps >> 31))]; - mov(edx, ecx); - sar(edx, 31); - and(edx, ecx); - shl(edx, 4); + mov(edx, ecx); + sar(edx, 31); + and(edx, ecx); + shl(edx, 4); - vmovdqa(xmm7, ptr[edx + (size_t)&m_test[7]]); + vmovdqa(xmm7, ptr[edx + (size_t)&m_test[7]]); + } } void GSDrawScanlineCodeGenerator::TestZ(const Xmm& temp1, const Xmm& temp2) @@ -730,7 +743,8 @@ void GSDrawScanlineCodeGenerator::SampleTexture() vpshuflw(xmm0, xmm2, _MM_SHUFFLE(2, 2, 0, 0)); vpshufhw(xmm0, xmm0, _MM_SHUFFLE(2, 2, 0, 0)); - vpsrlw(xmm0, 1); + vpsrlw(xmm0, 16 - GS_BILINEAR_PRECISION); + if(GS_BILINEAR_PRECISION < 15) vpsllw(xmm0, 15 - GS_BILINEAR_PRECISION); vmovdqa(ptr[&m_local.temp.uf], xmm0); if(m_sel.prim != GS_SPRITE_CLASS) @@ -739,7 +753,8 @@ void GSDrawScanlineCodeGenerator::SampleTexture() vpshuflw(xmm0, xmm3, _MM_SHUFFLE(2, 2, 0, 0)); vpshufhw(xmm0, xmm0, _MM_SHUFFLE(2, 2, 0, 0)); - vpsrlw(xmm0, 1); + vpsrlw(xmm0, 16 - GS_BILINEAR_PRECISION); + if(GS_BILINEAR_PRECISION < 15) vpsllw(xmm0, 15 - GS_BILINEAR_PRECISION); vmovdqa(ptr[&m_local.temp.vf], xmm0); } } @@ -1283,14 +1298,16 @@ return; vpshuflw(xmm0, xmm2, _MM_SHUFFLE(2, 2, 0, 0)); vpshufhw(xmm0, xmm0, _MM_SHUFFLE(2, 2, 0, 0)); - vpsrlw(xmm0, 1); + vpsrlw(xmm0, 16 - GS_BILINEAR_PRECISION); + if(GS_BILINEAR_PRECISION < 15) vpsllw(xmm0, 15 - GS_BILINEAR_PRECISION); vmovdqa(ptr[&m_local.temp.uf], xmm0); // GSVector4i vf = v.xxzzlh().srl16(1); vpshuflw(xmm0, xmm3, _MM_SHUFFLE(2, 2, 0, 0)); vpshufhw(xmm0, xmm0, _MM_SHUFFLE(2, 2, 0, 0)); - vpsrlw(xmm0, 1); + vpsrlw(xmm0, 16 - GS_BILINEAR_PRECISION); + if(GS_BILINEAR_PRECISION < 15) vpsllw(xmm0, 15 - GS_BILINEAR_PRECISION); vmovdqa(ptr[&m_local.temp.vf], xmm0); } @@ -1524,14 +1541,16 @@ return; vpshuflw(xmm0, xmm2, _MM_SHUFFLE(2, 2, 0, 0)); vpshufhw(xmm0, xmm0, _MM_SHUFFLE(2, 2, 0, 0)); - vpsrlw(xmm0, 1); + vpsrlw(xmm0, 16 - GS_BILINEAR_PRECISION); + if(GS_BILINEAR_PRECISION < 15) vpsllw(xmm0, 15 - GS_BILINEAR_PRECISION); vmovdqa(ptr[&m_local.temp.uf], xmm0); // GSVector4i vf = v.xxzzlh().srl16(1); vpshuflw(xmm0, xmm3, _MM_SHUFFLE(2, 2, 0, 0)); vpshufhw(xmm0, xmm0, _MM_SHUFFLE(2, 2, 0, 0)); - vpsrlw(xmm0, 1); + vpsrlw(xmm0, 16 - GS_BILINEAR_PRECISION); + if(GS_BILINEAR_PRECISION < 15) vpsllw(xmm0, 15 - GS_BILINEAR_PRECISION); vmovdqa(ptr[&m_local.temp.vf], xmm0); } @@ -2302,6 +2321,11 @@ void GSDrawScanlineCodeGenerator::TestDestAlpha() void GSDrawScanlineCodeGenerator::WriteMask() { + if(m_sel.notest) + { + return; + } + // fm |= test; // zm |= test; @@ -2348,17 +2372,17 @@ void GSDrawScanlineCodeGenerator::WriteZBuf() return; } - bool fast = m_sel.ztest && m_sel.zpsm < 2; - vmovdqa(xmm1, ptr[m_sel.prim != GS_SPRITE_CLASS ? &m_local.temp.zs : &m_local.p.z]); - if(fast) + if(m_sel.ztest && m_sel.zpsm < 2) { // zs = zs.blend8(zd, zm); vpblendvb(xmm1, ptr[&m_local.temp.zd], xmm4); } + bool fast = m_sel.ztest ? m_sel.zpsm < 2 : m_sel.zpsm == 0 && m_sel.notest; + WritePixel(xmm1, ebp, dh, fast, m_sel.zpsm, 1); } @@ -2664,7 +2688,7 @@ void GSDrawScanlineCodeGenerator::WriteFrame() blend(xmm5, xmm2, xmm3); // TODO: could be skipped in certain cases, depending on fpsm and fm } - bool fast = m_sel.rfb && m_sel.fpsm < 2; + bool fast = m_sel.rfb ? m_sel.fpsm < 2 : m_sel.fpsm == 0 && m_sel.notest; WritePixel(xmm5, ebx, dl, fast, m_sel.fpsm, 0); } @@ -2677,49 +2701,67 @@ void GSDrawScanlineCodeGenerator::ReadPixel(const Xmm& dst, const Reg32& addr) void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg32& addr, const Reg8& mask, bool fast, int psm, int fz) { - if(fast) + if(m_sel.notest) { - // if(fzm & 0x0f) GSVector4i::storel(&vm16[addr + 0], fs); - // if(fzm & 0xf0) GSVector4i::storeh(&vm16[addr + 8], fs); - - test(mask, 0x0f); - je("@f"); - vmovq(qword[addr * 2 + (size_t)m_local.gd->vm], src); - L("@@"); - - test(mask, 0xf0); - je("@f"); - vmovhps(qword[addr * 2 + (size_t)m_local.gd->vm + 8 * 2], src); - L("@@"); - - // vmaskmovps? + if(fast) + { + vmovq(qword[addr * 2 + (size_t)m_local.gd->vm], src); + vmovhps(qword[addr * 2 + (size_t)m_local.gd->vm + 8 * 2], src); + } + else + { + WritePixel(src, addr, 0, psm); + WritePixel(src, addr, 1, psm); + WritePixel(src, addr, 2, psm); + WritePixel(src, addr, 3, psm); + } } else { - // if(fzm & 0x03) WritePixel(fpsm, &vm16[addr + 0], fs.extract32<0>()); - // if(fzm & 0x0c) WritePixel(fpsm, &vm16[addr + 2], fs.extract32<1>()); - // if(fzm & 0x30) WritePixel(fpsm, &vm16[addr + 8], fs.extract32<2>()); - // if(fzm & 0xc0) WritePixel(fpsm, &vm16[addr + 10], fs.extract32<3>()); + if(fast) + { + // if(fzm & 0x0f) GSVector4i::storel(&vm16[addr + 0], fs); + // if(fzm & 0xf0) GSVector4i::storeh(&vm16[addr + 8], fs); - test(mask, 0x03); - je("@f"); - WritePixel(src, addr, 0, psm); - L("@@"); + test(mask, 0x0f); + je("@f"); + vmovq(qword[addr * 2 + (size_t)m_local.gd->vm], src); + L("@@"); - test(mask, 0x0c); - je("@f"); - WritePixel(src, addr, 1, psm); - L("@@"); + test(mask, 0xf0); + je("@f"); + vmovhps(qword[addr * 2 + (size_t)m_local.gd->vm + 8 * 2], src); + L("@@"); - test(mask, 0x30); - je("@f"); - WritePixel(src, addr, 2, psm); - L("@@"); + // vmaskmovps? + } + else + { + // if(fzm & 0x03) WritePixel(fpsm, &vm16[addr + 0], fs.extract32<0>()); + // if(fzm & 0x0c) WritePixel(fpsm, &vm16[addr + 2], fs.extract32<1>()); + // if(fzm & 0x30) WritePixel(fpsm, &vm16[addr + 8], fs.extract32<2>()); + // if(fzm & 0xc0) WritePixel(fpsm, &vm16[addr + 10], fs.extract32<3>()); - test(mask, 0xc0); - je("@f"); - WritePixel(src, addr, 3, psm); - L("@@"); + test(mask, 0x03); + je("@f"); + WritePixel(src, addr, 0, psm); + L("@@"); + + test(mask, 0x0c); + je("@f"); + WritePixel(src, addr, 1, psm); + L("@@"); + + test(mask, 0x30); + je("@f"); + WritePixel(src, addr, 2, psm); + L("@@"); + + test(mask, 0xc0); + je("@f"); + WritePixel(src, addr, 3, psm); + L("@@"); + } } } diff --git a/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.cpp b/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.cpp index aad049e47b..b37dc11638 100644 --- a/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.cpp +++ b/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.cpp @@ -250,31 +250,40 @@ L("exit"); void GSDrawScanlineCodeGenerator::Init() { - // int skip = left & 3; + if(!m_sel.notest) + { + // int skip = left & 3; - mov(ebx, edx); - and(edx, 3); + mov(ebx, edx); + and(edx, 3); - // left -= skip; + // int steps = pixels + skip - 4; - sub(ebx, edx); + lea(ecx, ptr[ecx + edx - 4]); - // int steps = pixels + skip - 4; + // left -= skip; - lea(ecx, ptr[ecx + edx - 4]); + sub(ebx, edx); - // GSVector4i test = m_test[skip] | m_test[7 + (steps & (steps >> 31))]; + // GSVector4i test = m_test[skip] | m_test[7 + (steps & (steps >> 31))]; - shl(edx, 4); + shl(edx, 4); - movdqa(xmm7, ptr[edx + (size_t)&m_test[0]]); + movdqa(xmm7, ptr[edx + (size_t)&m_test[0]]); - mov(eax, ecx); - sar(eax, 31); - and(eax, ecx); - shl(eax, 4); + mov(eax, ecx); + sar(eax, 31); + and(eax, ecx); + shl(eax, 4); - por(xmm7, ptr[eax + (size_t)&m_test[7]]); + por(xmm7, ptr[eax + (size_t)&m_test[7]]); + } + else + { + mov(ebx, edx); // left + xor(edx, edx); // skip + lea(ecx, ptr[ecx - 4]); // steps + } // GSVector2i* fza_base = &m_local.gd->fzbr[top]; @@ -380,7 +389,8 @@ void GSDrawScanlineCodeGenerator::Init() { pshuflw(xmm6, xmm3, _MM_SHUFFLE(2, 2, 0, 0)); pshufhw(xmm6, xmm6, _MM_SHUFFLE(2, 2, 0, 0)); - psrlw(xmm6, 1); + psrlw(xmm6, 16 - GS_BILINEAR_PRECISION); + if(GS_BILINEAR_PRECISION < 15) psllw(xmm6, 15 - GS_BILINEAR_PRECISION); movdqa(ptr[&m_local.temp.vf], xmm6); } } @@ -578,14 +588,17 @@ void GSDrawScanlineCodeGenerator::Step() } } - // test = m_test[7 + (steps & (steps >> 31))]; + if(!m_sel.notest) + { + // test = m_test[7 + (steps & (steps >> 31))]; - mov(edx, ecx); - sar(edx, 31); - and(edx, ecx); - shl(edx, 4); + mov(edx, ecx); + sar(edx, 31); + and(edx, ecx); + shl(edx, 4); - movdqa(xmm7, ptr[edx + (size_t)&m_test[7]]); + movdqa(xmm7, ptr[edx + (size_t)&m_test[7]]); + } } void GSDrawScanlineCodeGenerator::TestZ(const Xmm& temp1, const Xmm& temp2) @@ -735,7 +748,8 @@ void GSDrawScanlineCodeGenerator::SampleTexture() pshuflw(xmm0, xmm2, _MM_SHUFFLE(2, 2, 0, 0)); pshufhw(xmm0, xmm0, _MM_SHUFFLE(2, 2, 0, 0)); - psrlw(xmm0, 1); + psrlw(xmm0, 16 - GS_BILINEAR_PRECISION); + if(GS_BILINEAR_PRECISION < 15) psllw(xmm0, 15 - GS_BILINEAR_PRECISION); movdqa(ptr[&m_local.temp.uf], xmm0); if(m_sel.prim != GS_SPRITE_CLASS) @@ -744,7 +758,8 @@ void GSDrawScanlineCodeGenerator::SampleTexture() pshuflw(xmm0, xmm3, _MM_SHUFFLE(2, 2, 0, 0)); pshufhw(xmm0, xmm0, _MM_SHUFFLE(2, 2, 0, 0)); - psrlw(xmm0, 1); + psrlw(xmm0, 16 - GS_BILINEAR_PRECISION); + if(GS_BILINEAR_PRECISION < 15) psllw(xmm0, 15 - GS_BILINEAR_PRECISION); movdqa(ptr[&m_local.temp.vf], xmm0); } } @@ -1338,14 +1353,16 @@ void GSDrawScanlineCodeGenerator::SampleTextureLOD() pshuflw(xmm0, xmm2, _MM_SHUFFLE(2, 2, 0, 0)); pshufhw(xmm0, xmm0, _MM_SHUFFLE(2, 2, 0, 0)); - psrlw(xmm0, 1); + psrlw(xmm0, 16 - GS_BILINEAR_PRECISION); + if(GS_BILINEAR_PRECISION < 15) psllw(xmm0, 15 - GS_BILINEAR_PRECISION); movdqa(ptr[&m_local.temp.uf], xmm0); // GSVector4i vf = v.xxzzlh().srl16(1); pshuflw(xmm0, xmm3, _MM_SHUFFLE(2, 2, 0, 0)); pshufhw(xmm0, xmm0, _MM_SHUFFLE(2, 2, 0, 0)); - psrlw(xmm0, 1); + psrlw(xmm0, 16 - GS_BILINEAR_PRECISION); + if(GS_BILINEAR_PRECISION < 15) psllw(xmm0, 15 - GS_BILINEAR_PRECISION); movdqa(ptr[&m_local.temp.vf], xmm0); } @@ -1591,14 +1608,16 @@ void GSDrawScanlineCodeGenerator::SampleTextureLOD() pshuflw(xmm0, xmm2, _MM_SHUFFLE(2, 2, 0, 0)); pshufhw(xmm0, xmm0, _MM_SHUFFLE(2, 2, 0, 0)); - psrlw(xmm0, 1); + psrlw(xmm0, 16 - GS_BILINEAR_PRECISION); + if(GS_BILINEAR_PRECISION < 15) psllw(xmm0, 15 - GS_BILINEAR_PRECISION); movdqa(ptr[&m_local.temp.uf], xmm0); // GSVector4i vf = v.xxzzlh().srl16(1); pshuflw(xmm0, xmm3, _MM_SHUFFLE(2, 2, 0, 0)); pshufhw(xmm0, xmm0, _MM_SHUFFLE(2, 2, 0, 0)); - psrlw(xmm0, 1); + psrlw(xmm0, 16 - GS_BILINEAR_PRECISION); + if(GS_BILINEAR_PRECISION < 15) psllw(xmm0, 15 - GS_BILINEAR_PRECISION); movdqa(ptr[&m_local.temp.vf], xmm0); } @@ -2415,6 +2434,11 @@ void GSDrawScanlineCodeGenerator::TestDestAlpha() void GSDrawScanlineCodeGenerator::WriteMask() { + if(m_sel.notest) + { + return; + } + // fm |= test; // zm |= test; @@ -2462,11 +2486,9 @@ void GSDrawScanlineCodeGenerator::WriteZBuf() return; } - bool fast = m_sel.ztest && m_sel.zpsm < 2; - movdqa(xmm1, ptr[m_sel.prim != GS_SPRITE_CLASS ? &m_local.temp.zs : &m_local.p.z]); - if(fast) + if(m_sel.ztest && m_sel.zpsm < 2) { // zs = zs.blend8(zd, zm); @@ -2475,6 +2497,8 @@ void GSDrawScanlineCodeGenerator::WriteZBuf() blend8(xmm1, xmm7); } + bool fast = m_sel.ztest ? m_sel.zpsm < 2 : m_sel.zpsm == 0 && m_sel.notest; + WritePixel(xmm1, ebp, dh, fast, m_sel.zpsm, 1); } @@ -2804,7 +2828,7 @@ void GSDrawScanlineCodeGenerator::WriteFrame() blend(xmm5, xmm2, xmm3); // TODO: could be skipped in certain cases, depending on fpsm and fm } - bool fast = m_sel.rfb && m_sel.fpsm < 2; + bool fast = m_sel.rfb ? m_sel.fpsm < 2 : m_sel.fpsm == 0 && m_sel.notest; WritePixel(xmm5, ebx, dl, fast, m_sel.fpsm, 0); } @@ -2817,47 +2841,65 @@ void GSDrawScanlineCodeGenerator::ReadPixel(const Xmm& dst, const Reg32& addr) void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg32& addr, const Reg8& mask, bool fast, int psm, int fz) { - if(fast) + if(m_sel.notest) { - // if(fzm & 0x0f) GSVector4i::storel(&vm16[addr + 0], fs); - // if(fzm & 0xf0) GSVector4i::storeh(&vm16[addr + 8], fs); - - test(mask, 0x0f); - je("@f"); - movq(qword[addr * 2 + (size_t)m_local.gd->vm], src); - L("@@"); - - test(mask, 0xf0); - je("@f"); - movhps(qword[addr * 2 + (size_t)m_local.gd->vm + 8 * 2], src); - L("@@"); + if(fast) + { + movq(qword[addr * 2 + (size_t)m_local.gd->vm], src); + movhps(qword[addr * 2 + (size_t)m_local.gd->vm + 8 * 2], src); + } + else + { + WritePixel(src, addr, 0, psm); + WritePixel(src, addr, 1, psm); + WritePixel(src, addr, 2, psm); + WritePixel(src, addr, 3, psm); + } } else { - // if(fzm & 0x03) WritePixel(fpsm, &vm16[addr + 0], fs.extract32<0>()); - // if(fzm & 0x0c) WritePixel(fpsm, &vm16[addr + 2], fs.extract32<1>()); - // if(fzm & 0x30) WritePixel(fpsm, &vm16[addr + 8], fs.extract32<2>()); - // if(fzm & 0xc0) WritePixel(fpsm, &vm16[addr + 10], fs.extract32<3>()); + if(fast) + { + // if(fzm & 0x0f) GSVector4i::storel(&vm16[addr + 0], fs); + // if(fzm & 0xf0) GSVector4i::storeh(&vm16[addr + 8], fs); - test(mask, 0x03); - je("@f"); - WritePixel(src, addr, 0, psm); - L("@@"); + test(mask, 0x0f); + je("@f"); + movq(qword[addr * 2 + (size_t)m_local.gd->vm], src); + L("@@"); - test(mask, 0x0c); - je("@f"); - WritePixel(src, addr, 1, psm); - L("@@"); + test(mask, 0xf0); + je("@f"); + movhps(qword[addr * 2 + (size_t)m_local.gd->vm + 8 * 2], src); + L("@@"); + } + else + { + // if(fzm & 0x03) WritePixel(fpsm, &vm16[addr + 0], fs.extract32<0>()); + // if(fzm & 0x0c) WritePixel(fpsm, &vm16[addr + 2], fs.extract32<1>()); + // if(fzm & 0x30) WritePixel(fpsm, &vm16[addr + 8], fs.extract32<2>()); + // if(fzm & 0xc0) WritePixel(fpsm, &vm16[addr + 10], fs.extract32<3>()); - test(mask, 0x30); - je("@f"); - WritePixel(src, addr, 2, psm); - L("@@"); + test(mask, 0x03); + je("@f"); + WritePixel(src, addr, 0, psm); + L("@@"); - test(mask, 0xc0); - je("@f"); - WritePixel(src, addr, 3, psm); - L("@@"); + test(mask, 0x0c); + je("@f"); + WritePixel(src, addr, 1, psm); + L("@@"); + + test(mask, 0x30); + je("@f"); + WritePixel(src, addr, 2, psm); + L("@@"); + + test(mask, 0xc0); + je("@f"); + WritePixel(src, addr, 3, psm); + L("@@"); + } } } diff --git a/plugins/GSdx/GSDrawingContext.h b/plugins/GSdx/GSDrawingContext.h index d3a7f8b996..73f3206c08 100644 --- a/plugins/GSdx/GSDrawingContext.h +++ b/plugins/GSdx/GSDrawingContext.h @@ -53,7 +53,8 @@ public: GSOffset* fb; GSOffset* zb; GSOffset* tex; - GSPixelOffset4* fzb; + GSPixelOffset* fzb; + GSPixelOffset4* fzb4; } offset; GSDrawingContext() diff --git a/plugins/GSdx/GSLocalMemory.cpp b/plugins/GSdx/GSLocalMemory.cpp index 4bffdf475a..dfdd11274a 100644 --- a/plugins/GSdx/GSLocalMemory.cpp +++ b/plugins/GSdx/GSLocalMemory.cpp @@ -342,55 +342,55 @@ GSLocalMemory::GSLocalMemory() m_psm[PSM_PSMCT24].rtx = &GSLocalMemory::ReadTexture24; m_psm[PSM_PSMCT16].rtx = &GSLocalMemory::ReadTexture16; - m_psm[PSM_PSMCT16S].rtx = &GSLocalMemory::ReadTexture16S; + m_psm[PSM_PSMCT16S].rtx = &GSLocalMemory::ReadTexture16; m_psm[PSM_PSMT8].rtx = &GSLocalMemory::ReadTexture8; m_psm[PSM_PSMT4].rtx = &GSLocalMemory::ReadTexture4; m_psm[PSM_PSMT8H].rtx = &GSLocalMemory::ReadTexture8H; m_psm[PSM_PSMT4HL].rtx = &GSLocalMemory::ReadTexture4HL; m_psm[PSM_PSMT4HH].rtx = &GSLocalMemory::ReadTexture4HH; - m_psm[PSM_PSMZ32].rtx = &GSLocalMemory::ReadTexture32Z; - m_psm[PSM_PSMZ24].rtx = &GSLocalMemory::ReadTexture24Z; - m_psm[PSM_PSMZ16].rtx = &GSLocalMemory::ReadTexture16Z; - m_psm[PSM_PSMZ16S].rtx = &GSLocalMemory::ReadTexture16SZ; + m_psm[PSM_PSMZ32].rtx = &GSLocalMemory::ReadTexture32; + m_psm[PSM_PSMZ24].rtx = &GSLocalMemory::ReadTexture24; + m_psm[PSM_PSMZ16].rtx = &GSLocalMemory::ReadTexture16; + m_psm[PSM_PSMZ16S].rtx = &GSLocalMemory::ReadTexture16; m_psm[PSM_PSMCT24].rtxP = &GSLocalMemory::ReadTexture24; m_psm[PSM_PSMCT16].rtxP = &GSLocalMemory::ReadTexture16; - m_psm[PSM_PSMCT16S].rtxP = &GSLocalMemory::ReadTexture16S; + m_psm[PSM_PSMCT16S].rtxP = &GSLocalMemory::ReadTexture16; m_psm[PSM_PSMT8].rtxP = &GSLocalMemory::ReadTexture8P; m_psm[PSM_PSMT4].rtxP = &GSLocalMemory::ReadTexture4P; m_psm[PSM_PSMT8H].rtxP = &GSLocalMemory::ReadTexture8HP; m_psm[PSM_PSMT4HL].rtxP = &GSLocalMemory::ReadTexture4HLP; m_psm[PSM_PSMT4HH].rtxP = &GSLocalMemory::ReadTexture4HHP; - m_psm[PSM_PSMZ32].rtxP = &GSLocalMemory::ReadTexture32Z; - m_psm[PSM_PSMZ24].rtxP = &GSLocalMemory::ReadTexture24Z; - m_psm[PSM_PSMZ16].rtxP = &GSLocalMemory::ReadTexture16Z; - m_psm[PSM_PSMZ16S].rtxP = &GSLocalMemory::ReadTexture16SZ; + m_psm[PSM_PSMZ32].rtxP = &GSLocalMemory::ReadTexture32; + m_psm[PSM_PSMZ24].rtxP = &GSLocalMemory::ReadTexture24; + m_psm[PSM_PSMZ16].rtxP = &GSLocalMemory::ReadTexture16; + m_psm[PSM_PSMZ16S].rtxP = &GSLocalMemory::ReadTexture16; m_psm[PSM_PSMCT24].rtxb = &GSLocalMemory::ReadTextureBlock24; m_psm[PSM_PSMCT16].rtxb = &GSLocalMemory::ReadTextureBlock16; - m_psm[PSM_PSMCT16S].rtxb = &GSLocalMemory::ReadTextureBlock16S; + m_psm[PSM_PSMCT16S].rtxb = &GSLocalMemory::ReadTextureBlock16; m_psm[PSM_PSMT8].rtxb = &GSLocalMemory::ReadTextureBlock8; m_psm[PSM_PSMT4].rtxb = &GSLocalMemory::ReadTextureBlock4; m_psm[PSM_PSMT8H].rtxb = &GSLocalMemory::ReadTextureBlock8H; m_psm[PSM_PSMT4HL].rtxb = &GSLocalMemory::ReadTextureBlock4HL; m_psm[PSM_PSMT4HH].rtxb = &GSLocalMemory::ReadTextureBlock4HH; - m_psm[PSM_PSMZ32].rtxb = &GSLocalMemory::ReadTextureBlock32Z; - m_psm[PSM_PSMZ24].rtxb = &GSLocalMemory::ReadTextureBlock24Z; - m_psm[PSM_PSMZ16].rtxb = &GSLocalMemory::ReadTextureBlock16Z; - m_psm[PSM_PSMZ16S].rtxb = &GSLocalMemory::ReadTextureBlock16SZ; + m_psm[PSM_PSMZ32].rtxb = &GSLocalMemory::ReadTextureBlock32; + m_psm[PSM_PSMZ24].rtxb = &GSLocalMemory::ReadTextureBlock24; + m_psm[PSM_PSMZ16].rtxb = &GSLocalMemory::ReadTextureBlock16; + m_psm[PSM_PSMZ16S].rtxb = &GSLocalMemory::ReadTextureBlock16; m_psm[PSM_PSMCT24].rtxbP = &GSLocalMemory::ReadTextureBlock24; m_psm[PSM_PSMCT16].rtxbP = &GSLocalMemory::ReadTextureBlock16; - m_psm[PSM_PSMCT16S].rtxbP = &GSLocalMemory::ReadTextureBlock16S; + m_psm[PSM_PSMCT16S].rtxbP = &GSLocalMemory::ReadTextureBlock16; m_psm[PSM_PSMT8].rtxbP = &GSLocalMemory::ReadTextureBlock8P; m_psm[PSM_PSMT4].rtxbP = &GSLocalMemory::ReadTextureBlock4P; m_psm[PSM_PSMT8H].rtxbP = &GSLocalMemory::ReadTextureBlock8HP; m_psm[PSM_PSMT4HL].rtxbP = &GSLocalMemory::ReadTextureBlock4HLP; m_psm[PSM_PSMT4HH].rtxbP = &GSLocalMemory::ReadTextureBlock4HHP; - m_psm[PSM_PSMZ32].rtxbP = &GSLocalMemory::ReadTextureBlock32Z; - m_psm[PSM_PSMZ24].rtxbP = &GSLocalMemory::ReadTextureBlock24Z; - m_psm[PSM_PSMZ16].rtxbP = &GSLocalMemory::ReadTextureBlock16Z; - m_psm[PSM_PSMZ16S].rtxbP = &GSLocalMemory::ReadTextureBlock16SZ; + m_psm[PSM_PSMZ32].rtxbP = &GSLocalMemory::ReadTextureBlock32; + m_psm[PSM_PSMZ24].rtxbP = &GSLocalMemory::ReadTextureBlock24; + m_psm[PSM_PSMZ16].rtxbP = &GSLocalMemory::ReadTextureBlock16; + m_psm[PSM_PSMZ16S].rtxbP = &GSLocalMemory::ReadTextureBlock16; m_psm[PSM_PSMCT16].bpp = m_psm[PSM_PSMCT16S].bpp = 16; m_psm[PSM_PSMT8].bpp = 8; @@ -473,6 +473,62 @@ GSOffset* GSLocalMemory::GetOffset(uint32 bp, uint32 bw, uint32 psm) return o; } +GSPixelOffset* GSLocalMemory::GetPixelOffset(const GIFRegFRAME& FRAME, const GIFRegZBUF& ZBUF) +{ + uint32 fbp = FRAME.Block(); + uint32 zbp = ZBUF.Block(); + uint32 fpsm = FRAME.PSM; + uint32 zpsm = ZBUF.PSM; + uint32 bw = FRAME.FBW; + + ASSERT(m_psm[fpsm].trbpp > 8 || m_psm[zpsm].trbpp > 8); + + // "(psm & 0x0f) ^ ((psm & 0xf0) >> 2)" creates 4 bit unique identifiers for render target formats (only) + + uint32 fpsm_hash = (fpsm & 0x0f) ^ ((fpsm & 0x30) >> 2); + uint32 zpsm_hash = (zpsm & 0x0f) ^ ((zpsm & 0x30) >> 2); + + uint32 hash = (FRAME.FBP << 0) | (ZBUF.ZBP << 9) | (bw << 18) | (fpsm_hash << 24) | (zpsm_hash << 28); + + hash_map::iterator i = m_pomap.find(hash); + + if(i != m_pomap.end()) + { + return i->second; + } + + GSPixelOffset* o = (GSPixelOffset*)_aligned_malloc(sizeof(GSPixelOffset), 32); + + o->hash = hash; + o->fbp = fbp; + o->zbp = zbp; + o->fpsm = fpsm; + o->zpsm = zpsm; + o->bw = bw; + + pixelAddress fpa = m_psm[fpsm].pa; + pixelAddress zpa = m_psm[zpsm].pa; + + int fs = m_psm[fpsm].bpp >> 5; + int zs = m_psm[zpsm].bpp >> 5; + + for(int i = 0; i < 2048; i++) + { + o->row[i].x = (int)fpa(0, i, fbp, bw) << fs; + o->row[i].y = (int)zpa(0, i, zbp, bw) << zs; + } + + for(int i = 0; i < 2048; i++) + { + o->col[i].x = m_psm[fpsm].rowOffset[0][i] << fs; + o->col[i].y = m_psm[zpsm].rowOffset[0][i] << zs; + } + + m_pomap[hash] = o; + + return o; +} + GSPixelOffset4* GSLocalMemory::GetPixelOffset4(const GIFRegFRAME& FRAME, const GIFRegZBUF& ZBUF) { uint32 fbp = FRAME.Block(); @@ -1550,28 +1606,22 @@ void GSLocalMemory::ReadTexture24(const GSOffset* RESTRICT o, const GSVector4i& void GSLocalMemory::ReadTexture16(const GSOffset* RESTRICT o, const GSVector4i& r, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) { - __aligned(uint16, 32) block[16 * 8]; - - FOREACH_BLOCK_START(r, 16, 8, 32) + if(TEXA.AEM) { - ReadBlock16(src, (uint8*)block, sizeof(block) / 8); - - ExpandBlock16(block, dst, dstpitch, TEXA); + FOREACH_BLOCK_START(r, 16, 8, 32) + { + ReadAndExpandBlock16(src, dst, dstpitch, TEXA); + } + FOREACH_BLOCK_END } - FOREACH_BLOCK_END -} - -void GSLocalMemory::ReadTexture16S(const GSOffset* RESTRICT o, const GSVector4i& r, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) -{ - __aligned(uint16, 32) block[16 * 8]; - - FOREACH_BLOCK_START(r, 16, 8, 32) + else { - ReadBlock16(src, (uint8*)block, sizeof(block) / 8); - - ExpandBlock16(block, dst, dstpitch, TEXA); + FOREACH_BLOCK_START(r, 16, 8, 32) + { + ReadAndExpandBlock16(src, dst, dstpitch, TEXA); + } + FOREACH_BLOCK_END } - FOREACH_BLOCK_END } void GSLocalMemory::ReadTexture8(const GSOffset* RESTRICT o, const GSVector4i& r, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) @@ -1629,61 +1679,6 @@ void GSLocalMemory::ReadTexture4HH(const GSOffset* RESTRICT o, const GSVector4i& FOREACH_BLOCK_END } -void GSLocalMemory::ReadTexture32Z(const GSOffset* RESTRICT o, const GSVector4i& r, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) -{ - FOREACH_BLOCK_START(r, 8, 8, 32) - { - ReadBlock32(src, dst, dstpitch); - } - FOREACH_BLOCK_END -} - -void GSLocalMemory::ReadTexture24Z(const GSOffset* RESTRICT o, const GSVector4i& r, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) -{ - if(TEXA.AEM) - { - FOREACH_BLOCK_START(r, 8, 8, 32) - { - ReadAndExpandBlock24(src, dst, dstpitch, TEXA); - } - FOREACH_BLOCK_END - } - else - { - FOREACH_BLOCK_START(r, 8, 8, 32) - { - ReadAndExpandBlock24(src, dst, dstpitch, TEXA); - } - FOREACH_BLOCK_END - } -} - -void GSLocalMemory::ReadTexture16Z(const GSOffset* RESTRICT o, const GSVector4i& r, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) -{ - __aligned(uint16, 32) block[16 * 8]; - - FOREACH_BLOCK_START(r, 16, 8, 32) - { - ReadBlock16(src, (uint8*)block, sizeof(block) / 8); - - ExpandBlock16(block, dst, dstpitch, TEXA); - } - FOREACH_BLOCK_END -} - -void GSLocalMemory::ReadTexture16SZ(const GSOffset* RESTRICT o, const GSVector4i& r, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) -{ - __aligned(uint16, 32) block[16 * 8]; - - FOREACH_BLOCK_START(r, 16, 8, 32) - { - ReadBlock16(src, (uint8*)block, sizeof(block) / 8); - - ExpandBlock16(block, dst, dstpitch, TEXA); - } - FOREACH_BLOCK_END -} - /////////////////// void GSLocalMemory::ReadTextureBlock32(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const @@ -1709,20 +1704,16 @@ void GSLocalMemory::ReadTextureBlock24(uint32 bp, uint8* dst, int dstpitch, cons void GSLocalMemory::ReadTextureBlock16(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const { - __aligned(uint16, 32) block[16 * 8]; + ALIGN_STACK(32); - ReadBlock16(BlockPtr(bp), (uint8*)block, sizeof(block) / 8); - - ExpandBlock16(block, dst, dstpitch, TEXA); -} - -void GSLocalMemory::ReadTextureBlock16S(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const -{ - __aligned(uint16, 32) block[16 * 8]; - - ReadBlock16(BlockPtr(bp), (uint8*)block, sizeof(block) / 8); - - ExpandBlock16(block, dst, dstpitch, TEXA); + if(TEXA.AEM) + { + ReadAndExpandBlock16(BlockPtr(bp), dst, dstpitch, TEXA); + } + else + { + ReadAndExpandBlock16(BlockPtr(bp), dst, dstpitch, TEXA); + } } void GSLocalMemory::ReadTextureBlock8(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const @@ -1760,45 +1751,6 @@ void GSLocalMemory::ReadTextureBlock4HH(uint32 bp, uint8* dst, int dstpitch, con ReadAndExpandBlock4HH_32(BlockPtr(bp), dst, dstpitch, m_clut); } -void GSLocalMemory::ReadTextureBlock32Z(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const -{ - ALIGN_STACK(32); - - ReadBlock32(BlockPtr(bp), dst, dstpitch); -} - -void GSLocalMemory::ReadTextureBlock24Z(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const -{ - ALIGN_STACK(32); - - if(TEXA.AEM) - { - ReadAndExpandBlock24(BlockPtr(bp), dst, dstpitch, TEXA); - } - else - { - ReadAndExpandBlock24(BlockPtr(bp), dst, dstpitch, TEXA); - } -} - -void GSLocalMemory::ReadTextureBlock16Z(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const -{ - __aligned(uint16, 32) block[16 * 8]; - - ReadBlock16(BlockPtr(bp), (uint8*)block, sizeof(block) / 8); - - ExpandBlock16(block, dst, dstpitch, TEXA); -} - -void GSLocalMemory::ReadTextureBlock16SZ(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const -{ - __aligned(uint16, 32) block[16 * 8]; - - ReadBlock16(BlockPtr(bp), (uint8*)block, sizeof(block) / 8); - - ExpandBlock16(block, dst, dstpitch, TEXA); -} - /////////////////// void GSLocalMemory::ReadTexture(const GSOffset* RESTRICT o, const GSVector4i& r, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) diff --git a/plugins/GSdx/GSLocalMemory.h b/plugins/GSdx/GSLocalMemory.h index e76bde3f00..8cda1b73b2 100644 --- a/plugins/GSdx/GSLocalMemory.h +++ b/plugins/GSdx/GSLocalMemory.h @@ -56,6 +56,16 @@ public: uint32* GetPages(const GSVector4i& rect, uint32* pages = NULL, GSVector4i* bbox = NULL); }; +struct GSPixelOffset +{ + // 16 bit offsets (m_vm16[...]) + + GSVector2i row[2048]; // f yn | z yn + GSVector2i col[2048]; // f xn | z xn + uint32 hash; + uint32 fbp, zbp, fpsm, zpsm, bw; +}; + struct GSPixelOffset4 { // 16 bit offsets (m_vm16[...]) @@ -158,6 +168,7 @@ protected: // hash_map m_omap; + hash_map m_pomap; hash_map m_po4map; hash_map*> m_p2tmap; @@ -166,6 +177,7 @@ public: virtual ~GSLocalMemory(); GSOffset* GetOffset(uint32 bp, uint32 bw, uint32 psm); + GSPixelOffset* GetPixelOffset(const GIFRegFRAME& FRAME, const GIFRegZBUF& ZBUF); GSPixelOffset4* GetPixelOffset4(const GIFRegFRAME& FRAME, const GIFRegZBUF& ZBUF); vector* GetPage2TileMap(const GIFRegTEX0& TEX0); @@ -863,32 +875,22 @@ public: void ReadTexture32(const GSOffset* RESTRICT o, const GSVector4i& r, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA); void ReadTexture24(const GSOffset* RESTRICT o, const GSVector4i& r, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA); void ReadTexture16(const GSOffset* RESTRICT o, const GSVector4i& r, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA); - void ReadTexture16S(const GSOffset* RESTRICT o, const GSVector4i& r, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA); void ReadTexture8(const GSOffset* RESTRICT o, const GSVector4i& r, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA); void ReadTexture4(const GSOffset* RESTRICT o, const GSVector4i& r, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA); void ReadTexture8H(const GSOffset* RESTRICT o, const GSVector4i& r, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA); void ReadTexture4HL(const GSOffset* RESTRICT o, const GSVector4i& r, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA); void ReadTexture4HH(const GSOffset* RESTRICT o, const GSVector4i& r, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA); - void ReadTexture32Z(const GSOffset* RESTRICT o, const GSVector4i& r, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA); - void ReadTexture24Z(const GSOffset* RESTRICT o, const GSVector4i& r, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA); - void ReadTexture16Z(const GSOffset* RESTRICT o, const GSVector4i& r, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA); - void ReadTexture16SZ(const GSOffset* RESTRICT o, const GSVector4i& r, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA); void ReadTexture(const GSOffset* RESTRICT o, const GSVector4i& r, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA); void ReadTextureBlock32(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const; void ReadTextureBlock24(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const; void ReadTextureBlock16(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const; - void ReadTextureBlock16S(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const; void ReadTextureBlock8(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const; void ReadTextureBlock4(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const; void ReadTextureBlock8H(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const; void ReadTextureBlock4HL(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const; void ReadTextureBlock4HH(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const; - void ReadTextureBlock32Z(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const; - void ReadTextureBlock24Z(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const; - void ReadTextureBlock16Z(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const; - void ReadTextureBlock16SZ(uint32 bp, uint8* dst, int dstpitch, const GIFRegTEXA& TEXA) const; // pal ? 8 : 32 diff --git a/plugins/GSdx/GSPerfMon.h b/plugins/GSdx/GSPerfMon.h index 907af994bf..f9b023b932 100644 --- a/plugins/GSdx/GSPerfMon.h +++ b/plugins/GSdx/GSPerfMon.h @@ -35,7 +35,7 @@ public: enum counter_t { - Frame, Prim, Draw, Swizzle, Unswizzle, Fillrate, Quad, + Frame, Prim, Draw, Swizzle, Unswizzle, Fillrate, Quad, SyncPoint, CounterLast, }; diff --git a/plugins/GSdx/GSRasterizer.cpp b/plugins/GSdx/GSRasterizer.cpp index 1872b6844a..d1644195d8 100644 --- a/plugins/GSdx/GSRasterizer.cpp +++ b/plugins/GSdx/GSRasterizer.cpp @@ -30,6 +30,8 @@ #define THREAD_HEIGHT 4 +int GSRasterizerData::s_counter = 0; + GSRasterizer::GSRasterizer(IDrawScanline* ds, int id, int threads, GSPerfMon* perfmon) : m_ds(ds) , m_id(id) @@ -40,7 +42,7 @@ GSRasterizer::GSRasterizer(IDrawScanline* ds, int id, int threads, GSPerfMon* pe m_edge.buff = (GSVertexSW*)vmalloc(sizeof(GSVertexSW) * 2048, false); m_edge.count = 0; - m_myscanline = (uint8*)_aligned_malloc((2048 >> THREAD_HEIGHT) + 16, 64); + m_scanline = (uint8*)_aligned_malloc((2048 >> THREAD_HEIGHT) + 16, 64); int row = 0; @@ -48,14 +50,14 @@ GSRasterizer::GSRasterizer(IDrawScanline* ds, int id, int threads, GSPerfMon* pe { for(int i = 0; i < threads; i++, row++) { - m_myscanline[row] = i == id ? 1 : 0; + m_scanline[row] = i == id ? 1 : 0; } } } GSRasterizer::~GSRasterizer() { - _aligned_free(m_myscanline); + _aligned_free(m_scanline); if(m_edge.buff != NULL) vmfree(m_edge.buff, sizeof(GSVertexSW) * 2048); @@ -66,7 +68,7 @@ bool GSRasterizer::IsOneOfMyScanlines(int top) const { ASSERT(top >= 0 && top < 2048); - return m_myscanline[top >> THREAD_HEIGHT] != 0; + return m_scanline[top >> THREAD_HEIGHT] != 0; } bool GSRasterizer::IsOneOfMyScanlines(int top, int bottom) const @@ -78,7 +80,7 @@ bool GSRasterizer::IsOneOfMyScanlines(int top, int bottom) const while(top < bottom) { - if(m_myscanline[top++]) + if(m_scanline[top++]) { return true; } @@ -91,9 +93,9 @@ int GSRasterizer::FindMyNextScanline(int top) const { int i = top >> THREAD_HEIGHT; - if(m_myscanline[i] == 0) + if(m_scanline[i] == 0) { - while(m_myscanline[++i] == 0); + while(m_scanline[++i] == 0); top = i << THREAD_HEIGHT; } @@ -124,6 +126,8 @@ void GSRasterizer::Draw(GSRasterizerData* data) if(data->vertex != NULL && data->vertex_count == 0 || data->index != NULL && data->index_count == 0) return; + data->start = __rdtsc(); + m_ds->BeginDraw(data); const GSVertexSW* vertex = data->vertex; @@ -140,8 +144,6 @@ void GSRasterizer::Draw(GSRasterizerData* data) m_fscissor_x = GSVector4(data->scissor).xzxz(); m_fscissor_y = GSVector4(data->scissor).ywyw(); - uint64 start = __rdtsc(); - switch(data->primclass) { case GS_POINT_CLASS: @@ -206,7 +208,9 @@ void GSRasterizer::Draw(GSRasterizerData* data) __assume(0); } - uint64 ticks = __rdtsc() - start; + data->pixels = m_pixels; + + uint64 ticks = __rdtsc() - data->start; m_ds->EndDraw(data->frame, ticks, m_pixels); } @@ -444,28 +448,18 @@ void GSRasterizer::DrawTriangle(const GSVertexSW* vertex, const uint32* index) GSVector4 dxy01c = dxy01 * cross; - GSVector4 _z = dxy01c * dv[1].p.zzzz(dv[0].p); // dx0 * z1, dy0 * z1, dx1 * z0, dy1 * z0 - GSVector4 _f = dxy01c * dv[1].p.wwww(dv[0].p); // dx0 * f1, dy0 * f1, dx1 * f0, dy1 * f0 + /* + dscan = dv[1] * dxy01c.yyyy() - dv[0] * dxy01c.wwww(); + dedge = dv[0] * dxy01c.zzzz() - dv[1] * dxy01c.xxxx(); + */ - GSVector4 _zf = _z.ywyw(_f).hsub(_z.zxzx(_f)); // dy0 * z1 - dy1 * z0, dy0 * f1 - dy1 * f0, dx1 * z0 - dx0 * z1, dx1 * f0 - dx0 * f1 + dscan.p = dv[1].p * dxy01c.yyyy() - dv[0].p * dxy01c.wwww(); + dscan.t = dv[1].t * dxy01c.yyyy() - dv[0].t * dxy01c.wwww(); + dscan.c = dv[1].c * dxy01c.yyyy() - dv[0].c * dxy01c.wwww(); - dscan.p = _zf.zwxy(); // dy0 * z1 - dy1 * z0, dy0 * f1 - dy1 * f0 - dedge.p = _zf; // dx1 * z0 - dx0 * z1, dx1 * f0 - dx0 * f1 - - GSVector4 _s = dxy01c * dv[1].t.xxxx(dv[0].t); // dx0 * s1, dy0 * s1, dx1 * s0, dy1 * s0 - GSVector4 _t = dxy01c * dv[1].t.yyyy(dv[0].t); // dx0 * t1, dy0 * t1, dx1 * t0, dy1 * t0 - GSVector4 _q = dxy01c * dv[1].t.zzzz(dv[0].t); // dx0 * q1, dy0 * q1, dx1 * q0, dy1 * q0 - - dscan.t = _s.ywyw(_t).hsub(_q.ywyw()); // dy0 * s1 - dy1 * s0, dy0 * t1 - dy1 * t0, dy0 * q1 - dy1 * q0 - dedge.t = _s.zxzx(_t).hsub(_q.zxzx()); // dx1 * s0 - dx0 * s1, dx1 * t0 - dx0 * t1, dx1 * q0 - dx0 * q1 - - GSVector4 _r = dxy01c * dv[1].c.xxxx(dv[0].c); // dx0 * r1, dy0 * r1, dx1 * r0, dy1 * r0 - GSVector4 _g = dxy01c * dv[1].c.yyyy(dv[0].c); // dx0 * g1, dy0 * g1, dx1 * g0, dy1 * g0 - GSVector4 _b = dxy01c * dv[1].c.zzzz(dv[0].c); // dx0 * b1, dy0 * b1, dx1 * b0, dy1 * b0 - GSVector4 _a = dxy01c * dv[1].c.wwww(dv[0].c); // dx0 * a1, dy0 * a1, dx1 * a0, dy1 * a0 - - dscan.c = _r.ywyw(_g).hsub(_b.ywyw(_a)); // dy0 * r1 - dy1 * r0, dy0 * g1 - dy1 * g0, dy0 * b1 - dy1 * b0, dy0 * a1 - dy1 * a0 - dedge.c = _r.zxzx(_g).hsub(_b.zxzx(_a)); // dx1 * r0 - dx0 * r1, dx1 * g0 - dx0 * g1, dx1 * b0 - dx0 * b1, dx1 * a0 - dx0 * a1 + dedge.p = dv[0].p * dxy01c.zzzz() - dv[1].p * dxy01c.xxxx(); + dedge.t = dv[0].t * dxy01c.zzzz() - dv[1].t * dxy01c.xxxx(); + dedge.c = dv[0].c * dxy01c.zzzz() - dv[1].c * dxy01c.xxxx(); if(m1 & 1) { @@ -555,7 +549,13 @@ void GSRasterizer::DrawTriangleSection(int top, int bottom, GSVertexSW& edge, co scan.t = edge.t + dedge.t * dy; scan.c = edge.c + dedge.c * dy; - AddScanline(e++, pixels, left, top, scan + dscan * (l - p0).xxxx()); + GSVector4 prestep = (l - p0).xxxx(); + + scan.p += dscan.p * prestep; + scan.t += dscan.t * prestep; + scan.c += dscan.c * prestep; + + AddScanline(e++, pixels, left, top, scan); } top++; @@ -904,11 +904,20 @@ void GSRasterizer::Flush(const GSVertexSW* vertex, const uint32* index, const GS // -GSRasterizerList::GSRasterizerList() - : GSJobQueue >() - , m_sync_count(0) - , m_syncpoint_count(0) +GSRasterizerList::GSRasterizerList(int threads, GSPerfMon* perfmon) + : m_perfmon(perfmon) { + m_scanline = (uint8*)_aligned_malloc((2048 >> THREAD_HEIGHT) + 16, 64); + + int row = 0; + + while(row < (2048 >> THREAD_HEIGHT)) + { + for(int i = 0; i < threads; i++, row++) + { + m_scanline[row] = i; + } + } } GSRasterizerList::~GSRasterizerList() @@ -917,31 +926,49 @@ GSRasterizerList::~GSRasterizerList() { delete *i; } + + _aligned_free(m_scanline); } void GSRasterizerList::Queue(shared_ptr data) { - // disable dispatcher thread for now and pass-through directly, - // would only be relevant if data->syncpoint was utilized more, - // it would hide the syncing latency from the main gs thread + GSVector4i r = data->bbox.rintersect(data->scissor); - // Push(data); + ASSERT(r.top >= 0 && r.top < 2048 && r.bottom >= 0 && r.bottom < 2048); - Process(data); m_count++; + int top = r.top >> THREAD_HEIGHT; + int bottom = std::min((r.bottom + (1 << THREAD_HEIGHT) - 1) >> THREAD_HEIGHT, top + m_workers.size()); + + while(top < bottom) + { + m_workers[m_scanline[top++]]->Push(data); + } } void GSRasterizerList::Sync() { - if(GetCount() == 0) return; + if(!IsSynced()) + { + for(size_t i = 0; i < m_workers.size(); i++) + { + m_workers[i]->Wait(); + } - Wait(); // first dispatch all items to workers + m_perfmon->Put(GSPerfMon::SyncPoint, 1); + } +} +bool GSRasterizerList::IsSynced() const +{ for(size_t i = 0; i < m_workers.size(); i++) { - m_workers[i]->Wait(); // then wait all workers to finish their jobs + if(!m_workers[i]->IsEmpty()) + { + return false; + } } - m_sync_count++; + return true; } int GSRasterizerList::GetPixels(bool reset) @@ -956,24 +983,6 @@ int GSRasterizerList::GetPixels(bool reset) return pixels; } -void GSRasterizerList::Process(shared_ptr& item) -{ - if(item->syncpoint) - { - for(size_t i = 0; i < m_workers.size(); i++) - { - m_workers[i]->Wait(); - } - - m_syncpoint_count++; - } - - for(size_t i = 0; i < m_workers.size(); i++) - { - m_workers[i]->Push(item); - } -} - // GSRasterizerList::GSWorker GSRasterizerList::GSWorker::GSWorker(GSRasterizer* r) @@ -994,16 +1003,6 @@ int GSRasterizerList::GSWorker::GetPixels(bool reset) return m_r->GetPixels(reset); } -void GSRasterizerList::GSWorker::Push(const shared_ptr& item) -{ - GSVector4i r = item->bbox.rintersect(item->scissor); - - if(m_r->IsOneOfMyScanlines(r.top, r.bottom)) - { - GSJobQueue >::Push(item); - } -} - void GSRasterizerList::GSWorker::Process(shared_ptr& item) { m_r->Draw(item.get()); diff --git a/plugins/GSdx/GSRasterizer.h b/plugins/GSdx/GSRasterizer.h index 71b2dd4ad1..45ab368d1c 100644 --- a/plugins/GSdx/GSRasterizer.h +++ b/plugins/GSdx/GSRasterizer.h @@ -30,6 +30,8 @@ __aligned(class, 32) GSRasterizerData : public GSAlignedClass<32> { + static int s_counter; + public: GSVector4i scissor; GSVector4i bbox; @@ -39,8 +41,10 @@ public: int vertex_count; uint32* index; int index_count; - bool syncpoint; uint64 frame; + uint64 start; + int pixels; + int counter; GSRasterizerData() : scissor(GSVector4i::zero()) @@ -51,9 +55,11 @@ public: , vertex_count(0) , index(NULL) , index_count(0) - , syncpoint(false) , frame(0) + , start(0) + , pixels(0) { + counter = s_counter++; } virtual ~GSRasterizerData() @@ -109,6 +115,7 @@ public: virtual void Queue(shared_ptr data) = 0; virtual void Sync() = 0; + virtual bool IsSynced() const = 0; virtual int GetPixels(bool reset = true) = 0; }; @@ -119,7 +126,7 @@ protected: IDrawScanline* m_ds; int m_id; int m_threads; - uint8* m_myscanline; + uint8* m_scanline; GSVector4i m_scissor; GSVector4 m_fscissor_x; GSVector4 m_fscissor_y; @@ -155,12 +162,12 @@ public: void Queue(shared_ptr data); void Sync() {} + bool IsSynced() const {return true;} int GetPixels(bool reset); }; class GSRasterizerList : public IRasterizer - , private GSJobQueue > { protected: class GSWorker : public GSJobQueue > @@ -175,17 +182,14 @@ protected: // GSJobQueue - void Push(const shared_ptr& item); void Process(shared_ptr& item); }; + GSPerfMon* m_perfmon; vector m_workers; + uint8* m_scanline; - GSRasterizerList(); - - // GSJobQueue - - void Process(shared_ptr& item); + GSRasterizerList(int threads, GSPerfMon* perfmon); public: virtual ~GSRasterizerList(); @@ -200,7 +204,7 @@ public: } else { - GSRasterizerList* rl = new GSRasterizerList(); + GSRasterizerList* rl = new GSRasterizerList(threads, perfmon); for(int i = 0; i < threads; i++) { @@ -211,12 +215,10 @@ public: } } - int m_sync_count; - int m_syncpoint_count; - // IRasterizer void Queue(shared_ptr data); void Sync(); + bool IsSynced() const; int GetPixels(bool reset); }; diff --git a/plugins/GSdx/GSRenderer.cpp b/plugins/GSdx/GSRenderer.cpp index e196fb226f..ca8fee320f 100644 --- a/plugins/GSdx/GSRenderer.cpp +++ b/plugins/GSdx/GSRenderer.cpp @@ -22,9 +22,8 @@ #include "stdafx.h" #include "GSRenderer.h" -GSRenderer::GSRenderer(GSVertexTrace* vt, size_t vertex_stride) - : GSState(vt, vertex_stride) - , m_dev(NULL) +GSRenderer::GSRenderer() + : m_dev(NULL) , m_shader(0) , m_shift_key(false) , m_control_key(false) @@ -38,12 +37,6 @@ GSRenderer::GSRenderer(GSVertexTrace* vt, size_t vertex_stride) m_aa1 = !!theApp.GetConfig("aa1", 0); m_mipmap = !!theApp.GetConfig("mipmap", 1); m_fxaa = !!theApp.GetConfig("fxaa", 0); - - s_n = 0; - s_dump = !!theApp.GetConfig("dump", 0); - s_save = !!theApp.GetConfig("save", 0); - s_savez = !!theApp.GetConfig("savez", 0); - s_saven = theApp.GetConfig("saven", 0); } GSRenderer::~GSRenderer() @@ -259,7 +252,7 @@ bool GSRenderer::Merge(int field) { int field2 = 1 - ((m_interlace - 1) & 1); int mode = (m_interlace - 1) >> 1; - + m_dev->Interlace(ds, field ^ field2, mode, tex[1] ? tex[1]->GetScale().y : tex[0]->GetScale().y); } @@ -306,6 +299,8 @@ void GSRenderer::VSync(int field) ResetDevice(); } + m_dev->AgePool(); + // osd if((m_perfmon.GetFrame() & 0x1f) == 0) @@ -334,7 +329,7 @@ void GSRenderer::VSync(int field) s2.c_str(), theApp.m_gs_interlace[m_interlace].name.c_str(), theApp.m_gs_aspectratio[m_aspectratio].name.c_str(), - (int)m_perfmon.Get(GSPerfMon::Quad), + (int)m_perfmon.Get(GSPerfMon::SyncPoint), (int)m_perfmon.Get(GSPerfMon::Prim), (int)m_perfmon.Get(GSPerfMon::Draw), m_perfmon.CPU(), diff --git a/plugins/GSdx/GSRenderer.h b/plugins/GSdx/GSRenderer.h index 2d66348795..ce6b8417d2 100644 --- a/plugins/GSdx/GSRenderer.h +++ b/plugins/GSdx/GSRenderer.h @@ -55,14 +55,8 @@ public: GSWnd m_wnd; GSDevice* m_dev; - int s_n; - bool s_dump; - bool s_save; - bool s_savez; - int s_saven; - public: - GSRenderer(GSVertexTrace* vt, size_t vertex_stride); + GSRenderer(); virtual ~GSRenderer(); virtual bool CreateWnd(const string& title, int w, int h); diff --git a/plugins/GSdx/GSRendererCS.cpp b/plugins/GSdx/GSRendererCS.cpp index a244081feb..d9945a9070 100644 --- a/plugins/GSdx/GSRendererCS.cpp +++ b/plugins/GSdx/GSRendererCS.cpp @@ -22,18 +22,28 @@ #include "stdafx.h" #include "GSRendererCS.h" +#define PS_BATCH_SIZE 512 + GSRendererCS::GSRendererCS() - : GSRenderer(new GSVertexTraceCS(this), sizeof(GSVertex)) + : GSRenderer() { m_nativeres = true; - InitConvertVertex(GSRendererCS); - memset(m_vm_valid, 0, sizeof(m_vm_valid)); + + memset(m_texture, 0, sizeof(m_texture)); + + m_output = (uint8*)_aligned_malloc(1024 * 1024 * sizeof(uint32), 32); } GSRendererCS::~GSRendererCS() { + for(int i = 0; i < countof(m_texture); i++) + { + delete m_texture[i]; + } + + _aligned_free(m_output); } bool GSRendererCS::CreateDevice(GSDevice* dev_unk) @@ -41,27 +51,157 @@ bool GSRendererCS::CreateDevice(GSDevice* dev_unk) if(!__super::CreateDevice(dev_unk)) return false; + HRESULT hr; + + D3D11_DEPTH_STENCIL_DESC dsd; + D3D11_BLEND_DESC bsd; + D3D11_SAMPLER_DESC sd; + D3D11_BUFFER_DESC bd; + D3D11_TEXTURE2D_DESC td; + D3D11_UNORDERED_ACCESS_VIEW_DESC uavd; + D3D11_SHADER_RESOURCE_VIEW_DESC srvd; + D3D_FEATURE_LEVEL level; ((GSDeviceDX*)dev_unk)->GetFeatureLevel(level); - if(level < D3D_FEATURE_LEVEL_10_0) + if(level < D3D_FEATURE_LEVEL_11_0) return false; - HRESULT hr; - GSDevice11* dev = (GSDevice11*)dev_unk; - D3D11_BUFFER_DESC bd; - D3D11_UNORDERED_ACCESS_VIEW_DESC uavd; - D3D11_SHADER_RESOURCE_VIEW_DESC srvd; + ID3D11DeviceContext* ctx = *dev; + + // empty depth stencil state + + memset(&dsd, 0, sizeof(dsd)); + + dsd.StencilEnable = false; + dsd.DepthEnable = false; + + hr = (*dev)->CreateDepthStencilState(&dsd, &m_dss); + + if(FAILED(hr)) return false; + + // empty blend state + + memset(&bsd, 0, sizeof(bsd)); + + bsd.RenderTarget[0].BlendEnable = false; + + hr = (*dev)->CreateBlendState(&bsd, &m_bs); + + if(FAILED(hr)) return false; + + // point sampler + + memset(&sd, 0, sizeof(sd)); + + sd.Filter = D3D11_FILTER_MIN_MAG_MIP_POINT; + + sd.AddressU = D3D11_TEXTURE_ADDRESS_CLAMP; + sd.AddressV = D3D11_TEXTURE_ADDRESS_CLAMP; + sd.AddressW = D3D11_TEXTURE_ADDRESS_CLAMP; + + sd.MaxLOD = FLT_MAX; + sd.MaxAnisotropy = 16; + sd.ComparisonFunc = D3D11_COMPARISON_NEVER; + + hr = (*dev)->CreateSamplerState(&sd, &m_ss); + + if(FAILED(hr)) return false; + + // link buffer + + memset(&bd, 0, sizeof(bd)); + + bd.ByteWidth = 256 << 20; // 256 MB w00t + bd.StructureByteStride = sizeof(uint32) * 4; // c, z, id, next + bd.Usage = D3D11_USAGE_DEFAULT; + bd.BindFlags = D3D11_BIND_UNORDERED_ACCESS | D3D11_BIND_SHADER_RESOURCE; + bd.MiscFlags = D3D11_RESOURCE_MISC_BUFFER_STRUCTURED; + + hr = (*dev)->CreateBuffer(&bd, NULL, &m_lb); + + { + uint32 data[] = {0, 0, 0xffffffff, 0}; + + D3D11_BOX box; + memset(&box, 0, sizeof(box)); + box.right = sizeof(data); + box.bottom = 1; + box.back = 1; + + ctx->UpdateSubresource(m_lb, 0, &box, data, 0, 0); + } + + if(FAILED(hr)) return false; + + memset(&uavd, 0, sizeof(uavd)); + + uavd.Format = DXGI_FORMAT_UNKNOWN; + uavd.Buffer.NumElements = bd.ByteWidth / bd.StructureByteStride; + uavd.Buffer.Flags = D3D11_BUFFER_UAV_FLAG_COUNTER; + uavd.ViewDimension = D3D11_UAV_DIMENSION_BUFFER; + + hr = (*dev)->CreateUnorderedAccessView(m_lb, &uavd, &m_lb_uav); + + if(FAILED(hr)) return false; + + memset(&srvd, 0, sizeof(srvd)); + + srvd.Format = DXGI_FORMAT_UNKNOWN; + srvd.Buffer.NumElements = bd.ByteWidth / bd.StructureByteStride; + srvd.ViewDimension = D3D11_SRV_DIMENSION_BUFFER; + + hr = (*dev)->CreateShaderResourceView(m_lb, &srvd, &m_lb_srv); + + if(FAILED(hr)) return false; + + // start offset buffer + + memset(&bd, 0, sizeof(bd)); + + bd.ByteWidth = sizeof(uint32) * 2048 * 2048; // index + bd.Usage = D3D11_USAGE_DEFAULT; + bd.BindFlags = D3D11_BIND_UNORDERED_ACCESS | D3D11_BIND_SHADER_RESOURCE; + bd.MiscFlags = D3D11_RESOURCE_MISC_BUFFER_ALLOW_RAW_VIEWS; + + hr = (*dev)->CreateBuffer(&bd, NULL, &m_sob); + + if(FAILED(hr)) return false; + + memset(&uavd, 0, sizeof(uavd)); + + uavd.Format = DXGI_FORMAT_R32_TYPELESS; + uavd.Buffer.NumElements = bd.ByteWidth / sizeof(uint32); + uavd.Buffer.Flags = D3D11_BUFFER_UAV_FLAG_RAW; + uavd.ViewDimension = D3D11_UAV_DIMENSION_BUFFER; + + hr = (*dev)->CreateUnorderedAccessView(m_sob, &uavd, &m_sob_uav); + + if(FAILED(hr)) return false; + + memset(&srvd, 0, sizeof(srvd)); + + srvd.Format = DXGI_FORMAT_R32_TYPELESS; + srvd.BufferEx.NumElements = bd.ByteWidth / sizeof(uint32); + srvd.BufferEx.Flags = D3D11_BUFFEREX_SRV_FLAG_RAW; + srvd.ViewDimension = D3D11_SRV_DIMENSION_BUFFEREX; + + hr = (*dev)->CreateShaderResourceView(m_sob, &srvd, &m_sob_srv); + + if(FAILED(hr)) return false; + + const uint32 tmp = 0; + + ctx->ClearUnorderedAccessViewUint(m_sob_uav, &tmp); // initial clear, next time Draw should restore it in Step 2 // video memory (4MB) memset(&bd, 0, sizeof(bd)); bd.ByteWidth = 4 * 1024 * 1024; - bd.StructureByteStride = 4; bd.Usage = D3D11_USAGE_DEFAULT; bd.BindFlags = D3D11_BIND_UNORDERED_ACCESS; bd.MiscFlags = D3D11_RESOURCE_MISC_BUFFER_ALLOW_RAW_VIEWS; @@ -81,35 +221,32 @@ bool GSRendererCS::CreateDevice(GSDevice* dev_unk) hr = (*dev)->CreateUnorderedAccessView(m_vm, &uavd, &m_vm_uav); if(FAILED(hr)) return false; +/* + memset(&td, 0, sizeof(td)); - // vertex buffer + td.Width = PAGE_SIZE; + td.Height = MAX_PAGES; + td.Format = DXGI_FORMAT_R8_UINT; + td.MipLevels = 1; + td.ArraySize = 1; + td.SampleDesc.Count = 1; + td.SampleDesc.Quality = 0; + td.Usage = D3D11_USAGE_DEFAULT; + td.BindFlags = D3D11_BIND_UNORDERED_ACCESS; - memset(&bd, 0, sizeof(bd)); - - bd.ByteWidth = sizeof(GSVertex) * 10000; - bd.StructureByteStride = sizeof(GSVertex); - bd.Usage = D3D11_USAGE_DYNAMIC; - bd.CPUAccessFlags = D3D11_CPU_ACCESS_WRITE; - bd.BindFlags = D3D11_BIND_SHADER_RESOURCE; - bd.MiscFlags = D3D11_RESOURCE_MISC_BUFFER_STRUCTURED; - - hr = (*dev)->CreateBuffer(&bd, NULL, &m_vb); + hr = (*dev)->CreateTexture2D(&td, NULL, &m_vm); if(FAILED(hr)) return false; - // index buffer + memset(&uavd, 0, sizeof(uavd)); - memset(&bd, 0, sizeof(bd)); + uavd.Format = DXGI_FORMAT_R8_UINT; + uavd.ViewDimension = D3D11_UAV_DIMENSION_TEXTURE2D; - bd.ByteWidth = sizeof(uint32) * 10000 * 3; - bd.Usage = D3D11_USAGE_DYNAMIC; - bd.CPUAccessFlags = D3D11_CPU_ACCESS_WRITE; - bd.BindFlags = D3D11_BIND_SHADER_RESOURCE; - - hr = (*dev)->CreateBuffer(&bd, NULL, &m_ib); + hr = (*dev)->CreateUnorderedAccessView(m_vm, &uavd, &m_vm_uav); if(FAILED(hr)) return false; - +*/ // one page, for copying between cpu<->gpu memset(&bd, 0, sizeof(bd)); @@ -121,219 +258,429 @@ bool GSRendererCS::CreateDevice(GSDevice* dev_unk) hr = (*dev)->CreateBuffer(&bd, NULL, &m_pb); if(FAILED(hr)) return false; +/* + memset(&td, 0, sizeof(td)); + + td.Width = PAGE_SIZE; + td.Height = 1; + td.Format = DXGI_FORMAT_R8_UINT; + td.MipLevels = 1; + td.ArraySize = 1; + td.SampleDesc.Count = 1; + td.SampleDesc.Quality = 0; + td.Usage = D3D11_USAGE_STAGING; + td.CPUAccessFlags = D3D11_CPU_ACCESS_READ | D3D11_CPU_ACCESS_WRITE; + + hr = (*dev)->CreateTexture2D(&td, NULL, &m_pb); + + if(FAILED(hr)) return false; +*/ + // VSConstantBuffer + + memset(&bd, 0, sizeof(bd)); + + bd.ByteWidth = sizeof(VSConstantBuffer); + bd.Usage = D3D11_USAGE_DEFAULT; + bd.BindFlags = D3D11_BIND_CONSTANT_BUFFER; + + hr = (*dev)->CreateBuffer(&bd, NULL, &m_vs_cb); + + if(FAILED(hr)) return false; + + // PS + + D3D11_SHADER_MACRO macro[] = + { + {NULL, NULL}, + }; + + hr = dev->CompileShader(IDR_CS_FX, "ps_main0", macro, &m_ps0); + + if(FAILED(hr)) return false; + + // PSConstantBuffer + + memset(&bd, 0, sizeof(bd)); + + bd.ByteWidth = sizeof(PSConstantBuffer); + bd.Usage = D3D11_USAGE_DEFAULT; + bd.BindFlags = D3D11_BIND_CONSTANT_BUFFER; + + hr = (*dev)->CreateBuffer(&bd, NULL, &m_ps_cb); + + if(FAILED(hr)) return false; + + // return true; } +void GSRendererCS::ResetDevice() +{ + for(int i = 0; i < countof(m_texture); i++) + { + delete m_texture[i]; + + m_texture[i] = NULL; + } +} + +void GSRendererCS::VSync(int field) +{ + __super::VSync(field); + + //printf("%lld\n", m_perfmon.GetFrame()); +} + GSTexture* GSRendererCS::GetOutput(int i) { // TODO: create a compute shader which unswizzles the frame from m_vm to the output texture - return NULL; -} + const GSRegDISPFB& DISPFB = m_regs->DISP[i].DISPFB; -template -void GSRendererCS::ConvertVertex(size_t dst_index, size_t src_index) -{ - // TODO: vertex format more fitting as the input for the compute shader + int w = DISPFB.FBW * 64; + int h = GetFrameRect(i).bottom; - if(src_index != dst_index) + // TODO: round up bottom + + if(m_dev->ResizeTexture(&m_texture[i], w, h)) { - GSVertex v = ((GSVertex*)m_vertex.buff)[src_index]; + const GSLocalMemory::psm_t& psm = GSLocalMemory::m_psm[DISPFB.PSM]; - ((GSVertex*)m_vertex.buff)[dst_index] = v; + GSVector4i r(0, 0, w, h); + GSVector4i r2 = r.ralign(psm.bs); + + GSOffset* o = m_mem.GetOffset(DISPFB.Block(), DISPFB.FBW, DISPFB.PSM); + + Read(o, r2, false); + + (m_mem.*psm.rtx)(o, r2, m_output, 1024 * 4, m_env.TEXA); + + m_texture[i]->Update(r, m_output, 1024 * 4); + + if(s_dump) + { + if(s_save && s_n >= s_saven) + { + m_texture[i]->Save(format("c:\\temp1\\_%05d_f%lld_fr%d_%05x_%d.bmp", s_n, m_perfmon.GetFrame(), i, (int)DISPFB.Block(), (int)DISPFB.PSM)); + } + + s_n++; + } } + + return m_texture[i]; } void GSRendererCS::Draw() { - HRESULT hr; + GSDrawingEnvironment& env = m_env; + GSDrawingContext* context = m_context; + + GSVector2i rtsize(2048, 2048); + GSVector4i scissor = GSVector4i(context->scissor.in).rintersect(GSVector4i(rtsize).zwxy()); + GSVector4i bbox = GSVector4i(m_vt.m_min.p.floor().xyxy(m_vt.m_max.p.ceil())); + GSVector4i r = bbox.rintersect(scissor); + + uint32 fm = context->FRAME.FBMSK; + uint32 zm = context->ZBUF.ZMSK || context->TEST.ZTE == 0 ? 0xffffffff : 0; + + if(fm != 0xffffffff) + { + Write(context->offset.fb, r); + + // TODO: m_tc->InvalidateVideoMem(context->offset.fb, r, false); + } + + if(zm != 0xffffffff) + { + Write(context->offset.zb, r); + + // TODO: m_tc->InvalidateVideoMem(context->offset.zb, r, false); + } + + // TODO: if(24-bit) fm/zm |= 0xff000000; + + if(PRIM->TME) + { + m_mem.m_clut.Read32(context->TEX0, env.TEXA); + + GSVector4i r; + + GetTextureMinMax(r, context->TEX0, context->CLAMP, m_vt.IsLinear()); + + // TODO: unswizzle pages of r to a texture, check m_vm_valid, bit not set cpu->gpu, set gpu->gpu + + // TODO: Write transfer should directly write to m_vm, then Read/Write syncing won't be necessary, clut must be updated with the gpu also + + // TODO: tex = m_tc->LookupSource(context->TEX0, env.TEXA, r); + + // if(!tex) return; + } + + // GSDevice11* dev = (GSDevice11*)m_dev; - + ID3D11DeviceContext* ctx = *dev; - D3D11_BUFFER_DESC bd; - D3D11_UNORDERED_ACCESS_VIEW_DESC uavd; - D3D11_SHADER_RESOURCE_VIEW_DESC srvd; - D3D11_MAPPED_SUBRESOURCE map; - - CComPtr vb_srv; - CComPtr ib_srv; - - // TODO: cache these in hash_maps - - CComPtr fbr, fbc, zbr, zbc; - CComPtr fbr_srv, fbc_srv, zbr_srv, zbc_srv; - - // TODO: grow m_vb, m_ib if needed - - if(m_vertex.next > 10000) return; - if(m_index.tail > 30000) return; - - // TODO: fill/advance/discardwhenfull, as in GSDevice11::IASetVertexBuffer/IASetIndexBuffer - - hr = ctx->Map(m_vb, 0, D3D11_MAP_WRITE_DISCARD, 0, &map); // discarding, until properly advancing the start pointer around - - if(FAILED(hr)) return; - - memcpy(map.pData, m_vertex.buff, sizeof(GSVertex) * m_vertex.next); - - ctx->Unmap(m_vb, 0); - // - hr = ctx->Map(m_ib, 0, D3D11_MAP_WRITE_DISCARD, 0, &map); // discarding, until properly advancing the start pointer around + dev->BeginScene(); - if(FAILED(hr)) return; + // SetupOM - memcpy(map.pData, m_index.buff, sizeof(uint32) * m_index.tail); - - ctx->Unmap(m_ib, 0); - - // TODO: UpdateResource might be faster, based on my exprience with the real vertex buffer, write-no-overwrite/discarded dynamic buffer + map is better - - // - - memset(&srvd, 0, sizeof(srvd)); - - srvd.Format = DXGI_FORMAT_UNKNOWN; - srvd.ViewDimension = D3D11_SRV_DIMENSION_BUFFER; - srvd.Buffer.FirstElement = 0; - srvd.Buffer.NumElements = m_vertex.next; - - hr = (*dev)->CreateShaderResourceView(m_vb, &srvd, &vb_srv); // TODO: have to create this dyncamically in Draw() or pass the start/count in a const reg - - memset(&srvd, 0, sizeof(srvd)); - - srvd.Format = DXGI_FORMAT_R32_UINT; - srvd.ViewDimension = D3D11_SRV_DIMENSION_BUFFER; - srvd.Buffer.FirstElement = 0; - srvd.Buffer.NumElements = m_index.tail; - - hr = (*dev)->CreateShaderResourceView(m_ib, &srvd, &ib_srv); // TODO: have to create this dyncamically in Draw() or pass the start/count in a const reg - - // fzb offsets - - memset(&bd, 0, sizeof(bd)); - - bd.ByteWidth = sizeof(int) * 4096; - bd.StructureByteStride = sizeof(int); - bd.Usage = D3D11_USAGE_IMMUTABLE; - bd.BindFlags = D3D11_BIND_SHADER_RESOURCE; - - D3D11_SUBRESOURCE_DATA data; - - memset(&data, 0, sizeof(data)); - - data.pSysMem = m_context->offset.fb->pixel.row; - - hr = (*dev)->CreateBuffer(&bd, &data, &fbr); - - data.pSysMem = m_context->offset.fb->pixel.col[0]; // same column layout for every line in case of frame and zbuffer formats + dev->OMSetDepthStencilState(m_dss, 0); + dev->OMSetBlendState(m_bs, 0); - hr = (*dev)->CreateBuffer(&bd, &data, &fbc); + ID3D11UnorderedAccessView* uavs[] = {m_vm_uav, m_lb_uav, m_sob_uav}; + uint32 counters[] = {1, 0, 0}; - data.pSysMem = m_context->offset.zb->pixel.row; - - hr = (*dev)->CreateBuffer(&bd, &data, &zbr); + dev->OMSetRenderTargets(rtsize, countof(uavs), uavs, counters, &scissor); - data.pSysMem = m_context->offset.zb->pixel.col[0]; // same column layout for every line in case of frame and zbuffer formats - - hr = (*dev)->CreateBuffer(&bd, &data, &zbc); + // SetupIA - // TODO: D3D10_SHADER_MACRO (primclass, less frequently changing drawing attribs, etc.) + D3D11_PRIMITIVE_TOPOLOGY topology; - uint32 sel = 0; // TODO - - hash_map >::iterator i = m_cs.find(sel); - - CComPtr cs; - - if(i == m_cs.end()) + switch(m_vt.m_primclass) { - // hr = dev->CompileShader(IDR_CS_FX, "cs_main", NULL, &cs); - hr = dev->CompileShader("E:\\Progs\\pcsx2\\plugins\\GSdx\\res\\cs.fx", "cs_main", NULL, &cs); + case GS_POINT_CLASS: + topology = D3D11_PRIMITIVE_TOPOLOGY_POINTLIST; + break; + case GS_LINE_CLASS: + case GS_SPRITE_CLASS: + topology = D3D11_PRIMITIVE_TOPOLOGY_LINELIST; + break; + case GS_TRIANGLE_CLASS: + topology = D3D11_PRIMITIVE_TOPOLOGY_TRIANGLELIST; + break; + default: + __assume(0); + } - if(FAILED(hr)) return; + GSVector4i r2 = bbox.add32(GSVector4i(-1, -1, 1, 1)).rintersect(scissor); - m_cs[sel] = cs; + m_vertex.buff[m_vertex.next + 0].XYZ.X = context->XYOFFSET.OFX + (r2.left << 4); + m_vertex.buff[m_vertex.next + 0].XYZ.Y = context->XYOFFSET.OFY + (r2.top << 4); + m_vertex.buff[m_vertex.next + 1].XYZ.X = context->XYOFFSET.OFX + (r2.right << 4); + m_vertex.buff[m_vertex.next + 1].XYZ.Y = context->XYOFFSET.OFY + (r2.bottom << 4); + + m_index.buff[m_index.tail + 0] = m_vertex.next + 0; + m_index.buff[m_index.tail + 1] = m_vertex.next + 1; + + dev->IASetVertexBuffer(m_vertex.buff, sizeof(GSVertex), m_vertex.next + 2); + dev->IASetIndexBuffer(m_index.buff, m_index.tail + 2); + + // SetupVS + + VSSelector vs_sel; + + vs_sel.tme = PRIM->TME; + vs_sel.fst = PRIM->FST; + + VSConstantBuffer vs_cb; + + float sx = 2.0f / (rtsize.x << 4); + float sy = 2.0f / (rtsize.y << 4); + //float sx = 1.0f / 16; + //float sy = 1.0f / 16; + float ox = (float)(int)context->XYOFFSET.OFX; + float oy = (float)(int)context->XYOFFSET.OFY; + + vs_cb.VertexScale = GSVector4(sx, -sy, 0.0f, 0.0f); + vs_cb.VertexOffset = GSVector4(ox * sx + 1, -(oy * sy + 1), 0.0f, -1.0f); + //vs_cb.VertexScale = GSVector4(sx, sy, 0.0f, 0.0f); + //vs_cb.VertexOffset = GSVector4(ox * sx, oy * sy, 0.0f, -1.0f); + + { + GSVertexShader11 vs; + + hash_map::const_iterator i = m_vs.find(vs_sel); + + if(i != m_vs.end()) + { + vs = i->second; + } + else + { + string str[2]; + + str[0] = format("%d", vs_sel.tme); + str[1] = format("%d", vs_sel.fst); + + D3D11_SHADER_MACRO macro[] = + { + {"VS_TME", str[0].c_str()}, + {"VS_FST", str[1].c_str()}, + {NULL, NULL}, + }; + + D3D11_INPUT_ELEMENT_DESC layout[] = + { + {"TEXCOORD", 0, DXGI_FORMAT_R32G32_FLOAT, 0, 0, D3D11_INPUT_PER_VERTEX_DATA, 0}, + {"COLOR", 0, DXGI_FORMAT_R8G8B8A8_UNORM, 0, 8, D3D11_INPUT_PER_VERTEX_DATA, 0}, + {"TEXCOORD", 1, DXGI_FORMAT_R32_FLOAT, 0, 12, D3D11_INPUT_PER_VERTEX_DATA, 0}, + {"POSITION", 0, DXGI_FORMAT_R16G16_UINT, 0, 16, D3D11_INPUT_PER_VERTEX_DATA, 0}, + {"POSITION", 1, DXGI_FORMAT_R32_UINT, 0, 20, D3D11_INPUT_PER_VERTEX_DATA, 0}, + {"TEXCOORD", 2, DXGI_FORMAT_R16G16_UINT, 0, 24, D3D11_INPUT_PER_VERTEX_DATA, 0}, + {"COLOR", 1, DXGI_FORMAT_R8G8B8A8_UNORM, 0, 28, D3D11_INPUT_PER_VERTEX_DATA, 0}, + }; + + dev->CompileShader(IDR_CS_FX, "vs_main", macro, &vs.vs, layout, countof(layout), &vs.il); + + m_vs[vs_sel] = vs; + } + + ctx->UpdateSubresource(m_vs_cb, 0, NULL, &vs_cb, 0, 0); // TODO: only update if changed + + dev->VSSetShader(vs.vs, m_vs_cb); + + dev->IASetInputLayout(vs.il); + } + + // SetupGS + + GSSelector gs_sel; + + gs_sel.iip = PRIM->IIP; + + CComPtr gs[2]; + + for(int j = 0; j < 2; j++) + { + gs_sel.prim = j == 0 ? m_vt.m_primclass : GS_SPRITE_CLASS; + + hash_map >::const_iterator i = m_gs.find(gs_sel); + + if(i != m_gs.end()) + { + gs[j] = i->second; + } + else + { + string str[2]; + + str[0] = format("%d", gs_sel.iip); + str[1] = format("%d", j == 0 ? gs_sel.prim : GS_SPRITE_CLASS); + + D3D11_SHADER_MACRO macro[] = + { + {"GS_IIP", str[0].c_str()}, + {"GS_PRIM", str[1].c_str()}, + {NULL, NULL}, + }; + + dev->CompileShader(IDR_CS_FX, "gs_main", macro, &gs[j]); + + m_gs[gs_sel] = gs[j]; + } + } + + // SetupPS + + dev->PSSetSamplerState(m_ss, NULL, NULL); + + PSSelector ps_sel; + + ps_sel.fpsm = context->FRAME.PSM; + ps_sel.zpsm = context->ZBUF.PSM; + + CComPtr ps[2] = {m_ps0, NULL}; + + hash_map >::const_iterator i = m_ps1.find(ps_sel); + + if(i != m_ps1.end()) + { + ps[1] = i->second; } else { - cs = i->second; + string str[15]; + + str[0] = format("%d", PS_BATCH_SIZE); + str[1] = format("%d", context->FRAME.PSM); + str[2] = format("%d", context->ZBUF.PSM); + + D3D11_SHADER_MACRO macro[] = + { + {"PS_BATCH_SIZE", str[0].c_str()}, + {"PS_FPSM", str[1].c_str()}, + {"PS_ZPSM", str[2].c_str()}, + {NULL, NULL}, + }; + + dev->CompileShader(IDR_CS_FX, "ps_main1", macro, &ps[1]); + + m_ps1[ps_sel] = ps[1]; } + + PSConstantBuffer ps_cb; + + ps_cb.fm = fm; + ps_cb.zm = zm; + + ctx->UpdateSubresource(m_ps_cb, 0, NULL, &ps_cb, 0, 0); // TODO: only update if changed + + OffsetBuffer* fzbo = NULL; - // + GetOffsetBuffer(&fzbo); - dev->CSSetShaderUAV(0, m_vm_uav); - - dev->CSSetShaderSRV(0, vb_srv); - dev->CSSetShaderSRV(1, ib_srv); - dev->CSSetShaderSRV(2, fbr_srv); - dev->CSSetShaderSRV(3, fbc_srv); - dev->CSSetShaderSRV(4, zbr_srv); - dev->CSSetShaderSRV(5, zbc_srv); - - dev->CSSetShader(cs); + dev->PSSetShaderResourceView(0, fzbo->row_srv); + dev->PSSetShaderResourceView(1, fzbo->col_srv); + // TODO: palette, texture - GSVector4i bbox = GSVector4i(0, 0, 640, 512); // TODO: vertex trace + int step = PS_BATCH_SIZE * GSUtil::GetVertexCount(PRIM->PRIM); - GSVector4i r = bbox.ralign(GSVector2i(16, 8)); + for(int i = 0; i < m_index.tail; i += step) + { + dev->IASetPrimitiveTopology(topology); + dev->GSSetShader(gs[0]); + dev->PSSetShader(ps[0], m_ps_cb); + dev->DrawIndexedPrimitive(i, std::min(m_index.tail - i, step)); - bool fb = true; // TODO: frame buffer used - bool zb = true; // TODO: z-buffer used + dev->IASetPrimitiveTopology(D3D11_PRIMITIVE_TOPOLOGY_LINELIST); + dev->GSSetShader(gs[1]); + dev->PSSetShader(ps[1], m_ps_cb); + dev->DrawIndexedPrimitive(m_index.tail, 2); - if(fb) Write(m_context->offset.fb, r); - if(zb) Write(m_context->offset.zb, r); + //printf("%d/%d, %d %d %d %d\n", i, m_index.tail, r2.x, r2.y, r2.z, r2.w); + } - // TODO: constant buffer (frequently chaning drawing attribs) - // TODO: texture (implement texture cache) - // TODO: clut to a palette texture (should be texture1d, not simply buffer, it is random accessed) - // TODO: CSSetShaderSRV(6 7 8 ..., texture level 0 1 2 ...) or use Texture3D? - // TODO: invalidate texture cache + dev->EndScene(); - /* - CComPtr q; + if(0) + { + std::string s; + /* + s = format("c:\\temp1\\_%05d_f%lld_fb0_%05x_%d.bmp", s_n, m_perfmon.GetFrame(), 0, 0); + m_mem.SaveBMP(s, 0, 16, PSM_PSMCT32, 1024, 1024); + Read(m_mem.GetOffset(0, 16, PSM_PSMCT32), GSVector4i(0, 0, 1024, 1024), false); + */ + // + if(fm != 0xffffffff) Read(context->offset.fb, r, false); + // + if(zm != 0xffffffff) Read(context->offset.zb, r, false); - D3D11_QUERY_DESC qd; - memset(&qd, 0, sizeof(qd)); - qd.Query = D3D11_QUERY_EVENT; + s = format("c:\\temp1\\_%05d_f%lld_rt1_%05x_%d.bmp", s_n, m_perfmon.GetFrame(), m_context->FRAME.Block(), m_context->FRAME.PSM); + m_mem.SaveBMP(s, m_context->FRAME.Block(), m_context->FRAME.FBW, m_context->FRAME.PSM, GetFrameRect().width(), 512); - hr = (*dev)->CreateQuery(&qd, &q); + s = format("c:\\temp1\\_%05d_f%lld_zt1_%05x_%d.bmp", s_n, m_perfmon.GetFrame(), m_context->ZBUF.Block(), m_context->ZBUF.PSM); + m_mem.SaveBMP(s, m_context->ZBUF.Block(), m_context->FRAME.FBW, m_context->ZBUF.PSM, GetFrameRect().width(), 512); - ctx->Begin(q); - */ - - printf("[%lld] dispatch %05x %d %05x %d %05x %d %dx%d | %d %d %d\n", - __rdtsc(), - m_context->FRAME.Block(), m_context->FRAME.PSM, - m_context->ZBUF.Block(), m_context->ZBUF.PSM, - PRIM->TME ? m_context->TEX0.TBP0 : 0xfffff, m_context->TEX0.PSM, (int)m_context->TEX0.TW, (int)m_context->TEX0.TH, - PRIM->PRIM, m_vertex.next, m_index.tail); + /* + s = format("c:\\temp1\\_%05d_f%lld_fb1_%05x_%d.bmp", s_n, m_perfmon.GetFrame(), 0, 0); + m_mem.SaveBMP(s, 0, 16, PSM_PSMCT32, 1024, 1024); + */ - GSVector4i rsize = r.rsize(); - - dev->Dispatch(rsize.z >> 4, rsize.w >> 3, 1); // TODO: pass upper-left corner offset (r.xy) in a const buffer - - /* - ctx->End(q); - - uint64 t0 = __rdtsc(); - - BOOL b; - - while(S_OK != ctx->GetData(q, &b, sizeof(BOOL), 0)) {} - - printf("%lld\n", __rdtsc() - t0); - */ + s_n++; + } } void GSRendererCS::InvalidateVideoMem(const GIFRegBITBLTBUF& BITBLTBUF, const GSVector4i& r) { GSOffset* o = m_mem.GetOffset(BITBLTBUF.DBP, BITBLTBUF.DBW, BITBLTBUF.DPSM); - Read(o, r, true); // TODO: fully overwritten pages are not needed to be read, only invalidated + Read(o, r, true); // TODO: fully overwritten pages are not needed to be read, only invalidated (important) // TODO: false deps, 8H/4HL/4HH texture sharing pages with 24-bit target // TODO: invalidate texture cache @@ -356,6 +703,10 @@ void GSRendererCS::Write(GSOffset* o, const GSVector4i& r) memset(&box, 0, sizeof(box)); + box.right = 1; + box.bottom = 1; + box.back = 1; + uint32* pages = o->GetPages(r); for(size_t i = 0; pages[i] != GSOffset::EOP; i++) @@ -370,10 +721,20 @@ void GSRendererCS::Write(GSOffset* o, const GSVector4i& r) m_vm_valid[row] |= col; box.left = page * PAGE_SIZE; - box.right = box.left + PAGE_SIZE; + box.right = (page + 1) * PAGE_SIZE; - ctx->UpdateSubresource(m_vm, 0, &box, m_mem.m_vm8 + box.left, 0, 0); + ctx->UpdateSubresource(m_vm, 0, &box, m_mem.m_vm8 + page * PAGE_SIZE, 0, 0); +/* + // m_vm texture row is 2k in bytes, one page is 8k => starting row: addr / 4k, number of rows: 8k / 2k = 4 + box.left = 0; + box.right = PAGE_SIZE; + box.top = page; + box.bottom = box.top + 1; + + ctx->UpdateSubresource(m_vm, 0, &box, m_mem.m_vm8 + page * PAGE_SIZE, 0, 0); +*/ + if(0) printf("[%lld] write %05x %d %d (%d)\n", __rdtsc(), o->bp, o->bw, o->psm, page); } } @@ -391,6 +752,10 @@ void GSRendererCS::Read(GSOffset* o, const GSVector4i& r, bool invalidate) memset(&box, 0, sizeof(box)); + box.right = 1; + box.bottom = 1; + box.back = 1; + uint32* pages = o->GetPages(r); for(size_t i = 0; pages[i] != GSOffset::EOP; i++) @@ -402,21 +767,34 @@ void GSRendererCS::Read(GSOffset* o, const GSVector4i& r, bool invalidate) if(m_vm_valid[row] & col) { - if(invalidate) m_vm_valid[row] ^= col; + if(invalidate) + { + m_vm_valid[row] ^= col; + } box.left = page * PAGE_SIZE; - box.right = box.left + PAGE_SIZE; + box.right = (page + 1) * PAGE_SIZE; ctx->CopySubresourceRegion(m_pb, 0, 0, 0, 0, m_vm, 0, &box); +/* + // m_vm texture row is 2k in bytes, one page is 8k => starting row: addr / 4k, number of rows: 8k / 2k = 4 + box.left = 0; + box.right = PAGE_SIZE; + box.top = page; + box.bottom = box.top + 1; + + ctx->CopySubresourceRegion(m_pb, 0, 0, 0, 0, m_vm, 0, &box); +*/ D3D11_MAPPED_SUBRESOURCE map; - if(SUCCEEDED(ctx->Map(m_pb, 0, D3D11_MAP_READ_WRITE, 0, &map))) + if(SUCCEEDED(ctx->Map(m_pb, 0, D3D11_MAP_READ, 0, &map))) { - memcpy(m_mem.m_vm8 + box.left, map.pData, PAGE_SIZE); + memcpy(m_mem.m_vm8 + page * PAGE_SIZE, map.pData, PAGE_SIZE); ctx->Unmap(m_pb, 0); - + + if(0) printf("[%lld] read %05x %d %d (%d)\n", __rdtsc(), o->bp, o->bw, o->psm, page); } } @@ -424,3 +802,64 @@ void GSRendererCS::Read(GSOffset* o, const GSVector4i& r, bool invalidate) delete [] pages; } + +bool GSRendererCS::GetOffsetBuffer(OffsetBuffer** fzbo) +{ + HRESULT hr; + + GSDevice11* dev = (GSDevice11*)m_dev; + + D3D11_BUFFER_DESC bd; + D3D11_SHADER_RESOURCE_VIEW_DESC srvd; + D3D11_SUBRESOURCE_DATA data; + + hash_map::iterator i = m_offset.find(m_context->offset.fzb->hash); + + if(i == m_offset.end()) + { + OffsetBuffer ob; + + memset(&bd, 0, sizeof(bd)); + + bd.ByteWidth = sizeof(GSVector2i) * 2048; + bd.Usage = D3D11_USAGE_IMMUTABLE; + bd.BindFlags = D3D11_BIND_SHADER_RESOURCE; + + memset(&data, 0, sizeof(data)); + + data.pSysMem = m_context->offset.fzb->row; + + hr = (*dev)->CreateBuffer(&bd, &data, &ob.row); + + if(FAILED(hr)) return false; + + data.pSysMem = m_context->offset.fzb->col; + + hr = (*dev)->CreateBuffer(&bd, &data, &ob.col); + + if(FAILED(hr)) return false; + + memset(&srvd, 0, sizeof(srvd)); + + srvd.Format = DXGI_FORMAT_R32G32_SINT; + srvd.ViewDimension = D3D11_SRV_DIMENSION_BUFFER; + srvd.Buffer.FirstElement = 0; + srvd.Buffer.NumElements = 2048; + + hr = (*dev)->CreateShaderResourceView(ob.row, &srvd, &ob.row_srv); + + if(FAILED(hr)) return false; + + hr = (*dev)->CreateShaderResourceView(ob.col, &srvd, &ob.col_srv); + + if(FAILED(hr)) return false; + + m_offset[m_context->offset.fzb->hash] = ob; + + i = m_offset.find(m_context->offset.fzb->hash); + } + + *fzbo = &i->second; + + return true; +} diff --git a/plugins/GSdx/GSRendererCS.h b/plugins/GSdx/GSRendererCS.h index 42f45d58af..8a08e04427 100644 --- a/plugins/GSdx/GSRendererCS.h +++ b/plugins/GSdx/GSRendererCS.h @@ -26,28 +26,114 @@ class GSRendererCS : public GSRenderer { - class GSVertexTraceCS : public GSVertexTrace + struct VSSelector { - public: - GSVertexTraceCS(const GSState* state) : GSVertexTrace(state) {} + union + { + struct + { + uint32 tme:1; + uint32 fst:1; + }; + + uint32 key; + }; + + operator uint32() {return key & 0x3;} + + VSSelector() : key(0) {} }; + __aligned(struct, 32) VSConstantBuffer + { + GSVector4 VertexScale; + GSVector4 VertexOffset; + }; + + struct GSSelector + { + union + { + struct + { + uint32 iip:1; + uint32 prim:2; + }; + + uint32 key; + }; + + operator uint32() {return key & 0x7;} + + GSSelector() : key(0) {} + }; + + struct PSSelector + { + union + { + struct + { + uint32 fpsm:6; + uint32 zpsm:6; + }; + + uint32 key; + }; + + operator uint32() {return key & 0x3ff;} + + PSSelector() : key(0) {} + }; + + __aligned(struct, 32) PSConstantBuffer + { + uint32 fm; + uint32 zm; + }; + + CComPtr m_dss; + CComPtr m_bs; + CComPtr m_ss; + CComPtr m_lb; + CComPtr m_lb_uav; + CComPtr m_lb_srv; + CComPtr m_sob; + CComPtr m_sob_uav; + CComPtr m_sob_srv; CComPtr m_vm; + //CComPtr m_vm; CComPtr m_vm_uav; - CComPtr m_vb; - CComPtr m_ib; - CComPtr m_pb; - hash_map > m_cs; uint32 m_vm_valid[16]; + CComPtr m_pb; + //CComPtr m_pb; + hash_map m_vs; + CComPtr m_vs_cb; + hash_map > m_gs; + CComPtr m_ps0; + hash_map > m_ps1; + CComPtr m_ps_cb; void Write(GSOffset* o, const GSVector4i& r); void Read(GSOffset* o, const GSVector4i& r, bool invalidate); - + + struct OffsetBuffer + { + CComPtr row, col; + CComPtr row_srv, col_srv; + }; + + hash_map m_offset; + + bool GetOffsetBuffer(OffsetBuffer** fzbo); + protected: - template - void ConvertVertex(size_t dst_index, size_t src_index); + GSTexture* m_texture[2]; + uint8* m_output; bool CreateDevice(GSDevice* dev); + void ResetDevice(); + void VSync(int field); GSTexture* GetOutput(int i); void Draw(); void InvalidateVideoMem(const GIFRegBITBLTBUF& BITBLTBUF, const GSVector4i& r); diff --git a/plugins/GSdx/GSRendererDX.cpp b/plugins/GSdx/GSRendererDX.cpp index 0a887490e4..c8b447a798 100644 --- a/plugins/GSdx/GSRendererDX.cpp +++ b/plugins/GSdx/GSRendererDX.cpp @@ -23,10 +23,9 @@ #include "GSRendererDX.h" #include "GSDeviceDX.h" -GSRendererDX::GSRendererDX(GSVertexTrace* vt, size_t vertex_stride, GSTextureCache* tc, const GSVector2& pixelcenter) - : GSRendererHW(vt, vertex_stride, tc) +GSRendererDX::GSRendererDX(GSTextureCache* tc, const GSVector2& pixelcenter) + : GSRendererHW(tc) , m_pixelcenter(pixelcenter) - , m_topology(-1) { m_logz = !!theApp.GetConfig("logz", 0); m_fba = !!theApp.GetConfig("fba", 1); @@ -61,7 +60,7 @@ void GSRendererDX::DrawPrims(GSTexture* rt, GSTexture* ds, GSTextureCache::Sourc GSVector4 s = GSVector4(rtscale.x / rtsize.x, rtscale.y / rtsize.y); GSVector4 o = GSVector4(-1.0f, 1.0f); - GSVector4 src = ((m_vt->m_min.p.xyxy(m_vt->m_max.p) + o.xxyy()) * s.xyxy()).sat(o.zzyy()); + GSVector4 src = ((m_vt.m_min.p.xyxy(m_vt.m_max.p) + o.xxyy()) * s.xyxy()).sat(o.zzyy()); GSVector4 dst = src * 2.0f + o.xxxx(); GSVertexPT1 vertices[] = @@ -111,7 +110,7 @@ void GSRendererDX::DrawPrims(GSTexture* rt, GSTexture* ds, GSTextureCache::Sourc if(!IsOpaque()) { - om_bsel.abe = PRIM->ABE || PRIM->AA1 && m_vt->m_primclass == GS_LINE_CLASS; + om_bsel.abe = PRIM->ABE || PRIM->AA1 && m_vt.m_primclass == GS_LINE_CLASS; om_bsel.a = context->ALPHA.A; om_bsel.b = context->ALPHA.B; @@ -154,11 +153,11 @@ void GSRendererDX::DrawPrims(GSTexture* rt, GSTexture* ds, GSTextureCache::Sourc { if(context->ZBUF.PSM == PSM_PSMZ24) { - if(m_vt->m_max.p.z > 0xffffff) + if(m_vt.m_max.p.z > 0xffffff) { - ASSERT(m_vt->m_min.p.z > 0xffffff); + ASSERT(m_vt.m_min.p.z > 0xffffff); // Fixme :Following conditional fixes some dialog frame in Wild Arms 3, but may not be what was intended. - if (m_vt->m_min.p.z > 0xffffff) + if (m_vt.m_min.p.z > 0xffffff) { vs_sel.bppz = 1; om_dssel.ztst = ZTST_ALWAYS; @@ -167,11 +166,11 @@ void GSRendererDX::DrawPrims(GSTexture* rt, GSTexture* ds, GSTextureCache::Sourc } else if(context->ZBUF.PSM == PSM_PSMZ16 || context->ZBUF.PSM == PSM_PSMZ16S) { - if(m_vt->m_max.p.z > 0xffff) + if(m_vt.m_max.p.z > 0xffff) { - ASSERT(m_vt->m_min.p.z > 0xffff); // sfex capcom logo + ASSERT(m_vt.m_min.p.z > 0xffff); // sfex capcom logo // Fixme : Same as above, I guess. - if (m_vt->m_min.p.z > 0xffff) + if (m_vt.m_min.p.z > 0xffff) { vs_sel.bppz = 2; om_dssel.ztst = ZTST_ALWAYS; @@ -213,7 +212,7 @@ void GSRendererDX::DrawPrims(GSTexture* rt, GSTexture* ds, GSTextureCache::Sourc GSDeviceDX::GSSelector gs_sel; gs_sel.iip = PRIM->IIP; - gs_sel.prim = m_vt->m_primclass; + gs_sel.prim = m_vt.m_primclass; // ps @@ -233,7 +232,7 @@ void GSRendererDX::DrawPrims(GSTexture* rt, GSTexture* ds, GSTextureCache::Sourc } } - if (env.COLCLAMP.CLAMP == 0 && /* hack */ !tex && PRIM->PRIM != GS_POINTLIST) + if(env.COLCLAMP.CLAMP == 0 && /* hack */ !tex && PRIM->PRIM != GS_POINTLIST) { ps_sel.colclip = 1; } @@ -281,7 +280,7 @@ void GSRendererDX::DrawPrims(GSTexture* rt, GSTexture* ds, GSTextureCache::Sourc ps_sel.aem = env.TEXA.AEM; ps_sel.tfx = context->TEX0.TFX; ps_sel.tcc = context->TEX0.TCC; - ps_sel.ltf = m_filter == 2 ? m_vt->IsLinear() : m_filter; + ps_sel.ltf = m_filter == 2 ? m_vt.IsLinear() : m_filter; ps_sel.rt = tex->m_target; int w = tex->m_texture->GetWidth(); @@ -330,8 +329,9 @@ void GSRendererDX::DrawPrims(GSTexture* rt, GSTexture* ds, GSTextureCache::Sourc uint8 afix = context->ALPHA.FIX; + SetupIA(); + dev->SetupOM(om_dssel, om_bsel, afix); - dev->SetupIA(m_vertex.buff, m_vertex.next, m_index.buff, m_index.tail, m_topology); dev->SetupVS(vs_sel, &vs_cb); dev->SetupGS(gs_sel); dev->SetupPS(ps_sel, &ps_cb, ps_ssel); diff --git a/plugins/GSdx/GSRendererDX.h b/plugins/GSdx/GSRendererDX.h index b693d89315..13dfbc48c6 100644 --- a/plugins/GSdx/GSRendererDX.h +++ b/plugins/GSdx/GSRendererDX.h @@ -32,13 +32,12 @@ class GSRendererDX : public GSRendererHW bool UserHacks_AlphaHack; protected: - int m_topology; - virtual void DrawPrims(GSTexture* rt, GSTexture* ds, GSTextureCache::Source* tex); + virtual void SetupIA() = 0; virtual void UpdateFBA(GSTexture* rt) {} public: - GSRendererDX(GSVertexTrace* vt, size_t vertex_stride, GSTextureCache* tc, const GSVector2& pixelcenter = GSVector2(0, 0)); + GSRendererDX(GSTextureCache* tc, const GSVector2& pixelcenter = GSVector2(0, 0)); virtual ~GSRendererDX(); }; diff --git a/plugins/GSdx/GSRendererDX11.cpp b/plugins/GSdx/GSRendererDX11.cpp index d9b3c2c6d9..2feb6c7ec2 100644 --- a/plugins/GSdx/GSRendererDX11.cpp +++ b/plugins/GSdx/GSRendererDX11.cpp @@ -25,9 +25,8 @@ #include "resource.h" GSRendererDX11::GSRendererDX11() - : GSRendererDX(new GSVertexTraceDX11(this), sizeof(GSVertexHW11), new GSTextureCache11(this), GSVector2(-0.5f, -0.5f)) + : GSRendererDX(new GSTextureCache11(this), GSVector2(-0.5f, -0.5f)) { - InitConvertVertex(GSRendererDX11); } bool GSRendererDX11::CreateDevice(GSDevice* dev) @@ -38,43 +37,38 @@ bool GSRendererDX11::CreateDevice(GSDevice* dev) return true; } -template -void GSRendererDX11::ConvertVertex(size_t dst_index, size_t src_index) +void GSRendererDX11::SetupIA() { - GSVertex* s = (GSVertex*)((GSVertexHW11*)m_vertex.buff + src_index); - GSVertexHW11* d = (GSVertexHW11*)m_vertex.buff + dst_index; + GSDevice11* dev = (GSDevice11*)m_dev; - GSVector4i v0 = ((GSVector4i*)s)[0]; - GSVector4i v1 = ((GSVector4i*)s)[1]; + void* ptr = NULL; - if(tme && fst) + if(dev->IAMapVertexBuffer(&ptr, sizeof(GSVertex), m_vertex.next)) { - // TODO: modify VertexTrace and the shaders to read uv from v1.u16[0], v1.u16[1], then this step is not needed + GSVector4i::storent(ptr, m_vertex.buff, sizeof(GSVertex) * m_vertex.next); - v0 = GSVector4i::cast(GSVector4(v1.uph16()).xyzw(GSVector4::cast(v0))); // uv => st + dev->IAUnmapVertexBuffer(); } - ((GSVector4i*)d)[0] = v0; - ((GSVector4i*)d)[1] = v1; -} + dev->IASetIndexBuffer(m_index.buff, m_index.tail); -void GSRendererDX11::DrawPrims(GSTexture* rt, GSTexture* ds, GSTextureCache::Source* tex) -{ - switch(m_vt->m_primclass) + D3D11_PRIMITIVE_TOPOLOGY t; + + switch(m_vt.m_primclass) { case GS_POINT_CLASS: - m_topology = D3D11_PRIMITIVE_TOPOLOGY_POINTLIST; + t = D3D11_PRIMITIVE_TOPOLOGY_POINTLIST; break; case GS_LINE_CLASS: case GS_SPRITE_CLASS: - m_topology = D3D11_PRIMITIVE_TOPOLOGY_LINELIST; + t = D3D11_PRIMITIVE_TOPOLOGY_LINELIST; break; case GS_TRIANGLE_CLASS: - m_topology = D3D11_PRIMITIVE_TOPOLOGY_TRIANGLELIST; + t = D3D11_PRIMITIVE_TOPOLOGY_TRIANGLELIST; break; default: __assume(0); } - - __super::DrawPrims(rt, ds, tex); + + dev->IASetPrimitiveTopology(t); } diff --git a/plugins/GSdx/GSRendererDX11.h b/plugins/GSdx/GSRendererDX11.h index 30451710c9..1f9194ac14 100644 --- a/plugins/GSdx/GSRendererDX11.h +++ b/plugins/GSdx/GSRendererDX11.h @@ -28,14 +28,7 @@ class GSRendererDX11 : public GSRendererDX { protected: - template - void ConvertVertex(size_t dst_index, size_t src_index); - void DrawPrims(GSTexture* rt, GSTexture* ds, GSTextureCache::Source* tex); - - int GetPosX(const void* vertex) const {return (int)((const GSVertexHW11*)vertex)->p.x;} - int GetPosY(const void* vertex) const {return (int)((const GSVertexHW11*)vertex)->p.y;} - uint32 GetColor(const void* vertex) const {return ((const GSVertexHW11*)vertex)->c0;} - void SetColor(void* vertex, uint32 c) const {((GSVertexHW11*)vertex)->c0 = c;} + void SetupIA(); public: GSRendererDX11(); diff --git a/plugins/GSdx/GSRendererDX9.cpp b/plugins/GSdx/GSRendererDX9.cpp index a0dfa10af2..ab0143e17f 100644 --- a/plugins/GSdx/GSRendererDX9.cpp +++ b/plugins/GSdx/GSRendererDX9.cpp @@ -25,9 +25,8 @@ #include "resource.h" GSRendererDX9::GSRendererDX9() - : GSRendererDX(new GSVertexTraceDX9(this), sizeof(GSVertexHW9), new GSTextureCache9(this)) + : GSRendererDX(new GSTextureCache9(this)) { - InitConvertVertex(GSRendererDX9); } bool GSRendererDX9::CreateDevice(GSDevice* dev) @@ -57,56 +56,21 @@ bool GSRendererDX9::CreateDevice(GSDevice* dev) return true; } -template -void GSRendererDX9::ConvertVertex(size_t dst_index, size_t src_index) +void GSRendererDX9::SetupIA() { - GSVertex* s = (GSVertex*)((GSVertexHW9*)m_vertex.buff + src_index); - GSVertexHW9* d = (GSVertexHW9*)m_vertex.buff + dst_index; + D3DPRIMITIVETYPE topology; - GSVector4 p = GSVector4(GSVector4i::load(s->XYZ.u32[0]).upl16()); - - if(tme && !fst) - { - p = p.xyxy(GSVector4((float)s->XYZ.Z, s->RGBAQ.Q)); - } - else - { - p = p.xyxy(GSVector4::load((float)s->XYZ.Z)); - } - - GSVector4 t = GSVector4::zero(); - - if(tme) - { - if(fst) - { - t = GSVector4(GSVector4i::load(s->UV).upl16()); - } - else - { - t = GSVector4::loadl(&s->ST); - } - } - - t = t.xyxy(GSVector4::cast(GSVector4i(s->RGBAQ.u32[0], s->FOG))); - - d->p = p; - d->t = t; -} - -void GSRendererDX9::DrawPrims(GSTexture* rt, GSTexture* ds, GSTextureCache::Source* tex) -{ - switch(m_vt->m_primclass) + switch(m_vt.m_primclass) { case GS_POINT_CLASS: - m_topology = D3DPT_POINTLIST; + topology = D3DPT_POINTLIST; break; case GS_LINE_CLASS: - m_topology = D3DPT_LINELIST; + topology = D3DPT_LINELIST; if(PRIM->IIP == 0) { @@ -122,7 +86,7 @@ void GSRendererDX9::DrawPrims(GSTexture* rt, GSTexture* ds, GSTextureCache::Sour case GS_TRIANGLE_CLASS: - m_topology = D3DPT_TRIANGLELIST; + topology = D3DPT_TRIANGLELIST; if(PRIM->IIP == 0) { @@ -138,7 +102,7 @@ void GSRendererDX9::DrawPrims(GSTexture* rt, GSTexture* ds, GSTextureCache::Sour case GS_SPRITE_CLASS: - m_topology = D3DPT_TRIANGLELIST; + topology = D3DPT_TRIANGLELIST; // each sprite converted to quad needs twice the space @@ -154,29 +118,35 @@ void GSRendererDX9::DrawPrims(GSTexture* rt, GSTexture* ds, GSTextureCache::Sour size_t count = m_vertex.next; int i = (int)count * 2 - 4; - GSVertexHW9* s = (GSVertexHW9*)&m_vertex.buff[sizeof(GSVertexHW9) * count] - 2; - GSVertexHW9* q = (GSVertexHW9*)&m_vertex.buff[sizeof(GSVertexHW9) * (count * 2)] - 4; - uint32* RESTRICT index = &m_index.buff[count * 3] - 6; + GSVertex* s = &m_vertex.buff[count - 2]; + GSVertex* q = &m_vertex.buff[count * 2 - 4]; + uint32* RESTRICT index = &m_index.buff[count * 3 - 6]; for(; i >= 0; i -= 4, s -= 2, q -= 4, index -= 6) { - GSVertexHW9 v0 = s[0]; - GSVertexHW9 v1 = s[1]; + GSVertex v0 = s[0]; + GSVertex v1 = s[1]; - v0.p = v0.p.xyzw(v1.p); // z, q - v0.t = v0.t.xyzw(v1.t); // c, f + v0.RGBAQ = v1.RGBAQ; + v0.XYZ.Z = v1.XYZ.Z; + v0.FOG = v1.FOG; q[0] = v0; q[3] = v1; - // swap x, s + // swap x, s, u - GSVector4 p = v0.p.insert<0, 0>(v1.p); - GSVector4 t = v0.t.insert<0, 0>(v1.t); - v1.p = v1.p.insert<0, 0>(v0.p); - v1.t = v1.t.insert<0, 0>(v0.t); - v0.p = p; - v0.t = t; + uint16 x = v0.XYZ.X; + v0.XYZ.X = v1.XYZ.X; + v1.XYZ.X = x; + + float s = v0.ST.S; + v0.ST.S = v1.ST.S; + v1.ST.S = s; + + uint16 u = v0.U; + v0.U = v1.U; + v1.U = u; q[1] = v0; q[2] = v1; @@ -199,9 +169,56 @@ void GSRendererDX9::DrawPrims(GSTexture* rt, GSTexture* ds, GSTextureCache::Sour __assume(0); } - (*(GSDevice9*)m_dev)->SetRenderState(D3DRS_SHADEMODE, PRIM->IIP ? D3DSHADE_GOURAUD : D3DSHADE_FLAT); // TODO + GSDevice9* dev = (GSDevice9*)m_dev; - __super::DrawPrims(rt, ds, tex); + (*dev)->SetRenderState(D3DRS_SHADEMODE, PRIM->IIP ? D3DSHADE_GOURAUD : D3DSHADE_FLAT); // TODO + + void* ptr = NULL; + + if(dev->IAMapVertexBuffer(&ptr, sizeof(GSVertexHW9), m_vertex.next)) + { + GSVertex* RESTRICT s = (GSVertex*)m_vertex.buff; + GSVertexHW9* RESTRICT d = (GSVertexHW9*)ptr; + + for(int i = 0; i < m_vertex.next; i++, s++, d++) + { + GSVector4 p = GSVector4(GSVector4i::load(s->XYZ.u32[0]).upl16()); + + if(PRIM->TME && !PRIM->FST) + { + p = p.xyxy(GSVector4((float)s->XYZ.Z, s->RGBAQ.Q)); + } + else + { + p = p.xyxy(GSVector4::load((float)s->XYZ.Z)); + } + + GSVector4 t = GSVector4::zero(); + + if(PRIM->TME) + { + if(PRIM->FST) + { + t = GSVector4(GSVector4i::load(s->UV).upl16()); + } + else + { + t = GSVector4::loadl(&s->ST); + } + } + + t = t.xyxy(GSVector4::cast(GSVector4i(s->RGBAQ.u32[0], s->FOG))); + + d->p = p; + d->t = t; + } + + dev->IAUnmapVertexBuffer(); + } + + dev->IASetIndexBuffer(m_index.buff, m_index.tail); + + dev->IASetPrimitiveTopology(topology); } void GSRendererDX9::UpdateFBA(GSTexture* rt) @@ -220,7 +237,7 @@ void GSRendererDX9::UpdateFBA(GSTexture* rt) GSVector4 s = GSVector4(rt->GetScale().x / rt->GetWidth(), rt->GetScale().y / rt->GetHeight()); GSVector4 o = GSVector4(-1.0f, 1.0f); - GSVector4 src = ((m_vt->m_min.p.xyxy(m_vt->m_max.p) + o.xxyy()) * s.xyxy()).sat(o.zzyy()); + GSVector4 src = ((m_vt.m_min.p.xyxy(m_vt.m_max.p) + o.xxyy()) * s.xyxy()).sat(o.zzyy()); GSVector4 dst = src * 2.0f + o.xxxx(); GSVertexPT1 vertices[] = diff --git a/plugins/GSdx/GSRendererDX9.h b/plugins/GSdx/GSRendererDX9.h index f70a14bfe0..35aa3a181d 100644 --- a/plugins/GSdx/GSRendererDX9.h +++ b/plugins/GSdx/GSRendererDX9.h @@ -34,17 +34,9 @@ protected: Direct3DBlendState9 bs; } m_fba; - template - void ConvertVertex(size_t dst_index, size_t src_index); - - void DrawPrims(GSTexture* rt, GSTexture* ds, GSTextureCache::Source* tex); + void SetupIA(); void UpdateFBA(GSTexture* rt); - int GetPosX(const void* vertex) const {return (int)((const GSVertexHW9*)vertex)->p.x;} - int GetPosY(const void* vertex) const {return (int)((const GSVertexHW9*)vertex)->p.y;} - uint32 GetColor(const void* vertex) const {return ((const GSVertexHW9*)vertex)->t.u32[2];} - void SetColor(void* vertex, uint32 c) const {((GSVertexHW9*)vertex)->t.u32[2] = c;} - public: GSRendererDX9(); virtual ~GSRendererDX9() {} diff --git a/plugins/GSdx/GSRendererHW.cpp b/plugins/GSdx/GSRendererHW.cpp index 921641b721..c773999d5c 100644 --- a/plugins/GSdx/GSRendererHW.cpp +++ b/plugins/GSdx/GSRendererHW.cpp @@ -22,9 +22,8 @@ #include "stdafx.h" #include "GSRendererHW.h" -GSRendererHW::GSRendererHW(GSVertexTrace* vt, size_t vertex_stride, GSTextureCache* tc) - : GSRenderer(vt, vertex_stride) - , m_tc(tc) +GSRendererHW::GSRendererHW(GSTextureCache* tc) + : m_tc(tc) , m_width(1024) , m_height(1024) , m_skip(0) @@ -101,19 +100,18 @@ void GSRendererHW::Reset() void GSRendererHW::VSync(int field) { - GSRenderer::VSync(field); - - m_tc->IncAge(); - m_dev->AgePool(); - - m_skip = 0; - if(m_reset) { m_tc->RemoveAll(); m_reset = false; } + + GSRenderer::VSync(field); + + m_tc->IncAge(); + + m_skip = 0; } void GSRendererHW::ResetDevice() @@ -212,7 +210,7 @@ void GSRendererHW::Draw() GSVector4i r; - GetTextureMinMax(r, context->TEX0, context->CLAMP, m_vt->IsLinear()); + GetTextureMinMax(r, context->TEX0, context->CLAMP, m_vt.IsLinear()); tex = m_tc->LookupSource(context->TEX0, env.TEXA, r); @@ -299,7 +297,7 @@ void GSRendererHW::Draw() // - GSVector4i r = GSVector4i(m_vt->m_min.p.xyxy(m_vt->m_max.p)).rintersect(GSVector4i(context->scissor.in)); + GSVector4i r = GSVector4i(m_vt.m_min.p.xyxy(m_vt.m_max.p)).rintersect(GSVector4i(context->scissor.in)); if(fm != 0xffffffff) { @@ -411,14 +409,14 @@ bool GSRendererHW::OI_FFXII(GSTexture* rt, GSTexture* ds, GSTextureCache::Source if(lines == 0) { - if(m_vt->m_primclass == GS_LINE_CLASS && (m_vertex.next == 448 * 2 || m_vertex.next == 512 * 2)) + if(m_vt.m_primclass == GS_LINE_CLASS && (m_vertex.next == 448 * 2 || m_vertex.next == 512 * 2)) { lines = m_vertex.next / 2; } } else { - if(m_vt->m_primclass == GS_POINT_CLASS) + if(m_vt.m_primclass == GS_POINT_CLASS) { if(m_vertex.next >= 16 * 512) { @@ -429,14 +427,14 @@ bool GSRendererHW::OI_FFXII(GSTexture* rt, GSTexture* ds, GSTextureCache::Source int ox = m_context->XYOFFSET.OFX; int oy = m_context->XYOFFSET.OFY; - const uint8* RESTRICT v = m_vertex.buff; + const GSVertex* RESTRICT v = m_vertex.buff; - for(int i = (int)m_vertex.next; i >= 0; i--, v += m_vertex.stride) + for(int i = (int)m_vertex.next; i >= 0; i--, v++) { - int x = (GetPosX(v) - ox) >> 4; - int y = (GetPosY(v) - oy) >> 4; + int x = (v->XYZ.X - ox) >> 4; + int y = (v->XYZ.Y - oy) >> 4; - video[(y << 8) + (y << 7) + (y << 6) + x] = GetColor(v); + video[(y << 8) + (y << 7) + (y << 6) + x] = v->RGBAQ.u32[0]; } return false; @@ -446,7 +444,7 @@ bool GSRendererHW::OI_FFXII(GSTexture* rt, GSTexture* ds, GSTextureCache::Source lines = 0; } } - else if(m_vt->m_primclass == GS_LINE_CLASS) + else if(m_vt.m_primclass == GS_LINE_CLASS) { if(m_vertex.next == lines * 2) { @@ -459,10 +457,8 @@ bool GSRendererHW::OI_FFXII(GSTexture* rt, GSTexture* ds, GSTextureCache::Source t->m_texture->Update(GSVector4i(0, 0, 448, lines), video, 448 * 4); - size_t stride = m_vertex.stride; - - memcpy(&m_vertex.buff[stride * 2], &m_vertex.buff[stride * (m_vertex.next - 2)], stride); - memcpy(&m_vertex.buff[stride * 3], &m_vertex.buff[stride * (m_vertex.next - 1)], stride); + m_vertex.buff[2] = m_vertex.buff[m_vertex.next - 2]; + m_vertex.buff[3] = m_vertex.buff[m_vertex.next - 1]; m_index.buff[0] = 0; m_index.buff[1] = 1; @@ -474,7 +470,7 @@ bool GSRendererHW::OI_FFXII(GSTexture* rt, GSTexture* ds, GSTextureCache::Source m_vertex.head = m_vertex.tail = m_vertex.next = 4; m_index.tail = 6; - m_vt->Update(m_vertex.buff, m_index.buff, m_index.tail, GS_TRIANGLE_CLASS); + m_vt.Update(m_vertex.buff, m_index.buff, m_index.tail, GS_TRIANGLE_CLASS); } else { @@ -506,11 +502,11 @@ bool GSRendererHW::OI_MetalSlug6(GSTexture* rt, GSTexture* ds, GSTextureCache::S { // missing red channel fix (looks alright in pcsx2 r5000+) - uint8* RESTRICT v = m_vertex.buff; + GSVertex* RESTRICT v = m_vertex.buff; - for(int i = (int)m_vertex.next; i >= 0; i--, v += m_vertex.stride) + for(int i = (int)m_vertex.next; i >= 0; i--, v++) { - uint32 c = GetColor(v); + uint32 c = v->RGBAQ.u32[0]; uint32 r = (c >> 0) & 0xff; uint32 g = (c >> 8) & 0xff; @@ -518,11 +514,11 @@ bool GSRendererHW::OI_MetalSlug6(GSTexture* rt, GSTexture* ds, GSTextureCache::S if(r == 0 && g != 0 && b != 0) { - SetColor(v, (c & 0xffffff00) | ((g + b + 1) >> 1)); + v->RGBAQ.u32[0] = (c & 0xffffff00) | ((g + b + 1) >> 1); } } - m_vt->Update(m_vertex.buff, m_index.buff, m_index.tail, m_vt->m_primclass); + m_vt.Update(m_vertex.buff, m_index.buff, m_index.tail, m_vt.m_primclass); return true; } @@ -702,7 +698,7 @@ bool GSRendererHW::OI_StarWarsForceUnleashed(GSTexture* rt, GSTexture* ds, GSTex } else if(PRIM->TME) { - if((FBP == 0x0 || FBP == 0x01180) && FPSM == PSM_PSMCT32 && (m_vt->m_max.p.z == m_vt->m_min.p.z && m_vt->m_max.p.z == 0)) + if((FBP == 0x0 || FBP == 0x01180) && FPSM == PSM_PSMCT32 && (m_vt.m_eq.z && m_vt.m_max.p.z == 0)) { m_dev->ClearDepth(ds, 0); } @@ -758,7 +754,7 @@ bool GSRendererHW::OI_SpyroNewBeginning(GSTexture* rt, GSTexture* ds, GSTextureC } else if(PRIM->TME) { - if((FBP == 0x0 || FBP == 0x01180) && FPSM == PSM_PSMCT32 && (m_vt->m_max.p.z == m_vt->m_min.p.z && m_vt->m_min.p.z == 0x0)) + if((FBP == 0x0 || FBP == 0x01180) && FPSM == PSM_PSMCT32 && (m_vt.m_eq.z && m_vt.m_min.p.z == 0)) { m_dev->ClearDepth(ds, 0); } @@ -784,7 +780,7 @@ bool GSRendererHW::OI_SpyroEternalNight(GSTexture* rt, GSTexture* ds, GSTextureC } else if(PRIM->TME) { - if((FBP == 0x0 || FBP == 0x01180) && FPSM == PSM_PSMCT32 && (m_vt->m_max.p.z == m_vt->m_min.p.z && m_vt->m_min.p.z == 0x0)) + if((FBP == 0x0 || FBP == 0x01180) && FPSM == PSM_PSMCT32 && (m_vt.m_eq.z && m_vt.m_min.p.z == 0)) { m_dev->ClearDepth(ds, 0); } @@ -798,7 +794,7 @@ bool GSRendererHW::OI_TalesOfLegendia(GSTexture* rt, GSTexture* ds, GSTextureCac uint32 FBP = m_context->FRAME.Block(); uint32 FPSM = m_context->FRAME.PSM; - if (FPSM == PSM_PSMCT32 && FBP == 0x01c00 && !m_context->TEST.ATE && m_vt->m_max.p.z == m_vt->m_min.p.z) + if (FPSM == PSM_PSMCT32 && FBP == 0x01c00 && !m_context->TEST.ATE && m_vt.m_eq.z) { m_context->TEST.ZTST = ZTST_ALWAYS; //m_dev->ClearDepth(ds, 0); @@ -810,7 +806,7 @@ bool GSRendererHW::OI_TalesOfLegendia(GSTexture* rt, GSTexture* ds, GSTextureCac bool GSRendererHW::OI_PointListPalette(GSTexture* rt, GSTexture* ds, GSTextureCache::Source* t) { - if(m_vt->m_primclass == GS_POINT_CLASS && !PRIM->TME) + if(m_vt.m_primclass == GS_POINT_CLASS && !PRIM->TME) { uint32 FBP = m_context->FRAME.Block(); uint32 FBW = m_context->FRAME.FBW; @@ -819,16 +815,16 @@ bool GSRendererHW::OI_PointListPalette(GSTexture* rt, GSTexture* ds, GSTextureCa { if(m_vertex.next == 16) { - uint8* RESTRICT v = m_vertex.buff; + GSVertex* RESTRICT v = m_vertex.buff; - for(int i = 0; i < 16; i++, v += m_vertex.stride) + for(int i = 0; i < 16; i++, v++) { - uint32 c = GetColor(v); + uint32 c = v->RGBAQ.u32[0]; uint32 a = c >> 24; c = (a >= 0x80 ? 0xff000000 : (a << 25)) | (c & 0x00ffffff); - SetColor(v, c); + v->RGBAQ.u32[0] = c; m_mem.WritePixel32(i & 7, i >> 3, c, FBP, FBW); } @@ -839,16 +835,16 @@ bool GSRendererHW::OI_PointListPalette(GSTexture* rt, GSTexture* ds, GSTextureCa } else if(m_vertex.next == 256) { - uint8* RESTRICT v = m_vertex.buff; + GSVertex* RESTRICT v = m_vertex.buff; - for(int i = 0; i < 256; i++, v += m_vertex.stride) + for(int i = 0; i < 256; i++, v++) { - uint32 c = GetColor(v); + uint32 c = v->RGBAQ.u32[0]; uint32 a = c >> 24; c = (a >= 0x80 ? 0xff000000 : (a << 25)) | (c & 0x00ffffff); - SetColor(v, c); + v->RGBAQ.u32[0] = c; m_mem.WritePixel32(i & 15, i >> 4, c, FBP, FBW); } diff --git a/plugins/GSdx/GSRendererHW.h b/plugins/GSdx/GSRendererHW.h index 6aed9f469a..2b1befe516 100644 --- a/plugins/GSdx/GSRendererHW.h +++ b/plugins/GSdx/GSRendererHW.h @@ -126,11 +126,6 @@ private: } m_hacks; - virtual int GetPosX(const void* vertex) const = 0; - virtual int GetPosY(const void* vertex) const = 0; - virtual uint32 GetColor(const void* vertex) const = 0; - virtual void SetColor(void* vertex, uint32 c) const = 0; - #pragma endregion protected: @@ -139,7 +134,7 @@ protected: virtual void DrawPrims(GSTexture* rt, GSTexture* ds, GSTextureCache::Source* tex) = 0; public: - GSRendererHW(GSVertexTrace* vt, size_t vertex_stride, GSTextureCache* tc); + GSRendererHW(GSTextureCache* tc); virtual ~GSRendererHW(); void SetGameCRC(uint32 crc, int options); diff --git a/plugins/GSdx/GSRendererNull.h b/plugins/GSdx/GSRendererNull.h index 7db828a0c8..b9b06a415a 100644 --- a/plugins/GSdx/GSRendererNull.h +++ b/plugins/GSdx/GSRendererNull.h @@ -32,11 +32,6 @@ class GSRendererNull : public GSRenderer }; protected: - template - void ConvertVertex(size_t dst_index, size_t src_index) - { - } - void Draw() { } @@ -48,8 +43,7 @@ protected: public: GSRendererNull() - : GSRenderer(new GSVertexTraceNull(this), sizeof(GSVertex)) + : GSRenderer() { - InitConvertVertex(GSRendererNull); } }; diff --git a/plugins/GSdx/GSRendererSW.cpp b/plugins/GSdx/GSRendererSW.cpp index d6494f58c3..44de110260 100644 --- a/plugins/GSdx/GSRendererSW.cpp +++ b/plugins/GSdx/GSRendererSW.cpp @@ -22,14 +22,15 @@ #include "stdafx.h" #include "GSRendererSW.h" +#define LOG 0 + +static FILE* s_fp = LOG ? fopen("c:\\temp1\\_.txt", "w") : NULL; + const GSVector4 g_pos_scale(1.0f / 16, 1.0f / 16, 1.0f, 128.0f); GSRendererSW::GSRendererSW(int threads) - : GSRenderer(new GSVertexTraceSW(this), sizeof(GSVertexSW)) - , m_fzb(NULL) + : m_fzb(NULL) { - InitConvertVertex(GSRendererSW); - m_nativeres = true; // ignore ini, sw is always native m_tc = new GSTextureCacheSW(this); @@ -42,6 +43,17 @@ GSRendererSW::GSRendererSW(int threads) memset(m_fzb_pages, 0, sizeof(m_fzb_pages)); memset(m_tex_pages, 0, sizeof(m_tex_pages)); + + #define InitCVB(P) \ + m_cvb[P][0][0] = &GSRendererSW::ConvertVertexBuffer; \ + m_cvb[P][0][1] = &GSRendererSW::ConvertVertexBuffer; \ + m_cvb[P][1][0] = &GSRendererSW::ConvertVertexBuffer; \ + m_cvb[P][1][1] = &GSRendererSW::ConvertVertexBuffer; \ + + InitCVB(GS_POINT_CLASS); + InitCVB(GS_LINE_CLASS); + InitCVB(GS_TRIANGLE_CLASS); + InitCVB(GS_SPRITE_CLASS); } GSRendererSW::~GSRendererSW() @@ -60,10 +72,9 @@ GSRendererSW::~GSRendererSW() void GSRendererSW::Reset() { - // TODO: GSreset can come from the main thread too => crash - // m_tc->RemoveAll(); + Sync(-1); - m_reset = true; + m_tc->RemoveAll(); GSRenderer::Reset(); } @@ -72,6 +83,112 @@ void GSRendererSW::VSync(int field) { Sync(0); // IncAge might delete a cached texture in use + if(0) if(LOG) + { + fprintf(s_fp, "%lld\n", m_perfmon.GetFrame()); + + GSVector4i dr = GetDisplayRect(); + GSVector4i fr = GetFrameRect(); + GSVector2i ds = GetDeviceSize(); + + fprintf(s_fp, "dr %d %d %d %d, fr %d %d %d %d, ds %d %d\n", + dr.x, dr.y, dr.z, dr.w, + fr.x, fr.y, fr.z, fr.w, + ds.x, ds.y); + + for(int i = 0; i < 2; i++) + { + if(i == 0 && !m_regs->PMODE.EN1) continue; + if(i == 1 && !m_regs->PMODE.EN2) continue; + + fprintf(s_fp, "DISPFB[%d] BP=%05x BW=%d PSM=%d DBX=%d DBY=%d\n", + i, + m_regs->DISP[i].DISPFB.Block(), + m_regs->DISP[i].DISPFB.FBW, + m_regs->DISP[i].DISPFB.PSM, + m_regs->DISP[i].DISPFB.DBX, + m_regs->DISP[i].DISPFB.DBY + ); + + fprintf(s_fp, "DISPLAY[%d] DX=%d DY=%d DW=%d DH=%d MAGH=%d MAGV=%d\n", + i, + m_regs->DISP[i].DISPLAY.DX, + m_regs->DISP[i].DISPLAY.DY, + m_regs->DISP[i].DISPLAY.DW, + m_regs->DISP[i].DISPLAY.DH, + m_regs->DISP[i].DISPLAY.MAGH, + m_regs->DISP[i].DISPLAY.MAGV + ); + } + + fprintf(s_fp, "PMODE EN1=%d EN2=%d CRTMD=%d MMOD=%d AMOD=%d SLBG=%d ALP=%d\n", + m_regs->PMODE.EN1, + m_regs->PMODE.EN2, + m_regs->PMODE.CRTMD, + m_regs->PMODE.MMOD, + m_regs->PMODE.AMOD, + m_regs->PMODE.SLBG, + m_regs->PMODE.ALP + ); + + fprintf(s_fp, "SMODE1 CLKSEL=%d CMOD=%d EX=%d GCONT=%d LC=%d NVCK=%d PCK2=%d PEHS=%d PEVS=%d PHS=%d PRST=%d PVS=%d RC=%d SINT=%d SLCK=%d SLCK2=%d SPML=%d T1248=%d VCKSEL=%d VHP=%d XPCK=%d\n", + m_regs->SMODE1.CLKSEL, + m_regs->SMODE1.CMOD, + m_regs->SMODE1.EX, + m_regs->SMODE1.GCONT, + m_regs->SMODE1.LC, + m_regs->SMODE1.NVCK, + m_regs->SMODE1.PCK2, + m_regs->SMODE1.PEHS, + m_regs->SMODE1.PEVS, + m_regs->SMODE1.PHS, + m_regs->SMODE1.PRST, + m_regs->SMODE1.PVS, + m_regs->SMODE1.RC, + m_regs->SMODE1.SINT, + m_regs->SMODE1.SLCK, + m_regs->SMODE1.SLCK2, + m_regs->SMODE1.SPML, + m_regs->SMODE1.T1248, + m_regs->SMODE1.VCKSEL, + m_regs->SMODE1.VHP, + m_regs->SMODE1.XPCK + ); + + fprintf(s_fp, "SMODE2 INT=%d FFMD=%d DPMS=%d\n", + m_regs->SMODE2.INT, + m_regs->SMODE2.FFMD, + m_regs->SMODE2.DPMS + ); + + fprintf(s_fp, "SRFSH %08x_%08x\n", + m_regs->SRFSH.u32[0], + m_regs->SRFSH.u32[1] + ); + + fprintf(s_fp, "SYNCH1 %08x_%08x\n", + m_regs->SYNCH1.u32[0], + m_regs->SYNCH1.u32[1] + ); + + fprintf(s_fp, "SYNCH2 %08x_%08x\n", + m_regs->SYNCH2.u32[0], + m_regs->SYNCH2.u32[1] + ); + + fprintf(s_fp, "SYNCV %08x_%08x\n", + m_regs->SYNCV.u32[0], + m_regs->SYNCV.u32[1] + ); + + fprintf(s_fp, "CSR %08x_%08x\n", + m_regs->CSR.u32[0], + m_regs->CSR.u32[1] + ); + + fflush(s_fp); + } + /* int draw[8], sum = 0; @@ -87,20 +204,12 @@ void GSRendererSW::VSync(int field) draw[0], draw[1], draw[2], draw[3], draw[4], draw[5], draw[6], draw[7], sum); // - printf("m_sync_count = %d\n", ((GSRasterizerList*)m_rl)->m_sync_count); ((GSRasterizerList*)m_rl)->m_sync_count = 0; - printf("m_syncpoint_count = %d\n", ((GSRasterizerList*)m_rl)->m_syncpoint_count); ((GSRasterizerList*)m_rl)->m_syncpoint_count = 0; */ + GSRenderer::VSync(field); m_tc->IncAge(); - if(m_reset) - { - m_tc->RemoveAll(); - - m_reset = false; - } - // if((m_perfmon.GetFrame() & 255) == 0) m_rl.PrintStats(); } @@ -151,92 +260,104 @@ GSTexture* GSRendererSW::GetOutput(int i) return m_texture[i]; } -template -void GSRendererSW::ConvertVertex(size_t dst_index, size_t src_index) +template +void GSRendererSW::ConvertVertexBuffer(GSVertexSW* RESTRICT dst, const GSVertex* RESTRICT src, size_t count) { - GSVertex* s = (GSVertex*)((GSVertexSW*)m_vertex.buff + src_index); - GSVertexSW* d = (GSVertexSW*)m_vertex.buff + dst_index; + size_t i = m_vertex.next; - ASSERT(d->_pad.u32[0] != 0x12345678); + GSVector4i o = (GSVector4i)m_context->XYOFFSET; + GSVector4 tsize = GSVector4(0x10000 << m_context->TEX0.TW, 0x10000 << m_context->TEX0.TH, 1, 0); - uint32 z = s->XYZ.Z; + #if _M_SSE >= 0x501 - GSVector4i xy = GSVector4i::load((int)s->XYZ.u32[0]).upl16() - (GSVector4i)m_context->XYOFFSET; - GSVector4i zf = GSVector4i((int)std::min(z, 0xffffff00), s->FOG); // NOTE: larger values of z may roll over to 0 when converting back to uint32 later + // TODO: process vertices in pairs, when AVX2 becomes available - GSVector4 p, t, c; - - p = GSVector4(xy).xyxy(GSVector4(zf) + (GSVector4::m_x4f800000 & GSVector4::cast(zf.sra32(31)))) * g_pos_scale; - - if(tme) - { - if(fst) - { - t = GSVector4(GSVector4i::load(s->UV).upl16() << (16 - 4)); - } - else - { - t = GSVector4(s->ST.S, s->ST.T) * GSVector4(0x10000 << m_context->TEX0.TW, 0x10000 << m_context->TEX0.TH); - t = t.xyxy(GSVector4::load(s->RGBAQ.Q)); - } - } - - c = GSVector4::rgba32(s->RGBAQ.u32[0], 7); - - d->p = p; - d->c = c; - d->t = t; - - #ifdef _DEBUG - d->_pad.u32[0] = 0x12345678; // means trouble if this has already been set, should only convert each vertex once #endif - - if(prim == GS_SPRITE) + + for(; i > 0; i--, src++, dst++) { - d->t.u32[3] = z; + GSVector4 stcq = GSVector4::load(&src->m[0]); // s t rgba q + + #if _M_SSE >= 0x401 + + GSVector4i xyzuvf(src->m[1]); + + GSVector4i xy = xyzuvf.upl16() - o; + GSVector4i zf = xyzuvf.ywww().min_u32(GSVector4i::xffffff00()); + + #else + + uint32 z = src->XYZ.Z; + + GSVector4i xy = GSVector4i::load((int)src->XYZ.u32[0]).upl16() - o; + GSVector4i zf = GSVector4i((int)std::min(z, 0xffffff00), src->FOG); // NOTE: larger values of z may roll over to 0 when converting back to uint32 later + + #endif + + dst->p = GSVector4(xy).xyxy(GSVector4(zf) + (GSVector4::m_x4f800000 & GSVector4::cast(zf.sra32(31)))) * g_pos_scale; + dst->c = GSVector4(GSVector4i::cast(stcq).zzzz().u8to32() << 7); + + GSVector4 t; + + if(tme) + { + if(fst) + { + #if _M_SSE >= 0x401 + + t = GSVector4(xyzuvf.uph16() << (16 - 4)); + + #else + + t = GSVector4(GSVector4i::load(src->UV).upl16() << (16 - 4)); + + #endif + } + else + { + t = stcq.xyww() * tsize; + } + } + + if(primclass == GS_SPRITE_CLASS) + { + #if _M_SSE >= 0x401 + + t = t.insert<1, 3>(GSVector4::cast(xyzuvf)); + + #else + + t = t.insert<0, 3>(GSVector4::cast(GSVector4i::load(z))); + + #endif + } + + dst->t = t; } } -#define LOG 0 - -FILE* s_fp = LOG ? fopen("c:\\temp1\\_.txt", "w") : NULL; - void GSRendererSW::Draw() { + const GSDrawingContext* context = m_context; + SharedData* sd = new SharedData(this); shared_ptr data(sd); - sd->primclass = m_vt->m_primclass; + sd->primclass = m_vt.m_primclass; sd->buff = (uint8*)_aligned_malloc(sizeof(GSVertexSW) * m_vertex.next + sizeof(uint32) * m_index.tail, 32); sd->vertex = (GSVertexSW*)sd->buff; sd->vertex_count = m_vertex.next; sd->index = (uint32*)(sd->buff + sizeof(GSVertexSW) * m_vertex.next); sd->index_count = m_index.tail; - memcpy(sd->vertex, m_vertex.buff, sizeof(GSVertexSW) * m_vertex.next); + (this->*m_cvb[m_vt.m_primclass][PRIM->TME][PRIM->FST])(sd->vertex, m_vertex.buff, m_vertex.next); + memcpy(sd->index, m_index.buff, sizeof(uint32) * m_index.tail); - for(size_t i = 0; i < m_index.tail; i++) - { - ASSERT(((GSVertexSW*)m_vertex.buff + m_index.buff[i])->_pad.u32[0] == 0x12345678); - } - - // TODO: delay texture update, do it later along with the syncing on the dispatcher thread, then this thread does not have to wait and can continue assembling more jobs - // TODO: if(any texture page is used as a target) GSRasterizerData::syncpoint = true; - // TODO: virtual void GSRasterizerData::Update() {texture[all levels]->Update();}, call it from the dispatcher thread before sending to workers - // TODO: m_tc->InvalidatePages must be called after texture->Update, move that inside GSRasterizerData::Update too - - if(!GetScanlineGlobalData(sd)) return; - - // - - const GSDrawingContext* context = m_context; - - GSScanlineGlobalData& gd = sd->global; - GSVector4i scissor = GSVector4i(context->scissor.in); - GSVector4i bbox = GSVector4i(m_vt->m_min.p.floor().xyxy(m_vt->m_max.p.ceil())); + GSVector4i bbox = GSVector4i(m_vt.m_min.p.floor().xyxy(m_vt.m_max.p.ceil())); + GSVector4i r = bbox.rintersect(scissor); scissor.z = std::min(scissor.z, (int)context->FRAME.FBW * 64); // TODO: find a game that overflows and check which one is the right behaviour @@ -244,110 +365,78 @@ void GSRendererSW::Draw() sd->bbox = bbox; sd->frame = m_perfmon.GetFrame(); + if(!GetScanlineGlobalData(sd)) return; + + if(0) if(LOG) + { + int n = GSUtil::GetVertexCount(PRIM->PRIM); + + for(int i = 0, j = 0; i < m_index.tail; i += n, j++) + { + for(int k = 0; k < n; k++) + { + GSVertex* v = &m_vertex.buff[m_index.buff[i + k]]; + GSVertex* vn = &m_vertex.buff[m_index.buff[i + n - 1]]; + + fprintf(s_fp, "%d:%d %f %f %f %f\n", + j, k, + (float)(v->XYZ.X - context->XYOFFSET.OFX) / 16, + (float)(v->XYZ.Y - context->XYOFFSET.OFY) / 16, + PRIM->FST ? (float)(v->U) / 16 : v->ST.S / (PRIM->PRIM == GS_SPRITE ? vn->RGBAQ.Q : v->RGBAQ.Q), + PRIM->FST ? (float)(v->V) / 16 : v->ST.T / (PRIM->PRIM == GS_SPRITE ? vn->RGBAQ.Q : v->RGBAQ.Q) + ); + } + } + } + // + GSScanlineGlobalData& gd = sd->global; + uint32* fb_pages = NULL; uint32* zb_pages = NULL; - GSVector4i r = bbox.rintersect(scissor); - - if(gd.sel.fwrite) + if(sd->global.sel.fb) { - fb_pages = context->offset.fb->GetPages(r); - - m_tc->InvalidatePages(fb_pages, context->offset.fb->psm); + fb_pages = m_context->offset.fb->GetPages(r); } - if(gd.sel.zwrite) + if(sd->global.sel.zb) { - zb_pages = context->offset.zb->GetPages(r); - - m_tc->InvalidatePages(zb_pages, context->offset.zb->psm); + zb_pages = m_context->offset.zb->GetPages(r); } - // set data->syncpoint + // check if there is an overlap between this and previous targets - if(m_fzb != context->offset.fzb) + if(CheckTargetPages(fb_pages, zb_pages, r)) { - // hmm, what if "r" gets bigger next time and slips through unchecked, need to trace that too - - sd->syncpoint = true; // TODO - - if(!sd->syncpoint) - { - if(fb_pages == NULL) - { - fb_pages = context->offset.fb->GetPages(r); - } - - if(CheckTargetPages<0xffffffff>(fb_pages)) - { - sd->syncpoint = true; - - if(LOG) fprintf(s_fp, "syncpoint 0\n"); - } - } - - if(!sd->syncpoint) - { - if(zb_pages == NULL) - { - zb_pages = context->offset.zb->GetPages(r); - } - - if(CheckTargetPages<0xffffffff>(zb_pages)) - { - sd->syncpoint = true; - - if(LOG) fprintf(s_fp, "syncpoint 1\n"); - } - } - - if(!sd->syncpoint) - { - if(LOG) fprintf(s_fp, "no syncpoint *\n"); - } - - m_fzb = context->offset.fzb; + sd->m_syncpoint = SharedData::SyncTarget; } - else + + // check if the texture is not part of a target currently in use + + if(CheckSourcePages(sd)) { - // chross-check frame and z-buffer pages, they cannot overlap with eachother and with previous batches in queue, - // m_fzb filters out most of these cases, only have to be careful when the addresses stay the same and the output - // is mutually enabled/disabled and alternating (Bully FBP/ZBP = 0x2300) - - if(!sd->syncpoint) - { - if(gd.sel.fwrite) - { - if(CheckTargetPages<0xffff0000>(fb_pages)) // already used as a z-buffer - { - sd->syncpoint = true; - - if(LOG) fprintf(s_fp, "syncpoint 2\n"); - } - } - } - - if(!sd->syncpoint) - { - if(gd.sel.zwrite) - { - if(CheckTargetPages<0x0000ffff>(zb_pages)) // already used as a frame buffer - { - sd->syncpoint = true; - - if(LOG) fprintf(s_fp, "syncpoint 3\n"); - } - } - } + sd->m_syncpoint = SharedData::SyncSource; } + // addref source and target pages + + sd->UsePages(fb_pages, m_context->offset.fb->psm, zb_pages, m_context->offset.zb->psm); + // - sd->UseTargetPages(fb_pages, zb_pages); + if(LOG) + { + fprintf(s_fp, "[%d] queue %05x %d (%d) %05x %d (%d) %05x %d %dx%d (%d %d %d) | %d %d %d\n", + sd->counter, + m_context->FRAME.Block(), m_context->FRAME.PSM, gd.sel.fwrite, + m_context->ZBUF.Block(), m_context->ZBUF.PSM, gd.sel.zwrite, + PRIM->TME ? m_context->TEX0.TBP0 : 0xfffff, m_context->TEX0.PSM, (int)m_context->TEX0.TW, (int)m_context->TEX0.TH, m_context->TEX0.CSM, m_context->TEX0.CPSM, m_context->TEX0.CSA, + PRIM->PRIM, sd->vertex_count, sd->index_count); - // + fflush(s_fp); + } if(s_dump) { @@ -382,7 +471,7 @@ void GSRendererSW::Draw() s_n++; - m_rl->Queue(data); + Queue(data); Sync(4); @@ -404,13 +493,7 @@ void GSRendererSW::Draw() } else { - if(LOG) fprintf(s_fp, "queue %05x %d %05x %d %05x %d %dx%d | %d %d %d\n", - m_context->FRAME.Block(), m_context->FRAME.PSM, - m_context->ZBUF.Block(), m_context->ZBUF.PSM, - PRIM->TME ? m_context->TEX0.TBP0 : 0xfffff, m_context->TEX0.PSM, (int)m_context->TEX0.TW, (int)m_context->TEX0.TH, - PRIM->PRIM, sd->vertex_count, sd->index_count); - - m_rl->Queue(data); + Queue(data); } /* @@ -425,6 +508,39 @@ void GSRendererSW::Draw() */ } +void GSRendererSW::Queue(shared_ptr& item) +{ + SharedData* sd = (SharedData*)item.get(); + + if(sd->m_syncpoint == SharedData::SyncSource) + { + m_rl->Sync(); + } + + // update previously invalidated parts + + sd->UpdateSource(); + + // invalidate new parts rendered onto + + if(sd->global.sel.fwrite) + { + m_tc->InvalidatePages(sd->m_fb_pages, sd->m_fpsm); + } + + if(sd->global.sel.zwrite) + { + m_tc->InvalidatePages(sd->m_zb_pages, sd->m_zpsm); + } + + if(sd->m_syncpoint == SharedData::SyncTarget) + { + m_rl->Sync(); + } + + m_rl->Queue(item); +} + void GSRendererSW::Sync(int reason) { //printf("sync %d\n", reason); @@ -435,36 +551,56 @@ void GSRendererSW::Sync(int reason) m_rl->Sync(); - s_n++; + if(0) if(LOG) + { + s_n++; + + std::string s; + + if(s_save) + { + s = format("c:\\temp1\\_%05d_f%lld_rt1_%05x_%d.bmp", s_n, m_perfmon.GetFrame(), m_context->FRAME.Block(), m_context->FRAME.PSM); + + m_mem.SaveBMP(s, m_context->FRAME.Block(), m_context->FRAME.FBW, m_context->FRAME.PSM, GetFrameRect().width(), 512); + } + + if(s_savez) + { + s = format("c:\\temp1\\_%05d_f%lld_zb1_%05x_%d.bmp", s_n, m_perfmon.GetFrame(), m_context->ZBUF.Block(), m_context->ZBUF.PSM); + + m_mem.SaveBMP(s, m_context->ZBUF.Block(), m_context->FRAME.FBW, m_context->ZBUF.PSM, GetFrameRect().width(), 512); + } + } t = __rdtsc() - t; - if(LOG) fprintf(s_fp, "sync n=%d r=%d t=%lld p=%d %c\n", s_n, reason, t, m_rl->GetPixels(), t > 10000000 ? '*' : ' '); + int pixels = m_rl->GetPixels(); - m_perfmon.Put(GSPerfMon::Fillrate, m_rl->GetPixels()); + if(LOG) {fprintf(s_fp, "sync n=%d r=%d t=%lld p=%d %c\n", s_n, reason, t, pixels, t > 10000000 ? '*' : ' '); fflush(s_fp);} + + m_perfmon.Put(GSPerfMon::Fillrate, pixels); } void GSRendererSW::InvalidateVideoMem(const GIFRegBITBLTBUF& BITBLTBUF, const GSVector4i& r) { + if(LOG) {fprintf(s_fp, "w %05x %d %d, %d %d %d %d\n", BITBLTBUF.DBP, BITBLTBUF.DBW, BITBLTBUF.DPSM, r.x, r.y, r.z, r.w); fflush(s_fp);} + GSOffset* o = m_mem.GetOffset(BITBLTBUF.DBP, BITBLTBUF.DBW, BITBLTBUF.DPSM); - uint32* RESTRICT p = m_tmp_pages; - - o->GetPages(r, p); + o->GetPages(r, m_tmp_pages); // check if the changing pages either used as a texture or a target - for(; *p != GSOffset::EOP; p++) + if(!m_rl->IsSynced()) { - uint32 page = *p; - - //while(m_fzb_pages[page] | m_tex_pages[page]) _mm_pause(); - - if(m_fzb_pages[page] | m_tex_pages[page]) + for(uint32* RESTRICT p = m_tmp_pages; *p != GSOffset::EOP; p++) { - Sync(5); + if(m_fzb_pages[*p] | m_tex_pages[*p]) + { + Sync(5); - break; + break; + } } } @@ -473,21 +609,22 @@ void GSRendererSW::InvalidateVideoMem(const GIFRegBITBLTBUF& BITBLTBUF, const GS void GSRendererSW::InvalidateLocalMem(const GIFRegBITBLTBUF& BITBLTBUF, const GSVector4i& r, bool clut) { - GSOffset* o = m_mem.GetOffset(BITBLTBUF.SBP, BITBLTBUF.SBW, BITBLTBUF.SPSM); + if(LOG) {fprintf(s_fp, "%s %05x %d %d, %d %d %d %d\n", clut ? "rp" : "r", BITBLTBUF.SBP, BITBLTBUF.SBW, BITBLTBUF.SPSM, r.x, r.y, r.z, r.w); fflush(s_fp);} - uint32* RESTRICT p = m_tmp_pages; - - o->GetPages(r, p); - - for(; *p != GSOffset::EOP; p++) + if(!m_rl->IsSynced()) { - //while(m_fzb_pages[*p]) _mm_pause(); + GSOffset* o = m_mem.GetOffset(BITBLTBUF.SBP, BITBLTBUF.SBW, BITBLTBUF.SPSM); - if(m_fzb_pages[*p]) + o->GetPages(r, m_tmp_pages); + + for(uint32* RESTRICT p = m_tmp_pages; *p != GSOffset::EOP; p++) { - Sync(6); + if(m_fzb_pages[*p]) + { + Sync(7); - break; + break; + } } } } @@ -505,18 +642,6 @@ void GSRendererSW::UsePages(const uint32* pages, int type) } else { - for(const uint32* p = pages; *p != GSOffset::EOP; p++) - { - //while(m_fzb_pages[*p]) _mm_pause(); - - if(m_fzb_pages[*p]) // currently being drawn to? => sync (could even spin and wait until it hits 0, not sure if it's worth though, or just create 512 condvars? :D) - { - Sync(7); - - break; - } - } - for(const uint32* p = pages; *p != GSOffset::EOP; p++) { ASSERT(m_tex_pages[*p] < SHRT_MAX); @@ -548,13 +673,178 @@ void GSRendererSW::ReleasePages(const uint32* pages, int type) } } -template bool GSRendererSW::CheckTargetPages(const uint32* pages) +bool GSRendererSW::CheckTargetPages(const uint32* fb_pages, const uint32* zb_pages, const GSVector4i& r) { - for(const uint32* p = pages; *p != GSOffset::EOP; p++) + bool synced = m_rl->IsSynced(); + + bool fb = fb_pages != NULL; + bool zb = zb_pages != NULL; + + if(m_fzb != m_context->offset.fzb4) { - if(mask != 0xffffffff ? (m_fzb_pages[*p] & mask) : m_fzb_pages[*p]) + // targets changed, check everything + + m_fzb = m_context->offset.fzb4; + m_fzb_bbox = r; + + if(fb_pages == NULL) fb_pages = m_context->offset.fb->GetPages(r); + if(zb_pages == NULL) zb_pages = m_context->offset.zb->GetPages(r); + + memset(m_fzb_cur_pages, 0, sizeof(m_fzb_cur_pages)); + + uint32 used = 0; + + for(const uint32* p = fb_pages; *p != GSOffset::EOP; p++) { - return true; + uint32 i = *p; + + uint32 row = i >> 5; + uint32 col = 1 << (i & 31); + + m_fzb_cur_pages[row] |= col; + + used |= m_fzb_pages[i]; + } + + for(const uint32* p = zb_pages; *p != GSOffset::EOP; p++) + { + uint32 i = *p; + + uint32 row = i >> 5; + uint32 col = 1 << (i & 31); + + m_fzb_cur_pages[row] |= col; + + used |= m_fzb_pages[i]; + } + + if(!synced) + { + if(used) + { + if(LOG) {fprintf(s_fp, "syncpoint 0\n"); fflush(s_fp);} + + return true; + } + + //if(LOG) {fprintf(s_fp, "no syncpoint *\n"); fflush(s_fp);} + } + } + else + { + // same target, only check new areas and cross-rendering between frame and z-buffer + + GSVector4i bbox = m_fzb_bbox.runion(r); + + bool check = !m_fzb_bbox.eq(bbox); + + m_fzb_bbox = bbox; + + if(check) + { + // drawing area is larger than previous time, check new parts only to avoid false positives (m_fzb_cur_pages guards) + + if(fb_pages == NULL) fb_pages = m_context->offset.fb->GetPages(r); + if(zb_pages == NULL) zb_pages = m_context->offset.zb->GetPages(r); + + uint32 used = 0; + + for(const uint32* p = fb_pages; *p != GSOffset::EOP; p++) + { + uint32 i = *p; + + uint32 row = i >> 5; + uint32 col = 1 << (i & 31); + + if((m_fzb_cur_pages[row] & col) == 0) + { + m_fzb_cur_pages[row] |= col; + + used |= m_fzb_pages[i]; + } + } + + for(const uint32* p = zb_pages; *p != GSOffset::EOP; p++) + { + uint32 i = *p; + + uint32 row = i >> 5; + uint32 col = 1 << (i & 31); + + if((m_fzb_cur_pages[row] & col) == 0) + { + m_fzb_cur_pages[row] |= col; + + used |= m_fzb_pages[i]; + } + } + + if(!synced) + { + if(used) + { + if(LOG) {fprintf(s_fp, "syncpoint 1\n"); fflush(s_fp);} + + return true; + } + } + } + + if(!synced) + { + // chross-check frame and z-buffer pages, they cannot overlap with eachother and with previous batches in queue, + // have to be careful when the two buffers are mutually enabled/disabled and alternating (Bully FBP/ZBP = 0x2300) + + if(fb) + { + for(const uint32* p = fb_pages; *p != GSOffset::EOP; p++) + { + if(m_fzb_pages[*p] & 0xffff0000) + { + if(LOG) {fprintf(s_fp, "syncpoint 2\n"); fflush(s_fp);} + + return true; + } + } + } + + if(zb) + { + for(const uint32* p = zb_pages; *p != GSOffset::EOP; p++) + { + if(m_fzb_pages[*p] & 0x0000ffff) + { + if(LOG) {fprintf(s_fp, "syncpoint 3\n"); fflush(s_fp);} + + return true; + } + } + } + } + } + + return false; +} + +bool GSRendererSW::CheckSourcePages(SharedData* sd) +{ + if(!m_rl->IsSynced()) + { + for(size_t i = 0; sd->m_tex[i].t != NULL; i++) + { + sd->m_tex[i].t->m_offset->GetPages(sd->m_tex[i].r, m_tmp_pages); + + uint32* pages = m_tmp_pages; // sd->m_tex[i].t->m_pages.n; + + for(const uint32* p = pages; *p != GSOffset::EOP; p++) + { + // TODO: 8H 4HL 4HH texture at the same place as the render target (24 bit, or 32-bit where the alpha channel is masked, Valkyrie Profile 2) + + if(m_fzb_pages[*p]) // currently being drawn to? => sync + { + return true; + } + } } } @@ -569,7 +859,7 @@ bool GSRendererSW::GetScanlineGlobalData(SharedData* data) const GSDrawingEnvironment& env = m_env; const GSDrawingContext* context = m_context; - const GS_PRIM_CLASS primclass = m_vt->m_primclass; + const GS_PRIM_CLASS primclass = m_vt.m_primclass; gd.vm = m_mem.m_vm8; @@ -577,8 +867,8 @@ bool GSRendererSW::GetScanlineGlobalData(SharedData* data) gd.zbr = context->offset.zb->pixel.row; gd.fbc = context->offset.fb->pixel.col[0]; gd.zbc = context->offset.zb->pixel.col[0]; - gd.fzbr = context->offset.fzb->row; - gd.fzbc = context->offset.fzb->col; + gd.fzbr = context->offset.fzb4->row; + gd.fzbc = context->offset.fzb4->col; gd.sel.key = 0; @@ -600,7 +890,10 @@ bool GSRendererSW::GetScanlineGlobalData(SharedData* data) if(PRIM->TME) { - m_mem.m_clut.Read32(context->TEX0, env.TEXA); + if(GSLocalMemory::m_psm[context->TEX0.PSM].pal > 0) + { + m_mem.m_clut.Read32(context->TEX0, env.TEXA); + } } if(context->TEST.ATE) @@ -646,7 +939,7 @@ bool GSRendererSW::GetScanlineGlobalData(SharedData* data) { gd.sel.fpsm = GSLocalMemory::m_psm[context->FRAME.PSM].fmt; - if((primclass == GS_LINE_CLASS || primclass == GS_TRIANGLE_CLASS) && m_vt->m_eq.rgba != 0xffff) + if((primclass == GS_LINE_CLASS || primclass == GS_TRIANGLE_CLASS) && m_vt.m_eq.rgba != 0xffff) { gd.sel.iip = PRIM->IIP; } @@ -656,7 +949,7 @@ bool GSRendererSW::GetScanlineGlobalData(SharedData* data) gd.sel.tfx = context->TEX0.TFX; gd.sel.tcc = context->TEX0.TCC; gd.sel.fst = PRIM->FST; - gd.sel.ltf = m_vt->IsLinear(); + gd.sel.ltf = m_vt.IsLinear(); if(GSLocalMemory::m_psm[context->TEX0.PSM].pal > 0) { @@ -670,7 +963,7 @@ bool GSRendererSW::GetScanlineGlobalData(SharedData* data) gd.sel.wms = context->CLAMP.WMS; gd.sel.wmt = context->CLAMP.WMT; - if(gd.sel.tfx == TFX_MODULATE && gd.sel.tcc && m_vt->m_eq.rgba == 0xffff && m_vt->m_min.c.eq(GSVector4i(128))) + if(gd.sel.tfx == TFX_MODULATE && gd.sel.tcc && m_vt.m_eq.rgba == 0xffff && m_vt.m_min.c.eq(GSVector4i(128))) { // modulate does not do anything when vertex color is 0x80 @@ -681,32 +974,15 @@ bool GSRendererSW::GetScanlineGlobalData(SharedData* data) if(t == NULL) {ASSERT(0); return false;} - data->UseSourcePages(t, 0); - GSVector4i r; GetTextureMinMax(r, context->TEX0, context->CLAMP, gd.sel.ltf); - if(!t->Update(r)) {ASSERT(0); return false;} + data->SetSource(t, r, 0); - if(s_dump)// && m_context->TEX1.MXL > 0 && m_context->TEX1.MMIN >= 2 && m_context->TEX1.MMIN <= 5 && m_vt->m_lod.x > 0) - { - uint64 frame = m_perfmon.GetFrame(); - - string s; - - if(s_save && s_n >= s_saven) - { - s = format("c:\\temp1\\_%05d_f%lld_tex32_%05x_%d.bmp", s_n, frame, (int)m_context->TEX0.TBP0, (int)m_context->TEX0.PSM); - - t->Save(s); - } - } - - gd.tex[0] = t->m_buff; gd.sel.tw = t->m_tw - 3; - if(m_mipmap && context->TEX1.MXL > 0 && context->TEX1.MMIN >= 2 && context->TEX1.MMIN <= 5 && m_vt->m_lod.y > 0) + if(m_mipmap && context->TEX1.MXL > 0 && context->TEX1.MMIN >= 2 && context->TEX1.MMIN <= 5 && m_vt.m_lod.y > 0) { // TEX1.MMIN // 000 p @@ -716,13 +992,13 @@ bool GSRendererSW::GetScanlineGlobalData(SharedData* data) // 100 l round // 101 l tri - if(m_vt->m_lod.x > 0) + if(m_vt.m_lod.x > 0) { gd.sel.ltf = context->TEX1.MMIN >> 2; } else { - // TODO: isbilinear(mmag) != isbilinear(mmin) && m_vt->m_lod.x <= 0 && m_vt->m_lod.y > 0 + // TODO: isbilinear(mmag) != isbilinear(mmin) && m_vt.m_lod.x <= 0 && m_vt.m_lod.y > 0 } gd.sel.mmin = (context->TEX1.MMIN & 1) + 1; // 1: round, 2: tri @@ -731,9 +1007,9 @@ bool GSRendererSW::GetScanlineGlobalData(SharedData* data) int mxl = (std::min((int)context->TEX1.MXL, 6) << 16); int k = context->TEX1.K << 12; - if((int)m_vt->m_lod.x >= (int)context->TEX1.MXL) + if((int)m_vt.m_lod.x >= (int)context->TEX1.MXL) { - k = (int)m_vt->m_lod.x << 16; // set lod to max level + k = (int)m_vt.m_lod.x << 16; // set lod to max level gd.sel.lcm = 1; // lod is constant gd.sel.mmin = 1; // tri-linear is meaningless @@ -747,7 +1023,7 @@ bool GSRendererSW::GetScanlineGlobalData(SharedData* data) if(gd.sel.fst) { ASSERT(gd.sel.lcm == 1); - ASSERT(((m_vt->m_min.t.uph(m_vt->m_max.t) == GSVector4::zero()).mask() & 3) == 3); // ratchet and clank (menu) + ASSERT(((m_vt.m_min.t.uph(m_vt.m_max.t) == GSVector4::zero()).mask() & 3) == 3); // ratchet and clank (menu) gd.sel.lcm = 1; } @@ -776,8 +1052,8 @@ bool GSRendererSW::GetScanlineGlobalData(SharedData* data) GIFRegTEX0 MIP_TEX0 = context->TEX0; GIFRegCLAMP MIP_CLAMP = context->CLAMP; - GSVector4 tmin = m_vt->m_min.t; - GSVector4 tmax = m_vt->m_max.t; + GSVector4 tmin = m_vt.m_min.t; + GSVector4 tmax = m_vt.m_max.t; static int s_counter = 0; @@ -827,51 +1103,24 @@ bool GSRendererSW::GetScanlineGlobalData(SharedData* data) MIP_CLAMP.MAXU >>= 1; MIP_CLAMP.MAXV >>= 1; - m_vt->m_min.t *= 0.5f; - m_vt->m_max.t *= 0.5f; + m_vt.m_min.t *= 0.5f; + m_vt.m_max.t *= 0.5f; GSTextureCacheSW::Texture* t = m_tc->Lookup(MIP_TEX0, env.TEXA, gd.sel.tw + 3); if(t == NULL) {ASSERT(0); return false;} - data->UseSourcePages(t, i); - GSVector4i r; GetTextureMinMax(r, MIP_TEX0, MIP_CLAMP, gd.sel.ltf); - if(!t->Update(r)) {ASSERT(0); return false;} - - gd.tex[i] = t->m_buff; - - if(0) - //if(context->TEX0.TH > context->TEX0.TW) - //if(s_n >= s_saven && s_n < s_saven + 3) - //if(context->TEX0.TBP0 >= 0x2b80 && context->TEX0.TBW == 2 && context->TEX0.PSM == PSM_PSMT4) - { - t->Save(format("c:/temp1/%08d_%05x_%d.bmp", s_counter, context->TEX0.TBP0, i)); - /* - GIFRegTEX0 TEX0 = MIP_TEX0; - TEX0.TBP0 = context->TEX0.TBP0; - do - { - TEX0.TBP0++; - const GSTextureCacheSW::Texture* t = m_tc->Lookup(TEX0, env.TEXA, r, gd.sel.tw + 3); - if(t == NULL) {ASSERT(0); return false;} - t->Save(format("c:/temp1/%08d_%05x_%d.bmp", s_counter, TEX0.TBP0, i)); - } - while(TEX0.TBP0 < 0x3fff); - */ - - int i = 0; - } - + data->SetSource(t, r, i); } s_counter++; - m_vt->m_min.t = tmin; - m_vt->m_max.t = tmax; + m_vt.m_min.t = tmin; + m_vt.m_max.t = tmax; } else { @@ -881,7 +1130,7 @@ bool GSRendererSW::GetScanlineGlobalData(SharedData* data) GSVertexSW* RESTRICT v = data->vertex; - if(m_vt->m_eq.q) + if(m_vt.m_eq.q) { gd.sel.fst = 1; @@ -956,8 +1205,8 @@ bool GSRendererSW::GetScanlineGlobalData(SharedData* data) gd.t.mask.u32[0] = 0; break; case CLAMP_REGION_REPEAT: - gd.t.min.u16[0] = gd.t.minmax.u16[0] = context->CLAMP.MINU; - gd.t.max.u16[0] = gd.t.minmax.u16[2] = context->CLAMP.MAXU; + gd.t.min.u16[0] = gd.t.minmax.u16[0] = context->CLAMP.MINU & (tw - 1); + gd.t.max.u16[0] = gd.t.minmax.u16[2] = context->CLAMP.MAXU & (tw - 1); gd.t.mask.u32[0] = 0xffffffff; break; default: @@ -982,8 +1231,8 @@ bool GSRendererSW::GetScanlineGlobalData(SharedData* data) gd.t.mask.u32[2] = 0; break; case CLAMP_REGION_REPEAT: - gd.t.min.u16[4] = gd.t.minmax.u16[1] = context->CLAMP.MINV; - gd.t.max.u16[4] = gd.t.minmax.u16[3] = context->CLAMP.MAXV; + gd.t.min.u16[4] = gd.t.minmax.u16[1] = context->CLAMP.MINV & (th - 1); // skygunner main menu water texture 64x64, MINV = 127 + gd.t.max.u16[4] = gd.t.minmax.u16[3] = context->CLAMP.MAXV & (th - 1); gd.t.mask.u32[2] = 0xffffffff; break; default: @@ -1058,7 +1307,7 @@ bool GSRendererSW::GetScanlineGlobalData(SharedData* data) { gd.sel.zpsm = GSLocalMemory::m_psm[context->ZBUF.PSM].fmt; gd.sel.ztst = ztest ? context->TEST.ZTST : ZTST_ALWAYS; - gd.sel.zoverflow = GSVector4i(m_vt->m_max.p).z == 0x80000000; + gd.sel.zoverflow = GSVector4i(m_vt.m_max.p).z == 0x80000000; } gd.fm = GSVector4i(fm); @@ -1085,6 +1334,23 @@ bool GSRendererSW::GetScanlineGlobalData(SharedData* data) gd.zm |= GSVector4i::xffff0000(); } + if(gd.sel.prim == GS_SPRITE_CLASS && !gd.sel.ftest && !gd.sel.ztest && data->bbox.eq(data->bbox.rintersect(data->scissor))) + { + gd.sel.notest = 1; + + uint32 ofx = context->XYOFFSET.OFX; + + for(int i = 0, j = m_vertex.tail; i < j; i++) + { + if((((m_vertex.buff[i].XYZ.X - ofx) + 15) >> 4) & 3) // aligned to 4 + { + gd.sel.notest = 0; + + break; + } + } + } + return true; } @@ -1093,8 +1359,9 @@ GSRendererSW::SharedData::SharedData(GSRendererSW* parent) , m_fb_pages(NULL) , m_zb_pages(NULL) , m_using_pages(false) + , m_syncpoint(SyncNone) { - m_tex_pages[0] = NULL; + m_tex[0].t = NULL; global.sel.key = 0; @@ -1104,57 +1371,114 @@ GSRendererSW::SharedData::SharedData(GSRendererSW* parent) GSRendererSW::SharedData::~SharedData() { - if(m_using_pages) - { - if(global.sel.fwrite) - { - m_parent->ReleasePages(m_fb_pages, 0); - } + ReleasePages(); - if(global.sel.zwrite) - { - m_parent->ReleasePages(m_zb_pages, 1); - } - } - - delete m_fb_pages; - delete m_zb_pages; - - for(size_t i = 0; i < countof(m_tex_pages) && m_tex_pages[i] != NULL; i++) - { - m_parent->ReleasePages(m_tex_pages[i], 2); - } - if(global.clut) _aligned_free(global.clut); if(global.dimx) _aligned_free(global.dimx); + + if(LOG) {fprintf(s_fp, "[%d] done t=%lld p=%d | %d %d %d | %08x_%08x\n", + counter, + __rdtsc() - start, pixels, + primclass, vertex_count, index_count, + global.sel.hi, global.sel.lo + ); + fflush(s_fp);} } -void GSRendererSW::SharedData::UseTargetPages(const uint32* fb_pages, const uint32* zb_pages) +void GSRendererSW::SharedData::UsePages(const uint32* fb_pages, int fpsm, const uint32* zb_pages, int zpsm) { if(m_using_pages) return; - m_fb_pages = fb_pages; - m_zb_pages = zb_pages; - - if(global.sel.fwrite) + if(global.sel.fb) { m_parent->UsePages(fb_pages, 0); } - if(global.sel.zwrite) + if(global.sel.zb) { m_parent->UsePages(zb_pages, 1); } + for(size_t i = 0; m_tex[i].t != NULL; i++) + { + m_parent->UsePages(m_tex[i].t->m_pages.n, 2); + } + + m_fb_pages = fb_pages; + m_zb_pages = zb_pages; + m_fpsm = fpsm; + m_zpsm = zpsm; + m_using_pages = true; } -void GSRendererSW::SharedData::UseSourcePages(GSTextureCacheSW::Texture* t, int level) +void GSRendererSW::SharedData::ReleasePages() { - ASSERT(m_tex_pages[level] == NULL); + if(!m_using_pages) return; - m_tex_pages[level] = t->m_pages.n; - m_tex_pages[level + 1] = NULL; + if(global.sel.fb) + { + m_parent->ReleasePages(m_fb_pages, 0); + } - m_parent->UsePages(t->m_pages.n, 2); + if(global.sel.zb) + { + m_parent->ReleasePages(m_zb_pages, 1); + } + + for(size_t i = 0; m_tex[i].t != NULL; i++) + { + m_parent->ReleasePages(m_tex[i].t->m_pages.n, 2); + } + + delete [] m_fb_pages; + delete [] m_zb_pages; + + m_fb_pages = NULL; + m_zb_pages = NULL; + + m_using_pages = false; +} + +void GSRendererSW::SharedData::SetSource(GSTextureCacheSW::Texture* t, const GSVector4i& r, int level) +{ + ASSERT(m_tex[level].t == NULL); + + m_tex[level].t = t; + m_tex[level].r = r; + + m_tex[level + 1].t = NULL; +} + +void GSRendererSW::SharedData::UpdateSource() +{ + for(size_t i = 0; m_tex[i].t != NULL; i++) + { + if(m_tex[i].t->Update(m_tex[i].r)) + { + global.tex[i] = m_tex[i].t->m_buff; + } + else + { + printf("GSdx: out-of-memory, texturing temporarily disabled\n"); + + global.sel.tfx = TFX_NONE; + } + + // TODO + + if(m_parent->s_dump) + { + uint64 frame = m_parent->m_perfmon.GetFrame(); + + string s; + + if(m_parent->s_save && m_parent->s_n >= m_parent->s_saven) + { + s = format("c:\\temp1\\_%05d_f%lld_tex%d_%05x_%d.bmp", m_parent->s_n - 2, frame, i, (int)m_parent->m_context->TEX0.TBP0, (int)m_parent->m_context->TEX0.PSM); + + m_tex[i].t->Save(s); + } + } + } } diff --git a/plugins/GSdx/GSRendererSW.h b/plugins/GSdx/GSRendererSW.h index ee68de3611..4ad138e4c9 100644 --- a/plugins/GSdx/GSRendererSW.h +++ b/plugins/GSdx/GSRendererSW.h @@ -29,27 +29,48 @@ class GSRendererSW : public GSRenderer { class SharedData : public GSDrawScanline::SharedData { + __aligned(struct, 16) TextureLevel + { + GSVector4i r; + GSTextureCacheSW::Texture* t; + }; + + public: GSRendererSW* m_parent; const uint32* m_fb_pages; const uint32* m_zb_pages; - const uint32* m_tex_pages[7 + 1]; // NULL terminated + int m_fpsm; + int m_zpsm; bool m_using_pages; + TextureLevel m_tex[7 + 1]; // NULL terminated + enum {SyncNone, SyncSource, SyncTarget} m_syncpoint; public: SharedData(GSRendererSW* parent); virtual ~SharedData(); - void UseTargetPages(const uint32* fb_pages, const uint32* zb_pages); - void UseSourcePages(GSTextureCacheSW::Texture* t, int level); + void UsePages(const uint32* fb_pages, int fpsm, const uint32* zb_pages, int zpsm); + void ReleasePages(); + + void SetSource(GSTextureCacheSW::Texture* t, const GSVector4i& r, int level); + void UpdateSource(); }; + typedef void (GSRendererSW::*ConvertVertexBufferPtr)(GSVertexSW* RESTRICT dst, const GSVertex* RESTRICT src, size_t count); + + ConvertVertexBufferPtr m_cvb[4][2][2]; + + template + void ConvertVertexBuffer(GSVertexSW* RESTRICT dst, const GSVertex* RESTRICT src, size_t count); + protected: IRasterizer* m_rl; GSTextureCacheSW* m_tc; GSTexture* m_texture[2]; uint8* m_output; - bool m_reset; GSPixelOffset4* m_fzb; + GSVector4i m_fzb_bbox; + uint32 m_fzb_cur_pages[16]; uint32 m_fzb_pages[512]; // uint16 frame/zbuf pages interleaved uint16 m_tex_pages[512]; uint32 m_tmp_pages[512 + 1]; @@ -60,19 +81,19 @@ protected: GSTexture* GetOutput(int i); void Draw(); + void Queue(shared_ptr& item); void Sync(int reason); void InvalidateVideoMem(const GIFRegBITBLTBUF& BITBLTBUF, const GSVector4i& r); void InvalidateLocalMem(const GIFRegBITBLTBUF& BITBLTBUF, const GSVector4i& r, bool clut = false); void UsePages(const uint32* pages, int type); void ReleasePages(const uint32* pages, int type); - template bool CheckTargetPages(const uint32* pages); + + bool CheckTargetPages(const uint32* fb_pages, const uint32* zb_pages, const GSVector4i& r); + bool CheckSourcePages(SharedData* sd); bool GetScanlineGlobalData(SharedData* data); - template - void ConvertVertex(size_t dst_index, size_t src_index); - public: GSRendererSW(int threads); virtual ~GSRendererSW(); diff --git a/plugins/GSdx/GSScanlineEnvironment.h b/plugins/GSdx/GSScanlineEnvironment.h index f6ecaced11..a8d9637c40 100644 --- a/plugins/GSdx/GSScanlineEnvironment.h +++ b/plugins/GSdx/GSScanlineEnvironment.h @@ -24,6 +24,8 @@ #include "GSLocalMemory.h" #include "GSVector.h" +#define GS_BILINEAR_PRECISION 4 // max precision 15, but several games like okami, rogue galaxy, dq8 break above 4 + union GSScanlineSelector { struct @@ -65,8 +67,9 @@ union GSScanlineSelector uint32 edge:1; // 48 uint32 tw:3; // 49 (encodes values between 3 -> 10, texture cache makes sure it is at least 3) - uint32 lcm:1; // 50 - uint32 mmin:2; // 51 + uint32 lcm:1; // 52 + uint32 mmin:2; // 53 + uint32 notest:1; // 54 (no ztest, no atest, no date, no scissor test, and horizontally aligned to 4 pixels) }; struct diff --git a/plugins/GSdx/GSSettingsDlg.cpp b/plugins/GSdx/GSSettingsDlg.cpp index 23cb97e7fd..905b8b5f11 100644 --- a/plugins/GSdx/GSSettingsDlg.cpp +++ b/plugins/GSdx/GSSettingsDlg.cpp @@ -315,7 +315,7 @@ void GSSettingsDlg::UpdateControls() EnableWindow(GetDlgItem(m_hWnd, IDC_NATIVERES), hw); EnableWindow(GetDlgItem(m_hWnd, IDC_FILTER), hw); EnableWindow(GetDlgItem(m_hWnd, IDC_PALTEX), hw); - EnableWindow(GetDlgItem(m_hWnd, IDC_LOGZ), dx9 && hw && GSDevice9::GetMaxDepth(m_lastValidMsaa) < 32); + EnableWindow(GetDlgItem(m_hWnd, IDC_LOGZ), dx9 && hw); EnableWindow(GetDlgItem(m_hWnd, IDC_FBA), dx9 && hw); //EnableWindow(GetDlgItem(m_hWnd, IDC_AA1), sw); // Let uers set software params regardless of renderer used //EnableWindow(GetDlgItem(m_hWnd, IDC_SWTHREADS_EDIT), sw); diff --git a/plugins/GSdx/GSSetupPrimCodeGenerator.x86.avx.cpp b/plugins/GSdx/GSSetupPrimCodeGenerator.x86.avx.cpp index b473094d3b..86df61d412 100644 --- a/plugins/GSdx/GSSetupPrimCodeGenerator.x86.avx.cpp +++ b/plugins/GSdx/GSSetupPrimCodeGenerator.x86.avx.cpp @@ -38,7 +38,7 @@ void GSSetupPrimCodeGenerator::Generate() { mov(edx, dword[esp + _dscan]); - for(int i = 0; i < 5; i++) + for(int i = 0; i < (m_sel.notest ? 2 : 5); i++) { vmovaps(Xmm(3 + i), ptr[&m_shift[i]]); } @@ -80,7 +80,7 @@ void GSSetupPrimCodeGenerator::Depth() vpshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0)); vmovdqa(ptr[&m_local.d4.f], xmm2); - for(int i = 0; i < 4; i++) + for(int i = 0; i < (m_sel.notest ? 1 : 4); i++) { // m_local.d[i].f = GSVector4i(df * m_shift[i]).xxzzlh(); @@ -103,7 +103,7 @@ void GSSetupPrimCodeGenerator::Depth() vmulps(xmm1, xmm0, xmm3); vmovdqa(ptr[&m_local.d4.z], xmm1); - for(int i = 0; i < 4; i++) + for(int i = 0; i < (m_sel.notest ? 1 : 4); i++) { // m_local.d[i].z = dz * m_shift[i]; @@ -139,36 +139,6 @@ void GSSetupPrimCodeGenerator::Depth() vmovdqa(xmm0, ptr[ecx + offsetof(GSVertexSW, t)]); vpshufd(xmm0, xmm0, _MM_SHUFFLE(3, 3, 3, 3)); - -/* - // GSVector4 z = p.zzzz(); - - vshufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2)); - - if(m_sel.zoverflow) - { - // m_local.p.z = (GSVector4i(z * 0.5f) << 1) | (GSVector4i(z) & GSVector4i::x00000001()); - - vbroadcastss(xmm1, ptr[&GSVector4::m_half]); - vmulps(xmm1, xmm0); - vcvttps2dq(xmm1, xmm1); - vpslld(xmm1, 1); - - vcvttps2dq(xmm0, xmm0); - vpcmpeqd(xmm2, xmm2); - vpsrld(xmm2, 31); - vpand(xmm0, xmm2); - - vpor(xmm0, xmm1); - } - else - { - // m_local.p.z = GSVector4i(z); - - vcvttps2dq(xmm0, xmm0); - } -*/ - vmovdqa(ptr[&m_local.p.z], xmm0); } } @@ -210,7 +180,7 @@ void GSSetupPrimCodeGenerator::Texture() vshufps(xmm1, xmm0, xmm0, (uint8)_MM_SHUFFLE(j, j, j, j)); - for(int i = 0; i < 4; i++) + for(int i = 0; i < (m_sel.notest ? 1 : 4); i++) { // GSVector4 v = ds/dt * m_shift[i]; @@ -272,7 +242,7 @@ void GSSetupPrimCodeGenerator::Color() vshufps(xmm2, xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0)); vshufps(xmm3, xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2)); - for(int i = 0; i < 4; i++) + for(int i = 0; i < (m_sel.notest ? 1 : 4); i++) { // GSVector4i r = GSVector4i(dr * m_shift[i]).ps32(); @@ -302,7 +272,7 @@ void GSSetupPrimCodeGenerator::Color() vshufps(xmm2, xmm0, xmm0, _MM_SHUFFLE(1, 1, 1, 1)); vshufps(xmm3, xmm0, xmm0, _MM_SHUFFLE(3, 3, 3, 3)); - for(int i = 0; i < 4; i++) + for(int i = 0; i < (m_sel.notest ? 1 : 4); i++) { // GSVector4i g = GSVector4i(dg * m_shift[i]).ps32(); diff --git a/plugins/GSdx/GSSetupPrimCodeGenerator.x86.cpp b/plugins/GSdx/GSSetupPrimCodeGenerator.x86.cpp index 01d79b21fc..070ccbf109 100644 --- a/plugins/GSdx/GSSetupPrimCodeGenerator.x86.cpp +++ b/plugins/GSdx/GSSetupPrimCodeGenerator.x86.cpp @@ -38,7 +38,7 @@ void GSSetupPrimCodeGenerator::Generate() { mov(edx, dword[esp + _dscan]); - for(int i = 0; i < 5; i++) + for(int i = 0; i < (m_sel.notest ? 2 : 5); i++) { movaps(Xmm(3 + i), ptr[&m_shift[i]]); } @@ -82,7 +82,7 @@ void GSSetupPrimCodeGenerator::Depth() pshufhw(xmm2, xmm2, _MM_SHUFFLE(2, 2, 0, 0)); movdqa(ptr[&m_local.d4.f], xmm2); - for(int i = 0; i < 4; i++) + for(int i = 0; i < (m_sel.notest ? 1 : 4); i++) { // m_local.d[i].f = GSVector4i(df * m_shift[i]).xxzzlh(); @@ -107,7 +107,7 @@ void GSSetupPrimCodeGenerator::Depth() mulps(xmm1, xmm3); movdqa(ptr[&m_local.d4.z], xmm1); - for(int i = 0; i < 4; i++) + for(int i = 0; i < (m_sel.notest ? 1 : 4); i++) { // m_local.d[i].z = dz * m_shift[i]; @@ -144,36 +144,6 @@ void GSSetupPrimCodeGenerator::Depth() movdqa(xmm0, ptr[ecx + offsetof(GSVertexSW, t)]); pshufd(xmm0, xmm0, _MM_SHUFFLE(3, 3, 3, 3)); - - /* - // GSVector4 z = p.zzzz(); - - shufps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2)); - - if(m_sel.zoverflow) - { - // m_local.p.z = (GSVector4i(z * 0.5f) << 1) | (GSVector4i(z) & GSVector4i::x00000001()); - - movaps(xmm1, ptr[&GSVector4::m_half]); - mulps(xmm1, xmm0); - cvttps2dq(xmm1, xmm1); - pslld(xmm1, 1); - - cvttps2dq(xmm0, xmm0); - pcmpeqd(xmm2, xmm2); - psrld(xmm2, 31); - pand(xmm0, xmm2); - - por(xmm0, xmm1); - } - else - { - // m_local.p.z = GSVector4i(z); - - cvttps2dq(xmm0, xmm0); - } - */ - movdqa(ptr[&m_local.p.z], xmm0); } } @@ -217,7 +187,7 @@ void GSSetupPrimCodeGenerator::Texture() movaps(xmm1, xmm0); shufps(xmm1, xmm1, (uint8)_MM_SHUFFLE(j, j, j, j)); - for(int i = 0; i < 4; i++) + for(int i = 0; i < (m_sel.notest ? 1 : 4); i++) { // GSVector4 v = ds/dt * m_shift[i]; @@ -282,7 +252,7 @@ void GSSetupPrimCodeGenerator::Color() shufps(xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0)); shufps(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2)); - for(int i = 0; i < 4; i++) + for(int i = 0; i < (m_sel.notest ? 1 : 4); i++) { // GSVector4i r = GSVector4i(dr * m_shift[i]).ps32(); @@ -315,7 +285,7 @@ void GSSetupPrimCodeGenerator::Color() shufps(xmm0, xmm0, _MM_SHUFFLE(1, 1, 1, 1)); shufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3)); - for(int i = 0; i < 4; i++) + for(int i = 0; i < (m_sel.notest ? 1 : 4); i++) { // GSVector4i g = GSVector4i(dg * m_shift[i]).ps32(); diff --git a/plugins/GSdx/GSState.cpp b/plugins/GSdx/GSState.cpp index f1259f76ca..3562a94f55 100644 --- a/plugins/GSdx/GSState.cpp +++ b/plugins/GSdx/GSState.cpp @@ -26,7 +26,7 @@ //#define Offset_ST // Fixes Persona3 mini map alignment which is off even in software rendering //#define Offset_UV // Fixes / breaks various titles -GSState::GSState(GSVertexTrace* vt, size_t vertex_stride) +GSState::GSState() : m_version(6) , m_mt(false) , m_irq(NULL) @@ -35,24 +35,20 @@ GSState::GSState(GSVertexTrace* vt, size_t vertex_stride) , m_crc(0) , m_options(0) , m_frameskip(0) - , m_vt(vt) + , m_vt(this) + , m_q(1.0f) + , m_texflush(true) { m_nativeres = !!theApp.GetConfig("nativeres", 0); memset(&m_v, 0, sizeof(m_v)); - m_q = 1.0f; memset(&m_vertex, 0, sizeof(m_vertex)); memset(&m_index, 0, sizeof(m_index)); - ASSERT(vertex_stride >= sizeof(GSVertex)); - - m_vertex.stride = vertex_stride; - m_vertex.tmp = (uint8*)_aligned_malloc(m_vertex.stride * 2, 32); + m_v.RGBAQ.Q = 1.0f; GrowVertexBuffer(); - memset(m_cv, 0, sizeof(m_cv)); - m_sssize = 0; m_sssize += sizeof(m_version); @@ -110,12 +106,16 @@ GSState::GSState(GSVertexTrace* vt, size_t vertex_stride) Reset(); ResetHandlers(); + + s_n = 0; + s_dump = !!theApp.GetConfig("dump", 0); + s_save = !!theApp.GetConfig("save", 0); + s_savez = !!theApp.GetConfig("savez", 0); + s_saven = theApp.GetConfig("saven", 0); } GSState::~GSState() { - _aligned_free(m_vertex.tmp); - if(m_vertex.buff) _aligned_free(m_vertex.buff); if(m_index.buff) _aligned_free(m_index.buff); } @@ -165,50 +165,28 @@ void GSState::SetFrameSkip(int skip) { m_fpGIFPackedRegHandlers[GIF_REG_XYZF2] = &GSState::GIFPackedRegHandlerNOP; m_fpGIFPackedRegHandlers[GIF_REG_XYZ2] = &GSState::GIFPackedRegHandlerNOP; - m_fpGIFPackedRegHandlers[GIF_REG_CLAMP_1] = &GSState::GIFPackedRegHandlerNOP; - m_fpGIFPackedRegHandlers[GIF_REG_CLAMP_2] = &GSState::GIFPackedRegHandlerNOP; - m_fpGIFPackedRegHandlers[GIF_REG_FOG] = &GSState::GIFPackedRegHandlerNOP; m_fpGIFPackedRegHandlers[GIF_REG_XYZF3] = &GSState::GIFPackedRegHandlerNOP; m_fpGIFPackedRegHandlers[GIF_REG_XYZ3] = &GSState::GIFPackedRegHandlerNOP; - m_fpGIFRegHandlers[GIF_A_D_REG_PRIM] = &GSState::GIFRegHandlerNOP; - m_fpGIFRegHandlers[GIF_A_D_REG_RGBAQ] = &GSState::GIFRegHandlerNOP; - m_fpGIFRegHandlers[GIF_A_D_REG_ST] = &GSState::GIFRegHandlerNOP; - m_fpGIFRegHandlers[GIF_A_D_REG_UV] = &GSState::GIFRegHandlerNOP; m_fpGIFRegHandlers[GIF_A_D_REG_XYZF2] = &GSState::GIFRegHandlerNOP; m_fpGIFRegHandlers[GIF_A_D_REG_XYZ2] = &GSState::GIFRegHandlerNOP; m_fpGIFRegHandlers[GIF_A_D_REG_XYZF3] = &GSState::GIFRegHandlerNOP; m_fpGIFRegHandlers[GIF_A_D_REG_XYZ3] = &GSState::GIFRegHandlerNOP; - m_fpGIFRegHandlers[GIF_A_D_REG_PRMODECONT] = &GSState::GIFRegHandlerNOP; - m_fpGIFRegHandlers[GIF_A_D_REG_PRMODE] = &GSState::GIFRegHandlerNOP; + + m_fpGIFPackedRegHandlersC[GIF_REG_STQRGBAXYZF2] = &GSState::GIFPackedRegHandlerNOP; + m_fpGIFPackedRegHandlersC[GIF_REG_STQRGBAXYZ2] = &GSState::GIFPackedRegHandlerNOP; } else { - m_fpGIFPackedRegHandlers[GIF_REG_XYZF2] = &GSState::GIFPackedRegHandlerXYZF2; - m_fpGIFPackedRegHandlers[GIF_REG_XYZ2] = &GSState::GIFPackedRegHandlerXYZ2; - m_fpGIFPackedRegHandlers[GIF_REG_XYZF3] = &GSState::GIFPackedRegHandlerXYZF2; - m_fpGIFPackedRegHandlers[GIF_REG_XYZ3] = &GSState::GIFPackedRegHandlerXYZ2; - m_fpGIFPackedRegHandlers[GIF_REG_CLAMP_1] = (GIFPackedRegHandler)(GIFRegHandler)&GSState::GIFRegHandlerCLAMP<0>; - m_fpGIFPackedRegHandlers[GIF_REG_CLAMP_2] = (GIFPackedRegHandler)(GIFRegHandler)&GSState::GIFRegHandlerCLAMP<1>; - m_fpGIFPackedRegHandlers[GIF_REG_FOG] = &GSState::GIFPackedRegHandlerFOG; - - m_fpGIFRegHandlers[GIF_A_D_REG_PRIM] = &GSState::GIFRegHandlerPRIM; - m_fpGIFRegHandlers[GIF_A_D_REG_RGBAQ] = &GSState::GIFRegHandlerRGBAQ; - m_fpGIFRegHandlers[GIF_A_D_REG_ST] = &GSState::GIFRegHandlerST; - m_fpGIFRegHandlers[GIF_A_D_REG_UV] = &GSState::GIFRegHandlerUV; - m_fpGIFRegHandlers[GIF_A_D_REG_XYZF2] = &GSState::GIFRegHandlerXYZF2; - m_fpGIFRegHandlers[GIF_A_D_REG_XYZ2] = &GSState::GIFRegHandlerXYZ2; - m_fpGIFRegHandlers[GIF_A_D_REG_XYZF3] = &GSState::GIFRegHandlerXYZF2; - m_fpGIFRegHandlers[GIF_A_D_REG_XYZ3] = &GSState::GIFRegHandlerXYZ2; - m_fpGIFRegHandlers[GIF_A_D_REG_PRMODECONT] = &GSState::GIFRegHandlerPRMODECONT; - m_fpGIFRegHandlers[GIF_A_D_REG_PRMODE] = &GSState::GIFRegHandlerPRMODE; - UpdateVertexKick(); } } void GSState::Reset() { + printf("GS reset\n"); + + // FIXME: memset(m_mem.m_vm8, 0, m_mem.m_vmsize); // bios logo not shown cut in half after reset, missing graphics in GoW after first FMV memset(&m_path[0], 0, sizeof(m_path[0]) * countof(m_path)); memset(&m_v, 0, sizeof(m_v)); @@ -223,6 +201,8 @@ void GSState::Reset() m_vertex.tail = 0; m_vertex.next = 0; m_index.tail = 0; + + m_texflush = true; } void GSState::ResetHandlers() @@ -253,6 +233,8 @@ void GSState::ResetHandlers() m_fpGIFRegHandlerXYZ[P][1] = &GSState::GIFRegHandlerXYZF2; \ m_fpGIFRegHandlerXYZ[P][2] = &GSState::GIFRegHandlerXYZ2; \ m_fpGIFRegHandlerXYZ[P][3] = &GSState::GIFRegHandlerXYZ2; \ + m_fpGIFPackedRegHandlerSTQRGBAXYZF2[P] = &GSState::GIFPackedRegHandlerSTQRGBAXYZF2

; \ + m_fpGIFPackedRegHandlerSTQRGBAXYZ2[P] = &GSState::GIFPackedRegHandlerSTQRGBAXYZ2

; \ SetHandlerXYZ(GS_POINTLIST); SetHandlerXYZ(GS_LINELIST); @@ -334,6 +316,8 @@ GSVector4i GSState::GetDisplayRect(int i) return r; } +// There's a problem when games expand/shrink and relocate the visible area since GSdx doesn't support +// moving the output area. (Disgaea 2 intro FMV when upscaling is used, also those games hackfixed below.) GSVector4i GSState::GetFrameRect(int i) { if(i < 0) i = IsEnabled(1) ? 1 : 0; @@ -356,12 +340,20 @@ GSVector4i GSState::GetFrameRect(int i) r.top = m_regs->DISP[i].DISPFB.DBY; r.right = r.left + w; r.bottom = r.top + h; - //printf("%d %d %d %d %d %d\n",w,h,r.left,r.top,r.right,r.bottom); + + /*static GSVector4i old_r = (GSVector4i) 0; + if ((old_r.left != r.left) || (old_r.right != r.right) || (old_r.top != r.top) || (old_r.right != r.right)){ + printf("w %d h %d left %d top %d right %d bottom %d\n",w,h,r.left,r.top,r.right,r.bottom); + } + old_r = r;*/ + return r; } GSVector2i GSState::GetDeviceSize(int i) { + // TODO: return (m_regs->SMODE1.CMOD & 1) ? GSVector2i(640, 576) : GSVector2i(640, 480); + // TODO: other params of SMODE1 should affect the true device display size // TODO2: pal games at 60Hz @@ -439,19 +431,12 @@ void GSState::GIFPackedRegHandlerRGBA(const GIFPackedReg* RESTRICT r) m_v.RGBAQ.u32[0] = (uint32)GSVector4i::store(v); - #elif _M_SSE >= 0x200 + #else GSVector4i v = GSVector4i::load(r) & GSVector4i::x000000ff(); m_v.RGBAQ.u32[0] = v.rgba32(); - #else - - m_v.RGBAQ.R = r->RGBA.R; - m_v.RGBAQ.G = r->RGBA.G; - m_v.RGBAQ.B = r->RGBA.B; - m_v.RGBAQ.A = r->RGBA.A; - #endif m_v.RGBAQ.Q = m_q; @@ -463,16 +448,11 @@ void GSState::GIFPackedRegHandlerSTQ(const GIFPackedReg* RESTRICT r) m_v.ST.u64 = r->u64[0]; - #elif _M_SSE >= 0x200 + #else GSVector4i v = GSVector4i::loadl(r); GSVector4i::storel(&m_v.ST.u64, v); - #else - - m_v.ST.S = r->STQ.S; - m_v.ST.T = r->STQ.T; - #endif m_q = r->STQ.Q; @@ -546,6 +526,69 @@ void GSState::GIFPackedRegHandlerNOP(const GIFPackedReg* RESTRICT r) { } +template +void GSState::GIFPackedRegHandlerSTQRGBAXYZF2(const GIFPackedReg* RESTRICT r, uint32 size) +{ + ASSERT(size > 0 && size % 3 == 0); + + const GIFPackedReg* RESTRICT r_end = r + size; + + while(r < r_end) + { + GSVector4i st = GSVector4i::loadl(&r[0].u64[0]); + GSVector4i q = GSVector4i::loadl(&r[0].u64[1]); + GSVector4i rgba = (GSVector4i::load(&r[1]) & GSVector4i::x000000ff()).ps32().pu16(); + + m_v.m[0] = st.upl64(rgba.upl32(q)); // TODO: only store the last one + + GSVector4i xy = GSVector4i::loadl(&r[2].u64[0]); + GSVector4i zf = GSVector4i::loadl(&r[2].u64[1]); + xy = xy.upl16(xy.srl<4>()).upl32(GSVector4i::loadl(&m_v.UV)); + zf = zf.srl32(4) & GSVector4i::x00ffffff().upl32(GSVector4i::x000000ff()); + + m_v.m[1] = xy.upl32(zf); // TODO: only store the last one + + VertexKick(r[2].XYZF2.Skip()); + + r += 3; + } + + m_q = r[-3].STQ.Q; // remember the last one, STQ outputs this to the temp Q each time +} + +template +void GSState::GIFPackedRegHandlerSTQRGBAXYZ2(const GIFPackedReg* RESTRICT r, uint32 size) +{ + ASSERT(size > 0 && size % 3 == 0); + + const GIFPackedReg* RESTRICT r_end = r + size; + + while(r < r_end) + { + GSVector4i st = GSVector4i::loadl(&r[0].u64[0]); + GSVector4i q = GSVector4i::loadl(&r[0].u64[1]); + GSVector4i rgba = (GSVector4i::load(&r[1]) & GSVector4i::x000000ff()).ps32().pu16(); + + m_v.m[0] = st.upl64(rgba.upl32(q)); // TODO: only store the last one + + GSVector4i xy = GSVector4i::loadl(&r[2].u64[0]); + GSVector4i z = GSVector4i::loadl(&r[2].u64[1]); + GSVector4i xyz = xy.upl16(xy.srl<4>()).upl32(z); + + m_v.m[1] = xyz.upl64(GSVector4i::loadl(&m_v.UV)); // TODO: only store the last one + + VertexKick(r[2].XYZ2.Skip()); + + r += 3; + } + + m_q = r[-3].STQ.Q; // remember the last one, STQ outputs this to the temp Q each time +} + +void GSState::GIFPackedRegHandlerNOP(const GIFPackedReg* RESTRICT r, uint32 size) +{ +} + // GIFRegHandler* void GSState::GIFRegHandlerNull(const GIFReg* RESTRICT r) @@ -553,13 +596,13 @@ void GSState::GIFRegHandlerNull(const GIFReg* RESTRICT r) // ASSERT(0); } -__forceinline void GSState::ApplyPRIM(const GIFRegPRIM& prim) +__forceinline void GSState::ApplyPRIM(uint32 prim) { // ASSERT(r->PRIM.PRIM < 7); - if(GSUtil::GetPrimClass(m_env.PRIM.PRIM) == GSUtil::GetPrimClass(prim.PRIM)) // NOTE: assume strips/fans are converted to lists + if(GSUtil::GetPrimClass(m_env.PRIM.PRIM) == GSUtil::GetPrimClass(prim & 7)) // NOTE: assume strips/fans are converted to lists { - if((m_env.PRIM.u32[0] ^ prim.u32[0]) & 0x7f8) // all fields except PRIM + if((m_env.PRIM.u32[0] ^ prim) & 0x7f8) // all fields except PRIM { Flush(); } @@ -569,8 +612,8 @@ __forceinline void GSState::ApplyPRIM(const GIFRegPRIM& prim) Flush(); } - m_env.PRIM = (GSVector4i)prim; - m_env.PRMODE._PRIM = prim.PRIM; + m_env.PRIM.u32[0] = prim; + m_env.PRMODE._PRIM = prim; UpdateContext(); @@ -590,7 +633,7 @@ void GSState::GIFRegHandlerPRIM(const GIFReg* RESTRICT r) { ALIGN_STACK(32); - ApplyPRIM(r->PRIM); + ApplyPRIM(r->PRIM.u32[0]); } void GSState::GIFRegHandlerRGBAQ(const GIFReg* RESTRICT r) @@ -681,17 +724,49 @@ template void GSState::ApplyTEX0(GIFRegTEX0& TEX0) if(wt) { GIFRegBITBLTBUF BITBLTBUF; - - BITBLTBUF.SBP = TEX0.CBP; - BITBLTBUF.SBW = 1; - BITBLTBUF.SPSM = TEX0.CSM; + GSVector4i r; - GSVector4i r = GSVector4i::zero(); + if(TEX0.CSM == 0) + { + BITBLTBUF.SBP = TEX0.CBP; + BITBLTBUF.SBW = 1; + BITBLTBUF.SPSM = TEX0.CSM; - r.right = GSLocalMemory::m_psm[TEX0.CPSM].pgs.x; - r.bottom = GSLocalMemory::m_psm[TEX0.CPSM].pgs.y; + r.left = 0; + r.top = 0; + r.right = GSLocalMemory::m_psm[TEX0.CPSM].bs.x; + r.bottom = GSLocalMemory::m_psm[TEX0.CPSM].bs.y; + + int blocks = 4; + + if(GSLocalMemory::m_psm[TEX0.CPSM].bpp == 16) + { + blocks >>= 1; + } + + if(GSLocalMemory::m_psm[TEX0.PSM].bpp == 4) + { + blocks >>= 1; + } - InvalidateLocalMem(BITBLTBUF, r, true); + for(int j = 0; j < blocks; j++, BITBLTBUF.SBP++) + { + InvalidateLocalMem(BITBLTBUF, r, true); + } + } + else + { + BITBLTBUF.SBP = TEX0.CBP; + BITBLTBUF.SBW = m_env.TEXCLUT.CBW; + BITBLTBUF.SPSM = TEX0.CSM; + + r.left = m_env.TEXCLUT.COU; + r.top = m_env.TEXCLUT.COV; + r.right = r.left + GSLocalMemory::m_psm[TEX0.CPSM].pal; + r.bottom = r.top + 1; + + InvalidateLocalMem(BITBLTBUF, r, true); + } m_mem.m_clut.Write(m_env.CTXT[i].TEX0, m_env.TEXCLUT); } @@ -701,8 +776,13 @@ template void GSState::GIFRegHandlerTEX0(const GIFReg* RESTRICT r) { GIFRegTEX0 TEX0 = r->TEX0; - if(TEX0.TW > 10) TEX0.TW = 10; - if(TEX0.TH > 10) TEX0.TH = 10; + // Tokyo Xtreme Racer Drift 2, TW/TH == 0, PRIM->FST == 1 + // Just setting the max texture size to make the texture cache allocate some surface. + // The vertex trace will narrow the updated area down to the minimum, upper-left 8x8 + // for a single letter, but it may address the whole thing if it wants to. + + if(TEX0.TW > 10 || TEX0.TW == 0) TEX0.TW = 10; + if(TEX0.TH > 10 || TEX0.TH == 0) TEX0.TH = 10; if((TEX0.TBW & 1) && (TEX0.PSM == PSM_PSMT8 || TEX0.PSM == PSM_PSMT4)) { @@ -915,7 +995,7 @@ void GSState::GIFRegHandlerFOGCOL(const GIFReg* RESTRICT r) void GSState::GIFRegHandlerTEXFLUSH(const GIFReg* RESTRICT r) { - // TRACE(_T("TEXFLUSH\n")); + m_texflush = true; } template void GSState::GIFRegHandlerSCISSOR(const GIFReg* RESTRICT r) @@ -1037,7 +1117,8 @@ template void GSState::GIFRegHandlerFRAME(const GIFReg* RESTRICT r) { m_env.CTXT[i].offset.fb = m_mem.GetOffset(r->FRAME.Block(), r->FRAME.FBW, r->FRAME.PSM); m_env.CTXT[i].offset.zb = m_mem.GetOffset(m_env.CTXT[i].ZBUF.Block(), r->FRAME.FBW, m_env.CTXT[i].ZBUF.PSM); - m_env.CTXT[i].offset.fzb = m_mem.GetPixelOffset4(r->FRAME, m_env.CTXT[i].ZBUF); + m_env.CTXT[i].offset.fzb = m_mem.GetPixelOffset(r->FRAME, m_env.CTXT[i].ZBUF); + m_env.CTXT[i].offset.fzb4 = m_mem.GetPixelOffset4(r->FRAME, m_env.CTXT[i].ZBUF); } m_env.CTXT[i].FRAME = (GSVector4i)r->FRAME; @@ -1075,7 +1156,8 @@ template void GSState::GIFRegHandlerZBUF(const GIFReg* RESTRICT r) if((m_env.CTXT[i].ZBUF.u32[0] ^ ZBUF.u32[0]) & 0x3f0001ff) // ZBP PSM { m_env.CTXT[i].offset.zb = m_mem.GetOffset(ZBUF.Block(), m_env.CTXT[i].FRAME.FBW, ZBUF.PSM); - m_env.CTXT[i].offset.fzb = m_mem.GetPixelOffset4(m_env.CTXT[i].FRAME, ZBUF); + m_env.CTXT[i].offset.fzb = m_mem.GetPixelOffset(m_env.CTXT[i].FRAME, ZBUF); + m_env.CTXT[i].offset.fzb4 = m_mem.GetPixelOffset4(m_env.CTXT[i].FRAME, ZBUF); } m_env.CTXT[i].ZBUF = (GSVector4i)ZBUF; @@ -1230,40 +1312,8 @@ void GSState::FlushPrim() { if(m_index.tail > 0) { - if(0) - { - uint8* buff = new uint8[m_vertex.next]; + GSVertex buff[2]; - memset(buff, 0, m_vertex.next); - - for(size_t i = 0; i < m_index.tail; i++) - { - ASSERT(m_index.buff[i] < m_vertex.next); - - buff[m_index.buff[i]] = 1; - } - - size_t count = 0; - - for(size_t i = 0; i < m_vertex.next; i++) - { - if(buff[i] == 0) - { - count++; - } - } - - if(count > 0) - { - printf("unref %lld %d/%d\n", m_perfmon.GetFrame(), count, m_vertex.next); - } - - delete [] buff; - } - - uint8* buff = m_vertex.tmp; - - size_t stride = m_vertex.stride; size_t head = m_vertex.head; size_t tail = m_vertex.tail; size_t next = m_vertex.next; @@ -1282,11 +1332,11 @@ void GSState::FlushPrim() case GS_TRIANGLELIST: case GS_TRIANGLESTRIP: unused = tail - head; - memcpy(buff, &m_vertex.buff[stride * head], stride * unused); + memcpy(buff, &m_vertex.buff[head], sizeof(GSVertex) * unused); break; case GS_TRIANGLEFAN: - memcpy(buff, &m_vertex.buff[stride * head], stride); unused = 1; - if(tail - 1 > head) {memcpy(&buff[stride], &m_vertex.buff[stride * (tail - 1)], stride); unused = 2;} + buff[0] = m_vertex.buff[head]; unused = 1; + if(tail - 1 > head) {buff[1] = m_vertex.buff[tail - 1]; unused = 2;} break; case GS_INVALID: break; @@ -1301,7 +1351,7 @@ void GSState::FlushPrim() { // FIXME: berserk fpsm = 27 (8H) - m_vt->Update(m_vertex.buff, m_index.buff, m_index.tail, GSUtil::GetPrimClass(PRIM->PRIM)); + m_vt.Update(m_vertex.buff, m_index.buff, m_index.tail, GSUtil::GetPrimClass(PRIM->PRIM)); Draw(); @@ -1315,7 +1365,7 @@ void GSState::FlushPrim() if(unused > 0) { - memcpy(m_vertex.buff, buff, stride * unused); + memcpy(m_vertex.buff, buff, sizeof(GSVertex) * unused); m_vertex.tail = unused; m_vertex.next = next > head ? next - head : 0; @@ -1641,7 +1691,7 @@ void GSState::SoftReset(uint32 mask) m_env.TRXDIR.XDIR = 3; //-1 ; set it to invalid value - m_q = 1; + m_q = 1.0f; } void GSState::ReadFIFO(uint8* mem, int size) @@ -1665,6 +1715,8 @@ template void GSState::Transfer<1>(const uint8* mem, uint32 size); template void GSState::Transfer<2>(const uint8* mem, uint32 size); template void GSState::Transfer<3>(const uint8* mem, uint32 size); +static hash_map s_tags; + template void GSState::Transfer(const uint8* mem, uint32 size) { GSPerfMonAutoTimer pmat(&m_perfmon); @@ -1679,6 +1731,16 @@ template void GSState::Transfer(const uint8* mem, uint32 size) { path.SetTag(mem); + if(0) + { + GIFTag* t = (GIFTag*)mem; + uint64 hash; + if(t->NREG < 8) hash = t->u32[2] & ((1 << t->NREG * 4) - 1); + else if(t->NREG < 16) {hash = t->u32[2]; ((uint32*)&hash)[1] = t->u32[3] & ((1 << (t->NREG - 8) * 4) - 1);} + else hash = t->u64[1]; + s_tags[hash] += path.nloop * path.nreg; + } + mem += sizeof(GIFTag); size--; @@ -1690,9 +1752,7 @@ template void GSState::Transfer(const uint8* mem, uint32 size) if(path.tag.PRE && path.tag.FLG == GIF_FLG_PACKED) { - GIFRegPRIM r; - r.u64 = path.tag.PRIM; - ApplyPRIM(r); + ApplyPRIM(path.tag.PRIM); } } } @@ -1726,8 +1786,28 @@ template void GSState::Transfer(const uint8* mem, uint32 size) { size -= total; - if(path.adonly) + switch(path.type) { + case GIFPath::TYPE_UNKNOWN: + + { + uint32 reg = 0; + + do + { + (this->*m_fpGIFPackedRegHandlers[path.GetReg(reg++)])((GIFPackedReg*)mem); + + mem += sizeof(GIFPackedReg); + + reg = reg & ((int)(reg - path.nreg) >> 31); // resets reg back to 0 when it becomes equal to path.nreg + } + while(--total > 0); + } + + break; + + case GIFPath::TYPE_ADONLY: // very common + do { (this->*m_fpGIFRegHandlers[((GIFPackedReg*)mem)->A_D.ADDR])(&((GIFPackedReg*)mem)->r); @@ -1735,20 +1815,28 @@ template void GSState::Transfer(const uint8* mem, uint32 size) mem += sizeof(GIFPackedReg); } while(--total > 0); - } - else - { - uint32 reg = 0; - do - { - (this->*m_fpGIFPackedRegHandlers[path.GetReg(reg++)])((GIFPackedReg*)mem); + break; + + case GIFPath::TYPE_STQRGBAXYZF2: // majority of the vertices are formatted like this - mem += sizeof(GIFPackedReg); + (this->*m_fpGIFPackedRegHandlersC[GIF_REG_STQRGBAXYZF2])((GIFPackedReg*)mem, total); - reg = reg & ((int)(reg - path.nreg) >> 31); // resets reg back to 0 when it becomes equal to path.nreg - } - while(--total > 0); + mem += total * sizeof(GIFPackedReg); + + break; + + case GIFPath::TYPE_STQRGBAXYZ2: + + (this->*m_fpGIFPackedRegHandlersC[GIF_REG_STQRGBAXYZ2])((GIFPackedReg*)mem, total); + + mem += total * sizeof(GIFPackedReg); + + break; + + default: + + __assume(0); } path.nloop = 0; @@ -1952,6 +2040,12 @@ int GSState::Freeze(GSFreezeData* fd, bool sizeonly) { m_path[i].tag.NREG = m_path[i].nreg; m_path[i].tag.NLOOP = m_path[i].nloop; + m_path[i].tag.REGS = 0; + + for(size_t j = 0; j < countof(m_path[i].regs.u8); j++) + { + m_path[i].tag.u32[2 + (j >> 3)] |= m_path[i].regs.u8[j] << ((j & 7) << 2); + } WriteState(data, &m_path[i].tag); WriteState(data, &m_path[i].reg); @@ -2070,7 +2164,8 @@ int GSState::Defrost(const GSFreezeData* fd) m_env.CTXT[i].offset.fb = m_mem.GetOffset(m_env.CTXT[i].FRAME.Block(), m_env.CTXT[i].FRAME.FBW, m_env.CTXT[i].FRAME.PSM); m_env.CTXT[i].offset.zb = m_mem.GetOffset(m_env.CTXT[i].ZBUF.Block(), m_env.CTXT[i].FRAME.FBW, m_env.CTXT[i].ZBUF.PSM); m_env.CTXT[i].offset.tex = m_mem.GetOffset(m_env.CTXT[i].TEX0.TBP0, m_env.CTXT[i].TEX0.TBW, m_env.CTXT[i].TEX0.PSM); - m_env.CTXT[i].offset.fzb = m_mem.GetPixelOffset4(m_env.CTXT[i].FRAME, m_env.CTXT[i].ZBUF); + m_env.CTXT[i].offset.fzb = m_mem.GetPixelOffset(m_env.CTXT[i].FRAME, m_env.CTXT[i].ZBUF); + m_env.CTXT[i].offset.fzb4 = m_mem.GetPixelOffset4(m_env.CTXT[i].FRAME, m_env.CTXT[i].ZBUF); } UpdateScissor(); @@ -2104,6 +2199,8 @@ void GSState::UpdateScissor() void GSState::UpdateVertexKick() { + if(m_frameskip) return; + uint32 prim = PRIM->PRIM; m_fpGIFPackedRegHandlers[GIF_REG_XYZF2] = m_fpGIFPackedRegHandlerXYZ[prim][0]; @@ -2116,19 +2213,20 @@ void GSState::UpdateVertexKick() m_fpGIFRegHandlers[GIF_A_D_REG_XYZ2] = m_fpGIFRegHandlerXYZ[prim][2]; m_fpGIFRegHandlers[GIF_A_D_REG_XYZ3] = m_fpGIFRegHandlerXYZ[prim][3]; - m_cvf = m_cv[prim][PRIM->TME][PRIM->FST]; + m_fpGIFPackedRegHandlersC[GIF_REG_STQRGBAXYZF2] = m_fpGIFPackedRegHandlerSTQRGBAXYZF2[prim]; + m_fpGIFPackedRegHandlersC[GIF_REG_STQRGBAXYZ2] = m_fpGIFPackedRegHandlerSTQRGBAXYZ2[prim]; } void GSState::GrowVertexBuffer() { int maxcount = std::max(m_vertex.maxcount * 3 / 2, 10000); - uint8* vertex = (uint8*)_aligned_malloc(m_vertex.stride * maxcount, 16); + GSVertex* vertex = (GSVertex*)_aligned_malloc(sizeof(GSVertex) * maxcount, 16); uint32* index = (uint32*)_aligned_malloc(sizeof(uint32) * maxcount * 3, 16); // worst case is slightly less than vertex number * 3 if(m_vertex.buff != NULL) { - memcpy(vertex, m_vertex.buff, m_vertex.stride * m_vertex.tail); + memcpy(vertex, m_vertex.buff, sizeof(GSVertex) * m_vertex.tail); _aligned_free(m_vertex.buff); } @@ -2160,17 +2258,13 @@ __forceinline void GSState::VertexKick(uint32 skip) GSVector4i v0(m_v.m[0]); GSVector4i v1(m_v.m[1]); - GSVector4i* RESTRICT tailptr = (GSVector4i*)&m_vertex.buff[m_vertex.stride * tail]; + GSVector4i* RESTRICT tailptr = (GSVector4i*)&m_vertex.buff[tail]; tailptr[0] = v0; tailptr[1] = v1; m_vertex.xy[xy_tail & 3] = GSVector4(v1.upl32(v1.sub16(GSVector4i::load(m_ofxy)).sra16(4)).upl16()); // zw not sign extended, only useful for eq tests - #ifdef _DEBUG - memset(&tailptr[2], 0, m_vertex.stride - sizeof(GSVertex)); - #endif - m_vertex.tail = ++tail; m_vertex.xy_tail = ++xy_tail; @@ -2286,8 +2380,6 @@ __forceinline void GSState::VertexKick(uint32 skip) uint32* RESTRICT buff = &m_index.buff[m_index.tail]; - size_t src_index = head; - switch(prim) { case GS_POINTLIST: @@ -2295,7 +2387,6 @@ __forceinline void GSState::VertexKick(uint32 skip) m_vertex.head = head + 1; m_vertex.next = head + 1; m_index.tail += 1; - (this->*m_cvf)(head, head); break; case GS_LINELIST: buff[0] = head + 0; @@ -2303,18 +2394,20 @@ __forceinline void GSState::VertexKick(uint32 skip) m_vertex.head = head + 2; m_vertex.next = head + 2; m_index.tail += 2; - (this->*m_cvf)(head + 0, head + 0); - (this->*m_cvf)(head + 1, head + 1); break; case GS_LINESTRIP: - if(next < head) {head = next; m_vertex.tail = next + 2;} + if(next < head) + { + m_vertex.buff[next + 0] = m_vertex.buff[head + 0]; + m_vertex.buff[next + 1] = m_vertex.buff[head + 1]; + head = next; + m_vertex.tail = next + 2; + } buff[0] = head + 0; buff[1] = head + 1; m_vertex.head = head + 1; m_vertex.next = head + 2; m_index.tail += 2; - if(head + 0 >= next) (this->*m_cvf)(head + 0, src_index + 0); - /*if(head + 1 >= next)*/ (this->*m_cvf)(head + 1, src_index + 1); // this is always a new vertex break; case GS_TRIANGLELIST: buff[0] = head + 0; @@ -2323,21 +2416,22 @@ __forceinline void GSState::VertexKick(uint32 skip) m_vertex.head = head + 3; m_vertex.next = head + 3; m_index.tail += 3; - (this->*m_cvf)(head + 0, head + 0); - (this->*m_cvf)(head + 1, head + 1); - (this->*m_cvf)(head + 2, head + 2); break; case GS_TRIANGLESTRIP: - if(next < head) {head = next; m_vertex.tail = next + 3;} + if(next < head) + { + m_vertex.buff[next + 0] = m_vertex.buff[head + 0]; + m_vertex.buff[next + 1] = m_vertex.buff[head + 1]; + m_vertex.buff[next + 2] = m_vertex.buff[head + 2]; + head = next; + m_vertex.tail = next + 3; + } buff[0] = head + 0; buff[1] = head + 1; buff[2] = head + 2; m_vertex.head = head + 1; m_vertex.next = head + 3; m_index.tail += 3; - if(src_index + 0 >= next) (this->*m_cvf)(head + 0, src_index + 0); - if(src_index + 1 >= next) (this->*m_cvf)(head + 1, src_index + 1); - /*if(src_index + 2 >= next)*/ (this->*m_cvf)(head + 2, src_index + 2); // this is always a new vertex break; case GS_TRIANGLEFAN: // TODO: remove gaps, next == head && head < tail - 3 || next > head && next < tail - 2 (very rare) @@ -2346,9 +2440,6 @@ __forceinline void GSState::VertexKick(uint32 skip) buff[2] = tail - 1; m_vertex.next = tail; m_index.tail += 3; - if(head >= next) (this->*m_cvf)(head, head); - if(tail - 2 >= next) (this->*m_cvf)(tail - 2, tail - 2); - /*if(tail - 1 >= next)*/ (this->*m_cvf)(tail - 1, tail - 1); // this is always a new vertex break; case GS_SPRITE: buff[0] = head + 0; @@ -2356,10 +2447,8 @@ __forceinline void GSState::VertexKick(uint32 skip) m_vertex.head = head + 2; m_vertex.next = head + 2; m_index.tail += 2; - (this->*m_cvf)(head + 0, head + 0); - (this->*m_cvf)(head + 1, head + 1); break; - case GS_INVALID: + case GS_INVALID: m_vertex.tail = head; break; default: @@ -2425,7 +2514,7 @@ void GSState::GetTextureMinMax(GSVector4i& r, const GIFRegTEX0& TEX0, const GIFR if(wms + wmt < 6) { - GSVector4 st = m_vt->m_min.t.xyxy(m_vt->m_max.t); + GSVector4 st = m_vt.m_min.t.xyxy(m_vt.m_max.t); if(linear) { @@ -2503,7 +2592,7 @@ void GSState::GetTextureMinMax(GSVector4i& r, const GIFRegTEX0& TEX0, const GIFR void GSState::GetAlphaMinMax() { - if(m_vt->m_alpha.valid) + if(m_vt.m_alpha.valid) { return; } @@ -2511,7 +2600,7 @@ void GSState::GetAlphaMinMax() const GSDrawingEnvironment& env = m_env; const GSDrawingContext* context = m_context; - GSVector4i a = m_vt->m_min.c.uph32(m_vt->m_max.c).zzww(); + GSVector4i a = m_vt.m_min.c.uph32(m_vt.m_max.c).zzww(); if(PRIM->TME && context->TEX0.TCC) { @@ -2563,9 +2652,9 @@ void GSState::GetAlphaMinMax() } } - m_vt->m_alpha.min = a.x; - m_vt->m_alpha.max = a.z; - m_vt->m_alpha.valid = true; + m_vt.m_alpha.min = a.x; + m_vt.m_alpha.max = a.z; + m_vt.m_alpha.valid = true; } bool GSState::TryAlphaTest(uint32& fm, uint32& zm) @@ -2582,8 +2671,8 @@ bool GSState::TryAlphaTest(uint32& fm, uint32& zm) { GetAlphaMinMax(); - int amin = m_vt->m_alpha.min; - int amax = m_vt->m_alpha.max; + int amin = m_vt.m_alpha.min; + int amax = m_vt.m_alpha.max; int aref = context->TEST.AREF; @@ -2667,8 +2756,8 @@ bool GSState::IsOpaque() { GetAlphaMinMax(); - amin = m_vt->m_alpha.min; - amax = m_vt->m_alpha.max; + amin = m_vt.m_alpha.min; + amax = m_vt.m_alpha.max; } else if(context->ALPHA.C == 1) { diff --git a/plugins/GSdx/GSState.h b/plugins/GSdx/GSState.h index 0624f39267..8e4431d7be 100644 --- a/plugins/GSdx/GSState.h +++ b/plugins/GSdx/GSState.h @@ -59,8 +59,18 @@ class GSState : public GSAlignedClass<32> GIFRegHandler m_fpGIFRegHandlers[256]; GIFRegHandler m_fpGIFRegHandlerXYZ[8][4]; + typedef void (GSState::*GIFPackedRegHandlerC)(const GIFPackedReg* RESTRICT r, uint32 size); + + GIFPackedRegHandlerC m_fpGIFPackedRegHandlersC[2]; + GIFPackedRegHandlerC m_fpGIFPackedRegHandlerSTQRGBAXYZF2[8]; + GIFPackedRegHandlerC m_fpGIFPackedRegHandlerSTQRGBAXYZ2[8]; + + template void GIFPackedRegHandlerSTQRGBAXYZF2(const GIFPackedReg* RESTRICT r, uint32 size); + template void GIFPackedRegHandlerSTQRGBAXYZ2(const GIFPackedReg* RESTRICT r, uint32 size); + void GIFPackedRegHandlerNOP(const GIFPackedReg* RESTRICT r, uint32 size); + template void ApplyTEX0(GIFRegTEX0& TEX0); - void ApplyPRIM(const GIFRegPRIM& PRIM); + void ApplyPRIM(uint32 prim); void GIFRegHandlerNull(const GIFReg* RESTRICT r); void GIFRegHandlerPRIM(const GIFReg* RESTRICT r); @@ -133,15 +143,14 @@ protected: float m_q; GSVector4 m_scissor; uint32 m_ofxy; + bool m_texflush; struct { - uint8* buff; - size_t stride; + GSVertex* buff; size_t head, tail, next, maxcount; // head: first vertex, tail: last vertex + 1, next: last indexed + 1 GSVector4 xy[4]; size_t xy_tail; - uint8* tmp; } m_vertex; struct @@ -150,26 +159,6 @@ protected: size_t tail; } m_index; - typedef void (GSState::*ConvertVertexPtr)(size_t dst_index, size_t src_index); - - ConvertVertexPtr m_cv[8][2][2], m_cvf; // [PRIM][TME][FST] - - #define InitConvertVertex2(T, P) \ - m_cv[P][0][0] = (ConvertVertexPtr)&T::ConvertVertex; \ - m_cv[P][0][1] = (ConvertVertexPtr)&T::ConvertVertex; \ - m_cv[P][1][0] = (ConvertVertexPtr)&T::ConvertVertex; \ - m_cv[P][1][1] = (ConvertVertexPtr)&T::ConvertVertex; \ - - #define InitConvertVertex(T) \ - InitConvertVertex2(T, GS_POINTLIST) \ - InitConvertVertex2(T, GS_LINELIST) \ - InitConvertVertex2(T, GS_LINESTRIP) \ - InitConvertVertex2(T, GS_TRIANGLELIST) \ - InitConvertVertex2(T, GS_TRIANGLESTRIP) \ - InitConvertVertex2(T, GS_TRIANGLEFAN) \ - InitConvertVertex2(T, GS_SPRITE) \ - InitConvertVertex2(T, GS_INVALID) \ - void UpdateContext(); void UpdateScissor(); @@ -182,7 +171,7 @@ protected: // following functions need m_vt to be initialized - GSVertexTrace* m_vt; + GSVertexTrace m_vt; void GetTextureMinMax(GSVector4i& r, const GIFRegTEX0& TEX0, const GIFRegCLAMP& CLAMP, bool linear); void GetAlphaMinMax(); @@ -205,8 +194,14 @@ public: GSDump m_dump; bool m_nativeres; + int s_n; + bool s_dump; + bool s_save; + bool s_savez; + int s_saven; + public: - GSState(GSVertexTrace* vt, size_t vertex_stride); + GSState(); virtual ~GSState(); void ResetHandlers(); diff --git a/plugins/GSdx/GSTexture11.cpp b/plugins/GSdx/GSTexture11.cpp index da0c285f05..9f199fbb36 100644 --- a/plugins/GSdx/GSTexture11.cpp +++ b/plugins/GSdx/GSTexture11.cpp @@ -167,6 +167,18 @@ GSTexture11::operator ID3D11ShaderResourceView*() return m_srv; } +GSTexture11::operator ID3D11UnorderedAccessView*() +{ + if(!m_uav && m_dev && m_texture) + { + ASSERT(!m_msaa); + + m_dev->CreateUnorderedAccessView(m_texture, NULL, &m_uav); + } + + return m_uav; +} + GSTexture11::operator ID3D11RenderTargetView*() { ASSERT(m_dev); diff --git a/plugins/GSdx/GSTexture11.h b/plugins/GSdx/GSTexture11.h index 1f78df5f34..2d287ac4f9 100644 --- a/plugins/GSdx/GSTexture11.h +++ b/plugins/GSdx/GSTexture11.h @@ -30,6 +30,7 @@ class GSTexture11 : public GSTexture CComPtr m_texture; D3D11_TEXTURE2D_DESC m_desc; CComPtr m_srv; + CComPtr m_uav; CComPtr m_rtv; CComPtr m_dsv; @@ -43,6 +44,7 @@ public: operator ID3D11Texture2D*(); operator ID3D11ShaderResourceView*(); + operator ID3D11UnorderedAccessView*(); operator ID3D11RenderTargetView*(); operator ID3D11DepthStencilView*(); }; diff --git a/plugins/GSdx/GSTextureCache.cpp b/plugins/GSdx/GSTextureCache.cpp index 8a9e572cee..49743f557c 100644 --- a/plugins/GSdx/GSTextureCache.cpp +++ b/plugins/GSdx/GSTextureCache.cpp @@ -281,6 +281,8 @@ GSTextureCache::Target* GSTextureCache::LookupTarget(const GIFRegTEX0& TEX0, int { return NULL; } + + m_renderer->m_dev->ClearRenderTarget(dst->m_texture, 0); // new frame buffers after reset should be cleared, don't display memory garbage } else { diff --git a/plugins/GSdx/GSTextureCacheSW.cpp b/plugins/GSdx/GSTextureCacheSW.cpp index b0d6d83fbf..37e2720067 100644 --- a/plugins/GSdx/GSTextureCacheSW.cpp +++ b/plugins/GSdx/GSTextureCacheSW.cpp @@ -178,6 +178,11 @@ GSTextureCacheSW::Texture::Texture(GSState* state, uint32 tw0, const GIFRegTEX0& m_TEX0 = TEX0; m_TEXA = TEXA; + if(m_tw == 0) + { + m_tw = std::max(m_TEX0.TW, GSLocalMemory::m_psm[m_TEX0.PSM].pal == 0 ? 3 : 5); // makes one row 32 bytes at least, matches the smallest block size that is allocated for m_buff + } + memset(m_valid, 0, sizeof(m_valid)); memset(m_pages.bm, 0, sizeof(m_pages.bm)); @@ -239,17 +244,6 @@ bool GSTextureCacheSW::Texture::Update(const GSVector4i& rect) if(m_buff == NULL) { - uint32 tw0 = std::max(m_TEX0.TW, 5 - shift); // makes one row 32 bytes at least, matches the smallest block size that is allocated for m_buff - - if(m_tw == 0) - { - m_tw = tw0; - } - else - { - ASSERT(m_tw >= tw0); - } - uint32 pitch = (1 << m_tw) << shift; m_buff = _aligned_malloc(pitch * th * 4, 32); diff --git a/plugins/GSdx/GSTextureFX11.cpp b/plugins/GSdx/GSTextureFX11.cpp index 354efd530f..5e7171b294 100644 --- a/plugins/GSdx/GSTextureFX11.cpp +++ b/plugins/GSdx/GSTextureFX11.cpp @@ -82,13 +82,6 @@ bool GSDevice11::CreateTextureFX() return true; } -void GSDevice11::SetupIA(const void* vertex, int vertex_count, const uint32* index, int index_count, int prim) -{ - IASetVertexBuffer(vertex, sizeof(GSVertexHW11), vertex_count); - IASetIndexBuffer(index, index_count); - IASetPrimitiveTopology((D3D11_PRIMITIVE_TOPOLOGY)prim); -} - void GSDevice11::SetupVS(VSSelector sel, const VSConstantBuffer* cb) { hash_map::const_iterator i = m_vs.find(sel); @@ -118,6 +111,7 @@ void GSDevice11::SetupVS(VSSelector sel, const VSConstantBuffer* cb) {"TEXCOORD", 1, DXGI_FORMAT_R32_FLOAT, 0, 12, D3D11_INPUT_PER_VERTEX_DATA, 0}, {"POSITION", 0, DXGI_FORMAT_R16G16_UINT, 0, 16, D3D11_INPUT_PER_VERTEX_DATA, 0}, {"POSITION", 1, DXGI_FORMAT_R32_UINT, 0, 20, D3D11_INPUT_PER_VERTEX_DATA, 0}, + {"TEXCOORD", 2, DXGI_FORMAT_R16G16_UINT, 0, 24, D3D11_INPUT_PER_VERTEX_DATA, 0}, {"COLOR", 1, DXGI_FORMAT_R8G8B8A8_UNORM, 0, 28, D3D11_INPUT_PER_VERTEX_DATA, 0}, }; diff --git a/plugins/GSdx/GSTextureFX9.cpp b/plugins/GSdx/GSTextureFX9.cpp index aff4a33d90..6173334121 100644 --- a/plugins/GSdx/GSTextureFX9.cpp +++ b/plugins/GSdx/GSTextureFX9.cpp @@ -61,13 +61,6 @@ GSTexture* GSDevice9::CreateMskFix(uint32 size, uint32 msk, uint32 fix) return t; } -void GSDevice9::SetupIA(const void* vertex, int vertex_count, const uint32* index, int index_count, int prim) -{ - IASetVertexBuffer(vertex, sizeof(GSVertexHW9), vertex_count); - IASetIndexBuffer(index, index_count); - IASetPrimitiveTopology((D3DPRIMITIVETYPE)prim); -} - void GSDevice9::SetupVS(VSSelector sel, const VSConstantBuffer* cb) { hash_map::const_iterator i = m_vs.find(sel); diff --git a/plugins/GSdx/GSThread.cpp b/plugins/GSdx/GSThread.cpp index 54284285f0..995772bee1 100644 --- a/plugins/GSdx/GSThread.cpp +++ b/plugins/GSdx/GSThread.cpp @@ -28,9 +28,13 @@ InitializeConditionVariablePtr pInitializeConditionVariable; WakeConditionVariablePtr pWakeConditionVariable; WakeAllConditionVariablePtr pWakeAllConditionVariable; SleepConditionVariableSRWPtr pSleepConditionVariableSRW; -InitializeSRWLockPtr pInitializeSRWLock;; +InitializeSRWLockPtr pInitializeSRWLock; AcquireSRWLockExclusivePtr pAcquireSRWLockExclusive; +TryAcquireSRWLockExclusivePtr pTryAcquireSRWLockExclusive; ReleaseSRWLockExclusivePtr pReleaseSRWLockExclusive; +AcquireSRWLockSharedPtr pAcquireSRWLockShared; +TryAcquireSRWLockSharedPtr pTryAcquireSRWLockShared; +ReleaseSRWLockSharedPtr pReleaseSRWLockShared; class InitCondVar { @@ -47,7 +51,11 @@ public: pSleepConditionVariableSRW = (SleepConditionVariableSRWPtr)GetProcAddress(m_kernel32, "SleepConditionVariableSRW"); pInitializeSRWLock = (InitializeSRWLockPtr)GetProcAddress(m_kernel32, "InitializeSRWLock"); pAcquireSRWLockExclusive = (AcquireSRWLockExclusivePtr)GetProcAddress(m_kernel32, "AcquireSRWLockExclusive"); + pTryAcquireSRWLockExclusive = (TryAcquireSRWLockExclusivePtr)GetProcAddress(m_kernel32, "TryAcquireSRWLockExclusive"); pReleaseSRWLockExclusive = (ReleaseSRWLockExclusivePtr)GetProcAddress(m_kernel32, "ReleaseSRWLockExclusive"); + pAcquireSRWLockShared = (AcquireSRWLockSharedPtr)GetProcAddress(m_kernel32, "AcquireSRWLockShared"); + pTryAcquireSRWLockShared = (TryAcquireSRWLockSharedPtr)GetProcAddress(m_kernel32, "TryAcquireSRWLockShared"); + pReleaseSRWLockShared = (ReleaseSRWLockSharedPtr)GetProcAddress(m_kernel32, "ReleaseSRWLockShared"); } virtual ~InitCondVar() diff --git a/plugins/GSdx/GSThread.h b/plugins/GSdx/GSThread.h index 99247d8431..4a9e7223dc 100644 --- a/plugins/GSdx/GSThread.h +++ b/plugins/GSdx/GSThread.h @@ -21,25 +21,56 @@ #pragma once +#include "GSdx.h" + +class IGSThread +{ +protected: + virtual void ThreadProc() = 0; +}; + +class IGSLock +{ +public: + virtual void Lock() = 0; + virtual bool TryLock() = 0; + virtual void Unlock() = 0; +}; + +class IGSEvent +{ +public: + virtual void Set() = 0; + virtual bool Wait(IGSLock* l) = 0; +}; + #ifdef _WINDOWS typedef void (WINAPI * InitializeConditionVariablePtr)(CONDITION_VARIABLE* ConditionVariable); typedef void (WINAPI * WakeConditionVariablePtr)(CONDITION_VARIABLE* ConditionVariable); typedef void (WINAPI * WakeAllConditionVariablePtr)(CONDITION_VARIABLE* ConditionVariable); -typedef void (WINAPI * SleepConditionVariableSRWPtr)(CONDITION_VARIABLE* ConditionVariable, SRWLOCK* SRWLock, DWORD dwMilliseconds, ULONG Flags); +typedef BOOL (WINAPI * SleepConditionVariableSRWPtr)(CONDITION_VARIABLE* ConditionVariable, SRWLOCK* SRWLock, DWORD dwMilliseconds, ULONG Flags); typedef void (WINAPI * InitializeSRWLockPtr)(SRWLOCK* SRWLock); typedef void (WINAPI * AcquireSRWLockExclusivePtr)(SRWLOCK* SRWLock); +typedef BOOLEAN (WINAPI * TryAcquireSRWLockExclusivePtr)(SRWLOCK* SRWLock); typedef void (WINAPI * ReleaseSRWLockExclusivePtr)(SRWLOCK* SRWLock); +typedef void (WINAPI * AcquireSRWLockSharedPtr)(SRWLOCK* SRWLock); +typedef BOOLEAN (WINAPI * TryAcquireSRWLockSharedPtr)(SRWLOCK* SRWLock); +typedef void (WINAPI * ReleaseSRWLockSharedPtr)(SRWLOCK* SRWLock); extern InitializeConditionVariablePtr pInitializeConditionVariable; extern WakeConditionVariablePtr pWakeConditionVariable; extern WakeAllConditionVariablePtr pWakeAllConditionVariable; extern SleepConditionVariableSRWPtr pSleepConditionVariableSRW; -extern InitializeSRWLockPtr pInitializeSRWLock;; +extern InitializeSRWLockPtr pInitializeSRWLock; extern AcquireSRWLockExclusivePtr pAcquireSRWLockExclusive; +extern TryAcquireSRWLockExclusivePtr pTryAcquireSRWLockExclusive; extern ReleaseSRWLockExclusivePtr pReleaseSRWLockExclusive; +extern AcquireSRWLockSharedPtr pAcquireSRWLockShared; +extern TryAcquireSRWLockSharedPtr pTryAcquireSRWLockShared; +extern ReleaseSRWLockSharedPtr pReleaseSRWLockShared; -class GSThread +class GSThread : public IGSThread { DWORD m_ThreadId; HANDLE m_hThread; @@ -47,8 +78,6 @@ class GSThread static DWORD WINAPI StaticThreadProc(void* lpParam); protected: - virtual void ThreadProc() = 0; - void CreateThread(); void CloseThread(); @@ -57,7 +86,7 @@ public: virtual ~GSThread(); }; -class GSCritSec +class GSCritSec : public IGSLock { CRITICAL_SECTION m_cs; @@ -65,26 +94,25 @@ public: GSCritSec() {InitializeCriticalSection(&m_cs);} ~GSCritSec() {DeleteCriticalSection(&m_cs);} - void Lock() {EnterCriticalSection(&m_cs);} - bool TryLock() {return TryEnterCriticalSection(&m_cs) == TRUE;} - void Unlock() {LeaveCriticalSection(&m_cs);} + void Lock() {EnterCriticalSection(&m_cs);} + bool TryLock() {return TryEnterCriticalSection(&m_cs) == TRUE;} + void Unlock() {LeaveCriticalSection(&m_cs);} }; -class GSEvent +class GSEvent : public IGSEvent { protected: HANDLE m_hEvent; public: - GSEvent(bool manual = false, bool initial = false) {m_hEvent = CreateEvent(NULL, manual, initial, NULL);} + GSEvent() {m_hEvent = CreateEvent(NULL, FALSE, FALSE, NULL);} ~GSEvent() {CloseHandle(m_hEvent);} void Set() {SetEvent(m_hEvent);} - void Reset() {ResetEvent(m_hEvent);} - bool Wait() {return WaitForSingleObject(m_hEvent, INFINITE) == WAIT_OBJECT_0;} + bool Wait(IGSLock* l) {if(l) l->Unlock(); bool b = WaitForSingleObject(m_hEvent, INFINITE) == WAIT_OBJECT_0; if(l) l->Lock(); return b;} }; -class GSCondVarLock +class GSCondVarLock : public IGSLock { SRWLOCK m_lock; @@ -92,12 +120,13 @@ public: GSCondVarLock() {pInitializeSRWLock(&m_lock);} void Lock() {pAcquireSRWLockExclusive(&m_lock);} + bool TryLock() {return pTryAcquireSRWLockExclusive(&m_lock) == TRUE;} void Unlock() {pReleaseSRWLockExclusive(&m_lock);} - + operator SRWLOCK* () {return &m_lock;} }; -class GSCondVar +class GSCondVar : public IGSEvent { CONDITION_VARIABLE m_cv; @@ -105,7 +134,7 @@ public: GSCondVar() {pInitializeConditionVariable(&m_cv);} void Set() {pWakeConditionVariable(&m_cv);} - void Wait(GSCondVarLock& lock) {pSleepConditionVariableSRW(&m_cv, lock, INFINITE, 0);} + bool Wait(IGSLock* l) {return pSleepConditionVariableSRW(&m_cv, *(GSCondVarLock*)l, INFINITE, 0) != 0;} operator CONDITION_VARIABLE* () {return &m_cv;} }; @@ -114,9 +143,8 @@ public: #include #include -#include "GSdx.h" -class GSThread +class GSThread : public IGSThread { pthread_attr_t m_thread_attr; pthread_t m_thread; @@ -124,8 +152,6 @@ class GSThread static void* StaticThreadProc(void* param); protected: - virtual void ThreadProc() = 0; - void CreateThread(); void CloseThread(); @@ -134,16 +160,16 @@ public: virtual ~GSThread(); }; -class GSCritSec +class GSCritSec : public IGSLock { pthread_mutexattr_t m_mutex_attr; pthread_mutex_t m_mutex; public: - GSCritSec() + GSCritSec(bool recursive = true) { pthread_mutexattr_init(&m_mutex_attr); - pthread_mutexattr_settype(&m_mutex_attr, PTHREAD_MUTEX_RECURSIVE); + pthread_mutexattr_settype(&m_mutex_attr, recursive ? PTHREAD_MUTEX_RECURSIVE : PTHREAD_MUTEX_NORMAL); pthread_mutex_init(&m_mutex, &m_mutex_attr); } @@ -158,7 +184,7 @@ public: void Unlock() {pthread_mutex_unlock(&m_mutex);} }; -class GSEvent +class GSEvent : public IGSEvent { protected: sem_t m_sem; @@ -168,35 +194,18 @@ public: ~GSEvent() {sem_destroy(&m_sem);} void Set() {sem_post(&m_sem);} - bool Wait() {return sem_wait(&m_sem) == 0;} + bool Wait(IGSLock* l) {if(l) l->Unlock(); bool b = sem_wait(&m_sem) == 0; if(l) l->Lock(); return b;} }; -// Note except the mutex attribute the code is same as GSCritSec object -class GSCondVarLock +class GSCondVarLock : public GSCritSec { - pthread_mutexattr_t m_mutex_attr; - pthread_mutex_t m_mutex; - public: - GSCondVarLock() + GSCondVarLock() : GSCritSec(false) { - pthread_mutexattr_init(&m_mutex_attr); - pthread_mutexattr_settype(&m_mutex_attr, PTHREAD_MUTEX_NORMAL); - pthread_mutex_init(&m_mutex, &m_mutex_attr); } - virtual ~GSCondVarLock() - { - pthread_mutex_destroy(&m_mutex); - pthread_mutexattr_destroy(&m_mutex_attr); - } - - void Lock() {pthread_mutex_lock(&m_mutex);} - void Unlock() {pthread_mutex_unlock(&m_mutex);} - - operator pthread_mutex_t* () {return &m_mutex;} }; -class GSCondVar +class GSCondVar : public IGSEvent { pthread_cond_t m_cv; pthread_condattr_t m_cv_attr; @@ -207,6 +216,7 @@ public: pthread_condattr_init(&m_cv_attr); pthread_cond_init(&m_cv, &m_cv_attr); } + virtual ~GSCondVar() { pthread_condattr_destroy(&m_cv_attr); @@ -214,7 +224,7 @@ public: } void Set() {pthread_cond_signal(&m_cv);} - void Wait(GSCondVarLock& lock) {pthread_cond_wait(&m_cv, lock);} + bool Wait(IGSLock* l) {pthread_cond_wait(&m_cv, *(GSCondVarLock*)l) == 0;} operator pthread_cond_t* () {return &m_cv;} }; @@ -223,102 +233,49 @@ public: class GSAutoLock { -protected: - GSCritSec* m_cs; + IGSLock* m_lock; public: - GSAutoLock(GSCritSec* cs) {m_cs = cs; m_cs->Lock();} - ~GSAutoLock() {m_cs->Unlock();} -}; - -class GSEventSpin -{ -protected: - volatile long m_sync; - volatile bool m_manual; - -public: - GSEventSpin(bool manual = false, bool initial = false) {m_sync = initial ? 1 : 0; m_manual = manual;} - ~GSEventSpin() {} - - void Set() {_interlockedbittestandset(&m_sync, 0);} - void Reset() {_interlockedbittestandreset(&m_sync, 0);} - bool Wait() - { - if(m_manual) while(!m_sync) _mm_pause(); - else while(!_interlockedbittestandreset(&m_sync, 0)) _mm_pause(); - return true; - } + GSAutoLock(IGSLock* l) {(m_lock = l)->Lock();} + ~GSAutoLock() {m_lock->Unlock();} }; template class GSJobQueue : private GSThread { protected: - int m_count; queue m_queue; + volatile long m_count; // NOTE: it is the safest to have our own counter because m_queue.pop() might decrement its own before the last item runs out of its scope and gets destroyed (implementation dependent) volatile bool m_exit; - struct {GSCritSec lock; GSEvent notempty; volatile long count;} m_ev; - struct {GSCondVar notempty, empty; GSCondVarLock lock; bool available;} m_cv; + IGSEvent* m_notempty; + IGSEvent* m_empty; + IGSLock* m_lock; void ThreadProc() { - if(m_cv.available) + m_lock->Lock(); + + while(true) { - m_cv.lock.Lock(); - - while(true) + while(m_queue.empty()) { - while(m_queue.empty()) - { - m_cv.notempty.Wait(m_cv.lock); + m_notempty->Wait(m_lock); - if(m_exit) {m_cv.lock.Unlock(); return;} - } - - T& item = m_queue.front(); - - m_cv.lock.Unlock(); - - Process(item); - - m_cv.lock.Lock(); - - m_queue.pop(); - - if(m_queue.empty()) - { - m_cv.empty.Set(); - } + if(m_exit) {m_lock->Unlock(); return;} } - } - else - { - m_ev.lock.Lock(); - while(true) + T& item = m_queue.front(); + + m_lock->Unlock(); + + Process(item); + + m_lock->Lock(); + + m_queue.pop(); + + if(--m_count == 0) { - while(m_queue.empty()) - { - m_ev.lock.Unlock(); - - m_ev.notempty.Wait(); - - if(m_exit) {return;} - - m_ev.lock.Lock(); - } - - T& item = m_queue.front(); - - m_ev.lock.Unlock(); - - Process(item); - - m_ev.lock.Lock(); - - m_queue.pop(); - - _InterlockedDecrement(&m_ev.count); + m_empty->Set(); } } } @@ -328,19 +285,30 @@ public: : m_count(0) , m_exit(false) { - m_ev.count = 0; + bool condvar = !!theApp.GetConfig("condvar", 1); #ifdef _WINDOWS - m_cv.available = pInitializeConditionVariable != NULL; - - #elif defined(_LINUX) - - //m_cv.available = true; - m_cv.available = !!theApp.GetConfig("condvar", 1); + if(pInitializeConditionVariable == NULL) + { + condvar = false; + } #endif + if(condvar) + { + m_notempty = new GSCondVar(); + m_empty = new GSCondVar(); + m_lock = new GSCondVarLock(); + } + else + { + m_notempty = new GSEvent(); + m_empty = new GSEvent(); + m_lock = new GSCritSec(); + } + CreateThread(); } @@ -348,68 +316,51 @@ public: { m_exit = true; - if(m_cv.available) - { - m_cv.notempty.Set(); - } - else - { - m_ev.notempty.Set(); - } + m_notempty->Set(); + + CloseThread(); + + delete m_notempty; + delete m_empty; + delete m_lock; } - int GetCount() const + bool IsEmpty() const { - return m_count; + ASSERT(m_count >= 0); + + return m_count == 0; } - virtual void Push(const T& item) + void Push(const T& item) { - if(m_cv.available) - { - m_cv.lock.Lock(); + m_lock->Lock(); - m_queue.push(item); + m_queue.push(item); - m_cv.lock.Unlock(); - - m_cv.notempty.Set(); - } - else + if(m_count++ == 0) { - GSAutoLock l(&m_ev.lock); - - m_queue.push(item); - - _InterlockedIncrement(&m_ev.count); - - m_ev.notempty.Set(); + m_notempty->Set(); } - m_count++; + m_lock->Unlock(); } - virtual void Wait() + void Wait() { - if(m_cv.available) + if(m_count > 0) { - m_cv.lock.Lock(); + m_lock->Lock(); - while(!m_queue.empty()) + while(m_count != 0) { - m_cv.empty.Wait(m_cv.lock); + m_empty->Wait(m_lock); } - m_cv.lock.Unlock(); + ASSERT(m_queue.empty()); + + m_lock->Unlock(); } - else - { - // NOTE: it is the safest to have our own counter because m_queue.pop() might decrement its own before the last item runs out of its scope and gets destroyed (implementation dependent) - - while(m_ev.count > 0) _mm_pause(); - } - - m_count++; } virtual void Process(T& item) = 0; diff --git a/plugins/GSdx/GSVector.cpp b/plugins/GSdx/GSVector.cpp index 47e724966c..27b6a5a0e9 100644 --- a/plugins/GSdx/GSVector.cpp +++ b/plugins/GSdx/GSVector.cpp @@ -22,6 +22,48 @@ #include "stdafx.h" #include "GSVector.h" +const GSVector4i GSVector4i::m_xff[17] = +{ + GSVector4i(0x00000000, 0x00000000, 0x00000000, 0x00000000), + GSVector4i(0x000000ff, 0x00000000, 0x00000000, 0x00000000), + GSVector4i(0x0000ffff, 0x00000000, 0x00000000, 0x00000000), + GSVector4i(0x00ffffff, 0x00000000, 0x00000000, 0x00000000), + GSVector4i(0xffffffff, 0x00000000, 0x00000000, 0x00000000), + GSVector4i(0xffffffff, 0x000000ff, 0x00000000, 0x00000000), + GSVector4i(0xffffffff, 0x0000ffff, 0x00000000, 0x00000000), + GSVector4i(0xffffffff, 0x00ffffff, 0x00000000, 0x00000000), + GSVector4i(0xffffffff, 0xffffffff, 0x00000000, 0x00000000), + GSVector4i(0xffffffff, 0xffffffff, 0x000000ff, 0x00000000), + GSVector4i(0xffffffff, 0xffffffff, 0x0000ffff, 0x00000000), + GSVector4i(0xffffffff, 0xffffffff, 0x00ffffff, 0x00000000), + GSVector4i(0xffffffff, 0xffffffff, 0xffffffff, 0x00000000), + GSVector4i(0xffffffff, 0xffffffff, 0xffffffff, 0x000000ff), + GSVector4i(0xffffffff, 0xffffffff, 0xffffffff, 0x0000ffff), + GSVector4i(0xffffffff, 0xffffffff, 0xffffffff, 0x00ffffff), + GSVector4i(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff), +}; + +const GSVector4i GSVector4i::m_x0f[17] = +{ + GSVector4i(0x00000000, 0x00000000, 0x00000000, 0x00000000), + GSVector4i(0x0000000f, 0x00000000, 0x00000000, 0x00000000), + GSVector4i(0x00000f0f, 0x00000000, 0x00000000, 0x00000000), + GSVector4i(0x000f0f0f, 0x00000000, 0x00000000, 0x00000000), + GSVector4i(0x0f0f0f0f, 0x00000000, 0x00000000, 0x00000000), + GSVector4i(0x0f0f0f0f, 0x0000000f, 0x00000000, 0x00000000), + GSVector4i(0x0f0f0f0f, 0x00000f0f, 0x00000000, 0x00000000), + GSVector4i(0x0f0f0f0f, 0x000f0f0f, 0x00000000, 0x00000000), + GSVector4i(0x0f0f0f0f, 0x0f0f0f0f, 0x00000000, 0x00000000), + GSVector4i(0x0f0f0f0f, 0x0f0f0f0f, 0x0000000f, 0x00000000), + GSVector4i(0x0f0f0f0f, 0x0f0f0f0f, 0x00000f0f, 0x00000000), + GSVector4i(0x0f0f0f0f, 0x0f0f0f0f, 0x000f0f0f, 0x00000000), + GSVector4i(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x00000000), + GSVector4i(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0000000f), + GSVector4i(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x00000f0f), + GSVector4i(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x000f0f0f), + GSVector4i(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f), +}; + const GSVector4 GSVector4::m_ps0123(0.0f, 1.0f, 2.0f, 3.0f); const GSVector4 GSVector4::m_ps4567(4.0f, 5.0f, 6.0f, 7.0f); const GSVector4 GSVector4::m_half(0.5f); diff --git a/plugins/GSdx/GSVector.h b/plugins/GSdx/GSVector.h index 98afea5d9f..de20807352 100644 --- a/plugins/GSdx/GSVector.h +++ b/plugins/GSdx/GSVector.h @@ -79,6 +79,9 @@ class GSVector4; __aligned(class, 16) GSVector4i { + static const GSVector4i m_xff[17]; + static const GSVector4i m_x0f[17]; + public: union { @@ -2343,6 +2346,9 @@ public: __forceinline static GSVector4i xfff8(const GSVector4i& v) {return xffffffff(v).sll16( 3);} __forceinline static GSVector4i xfffc(const GSVector4i& v) {return xffffffff(v).sll16( 2);} __forceinline static GSVector4i xfffe(const GSVector4i& v) {return xffffffff(v).sll16( 1);} + + __forceinline static GSVector4i xff(int n) {return m_xff[n];} + __forceinline static GSVector4i x0f(int n) {return m_x0f[n];} }; __aligned(class, 16) GSVector4 @@ -2909,6 +2915,11 @@ public: return GSVector4(aligned ? _mm_load_ps((const float*)p) : _mm_loadu_ps((const float*)p)); } + __forceinline static void storent(void* p, const GSVector4& v) + { + _mm_stream_ps((float*)p, v.m); + } + __forceinline static void storel(void* p, const GSVector4& v) { _mm_store_sd((double*)p, _mm_castps_pd(v.m)); diff --git a/plugins/GSdx/GSVertex.h b/plugins/GSdx/GSVertex.h index fb69cb86f0..94457f9560 100644 --- a/plugins/GSdx/GSVertex.h +++ b/plugins/GSdx/GSVertex.h @@ -37,7 +37,8 @@ __aligned(struct, 32) GSVertex GIFRegST ST; GIFRegRGBAQ RGBAQ; GIFRegXYZ XYZ; - uint32 UV, FOG; + union {uint32 UV; struct {uint16 U, V;};}; + uint32 FOG; }; __m128i m[2]; diff --git a/plugins/GSdx/GSVertexHW.h b/plugins/GSdx/GSVertexHW.h index 6262e28804..0f39edb836 100644 --- a/plugins/GSdx/GSVertexHW.h +++ b/plugins/GSdx/GSVertexHW.h @@ -37,45 +37,4 @@ __aligned(struct, 32) GSVertexHW9 GSVertexHW9& operator = (GSVertexHW9& v) {t = v.t; p = v.p; return *this;} }; -__aligned(union, 32) GSVertexHW11 -{ - struct - { - union - { - struct {float x, y;} t; - GIFRegST ST; - }; - - union - { - union {struct {uint8 r, g, b, a; float q;}; uint32 c0;}; - GIFRegRGBAQ RGBAQ; - }; - - union - { - struct {union {struct {uint16 x, y;}; uint32 xy;}; uint32 z;} p; - GIFRegXYZ XYZ; - }; - - union - { - struct {uint32 _pad; union {struct {uint8 ta0, ta1, res, f;}; uint32 c1;};}; - GIFRegFOG FOG; - }; - }; - - GSVertexHW11& operator = (GSVertexHW11& v) - { - GSVector4i* RESTRICT src = (GSVector4i*)&v; - GSVector4i* RESTRICT dst = (GSVector4i*)this; - - dst[0] = src[0]; - dst[1] = src[1]; - - return *this; - } -}; - #pragma pack(pop) diff --git a/plugins/GSdx/GSVertexTrace.cpp b/plugins/GSdx/GSVertexTrace.cpp index 413d5799aa..955c1bc801 100644 --- a/plugins/GSdx/GSVertexTrace.cpp +++ b/plugins/GSdx/GSVertexTrace.cpp @@ -29,10 +29,38 @@ const GSVector4 GSVertexTrace::s_minmax(FLT_MAX, -FLT_MAX); GSVertexTrace::GSVertexTrace(const GSState* state) : m_state(state) { + #define InitUpdate3(P, IIP, TME, FST, COLOR) \ + m_fmm[COLOR][FST][TME][IIP][P] = &GSVertexTrace::FindMinMax; + + #define InitUpdate2(P, IIP, TME) \ + InitUpdate3(P, IIP, TME, 0, 0) \ + InitUpdate3(P, IIP, TME, 0, 1) \ + InitUpdate3(P, IIP, TME, 1, 0) \ + InitUpdate3(P, IIP, TME, 1, 1) \ + + #define InitUpdate(P) \ + InitUpdate2(P, 0, 0) \ + InitUpdate2(P, 0, 1) \ + InitUpdate2(P, 1, 0) \ + InitUpdate2(P, 1, 1) \ + + InitUpdate(GS_POINT_CLASS); + InitUpdate(GS_LINE_CLASS); + InitUpdate(GS_TRIANGLE_CLASS); + InitUpdate(GS_SPRITE_CLASS); } void GSVertexTrace::Update(const void* vertex, const uint32* index, int count, GS_PRIM_CLASS primclass) { + m_primclass = primclass; + + uint32 iip = m_state->PRIM->IIP; + uint32 tme = m_state->PRIM->TME; + uint32 fst = m_state->PRIM->FST; + uint32 color = !(m_state->PRIM->TME && m_state->m_context->TEX0.TFX == TFX_DECAL && m_state->m_context->TEX0.TCC); + + (this->*m_fmm[color][fst][tme][iip][primclass])(vertex, index, count); + m_eq.value = (m_min.c == m_max.c).mask() | ((m_min.p == m_max.p).mask() << 16) | ((m_min.t == m_max.t).mask() << 20); m_alpha.valid = false; @@ -82,90 +110,350 @@ void GSVertexTrace::Update(const void* vertex, const uint32* index, int count, G } } -uint32 GSVertexTrace::Hash(GS_PRIM_CLASS primclass) +template +void GSVertexTrace::FindMinMax(const void* vertex, const uint32* index, int count) { - m_primclass = primclass; - - uint32 hash = m_primclass | (m_state->PRIM->IIP << 2) | (m_state->PRIM->TME << 3) | (m_state->PRIM->FST << 4); - - if(!(m_state->PRIM->TME && m_state->m_context->TEX0.TFX == TFX_DECAL && m_state->m_context->TEX0.TCC)) - { - hash |= 1 << 5; - } - - return hash; -} - -GSVertexTraceSW::GSVertexTraceSW(const GSState* state) - : GSVertexTrace(state) - , m_map("VertexTraceSW", NULL) -{ -} - -void GSVertexTraceSW::Update(const void* vertex, const uint32* index, int count, GS_PRIM_CLASS primclass) -{ - m_map[Hash(primclass)](count, vertex, index, m_min, m_max); - - GSVertexTrace::Update(vertex, index, count, primclass); -} - -GSVertexTraceDX9::GSVertexTraceDX9(const GSState* state) - : GSVertexTrace(state) - , m_map("VertexTraceHW9", NULL) -{ -} - -void GSVertexTraceDX9::Update(const void* vertex, const uint32* index, int count, GS_PRIM_CLASS primclass) -{ - m_map[Hash(primclass)](count, vertex, index, m_min, m_max); - const GSDrawingContext* context = m_state->m_context; - GSVector4 o(context->XYOFFSET); - GSVector4 s(1.0f / 16, 1.0f / 16, 1.0f, 1.0f); + int n = 1; - m_min.p = (m_min.p - o) * s; - m_max.p = (m_max.p - o) * s; - - if(m_state->PRIM->TME) + switch(primclass) { - if(m_state->PRIM->FST) - { - s = GSVector4(1 << (16 - 4), 1).xxyy(); - } - else - { - s = GSVector4(0x10000 << context->TEX0.TW, 0x10000 << context->TEX0.TH, 1, 1); - } - - m_min.t *= s; - m_max.t *= s; + case GS_POINT_CLASS: + n = 1; + break; + case GS_LINE_CLASS: + case GS_SPRITE_CLASS: + n = 2; + break; + case GS_TRIANGLE_CLASS: + n = 3; + break; } - GSVertexTrace::Update(vertex, index, count, primclass); -} + GSVector4 tmin = s_minmax.xxxx(); + GSVector4 tmax = s_minmax.yyyy(); + GSVector4i cmin = GSVector4i::xffffffff(); + GSVector4i cmax = GSVector4i::zero(); -GSVertexTraceDX11::GSVertexTraceDX11(const GSState* state) - : GSVertexTrace(state) - , m_map("VertexTraceHW11", NULL) -{ -} + #if _M_SSE >= 0x401 -void GSVertexTraceDX11::Update(const void* vertex, const uint32* index, int count, GS_PRIM_CLASS primclass) -{ - m_map[Hash(primclass)](count, vertex, index, m_min, m_max); + GSVector4i pmin = GSVector4i::xffffffff(); + GSVector4i pmax = GSVector4i::zero(); - const GSDrawingContext* context = m_state->m_context; + #else + + GSVector4 pmin = s_minmax.xxxx(); + GSVector4 pmax = s_minmax.yyyy(); + + #endif + + const GSVertex* RESTRICT v = (GSVertex*)vertex; + + for(int i = 0; i < count; i += n) + { + if(primclass == GS_POINT_CLASS) + { + GSVector4i c(v[index[i]].m[0]); + + if(color) + { + cmin = cmin.min_u8(c); + cmax = cmax.max_u8(c); + } + + if(tme) + { + if(!fst) + { + GSVector4 stq = GSVector4::cast(c); + + GSVector4 q = stq.wwww(); + + stq = (stq.xyww() * q.rcpnr()).xyww(q); + + tmin = tmin.min(stq); + tmax = tmax.max(stq); + } + else + { + GSVector4i uv(v[index[i]].m[1]); + + GSVector4 st = GSVector4(uv.uph16()).xyxy(); + + tmin = tmin.min(st); + tmax = tmax.max(st); + } + } + + GSVector4i xyzf(v[index[i]].m[1]); + + GSVector4i xy = xyzf.upl16(); + GSVector4i z = xyzf.yyyy(); + + #if _M_SSE >= 0x401 + + GSVector4i p = xy.blend16<0xf0>(z.uph32(xyzf)); + + pmin = pmin.min_u32(p); + pmax = pmax.max_u32(p); + + #else + + GSVector4 p = GSVector4(xy.upl64(z.srl32(1).upl32(xyzf.wwww()))); + + pmin = pmin.min(p); + pmax = pmax.max(p); + + #endif + } + else if(primclass == GS_LINE_CLASS) + { + GSVector4i c0(v[index[i + 0]].m[0]); + GSVector4i c1(v[index[i + 1]].m[0]); + + if(color) + { + if(iip) + { + cmin = cmin.min_u8(c0.min_u8(c1)); + cmax = cmax.max_u8(c0.max_u8(c1)); + } + else + { + cmin = cmin.min_u8(c1); + cmax = cmax.max_u8(c1); + } + } + + if(tme) + { + if(!fst) + { + GSVector4 stq0 = GSVector4::cast(c0); + GSVector4 stq1 = GSVector4::cast(c1); + + GSVector4 q = stq0.wwww(stq1).rcpnr(); + + stq0 = (stq0.xyww() * q.xxxx()).xyww(stq0); + stq1 = (stq1.xyww() * q.zzzz()).xyww(stq1); + + tmin = tmin.min(stq0.min(stq1)); + tmax = tmax.max(stq0.max(stq1)); + } + else + { + GSVector4i uv0(v[index[i + 0]].m[1]); + GSVector4i uv1(v[index[i + 1]].m[1]); + + GSVector4 st0 = GSVector4(uv0.uph16()).xyxy(); + GSVector4 st1 = GSVector4(uv1.uph16()).xyxy(); + + tmin = tmin.min(st0.min(st1)); + tmax = tmax.max(st0.max(st1)); + } + } + + GSVector4i xyzf0(v[index[i + 0]].m[1]); + GSVector4i xyzf1(v[index[i + 1]].m[1]); + + GSVector4i xy0 = xyzf0.upl16(); + GSVector4i z0 = xyzf0.yyyy(); + GSVector4i xy1 = xyzf1.upl16(); + GSVector4i z1 = xyzf1.yyyy(); + + #if _M_SSE >= 0x401 + + GSVector4i p0 = xy0.blend16<0xf0>(z0.uph32(xyzf0)); + GSVector4i p1 = xy1.blend16<0xf0>(z1.uph32(xyzf1)); + + pmin = pmin.min_u32(p0.min_u32(p1)); + pmax = pmax.max_u32(p0.max_u32(p1)); + + #else + + GSVector4 p0 = GSVector4(xy0.upl64(z0.srl32(1).upl32(xyzf0.wwww()))); + GSVector4 p1 = GSVector4(xy1.upl64(z1.srl32(1).upl32(xyzf1.wwww()))); + + pmin = pmin.min(p0.min(p1)); + pmax = pmax.max(p0.max(p1)); + + #endif + } + else if(primclass == GS_TRIANGLE_CLASS) + { + GSVector4i c0(v[index[i + 0]].m[0]); + GSVector4i c1(v[index[i + 1]].m[0]); + GSVector4i c2(v[index[i + 2]].m[0]); + + if(color) + { + if(iip) + { + cmin = cmin.min_u8(c2).min_u8(c0.min_u8(c1)); + cmax = cmax.max_u8(c2).max_u8(c0.max_u8(c1)); + } + else + { + cmin = cmin.min_u8(c2); + cmax = cmax.max_u8(c2); + } + } + + if(tme) + { + if(!fst) + { + GSVector4 stq0 = GSVector4::cast(c0); + GSVector4 stq1 = GSVector4::cast(c1); + GSVector4 stq2 = GSVector4::cast(c2); + + GSVector4 q = stq0.wwww(stq1).xzww(stq2).rcpnr(); + + stq0 = (stq0.xyww() * q.xxxx()).xyww(stq0); + stq1 = (stq1.xyww() * q.yyyy()).xyww(stq1); + stq2 = (stq2.xyww() * q.zzzz()).xyww(stq2); + + tmin = tmin.min(stq2).min(stq0.min(stq1)); + tmax = tmax.max(stq2).max(stq0.max(stq1)); + } + else + { + GSVector4i uv0(v[index[i + 0]].m[1]); + GSVector4i uv1(v[index[i + 1]].m[1]); + GSVector4i uv2(v[index[i + 2]].m[1]); + + GSVector4 st0 = GSVector4(uv0.uph16()).xyxy(); + GSVector4 st1 = GSVector4(uv1.uph16()).xyxy(); + GSVector4 st2 = GSVector4(uv2.uph16()).xyxy(); + + tmin = tmin.min(st2).min(st0.min(st1)); + tmax = tmax.max(st2).max(st0.max(st1)); + } + } + + GSVector4i xyzf0(v[index[i + 0]].m[1]); + GSVector4i xyzf1(v[index[i + 1]].m[1]); + GSVector4i xyzf2(v[index[i + 2]].m[1]); + + GSVector4i xy0 = xyzf0.upl16(); + GSVector4i z0 = xyzf0.yyyy(); + GSVector4i xy1 = xyzf1.upl16(); + GSVector4i z1 = xyzf1.yyyy(); + GSVector4i xy2 = xyzf2.upl16(); + GSVector4i z2 = xyzf2.yyyy(); + + #if _M_SSE >= 0x401 + + GSVector4i p0 = xy0.blend16<0xf0>(z0.uph32(xyzf0)); + GSVector4i p1 = xy1.blend16<0xf0>(z1.uph32(xyzf1)); + GSVector4i p2 = xy2.blend16<0xf0>(z2.uph32(xyzf2)); + + pmin = pmin.min_u32(p2).min_u32(p0.min_u32(p1)); + pmax = pmax.max_u32(p2).max_u32(p0.max_u32(p1)); + + #else + + GSVector4 p0 = GSVector4(xy0.upl64(z0.srl32(1).upl32(xyzf0.wwww()))); + GSVector4 p1 = GSVector4(xy1.upl64(z1.srl32(1).upl32(xyzf1.wwww()))); + GSVector4 p2 = GSVector4(xy2.upl64(z2.srl32(1).upl32(xyzf2.wwww()))); + + pmin = pmin.min(p2).min(p0.min(p1)); + pmax = pmax.max(p2).max(p0.max(p1)); + + #endif + } + else if(primclass == GS_SPRITE_CLASS) + { + GSVector4i c0(v[index[i + 0]].m[0]); + GSVector4i c1(v[index[i + 1]].m[0]); + + if(color) + { + if(iip) + { + cmin = cmin.min_u8(c0.min_u8(c1)); + cmax = cmax.max_u8(c0.max_u8(c1)); + } + else + { + cmin = cmin.min_u8(c1); + cmax = cmax.max_u8(c1); + } + } + + if(tme) + { + if(!fst) + { + GSVector4 stq0 = GSVector4::cast(c0); + GSVector4 stq1 = GSVector4::cast(c1); + + GSVector4 q = stq1.wwww().rcpnr(); + + stq0 = (stq0.xyww() * q).xyww(stq1); + stq1 = (stq1.xyww() * q).xyww(stq1); + + tmin = tmin.min(stq0.min(stq1)); + tmax = tmax.max(stq0.max(stq1)); + } + else + { + GSVector4i uv0(v[index[i + 0]].m[1]); + GSVector4i uv1(v[index[i + 1]].m[1]); + + GSVector4 st0 = GSVector4(uv0.uph16()).xyxy(); + GSVector4 st1 = GSVector4(uv1.uph16()).xyxy(); + + tmin = tmin.min(st0.min(st1)); + tmax = tmax.max(st0.max(st1)); + } + } + + GSVector4i xyzf0(v[index[i + 0]].m[1]); + GSVector4i xyzf1(v[index[i + 1]].m[1]); + + GSVector4i xy0 = xyzf0.upl16(); + GSVector4i z0 = xyzf0.yyyy(); + GSVector4i xy1 = xyzf1.upl16(); + GSVector4i z1 = xyzf1.yyyy(); + + #if _M_SSE >= 0x401 + + GSVector4i p0 = xy0.blend16<0xf0>(z0.uph32(xyzf1)); + GSVector4i p1 = xy1.blend16<0xf0>(z1.uph32(xyzf1)); + + pmin = pmin.min_u32(p0.min_u32(p1)); + pmax = pmax.max_u32(p0.max_u32(p1)); + + #else + + GSVector4 p0 = GSVector4(xy0.upl64(z0.srl32(1).upl32(xyzf1.wwww()))); + GSVector4 p1 = GSVector4(xy1.upl64(z1.srl32(1).upl32(xyzf1.wwww()))); + + pmin = pmin.min(p0.min(p1)); + pmax = pmax.max(p0.max(p1)); + + #endif + } + } + + #if _M_SSE >= 0x401 + + pmin = pmin.blend16<0x30>(pmin.srl32(1)); + pmax = pmax.blend16<0x30>(pmax.srl32(1)); + + #endif GSVector4 o(context->XYOFFSET); GSVector4 s(1.0f / 16, 1.0f / 16, 2.0f, 1.0f); - m_min.p = (m_min.p - o) * s; - m_max.p = (m_max.p - o) * s; + m_min.p = (GSVector4(pmin) - o) * s; + m_max.p = (GSVector4(pmax) - o) * s; - if(m_state->PRIM->TME) + if(tme) { - if(m_state->PRIM->FST) + if(fst) { s = GSVector4(1 << (16 - 4), 1).xxyy(); } @@ -174,10 +462,23 @@ void GSVertexTraceDX11::Update(const void* vertex, const uint32* index, int coun s = GSVector4(0x10000 << context->TEX0.TW, 0x10000 << context->TEX0.TH, 1, 1); } - m_min.t *= s; - m_max.t *= s; + m_min.t = tmin * s; + m_max.t = tmax * s; + } + else + { + m_min.t = GSVector4::zero(); + m_max.t = GSVector4::zero(); } - GSVertexTrace::Update(vertex, index, count, primclass); + if(color) + { + m_min.c = cmin.zzzz().u8to32(); + m_max.c = cmax.zzzz().u8to32(); + } + else + { + m_min.c = GSVector4i::zero(); + m_max.c = GSVector4i::zero(); + } } - diff --git a/plugins/GSdx/GSVertexTrace.h b/plugins/GSdx/GSVertexTrace.h index 4c0a5653b0..a5734fafc7 100644 --- a/plugins/GSdx/GSVertexTrace.h +++ b/plugins/GSdx/GSVertexTrace.h @@ -38,12 +38,15 @@ public: protected: const GSState* m_state; - uint32 Hash(GS_PRIM_CLASS primclass); - - typedef void (*VertexTracePtr)(int count, const void* vertex, const uint32* index, Vertex& min, Vertex& max); - static const GSVector4 s_minmax; + typedef void (GSVertexTrace::*FindMinMaxPtr)(const void* vertex, const uint32* index, int count); + + FindMinMaxPtr m_fmm[2][2][2][2][4]; + + template + void FindMinMax(const void* vertex, const uint32* index, int count); + public: GS_PRIM_CLASS m_primclass; @@ -69,55 +72,7 @@ public: GSVertexTrace(const GSState* state); virtual ~GSVertexTrace() {} - virtual void Update(const void* vertex, const uint32* index, int count, GS_PRIM_CLASS primclass); + void Update(const void* vertex, const uint32* index, int count, GS_PRIM_CLASS primclass); bool IsLinear() const {return m_filter.linear;} }; - -__aligned(class, 32) GSVertexTraceSW : public GSVertexTrace -{ - class CG : public GSCodeGenerator - { - public: - CG(const void* param, uint32 key, void* code, size_t maxsize); - }; - - GSCodeGeneratorFunctionMap m_map; - -public: - GSVertexTraceSW(const GSState* state); - - void Update(const void* vertex, const uint32* index, int count, GS_PRIM_CLASS primclass); -}; - -__aligned(class, 32) GSVertexTraceDX9 : public GSVertexTrace -{ - class CG : public GSCodeGenerator - { - public: - CG(const void* param, uint32 key, void* code, size_t maxsize); - }; - - GSCodeGeneratorFunctionMap m_map; - -public: - GSVertexTraceDX9(const GSState* state); - - void Update(const void* vertex, const uint32* index, int count, GS_PRIM_CLASS primclass); -}; - -__aligned(class, 32) GSVertexTraceDX11 : public GSVertexTrace -{ - class CG : public GSCodeGenerator - { - public: - CG(const void* param, uint32 key, void* code, size_t maxsize); - }; - - GSCodeGeneratorFunctionMap m_map; - -public: - GSVertexTraceDX11(const GSState* state); - - void Update(const void* vertex, const uint32* index, int count, GS_PRIM_CLASS primclass); -}; diff --git a/plugins/GSdx/GSVertexTrace.x64.avx.cpp b/plugins/GSdx/GSVertexTrace.x64.avx.cpp deleted file mode 100644 index 880e5644e4..0000000000 --- a/plugins/GSdx/GSVertexTrace.x64.avx.cpp +++ /dev/null @@ -1,496 +0,0 @@ -/* - * Copyright (C) 2007-2009 Gabest - * http://www.gabest.org - * - * This Program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2, or (at your option) - * any later version. - * - * This Program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with GNU Make; see the file COPYING. If not, write to - * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. - * http://www.gnu.org/copyleft/gpl.html - * - */ - -#include "stdafx.h" -#include "GSVertexTrace.h" - -#if _M_SSE >= 0x500 && (defined(_M_AMD64) || defined(_WIN64)) - -using namespace Xbyak; - -GSVertexTrace::CGSW::CGSW(const void* param, uint32 key, void* code, size_t maxsize) - : GSCodeGenerator(code, maxsize) -{ - uint32 primclass = (key >> 0) & 3; - uint32 iip = (key >> 2) & 1; - uint32 tme = (key >> 3) & 1; - uint32 fst = (key >> 4) & 1; - uint32 color = (key >> 5) & 1; - - int n = 1; - - switch(primclass) - { - case GS_POINT_CLASS: - n = 1; - break; - case GS_LINE_CLASS: - case GS_SPRITE_CLASS: - n = 2; - break; - case GS_TRIANGLE_CLASS: - n = 3; - break; - } - - sub(rsp, 8 + 2 * 16); - - vmovdqa(ptr[rsp + 0], xmm6); - vmovdqa(ptr[rsp + 16], xmm7); - - // min.p = FLT_MAX; - // max.p = -FLT_MAX; - - mov(rax, (size_t)&s_minmax); - - vbroadcastss(xmm4, ptr[rax + 0]); - vbroadcastss(xmm5, ptr[rax + 4]); - - if(color) - { - // min.c = FLT_MAX; - // max.c = -FLT_MAX; - - vmovaps(xmm2, xmm4); - vmovaps(xmm3, xmm5); - } - - if(tme) - { - // min.t = FLT_MAX; - // max.t = -FLT_MAX; - - vmovaps(xmm6, xmm4); - vmovaps(xmm7, xmm5); - } - - // for(int i = 0; i < count; i += step) { - - align(16); - - L("loop"); - - if(tme && !fst && primclass == GS_SPRITE_CLASS) - { - vmovaps(xmm1, ptr[rdx + 1 * sizeof(GSVertexSW) + offsetof(GSVertexSW, t)]); - vshufps(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2)); - } - - for(int j = 0; j < n; j++) - { - if(color && (iip || j == n - 1)) - { - // min.c = min.c.minv(v[i + j].c); - // max.c = max.c.maxv(v[i + j].c); - - vmovaps(xmm0, ptr[rdx + j * sizeof(GSVertexSW) + offsetof(GSVertexSW, c)]); - - vminps(xmm2, xmm0); - vmaxps(xmm3, xmm0); - } - - // min.p = min.p.minv(v[i + j].p); - // max.p = max.p.maxv(v[i + j].p); - - vmovaps(xmm0, ptr[rdx + j * sizeof(GSVertexSW) + offsetof(GSVertexSW, p)]); - - vminps(xmm4, xmm0); - vmaxps(xmm5, xmm0); - - if(tme) - { - // min.t = min.t.minv(v[i + j].t); - // max.t = max.t.maxv(v[i + j].t); - - vmovaps(xmm0, ptr[rdx + j * sizeof(GSVertexSW) + offsetof(GSVertexSW, t)]); - - if(!fst) - { - if(primclass != GS_SPRITE_CLASS) - { - vshufps(xmm1, xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2)); - } - - vdivps(xmm0, xmm1); - vshufps(xmm0, xmm1, _MM_SHUFFLE(3, 2, 1, 0)); - } - - vminps(xmm6, xmm0); - vmaxps(xmm7, xmm0); - } - } - - add(rdx, n * sizeof(GSVertexSW)); - sub(ecx, n); - - jg("loop"); - - // } - - if(color) - { - vcvttps2dq(xmm2, xmm2); - vpsrld(xmm2, 7); - vmovaps(ptr[r8 + offsetof(GSVertexTrace::Vertex, c)], xmm2); - - vcvttps2dq(xmm3, xmm3); - vpsrld(xmm3, 7); - vmovaps(ptr[r9 + offsetof(GSVertexTrace::Vertex, c)], xmm3); - } - - vmovaps(ptr[r8 + offsetof(GSVertexTrace::Vertex, p)], xmm4); - vmovaps(ptr[r9 + offsetof(GSVertexTrace::Vertex, p)], xmm5); - - if(tme) - { - vmovaps(ptr[r8 + offsetof(GSVertexTrace::Vertex, t)], xmm6); - vmovaps(ptr[r9 + offsetof(GSVertexTrace::Vertex, t)], xmm7); - } - - vmovdqa(xmm6, ptr[rsp + 0]); - vmovdqa(xmm7, ptr[rsp + 16]); - - add(rsp, 8 + 2 * 16); - - ret(); -} - -GSVertexTrace::CGHW9::CGHW9(const void* param, uint32 key, void* code, size_t maxsize) - : GSCodeGenerator(code, maxsize) -{ - uint32 primclass = (key >> 0) & 3; - uint32 iip = (key >> 2) & 1; - uint32 tme = (key >> 3) & 1; - uint32 fst = (key >> 4) & 1; - uint32 color = (key >> 5) & 1; - - int n = 1; - - switch(primclass) - { - case GS_POINT_CLASS: - n = 1; - break; - case GS_LINE_CLASS: - n = 2; - break; - case GS_TRIANGLE_CLASS: - n = 3; - break; - case GS_SPRITE_CLASS: - n = 6; - break; - } - - sub(rsp, 8 + 2 * 16); - - vmovdqa(ptr[rsp + 0], xmm6); - vmovdqa(ptr[rsp + 16], xmm7); - - // min.p = FLT_MAX; - // max.p = -FLT_MAX; - - mov(rax, (size_t)&s_minmax); - - vbroadcastss(xmm4, ptr[rax + 0]); - vbroadcastss(xmm5, ptr[rax + 4]); - - if(color) - { - // min.c = 0xffffffff; - // max.c = 0; - - vpcmpeqd(xmm2, xmm2); - vpxor(xmm3, xmm3); - } - - if(tme) - { - // min.t = FLT_MAX; - // max.t = -FLT_MAX; - - vmovaps(xmm6, xmm4); - vmovaps(xmm7, xmm5); - } - - // for(int i = 0; i < count; i += step) { - - align(16); - - L("loop"); - - if(tme && !fst && primclass == GS_SPRITE_CLASS) - { - vmovaps(xmm1, ptr[rdx + 5 * sizeof(GSVertexHW9) + offsetof(GSVertexHW9, p)]); - vshufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3)); - } - - for(int j = 0; j < n; j++) - { - // min.p = min.p.minv(v[i + j].p); - // max.p = max.p.maxv(v[i + j].p); - - vmovaps(xmm0, ptr[rdx + j * sizeof(GSVertexHW9) + offsetof(GSVertexHW9, p)]); - - vminps(xmm4, xmm0); - vmaxps(xmm5, xmm0); - - if(tme && !fst && primclass != GS_SPRITE_CLASS) - { - vshufps(xmm1, xmm0, xmm0, _MM_SHUFFLE(3, 3, 3, 3)); - } - - if(color && (iip || j == n - 1) || tme) - { - vmovaps(xmm0, ptr[rdx + j * sizeof(GSVertexHW9) + offsetof(GSVertexHW9, t)]); - } - - if(color && (iip || j == n - 1)) - { - // min.c = min.c.min_u8(v[i + j].c); - // max.c = max.c.min_u8(v[i + j].c); - - vpminub(xmm2, xmm0); - vpmaxub(xmm3, xmm0); - } - - if(tme) - { - vshufps(xmm0, xmm0, _MM_SHUFFLE(1, 0, 1, 0)); // avoid FP assist, high part is integral - - if(!fst) - { - // t /= p.wwww(); - - vdivps(xmm0, xmm1); - } - - // min.t = min.t.minv(v[i + j].t); - // max.t = max.t.maxv(v[i + j].t); - - vminps(xmm6, xmm0); - vmaxps(xmm7, xmm0); - } - } - - add(rdx, n * sizeof(GSVertexHW9)); - sub(ecx, n); - - jg("loop"); - - // } - - if(color) - { - // m_min.c = cmin.zzzz().u8to32(); - // m_max.c = cmax.zzzz().u8to32(); - - vpshufd(xmm2, xmm2, _MM_SHUFFLE(2, 2, 2, 2)); - vpmovzxbd(xmm2, xmm2); - - vpshufd(xmm3, xmm3, _MM_SHUFFLE(2, 2, 2, 2)); - vpmovzxbd(xmm3, xmm3); - - vmovaps(ptr[r8 + offsetof(GSVertexTrace::Vertex, c)], xmm2); - vmovaps(ptr[r9 + offsetof(GSVertexTrace::Vertex, c)], xmm3); - } - - // m_min.p = pmin; - // m_max.p = pmax; - - vmovaps(ptr[r8 + offsetof(GSVertexTrace::Vertex, p)], xmm4); - vmovaps(ptr[r9 + offsetof(GSVertexTrace::Vertex, p)], xmm5); - - if(tme) - { - // m_min.t = tmin.xyww(pmin); - // m_max.t = tmax.xyww(pmax); - - vshufps(xmm6, xmm4, _MM_SHUFFLE(3, 3, 1, 0)); - vshufps(xmm7, xmm5, _MM_SHUFFLE(3, 3, 1, 0)); - - vmovaps(ptr[r8 + offsetof(GSVertexTrace::Vertex, t)], xmm6); - vmovaps(ptr[r9 + offsetof(GSVertexTrace::Vertex, t)], xmm7); - } - - vmovdqa(xmm6, ptr[rsp + 0]); - vmovdqa(xmm7, ptr[rsp + 16]); - - add(rsp, 8 + 2 * 16); - - ret(); -} - -GSVertexTrace::CGHW11::CGHW11(const void* param, uint32 key, void* code, size_t maxsize) - : GSCodeGenerator(code, maxsize) -{ - uint32 primclass = (key >> 0) & 3; - uint32 iip = (key >> 2) & 1; - uint32 tme = (key >> 3) & 1; - uint32 fst = (key >> 4) & 1; - uint32 color = (key >> 5) & 1; - - int n = 1; - - switch(primclass) - { - case GS_POINT_CLASS: - n = 1; - break; - case GS_LINE_CLASS: - case GS_SPRITE_CLASS: - n = 2; - break; - case GS_TRIANGLE_CLASS: - n = 3; - break; - } - - sub(rsp, 8 + 2 * 16); - - vmovdqa(ptr[rsp + 0], xmm6); - vmovdqa(ptr[rsp + 16], xmm7); - - // min.p = FLT_MAX; - // max.p = -FLT_MAX; - - mov(rax, (size_t)&s_minmax); - - vbroadcastss(xmm4, ptr[rax + 0]); - vbroadcastss(xmm5, ptr[rax + 4]); - - if(color) - { - // min.c = 0xffffffff; - // max.c = 0; - - vpcmpeqd(xmm2, xmm2); - vpxor(xmm3, xmm3); - } - - if(tme) - { - // min.t = FLT_MAX; - // max.t = -FLT_MAX; - - vmovaps(xmm6, xmm4); - vmovaps(xmm7, xmm5); - } - - // for(int i = 0; i < count; i += step) { - - align(16); - - L("loop"); - - for(int j = 0; j < n; j++) - { - if(color && (iip || j == n - 1) || tme) - { - vmovaps(xmm0, ptr[rdx + j * sizeof(GSVertexHW11)]); - } - - if(color && (iip || j == n - 1)) - { - vpminub(xmm2, xmm0); - vpmaxub(xmm3, xmm0); - } - - if(tme) - { - if(!fst) - { - vmovaps(xmm1, xmm0); - } - - vshufps(xmm0, xmm0, _MM_SHUFFLE(3, 3, 1, 0)); // avoid FP assist, third dword is integral - - if(!fst) - { - vshufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3)); - vdivps(xmm0, xmm1); - vshufps(xmm0, xmm1, _MM_SHUFFLE(3, 3, 1, 0)); // restore q - } - - vminps(xmm6, xmm0); - vmaxps(xmm7, xmm0); - } - - vmovdqa(xmm0, ptr[rdx + j * sizeof(GSVertexHW11) + 16]); - vpmovzxwd(xmm1, xmm0); - - vpsrld(xmm0, 1); - vpunpcklqdq(xmm1, xmm0); - vcvtdq2ps(xmm1, xmm1); - - vminps(xmm4, xmm1); - vmaxps(xmm5, xmm1); - } - - add(rdx, n * sizeof(GSVertexHW11)); - sub(ecx, n); - - jg("loop"); - - // } - - if(color) - { - // m_min.c = cmin.zzzz().u8to32(); - // m_max.c = cmax.zzzz().u8to32(); - - vpshufd(xmm2, xmm2, _MM_SHUFFLE(2, 2, 2, 2)); - vpmovzxbd(xmm2, xmm2); - - vpshufd(xmm3, xmm3, _MM_SHUFFLE(2, 2, 2, 2)); - vpmovzxbd(xmm3, xmm3); - - vmovaps(ptr[r8 + offsetof(GSVertexTrace::Vertex, c)], xmm2); - vmovaps(ptr[r9 + offsetof(GSVertexTrace::Vertex, c)], xmm3); - } - - // m_min.p = pmin.xyww(); - // m_max.p = pmax.xyww(); - - vshufps(xmm4, xmm4, _MM_SHUFFLE(3, 3, 1, 0)); - vshufps(xmm5, xmm5, _MM_SHUFFLE(3, 3, 1, 0)); - - vmovaps(ptr[r8 + offsetof(GSVertexTrace::Vertex, p)], xmm4); - vmovaps(ptr[r9 + offsetof(GSVertexTrace::Vertex, p)], xmm5); - - if(tme) - { - // m_min.t = tmin; - // m_max.t = tmax; - - vmovaps(ptr[r8 + offsetof(GSVertexTrace::Vertex, t)], xmm6); - vmovaps(ptr[r9 + offsetof(GSVertexTrace::Vertex, t)], xmm7); - } - - vmovdqa(xmm6, ptr[rsp + 0]); - vmovdqa(xmm7, ptr[rsp + 16]); - - add(rsp, 8 + 2 * 16); - - ret(); -} - -#endif \ No newline at end of file diff --git a/plugins/GSdx/GSVertexTrace.x64.cpp b/plugins/GSdx/GSVertexTrace.x64.cpp deleted file mode 100644 index 8dfc6db296..0000000000 --- a/plugins/GSdx/GSVertexTrace.x64.cpp +++ /dev/null @@ -1,543 +0,0 @@ -/* - * Copyright (C) 2007-2009 Gabest - * http://www.gabest.org - * - * This Program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2, or (at your option) - * any later version. - * - * This Program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with GNU Make; see the file COPYING. If not, write to - * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. - * http://www.gnu.org/copyleft/gpl.html - * - */ - -#include "stdafx.h" -#include "GSVertexTrace.h" - -#if _M_SSE < 0x500 && (defined(_M_AMD64) || defined(_WIN64)) - -using namespace Xbyak; - -GSVertexTrace::CGSW::CGSW(const void* param, uint32 key, void* code, size_t maxsize) - : GSCodeGenerator(code, maxsize) -{ - uint32 primclass = (key >> 0) & 3; - uint32 iip = (key >> 2) & 1; - uint32 tme = (key >> 3) & 1; - uint32 fst = (key >> 4) & 1; - uint32 color = (key >> 5) & 1; - - int n = 1; - - switch(primclass) - { - case GS_POINT_CLASS: - n = 1; - break; - case GS_LINE_CLASS: - case GS_SPRITE_CLASS: - n = 2; - break; - case GS_TRIANGLE_CLASS: - n = 3; - break; - } - - sub(rsp, 8 + 2 * 16); - - movdqa(ptr[rsp + 0], xmm6); - movdqa(ptr[rsp + 16], xmm7); - - // min.p = FLT_MAX; - // max.p = -FLT_MAX; - - mov(rax, (size_t)&s_minmax); - - movss(xmm4, ptr[rax + 0]); - movss(xmm5, ptr[rax + 4]); - - shufps(xmm4, xmm4, _MM_SHUFFLE(0, 0, 0, 0)); - shufps(xmm5, xmm5, _MM_SHUFFLE(0, 0, 0, 0)); - - if(color) - { - // min.c = FLT_MAX; - // max.c = -FLT_MAX; - - movaps(xmm2, xmm4); - movaps(xmm3, xmm5); - } - - if(tme) - { - // min.t = FLT_MAX; - // max.t = -FLT_MAX; - - movaps(xmm6, xmm4); - movaps(xmm7, xmm5); - } - - // for(int i = 0; i < count; i += step) { - - align(16); - - L("loop"); - - if(tme && !fst && primclass == GS_SPRITE_CLASS) - { - movaps(xmm1, ptr[rdx + 1 * sizeof(GSVertexSW) + offsetof(GSVertexSW, t)]); - shufps(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2)); - } - - for(int j = 0; j < n; j++) - { - if(color && (iip || j == n - 1)) - { - // min.c = min.c.minv(v[i + j].c); - // max.c = max.c.maxv(v[i + j].c); - - movaps(xmm0, ptr[rdx + j * sizeof(GSVertexSW) + offsetof(GSVertexSW, c)]); - - minps(xmm2, xmm0); - maxps(xmm3, xmm0); - } - - // min.p = min.p.minv(v[i + j].p); - // max.p = max.p.maxv(v[i + j].p); - - movaps(xmm0, ptr[rdx + j * sizeof(GSVertexSW) + offsetof(GSVertexSW, p)]); - - minps(xmm4, xmm0); - maxps(xmm5, xmm0); - - if(tme) - { - // min.t = min.t.minv(v[i + j].t); - // max.t = max.t.maxv(v[i + j].t); - - movaps(xmm0, ptr[rdx + j * sizeof(GSVertexSW) + offsetof(GSVertexSW, t)]); - - if(!fst) - { - if(primclass != GS_SPRITE_CLASS) - { - movaps(xmm1, xmm0); - shufps(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2)); - } - - divps(xmm0, xmm1); - shufps(xmm0, xmm1, _MM_SHUFFLE(3, 2, 1, 0)); - } - - minps(xmm6, xmm0); - maxps(xmm7, xmm0); - } - } - - add(rdx, n * sizeof(GSVertexSW)); - sub(rcx, n); - - jg("loop"); - - // } - - if(color) - { - cvttps2dq(xmm2, xmm2); - psrld(xmm2, 7); - movaps(ptr[r8 + offsetof(GSVertexTrace::Vertex, c)], xmm2); - - cvttps2dq(xmm3, xmm3); - psrld(xmm3, 7); - movaps(ptr[r9 + offsetof(GSVertexTrace::Vertex, c)], xmm3); - } - - movaps(ptr[r8 + offsetof(GSVertexTrace::Vertex, p)], xmm4); - movaps(ptr[r9 + offsetof(GSVertexTrace::Vertex, p)], xmm5); - - if(tme) - { - movaps(ptr[r8 + offsetof(GSVertexTrace::Vertex, t)], xmm6); - movaps(ptr[r9 + offsetof(GSVertexTrace::Vertex, t)], xmm7); - } - - movdqa(xmm6, ptr[rsp + 0]); - movdqa(xmm7, ptr[rsp + 16]); - - add(rsp, 8 + 2 * 16); - - ret(); -} - -GSVertexTrace::CGHW9::CGHW9(const void* param, uint32 key, void* code, size_t maxsize) - : GSCodeGenerator(code, maxsize) -{ - uint32 primclass = (key >> 0) & 3; - uint32 iip = (key >> 2) & 1; - uint32 tme = (key >> 3) & 1; - uint32 fst = (key >> 4) & 1; - uint32 color = (key >> 5) & 1; - - int n = 1; - - switch(primclass) - { - case GS_POINT_CLASS: - n = 1; - break; - case GS_LINE_CLASS: - n = 2; - break; - case GS_TRIANGLE_CLASS: - n = 3; - break; - case GS_SPRITE_CLASS: - n = 6; - break; - } - - sub(rsp, 8 + 2 * 16); - - movdqa(ptr[rsp + 0], xmm6); - movdqa(ptr[rsp + 16], xmm7); - - // min.p = FLT_MAX; - // max.p = -FLT_MAX; - - mov(rax, (size_t)&s_minmax); - - movss(xmm4, ptr[rax + 0]); - movss(xmm5, ptr[rax + 16]); - - shufps(xmm4, xmm4, _MM_SHUFFLE(0, 0, 0, 0)); - shufps(xmm5, xmm5, _MM_SHUFFLE(0, 0, 0, 0)); - - if(color) - { - // min.c = 0xffffffff; - // max.c = 0; - - pcmpeqd(xmm2, xmm2); - pxor(xmm3, xmm3); - } - - if(tme) - { - // min.t = FLT_MAX; - // max.t = -FLT_MAX; - - movaps(xmm6, xmm4); - movaps(xmm7, xmm5); - } - - // for(int i = 0; i < count; i += step) { - - align(16); - - L("loop"); - - if(tme && !fst && primclass == GS_SPRITE_CLASS) - { - movaps(xmm1, ptr[rdx + 5 * sizeof(GSVertexHW9) + offsetof(GSVertexHW9, p)]); - shufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3)); - } - - for(int j = 0; j < n; j++) - { - // min.p = min.p.minv(v[i + j].p); - // max.p = max.p.maxv(v[i + j].p); - - movaps(xmm0, ptr[rdx + j * sizeof(GSVertexHW9) + offsetof(GSVertexHW9, p)]); - - minps(xmm4, xmm0); - maxps(xmm5, xmm0); - - if(tme && !fst && primclass != GS_SPRITE_CLASS) - { - movaps(xmm1, xmm0); - shufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3)); - } - - if(color && (iip || j == n - 1) || tme) - { - movaps(xmm0, ptr[rdx + j * sizeof(GSVertexHW9) + offsetof(GSVertexHW9, t)]); - } - - if(color && (iip || j == n - 1)) - { - // min.c = min.c.min_u8(v[i + j].c); - // max.c = max.c.min_u8(v[i + j].c); - - pminub(xmm2, xmm0); - pmaxub(xmm3, xmm0); - } - - if(tme) - { - shufps(xmm0, xmm0, _MM_SHUFFLE(1, 0, 1, 0)); // avoid FP assist, high part is integral - - if(!fst) - { - // t /= p.wwww(); - - divps(xmm0, xmm1); - } - - // min.t = min.t.minv(v[i + j].t); - // max.t = max.t.maxv(v[i + j].t); - - minps(xmm6, xmm0); - maxps(xmm7, xmm0); - } - } - - add(rdx, n * sizeof(GSVertexHW9)); - sub(ecx, n); - - jg("loop"); - - // } - - if(color) - { - // m_min.c = cmin.zzzz().u8to32(); - // m_max.c = cmax.zzzz().u8to32(); - - if(m_cpu.has(util::Cpu::tSSE41)) - { - pshufd(xmm2, xmm2, _MM_SHUFFLE(2, 2, 2, 2)); - pmovzxbd(xmm2, xmm2); - - pshufd(xmm3, xmm3, _MM_SHUFFLE(2, 2, 2, 2)); - pmovzxbd(xmm3, xmm3); - } - else - { - pxor(xmm0, xmm0); - - punpckhbw(xmm2, xmm0); - punpcklwd(xmm2, xmm0); - - punpckhbw(xmm3, xmm0); - punpcklwd(xmm3, xmm0); - } - - movaps(ptr[r8 + offsetof(GSVertexTrace::Vertex, c)], xmm2); - movaps(ptr[r9 + offsetof(GSVertexTrace::Vertex, c)], xmm3); - } - - // m_min.p = pmin; - // m_max.p = pmax; - - movaps(ptr[r8 + offsetof(GSVertexTrace::Vertex, p)], xmm4); - movaps(ptr[r9 + offsetof(GSVertexTrace::Vertex, p)], xmm5); - - if(tme) - { - // m_min.t = tmin.xyww(pmin); - // m_max.t = tmax.xyww(pmax); - - shufps(xmm6, xmm4, _MM_SHUFFLE(3, 3, 1, 0)); - shufps(xmm7, xmm5, _MM_SHUFFLE(3, 3, 1, 0)); - - movaps(ptr[r8 + offsetof(GSVertexTrace::Vertex, t)], xmm6); - movaps(ptr[r9 + offsetof(GSVertexTrace::Vertex, t)], xmm7); - } - - movdqa(xmm6, ptr[rsp + 0]); - movdqa(xmm7, ptr[rsp + 16]); - - add(rsp, 8 + 2 * 16); - - ret(); -} - -GSVertexTrace::CGHW11::CGHW11(const void* param, uint32 key, void* code, size_t maxsize) - : GSCodeGenerator(code, maxsize) -{ - uint32 primclass = (key >> 0) & 3; - uint32 iip = (key >> 2) & 1; - uint32 tme = (key >> 3) & 1; - uint32 fst = (key >> 4) & 1; - uint32 color = (key >> 5) & 1; - - int n = 1; - - switch(primclass) - { - case GS_POINT_CLASS: - n = 1; - break; - case GS_LINE_CLASS: - case GS_SPRITE_CLASS: - n = 2; - break; - case GS_TRIANGLE_CLASS: - n = 3; - break; - } - - sub(rsp, 8 + 2 * 16); - - movdqa(ptr[rsp + 0], xmm6); - movdqa(ptr[rsp + 16], xmm7); - - // min.p = FLT_MAX; - // max.p = -FLT_MAX; - - mov(rax, (size_t)&s_minmax); - - movss(xmm4, ptr[rax + 0]); - movss(xmm5, ptr[rax + 16]); - - shufps(xmm4, xmm4, _MM_SHUFFLE(0, 0, 0, 0)); - shufps(xmm5, xmm5, _MM_SHUFFLE(0, 0, 0, 0)); - - if(color) - { - // min.c = 0xffffffff; - // max.c = 0; - - pcmpeqd(xmm2, xmm2); - pxor(xmm3, xmm3); - } - - if(tme) - { - // min.t = FLT_MAX; - // max.t = -FLT_MAX; - - movaps(xmm6, xmm4); - movaps(xmm7, xmm5); - } - - // for(int i = 0; i < count; i += step) { - - align(16); - - L("loop"); - - for(int j = 0; j < n; j++) - { - if(color && (iip || j == n - 1) || tme) - { - movaps(xmm0, ptr[rdx + j * sizeof(GSVertexHW11)]); - } - - if(color && (iip || j == n - 1)) - { - pminub(xmm2, xmm0); - pmaxub(xmm3, xmm0); - } - - if(tme) - { - if(!fst) - { - movaps(xmm1, xmm0); - } - - shufps(xmm0, xmm0, _MM_SHUFFLE(3, 3, 1, 0)); // avoid FP assist, third dword is integral - - if(!fst) - { - shufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3)); - divps(xmm0, xmm1); - shufps(xmm0, xmm1, _MM_SHUFFLE(3, 3, 1, 0)); // restore q - } - - minps(xmm6, xmm0); - maxps(xmm7, xmm0); - } - - movdqa(xmm0, ptr[rdx + j * sizeof(GSVertexHW11) + 16]); - - if(m_cpu.has(util::Cpu::tSSE41)) - { - pmovzxwd(xmm1, xmm0); - } - else - { - movdqa(xmm1, xmm0); - punpcklwd(xmm1, xmm1); - psrld(xmm1, 16); - } - - psrld(xmm0, 1); - punpcklqdq(xmm1, xmm0); - cvtdq2ps(xmm1, xmm1); - - minps(xmm4, xmm1); - maxps(xmm5, xmm1); - } - - add(rdx, n * sizeof(GSVertexHW11)); - sub(ecx, n); - - jg("loop"); - - // } - - if(color) - { - // m_min.c = cmin.zzzz().u8to32(); - // m_max.c = cmax.zzzz().u8to32(); - - if(m_cpu.has(util::Cpu::tSSE41)) - { - pshufd(xmm2, xmm2, _MM_SHUFFLE(2, 2, 2, 2)); - pmovzxbd(xmm2, xmm2); - - pshufd(xmm3, xmm3, _MM_SHUFFLE(2, 2, 2, 2)); - pmovzxbd(xmm3, xmm3); - } - else - { - pxor(xmm0, xmm0); - - punpckhbw(xmm2, xmm0); - punpcklwd(xmm2, xmm0); - - punpckhbw(xmm3, xmm0); - punpcklwd(xmm3, xmm0); - } - - movaps(ptr[r8 + offsetof(GSVertexTrace::Vertex, c)], xmm2); - movaps(ptr[r9 + offsetof(GSVertexTrace::Vertex, c)], xmm3); - } - - // m_min.p = pmin.xyww(); - // m_max.p = pmax.xyww(); - - shufps(xmm4, xmm4, _MM_SHUFFLE(3, 3, 1, 0)); - shufps(xmm5, xmm5, _MM_SHUFFLE(3, 3, 1, 0)); - - movaps(ptr[r8 + offsetof(GSVertexTrace::Vertex, p)], xmm4); - movaps(ptr[r9 + offsetof(GSVertexTrace::Vertex, p)], xmm5); - - if(tme) - { - // m_min.t = tmin; - // m_max.t = tmax; - - movaps(ptr[r8 + offsetof(GSVertexTrace::Vertex, t)], xmm6); - movaps(ptr[r9 + offsetof(GSVertexTrace::Vertex, t)], xmm7); - } - - movdqa(xmm6, ptr[rsp + 0]); - movdqa(xmm7, ptr[rsp + 16]); - - add(rsp, 8 + 2 * 16); - - ret(); -} - -#endif diff --git a/plugins/GSdx/GSVertexTrace.x86.avx.cpp b/plugins/GSdx/GSVertexTrace.x86.avx.cpp deleted file mode 100644 index 560680285f..0000000000 --- a/plugins/GSdx/GSVertexTrace.x86.avx.cpp +++ /dev/null @@ -1,513 +0,0 @@ -/* - * Copyright (C) 2007-2009 Gabest - * http://www.gabest.org - * - * This Program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2, or (at your option) - * any later version. - * - * This Program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with GNU Make; see the file COPYING. If not, write to - * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. - * http://www.gnu.org/copyleft/gpl.html - * - */ - -#include "stdafx.h" -#include "GSVertexTrace.h" - -#if _M_SSE >= 0x500 && !(defined(_M_AMD64) || defined(_WIN64)) - -using namespace Xbyak; - -static const int _args = 4; -static const int _count = _args + 4; // rcx -static const int _vertex = _args + 8; // rdx -static const int _index = _args + 12; // r8 -static const int _min = _args + 16; // r9 -static const int _max = _args + 20; // _args + 4 - -GSVertexTraceSW::CG::CG(const void* param, uint32 key, void* code, size_t maxsize) - : GSCodeGenerator(code, maxsize) -{ - uint32 primclass = (key >> 0) & 3; - uint32 iip = (key >> 2) & 1; - uint32 tme = (key >> 3) & 1; - uint32 fst = (key >> 4) & 1; - uint32 color = (key >> 5) & 1; - - int n = 1; - - switch(primclass) - { - case GS_POINT_CLASS: - n = 1; - break; - case GS_LINE_CLASS: - case GS_SPRITE_CLASS: - n = 2; - break; - case GS_TRIANGLE_CLASS: - n = 3; - break; - } - - push(ebx); - - // min.p = FLT_MAX; - // max.p = -FLT_MAX; - - vbroadcastss(xmm4, ptr[&s_minmax.x]); - vbroadcastss(xmm5, ptr[&s_minmax.y]); - - if(color) - { - // min.c = FLT_MAX; - // max.c = -FLT_MAX; - - vmovaps(xmm2, xmm4); - vmovaps(xmm3, xmm5); - } - - if(tme) - { - // min.t = FLT_MAX; - // max.t = -FLT_MAX; - - vmovaps(xmm6, xmm4); - vmovaps(xmm7, xmm5); - } - - // for(int i = 0; i < count; i += step) { - - mov(edx, dword[esp + _vertex]); - mov(ebx, dword[esp + _index]); - mov(ecx, dword[esp + _count]); - - align(16); - - L("loop"); - - if(tme && !fst && primclass == GS_SPRITE_CLASS) - { - mov(eax, ptr[ebx + 1 * sizeof(uint32)]); - shl(eax, 6); // * sizeof(GSVertexSW) - - vmovaps(xmm1, ptr[edx + eax + offsetof(GSVertexSW, t)]); - vshufps(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2)); - } - - for(int j = 0; j < n; j++) - { - mov(eax, ptr[ebx + j * sizeof(uint32)]); - shl(eax, 6); // * sizeof(GSVertexSW) - - if(color && (iip || j == n - 1)) - { - // min.c = min.c.minv(v[i + j].c); - // max.c = max.c.maxv(v[i + j].c); - - vmovaps(xmm0, ptr[edx + eax + offsetof(GSVertexSW, c)]); - - vminps(xmm2, xmm0); - vmaxps(xmm3, xmm0); - } - - // min.p = min.p.minv(v[i + j].p); - // max.p = max.p.maxv(v[i + j].p); - - vmovaps(xmm0, ptr[edx + eax + offsetof(GSVertexSW, p)]); - - vminps(xmm4, xmm0); - vmaxps(xmm5, xmm0); - - if(tme) - { - // min.t = min.t.minv(v[i + j].t); - // max.t = max.t.maxv(v[i + j].t); - - vmovaps(xmm0, ptr[edx + eax + offsetof(GSVertexSW, t)]); - - if(!fst) - { - if(primclass != GS_SPRITE_CLASS) - { - vshufps(xmm1, xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2)); - } - - vdivps(xmm0, xmm1); - vshufps(xmm0, xmm1, _MM_SHUFFLE(3, 2, 1, 0)); - } - - vminps(xmm6, xmm0); - vmaxps(xmm7, xmm0); - } - } - - add(ebx, n * sizeof(uint32)); - sub(ecx, n); - - jg("loop"); - - // } - - mov(eax, dword[esp + _min]); - mov(edx, dword[esp + _max]); - - if(color) - { - vcvttps2dq(xmm2, xmm2); - vpsrld(xmm2, 7); - vmovaps(ptr[eax + offsetof(GSVertexTrace::Vertex, c)], xmm2); - - vcvttps2dq(xmm3, xmm3); - vpsrld(xmm3, 7); - vmovaps(ptr[edx + offsetof(GSVertexTrace::Vertex, c)], xmm3); - } - - vmovaps(ptr[eax + offsetof(GSVertexTrace::Vertex, p)], xmm4); - vmovaps(ptr[edx + offsetof(GSVertexTrace::Vertex, p)], xmm5); - - if(tme) - { - vmovaps(ptr[eax + offsetof(GSVertexTrace::Vertex, t)], xmm6); - vmovaps(ptr[edx + offsetof(GSVertexTrace::Vertex, t)], xmm7); - } - - pop(ebx); - - ret(); -} - -GSVertexTraceDX9::CG::CG(const void* param, uint32 key, void* code, size_t maxsize) - : GSCodeGenerator(code, maxsize) -{ - uint32 primclass = (key >> 0) & 3; - uint32 iip = (key >> 2) & 1; - uint32 tme = (key >> 3) & 1; - uint32 fst = (key >> 4) & 1; - uint32 color = (key >> 5) & 1; - - int n = 1; - - switch(primclass) - { - case GS_POINT_CLASS: - n = 1; - break; - case GS_SPRITE_CLASS: - case GS_LINE_CLASS: - n = 2; - break; - case GS_TRIANGLE_CLASS: - n = 3; - break; - } - - push(ebx); - - // min.p = FLT_MAX; - // max.p = -FLT_MAX; - - vbroadcastss(xmm4, ptr[&s_minmax.x]); - vbroadcastss(xmm5, ptr[&s_minmax.y]); - - if(color) - { - // min.c = 0xffffffff; - // max.c = 0; - - vpcmpeqd(xmm2, xmm2); - vpxor(xmm3, xmm3); - } - - if(tme) - { - // min.t = FLT_MAX; - // max.t = -FLT_MAX; - - vmovaps(xmm6, xmm4); - vmovaps(xmm7, xmm5); - } - - // for(int i = 0; i < count; i += step) { - - mov(edx, dword[esp + _vertex]); - mov(ebx, dword[esp + _index]); - mov(ecx, dword[esp + _count]); - - align(16); - - L("loop"); - - if(tme && !fst && primclass == GS_SPRITE_CLASS) - { - mov(eax, ptr[ebx + 1 * sizeof(uint32)]); - shl(eax, 5); // * sizeof(GSVertexHW9) - - vmovaps(xmm1, ptr[edx + eax + offsetof(GSVertexHW9, p)]); - vshufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3)); - } - - for(int j = 0; j < n; j++) - { - mov(eax, ptr[ebx + j * sizeof(uint32)]); - shl(eax, 5); // * sizeof(GSVertexHW9) - - // min.p = min.p.minv(v[i + j].p); - // max.p = max.p.maxv(v[i + j].p); - - vmovaps(xmm0, ptr[edx + eax + offsetof(GSVertexHW9, p)]); - - vminps(xmm4, xmm0); - vmaxps(xmm5, xmm0); - - if(tme && !fst && primclass != GS_SPRITE_CLASS) - { - vshufps(xmm1, xmm0, xmm0, _MM_SHUFFLE(3, 3, 3, 3)); - } - - if(color && (iip || j == n - 1) || tme) - { - vmovaps(xmm0, ptr[edx + eax + offsetof(GSVertexHW9, t)]); - } - - if(color && (iip || j == n - 1)) - { - // min.c = min.c.min_u8(v[i + j].c); - // max.c = max.c.min_u8(v[i + j].c); - - vpminub(xmm2, xmm0); - vpmaxub(xmm3, xmm0); - } - - if(tme) - { - vshufps(xmm0, xmm0, _MM_SHUFFLE(1, 0, 1, 0)); // avoid FP assist, high part is integral - - if(!fst) - { - // t /= p.wwww(); - - vdivps(xmm0, xmm1); - } - - // min.t = min.t.minv(v[i + j].t); - // max.t = max.t.maxv(v[i + j].t); - - vminps(xmm6, xmm0); - vmaxps(xmm7, xmm0); - } - } - - add(ebx, n * sizeof(uint32)); - sub(ecx, n); - - jg("loop"); - - // } - - mov(eax, dword[esp + _min]); - mov(edx, dword[esp + _max]); - - if(color) - { - // m_min.c = cmin.zzzz().u8to32(); - // m_max.c = cmax.zzzz().u8to32(); - - vpshufd(xmm2, xmm2, _MM_SHUFFLE(2, 2, 2, 2)); - vpmovzxbd(xmm2, xmm2); - - vpshufd(xmm3, xmm3, _MM_SHUFFLE(2, 2, 2, 2)); - vpmovzxbd(xmm3, xmm3); - - vmovaps(ptr[eax + offsetof(GSVertexTrace::Vertex, c)], xmm2); - vmovaps(ptr[edx + offsetof(GSVertexTrace::Vertex, c)], xmm3); - } - - // m_min.p = pmin; - // m_max.p = pmax; - - vmovaps(ptr[eax + offsetof(GSVertexTrace::Vertex, p)], xmm4); - vmovaps(ptr[edx + offsetof(GSVertexTrace::Vertex, p)], xmm5); - - if(tme) - { - // m_min.t = tmin.xyww(pmin); - // m_max.t = tmax.xyww(pmax); - - vshufps(xmm6, xmm4, _MM_SHUFFLE(3, 3, 1, 0)); - vshufps(xmm7, xmm5, _MM_SHUFFLE(3, 3, 1, 0)); - - vmovaps(ptr[eax + offsetof(GSVertexTrace::Vertex, t)], xmm6); - vmovaps(ptr[edx + offsetof(GSVertexTrace::Vertex, t)], xmm7); - } - - pop(ebx); - - ret(); -} - -GSVertexTraceDX11::CG::CG(const void* param, uint32 key, void* code, size_t maxsize) - : GSCodeGenerator(code, maxsize) -{ - uint32 primclass = (key >> 0) & 3; - uint32 iip = (key >> 2) & 1; - uint32 tme = (key >> 3) & 1; - uint32 fst = (key >> 4) & 1; - uint32 color = (key >> 5) & 1; - - int n = 1; - - switch(primclass) - { - case GS_POINT_CLASS: - n = 1; - break; - case GS_LINE_CLASS: - case GS_SPRITE_CLASS: - n = 2; - break; - case GS_TRIANGLE_CLASS: - n = 3; - break; - } - - push(ebx); - - // min.p = FLT_MAX; - // max.p = -FLT_MAX; - - vbroadcastss(xmm4, ptr[&s_minmax.x]); - vbroadcastss(xmm5, ptr[&s_minmax.y]); - - if(color) - { - // min.c = 0xffffffff; - // max.c = 0; - - vpcmpeqd(xmm2, xmm2); - vpxor(xmm3, xmm3); - } - - if(tme) - { - // min.t = FLT_MAX; - // max.t = -FLT_MAX; - - vmovaps(xmm6, xmm4); - vmovaps(xmm7, xmm5); - } - - // for(int i = 0; i < count; i += step) { - - mov(edx, dword[esp + _vertex]); - mov(ebx, dword[esp + _index]); - mov(ecx, dword[esp + _count]); - - align(16); - - L("loop"); - - for(int j = 0; j < n; j++) - { - mov(eax, ptr[ebx + j * sizeof(uint32)]); - shl(eax, 5); // * sizeof(GSVertexHW11) - - if(color && (iip || j == n - 1) || tme) - { - vmovaps(xmm0, ptr[edx + eax]); - } - - if(color && (iip || j == n - 1)) - { - vpminub(xmm2, xmm0); - vpmaxub(xmm3, xmm0); - } - - if(tme) - { - if(!fst) - { - vmovaps(xmm1, xmm0); - } - - vshufps(xmm0, xmm0, _MM_SHUFFLE(3, 3, 1, 0)); // avoid FP assist, third dword is integral - - if(!fst) - { - vshufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3)); - vdivps(xmm0, xmm1); - vshufps(xmm0, xmm1, _MM_SHUFFLE(3, 3, 1, 0)); // restore q - } - - vminps(xmm6, xmm0); - vmaxps(xmm7, xmm0); - } - - vmovdqa(xmm0, ptr[edx + eax + 16]); - vpmovzxwd(xmm1, xmm0); - - vpsrld(xmm0, 1); - vpunpcklqdq(xmm1, xmm0); - vcvtdq2ps(xmm1, xmm1); - - vminps(xmm4, xmm1); - vmaxps(xmm5, xmm1); - } - - add(ebx, n * sizeof(uint32)); - sub(ecx, n); - - jg("loop"); - - // } - - mov(eax, dword[esp + _min]); - mov(edx, dword[esp + _max]); - - if(color) - { - // m_min.c = cmin.zzzz().u8to32(); - // m_max.c = cmax.zzzz().u8to32(); - - vpshufd(xmm2, xmm2, _MM_SHUFFLE(2, 2, 2, 2)); - vpmovzxbd(xmm2, xmm2); - - vpshufd(xmm3, xmm3, _MM_SHUFFLE(2, 2, 2, 2)); - vpmovzxbd(xmm3, xmm3); - - vmovaps(ptr[eax + offsetof(GSVertexTrace::Vertex, c)], xmm2); - vmovaps(ptr[edx + offsetof(GSVertexTrace::Vertex, c)], xmm3); - } - - // m_min.p = pmin.xyww(); - // m_max.p = pmax.xyww(); - - vshufps(xmm4, xmm4, _MM_SHUFFLE(3, 3, 1, 0)); - vshufps(xmm5, xmm5, _MM_SHUFFLE(3, 3, 1, 0)); - - vmovaps(ptr[eax + offsetof(GSVertexTrace::Vertex, p)], xmm4); - vmovaps(ptr[edx + offsetof(GSVertexTrace::Vertex, p)], xmm5); - - if(tme) - { - // m_min.t = tmin; - // m_max.t = tmax; - - vmovaps(ptr[eax + offsetof(GSVertexTrace::Vertex, t)], xmm6); - vmovaps(ptr[edx + offsetof(GSVertexTrace::Vertex, t)], xmm7); - } - - pop(ebx); - - ret(); -} - -#endif diff --git a/plugins/GSdx/GSVertexTrace.x86.cpp b/plugins/GSdx/GSVertexTrace.x86.cpp deleted file mode 100644 index 58d4df2daa..0000000000 --- a/plugins/GSdx/GSVertexTrace.x86.cpp +++ /dev/null @@ -1,562 +0,0 @@ -/* - * Copyright (C) 2007-2009 Gabest - * http://www.gabest.org - * - * This Program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2, or (at your option) - * any later version. - * - * This Program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with GNU Make; see the file COPYING. If not, write to - * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. - * http://www.gnu.org/copyleft/gpl.html - * - */ - -#include "stdafx.h" -#include "GSVertexTrace.h" - -#if _M_SSE < 0x500 && !(defined(_M_AMD64) || defined(_WIN64)) - -using namespace Xbyak; - -static const int _args = 4; -static const int _count = _args + 4; // rcx -static const int _vertex = _args + 8; // rdx -static const int _index = _args + 12; // r8 -static const int _min = _args + 16; // r9 -static const int _max = _args + 20; // _args + 4 - -GSVertexTraceSW::CG::CG(const void* param, uint32 key, void* code, size_t maxsize) - : GSCodeGenerator(code, maxsize) -{ - uint32 primclass = (key >> 0) & 3; - uint32 iip = (key >> 2) & 1; - uint32 tme = (key >> 3) & 1; - uint32 fst = (key >> 4) & 1; - uint32 color = (key >> 5) & 1; - - int n = 1; - - switch(primclass) - { - case GS_POINT_CLASS: - n = 1; - break; - case GS_LINE_CLASS: - case GS_SPRITE_CLASS: - n = 2; - break; - case GS_TRIANGLE_CLASS: - n = 3; - break; - } - - push(ebx); - - // min.p = FLT_MAX; - // max.p = -FLT_MAX; - - movss(xmm4, ptr[&s_minmax.x]); - movss(xmm5, ptr[&s_minmax.y]); - - shufps(xmm4, xmm4, _MM_SHUFFLE(0, 0, 0, 0)); - shufps(xmm5, xmm5, _MM_SHUFFLE(0, 0, 0, 0)); - - if(color) - { - // min.c = FLT_MAX; - // max.c = -FLT_MAX; - - movaps(xmm2, xmm4); - movaps(xmm3, xmm5); - } - - if(tme) - { - // min.t = FLT_MAX; - // max.t = -FLT_MAX; - - movaps(xmm6, xmm4); - movaps(xmm7, xmm5); - } - - // for(int i = 0; i < count; i += step) { - - mov(edx, dword[esp + _vertex]); - mov(ebx, dword[esp + _index]); - mov(ecx, dword[esp + _count]); - - align(16); - - L("loop"); - - if(tme && !fst && primclass == GS_SPRITE_CLASS) - { - mov(eax, ptr[ebx + 1 * sizeof(uint32)]); - shl(eax, 6); // * sizeof(GSVertexSW) - - movaps(xmm1, ptr[edx + eax + offsetof(GSVertexSW, t)]); - shufps(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2)); - } - - for(int j = 0; j < n; j++) - { - mov(eax, ptr[ebx + j * sizeof(uint32)]); - shl(eax, 6); // * sizeof(GSVertexSW) - - if(color && (iip || j == n - 1)) - { - // min.c = min.c.minv(v[i + j].c); - // max.c = max.c.maxv(v[i + j].c); - - movaps(xmm0, ptr[edx + eax + offsetof(GSVertexSW, c)]); - - minps(xmm2, xmm0); - maxps(xmm3, xmm0); - } - - // min.p = min.p.minv(v[i + j].p); - // max.p = max.p.maxv(v[i + j].p); - - movaps(xmm0, ptr[edx + eax + offsetof(GSVertexSW, p)]); - - minps(xmm4, xmm0); - maxps(xmm5, xmm0); - - if(tme) - { - // min.t = min.t.minv(v[i + j].t); - // max.t = max.t.maxv(v[i + j].t); - - movaps(xmm0, ptr[edx + eax + offsetof(GSVertexSW, t)]); - - if(!fst) - { - if(primclass != GS_SPRITE_CLASS) - { - movaps(xmm1, xmm0); - shufps(xmm1, xmm1, _MM_SHUFFLE(2, 2, 2, 2)); - } - - divps(xmm0, xmm1); - shufps(xmm0, xmm1, _MM_SHUFFLE(3, 2, 1, 0)); - } - - minps(xmm6, xmm0); - maxps(xmm7, xmm0); - } - } - - add(ebx, n * sizeof(uint32)); - sub(ecx, n); - - jg("loop"); - - // } - - mov(eax, dword[esp + _min]); - mov(edx, dword[esp + _max]); - - if(color) - { - cvttps2dq(xmm2, xmm2); - psrld(xmm2, 7); - movaps(ptr[eax + offsetof(GSVertexTrace::Vertex, c)], xmm2); - - cvttps2dq(xmm3, xmm3); - psrld(xmm3, 7); - movaps(ptr[edx + offsetof(GSVertexTrace::Vertex, c)], xmm3); - } - - movaps(ptr[eax + offsetof(GSVertexTrace::Vertex, p)], xmm4); - movaps(ptr[edx + offsetof(GSVertexTrace::Vertex, p)], xmm5); - - if(tme) - { - movaps(ptr[eax + offsetof(GSVertexTrace::Vertex, t)], xmm6); - movaps(ptr[edx + offsetof(GSVertexTrace::Vertex, t)], xmm7); - } - - pop(ebx); - - ret(); -} - -GSVertexTraceDX9::CG::CG(const void* param, uint32 key, void* code, size_t maxsize) - : GSCodeGenerator(code, maxsize) -{ - uint32 primclass = (key >> 0) & 3; - uint32 iip = (key >> 2) & 1; - uint32 tme = (key >> 3) & 1; - uint32 fst = (key >> 4) & 1; - uint32 color = (key >> 5) & 1; - - int n = 1; - - switch(primclass) - { - case GS_POINT_CLASS: - n = 1; - break; - case GS_LINE_CLASS: - n = 2; - break; - case GS_TRIANGLE_CLASS: - n = 3; - break; - case GS_SPRITE_CLASS: - n = 6; - break; - } - - push(ebx); - - // min.p = FLT_MAX; - // max.p = -FLT_MAX; - - movss(xmm4, ptr[&s_minmax.x]); - movss(xmm5, ptr[&s_minmax.y]); - - shufps(xmm4, xmm4, _MM_SHUFFLE(0, 0, 0, 0)); - shufps(xmm5, xmm5, _MM_SHUFFLE(0, 0, 0, 0)); - - if(color) - { - // min.c = 0xffffffff; - // max.c = 0; - - pcmpeqd(xmm2, xmm2); - pxor(xmm3, xmm3); - } - - if(tme) - { - // min.t = FLT_MAX; - // max.t = -FLT_MAX; - - movaps(xmm6, xmm4); - movaps(xmm7, xmm5); - } - - // for(int i = 0; i < count; i += step) { - - mov(edx, dword[esp + _vertex]); - mov(ebx, dword[esp + _index]); - mov(ecx, dword[esp + _count]); - - align(16); - - L("loop"); - - if(tme && !fst && primclass == GS_SPRITE_CLASS) - { - mov(eax, ptr[ebx + 1 * sizeof(uint32)]); - shl(eax, 5); // * sizeof(GSVertexHW9) - - movaps(xmm1, ptr[edx + eax + offsetof(GSVertexHW9, p)]); - shufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3)); - } - - for(int j = 0; j < n; j++) - { - mov(eax, ptr[ebx + j * sizeof(uint32)]); - shl(eax, 5); // * sizeof(GSVertexHW9) - - // min.p = min.p.minv(v[i + j].p); - // max.p = max.p.maxv(v[i + j].p); - - movaps(xmm0, ptr[edx + eax + offsetof(GSVertexHW9, p)]); - - minps(xmm4, xmm0); - maxps(xmm5, xmm0); - - if(tme && !fst && primclass != GS_SPRITE_CLASS) - { - movaps(xmm1, xmm0); - shufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3)); - } - - if(color && (iip || j == n - 1) || tme) - { - movaps(xmm0, ptr[edx + eax + offsetof(GSVertexHW9, t)]); - } - - if(color && (iip || j == n - 1)) - { - // min.c = min.c.min_u8(v[i + j].c); - // max.c = max.c.min_u8(v[i + j].c); - - pminub(xmm2, xmm0); - pmaxub(xmm3, xmm0); - } - - if(tme) - { - shufps(xmm0, xmm0, _MM_SHUFFLE(1, 0, 1, 0)); // avoid FP assist, high part is integral - - if(!fst) - { - // t /= p.wwww(); - - divps(xmm0, xmm1); - } - - // min.t = min.t.minv(v[i + j].t); - // max.t = max.t.maxv(v[i + j].t); - - minps(xmm6, xmm0); - maxps(xmm7, xmm0); - } - } - - add(ebx, n * sizeof(uint32)); - sub(ecx, n); - - jg("loop"); - - // } - - mov(eax, dword[esp + _min]); - mov(edx, dword[esp + _max]); - - if(color) - { - // m_min.c = cmin.zzzz().u8to32(); - // m_max.c = cmax.zzzz().u8to32(); - - if(m_cpu.has(util::Cpu::tSSE41)) - { - pshufd(xmm2, xmm2, _MM_SHUFFLE(2, 2, 2, 2)); - pmovzxbd(xmm2, xmm2); - - pshufd(xmm3, xmm3, _MM_SHUFFLE(2, 2, 2, 2)); - pmovzxbd(xmm3, xmm3); - } - else - { - pxor(xmm0, xmm0); - - punpckhbw(xmm2, xmm0); - punpcklwd(xmm2, xmm0); - - punpckhbw(xmm3, xmm0); - punpcklwd(xmm3, xmm0); - } - - movaps(ptr[eax + offsetof(GSVertexTrace::Vertex, c)], xmm2); - movaps(ptr[edx + offsetof(GSVertexTrace::Vertex, c)], xmm3); - } - - // m_min.p = pmin; - // m_max.p = pmax; - - movaps(ptr[eax + offsetof(GSVertexTrace::Vertex, p)], xmm4); - movaps(ptr[edx + offsetof(GSVertexTrace::Vertex, p)], xmm5); - - if(tme) - { - // m_min.t = tmin.xyww(pmin); - // m_max.t = tmax.xyww(pmax); - - shufps(xmm6, xmm4, _MM_SHUFFLE(3, 3, 1, 0)); - shufps(xmm7, xmm5, _MM_SHUFFLE(3, 3, 1, 0)); - - movaps(ptr[eax + offsetof(GSVertexTrace::Vertex, t)], xmm6); - movaps(ptr[edx + offsetof(GSVertexTrace::Vertex, t)], xmm7); - } - - pop(ebx); - - ret(); -} - -GSVertexTraceDX11::CG::CG(const void* param, uint32 key, void* code, size_t maxsize) - : GSCodeGenerator(code, maxsize) -{ - uint32 primclass = (key >> 0) & 3; - uint32 iip = (key >> 2) & 1; - uint32 tme = (key >> 3) & 1; - uint32 fst = (key >> 4) & 1; - uint32 color = (key >> 5) & 1; - - int n = 1; - - switch(primclass) - { - case GS_POINT_CLASS: - n = 1; - break; - case GS_LINE_CLASS: - case GS_SPRITE_CLASS: - n = 2; - break; - case GS_TRIANGLE_CLASS: - n = 3; - break; - } - - push(ebx); - - // min.p = FLT_MAX; - // max.p = -FLT_MAX; - - movss(xmm4, ptr[&s_minmax.x]); - movss(xmm5, ptr[&s_minmax.y]); - - shufps(xmm4, xmm4, _MM_SHUFFLE(0, 0, 0, 0)); - shufps(xmm5, xmm5, _MM_SHUFFLE(0, 0, 0, 0)); - - if(color) - { - // min.c = 0xffffffff; - // max.c = 0; - - pcmpeqd(xmm2, xmm2); - pxor(xmm3, xmm3); - } - - if(tme) - { - // min.t = FLT_MAX; - // max.t = -FLT_MAX; - - movaps(xmm6, xmm4); - movaps(xmm7, xmm5); - } - - // for(int i = 0; i < count; i += step) { - - mov(edx, dword[esp + _vertex]); - mov(ebx, dword[esp + _index]); - mov(ecx, dword[esp + _count]); - - align(16); - - L("loop"); - - for(int j = 0; j < n; j++) - { - mov(eax, ptr[ebx + j * sizeof(uint32)]); - shl(eax, 5); // * sizeof(GSVertexHW11) - - if(color && (iip || j == n - 1) || tme) - { - movaps(xmm0, ptr[edx + eax]); - } - - if(color && (iip || j == n - 1)) - { - pminub(xmm2, xmm0); - pmaxub(xmm3, xmm0); - } - - if(tme) - { - if(!fst) - { - movaps(xmm1, xmm0); - } - - shufps(xmm0, xmm0, _MM_SHUFFLE(3, 3, 1, 0)); // avoid FP assist, third dword is integral - - if(!fst) - { - shufps(xmm1, xmm1, _MM_SHUFFLE(3, 3, 3, 3)); - divps(xmm0, xmm1); - shufps(xmm0, xmm1, _MM_SHUFFLE(3, 3, 1, 0)); // restore q - } - - minps(xmm6, xmm0); - maxps(xmm7, xmm0); - } - - movdqa(xmm0, ptr[edx + eax + 16]); - - if(m_cpu.has(util::Cpu::tSSE41)) - { - pmovzxwd(xmm1, xmm0); - } - else - { - movdqa(xmm1, xmm0); - punpcklwd(xmm1, xmm1); - psrld(xmm1, 16); - } - - psrld(xmm0, 1); - punpcklqdq(xmm1, xmm0); - cvtdq2ps(xmm1, xmm1); - - minps(xmm4, xmm1); - maxps(xmm5, xmm1); - } - - add(ebx, n * sizeof(uint32)); - sub(ecx, n); - - jg("loop"); - - // } - - mov(eax, dword[esp + _min]); - mov(edx, dword[esp + _max]); - - if(color) - { - // m_min.c = cmin.zzzz().u8to32(); - // m_max.c = cmax.zzzz().u8to32(); - - if(m_cpu.has(util::Cpu::tSSE41)) - { - pshufd(xmm2, xmm2, _MM_SHUFFLE(2, 2, 2, 2)); - pmovzxbd(xmm2, xmm2); - - pshufd(xmm3, xmm3, _MM_SHUFFLE(2, 2, 2, 2)); - pmovzxbd(xmm3, xmm3); - } - else - { - pxor(xmm0, xmm0); - - punpckhbw(xmm2, xmm0); - punpcklwd(xmm2, xmm0); - - punpckhbw(xmm3, xmm0); - punpcklwd(xmm3, xmm0); - } - - movaps(ptr[eax + offsetof(GSVertexTrace::Vertex, c)], xmm2); - movaps(ptr[edx + offsetof(GSVertexTrace::Vertex, c)], xmm3); - } - - // m_min.p = pmin.xyww(); - // m_max.p = pmax.xyww(); - - shufps(xmm4, xmm4, _MM_SHUFFLE(3, 3, 1, 0)); - shufps(xmm5, xmm5, _MM_SHUFFLE(3, 3, 1, 0)); - - movaps(ptr[eax + offsetof(GSVertexTrace::Vertex, p)], xmm4); - movaps(ptr[edx + offsetof(GSVertexTrace::Vertex, p)], xmm5); - - if(tme) - { - // m_min.t = tmin; - // m_max.t = tmax; - - movaps(ptr[eax + offsetof(GSVertexTrace::Vertex, t)], xmm6); - movaps(ptr[edx + offsetof(GSVertexTrace::Vertex, t)], xmm7); - } - - pop(ebx); - - ret(); -} - -#endif \ No newline at end of file diff --git a/plugins/GSdx/GSdx.vcxproj b/plugins/GSdx/GSdx.vcxproj index b33203905f..fb69ea004e 100644 --- a/plugins/GSdx/GSdx.vcxproj +++ b/plugins/GSdx/GSdx.vcxproj @@ -618,62 +618,6 @@ - - true - true - true - true - true - true - true - true - true - true - true - true - true - true - - - true - true - true - true - true - true - true - true - true - true - - - true - true - true - true - true - true - true - true - true - true - true - true - true - true - - - true - true - true - true - true - true - true - true - true - true - Create diff --git a/plugins/GSdx/GSdx.vcxproj.filters b/plugins/GSdx/GSdx.vcxproj.filters index 53417912be..a2bc719e26 100644 --- a/plugins/GSdx/GSdx.vcxproj.filters +++ b/plugins/GSdx/GSdx.vcxproj.filters @@ -288,18 +288,6 @@ Source Files - - Source Files - - - Source Files - - - Source Files - - - Source Files - Source Files diff --git a/plugins/GSdx/GSdx_vs2008.vcproj b/plugins/GSdx/GSdx_vs2008.vcproj index 8a4b1a71dc..30da187049 100644 --- a/plugins/GSdx/GSdx_vs2008.vcproj +++ b/plugins/GSdx/GSdx_vs2008.vcproj @@ -1024,6 +1024,10 @@ RelativePath=".\GSRenderer.cpp" > + + @@ -1240,110 +1244,6 @@ RelativePath=".\GSVertexTrace.cpp" > - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - @@ -1630,6 +1530,10 @@ RelativePath=".\GSRenderer.h" > + + diff --git a/plugins/GSdx/res/cs.fx b/plugins/GSdx/res/cs.fx index 7579753e77..594e99e85b 100644 --- a/plugins/GSdx/res/cs.fx +++ b/plugins/GSdx/res/cs.fx @@ -1,73 +1,383 @@ -struct Vertex +#ifndef VS_TME +#define VS_TME 1 +#define VS_FST 1 +#endif + +#ifndef GS_IIP +#define GS_IIP 0 +#define GS_PRIM 2 +#endif + +#ifndef PS_BATCH_SIZE +#define PS_BATCH_SIZE 2048 +#define PS_FPSM PSM_PSMCT32 +#define PS_ZPSM PSM_PSMZ16 +#endif + +#define PSM_PSMCT32 0 +#define PSM_PSMCT24 1 +#define PSM_PSMCT16 2 +#define PSM_PSMCT16S 10 +#define PSM_PSMT8 19 +#define PSM_PSMT4 20 +#define PSM_PSMT8H 27 +#define PSM_PSMT4HL 36 +#define PSM_PSMT4HH 44 +#define PSM_PSMZ32 48 +#define PSM_PSMZ24 49 +#define PSM_PSMZ16 50 +#define PSM_PSMZ16S 58 + +struct VS_INPUT { - float2 st; - uint c; - float q; - uint xy, z; - uint uv, f; + float2 st : TEXCOORD0; + float4 c : COLOR0; + float q : TEXCOORD1; + uint2 p : POSITION0; + uint z : POSITION1; + uint2 uv : TEXCOORD2; + float4 f : COLOR1; +}; + +struct VS_OUTPUT +{ + float4 p : SV_Position; + float2 z : TEXCOORD0; + float4 t : TEXCOORD1; + float4 c : COLOR0; +}; + +struct GS_OUTPUT +{ + float4 p : SV_Position; + float2 z : TEXCOORD0; + float4 t : TEXCOORD1; + float4 c : COLOR0; + uint id : SV_PrimitiveID; +}; + +cbuffer VSConstantBuffer : register(c0) +{ + float4 VertexScale; + float4 VertexOffset; +}; + +cbuffer PSConstantBuffer : register(c0) +{ + uint2 WriteMask; +}; + +struct FragmentLinkItem +{ + uint c, z, id, next; }; RWByteAddressBuffer VideoMemory : register(u0); +RWStructuredBuffer FragmentLinkBuffer : register(u1); +RWByteAddressBuffer StartOffsetBuffer : register(u2); +//RWTexture2D VideoMemory : register(u2); // 8192 * 512 R8_UINT -StructuredBuffer VertexBuffer : register(t0); -Buffer IndexBuffer : register(t1); +Buffer FZRowOffset : register(t0); +Buffer FZColOffset : register(t1); +Texture2D Palette : register(t2); +Texture2D Texture : register(t3); -Buffer FrameRowOffset : register(t2); -Buffer FrameColOffset : register(t3); -Buffer ZBufRowOffset : register(t4); -Buffer ZBufColOffset : register(t5); - -cbuffer DrawingEnvironment : register(c0) +VS_OUTPUT vs_main(VS_INPUT input) { - // TODO -}; + VS_OUTPUT output; -// one group is 16x8 pixels and one thread does 2 pixels, otherwise could not read-merge-write 16-bit targets safely -// neighburing pixels are next to eachother in memory, at least we don't have to calculate the address twice + output.p = float4(input.p, 0.0f, 0.0f) * VertexScale - VertexOffset; + output.z = float2(input.z & 0xffff, input.z >> 16); // TODO: min(input.z, 0xffffff00) ? -// TODO: they say groupshared memory is faster, try unswizzling the corresponding chunk of memory initially (how to do that once by only one thread?) then write-back when finished, unless it was untouched - -[numthreads(8, 8, 1)] -void cs_main(uint3 gid : SV_GroupID, uint3 tid : SV_GroupThreadID) -{ - uint count; - - IndexBuffer.GetDimensions(count); - - // #if GS_PRIM == 2 (triangle) - - for(uint i = 0; i < count; i += 3) + if(VS_TME) { - Vertex v0 = VertexBuffer[IndexBuffer[i + 0]]; - Vertex v1 = VertexBuffer[IndexBuffer[i + 1]]; - Vertex v2 = VertexBuffer[IndexBuffer[i + 2]]; - - uint x = gid.x + tid.x * 2; - uint y = gid.y + tid.y; - - uint fa = FrameRowOffset[y] + FrameColOffset[x]; - uint za = ZBufRowOffset[y] + ZBufColOffset[x]; - - // TODO: quickly reject if x, y is outside the triangle - // TODO: calculate interpolated values at x, y - // TODO: run the GS pipeline - // TODO: repeat for x+1, y - // TODO: output two pixels (might be better to process a single pixel, more threads, if there is no 16-bit target involved) - - // testing... - - uint4 c = VideoMemory.Load4(fa); // does this load 4*4 bytes? or 4 bytes each expanded uint? - - c = (v0.c >> uint4(0, 8, 16, 24)) & 0xff; // => ushr r1.yzw, r1.xxxx, l(0, 8, 16, 24), v0.c auto-converted to uint4 and per-component shift in one instruction, SSE is embarrassed - - VideoMemory.Store4(fa, c); // same question, 4*4 bytes or compressed to uint + if(VS_FST) + { + output.t.xy = input.uv; + output.t.w = 1.0f; + } + else + { + output.t.xy = input.st; + output.t.w = input.q; + } + } + else + { + output.t.xy = 0; + output.t.w = 1.0f; } - // #endif + output.c = input.c; + output.t.z = input.f.r; + + return output; } -// TODO: DrawPoint (this is going to be a waste of resources) -// TODO: DrawLine (line hit-test, will it work?) -// TODO: DrawSprite (similar to DrawTriangle) -// TODO: if read-backs are too slow, implement GSState::Write/FlushWrite/Read/clut.Write in a compute shader -// TODO: unswizzle pages from VideoMemory to the texture cache (if they are marked as valid, otherwise upload from GSLocalMemory::m_vm8) +#if GS_PRIM == 0 + +[maxvertexcount(1)] +void gs_main(point VS_OUTPUT input[1], inout PointStream stream, uint id : SV_PrimitiveID) +{ + GS_OUTPUT output; + + output.p = input[0].p; + output.z = input[0].z; + output.t = input[0].t; + output.c = input[0].c; + output.id = id; + + stream.Append(output); +} + +#elif GS_PRIM == 1 + +[maxvertexcount(2)] +void gs_main(line VS_OUTPUT input[2], inout LineStream stream, uint id : SV_PrimitiveID) +{ + [unroll] + for(int i = 0; i < 2; i++) + { + GS_OUTPUT output; + + output.p = input[i].p; + output.z = input[i].z; + output.t = input[i].t; + output.c = input[i].c; + output.id = id; + +#if GS_IIP == 0 + if(i != 1) output.c = input[1].c; +#endif + + stream.Append(output); + } +} + +#elif GS_PRIM == 2 + +[maxvertexcount(3)] +void gs_main(triangle VS_OUTPUT input[3], inout TriangleStream stream, uint id : SV_PrimitiveID) +{ + [unroll] + for(int i = 0; i < 3; i++) + { + GS_OUTPUT output; + + output.p = input[i].p; + output.z = input[i].z; + output.t = input[i].t; + output.c = input[i].c; + output.id = id; + +#if GS_IIP == 0 + if(i != 2) output.c = input[2].c; +#endif + + stream.Append(output); + } +} + +#elif GS_PRIM == 3 + +[maxvertexcount(4)] +void gs_main(line VS_OUTPUT input[2], inout TriangleStream stream, uint id : SV_PrimitiveID) +{ + GS_OUTPUT lt, rb, lb, rt; + + lt.p = input[0].p; + lt.z = input[1].z; + lt.t.xy = input[0].t.xy; + lt.t.zw = input[1].t.zw; + lt.c = input[0].c; + lt.id = id; + +#if GS_IIP == 0 + lt.c = input[1].c; +#endif + + rb.p = input[1].p; + rb.z = input[1].z; + rb.t = input[1].t; + rb.c = input[1].c; + rb.id = id; + + lb = lt; + lb.p.y = rb.p.y; + lb.t.y = rb.t.y; + + rt = rb; + rt.p.y = lt.p.y; + rt.t.y = lt.t.y; + + stream.Append(lt); + stream.Append(lb); + stream.Append(rt); + stream.Append(rb); +} + +#endif + +uint CompressColor32(float4 f) +{ + uint4 c = (uint4)(f * 0xff) << uint4(0, 8, 16, 24); + + return c.r | c.g | c.b | c.a; +} + +uint DecompressColor16(uint c) +{ + uint r = (c & 0x001f) << 3; + uint g = (c & 0x03e0) << 6; + uint b = (c & 0x7c00) << 9; + uint a = (c & 0x8000) << 15; + + return r | g | b | a; +} + +uint ReadPixel(uint addr) +{ + return VideoMemory.Load(addr) >> ((addr & 2) << 3); +} + +void WritePixel(uint addr, uint value, uint psm) +{ + uint tmp; + + switch(psm) + { + case PSM_PSMCT32: + case PSM_PSMZ32: + case PSM_PSMCT24: + case PSM_PSMZ24: + VideoMemory.Store(addr, value); + break; + case PSM_PSMCT16: + case PSM_PSMCT16S: + case PSM_PSMZ16: + case PSM_PSMZ16S: + tmp = (addr & 2) << 3; + value = ((value << tmp) ^ VideoMemory.Load(addr)) & (0x0000ffff << tmp); + VideoMemory.InterlockedXor(addr, value, tmp); + break; + } +} + +void ps_main0(GS_OUTPUT input) +{ + uint x = (uint)input.p.x; + uint y = (uint)input.p.y; + + uint tail = FragmentLinkBuffer.IncrementCounter(); + + uint index = (y << 11) + x; + uint next = 0; + + StartOffsetBuffer.InterlockedExchange(index * 4, tail, next); + + FragmentLinkItem item; + + // TODO: preprocess color (tfx, alpha test), z-test + + item.c = CompressColor32(input.c); + item.z = (uint)(input.z.y * 0x10000 + input.z.x); + item.id = input.id; + item.next = next; + + FragmentLinkBuffer[tail] = item; +} + +void ps_main1(GS_OUTPUT input) +{ + uint2 pos = (uint2)input.p.xy; + + // sort fragments + + uint StartOffsetIndex = (pos.y << 11) + pos.x; + + int index[PS_BATCH_SIZE]; + int count = 0; + + uint next = StartOffsetBuffer.Load(StartOffsetIndex * 4); + + StartOffsetBuffer.Store(StartOffsetIndex * 4, 0); + + [allow_uav_condition] + while(next != 0) + { + index[count++] = next; + + next = FragmentLinkBuffer[next].next; + } + + int N2 = 1 << (int)(ceil(log2(count))); + + [allow_uav_condition] + for(int i = count; i < N2; i++) + { + index[i] = 0; + } + + [allow_uav_condition] + for(int k = 2; k <= N2; k = 2 * k) + { + [allow_uav_condition] + for(int j = k >> 1; j > 0 ; j = j >> 1) + { + [allow_uav_condition] + for(int i = 0; i < N2; i++) + { + uint i_id = FragmentLinkBuffer[index[i]].id; + + int ixj = i ^ j; + + if(ixj > i) + { + uint ixj_id = FragmentLinkBuffer[index[ixj]].id; + + if((i & k) == 0 && i_id > ixj_id) + { + int temp = index[i]; + index[i] = index[ixj]; + index[ixj] = temp; + } + + if((i & k) != 0 && i_id < ixj_id) + { + int temp = index[i]; + index[i] = index[ixj]; + index[ixj] = temp; + } + } + } + } + } + + uint2 addr = (uint2)(FZRowOffset[pos.y] + FZColOffset[pos.x]) << 1; + + uint dc = ReadPixel(addr.x); + uint dz = ReadPixel(addr.y); + + uint sc = dc; + uint sz = dz; + + [allow_uav_condition] + while(--count >= 0) + { + FragmentLinkItem f = FragmentLinkBuffer[index[count]]; + + // TODO + + if(sz < f.z) + { + sc = f.c; + sz = f.z; + } + } + + uint c = sc; // (dc & ~WriteMask.x) | (sc & WriteMask.x); + uint z = 0;//sz; //(dz & ~WriteMask.y) | (sz & WriteMask.y); + + WritePixel(addr.x, c, PS_FPSM); + WritePixel(addr.y, z, PS_ZPSM); +} diff --git a/plugins/GSdx/res/tfx.fx b/plugins/GSdx/res/tfx.fx index 3ce182dcea..2372bbf1d4 100644 --- a/plugins/GSdx/res/tfx.fx +++ b/plugins/GSdx/res/tfx.fx @@ -40,11 +40,12 @@ struct VS_INPUT { + float2 st : TEXCOORD0; + float4 c : COLOR0; + float q : TEXCOORD1; uint2 p : POSITION0; uint z : POSITION1; - float2 t : TEXCOORD0; - float q : TEXCOORD1; - float4 c : COLOR0; + uint2 uv : TEXCOORD2; float4 f : COLOR1; }; @@ -602,12 +603,12 @@ VS_OUTPUT vs_main(VS_INPUT input) { if(VS_FST) { - output.t.xy = input.t * TextureScale; + output.t.xy = input.uv * TextureScale; output.t.w = 1.0f; } else { - output.t.xy = input.t; + output.t.xy = input.st; output.t.w = input.q; } }