diff --git a/plugins/GSdx/GSCapture.cpp b/plugins/GSdx/GSCapture.cpp index 8cec5a02db..53feaf6878 100644 --- a/plugins/GSdx/GSCapture.cpp +++ b/plugins/GSdx/GSCapture.cpp @@ -249,10 +249,15 @@ public: GSVector4 ys(0.257f, 0.504f, 0.098f, 0.0f); GSVector4 us(-0.148f / 2, -0.291f / 2, 0.439f / 2, 0.0f); GSVector4 vs(0.439f / 2, -0.368f / 2, -0.071f / 2, 0.0f); - const GSVector4 offset(16, 128, 16, 128); - if (!rgba) - ys = ys.zyxw(), us = us.zyxw(), vs = vs.zyxw(); + if(!rgba) + { + ys = ys.zyxw(); + us = us.zyxw(); + vs = vs.zyxw(); + } + + const GSVector4 offset(16, 128, 16, 128); for(int j = 0; j < h; j++, dst += dstpitch, src += srcpitch) { @@ -261,8 +266,8 @@ public: for(int i = 0; i < w; i += 2) { - GSVector4 c0 = GSVector4(s[i + 0]); - GSVector4 c1 = GSVector4(s[i + 1]); + GSVector4 c0 = GSVector4::rgba32(s[i + 0]); + GSVector4 c1 = GSVector4::rgba32(s[i + 1]); GSVector4 c2 = c0 + c1; GSVector4 lo = (c0 * ys).hadd(c2 * us); diff --git a/plugins/GSdx/GSClut.cpp b/plugins/GSdx/GSClut.cpp index e223b72cb6..8481ed7b1e 100644 --- a/plugins/GSdx/GSClut.cpp +++ b/plugins/GSdx/GSClut.cpp @@ -109,7 +109,7 @@ bool GSClut::WriteTest(const GIFRegTEX0& TEX0, const GIFRegTEXCLUT& TEXCLUT) case 4: if(m_CBP[0] == TEX0.CBP) return false; m_CBP[0] = TEX0.CBP; break; case 5: if(m_CBP[1] == TEX0.CBP) return false; m_CBP[1] = TEX0.CBP; break; case 6: ASSERT(0); return false; // ffx2 menu - case 7: ASSERT(0); return false; + case 7: ASSERT(0); return false; // ford mustang racing default: __assume(0); } diff --git a/plugins/GSdx/GSDevice11.cpp b/plugins/GSdx/GSDevice11.cpp index 9c9e7866f1..c080398e29 100644 --- a/plugins/GSdx/GSDevice11.cpp +++ b/plugins/GSdx/GSDevice11.cpp @@ -350,7 +350,7 @@ void GSDevice11::ClearRenderTarget(GSTexture* t, const GSVector4& c) void GSDevice11::ClearRenderTarget(GSTexture* t, uint32 c) { - GSVector4 color = GSVector4(c) * (1.0f / 255); + GSVector4 color = GSVector4::rgba32(c) * (1.0f / 255); m_ctx->ClearRenderTargetView(*(GSTexture11*)t, color.v); } diff --git a/plugins/GSdx/GSDrawScanlineCodeGenerator.x64.avx.cpp b/plugins/GSdx/GSDrawScanlineCodeGenerator.x64.avx.cpp index 069abae945..36797c7f5e 100644 --- a/plugins/GSdx/GSDrawScanlineCodeGenerator.x64.avx.cpp +++ b/plugins/GSdx/GSDrawScanlineCodeGenerator.x64.avx.cpp @@ -25,7 +25,7 @@ #if _M_SSE >= 0x500 && (defined(_M_AMD64) || defined(_WIN64)) -#error TODO: this is still bogus somewhere +#error TODO void GSDrawScanlineCodeGenerator::Generate() { @@ -38,17 +38,13 @@ void GSDrawScanlineCodeGenerator::Generate() push(r12); push(r13); - enter(10 * 16, true); + sub(rsp, 8 + 10 * 16); for(int i = 6; i < 16; i++) { vmovdqa(ptr[rsp + (i - 6) * 16], Xmm(i)); } - movsxd(rcx, ecx); // right - movsxd(rdx, edx); // left - movsxd(r8, r8d); // top - mov(r10, (size_t)&m_test[0]); mov(r11, (size_t)&m_local); mov(r12, (size_t)m_local.gd); @@ -84,7 +80,14 @@ L("loop"); // ebp = za - SampleTexture(); + if(m_sel.mmin) + { + SampleTextureLOD(); + } + else + { + SampleTexture(); + } // ebp = za // xmm2 = rb @@ -201,7 +204,7 @@ L("exit"); vmovdqa(Xmm(i), ptr[rsp + (i - 6) * 16]); } - leave(); + add(rsp, 8 + 10 * 16); pop(r13); pop(r12); @@ -237,10 +240,9 @@ void GSDrawScanlineCodeGenerator::Init() mov(rax, rcx); sar(rax, 63); and(rax, rcx); - add(rax, 7); shl(rax, 4); - vpor(xmm15, ptr[rax + r10]); + vpor(xmm15, ptr[rax + r10 + 7 * 16]); // GSVector2i* fza_base = &m_local.gd->fzbr[top]; @@ -256,8 +258,7 @@ void GSDrawScanlineCodeGenerator::Init() { // edx = &m_local.d[skip] - shl(rdx, 3); - lea(rdx, ptr[rdx + r11 + offsetof(GSScanlineLocalData, d)]); + lea(rdx, ptr[rdx * 8 + r11 + offsetof(GSScanlineLocalData, d)]); } if(!m_sel.sprite) @@ -325,7 +326,7 @@ void GSDrawScanlineCodeGenerator::Init() vpaddd(xmm10, ptr[rdx + offsetof(GSScanlineLocalData::skip, s)]); - if(!m_sel.sprite) + if(!m_sel.sprite || m_sel.mmin) { vpaddd(xmm11, ptr[rdx + offsetof(GSScanlineLocalData::skip, t)]); } @@ -338,12 +339,6 @@ void GSDrawScanlineCodeGenerator::Init() vpsrlw(xmm6, 1); } } - - if(m_sel.mipmap && !m_sel.lcm) - { - vshufps(xmm12, xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2)); - vaddps(xmm12, ptr[rdx + offsetof(GSScanlineLocalData::skip, q)]); - } } else { @@ -441,17 +436,11 @@ void GSDrawScanlineCodeGenerator::Step() vpshufd(xmm1, xmm0, _MM_SHUFFLE(0, 0, 0, 0)); vpaddd(xmm10, xmm1); - if(!m_sel.sprite) + if(!m_sel.sprite || m_sel.mmin) { vpshufd(xmm1, xmm0, _MM_SHUFFLE(1, 1, 1, 1)); vpaddd(xmm11, xmm1); } - - if(m_sel.mipmap && !m_sel.lcm) - { - vshufps(xmm3, xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2)); - vaddps(xmm12, xmm3); - } } else { @@ -510,10 +499,9 @@ void GSDrawScanlineCodeGenerator::Step() mov(rdx, rcx); sar(rdx, 63); and(rdx, rcx); - add(rdx, 7); shl(rdx, 4); - vmovdqa(xmm15, ptr[rdx + r10]); + vmovdqa(xmm15, ptr[rdx + r10 + 7 * 16]); } void GSDrawScanlineCodeGenerator::TestZ(const Xmm& temp1, const Xmm& temp2) @@ -589,11 +577,9 @@ void GSDrawScanlineCodeGenerator::TestZ(const Xmm& temp1, const Xmm& temp2) vpslld(xmm2, 31); // GSVector4i zso = zs - o; - - vpsubd(xmm0, xmm2); - // GSVector4i zdo = zd - o; + vpsubd(xmm0, xmm2); vpsubd(xmm1, xmm2); } @@ -629,11 +615,6 @@ void GSDrawScanlineCodeGenerator::SampleTexture() // ebx = tex - if(m_sel.mipmap && !m_sel.lcm) - { - - } - if(!m_sel.fst) { vrcpps(xmm0, xmm12); @@ -766,10 +747,7 @@ void GSDrawScanlineCodeGenerator::SampleTexture() // c10 = addr10.gather32_32((const uint32/uint8*)tex[, clut]); // c11 = addr11.gather32_32((const uint32/uint8*)tex[, clut]); - ReadTexel(xmm0, xmm0, xmm4, xmm5); - ReadTexel(xmm1, xmm1, xmm4, xmm5); - ReadTexel(xmm2, xmm2, xmm4, xmm5); - ReadTexel(xmm3, xmm3, xmm4, xmm5); + ReadTexel(4, 0); // xmm0 = c00 // xmm1 = c01 @@ -863,7 +841,7 @@ void GSDrawScanlineCodeGenerator::SampleTexture() // c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]); - ReadTexel(xmm2, xmm3, xmm0, xmm1); + ReadTexel(1, 0); // GSVector4i mask = GSVector4i::x00ff(); @@ -1032,6 +1010,18 @@ void GSDrawScanlineCodeGenerator::Wrap(const Xmm& uv0, const Xmm& uv1) } } +void GSDrawScanlineCodeGenerator::SampleTextureLOD() +{ +} + +void GSDrawScanlineCodeGenerator::WrapLOD(const Xmm& uv) +{ +} + +void GSDrawScanlineCodeGenerator::WrapLOD(const Xmm& uv0, const Xmm& uv1) +{ +} + void GSDrawScanlineCodeGenerator::AlphaTFX() { if(!m_sel.fb) @@ -1046,6 +1036,7 @@ void GSDrawScanlineCodeGenerator::AlphaTFX() // gat = gat.modulate16<1>(ga).clamp8(); modulate16(xmm3, xmm14, 1); + clamp16(xmm3, xmm0); // if(!tcc) gat = gat.mix16(ga.srl16(7)); @@ -1053,6 +1044,7 @@ void GSDrawScanlineCodeGenerator::AlphaTFX() if(!m_sel.tcc) { vpsrlw(xmm1, xmm14, 7); + mix16(xmm3, xmm1, xmm0); } @@ -1065,6 +1057,7 @@ void GSDrawScanlineCodeGenerator::AlphaTFX() if(!m_sel.tcc) { vpsrlw(xmm1, xmm14, 7); + mix16(xmm3, xmm1, xmm0); } @@ -1075,7 +1068,12 @@ void GSDrawScanlineCodeGenerator::AlphaTFX() // gat = gat.mix16(!tcc ? ga.srl16(7) : gat.addus8(ga.srl16(7))); vpsrlw(xmm1, xmm14, 7); - if(m_sel.tcc) vpaddusb(xmm1, xmm3); + + if(m_sel.tcc) + { + vpaddusb(xmm1, xmm3); + } + mix16(xmm3, xmm1, xmm0); break; @@ -1087,6 +1085,7 @@ void GSDrawScanlineCodeGenerator::AlphaTFX() if(!m_sel.tcc) { vpsrlw(xmm1, xmm14, 7); + mix16(xmm3, xmm1, xmm0); } @@ -1103,6 +1102,8 @@ void GSDrawScanlineCodeGenerator::AlphaTFX() break; } + + // TODO: aa1 } void GSDrawScanlineCodeGenerator::ReadMask() @@ -1218,6 +1219,7 @@ void GSDrawScanlineCodeGenerator::ColorTFX() // rbt = rbt.modulate16<1>(rb).clamp8(); modulate16(xmm2, xmm13, 1); + clamp16(xmm2, xmm0); break; @@ -1229,22 +1231,28 @@ void GSDrawScanlineCodeGenerator::ColorTFX() case TFX_HIGHLIGHT: case TFX_HIGHLIGHT2: + // gat = gat.modulate16<1>(ga).add16(af).clamp8().mix16(gat); + + vmovdqa(xmm1, xmm3); + + modulate16(xmm3, xmm14, 1); + vpshuflw(xmm6, xmm14, _MM_SHUFFLE(3, 3, 1, 1)); vpshufhw(xmm6, xmm6, _MM_SHUFFLE(3, 3, 1, 1)); vpsrlw(xmm6, 7); - // gat = gat.modulate16<1>(ga).add16(af).clamp8().mix16(gat); - - vmovdqa(xmm1, xmm3); - modulate16(xmm3, xmm14, 1); vpaddw(xmm3, xmm6); + clamp16(xmm3, xmm0); + mix16(xmm3, xmm1, xmm0); // rbt = rbt.modulate16<1>(rb).add16(af).clamp8(); modulate16(xmm2, xmm13, 1); + vpaddw(xmm2, xmm6); + clamp16(xmm2, xmm0); break; @@ -1797,25 +1805,22 @@ void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg64& addr, } } -void GSDrawScanlineCodeGenerator::ReadTexel(const Xmm& dst, const Xmm& addr, const Xmm& temp1, const Xmm& temp2) +void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset) { - ReadTexel(dst, addr, 0); - ReadTexel(dst, addr, 1); - ReadTexel(dst, addr, 2); - ReadTexel(dst, addr, 3); + // TODO } void GSDrawScanlineCodeGenerator::ReadTexel(const Xmm& dst, const Xmm& addr, uint8 i) { const Address& src = m_sel.tlu ? ptr[r12 + rax * 4 + offsetof(GSScanlineGlobalData, clut)] : ptr[rbx + rax * 4]; - vpextrd(eax, addr, i); - - movsxd(rax, eax); + if(i == 0) vmovd(eax, addr); + else vpextrd(eax, addr, i); if(m_sel.tlu) movzx(rax, byte[rbx + rax]); - vpinsrd(dst, src, i); + if(i == 0) vmovd(dst, src); + else vpinsrd(dst, src, i); } #endif \ No newline at end of file diff --git a/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.avx.cpp b/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.avx.cpp index c5a57dc86a..8a8dbb9d66 100644 --- a/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.avx.cpp +++ b/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.avx.cpp @@ -19,9 +19,6 @@ * */ -// TODO: x64 (use the extra regs to avoid spills of zs, zd, uf, vf, rb, ga and keep a few constants in the last two like aref or afix) -// TODO: for edges doing 4 pixels is wasteful (needed memory access * 4) - #include "stdafx.h" #include "GSDrawScanlineCodeGenerator.h" #include "GSVertexSW.h" diff --git a/plugins/GSdx/GSFunctionMap.cpp b/plugins/GSdx/GSFunctionMap.cpp index 098617fda0..8478648819 100644 --- a/plugins/GSdx/GSFunctionMap.cpp +++ b/plugins/GSdx/GSFunctionMap.cpp @@ -21,37 +21,3 @@ #include "stdafx.h" #include "GSFunctionMap.h" - -void GSCodeGenerator::enter(uint32 size, bool align) -{ - #ifdef _M_AMD64 - - push(r15); - mov(r15, rsp); - if(size > 0) sub(rsp, size); - if(align) and(rsp, 0xfffffffffffffff0); - - #else - - push(ebp); - mov(ebp, esp); - if(size > 0) sub(esp, size); - if(align) and(esp, 0xfffffff0); - - #endif -} - -void GSCodeGenerator::leave() -{ - #ifdef _M_AMD64 - - mov(rsp, r15); - pop(r15); - - #else - - mov(esp, ebp); - pop(ebp); - - #endif -} diff --git a/plugins/GSdx/GSFunctionMap.h b/plugins/GSdx/GSFunctionMap.h index d692c0a9c3..55cb474f54 100644 --- a/plugins/GSdx/GSFunctionMap.h +++ b/plugins/GSdx/GSFunctionMap.h @@ -161,9 +161,6 @@ class GSCodeGenerator : public Xbyak::CodeGenerator protected: Xbyak::util::Cpu m_cpu; - void enter(uint32 size, bool align); - void leave(); - public: GSCodeGenerator(void* code, size_t maxsize) : Xbyak::CodeGenerator(maxsize, code) diff --git a/plugins/GSdx/GSRendererDX.h b/plugins/GSdx/GSRendererDX.h index 9534570ae3..2e0a4221f5 100644 --- a/plugins/GSdx/GSRendererDX.h +++ b/plugins/GSdx/GSRendererDX.h @@ -263,7 +263,7 @@ public: { ps_sel.fog = 1; - ps_cb.FogColor_AREF = GSVector4(env.FOGCOL.u32[0]) / 255; + ps_cb.FogColor_AREF = GSVector4::rgba32(env.FOGCOL.u32[0]) / 255; } if(context->TEST.ATE) diff --git a/plugins/GSdx/GSRendererSW.cpp b/plugins/GSdx/GSRendererSW.cpp index 0477b20116..bae4da6616 100644 --- a/plugins/GSdx/GSRendererSW.cpp +++ b/plugins/GSdx/GSRendererSW.cpp @@ -384,7 +384,7 @@ bool GSRendererSW::GetScanlineGlobalData(GSScanlineGlobalData& gd) if((int)m_vt.m_lod.x >= (int)context->TEX1.MXL) { - k = (int)m_vt.m_lod.x << 16; // set lod to max + k = (int)m_vt.m_lod.x << 16; // set lod to max level gd.sel.lcm = 1; // lod is constant gd.sel.mmin = 1; // tri-linear is meaningless @@ -432,7 +432,11 @@ bool GSRendererSW::GetScanlineGlobalData(GSScanlineGlobalData& gd) static int s_counter = 0; - //t->Save(format("c:/temp1/%08d_%05x_0.bmp", s_counter, context->TEX0.TBP0)); + if(0) + //if(context->TEX0.TH > context->TEX0.TW) + //if(s_n >= s_saven && s_n < s_saven + 3) + //if(context->TEX0.TBP0 >= 0x2b80 && context->TEX0.TBW == 2 && context->TEX0.PSM == PSM_PSMT4) + t->Save(format("c:/temp1/%08d_%05x_0.bmp", s_counter, context->TEX0.TBP0)); for(int i = 1, j = std::min((int)context->TEX1.MXL, 6); i <= j; i++) { @@ -487,7 +491,28 @@ bool GSRendererSW::GetScanlineGlobalData(GSScanlineGlobalData& gd) gd.tex[i] = t->m_buff; - // t->Save(format("c:/temp1/%08d_%05x_%d.bmp", s_counter, context->TEX0.TBP0, i)); + if(0) + //if(context->TEX0.TH > context->TEX0.TW) + //if(s_n >= s_saven && s_n < s_saven + 3) + //if(context->TEX0.TBP0 >= 0x2b80 && context->TEX0.TBW == 2 && context->TEX0.PSM == PSM_PSMT4) + { + t->Save(format("c:/temp1/%08d_%05x_%d.bmp", s_counter, context->TEX0.TBP0, i)); + /* + GIFRegTEX0 TEX0 = MIP_TEX0; + TEX0.TBP0 = context->TEX0.TBP0; + do + { + TEX0.TBP0++; + const GSTextureCacheSW::Texture* t = m_tc->Lookup(TEX0, env.TEXA, r, gd.sel.tw + 3); + if(t == NULL) {ASSERT(0); return false;} + t->Save(format("c:/temp1/%08d_%05x_%d.bmp", s_counter, TEX0.TBP0, i)); + } + while(TEX0.TBP0 < 0x3fff); + */ + + int i = 0; + } + } s_counter++; @@ -701,38 +726,31 @@ void GSRendererSW::VertexKick(bool skip) { const GSDrawingContext* context = m_context; - GSVector4i xy = GSVector4i::load((int)m_v.XYZ.u32[0]); + GSVertexSW& dst = m_vl.AddTail(); - xy = xy.insert16<3>(m_v.FOG.F); - xy = xy.upl16(); - xy -= context->XYOFFSET; + GSVector4i xy = GSVector4i::load((int)m_v.XYZ.u32[0]).upl16() - context->XYOFFSET; + GSVector4i zf = GSVector4i((int)std::min(m_v.XYZ.Z, 0xffffff00), m_v.FOG.F); - GSVertexSW v; - - v.p = GSVector4(xy) * g_pos_scale; - - v.c = GSVector4(GSVector4i::load((int)m_v.RGBAQ.u32[0]).u8to32() << 7); + dst.p = GSVector4(xy).xyxy(GSVector4(zf) + (GSVector4::m_x4f800000 & GSVector4::cast(zf.sra32(31)))) * g_pos_scale; if(tme) { + GSVector4 t; + if(fst) { - v.t = GSVector4(((GSVector4i)m_v.UV).upl16() << (16 - 4)); + t = GSVector4(((GSVector4i)m_v.UV).upl16() << (16 - 4)); } else { - v.t = GSVector4(m_v.ST.S, m_v.ST.T); - v.t *= GSVector4(0x10000 << context->TEX0.TW, 0x10000 << context->TEX0.TH); + t = GSVector4(m_v.ST.S, m_v.ST.T) * GSVector4(0x10000 << context->TEX0.TW, 0x10000 << context->TEX0.TH); + t = t.xyxy(GSVector4::load(m_v.RGBAQ.Q)); } - v.t = v.t.xyxy(GSVector4::load(m_v.RGBAQ.Q)); + dst.t = t; } - GSVertexSW& dst = m_vl.AddTail(); - - dst = v; - - dst.p.z = (float)min(m_v.XYZ.Z, 0xffffff00); // max value which can survive the uint32 => float => uint32 conversion + dst.c = GSVector4::rgba32(m_v.RGBAQ.u32[0], 7); int count = 0; diff --git a/plugins/GSdx/GSSetupPrimCodeGenerator.x64.avx.cpp b/plugins/GSdx/GSSetupPrimCodeGenerator.x64.avx.cpp index 7c9c7fd224..06c99c945b 100644 --- a/plugins/GSdx/GSSetupPrimCodeGenerator.x64.avx.cpp +++ b/plugins/GSdx/GSSetupPrimCodeGenerator.x64.avx.cpp @@ -29,7 +29,7 @@ using namespace Xbyak; void GSSetupPrimCodeGenerator::Generate() { - enter(32, true); + sub(rsp, 8 + 2 * 16); vmovdqa(ptr[rsp + 0], xmm6); vmovdqa(ptr[rsp + 16], xmm7); @@ -55,7 +55,7 @@ void GSSetupPrimCodeGenerator::Generate() vmovdqa(xmm6, ptr[rsp + 0]); vmovdqa(xmm7, ptr[rsp + 16]); - leave(); + add(rsp, 8 + 2 * 16); ret(); } @@ -186,16 +186,11 @@ void GSSetupPrimCodeGenerator::Texture() if(m_sel.fst) { - // m_local.d4.st = GSVector4i(t * 4.0f); - - if(m_sel.mipmap && !m_sel.lcm) - { - vmovhps(ptr[r8 + offsetof(GSScanlineLocalData, d4.stq.z)], xmm1); - } + // m_local.d4.stq = GSVector4i(t * 4.0f); vcvttps2dq(xmm1, xmm1); - vmovq(ptr[r8 + offsetof(GSScanlineLocalData, d4.stq)], xmm1); + vmovdqa(ptr[r8 + offsetof(GSScanlineLocalData, d4.stq)], xmm1); } else { @@ -204,7 +199,7 @@ void GSSetupPrimCodeGenerator::Texture() vmovaps(ptr[r8 + offsetof(GSScanlineLocalData, d4.stq)], xmm1); } - for(int j = 0, k = m_sel.fst && !(m_sel.mipmap && !m_sel.lcm) ? 2 : 3; j < k; j++) + for(int j = 0, k = m_sel.fst ? 2 : 3; j < k; j++) { // GSVector4 ds = t.xxxx(); // GSVector4 dt = t.yyyy(); @@ -218,7 +213,7 @@ void GSSetupPrimCodeGenerator::Texture() vmulps(xmm2, xmm1, Xmm(4 + i)); - if(m_sel.fst && !(m_sel.mipmap && !m_sel.lcm)) + if(m_sel.fst) { // m_local.d[i].s/t = GSVector4i(v); diff --git a/plugins/GSdx/GSSetupPrimCodeGenerator.x64.cpp b/plugins/GSdx/GSSetupPrimCodeGenerator.x64.cpp index 4492c8e837..e599c97f71 100644 --- a/plugins/GSdx/GSSetupPrimCodeGenerator.x64.cpp +++ b/plugins/GSdx/GSSetupPrimCodeGenerator.x64.cpp @@ -29,7 +29,7 @@ using namespace Xbyak; void GSSetupPrimCodeGenerator::Generate() { - enter(32, true); + sub(rsp, 8 + 2 * 16); vmovdqa(ptr[rsp + 0], xmm6); vmovdqa(ptr[rsp + 16], xmm7); @@ -53,7 +53,7 @@ void GSSetupPrimCodeGenerator::Generate() vmovdqa(xmm6, ptr[rsp + 0]); vmovdqa(xmm7, ptr[rsp + 16]); - leave(); + add(rsp, 8 + 2 * 16); ret(); } @@ -191,16 +191,11 @@ void GSSetupPrimCodeGenerator::Texture() if(m_sel.fst) { - // m_local.d4.st = GSVector4i(t * 4.0f); - - if(m_sel.mipmap && !m_sel.lcm) - { - movhps(ptr[r8 + offsetof(GSScanlineLocalData, d4.stq.z)], xmm1); - } + // m_local.d4.stq = GSVector4i(t * 4.0f); cvttps2dq(xmm1, xmm1); - movq(ptr[r8 + offsetof(GSScanlineLocalData, d4.stq)], xmm1); + movdqa(ptr[r8 + offsetof(GSScanlineLocalData, d4.stq)], xmm1); } else { @@ -209,7 +204,7 @@ void GSSetupPrimCodeGenerator::Texture() movaps(ptr[r8 + offsetof(GSScanlineLocalData, d4.stq)], xmm1); } - for(int j = 0, k = m_sel.fst && !(m_sel.mipmap && !m_sel.lcm) ? 2 : 3; j < k; j++) + for(int j = 0, k = m_sel.fst ? 2 : 3; j < k; j++) { // GSVector4 ds = t.xxxx(); // GSVector4 dt = t.yyyy(); @@ -225,7 +220,7 @@ void GSSetupPrimCodeGenerator::Texture() movaps(xmm2, xmm1); mulps(xmm2, Xmm(4 + i)); - if(m_sel.fst && !(m_sel.mipmap && !m_sel.lcm)) + if(m_sel.fst) { // m_local.d[i].s/t = GSVector4i(v); diff --git a/plugins/GSdx/GSSetupPrimCodeGenerator.x86.cpp b/plugins/GSdx/GSSetupPrimCodeGenerator.x86.cpp index 3ca05cd23a..67be7a7b36 100644 --- a/plugins/GSdx/GSSetupPrimCodeGenerator.x86.cpp +++ b/plugins/GSdx/GSSetupPrimCodeGenerator.x86.cpp @@ -322,8 +322,7 @@ void GSSetupPrimCodeGenerator::Color() { // GSVector4i c = GSVector4i(vertices[0].c); - movaps(xmm0, ptr[ecx + offsetof(GSVertexSW, c)]); - cvttps2dq(xmm0, xmm0); + cvttps2dq(xmm0, ptr[ecx + offsetof(GSVertexSW, c)]); // c = c.upl16(c.zwxy()); diff --git a/plugins/GSdx/GSState.cpp b/plugins/GSdx/GSState.cpp index b6cf2f19c7..cf6caf6baa 100644 --- a/plugins/GSdx/GSState.cpp +++ b/plugins/GSdx/GSState.cpp @@ -590,41 +590,41 @@ template void GSState::GIFRegHandlerTEX0(const GIFReg* r) if(m_env.CTXT[i].TEX1.MTBA) { + // NOTE 1: TEX1.MXL must not be automatically set to 3 here. + // NOTE 2: Mipmap levels are tightly packed, if (tbw << 6) > (1 << tw) then the left-over space to the right is used. (common for PSM_PSMT4) + // NOTE 3: Non-rectangular textures are treated as rectangular when calculating the occupied space (height is extended, not sure about width) + + uint32 bp = TEX0.TBP0; + uint32 bw = TEX0.TBW; + uint32 w = 1u << TEX0.TW; + uint32 h = 1u << TEX0.TH; uint32 bpp = GSLocalMemory::m_psm[TEX0.PSM].bpp; - uint32 tbp = TEX0.TBP0; - uint32 tbw = TEX0.TBW; - uint32 th = TEX0.TH; + if(h < w) h = w; - if(th >= 3) - { - tbp += (((tbw << 6) * (1 << th) * bpp >> 3) + 255) >> 8; - tbw = std::max(tbw >> 1, 1); - th--; + bp += ((w * h * bpp >> 3) + 255) >> 8; + bw = std::max(bw >> 1, 1); + w = std::max(w >> 1, 1); + h = std::max(h >> 1, 1); - m_env.CTXT[i].MIPTBP1.TBP1 = tbp; - m_env.CTXT[i].MIPTBP1.TBW1 = tbw; + m_env.CTXT[i].MIPTBP1.TBP1 = bp; + m_env.CTXT[i].MIPTBP1.TBW1 = bw; - tbp += (((tbw << 6) * (1 << th) * bpp >> 3) + 255) >> 8; - tbw = std::max(tbw >> 1, 1); - th--; + bp += ((w * h * bpp >> 3) + 255) >> 8; + bw = std::max(bw >> 1, 1); + w = std::max(w >> 1, 1); + h = std::max(h >> 1, 1); - m_env.CTXT[i].MIPTBP1.TBP2 = tbp; - m_env.CTXT[i].MIPTBP1.TBW2 = tbw; + m_env.CTXT[i].MIPTBP1.TBP2 = bp; + m_env.CTXT[i].MIPTBP1.TBW2 = bw; - tbp += (((tbw << 6) * (1 << th) * bpp >> 3) + 255) >> 8; - tbw = std::max(tbw >> 1, 1); - th--; + bp += ((w * h * bpp >> 3) + 255) >> 8; + bw = std::max(bw >> 1, 1); + w = std::max(w >> 1, 1); + h = std::max(h >> 1, 1); - m_env.CTXT[i].MIPTBP1.TBP3 = tbp; - m_env.CTXT[i].MIPTBP1.TBW3 = tbw; - - // NOTE: TEX1.MXL must not be automatically set to 3 here - } - else - { - ASSERT(0); - } + m_env.CTXT[i].MIPTBP1.TBP3 = bp; + m_env.CTXT[i].MIPTBP1.TBW3 = bw; // printf("MTBA\n"); } diff --git a/plugins/GSdx/GSVector.cpp b/plugins/GSdx/GSVector.cpp index eb2769744a..47e724966c 100644 --- a/plugins/GSdx/GSVector.cpp +++ b/plugins/GSdx/GSVector.cpp @@ -29,6 +29,7 @@ const GSVector4 GSVector4::m_one(1.0f); const GSVector4 GSVector4::m_two(2.0f); const GSVector4 GSVector4::m_four(4.0f); const GSVector4 GSVector4::m_x4b000000(_mm_castsi128_ps(_mm_set1_epi32(0x4b000000))); +const GSVector4 GSVector4::m_x4f800000(_mm_castsi128_ps(_mm_set1_epi32(0x4f800000))); GSVector4i GSVector4i::fit(int arx, int ary) const { diff --git a/plugins/GSdx/GSVector.h b/plugins/GSdx/GSVector.h index 3b37743a3a..8d14d091e8 100644 --- a/plugins/GSdx/GSVector.h +++ b/plugins/GSdx/GSVector.h @@ -2333,6 +2333,7 @@ public: static const GSVector4 m_two; static const GSVector4 m_four; static const GSVector4 m_x4b000000; + static const GSVector4 m_x4f800000; __forceinline GSVector4() { @@ -2385,9 +2386,18 @@ public: this->m = m; } - __forceinline explicit GSVector4(uint32 u32) + __forceinline explicit GSVector4(int i) { - *this = GSVector4(GSVector4i::load((int)u32).u8to32()); + GSVector4i v((int)i); + + *this = GSVector4(v); + } + + __forceinline explicit GSVector4(uint32 u) + { + GSVector4i v((int)u); + + *this = GSVector4(v) + (m_x4f800000 & GSVector4::cast(v.sra32(31))); } __forceinline explicit GSVector4(const GSVector4i& v); @@ -2407,11 +2417,6 @@ public: this->m = m; } - __forceinline void operator = (uint32 u32) - { - *this = GSVector4(GSVector4i::load((int)u32).u8to32()); - } - __forceinline operator __m128() const { return m; @@ -2422,6 +2427,16 @@ public: return GSVector4i(*this).rgba32(); } + __forceinline static GSVector4 rgba32(uint32 rgba) + { + return GSVector4(GSVector4i::load((int)rgba).u8to32()); + } + + __forceinline static GSVector4 rgba32(uint32 rgba, int shift) + { + return GSVector4(GSVector4i::load((int)rgba).u8to32() << shift); + } + __forceinline static GSVector4 cast(const GSVector4i& v); __forceinline GSVector4 abs() const @@ -2840,6 +2855,13 @@ public: return GSVector4(_mm_load_ss(&f)); } + __forceinline static GSVector4 load(uint32 u) + { + GSVector4i v = GSVector4i::load((int)u); + + return GSVector4(v) + (m_x4f800000 & GSVector4::cast(v.sra32(31))); + } + template __forceinline static GSVector4 load(const void* p) { return GSVector4(aligned ? _mm_load_ps((const float*)p) : _mm_loadu_ps((const float*)p)); diff --git a/plugins/GSdx/GSVertexTrace.x64.avx.cpp b/plugins/GSdx/GSVertexTrace.x64.avx.cpp index 3624ed5b6b..880e5644e4 100644 --- a/plugins/GSdx/GSVertexTrace.x64.avx.cpp +++ b/plugins/GSdx/GSVertexTrace.x64.avx.cpp @@ -51,7 +51,7 @@ GSVertexTrace::CGSW::CGSW(const void* param, uint32 key, void* code, size_t maxs break; } - enter(32, true); + sub(rsp, 8 + 2 * 16); vmovdqa(ptr[rsp + 0], xmm6); vmovdqa(ptr[rsp + 16], xmm7); @@ -168,7 +168,7 @@ GSVertexTrace::CGSW::CGSW(const void* param, uint32 key, void* code, size_t maxs vmovdqa(xmm6, ptr[rsp + 0]); vmovdqa(xmm7, ptr[rsp + 16]); - leave(); + add(rsp, 8 + 2 * 16); ret(); } @@ -200,7 +200,7 @@ GSVertexTrace::CGHW9::CGHW9(const void* param, uint32 key, void* code, size_t ma break; } - enter(32, true); + sub(rsp, 8 + 2 * 16); vmovdqa(ptr[rsp + 0], xmm6); vmovdqa(ptr[rsp + 16], xmm7); @@ -334,7 +334,7 @@ GSVertexTrace::CGHW9::CGHW9(const void* param, uint32 key, void* code, size_t ma vmovdqa(xmm6, ptr[rsp + 0]); vmovdqa(xmm7, ptr[rsp + 16]); - leave(); + add(rsp, 8 + 2 * 16); ret(); } @@ -364,7 +364,7 @@ GSVertexTrace::CGHW11::CGHW11(const void* param, uint32 key, void* code, size_t break; } - enter(32, true); + sub(rsp, 8 + 2 * 16); vmovdqa(ptr[rsp + 0], xmm6); vmovdqa(ptr[rsp + 16], xmm7); @@ -488,7 +488,7 @@ GSVertexTrace::CGHW11::CGHW11(const void* param, uint32 key, void* code, size_t vmovdqa(xmm6, ptr[rsp + 0]); vmovdqa(xmm7, ptr[rsp + 16]); - leave(); + add(rsp, 8 + 2 * 16); ret(); } diff --git a/plugins/GSdx/GSVertexTrace.x64.cpp b/plugins/GSdx/GSVertexTrace.x64.cpp index e7e9a0a0f2..8dfc6db296 100644 --- a/plugins/GSdx/GSVertexTrace.x64.cpp +++ b/plugins/GSdx/GSVertexTrace.x64.cpp @@ -51,7 +51,7 @@ GSVertexTrace::CGSW::CGSW(const void* param, uint32 key, void* code, size_t maxs break; } - enter(32, true); + sub(rsp, 8 + 2 * 16); movdqa(ptr[rsp + 0], xmm6); movdqa(ptr[rsp + 16], xmm7); @@ -172,7 +172,7 @@ GSVertexTrace::CGSW::CGSW(const void* param, uint32 key, void* code, size_t maxs movdqa(xmm6, ptr[rsp + 0]); movdqa(xmm7, ptr[rsp + 16]); - leave(); + add(rsp, 8 + 2 * 16); ret(); } @@ -204,7 +204,7 @@ GSVertexTrace::CGHW9::CGHW9(const void* param, uint32 key, void* code, size_t ma break; } - enter(32, true); + sub(rsp, 8 + 2 * 16); movdqa(ptr[rsp + 0], xmm6); movdqa(ptr[rsp + 16], xmm7); @@ -355,7 +355,7 @@ GSVertexTrace::CGHW9::CGHW9(const void* param, uint32 key, void* code, size_t ma movdqa(xmm6, ptr[rsp + 0]); movdqa(xmm7, ptr[rsp + 16]); - leave(); + add(rsp, 8 + 2 * 16); ret(); } @@ -385,7 +385,7 @@ GSVertexTrace::CGHW11::CGHW11(const void* param, uint32 key, void* code, size_t break; } - enter(32, true); + sub(rsp, 8 + 2 * 16); movdqa(ptr[rsp + 0], xmm6); movdqa(ptr[rsp + 16], xmm7); @@ -535,7 +535,7 @@ GSVertexTrace::CGHW11::CGHW11(const void* param, uint32 key, void* code, size_t movdqa(xmm6, ptr[rsp + 0]); movdqa(xmm7, ptr[rsp + 16]); - leave(); + add(rsp, 8 + 2 * 16); ret(); }