diff --git a/plugins/GSdx/GSDrawScanline.cpp b/plugins/GSdx/GSDrawScanline.cpp index fcad7494df..2f9613f8a4 100644 --- a/plugins/GSdx/GSDrawScanline.cpp +++ b/plugins/GSdx/GSDrawScanline.cpp @@ -324,7 +324,7 @@ void GSDrawScanline::DrawScanline(int pixels, int left, int top, const GSVertexS } else if(sel.ltf) { - vf = v.xxzzlh().srl16(16 - GS_BILINEAR_PRECISION).sll16(15 - GS_BILINEAR_PRECISION); + vf = v.xxzzlh().srl16(12); } s = GSVector4::cast(u); @@ -514,8 +514,8 @@ void GSDrawScanline::DrawScanline(int pixels, int left, int top, const GSVertexS u -= 0x8000; v -= 0x8000; - uf = u.xxzzlh().srl16(16 - GS_BILINEAR_PRECISION).sll16(15 - GS_BILINEAR_PRECISION); - vf = v.xxzzlh().srl16(16 - GS_BILINEAR_PRECISION).sll16(15 - GS_BILINEAR_PRECISION); + uf = u.xxzzlh().srl16(12); + vf = v.xxzzlh().srl16(12); } GSVector4i uv0 = u.sra32(16).ps32(v.sra32(16)); @@ -581,19 +581,19 @@ void GSDrawScanline::DrawScanline(int pixels, int left, int top, const GSVertexS GSVector4i rb01 = c01.sll16(8).srl16(8); GSVector4i ga01 = c01.srl16(8); - rb00 = rb00.lerp16<0>(rb01, uf); - ga00 = ga00.lerp16<0>(ga01, uf); + rb00 = rb00.lerp16_4(rb01, uf); + ga00 = ga00.lerp16_4(ga01, uf); GSVector4i rb10 = c10.sll16(8).srl16(8); GSVector4i ga10 = c10.srl16(8); GSVector4i rb11 = c11.sll16(8).srl16(8); GSVector4i ga11 = c11.srl16(8); - rb10 = rb10.lerp16<0>(rb11, uf); - ga10 = ga10.lerp16<0>(ga11, uf); + rb10 = rb10.lerp16_4(rb11, uf); + ga10 = ga10.lerp16_4(ga11, uf); - rb = rb00.lerp16<0>(rb10, vf); - ga = ga00.lerp16<0>(ga10, vf); + rb = rb00.lerp16_4(rb10, vf); + ga = ga00.lerp16_4(ga10, vf); } else { @@ -635,8 +635,8 @@ void GSDrawScanline::DrawScanline(int pixels, int left, int top, const GSVertexS u -= 0x8000; v -= 0x8000; - uf = u.xxzzlh().srl16(16 - GS_BILINEAR_PRECISION).sll16(15 - GS_BILINEAR_PRECISION); - vf = v.xxzzlh().srl16(16 - GS_BILINEAR_PRECISION).sll16(15 - GS_BILINEAR_PRECISION); + uf = u.xxzzlh().srl16(12); + vf = v.xxzzlh().srl16(12); } GSVector4i uv0 = 
u.sra32(16).ps32(v.sra32(16)); @@ -702,19 +702,19 @@ void GSDrawScanline::DrawScanline(int pixels, int left, int top, const GSVertexS GSVector4i rb01 = c01.sll16(8).srl16(8); GSVector4i ga01 = c01.srl16(8); - rb00 = rb00.lerp16<0>(rb01, uf); - ga00 = ga00.lerp16<0>(ga01, uf); + rb00 = rb00.lerp16_4(rb01, uf); + ga00 = ga00.lerp16_4(ga01, uf); GSVector4i rb10 = c10.sll16(8).srl16(8); GSVector4i ga10 = c10.srl16(8); GSVector4i rb11 = c11.sll16(8).srl16(8); GSVector4i ga11 = c11.srl16(8); - rb10 = rb10.lerp16<0>(rb11, uf); - ga10 = ga10.lerp16<0>(ga11, uf); + rb10 = rb10.lerp16_4(rb11, uf); + ga10 = ga10.lerp16_4(ga11, uf); - rb2 = rb00.lerp16<0>(rb10, vf); - ga2 = ga00.lerp16<0>(ga10, vf); + rb2 = rb00.lerp16_4(rb10, vf); + ga2 = ga00.lerp16_4(ga10, vf); } else { @@ -745,7 +745,7 @@ void GSDrawScanline::DrawScanline(int pixels, int left, int top, const GSVertexS rb = rb.lerp16<0>(rb2, lodf); ga = ga.lerp16<0>(ga2, lodf); - } + } } else { @@ -770,11 +770,11 @@ void GSDrawScanline::DrawScanline(int pixels, int left, int top, const GSVertexS if(sel.ltf) { - uf = u.xxzzlh().srl16(16 - GS_BILINEAR_PRECISION).sll16(15 - GS_BILINEAR_PRECISION); + uf = u.xxzzlh().srl16(12); if(sel.prim != GS_SPRITE_CLASS) { - vf = v.xxzzlh().srl16(16 - GS_BILINEAR_PRECISION).sll16(15 - GS_BILINEAR_PRECISION); + vf = v.xxzzlh().srl16(12); } } @@ -835,19 +835,19 @@ void GSDrawScanline::DrawScanline(int pixels, int left, int top, const GSVertexS GSVector4i rb01 = c01.sll16(8).srl16(8); GSVector4i ga01 = c01.srl16(8); - rb00 = rb00.lerp16<0>(rb01, uf); - ga00 = ga00.lerp16<0>(ga01, uf); + rb00 = rb00.lerp16_4(rb01, uf); + ga00 = ga00.lerp16_4(ga01, uf); GSVector4i rb10 = c10.sll16(8).srl16(8); GSVector4i ga10 = c10.srl16(8); GSVector4i rb11 = c11.sll16(8).srl16(8); GSVector4i ga11 = c11.srl16(8); - rb10 = rb10.lerp16<0>(rb11, uf); - ga10 = ga10.lerp16<0>(ga11, uf); + rb10 = rb10.lerp16_4(rb11, uf); + ga10 = ga10.lerp16_4(ga11, uf); - rb = rb00.lerp16<0>(rb10, vf); - ga = ga00.lerp16<0>(ga10, 
vf); + rb = rb00.lerp16_4(rb10, vf); + ga = ga00.lerp16_4(ga10, vf); } else { diff --git a/plugins/GSdx/GSDrawScanlineCodeGenerator.cpp b/plugins/GSdx/GSDrawScanlineCodeGenerator.cpp index 7309f75fed..e2d731978f 100644 --- a/plugins/GSdx/GSDrawScanlineCodeGenerator.cpp +++ b/plugins/GSdx/GSDrawScanlineCodeGenerator.cpp @@ -97,6 +97,25 @@ void GSDrawScanlineCodeGenerator::lerp16(const Xmm& a, const Xmm& b, const Xmm& #endif } +void GSDrawScanlineCodeGenerator::lerp16_4(const Xmm& a, const Xmm& b, const Xmm& f) +{ + #if _M_SSE >= 0x500 + + vpsubw(a, b); + vpmullw(a, f); + vpsraw(a, 4); + vpaddw(a, b); + + #else + + psubw(a, b); + pmullw(a, f); + psraw(a, 4); + paddw(a, b); + + #endif +} + void GSDrawScanlineCodeGenerator::mix16(const Xmm& a, const Xmm& b, const Xmm& temp) { #if _M_SSE >= 0x500 diff --git a/plugins/GSdx/GSDrawScanlineCodeGenerator.h b/plugins/GSdx/GSDrawScanlineCodeGenerator.h index 4b5b6c746d..73f4b59f04 100644 --- a/plugins/GSdx/GSDrawScanlineCodeGenerator.h +++ b/plugins/GSdx/GSDrawScanlineCodeGenerator.h @@ -71,6 +71,7 @@ class GSDrawScanlineCodeGenerator : public GSCodeGenerator void modulate16(const Xmm& a, const Operand& f, int shift); void lerp16(const Xmm& a, const Xmm& b, const Xmm& f, int shift); + void lerp16_4(const Xmm& a, const Xmm& b, const Xmm& f); void mix16(const Xmm& a, const Xmm& b, const Xmm& temp); void clamp16(const Xmm& a, const Xmm& temp); void alltrue(); diff --git a/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.avx.cpp b/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.avx.cpp index c0d938f10c..374b3d24fd 100644 --- a/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.avx.cpp +++ b/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.avx.cpp @@ -389,8 +389,7 @@ void GSDrawScanlineCodeGenerator::Init() { vpshuflw(xmm6, xmm3, _MM_SHUFFLE(2, 2, 0, 0)); vpshufhw(xmm6, xmm6, _MM_SHUFFLE(2, 2, 0, 0)); - vpsrlw(xmm6, 16 - GS_BILINEAR_PRECISION); - if(GS_BILINEAR_PRECISION < 15) vpsllw(xmm6, 15 - GS_BILINEAR_PRECISION); + vpsrlw(xmm6, 12); 
vmovdqa(ptr[&m_local.temp.vf], xmm6); } } @@ -743,8 +742,7 @@ void GSDrawScanlineCodeGenerator::SampleTexture() vpshuflw(xmm0, xmm2, _MM_SHUFFLE(2, 2, 0, 0)); vpshufhw(xmm0, xmm0, _MM_SHUFFLE(2, 2, 0, 0)); - vpsrlw(xmm0, 16 - GS_BILINEAR_PRECISION); - if(GS_BILINEAR_PRECISION < 15) vpsllw(xmm0, 15 - GS_BILINEAR_PRECISION); + vpsrlw(xmm0, 12); vmovdqa(ptr[&m_local.temp.uf], xmm0); if(m_sel.prim != GS_SPRITE_CLASS) @@ -753,8 +751,7 @@ void GSDrawScanlineCodeGenerator::SampleTexture() vpshuflw(xmm0, xmm3, _MM_SHUFFLE(2, 2, 0, 0)); vpshufhw(xmm0, xmm0, _MM_SHUFFLE(2, 2, 0, 0)); - vpsrlw(xmm0, 16 - GS_BILINEAR_PRECISION); - if(GS_BILINEAR_PRECISION < 15) vpsllw(xmm0, 15 - GS_BILINEAR_PRECISION); + vpsrlw(xmm0, 12); vmovdqa(ptr[&m_local.temp.vf], xmm0); } } @@ -878,11 +875,11 @@ void GSDrawScanlineCodeGenerator::SampleTexture() // xmm5 = c11 // xmm7 = used - // rb00 = rb00.lerp16<0>(rb01, uf); - // ga00 = ga00.lerp16<0>(ga01, uf); + // rb00 = rb00.lerp16_4(rb01, uf); + // ga00 = ga00.lerp16_4(ga01, uf); - lerp16(xmm3, xmm2, xmm0, 0); - lerp16(xmm4, xmm6, xmm0, 0); + lerp16_4(xmm3, xmm2, xmm0); + lerp16_4(xmm4, xmm6, xmm0); // xmm0 = uf // xmm3 = rb00 @@ -915,11 +912,11 @@ void GSDrawScanlineCodeGenerator::SampleTexture() // xmm6 = ga11 // xmm7 = used - // rb10 = rb10.lerp16<0>(rb11, uf); - // ga10 = ga10.lerp16<0>(ga11, uf); + // rb10 = rb10.lerp16_4(rb11, uf); + // ga10 = ga10.lerp16_4(ga11, uf); - lerp16(xmm5, xmm1, xmm0, 0); - lerp16(xmm6, xmm2, xmm0, 0); + lerp16_4(xmm5, xmm1, xmm0); + lerp16_4(xmm6, xmm2, xmm0); // xmm3 = rb00 // xmm4 = ga00 @@ -928,13 +925,13 @@ void GSDrawScanlineCodeGenerator::SampleTexture() // xmm0, xmm1, xmm2 = free // xmm7 = used - // rb00 = rb00.lerp16<0>(rb10, vf); - // ga00 = ga00.lerp16<0>(ga10, vf); + // rb00 = rb00.lerp16_4(rb10, vf); + // ga00 = ga00.lerp16_4(ga10, vf); vmovdqa(xmm0, ptr[&m_local.temp.vf]); - lerp16(xmm5, xmm3, xmm0, 0); - lerp16(xmm6, xmm4, xmm0, 0); + lerp16_4(xmm5, xmm3, xmm0); + lerp16_4(xmm6, xmm4, xmm0); } else { 
@@ -1298,16 +1295,14 @@ return; vpshuflw(xmm0, xmm2, _MM_SHUFFLE(2, 2, 0, 0)); vpshufhw(xmm0, xmm0, _MM_SHUFFLE(2, 2, 0, 0)); - vpsrlw(xmm0, 16 - GS_BILINEAR_PRECISION); - if(GS_BILINEAR_PRECISION < 15) vpsllw(xmm0, 15 - GS_BILINEAR_PRECISION); + vpsrlw(xmm0, 12); vmovdqa(ptr[&m_local.temp.uf], xmm0); // GSVector4i vf = v.xxzzlh().srl16(1); vpshuflw(xmm0, xmm3, _MM_SHUFFLE(2, 2, 0, 0)); vpshufhw(xmm0, xmm0, _MM_SHUFFLE(2, 2, 0, 0)); - vpsrlw(xmm0, 16 - GS_BILINEAR_PRECISION); - if(GS_BILINEAR_PRECISION < 15) vpsllw(xmm0, 15 - GS_BILINEAR_PRECISION); + vpsrlw(xmm0, 12); vmovdqa(ptr[&m_local.temp.vf], xmm0); } @@ -1430,11 +1425,11 @@ return; // xmm5 = c11 // xmm7 = used - // rb00 = rb00.lerp16<0>(rb01, uf); - // ga00 = ga00.lerp16<0>(ga01, uf); + // rb00 = rb00.lerp16_4(rb01, uf); + // ga00 = ga00.lerp16_4(ga01, uf); - lerp16(xmm3, xmm2, xmm0, 0); - lerp16(xmm4, xmm6, xmm0, 0); + lerp16_4(xmm3, xmm2, xmm0); + lerp16_4(xmm4, xmm6, xmm0); // xmm0 = uf // xmm3 = rb00 @@ -1467,11 +1462,11 @@ return; // xmm6 = ga11 // xmm7 = used - // rb10 = rb10.lerp16<0>(rb11, uf); - // ga10 = ga10.lerp16<0>(ga11, uf); + // rb10 = rb10.lerp16_4(rb11, uf); + // ga10 = ga10.lerp16_4(ga11, uf); - lerp16(xmm5, xmm1, xmm0, 0); - lerp16(xmm6, xmm2, xmm0, 0); + lerp16_4(xmm5, xmm1, xmm0); + lerp16_4(xmm6, xmm2, xmm0); // xmm3 = rb00 // xmm4 = ga00 @@ -1480,13 +1475,13 @@ return; // xmm0, xmm1, xmm2 = free // xmm7 = used - // rb00 = rb00.lerp16<0>(rb10, vf); - // ga00 = ga00.lerp16<0>(ga10, vf); + // rb00 = rb00.lerp16_4(rb10, vf); + // ga00 = ga00.lerp16_4(ga10, vf); vmovdqa(xmm0, ptr[&m_local.temp.vf]); - lerp16(xmm5, xmm3, xmm0, 0); - lerp16(xmm6, xmm4, xmm0, 0); + lerp16_4(xmm5, xmm3, xmm0); + lerp16_4(xmm6, xmm4, xmm0); } else { @@ -1541,16 +1536,14 @@ return; vpshuflw(xmm0, xmm2, _MM_SHUFFLE(2, 2, 0, 0)); vpshufhw(xmm0, xmm0, _MM_SHUFFLE(2, 2, 0, 0)); - vpsrlw(xmm0, 16 - GS_BILINEAR_PRECISION); - if(GS_BILINEAR_PRECISION < 15) vpsllw(xmm0, 15 - GS_BILINEAR_PRECISION); + vpsrlw(xmm0, 12); 
vmovdqa(ptr[&m_local.temp.uf], xmm0); // GSVector4i vf = v.xxzzlh().srl16(1); vpshuflw(xmm0, xmm3, _MM_SHUFFLE(2, 2, 0, 0)); vpshufhw(xmm0, xmm0, _MM_SHUFFLE(2, 2, 0, 0)); - vpsrlw(xmm0, 16 - GS_BILINEAR_PRECISION); - if(GS_BILINEAR_PRECISION < 15) vpsllw(xmm0, 15 - GS_BILINEAR_PRECISION); + vpsrlw(xmm0, 12); vmovdqa(ptr[&m_local.temp.vf], xmm0); } @@ -1673,11 +1666,11 @@ return; // xmm5 = c11 // xmm7 = used - // rb00 = rb00.lerp16<0>(rb01, uf); - // ga00 = ga00.lerp16<0>(ga01, uf); + // rb00 = rb00.lerp16_4(rb01, uf); + // ga00 = ga00.lerp16_4(ga01, uf); - lerp16(xmm3, xmm2, xmm0, 0); - lerp16(xmm4, xmm6, xmm0, 0); + lerp16_4(xmm3, xmm2, xmm0); + lerp16_4(xmm4, xmm6, xmm0); // xmm0 = uf // xmm3 = rb00 @@ -1710,11 +1703,11 @@ return; // xmm6 = ga11 // xmm7 = used - // rb10 = rb10.lerp16<0>(rb11, uf); - // ga10 = ga10.lerp16<0>(ga11, uf); + // rb10 = rb10.lerp16_4(rb11, uf); + // ga10 = ga10.lerp16_4(ga11, uf); - lerp16(xmm5, xmm1, xmm0, 0); - lerp16(xmm6, xmm2, xmm0, 0); + lerp16_4(xmm5, xmm1, xmm0); + lerp16_4(xmm6, xmm2, xmm0); // xmm3 = rb00 // xmm4 = ga00 @@ -1723,13 +1716,13 @@ return; // xmm0, xmm1, xmm2 = free // xmm7 = used - // rb00 = rb00.lerp16<0>(rb10, vf); - // ga00 = ga00.lerp16<0>(ga10, vf); + // rb00 = rb00.lerp16_4(rb10, vf); + // ga00 = ga00.lerp16_4(ga10, vf); vmovdqa(xmm0, ptr[&m_local.temp.vf]); - lerp16(xmm5, xmm3, xmm0, 0); - lerp16(xmm6, xmm4, xmm0, 0); + lerp16_4(xmm5, xmm3, xmm0); + lerp16_4(xmm6, xmm4, xmm0); } else { diff --git a/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.cpp b/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.cpp index b37dc11638..51ba25f4b2 100644 --- a/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.cpp +++ b/plugins/GSdx/GSDrawScanlineCodeGenerator.x86.cpp @@ -389,8 +389,7 @@ void GSDrawScanlineCodeGenerator::Init() { pshuflw(xmm6, xmm3, _MM_SHUFFLE(2, 2, 0, 0)); pshufhw(xmm6, xmm6, _MM_SHUFFLE(2, 2, 0, 0)); - psrlw(xmm6, 16 - GS_BILINEAR_PRECISION); - if(GS_BILINEAR_PRECISION < 15) psllw(xmm6, 15 - GS_BILINEAR_PRECISION); 
+ psrlw(xmm6, 12); movdqa(ptr[&m_local.temp.vf], xmm6); } } @@ -748,8 +747,7 @@ void GSDrawScanlineCodeGenerator::SampleTexture() pshuflw(xmm0, xmm2, _MM_SHUFFLE(2, 2, 0, 0)); pshufhw(xmm0, xmm0, _MM_SHUFFLE(2, 2, 0, 0)); - psrlw(xmm0, 16 - GS_BILINEAR_PRECISION); - if(GS_BILINEAR_PRECISION < 15) psllw(xmm0, 15 - GS_BILINEAR_PRECISION); + psrlw(xmm0, 12); movdqa(ptr[&m_local.temp.uf], xmm0); if(m_sel.prim != GS_SPRITE_CLASS) @@ -758,8 +756,7 @@ void GSDrawScanlineCodeGenerator::SampleTexture() pshuflw(xmm0, xmm3, _MM_SHUFFLE(2, 2, 0, 0)); pshufhw(xmm0, xmm0, _MM_SHUFFLE(2, 2, 0, 0)); - psrlw(xmm0, 16 - GS_BILINEAR_PRECISION); - if(GS_BILINEAR_PRECISION < 15) psllw(xmm0, 15 - GS_BILINEAR_PRECISION); + psrlw(xmm0, 12); movdqa(ptr[&m_local.temp.vf], xmm0); } } @@ -891,11 +888,11 @@ void GSDrawScanlineCodeGenerator::SampleTexture() // xmm5 = c11 // xmm7 = used - // rb00 = rb00.lerp16<0>(rb01, uf); - // ga00 = ga00.lerp16<0>(ga01, uf); + // rb00 = rb00.lerp16_4(rb01, uf); + // ga00 = ga00.lerp16_4(ga01, uf); - lerp16(xmm3, xmm2, xmm0, 0); - lerp16(xmm4, xmm6, xmm0, 0); + lerp16_4(xmm3, xmm2, xmm0); + lerp16_4(xmm4, xmm6, xmm0); // xmm0 = uf // xmm3 = rb00 @@ -930,11 +927,11 @@ void GSDrawScanlineCodeGenerator::SampleTexture() // xmm6 = ga11 // xmm7 = used - // rb10 = rb10.lerp16<0>(rb11, uf); - // ga10 = ga10.lerp16<0>(ga11, uf); + // rb10 = rb10.lerp16_4(rb11, uf); + // ga10 = ga10.lerp16_4(ga11, uf); - lerp16(xmm5, xmm1, xmm0, 0); - lerp16(xmm6, xmm2, xmm0, 0); + lerp16_4(xmm5, xmm1, xmm0); + lerp16_4(xmm6, xmm2, xmm0); // xmm3 = rb00 // xmm4 = ga00 @@ -943,13 +940,13 @@ void GSDrawScanlineCodeGenerator::SampleTexture() // xmm0, xmm1, xmm2 = free // xmm7 = used - // rb00 = rb00.lerp16<0>(rb10, vf); - // ga00 = ga00.lerp16<0>(ga10, vf); + // rb00 = rb00.lerp16_4(rb10, vf); + // ga00 = ga00.lerp16_4(ga10, vf); movdqa(xmm0, ptr[&m_local.temp.vf]); - lerp16(xmm5, xmm3, xmm0, 0); - lerp16(xmm6, xmm4, xmm0, 0); + lerp16_4(xmm5, xmm3, xmm0); + lerp16_4(xmm6, xmm4, xmm0); } else { @@ 
-1353,16 +1350,14 @@ void GSDrawScanlineCodeGenerator::SampleTextureLOD() pshuflw(xmm0, xmm2, _MM_SHUFFLE(2, 2, 0, 0)); pshufhw(xmm0, xmm0, _MM_SHUFFLE(2, 2, 0, 0)); - psrlw(xmm0, 16 - GS_BILINEAR_PRECISION); - if(GS_BILINEAR_PRECISION < 15) psllw(xmm0, 15 - GS_BILINEAR_PRECISION); + psrlw(xmm0, 12); movdqa(ptr[&m_local.temp.uf], xmm0); // GSVector4i vf = v.xxzzlh().srl16(1); pshuflw(xmm0, xmm3, _MM_SHUFFLE(2, 2, 0, 0)); pshufhw(xmm0, xmm0, _MM_SHUFFLE(2, 2, 0, 0)); - psrlw(xmm0, 16 - GS_BILINEAR_PRECISION); - if(GS_BILINEAR_PRECISION < 15) psllw(xmm0, 15 - GS_BILINEAR_PRECISION); + psrlw(xmm0, 12); movdqa(ptr[&m_local.temp.vf], xmm0); } @@ -1493,11 +1488,11 @@ void GSDrawScanlineCodeGenerator::SampleTextureLOD() // xmm5 = c11 // xmm7 = used - // rb00 = rb00.lerp16<0>(rb01, uf); - // ga00 = ga00.lerp16<0>(ga01, uf); + // rb00 = rb00.lerp16_4(rb01, uf); + // ga00 = ga00.lerp16_4(ga01, uf); - lerp16(xmm3, xmm2, xmm0, 0); - lerp16(xmm4, xmm6, xmm0, 0); + lerp16_4(xmm3, xmm2, xmm0); + lerp16_4(xmm4, xmm6, xmm0); // xmm0 = uf // xmm3 = rb00 @@ -1532,11 +1527,11 @@ void GSDrawScanlineCodeGenerator::SampleTextureLOD() // xmm6 = ga11 // xmm7 = used - // rb10 = rb10.lerp16<0>(rb11, uf); - // ga10 = ga10.lerp16<0>(ga11, uf); + // rb10 = rb10.lerp16_4(rb11, uf); + // ga10 = ga10.lerp16_4(ga11, uf); - lerp16(xmm5, xmm1, xmm0, 0); - lerp16(xmm6, xmm2, xmm0, 0); + lerp16_4(xmm5, xmm1, xmm0); + lerp16_4(xmm6, xmm2, xmm0); // xmm3 = rb00 // xmm4 = ga00 @@ -1545,13 +1540,13 @@ void GSDrawScanlineCodeGenerator::SampleTextureLOD() // xmm0, xmm1, xmm2 = free // xmm7 = used - // rb00 = rb00.lerp16<0>(rb10, vf); - // ga00 = ga00.lerp16<0>(ga10, vf); + // rb00 = rb00.lerp16_4(rb10, vf); + // ga00 = ga00.lerp16_4(ga10, vf); movdqa(xmm0, ptr[&m_local.temp.vf]); - lerp16(xmm5, xmm3, xmm0, 0); - lerp16(xmm6, xmm4, xmm0, 0); + lerp16_4(xmm5, xmm3, xmm0); + lerp16_4(xmm6, xmm4, xmm0); } else { @@ -1608,16 +1603,14 @@ void GSDrawScanlineCodeGenerator::SampleTextureLOD() pshuflw(xmm0, xmm2, _MM_SHUFFLE(2, 
2, 0, 0)); pshufhw(xmm0, xmm0, _MM_SHUFFLE(2, 2, 0, 0)); - psrlw(xmm0, 16 - GS_BILINEAR_PRECISION); - if(GS_BILINEAR_PRECISION < 15) psllw(xmm0, 15 - GS_BILINEAR_PRECISION); + psrlw(xmm0, 12); movdqa(ptr[&m_local.temp.uf], xmm0); // GSVector4i vf = v.xxzzlh().srl16(1); pshuflw(xmm0, xmm3, _MM_SHUFFLE(2, 2, 0, 0)); pshufhw(xmm0, xmm0, _MM_SHUFFLE(2, 2, 0, 0)); - psrlw(xmm0, 16 - GS_BILINEAR_PRECISION); - if(GS_BILINEAR_PRECISION < 15) psllw(xmm0, 15 - GS_BILINEAR_PRECISION); + psrlw(xmm0, 12); movdqa(ptr[&m_local.temp.vf], xmm0); } @@ -1748,11 +1741,11 @@ void GSDrawScanlineCodeGenerator::SampleTextureLOD() // xmm5 = c11 // xmm7 = used - // rb00 = rb00.lerp16<0>(rb01, uf); - // ga00 = ga00.lerp16<0>(ga01, uf); + // rb00 = rb00.lerp16_4(rb01, uf); + // ga00 = ga00.lerp16_4(ga01, uf); - lerp16(xmm3, xmm2, xmm0, 0); - lerp16(xmm4, xmm6, xmm0, 0); + lerp16_4(xmm3, xmm2, xmm0); + lerp16_4(xmm4, xmm6, xmm0); // xmm0 = uf // xmm3 = rb00 @@ -1787,11 +1780,11 @@ void GSDrawScanlineCodeGenerator::SampleTextureLOD() // xmm6 = ga11 // xmm7 = used - // rb10 = rb10.lerp16<0>(rb11, uf); - // ga10 = ga10.lerp16<0>(ga11, uf); + // rb10 = rb10.lerp16_4(rb11, uf); + // ga10 = ga10.lerp16_4(ga11, uf); - lerp16(xmm5, xmm1, xmm0, 0); - lerp16(xmm6, xmm2, xmm0, 0); + lerp16_4(xmm5, xmm1, xmm0); + lerp16_4(xmm6, xmm2, xmm0); // xmm3 = rb00 // xmm4 = ga00 @@ -1800,13 +1793,13 @@ void GSDrawScanlineCodeGenerator::SampleTextureLOD() // xmm0, xmm1, xmm2 = free // xmm7 = used - // rb00 = rb00.lerp16<0>(rb10, vf); - // ga00 = ga00.lerp16<0>(ga10, vf); + // rb00 = rb00.lerp16_4(rb10, vf); + // ga00 = ga00.lerp16_4(ga10, vf); movdqa(xmm0, ptr[&m_local.temp.vf]); - lerp16(xmm5, xmm3, xmm0, 0); - lerp16(xmm6, xmm4, xmm0, 0); + lerp16_4(xmm5, xmm3, xmm0); + lerp16_4(xmm6, xmm4, xmm0); } else { diff --git a/plugins/GSdx/GSLocalMemory.cpp b/plugins/GSdx/GSLocalMemory.cpp index dfdd11274a..cd6e28b49a 100644 --- a/plugins/GSdx/GSLocalMemory.cpp +++ b/plugins/GSdx/GSLocalMemory.cpp @@ -447,6 +447,7 @@ 
GSLocalMemory::~GSLocalMemory() vmfree(m_vm8, m_vmsize * 2); for_each(m_omap.begin(), m_omap.end(), aligned_free_second()); + for_each(m_pomap.begin(), m_pomap.end(), aligned_free_second()); for_each(m_po4map.begin(), m_po4map.end(), aligned_free_second()); for(hash_map*>::iterator i = m_p2tmap.begin(); i != m_p2tmap.end(); i++) diff --git a/plugins/GSdx/GSRendererSW.cpp b/plugins/GSdx/GSRendererSW.cpp index 44de110260..3869b8faed 100644 --- a/plugins/GSdx/GSRendererSW.cpp +++ b/plugins/GSdx/GSRendererSW.cpp @@ -680,6 +680,8 @@ bool GSRendererSW::CheckTargetPages(const uint32* fb_pages, const uint32* zb_pag bool fb = fb_pages != NULL; bool zb = zb_pages != NULL; + bool res = false; + if(m_fzb != m_context->offset.fzb4) { // targets changed, check everything @@ -724,7 +726,7 @@ bool GSRendererSW::CheckTargetPages(const uint32* fb_pages, const uint32* zb_pag { if(LOG) {fprintf(s_fp, "syncpoint 0\n"); fflush(s_fp);} - return true; + res = true; } //if(LOG) {fprintf(s_fp, "no syncpoint *\n"); fflush(s_fp);} @@ -785,7 +787,7 @@ bool GSRendererSW::CheckTargetPages(const uint32* fb_pages, const uint32* zb_pag { if(LOG) {fprintf(s_fp, "syncpoint 1\n"); fflush(s_fp);} - return true; + res = true; } } } @@ -795,7 +797,7 @@ bool GSRendererSW::CheckTargetPages(const uint32* fb_pages, const uint32* zb_pag // chross-check frame and z-buffer pages, they cannot overlap with eachother and with previous batches in queue, // have to be careful when the two buffers are mutually enabled/disabled and alternating (Bully FBP/ZBP = 0x2300) - if(fb) + if(fb && !res) { for(const uint32* p = fb_pages; *p != GSOffset::EOP; p++) { @@ -803,12 +805,14 @@ bool GSRendererSW::CheckTargetPages(const uint32* fb_pages, const uint32* zb_pag { if(LOG) {fprintf(s_fp, "syncpoint 2\n"); fflush(s_fp);} - return true; + res = true; + + break; } } } - if(zb) + if(zb && !res) { for(const uint32* p = zb_pages; *p != GSOffset::EOP; p++) { @@ -816,14 +820,19 @@ bool GSRendererSW::CheckTargetPages(const uint32* 
fb_pages, const uint32* zb_pag { if(LOG) {fprintf(s_fp, "syncpoint 3\n"); fflush(s_fp);} - return true; + res = true; + + break; } } } } } - return false; + if(!fb && fb_pages != NULL) delete [] fb_pages; + if(!zb && zb_pages != NULL) delete [] zb_pages; + + return res; } bool GSRendererSW::CheckSourcePages(SharedData* sd) @@ -1334,7 +1343,7 @@ bool GSRendererSW::GetScanlineGlobalData(SharedData* data) gd.zm |= GSVector4i::xffff0000(); } - if(gd.sel.prim == GS_SPRITE_CLASS && !gd.sel.ftest && !gd.sel.ztest && data->bbox.eq(data->bbox.rintersect(data->scissor))) + if(gd.sel.prim == GS_SPRITE_CLASS && !gd.sel.ftest && !gd.sel.ztest && data->bbox.eq(data->bbox.rintersect(data->scissor))) // TODO: check scissor horizontally only { gd.sel.notest = 1; diff --git a/plugins/GSdx/GSScanlineEnvironment.h b/plugins/GSdx/GSScanlineEnvironment.h index a8d9637c40..aee43b8409 100644 --- a/plugins/GSdx/GSScanlineEnvironment.h +++ b/plugins/GSdx/GSScanlineEnvironment.h @@ -24,8 +24,6 @@ #include "GSLocalMemory.h" #include "GSVector.h" -#define GS_BILINEAR_PRECISION 4 // max precision 15, but several games like okami, rogue galaxy, dq8 break above 4 - union GSScanlineSelector { struct @@ -70,6 +68,7 @@ union GSScanlineSelector uint32 lcm:1; // 52 uint32 mmin:2; // 53 uint32 notest:1; // 54 (no ztest, no atest, no date, no scissor test, and horizontally aligned to 4 pixels) + // TODO: 1D texture flag? 
could save 2 texture reads and 4 lerps with bilinear, and also the texture coordinate clamp/wrap code in one direction }; struct diff --git a/plugins/GSdx/GSTextureCacheSW.cpp b/plugins/GSdx/GSTextureCacheSW.cpp index 37e2720067..5f9fd5e1ba 100644 --- a/plugins/GSdx/GSTextureCacheSW.cpp +++ b/plugins/GSdx/GSTextureCacheSW.cpp @@ -131,25 +131,6 @@ void GSTextureCacheSW::RemoveAll() } } -void GSTextureCacheSW::RemoveAt(Texture* t) -{ - m_textures.erase(t); - - for(uint32 start = t->m_TEX0.TBP0 >> 5, end = countof(m_map) - 1; start <= end; start++) - { - list& m = m_map[start]; - - for(list::iterator i = m.begin(); i != m.end(); ) - { - list::iterator j = i++; - - if(*j == t) {m.erase(j); break;} - } - } - - delete t; -} - void GSTextureCacheSW::IncAge() { for(hash_set::iterator i = m_textures.begin(); i != m_textures.end(); ) @@ -158,9 +139,23 @@ void GSTextureCacheSW::IncAge() Texture* t = *j; - if(++t->m_age > 30) + if(++t->m_age > 10) { - RemoveAt(t); + m_textures.erase(j); + + for(const uint32* p = t->m_pages.n; *p != GSOffset::EOP; p++) + { + list& m = m_map[*p]; + + for(list::iterator i = m.begin(); i != m.end(); ) + { + list::iterator j = i++; + + if(*j == t) {m.erase(j); break;} + } + } + + delete t; } } } diff --git a/plugins/GSdx/GSTextureCacheSW.h b/plugins/GSdx/GSTextureCacheSW.h index 8c80456c16..f15998c64a 100644 --- a/plugins/GSdx/GSTextureCacheSW.h +++ b/plugins/GSdx/GSTextureCacheSW.h @@ -68,6 +68,5 @@ public: void InvalidatePages(const uint32* pages, uint32 psm); void RemoveAll(); - void RemoveAt(Texture* t); void IncAge(); }; diff --git a/plugins/GSdx/GSVector.h b/plugins/GSdx/GSVector.h index de20807352..19a21ded08 100644 --- a/plugins/GSdx/GSVector.h +++ b/plugins/GSdx/GSVector.h @@ -1004,6 +1004,13 @@ public: return d.add16(a.sub16(b).modulate16(c)); } + __forceinline GSVector4i lerp16_4(const GSVector4i& a, const GSVector4i& f) const + { + // (a - this) * f >> 4 + this (a, this: 8-bit, f: 4-bit) + + return 
add16(a.sub16(*this).mul16l(f).sra16(4)); + } + template __forceinline GSVector4i modulate16(const GSVector4i& f) const { // a * f << shift