GSdx: mipmapping fix (ford mustang racing, and probably other games which use small, non-square textures)

git-svn-id: http://pcsx2.googlecode.com/svn/trunk@4529 96395faa-99c1-11dd-bbfe-3dabce05a288
This commit is contained in:
gabest11 2011-04-04 11:05:54 +00:00
parent 15f7b2b6d8
commit 732b038571
17 changed files with 194 additions and 194 deletions

View File

@ -249,10 +249,15 @@ public:
GSVector4 ys(0.257f, 0.504f, 0.098f, 0.0f); GSVector4 ys(0.257f, 0.504f, 0.098f, 0.0f);
GSVector4 us(-0.148f / 2, -0.291f / 2, 0.439f / 2, 0.0f); GSVector4 us(-0.148f / 2, -0.291f / 2, 0.439f / 2, 0.0f);
GSVector4 vs(0.439f / 2, -0.368f / 2, -0.071f / 2, 0.0f); GSVector4 vs(0.439f / 2, -0.368f / 2, -0.071f / 2, 0.0f);
const GSVector4 offset(16, 128, 16, 128);
if (!rgba) if(!rgba)
ys = ys.zyxw(), us = us.zyxw(), vs = vs.zyxw(); {
ys = ys.zyxw();
us = us.zyxw();
vs = vs.zyxw();
}
const GSVector4 offset(16, 128, 16, 128);
for(int j = 0; j < h; j++, dst += dstpitch, src += srcpitch) for(int j = 0; j < h; j++, dst += dstpitch, src += srcpitch)
{ {
@ -261,8 +266,8 @@ public:
for(int i = 0; i < w; i += 2) for(int i = 0; i < w; i += 2)
{ {
GSVector4 c0 = GSVector4(s[i + 0]); GSVector4 c0 = GSVector4::rgba32(s[i + 0]);
GSVector4 c1 = GSVector4(s[i + 1]); GSVector4 c1 = GSVector4::rgba32(s[i + 1]);
GSVector4 c2 = c0 + c1; GSVector4 c2 = c0 + c1;
GSVector4 lo = (c0 * ys).hadd(c2 * us); GSVector4 lo = (c0 * ys).hadd(c2 * us);

View File

@ -109,7 +109,7 @@ bool GSClut::WriteTest(const GIFRegTEX0& TEX0, const GIFRegTEXCLUT& TEXCLUT)
case 4: if(m_CBP[0] == TEX0.CBP) return false; m_CBP[0] = TEX0.CBP; break; case 4: if(m_CBP[0] == TEX0.CBP) return false; m_CBP[0] = TEX0.CBP; break;
case 5: if(m_CBP[1] == TEX0.CBP) return false; m_CBP[1] = TEX0.CBP; break; case 5: if(m_CBP[1] == TEX0.CBP) return false; m_CBP[1] = TEX0.CBP; break;
case 6: ASSERT(0); return false; // ffx2 menu case 6: ASSERT(0); return false; // ffx2 menu
case 7: ASSERT(0); return false; case 7: ASSERT(0); return false; // ford mustang racing
default: __assume(0); default: __assume(0);
} }

View File

@ -350,7 +350,7 @@ void GSDevice11::ClearRenderTarget(GSTexture* t, const GSVector4& c)
void GSDevice11::ClearRenderTarget(GSTexture* t, uint32 c) void GSDevice11::ClearRenderTarget(GSTexture* t, uint32 c)
{ {
GSVector4 color = GSVector4(c) * (1.0f / 255); GSVector4 color = GSVector4::rgba32(c) * (1.0f / 255);
m_ctx->ClearRenderTargetView(*(GSTexture11*)t, color.v); m_ctx->ClearRenderTargetView(*(GSTexture11*)t, color.v);
} }

View File

@ -25,7 +25,7 @@
#if _M_SSE >= 0x500 && (defined(_M_AMD64) || defined(_WIN64)) #if _M_SSE >= 0x500 && (defined(_M_AMD64) || defined(_WIN64))
#error TODO: this is still bogus somewhere #error TODO
void GSDrawScanlineCodeGenerator::Generate() void GSDrawScanlineCodeGenerator::Generate()
{ {
@ -38,17 +38,13 @@ void GSDrawScanlineCodeGenerator::Generate()
push(r12); push(r12);
push(r13); push(r13);
enter(10 * 16, true); sub(rsp, 8 + 10 * 16);
for(int i = 6; i < 16; i++) for(int i = 6; i < 16; i++)
{ {
vmovdqa(ptr[rsp + (i - 6) * 16], Xmm(i)); vmovdqa(ptr[rsp + (i - 6) * 16], Xmm(i));
} }
movsxd(rcx, ecx); // right
movsxd(rdx, edx); // left
movsxd(r8, r8d); // top
mov(r10, (size_t)&m_test[0]); mov(r10, (size_t)&m_test[0]);
mov(r11, (size_t)&m_local); mov(r11, (size_t)&m_local);
mov(r12, (size_t)m_local.gd); mov(r12, (size_t)m_local.gd);
@ -84,7 +80,14 @@ L("loop");
// ebp = za // ebp = za
SampleTexture(); if(m_sel.mmin)
{
SampleTextureLOD();
}
else
{
SampleTexture();
}
// ebp = za // ebp = za
// xmm2 = rb // xmm2 = rb
@ -201,7 +204,7 @@ L("exit");
vmovdqa(Xmm(i), ptr[rsp + (i - 6) * 16]); vmovdqa(Xmm(i), ptr[rsp + (i - 6) * 16]);
} }
leave(); add(rsp, 8 + 10 * 16);
pop(r13); pop(r13);
pop(r12); pop(r12);
@ -237,10 +240,9 @@ void GSDrawScanlineCodeGenerator::Init()
mov(rax, rcx); mov(rax, rcx);
sar(rax, 63); sar(rax, 63);
and(rax, rcx); and(rax, rcx);
add(rax, 7);
shl(rax, 4); shl(rax, 4);
vpor(xmm15, ptr[rax + r10]); vpor(xmm15, ptr[rax + r10 + 7 * 16]);
// GSVector2i* fza_base = &m_local.gd->fzbr[top]; // GSVector2i* fza_base = &m_local.gd->fzbr[top];
@ -256,8 +258,7 @@ void GSDrawScanlineCodeGenerator::Init()
{ {
// edx = &m_local.d[skip] // edx = &m_local.d[skip]
shl(rdx, 3); lea(rdx, ptr[rdx * 8 + r11 + offsetof(GSScanlineLocalData, d)]);
lea(rdx, ptr[rdx + r11 + offsetof(GSScanlineLocalData, d)]);
} }
if(!m_sel.sprite) if(!m_sel.sprite)
@ -325,7 +326,7 @@ void GSDrawScanlineCodeGenerator::Init()
vpaddd(xmm10, ptr[rdx + offsetof(GSScanlineLocalData::skip, s)]); vpaddd(xmm10, ptr[rdx + offsetof(GSScanlineLocalData::skip, s)]);
if(!m_sel.sprite) if(!m_sel.sprite || m_sel.mmin)
{ {
vpaddd(xmm11, ptr[rdx + offsetof(GSScanlineLocalData::skip, t)]); vpaddd(xmm11, ptr[rdx + offsetof(GSScanlineLocalData::skip, t)]);
} }
@ -338,12 +339,6 @@ void GSDrawScanlineCodeGenerator::Init()
vpsrlw(xmm6, 1); vpsrlw(xmm6, 1);
} }
} }
if(m_sel.mipmap && !m_sel.lcm)
{
vshufps(xmm12, xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
vaddps(xmm12, ptr[rdx + offsetof(GSScanlineLocalData::skip, q)]);
}
} }
else else
{ {
@ -441,17 +436,11 @@ void GSDrawScanlineCodeGenerator::Step()
vpshufd(xmm1, xmm0, _MM_SHUFFLE(0, 0, 0, 0)); vpshufd(xmm1, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
vpaddd(xmm10, xmm1); vpaddd(xmm10, xmm1);
if(!m_sel.sprite) if(!m_sel.sprite || m_sel.mmin)
{ {
vpshufd(xmm1, xmm0, _MM_SHUFFLE(1, 1, 1, 1)); vpshufd(xmm1, xmm0, _MM_SHUFFLE(1, 1, 1, 1));
vpaddd(xmm11, xmm1); vpaddd(xmm11, xmm1);
} }
if(m_sel.mipmap && !m_sel.lcm)
{
vshufps(xmm3, xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
vaddps(xmm12, xmm3);
}
} }
else else
{ {
@ -510,10 +499,9 @@ void GSDrawScanlineCodeGenerator::Step()
mov(rdx, rcx); mov(rdx, rcx);
sar(rdx, 63); sar(rdx, 63);
and(rdx, rcx); and(rdx, rcx);
add(rdx, 7);
shl(rdx, 4); shl(rdx, 4);
vmovdqa(xmm15, ptr[rdx + r10]); vmovdqa(xmm15, ptr[rdx + r10 + 7 * 16]);
} }
void GSDrawScanlineCodeGenerator::TestZ(const Xmm& temp1, const Xmm& temp2) void GSDrawScanlineCodeGenerator::TestZ(const Xmm& temp1, const Xmm& temp2)
@ -589,11 +577,9 @@ void GSDrawScanlineCodeGenerator::TestZ(const Xmm& temp1, const Xmm& temp2)
vpslld(xmm2, 31); vpslld(xmm2, 31);
// GSVector4i zso = zs - o; // GSVector4i zso = zs - o;
vpsubd(xmm0, xmm2);
// GSVector4i zdo = zd - o; // GSVector4i zdo = zd - o;
vpsubd(xmm0, xmm2);
vpsubd(xmm1, xmm2); vpsubd(xmm1, xmm2);
} }
@ -629,11 +615,6 @@ void GSDrawScanlineCodeGenerator::SampleTexture()
// ebx = tex // ebx = tex
if(m_sel.mipmap && !m_sel.lcm)
{
}
if(!m_sel.fst) if(!m_sel.fst)
{ {
vrcpps(xmm0, xmm12); vrcpps(xmm0, xmm12);
@ -766,10 +747,7 @@ void GSDrawScanlineCodeGenerator::SampleTexture()
// c10 = addr10.gather32_32((const uint32/uint8*)tex[, clut]); // c10 = addr10.gather32_32((const uint32/uint8*)tex[, clut]);
// c11 = addr11.gather32_32((const uint32/uint8*)tex[, clut]); // c11 = addr11.gather32_32((const uint32/uint8*)tex[, clut]);
ReadTexel(xmm0, xmm0, xmm4, xmm5); ReadTexel(4, 0);
ReadTexel(xmm1, xmm1, xmm4, xmm5);
ReadTexel(xmm2, xmm2, xmm4, xmm5);
ReadTexel(xmm3, xmm3, xmm4, xmm5);
// xmm0 = c00 // xmm0 = c00
// xmm1 = c01 // xmm1 = c01
@ -863,7 +841,7 @@ void GSDrawScanlineCodeGenerator::SampleTexture()
// c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]); // c00 = addr00.gather32_32((const uint32/uint8*)tex[, clut]);
ReadTexel(xmm2, xmm3, xmm0, xmm1); ReadTexel(1, 0);
// GSVector4i mask = GSVector4i::x00ff(); // GSVector4i mask = GSVector4i::x00ff();
@ -1032,6 +1010,18 @@ void GSDrawScanlineCodeGenerator::Wrap(const Xmm& uv0, const Xmm& uv1)
} }
} }
// Placeholder for the LOD (mipmapped) variant of SampleTexture(): the body is
// empty, so calling it currently adds nothing to the generated code.
// NOTE(review): a call site in this file invokes it when m_sel.mmin is set,
// so that path samples no texture here until this is filled in.
void GSDrawScanlineCodeGenerator::SampleTextureLOD()
{
}
// Placeholder for the single-register LOD counterpart of Wrap(): the body is
// empty, so `uv` is left untouched and nothing is generated.
// NOTE(review): presumably meant to apply texture coordinate wrapping for the
// selected mip level — confirm against the non-LOD Wrap() when implementing.
void GSDrawScanlineCodeGenerator::WrapLOD(const Xmm& uv)
{
}
// Placeholder for the two-register LOD counterpart of Wrap(uv0, uv1): the body
// is empty, so both registers are left untouched and nothing is generated.
// NOTE(review): presumably meant to wrap both coordinate sets for the selected
// mip level — confirm against the non-LOD Wrap(uv0, uv1) when implementing.
void GSDrawScanlineCodeGenerator::WrapLOD(const Xmm& uv0, const Xmm& uv1)
{
}
void GSDrawScanlineCodeGenerator::AlphaTFX() void GSDrawScanlineCodeGenerator::AlphaTFX()
{ {
if(!m_sel.fb) if(!m_sel.fb)
@ -1046,6 +1036,7 @@ void GSDrawScanlineCodeGenerator::AlphaTFX()
// gat = gat.modulate16<1>(ga).clamp8(); // gat = gat.modulate16<1>(ga).clamp8();
modulate16(xmm3, xmm14, 1); modulate16(xmm3, xmm14, 1);
clamp16(xmm3, xmm0); clamp16(xmm3, xmm0);
// if(!tcc) gat = gat.mix16(ga.srl16(7)); // if(!tcc) gat = gat.mix16(ga.srl16(7));
@ -1053,6 +1044,7 @@ void GSDrawScanlineCodeGenerator::AlphaTFX()
if(!m_sel.tcc) if(!m_sel.tcc)
{ {
vpsrlw(xmm1, xmm14, 7); vpsrlw(xmm1, xmm14, 7);
mix16(xmm3, xmm1, xmm0); mix16(xmm3, xmm1, xmm0);
} }
@ -1065,6 +1057,7 @@ void GSDrawScanlineCodeGenerator::AlphaTFX()
if(!m_sel.tcc) if(!m_sel.tcc)
{ {
vpsrlw(xmm1, xmm14, 7); vpsrlw(xmm1, xmm14, 7);
mix16(xmm3, xmm1, xmm0); mix16(xmm3, xmm1, xmm0);
} }
@ -1075,7 +1068,12 @@ void GSDrawScanlineCodeGenerator::AlphaTFX()
// gat = gat.mix16(!tcc ? ga.srl16(7) : gat.addus8(ga.srl16(7))); // gat = gat.mix16(!tcc ? ga.srl16(7) : gat.addus8(ga.srl16(7)));
vpsrlw(xmm1, xmm14, 7); vpsrlw(xmm1, xmm14, 7);
if(m_sel.tcc) vpaddusb(xmm1, xmm3);
if(m_sel.tcc)
{
vpaddusb(xmm1, xmm3);
}
mix16(xmm3, xmm1, xmm0); mix16(xmm3, xmm1, xmm0);
break; break;
@ -1087,6 +1085,7 @@ void GSDrawScanlineCodeGenerator::AlphaTFX()
if(!m_sel.tcc) if(!m_sel.tcc)
{ {
vpsrlw(xmm1, xmm14, 7); vpsrlw(xmm1, xmm14, 7);
mix16(xmm3, xmm1, xmm0); mix16(xmm3, xmm1, xmm0);
} }
@ -1103,6 +1102,8 @@ void GSDrawScanlineCodeGenerator::AlphaTFX()
break; break;
} }
// TODO: aa1
} }
void GSDrawScanlineCodeGenerator::ReadMask() void GSDrawScanlineCodeGenerator::ReadMask()
@ -1218,6 +1219,7 @@ void GSDrawScanlineCodeGenerator::ColorTFX()
// rbt = rbt.modulate16<1>(rb).clamp8(); // rbt = rbt.modulate16<1>(rb).clamp8();
modulate16(xmm2, xmm13, 1); modulate16(xmm2, xmm13, 1);
clamp16(xmm2, xmm0); clamp16(xmm2, xmm0);
break; break;
@ -1229,22 +1231,28 @@ void GSDrawScanlineCodeGenerator::ColorTFX()
case TFX_HIGHLIGHT: case TFX_HIGHLIGHT:
case TFX_HIGHLIGHT2: case TFX_HIGHLIGHT2:
// gat = gat.modulate16<1>(ga).add16(af).clamp8().mix16(gat);
vmovdqa(xmm1, xmm3);
modulate16(xmm3, xmm14, 1);
vpshuflw(xmm6, xmm14, _MM_SHUFFLE(3, 3, 1, 1)); vpshuflw(xmm6, xmm14, _MM_SHUFFLE(3, 3, 1, 1));
vpshufhw(xmm6, xmm6, _MM_SHUFFLE(3, 3, 1, 1)); vpshufhw(xmm6, xmm6, _MM_SHUFFLE(3, 3, 1, 1));
vpsrlw(xmm6, 7); vpsrlw(xmm6, 7);
// gat = gat.modulate16<1>(ga).add16(af).clamp8().mix16(gat);
vmovdqa(xmm1, xmm3);
modulate16(xmm3, xmm14, 1);
vpaddw(xmm3, xmm6); vpaddw(xmm3, xmm6);
clamp16(xmm3, xmm0); clamp16(xmm3, xmm0);
mix16(xmm3, xmm1, xmm0); mix16(xmm3, xmm1, xmm0);
// rbt = rbt.modulate16<1>(rb).add16(af).clamp8(); // rbt = rbt.modulate16<1>(rb).add16(af).clamp8();
modulate16(xmm2, xmm13, 1); modulate16(xmm2, xmm13, 1);
vpaddw(xmm2, xmm6); vpaddw(xmm2, xmm6);
clamp16(xmm2, xmm0); clamp16(xmm2, xmm0);
break; break;
@ -1797,25 +1805,22 @@ void GSDrawScanlineCodeGenerator::WritePixel(const Xmm& src, const Reg64& addr,
} }
} }
void GSDrawScanlineCodeGenerator::ReadTexel(const Xmm& dst, const Xmm& addr, const Xmm& temp1, const Xmm& temp2) void GSDrawScanlineCodeGenerator::ReadTexel(int pixels, int mip_offset)
{ {
ReadTexel(dst, addr, 0); // TODO
ReadTexel(dst, addr, 1);
ReadTexel(dst, addr, 2);
ReadTexel(dst, addr, 3);
} }
void GSDrawScanlineCodeGenerator::ReadTexel(const Xmm& dst, const Xmm& addr, uint8 i) void GSDrawScanlineCodeGenerator::ReadTexel(const Xmm& dst, const Xmm& addr, uint8 i)
{ {
const Address& src = m_sel.tlu ? ptr[r12 + rax * 4 + offsetof(GSScanlineGlobalData, clut)] : ptr[rbx + rax * 4]; const Address& src = m_sel.tlu ? ptr[r12 + rax * 4 + offsetof(GSScanlineGlobalData, clut)] : ptr[rbx + rax * 4];
vpextrd(eax, addr, i); if(i == 0) vmovd(eax, addr);
else vpextrd(eax, addr, i);
movsxd(rax, eax);
if(m_sel.tlu) movzx(rax, byte[rbx + rax]); if(m_sel.tlu) movzx(rax, byte[rbx + rax]);
vpinsrd(dst, src, i); if(i == 0) vmovd(dst, src);
else vpinsrd(dst, src, i);
} }
#endif #endif

View File

@ -19,9 +19,6 @@
* *
*/ */
// TODO: x64 (use the extra regs to avoid spills of zs, zd, uf, vf, rb, ga and keep a few constants in the last two like aref or afix)
// TODO: for edges doing 4 pixels is wasteful (needed memory access * 4)
#include "stdafx.h" #include "stdafx.h"
#include "GSDrawScanlineCodeGenerator.h" #include "GSDrawScanlineCodeGenerator.h"
#include "GSVertexSW.h" #include "GSVertexSW.h"

View File

@ -21,37 +21,3 @@
#include "stdafx.h" #include "stdafx.h"
#include "GSFunctionMap.h" #include "GSFunctionMap.h"
// Emits a function prologue into the code being generated.
//
// The frame register is pushed and loaded with the current stack pointer, then
// `size` bytes are reserved on the stack (skipped when size is zero) and, when
// `align` is true, the low four bits of the stack pointer are masked off so it
// sits on a 16-byte boundary.
//
// The frame register is r15 when _M_AMD64 is defined and ebp otherwise; the
// matching epilogue is leave().
void GSCodeGenerator::enter(uint32 size, bool align)
{
#if !defined(_M_AMD64)

	// 32-bit build: ebp holds the caller's stack pointer.
	push(ebp);
	mov(ebp, esp);

	if(size != 0)
	{
		sub(esp, size);
	}

	if(align)
	{
		and(esp, 0xfffffff0);
	}

#else

	// 64-bit build: r15 holds the caller's stack pointer.
	push(r15);
	mov(r15, rsp);

	if(size != 0)
	{
		sub(rsp, size);
	}

	if(align)
	{
		and(rsp, 0xfffffffffffffff0);
	}

#endif
}
// Emits the epilogue matching enter(): the stack pointer is reloaded from the
// frame register (discarding any reservation and alignment adjustment made by
// enter()), and the frame register's saved value is then popped back.
//
// The frame register is r15 when _M_AMD64 is defined and ebp otherwise.
void GSCodeGenerator::leave()
{
#if !defined(_M_AMD64)

	// 32-bit build
	mov(esp, ebp);
	pop(ebp);

#else

	// 64-bit build
	mov(rsp, r15);
	pop(r15);

#endif
}

View File

@ -161,9 +161,6 @@ class GSCodeGenerator : public Xbyak::CodeGenerator
protected: protected:
Xbyak::util::Cpu m_cpu; Xbyak::util::Cpu m_cpu;
void enter(uint32 size, bool align);
void leave();
public: public:
GSCodeGenerator(void* code, size_t maxsize) GSCodeGenerator(void* code, size_t maxsize)
: Xbyak::CodeGenerator(maxsize, code) : Xbyak::CodeGenerator(maxsize, code)

View File

@ -263,7 +263,7 @@ public:
{ {
ps_sel.fog = 1; ps_sel.fog = 1;
ps_cb.FogColor_AREF = GSVector4(env.FOGCOL.u32[0]) / 255; ps_cb.FogColor_AREF = GSVector4::rgba32(env.FOGCOL.u32[0]) / 255;
} }
if(context->TEST.ATE) if(context->TEST.ATE)

View File

@ -384,7 +384,7 @@ bool GSRendererSW::GetScanlineGlobalData(GSScanlineGlobalData& gd)
if((int)m_vt.m_lod.x >= (int)context->TEX1.MXL) if((int)m_vt.m_lod.x >= (int)context->TEX1.MXL)
{ {
k = (int)m_vt.m_lod.x << 16; // set lod to max k = (int)m_vt.m_lod.x << 16; // set lod to max level
gd.sel.lcm = 1; // lod is constant gd.sel.lcm = 1; // lod is constant
gd.sel.mmin = 1; // tri-linear is meaningless gd.sel.mmin = 1; // tri-linear is meaningless
@ -432,7 +432,11 @@ bool GSRendererSW::GetScanlineGlobalData(GSScanlineGlobalData& gd)
static int s_counter = 0; static int s_counter = 0;
//t->Save(format("c:/temp1/%08d_%05x_0.bmp", s_counter, context->TEX0.TBP0)); if(0)
//if(context->TEX0.TH > context->TEX0.TW)
//if(s_n >= s_saven && s_n < s_saven + 3)
//if(context->TEX0.TBP0 >= 0x2b80 && context->TEX0.TBW == 2 && context->TEX0.PSM == PSM_PSMT4)
t->Save(format("c:/temp1/%08d_%05x_0.bmp", s_counter, context->TEX0.TBP0));
for(int i = 1, j = std::min<int>((int)context->TEX1.MXL, 6); i <= j; i++) for(int i = 1, j = std::min<int>((int)context->TEX1.MXL, 6); i <= j; i++)
{ {
@ -487,7 +491,28 @@ bool GSRendererSW::GetScanlineGlobalData(GSScanlineGlobalData& gd)
gd.tex[i] = t->m_buff; gd.tex[i] = t->m_buff;
// t->Save(format("c:/temp1/%08d_%05x_%d.bmp", s_counter, context->TEX0.TBP0, i)); if(0)
//if(context->TEX0.TH > context->TEX0.TW)
//if(s_n >= s_saven && s_n < s_saven + 3)
//if(context->TEX0.TBP0 >= 0x2b80 && context->TEX0.TBW == 2 && context->TEX0.PSM == PSM_PSMT4)
{
t->Save(format("c:/temp1/%08d_%05x_%d.bmp", s_counter, context->TEX0.TBP0, i));
/*
GIFRegTEX0 TEX0 = MIP_TEX0;
TEX0.TBP0 = context->TEX0.TBP0;
do
{
TEX0.TBP0++;
const GSTextureCacheSW::Texture* t = m_tc->Lookup(TEX0, env.TEXA, r, gd.sel.tw + 3);
if(t == NULL) {ASSERT(0); return false;}
t->Save(format("c:/temp1/%08d_%05x_%d.bmp", s_counter, TEX0.TBP0, i));
}
while(TEX0.TBP0 < 0x3fff);
*/
int i = 0;
}
} }
s_counter++; s_counter++;
@ -701,38 +726,31 @@ void GSRendererSW::VertexKick(bool skip)
{ {
const GSDrawingContext* context = m_context; const GSDrawingContext* context = m_context;
GSVector4i xy = GSVector4i::load((int)m_v.XYZ.u32[0]); GSVertexSW& dst = m_vl.AddTail();
xy = xy.insert16<3>(m_v.FOG.F); GSVector4i xy = GSVector4i::load((int)m_v.XYZ.u32[0]).upl16() - context->XYOFFSET;
xy = xy.upl16(); GSVector4i zf = GSVector4i((int)std::min<uint32>(m_v.XYZ.Z, 0xffffff00), m_v.FOG.F);
xy -= context->XYOFFSET;
GSVertexSW v; dst.p = GSVector4(xy).xyxy(GSVector4(zf) + (GSVector4::m_x4f800000 & GSVector4::cast(zf.sra32(31)))) * g_pos_scale;
v.p = GSVector4(xy) * g_pos_scale;
v.c = GSVector4(GSVector4i::load((int)m_v.RGBAQ.u32[0]).u8to32() << 7);
if(tme) if(tme)
{ {
GSVector4 t;
if(fst) if(fst)
{ {
v.t = GSVector4(((GSVector4i)m_v.UV).upl16() << (16 - 4)); t = GSVector4(((GSVector4i)m_v.UV).upl16() << (16 - 4));
} }
else else
{ {
v.t = GSVector4(m_v.ST.S, m_v.ST.T); t = GSVector4(m_v.ST.S, m_v.ST.T) * GSVector4(0x10000 << context->TEX0.TW, 0x10000 << context->TEX0.TH);
v.t *= GSVector4(0x10000 << context->TEX0.TW, 0x10000 << context->TEX0.TH); t = t.xyxy(GSVector4::load(m_v.RGBAQ.Q));
} }
v.t = v.t.xyxy(GSVector4::load(m_v.RGBAQ.Q)); dst.t = t;
} }
GSVertexSW& dst = m_vl.AddTail(); dst.c = GSVector4::rgba32(m_v.RGBAQ.u32[0], 7);
dst = v;
dst.p.z = (float)min(m_v.XYZ.Z, 0xffffff00); // max value which can survive the uint32 => float => uint32 conversion
int count = 0; int count = 0;

View File

@ -29,7 +29,7 @@ using namespace Xbyak;
void GSSetupPrimCodeGenerator::Generate() void GSSetupPrimCodeGenerator::Generate()
{ {
enter(32, true); sub(rsp, 8 + 2 * 16);
vmovdqa(ptr[rsp + 0], xmm6); vmovdqa(ptr[rsp + 0], xmm6);
vmovdqa(ptr[rsp + 16], xmm7); vmovdqa(ptr[rsp + 16], xmm7);
@ -55,7 +55,7 @@ void GSSetupPrimCodeGenerator::Generate()
vmovdqa(xmm6, ptr[rsp + 0]); vmovdqa(xmm6, ptr[rsp + 0]);
vmovdqa(xmm7, ptr[rsp + 16]); vmovdqa(xmm7, ptr[rsp + 16]);
leave(); add(rsp, 8 + 2 * 16);
ret(); ret();
} }
@ -186,16 +186,11 @@ void GSSetupPrimCodeGenerator::Texture()
if(m_sel.fst) if(m_sel.fst)
{ {
// m_local.d4.st = GSVector4i(t * 4.0f); // m_local.d4.stq = GSVector4i(t * 4.0f);
if(m_sel.mipmap && !m_sel.lcm)
{
vmovhps(ptr[r8 + offsetof(GSScanlineLocalData, d4.stq.z)], xmm1);
}
vcvttps2dq(xmm1, xmm1); vcvttps2dq(xmm1, xmm1);
vmovq(ptr[r8 + offsetof(GSScanlineLocalData, d4.stq)], xmm1); vmovdqa(ptr[r8 + offsetof(GSScanlineLocalData, d4.stq)], xmm1);
} }
else else
{ {
@ -204,7 +199,7 @@ void GSSetupPrimCodeGenerator::Texture()
vmovaps(ptr[r8 + offsetof(GSScanlineLocalData, d4.stq)], xmm1); vmovaps(ptr[r8 + offsetof(GSScanlineLocalData, d4.stq)], xmm1);
} }
for(int j = 0, k = m_sel.fst && !(m_sel.mipmap && !m_sel.lcm) ? 2 : 3; j < k; j++) for(int j = 0, k = m_sel.fst ? 2 : 3; j < k; j++)
{ {
// GSVector4 ds = t.xxxx(); // GSVector4 ds = t.xxxx();
// GSVector4 dt = t.yyyy(); // GSVector4 dt = t.yyyy();
@ -218,7 +213,7 @@ void GSSetupPrimCodeGenerator::Texture()
vmulps(xmm2, xmm1, Xmm(4 + i)); vmulps(xmm2, xmm1, Xmm(4 + i));
if(m_sel.fst && !(m_sel.mipmap && !m_sel.lcm)) if(m_sel.fst)
{ {
// m_local.d[i].s/t = GSVector4i(v); // m_local.d[i].s/t = GSVector4i(v);

View File

@ -29,7 +29,7 @@ using namespace Xbyak;
void GSSetupPrimCodeGenerator::Generate() void GSSetupPrimCodeGenerator::Generate()
{ {
enter(32, true); sub(rsp, 8 + 2 * 16);
vmovdqa(ptr[rsp + 0], xmm6); vmovdqa(ptr[rsp + 0], xmm6);
vmovdqa(ptr[rsp + 16], xmm7); vmovdqa(ptr[rsp + 16], xmm7);
@ -53,7 +53,7 @@ void GSSetupPrimCodeGenerator::Generate()
vmovdqa(xmm6, ptr[rsp + 0]); vmovdqa(xmm6, ptr[rsp + 0]);
vmovdqa(xmm7, ptr[rsp + 16]); vmovdqa(xmm7, ptr[rsp + 16]);
leave(); add(rsp, 8 + 2 * 16);
ret(); ret();
} }
@ -191,16 +191,11 @@ void GSSetupPrimCodeGenerator::Texture()
if(m_sel.fst) if(m_sel.fst)
{ {
// m_local.d4.st = GSVector4i(t * 4.0f); // m_local.d4.stq = GSVector4i(t * 4.0f);
if(m_sel.mipmap && !m_sel.lcm)
{
movhps(ptr[r8 + offsetof(GSScanlineLocalData, d4.stq.z)], xmm1);
}
cvttps2dq(xmm1, xmm1); cvttps2dq(xmm1, xmm1);
movq(ptr[r8 + offsetof(GSScanlineLocalData, d4.stq)], xmm1); movdqa(ptr[r8 + offsetof(GSScanlineLocalData, d4.stq)], xmm1);
} }
else else
{ {
@ -209,7 +204,7 @@ void GSSetupPrimCodeGenerator::Texture()
movaps(ptr[r8 + offsetof(GSScanlineLocalData, d4.stq)], xmm1); movaps(ptr[r8 + offsetof(GSScanlineLocalData, d4.stq)], xmm1);
} }
for(int j = 0, k = m_sel.fst && !(m_sel.mipmap && !m_sel.lcm) ? 2 : 3; j < k; j++) for(int j = 0, k = m_sel.fst ? 2 : 3; j < k; j++)
{ {
// GSVector4 ds = t.xxxx(); // GSVector4 ds = t.xxxx();
// GSVector4 dt = t.yyyy(); // GSVector4 dt = t.yyyy();
@ -225,7 +220,7 @@ void GSSetupPrimCodeGenerator::Texture()
movaps(xmm2, xmm1); movaps(xmm2, xmm1);
mulps(xmm2, Xmm(4 + i)); mulps(xmm2, Xmm(4 + i));
if(m_sel.fst && !(m_sel.mipmap && !m_sel.lcm)) if(m_sel.fst)
{ {
// m_local.d[i].s/t = GSVector4i(v); // m_local.d[i].s/t = GSVector4i(v);

View File

@ -322,8 +322,7 @@ void GSSetupPrimCodeGenerator::Color()
{ {
// GSVector4i c = GSVector4i(vertices[0].c); // GSVector4i c = GSVector4i(vertices[0].c);
movaps(xmm0, ptr[ecx + offsetof(GSVertexSW, c)]); cvttps2dq(xmm0, ptr[ecx + offsetof(GSVertexSW, c)]);
cvttps2dq(xmm0, xmm0);
// c = c.upl16(c.zwxy()); // c = c.upl16(c.zwxy());

View File

@ -590,41 +590,41 @@ template<int i> void GSState::GIFRegHandlerTEX0(const GIFReg* r)
if(m_env.CTXT[i].TEX1.MTBA) if(m_env.CTXT[i].TEX1.MTBA)
{ {
// NOTE 1: TEX1.MXL must not be automatically set to 3 here.
// NOTE 2: Mipmap levels are tightly packed, if (tbw << 6) > (1 << tw) then the left-over space to the right is used. (common for PSM_PSMT4)
// NOTE 3: Non-rectangular textures are treated as rectangular when calculating the occupied space (height is extended, not sure about width)
uint32 bp = TEX0.TBP0;
uint32 bw = TEX0.TBW;
uint32 w = 1u << TEX0.TW;
uint32 h = 1u << TEX0.TH;
uint32 bpp = GSLocalMemory::m_psm[TEX0.PSM].bpp; uint32 bpp = GSLocalMemory::m_psm[TEX0.PSM].bpp;
uint32 tbp = TEX0.TBP0; if(h < w) h = w;
uint32 tbw = TEX0.TBW;
uint32 th = TEX0.TH;
if(th >= 3) bp += ((w * h * bpp >> 3) + 255) >> 8;
{ bw = std::max<uint32>(bw >> 1, 1);
tbp += (((tbw << 6) * (1 << th) * bpp >> 3) + 255) >> 8; w = std::max<uint32>(w >> 1, 1);
tbw = std::max<uint32>(tbw >> 1, 1); h = std::max<uint32>(h >> 1, 1);
th--;
m_env.CTXT[i].MIPTBP1.TBP1 = tbp; m_env.CTXT[i].MIPTBP1.TBP1 = bp;
m_env.CTXT[i].MIPTBP1.TBW1 = tbw; m_env.CTXT[i].MIPTBP1.TBW1 = bw;
tbp += (((tbw << 6) * (1 << th) * bpp >> 3) + 255) >> 8; bp += ((w * h * bpp >> 3) + 255) >> 8;
tbw = std::max<uint32>(tbw >> 1, 1); bw = std::max<uint32>(bw >> 1, 1);
th--; w = std::max<uint32>(w >> 1, 1);
h = std::max<uint32>(h >> 1, 1);
m_env.CTXT[i].MIPTBP1.TBP2 = tbp; m_env.CTXT[i].MIPTBP1.TBP2 = bp;
m_env.CTXT[i].MIPTBP1.TBW2 = tbw; m_env.CTXT[i].MIPTBP1.TBW2 = bw;
tbp += (((tbw << 6) * (1 << th) * bpp >> 3) + 255) >> 8; bp += ((w * h * bpp >> 3) + 255) >> 8;
tbw = std::max<uint32>(tbw >> 1, 1); bw = std::max<uint32>(bw >> 1, 1);
th--; w = std::max<uint32>(w >> 1, 1);
h = std::max<uint32>(h >> 1, 1);
m_env.CTXT[i].MIPTBP1.TBP3 = tbp; m_env.CTXT[i].MIPTBP1.TBP3 = bp;
m_env.CTXT[i].MIPTBP1.TBW3 = tbw; m_env.CTXT[i].MIPTBP1.TBW3 = bw;
// NOTE: TEX1.MXL must not be automatically set to 3 here
}
else
{
ASSERT(0);
}
// printf("MTBA\n"); // printf("MTBA\n");
} }

View File

@ -29,6 +29,7 @@ const GSVector4 GSVector4::m_one(1.0f);
const GSVector4 GSVector4::m_two(2.0f); const GSVector4 GSVector4::m_two(2.0f);
const GSVector4 GSVector4::m_four(4.0f); const GSVector4 GSVector4::m_four(4.0f);
const GSVector4 GSVector4::m_x4b000000(_mm_castsi128_ps(_mm_set1_epi32(0x4b000000))); const GSVector4 GSVector4::m_x4b000000(_mm_castsi128_ps(_mm_set1_epi32(0x4b000000)));
const GSVector4 GSVector4::m_x4f800000(_mm_castsi128_ps(_mm_set1_epi32(0x4f800000)));
GSVector4i GSVector4i::fit(int arx, int ary) const GSVector4i GSVector4i::fit(int arx, int ary) const
{ {

View File

@ -2333,6 +2333,7 @@ public:
static const GSVector4 m_two; static const GSVector4 m_two;
static const GSVector4 m_four; static const GSVector4 m_four;
static const GSVector4 m_x4b000000; static const GSVector4 m_x4b000000;
static const GSVector4 m_x4f800000;
__forceinline GSVector4() __forceinline GSVector4()
{ {
@ -2385,9 +2386,18 @@ public:
this->m = m; this->m = m;
} }
__forceinline explicit GSVector4(uint32 u32) __forceinline explicit GSVector4(int i)
{ {
*this = GSVector4(GSVector4i::load((int)u32).u8to32()); GSVector4i v((int)i);
*this = GSVector4(v);
}
// Builds the vector from an unsigned 32-bit integer.
//
// The value is reinterpreted as int and converted through the GSVector4i
// path, then a correction term is added: m_x4f800000 (every lane holds the
// float bit pattern 0x4f800000, i.e. 2^32) masked by bits.sra32(31).
// NOTE(review): presumably sra32(31) is an arithmetic shift that yields all
// ones only where the top bit of `u` was set, so the 2^32 is added exactly
// for inputs that converted as negative — confirm against GSVector4i.
__forceinline explicit GSVector4(uint32 u)
{
	GSVector4i bits((int)u);

	GSVector4 fixup = m_x4f800000 & GSVector4::cast(bits.sra32(31));

	*this = GSVector4(bits) + fixup;
}
__forceinline explicit GSVector4(const GSVector4i& v); __forceinline explicit GSVector4(const GSVector4i& v);
@ -2407,11 +2417,6 @@ public:
this->m = m; this->m = m;
} }
__forceinline void operator = (uint32 u32)
{
*this = GSVector4(GSVector4i::load((int)u32).u8to32());
}
__forceinline operator __m128() const __forceinline operator __m128() const
{ {
return m; return m;
@ -2422,6 +2427,16 @@ public:
return GSVector4i(*this).rgba32(); return GSVector4i(*this).rgba32();
} }
// Builds a float vector from a packed 32-bit color value.
//
// The packed value is loaded into a GSVector4i, passed through u8to32() and
// the result is converted to GSVector4.
// NOTE(review): presumably u8to32() widens each of the four bytes to its own
// 32-bit lane, giving one float per channel in the 0..255 range — confirm.
__forceinline static GSVector4 rgba32(uint32 rgba)
{
	const int packed = (int)rgba;

	return GSVector4(GSVector4i::load(packed).u8to32());
}
// Same as rgba32(uint32), except that the integer lanes produced by u8to32()
// are shifted left by `shift` bits before the conversion to float.
// NOTE(review): presumably this scales each channel by 2^shift — confirm the
// semantics of GSVector4i's operator<<.
__forceinline static GSVector4 rgba32(uint32 rgba, int shift)
{
	const int packed = (int)rgba;

	return GSVector4(GSVector4i::load(packed).u8to32() << shift);
}
__forceinline static GSVector4 cast(const GSVector4i& v); __forceinline static GSVector4 cast(const GSVector4i& v);
__forceinline GSVector4 abs() const __forceinline GSVector4 abs() const
@ -2840,6 +2855,13 @@ public:
return GSVector4(_mm_load_ss(&f)); return GSVector4(_mm_load_ss(&f));
} }
// Loads an unsigned 32-bit integer and returns it converted to float.
//
// The value goes through GSVector4i::load() as an int and is converted with
// the GSVector4i path; a correction term is then added: m_x4f800000 (the float
// bit pattern 0x4f800000, i.e. 2^32, in every lane) masked by bits.sra32(31).
// NOTE(review): presumably sra32(31) produces an all-ones mask only where the
// top bit of `u` was set, so values above INT_MAX get 2^32 added back —
// confirm against GSVector4i.
__forceinline static GSVector4 load(uint32 u)
{
	GSVector4i bits = GSVector4i::load((int)u);

	GSVector4 fixup = m_x4f800000 & GSVector4::cast(bits.sra32(31));

	return GSVector4(bits) + fixup;
}
template<bool aligned> __forceinline static GSVector4 load(const void* p) template<bool aligned> __forceinline static GSVector4 load(const void* p)
{ {
return GSVector4(aligned ? _mm_load_ps((const float*)p) : _mm_loadu_ps((const float*)p)); return GSVector4(aligned ? _mm_load_ps((const float*)p) : _mm_loadu_ps((const float*)p));

View File

@ -51,7 +51,7 @@ GSVertexTrace::CGSW::CGSW(const void* param, uint32 key, void* code, size_t maxs
break; break;
} }
enter(32, true); sub(rsp, 8 + 2 * 16);
vmovdqa(ptr[rsp + 0], xmm6); vmovdqa(ptr[rsp + 0], xmm6);
vmovdqa(ptr[rsp + 16], xmm7); vmovdqa(ptr[rsp + 16], xmm7);
@ -168,7 +168,7 @@ GSVertexTrace::CGSW::CGSW(const void* param, uint32 key, void* code, size_t maxs
vmovdqa(xmm6, ptr[rsp + 0]); vmovdqa(xmm6, ptr[rsp + 0]);
vmovdqa(xmm7, ptr[rsp + 16]); vmovdqa(xmm7, ptr[rsp + 16]);
leave(); add(rsp, 8 + 2 * 16);
ret(); ret();
} }
@ -200,7 +200,7 @@ GSVertexTrace::CGHW9::CGHW9(const void* param, uint32 key, void* code, size_t ma
break; break;
} }
enter(32, true); sub(rsp, 8 + 2 * 16);
vmovdqa(ptr[rsp + 0], xmm6); vmovdqa(ptr[rsp + 0], xmm6);
vmovdqa(ptr[rsp + 16], xmm7); vmovdqa(ptr[rsp + 16], xmm7);
@ -334,7 +334,7 @@ GSVertexTrace::CGHW9::CGHW9(const void* param, uint32 key, void* code, size_t ma
vmovdqa(xmm6, ptr[rsp + 0]); vmovdqa(xmm6, ptr[rsp + 0]);
vmovdqa(xmm7, ptr[rsp + 16]); vmovdqa(xmm7, ptr[rsp + 16]);
leave(); add(rsp, 8 + 2 * 16);
ret(); ret();
} }
@ -364,7 +364,7 @@ GSVertexTrace::CGHW11::CGHW11(const void* param, uint32 key, void* code, size_t
break; break;
} }
enter(32, true); sub(rsp, 8 + 2 * 16);
vmovdqa(ptr[rsp + 0], xmm6); vmovdqa(ptr[rsp + 0], xmm6);
vmovdqa(ptr[rsp + 16], xmm7); vmovdqa(ptr[rsp + 16], xmm7);
@ -488,7 +488,7 @@ GSVertexTrace::CGHW11::CGHW11(const void* param, uint32 key, void* code, size_t
vmovdqa(xmm6, ptr[rsp + 0]); vmovdqa(xmm6, ptr[rsp + 0]);
vmovdqa(xmm7, ptr[rsp + 16]); vmovdqa(xmm7, ptr[rsp + 16]);
leave(); add(rsp, 8 + 2 * 16);
ret(); ret();
} }

View File

@ -51,7 +51,7 @@ GSVertexTrace::CGSW::CGSW(const void* param, uint32 key, void* code, size_t maxs
break; break;
} }
enter(32, true); sub(rsp, 8 + 2 * 16);
movdqa(ptr[rsp + 0], xmm6); movdqa(ptr[rsp + 0], xmm6);
movdqa(ptr[rsp + 16], xmm7); movdqa(ptr[rsp + 16], xmm7);
@ -172,7 +172,7 @@ GSVertexTrace::CGSW::CGSW(const void* param, uint32 key, void* code, size_t maxs
movdqa(xmm6, ptr[rsp + 0]); movdqa(xmm6, ptr[rsp + 0]);
movdqa(xmm7, ptr[rsp + 16]); movdqa(xmm7, ptr[rsp + 16]);
leave(); add(rsp, 8 + 2 * 16);
ret(); ret();
} }
@ -204,7 +204,7 @@ GSVertexTrace::CGHW9::CGHW9(const void* param, uint32 key, void* code, size_t ma
break; break;
} }
enter(32, true); sub(rsp, 8 + 2 * 16);
movdqa(ptr[rsp + 0], xmm6); movdqa(ptr[rsp + 0], xmm6);
movdqa(ptr[rsp + 16], xmm7); movdqa(ptr[rsp + 16], xmm7);
@ -355,7 +355,7 @@ GSVertexTrace::CGHW9::CGHW9(const void* param, uint32 key, void* code, size_t ma
movdqa(xmm6, ptr[rsp + 0]); movdqa(xmm6, ptr[rsp + 0]);
movdqa(xmm7, ptr[rsp + 16]); movdqa(xmm7, ptr[rsp + 16]);
leave(); add(rsp, 8 + 2 * 16);
ret(); ret();
} }
@ -385,7 +385,7 @@ GSVertexTrace::CGHW11::CGHW11(const void* param, uint32 key, void* code, size_t
break; break;
} }
enter(32, true); sub(rsp, 8 + 2 * 16);
movdqa(ptr[rsp + 0], xmm6); movdqa(ptr[rsp + 0], xmm6);
movdqa(ptr[rsp + 16], xmm7); movdqa(ptr[rsp + 16], xmm7);
@ -535,7 +535,7 @@ GSVertexTrace::CGHW11::CGHW11(const void* param, uint32 key, void* code, size_t
movdqa(xmm6, ptr[rsp + 0]); movdqa(xmm6, ptr[rsp + 0]);
movdqa(xmm7, ptr[rsp + 16]); movdqa(xmm7, ptr[rsp + 16]);
leave(); add(rsp, 8 + 2 * 16);
ret(); ret();
} }