gsdx: date/datm fix for 16-bit frame buffer in sw rendering mode

This commit is contained in:
gabest11 2015-08-02 20:21:49 +02:00
parent 1f402b1b56
commit a1a842b07f
11 changed files with 599 additions and 289 deletions

View File

@ -113,13 +113,13 @@ bool GPURenderer::Merge()
GSVector2i s = st[0]->GetSize();
GSVector4 sRect[2];
GSVector4 dRect[2];
GSVector4 sr[2];
GSVector4 dr[2];
sRect[0] = GSVector4(0, 0, 1, 1);
dRect[0] = GSVector4(0, 0, s.x, s.y);
sr[0] = GSVector4(0, 0, 1, 1);
dr[0] = GSVector4(0, 0, s.x, s.y);
m_dev->Merge(st, sRect, dRect, s, 1, 1, GSVector4(0, 0, 0, 1));
m_dev->Merge(st, sr, dr, s, 1, 1, GSVector4(0, 0, 0, 1));
if(m_shadeboost)
{

View File

@ -1148,7 +1148,8 @@ void GSDrawScanline::DrawScanline(int pixels, int left, int top, const GSVertexS
{
if(sel.fpsm == 2)
{
test |= fd.srl32(15) == GSVector8i::zero();
// test |= fd.srl32(15) == GSVector8i::zero();
test |= fd.sll32(16).sra32(31) == GSVector8i::zero();
}
else
{
@ -1159,7 +1160,7 @@ void GSDrawScanline::DrawScanline(int pixels, int left, int top, const GSVertexS
{
if(sel.fpsm == 2)
{
test |= fd.sll32(16).sra32(31);
test |= fd.sll32(16).sra32(31); // == GSVector8i::xffffffff();
}
else
{
@ -2264,7 +2265,8 @@ void GSDrawScanline::DrawScanline(int pixels, int left, int top, const GSVertexS
{
if(sel.fpsm == 2)
{
test |= fd.srl32(15) == GSVector4i::zero();
// test |= fd.srl32(15) == GSVector4i::zero();
test |= fd.sll32(16).sra32(31) == GSVector4i::zero();
}
else
{
@ -2275,7 +2277,7 @@ void GSDrawScanline::DrawScanline(int pixels, int left, int top, const GSVertexS
{
if(sel.fpsm == 2)
{
test |= fd.sll32(16).sra32(31);
test |= fd.sll32(16).sra32(31); // == GSVector4i::xffffffff();
}
else
{

View File

@ -1326,7 +1326,9 @@ void GSDrawScanlineCodeGenerator::TestDestAlpha()
if(m_sel.fpsm == 2)
{
vpxor(xmm0, xmm0);
vpsrld(xmm1, xmm6, 15);
//vpsrld(xmm1, xmm6, 15);
vpslld(xmm1, xmm6, 16);
vpsrad(xmm1, 31);
vpcmpeqd(xmm1, xmm0);
}
else

View File

@ -2330,7 +2330,9 @@ void GSDrawScanlineCodeGenerator::TestDestAlpha()
if(m_sel.fpsm == 2)
{
vpxor(xmm0, xmm0);
vpsrld(xmm1, xmm2, 15);
//vpsrld(xmm1, xmm2, 15);
vpslld(xmm1, xmm2, 16);
vpsrad(xmm1, 31);
vpcmpeqd(xmm1, xmm0);
}
else

View File

@ -2295,7 +2295,9 @@ void GSDrawScanlineCodeGenerator::TestDestAlpha()
if(m_sel.fpsm == 2)
{
vpxor(ymm0, ymm0);
vpsrld(ymm1, ymm2, 15);
//vpsrld(ymm1, ymm2, 15);
vpslld(ymm1, ymm2, 16);
vpsrad(ymm1, 31);
vpcmpeqd(ymm1, ymm0);
}
else

File diff suppressed because it is too large Load Diff

View File

@ -108,6 +108,7 @@ class GSRendererCL : public GSRenderer
uint32 noscissor:1; // 53
uint32 tpsm:4; // 54
uint32 aem:1; // 58
uint32 merged:1; // 59
// TODO
};
@ -148,6 +149,7 @@ class GSRendererCL : public GSRenderer
{
GSVector4i scissor;
GSVector4i dimx; // 4x4 signed char
TFXSelector sel;
uint32 fbp, zbp, bw;
uint32 fm, zm;
uint32 fog; // rgb
@ -172,6 +174,7 @@ class GSRendererCL : public GSRenderer
GSVector4i* src_pages; // read by any texture level
GSVector4i* dst_pages; // f/z writes to it
uint32 fbp, zbp, bw;
uint32 fpsm, zpsm, tpsm;
#ifdef DEBUG
TFXParameter* pb;
#endif
@ -227,16 +230,23 @@ class GSRendererCL : public GSRenderer
void Enqueue();
void EnqueueTFX(std::list<shared_ptr<TFXJob>>& jobs, uint32 bin_count, const cl_uchar4& bin_dim);
void UpdateTextureCache(TFXJob* job);
void JoinTFX(std::list<shared_ptr<TFXJob>>& jobs);
bool UpdateTextureCache(TFXJob* job);
void InvalidateTextureCache(TFXJob* job);
void UsePages(uint32* pages);
void ReleasePages(uint32* pages);
static void CL_CALLBACK ReleasePageEvent(cl_event event, cl_int event_command_exec_status, void* user_data);
protected:
GSTexture* m_texture[2];
uint8* m_output;
GSVector4i m_rw_pages[2][4]; // pages that may be read or modified by the rendering queue, f/z rw, tex r
GSVector4i m_tc_pages[4]; // invalidated texture cache pages (split this into 8:24?)
GSVector4i m_tmp_pages[4]; // TODO: this should be block level, too many overlaps inside pages with render targets
GSVector4i m_tc_pages[4]; // invalidated texture cache pages (split this into 8:24?) // TODO: this should be block level, too many overlaps inside pages with render targets
GSVector4i m_tmp_pages[4];
uint32 m_tmp_pages2[MAX_PAGES + 1];
uint32 m_rw_pages_rendering[512]; // pages that are currently in-use
void Reset();
void VSync(int field);

View File

@ -66,6 +66,9 @@ GSState::GSState()
//s_dump = 1;
//s_save = 1;
//s_savez = 1;
//s_savet = 1;
//s_savef = 1;
//s_saven = 656;
UserHacks_WildHack = !!theApp.GetConfig("UserHacks", 0) ? theApp.GetConfig("UserHacks_WildHack", 0) : 0;
m_crc_hack_level = theApp.GetConfig("crc_hack_level", 3);

View File

@ -224,7 +224,7 @@ bool GSUtil::CheckSSE()
return true;
}
#define OCL_PROGRAM_VERSION 1
#define OCL_PROGRAM_VERSION 3
#ifdef ENABLE_OPENCL
void GSUtil::GetDeviceDescs(list<OCLDeviceDesc>& dl)

View File

@ -1,3 +1,9 @@
#if defined(CL_VERSION_2_0)
#error hello
#endif
#if defined(CL_VERSION_1_1) || defined(CL_VERSION_1_2) // make safe to include in resource file to enforce dependency
#ifdef cl_amd_printf
@ -26,6 +32,13 @@
#error "MAX_PRIM_PER_BATCH != 32u OR 64u"
#endif
#define TFX_ABA(sel) ((sel.x >> 24) & 3)
#define TFX_ABB(sel) ((sel.x >> 26) & 3)
#define TFX_ABC(sel) ((sel.x >> 28) & 3)
#define TFX_ABD(sel) ((sel.x >> 30) & 3)
#define TFX_WMS(sel) ((sel.y >> 8) & 3)
#define TFX_WMT(sel) ((sel.y >> 10) & 3)
typedef struct
{
union {float4 p; struct {float x, y; uint z, f;};};
@ -35,9 +48,9 @@ typedef struct
typedef struct
{
gs_vertex v[3];
uint zmin;
uint zmin, zmax;
uint pb_index;
uint _pad[2];
uint _pad;
} gs_prim;
typedef struct
@ -60,6 +73,7 @@ typedef struct
{
int4 scissor;
char dimx[4][4];
uint2 sel;
int fbp, zbp, bw;
uint fm, zm;
uchar4 fog; // rgb
@ -110,10 +124,10 @@ enum GS_TFX
enum GS_CLAMP
{
CLAMP_REPEAT = 0,
CLAMP_CLAMP = 1,
CLAMP_REGION_CLAMP = 2,
CLAMP_REGION_REPEAT = 3,
CLAMP_REGION_REPEAT = 0,
CLAMP_REPEAT = 1,
CLAMP_CLAMP = 2,
CLAMP_REGION_CLAMP = 3,
};
enum GS_ZTST
@ -603,6 +617,7 @@ __kernel void KERNEL_PRIM(
// only interpolate the relative to zmin and hopefully small values
uint zmin = min(min(v0->z, v1->z), v2->z);
uint zmax = max(max(v0->z, v1->z), v2->z);
prim->v[0].p = (float4)(v0->p.x, v0->p.y, as_float(v0->z - zmin), v0->p.w);
prim->v[0].tc = v0->tc;
@ -612,6 +627,7 @@ __kernel void KERNEL_PRIM(
prim->v[2].tc = v2->tc;
prim->zmin = zmin;
prim->zmax = zmax;
float4 dp0 = v1->p - v0->p;
float4 dp1 = v0->p - v2->p;
@ -969,27 +985,34 @@ bool DestAlphaTest(uint fd)
int Wrap(int a, int b, int c, int mode)
{
switch(mode)
if(MERGED)
{
case CLAMP_REPEAT:
return a & b;
case CLAMP_CLAMP:
return clamp(a, 0, c);
case CLAMP_REGION_CLAMP:
return clamp(a, b, c);
case CLAMP_REGION_REPEAT:
return (a & b) | c;
return select((a & b) | c, clamp(a, b, c), (mode & 2) != 0);
}
else
{
switch(mode)
{
case CLAMP_REGION_REPEAT:
return (a & b) | c;
case CLAMP_REPEAT:
return a & b;
case CLAMP_CLAMP:
return clamp(a, 0, c);
case CLAMP_REGION_CLAMP:
return clamp(a, b, c);
}
}
}
int4 AlphaBlend(int4 c, int afix, uint fd)
int4 AlphaBlend(int4 c, uint fd, int afix, uint2 sel)
{
if(FWRITE && (ABE || AA1))
{
int4 cs = c;
int4 cd;
if(ABA != ABB && (ABA == 1 || ABB == 1 || ABC == 1) || ABD == 1)
if(ABA != ABB && (ABA == 1 || ABB == 1 || ABC == 1) || ABD == 1 || MERGED)
{
if(is32bit(FPSM) || is24bit(FPSM))
{
@ -1007,50 +1030,69 @@ int4 AlphaBlend(int4 c, int afix, uint fd)
}
}
if(ABA != ABB)
if(MERGED)
{
switch(ABA)
{
case 0: break; // c.xyz = cs.xyz;
case 1: c.xyz = cd.xyz; break;
case 2: c.xyz = 0; break;
}
int aba = TFX_ABA(sel);
int abb = TFX_ABB(sel);
int abc = TFX_ABC(sel);
int abd = TFX_ABD(sel);
switch(ABB)
{
case 0: c.xyz -= cs.xyz; break;
case 1: c.xyz -= cd.xyz; break;
case 2: break;
}
int ad = !is24bit(FPSM) ? cd.w : 0x80;
if(!(is24bit(FPSM) && ABC == 1))
{
int a = 0;
int3 A = aba == 0 ? cs.xyz : aba == 1 ? cd.xyz : 0;
int3 B = abb == 0 ? cs.xyz : abb == 1 ? cd.xyz : 0;
int C = abc == 0 ? cs.w : abc == 1 ? ad : afix;
int3 D = abd == 0 ? cs.xyz : abd == 1 ? cd.xyz : 0;
switch(ABC)
{
case 0: a = cs.w; break;
case 1: a = cd.w; break;
case 2: a = afix; break;
}
c.xyz = c.xyz * a >> 7;
}
switch(ABD)
{
case 0: c.xyz += cs.xyz; break;
case 1: c.xyz += cd.xyz; break;
case 2: break;
}
c.xyz = (mul24(A - B, C) >> 7) + D;
}
else
{
switch(ABD)
if(ABA != ABB)
{
case 0: break;
case 1: c.xyz = cd.xyz; break;
case 2: c.xyz = 0; break;
switch(ABA)
{
case 0: break; // c.xyz = cs.xyz;
case 1: c.xyz = cd.xyz; break;
case 2: c.xyz = 0; break;
}
switch(ABB)
{
case 0: c.xyz -= cs.xyz; break;
case 1: c.xyz -= cd.xyz; break;
case 2: break;
}
if(!(is24bit(FPSM) && ABC == 1))
{
int a = 0;
switch(ABC)
{
case 0: a = cs.w; break;
case 1: a = cd.w; break;
case 2: a = afix; break;
}
c.xyz = c.xyz * a >> 7;
}
switch(ABD)
{
case 0: c.xyz += cs.xyz; break;
case 1: c.xyz += cd.xyz; break;
case 2: break;
}
}
else
{
switch(ABD)
{
case 0: break;
case 1: c.xyz = cd.xyz; break;
case 2: c.xyz = 0; break;
}
}
}
@ -1150,8 +1192,6 @@ int4 SampleTexture(__global uchar* tex, __global gs_param* pb, float3 t)
if(!FST)
{
uv = convert_int2_rte(t.xy * native_recip(t.z));
if(LTF) uv -= 0x0008;
}
else
{
@ -1167,15 +1207,17 @@ int4 SampleTexture(__global uchar* tex, __global gs_param* pb, float3 t)
uv = convert_int2(t.xy);
}
if(LTF) uv -= 0x0008;
int2 uvf = uv & 0x000f;
int2 uv0 = uv >> 4;
int2 uv1 = uv0 + 1;
uv0.x = Wrap(uv0.x, pb->minu, pb->maxu, WMS);
uv0.y = Wrap(uv0.y, pb->minv, pb->maxv, WMT);
uv1.x = Wrap(uv1.x, pb->minu, pb->maxu, WMS);
uv1.y = Wrap(uv1.y, pb->minv, pb->maxv, WMT);
uv0.x = Wrap(uv0.x, pb->minu, pb->maxu, MERGED ? TFX_WMS(pb->sel) : WMS);
uv0.y = Wrap(uv0.y, pb->minv, pb->maxv, MERGED ? TFX_WMT(pb->sel) : WMT);
uv1.x = Wrap(uv1.x, pb->minu, pb->maxu, MERGED ? TFX_WMS(pb->sel) : WMS);
uv1.y = Wrap(uv1.y, pb->minv, pb->maxv, MERGED ? TFX_WMT(pb->sel) : WMT);
int4 c00 = ReadTexel(tex, uv0.x, uv0.y, 0, pb);
int4 c01 = ReadTexel(tex, uv1.x, uv0.y, 0, pb);
@ -1356,6 +1398,11 @@ __kernel __attribute__((reqd_work_group_size(8, 8, 1))) void KERNEL_TFX(
{
// TODO: aa1: draw edge as a line
if(!ZTest(prim->zmax, zd))
{
continue;
}
__global gs_barycentric* b = &barycentric[prim_index + i];
float3 f = b->dx.xyz * (pf.x - b->dx.w) + b->dy.xyz * (pf.y - b->dy.w) + (float3)(0, 0, 1);
@ -1420,8 +1467,6 @@ __kernel __attribute__((reqd_work_group_size(8, 8, 1))) void KERNEL_TFX(
if(TFX != TFX_NONE)
{
tex = vm; // TODO: use the texture cache
ct = SampleTexture(tex, pb, t);
}
@ -1515,7 +1560,7 @@ __kernel __attribute__((reqd_work_group_size(8, 8, 1))) void KERNEL_TFX(
// alpha blend
c = AlphaBlend(c, pb->afix, fd);
c = AlphaBlend(c, fd, pb->afix, pb->sel);
// write frame

View File

@ -123,8 +123,15 @@ using namespace std;
#include <GL/wglext.h>
#include "GLLoader.h"
#if _MSC_VER >= 1800
#include <unordered_map>
#include <unordered_set>
#define hash_map unordered_map
#define hash_set unordered_set
#else
#include <hash_map>
#include <hash_set>
#endif
using namespace stdext;