From a27142f1a6415b50d2709bbab35b163f519f26e0 Mon Sep 17 00:00:00 2001 From: "sudonim1@gmail.com" Date: Mon, 23 Apr 2012 18:56:22 +0000 Subject: [PATCH] Set some svn:eol-style properties. git-svn-id: http://pcsx2.googlecode.com/svn/trunk@5173 96395faa-99c1-11dd-bbfe-3dabce05a288 --- plugins/GSdx/GSRendererCS.cpp | 1730 +++++++------- plugins/GSdx/GSRendererCS.h | 290 +-- plugins/GSdx/config.h | 88 +- plugins/GSdx/res/cs.fx | 766 +++--- plugins/GSdx/res/fxaa.fx | 3992 ++++++++++++++++---------------- plugins/GSdx/res/shadeboost.fx | 150 +- 6 files changed, 3508 insertions(+), 3508 deletions(-) diff --git a/plugins/GSdx/GSRendererCS.cpp b/plugins/GSdx/GSRendererCS.cpp index d9945a9070..50fd594cdb 100644 --- a/plugins/GSdx/GSRendererCS.cpp +++ b/plugins/GSdx/GSRendererCS.cpp @@ -1,865 +1,865 @@ -/* - * Copyright (C) 2007-2009 Gabest - * http://www.gabest.org - * - * This Program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2, or (at your option) - * any later version. - * - * This Program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with GNU Make; see the file COPYING. If not, write to - * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. - * http://www.gnu.org/copyleft/gpl.html - * - */ - -#include "stdafx.h" -#include "GSRendererCS.h" - -#define PS_BATCH_SIZE 512 - -GSRendererCS::GSRendererCS() - : GSRenderer() -{ - m_nativeres = true; - - memset(m_vm_valid, 0, sizeof(m_vm_valid)); - - memset(m_texture, 0, sizeof(m_texture)); - - m_output = (uint8*)_aligned_malloc(1024 * 1024 * sizeof(uint32), 32); -} - -GSRendererCS::~GSRendererCS() -{ - for(int i = 0; i < countof(m_texture); i++) - { - delete m_texture[i]; - } - - _aligned_free(m_output); -} - -bool GSRendererCS::CreateDevice(GSDevice* dev_unk) -{ - if(!__super::CreateDevice(dev_unk)) - return false; - - HRESULT hr; - - D3D11_DEPTH_STENCIL_DESC dsd; - D3D11_BLEND_DESC bsd; - D3D11_SAMPLER_DESC sd; - D3D11_BUFFER_DESC bd; - D3D11_TEXTURE2D_DESC td; - D3D11_UNORDERED_ACCESS_VIEW_DESC uavd; - D3D11_SHADER_RESOURCE_VIEW_DESC srvd; - - D3D_FEATURE_LEVEL level; - - ((GSDeviceDX*)dev_unk)->GetFeatureLevel(level); - - if(level < D3D_FEATURE_LEVEL_11_0) - return false; - - GSDevice11* dev = (GSDevice11*)dev_unk; - - ID3D11DeviceContext* ctx = *dev; - - // empty depth stencil state - - memset(&dsd, 0, sizeof(dsd)); - - dsd.StencilEnable = false; - dsd.DepthEnable = false; - - hr = (*dev)->CreateDepthStencilState(&dsd, &m_dss); - - if(FAILED(hr)) return false; - - // empty blend state - - memset(&bsd, 0, sizeof(bsd)); - - bsd.RenderTarget[0].BlendEnable = false; - - hr = (*dev)->CreateBlendState(&bsd, &m_bs); - - if(FAILED(hr)) return false; - - // point sampler - - memset(&sd, 0, sizeof(sd)); - - sd.Filter = D3D11_FILTER_MIN_MAG_MIP_POINT; - - sd.AddressU = D3D11_TEXTURE_ADDRESS_CLAMP; - sd.AddressV = D3D11_TEXTURE_ADDRESS_CLAMP; - sd.AddressW = D3D11_TEXTURE_ADDRESS_CLAMP; - - sd.MaxLOD = FLT_MAX; - sd.MaxAnisotropy = 16; - sd.ComparisonFunc = D3D11_COMPARISON_NEVER; - - hr = (*dev)->CreateSamplerState(&sd, &m_ss); - - if(FAILED(hr)) return false; - - // link buffer - - memset(&bd, 0, sizeof(bd)); - - bd.ByteWidth = 256 << 20; // 256 MB w00t - bd.StructureByteStride = sizeof(uint32) * 4; // c, z, id, next - bd.Usage = D3D11_USAGE_DEFAULT; - bd.BindFlags = D3D11_BIND_UNORDERED_ACCESS | D3D11_BIND_SHADER_RESOURCE; - bd.MiscFlags = D3D11_RESOURCE_MISC_BUFFER_STRUCTURED; - - hr = (*dev)->CreateBuffer(&bd, NULL, &m_lb); - - { - uint32 data[] = {0, 0, 0xffffffff, 0}; - - D3D11_BOX box; - memset(&box, 0, sizeof(box)); - box.right = sizeof(data); - box.bottom = 1; - box.back = 1; - - ctx->UpdateSubresource(m_lb, 0, &box, data, 0, 0); - } - - if(FAILED(hr)) return false; - - memset(&uavd, 0, sizeof(uavd)); - - uavd.Format = DXGI_FORMAT_UNKNOWN; - uavd.Buffer.NumElements = bd.ByteWidth / bd.StructureByteStride; - uavd.Buffer.Flags = D3D11_BUFFER_UAV_FLAG_COUNTER; - uavd.ViewDimension = D3D11_UAV_DIMENSION_BUFFER; - - hr = (*dev)->CreateUnorderedAccessView(m_lb, &uavd, &m_lb_uav); - - if(FAILED(hr)) return false; - - memset(&srvd, 0, sizeof(srvd)); - - srvd.Format = DXGI_FORMAT_UNKNOWN; - srvd.Buffer.NumElements = bd.ByteWidth / bd.StructureByteStride; - srvd.ViewDimension = D3D11_SRV_DIMENSION_BUFFER; - - hr = (*dev)->CreateShaderResourceView(m_lb, &srvd, &m_lb_srv); - - if(FAILED(hr)) return false; - - // start offset buffer - - memset(&bd, 0, sizeof(bd)); - - bd.ByteWidth = sizeof(uint32) * 2048 * 2048; // index - bd.Usage = D3D11_USAGE_DEFAULT; - bd.BindFlags = D3D11_BIND_UNORDERED_ACCESS | D3D11_BIND_SHADER_RESOURCE; - bd.MiscFlags = D3D11_RESOURCE_MISC_BUFFER_ALLOW_RAW_VIEWS; - - hr = (*dev)->CreateBuffer(&bd, NULL, &m_sob); - - if(FAILED(hr)) return false; - - memset(&uavd, 0, sizeof(uavd)); - - uavd.Format = DXGI_FORMAT_R32_TYPELESS; - uavd.Buffer.NumElements = bd.ByteWidth / sizeof(uint32); - uavd.Buffer.Flags = D3D11_BUFFER_UAV_FLAG_RAW; - uavd.ViewDimension = D3D11_UAV_DIMENSION_BUFFER; - - hr = (*dev)->CreateUnorderedAccessView(m_sob, &uavd, &m_sob_uav); - - if(FAILED(hr)) return false; - - memset(&srvd, 0, sizeof(srvd)); - - srvd.Format = DXGI_FORMAT_R32_TYPELESS; - srvd.BufferEx.NumElements = bd.ByteWidth / sizeof(uint32); - srvd.BufferEx.Flags = D3D11_BUFFEREX_SRV_FLAG_RAW; - srvd.ViewDimension = D3D11_SRV_DIMENSION_BUFFEREX; - - hr = (*dev)->CreateShaderResourceView(m_sob, &srvd, &m_sob_srv); - - if(FAILED(hr)) return false; - - const uint32 tmp = 0; - - ctx->ClearUnorderedAccessViewUint(m_sob_uav, &tmp); // initial clear, next time Draw should restore it in Step 2 - - // video memory (4MB) - - memset(&bd, 0, sizeof(bd)); - - bd.ByteWidth = 4 * 1024 * 1024; - bd.Usage = D3D11_USAGE_DEFAULT; - bd.BindFlags = D3D11_BIND_UNORDERED_ACCESS; - bd.MiscFlags = D3D11_RESOURCE_MISC_BUFFER_ALLOW_RAW_VIEWS; - - hr = (*dev)->CreateBuffer(&bd, NULL, &m_vm); - - if(FAILED(hr)) return false; - - memset(&uavd, 0, sizeof(uavd)); - - uavd.Format = DXGI_FORMAT_R32_TYPELESS; - uavd.Buffer.FirstElement = 0; - uavd.Buffer.NumElements = 1024 * 1024; - uavd.Buffer.Flags = D3D11_BUFFER_UAV_FLAG_RAW; - uavd.ViewDimension = D3D11_UAV_DIMENSION_BUFFER; - - hr = (*dev)->CreateUnorderedAccessView(m_vm, &uavd, &m_vm_uav); - - if(FAILED(hr)) return false; -/* - memset(&td, 0, sizeof(td)); - - td.Width = PAGE_SIZE; - td.Height = MAX_PAGES; - td.Format = DXGI_FORMAT_R8_UINT; - td.MipLevels = 1; - td.ArraySize = 1; - td.SampleDesc.Count = 1; - td.SampleDesc.Quality = 0; - td.Usage = D3D11_USAGE_DEFAULT; - td.BindFlags = D3D11_BIND_UNORDERED_ACCESS; - - hr = (*dev)->CreateTexture2D(&td, NULL, &m_vm); - - if(FAILED(hr)) return false; - - memset(&uavd, 0, sizeof(uavd)); - - uavd.Format = DXGI_FORMAT_R8_UINT; - uavd.ViewDimension = D3D11_UAV_DIMENSION_TEXTURE2D; - - hr = (*dev)->CreateUnorderedAccessView(m_vm, &uavd, &m_vm_uav); - - if(FAILED(hr)) return false; -*/ - // one page, for copying between cpu<->gpu - - memset(&bd, 0, sizeof(bd)); - - bd.ByteWidth = PAGE_SIZE; - bd.Usage = D3D11_USAGE_STAGING; - bd.CPUAccessFlags = D3D11_CPU_ACCESS_READ | D3D11_CPU_ACCESS_WRITE; - - hr = (*dev)->CreateBuffer(&bd, NULL, &m_pb); - - if(FAILED(hr)) return false; -/* - memset(&td, 0, sizeof(td)); - - td.Width = PAGE_SIZE; - td.Height = 1; - td.Format = DXGI_FORMAT_R8_UINT; - td.MipLevels = 1; - td.ArraySize = 1; - td.SampleDesc.Count = 1; - td.SampleDesc.Quality = 0; - td.Usage = D3D11_USAGE_STAGING; - td.CPUAccessFlags = D3D11_CPU_ACCESS_READ | D3D11_CPU_ACCESS_WRITE; - - hr = (*dev)->CreateTexture2D(&td, NULL, &m_pb); - - if(FAILED(hr)) return false; -*/ - // VSConstantBuffer - - memset(&bd, 0, sizeof(bd)); - - bd.ByteWidth = sizeof(VSConstantBuffer); - bd.Usage = D3D11_USAGE_DEFAULT; - bd.BindFlags = D3D11_BIND_CONSTANT_BUFFER; - - hr = (*dev)->CreateBuffer(&bd, NULL, &m_vs_cb); - - if(FAILED(hr)) return false; - - // PS - - D3D11_SHADER_MACRO macro[] = - { - {NULL, NULL}, - }; - - hr = dev->CompileShader(IDR_CS_FX, "ps_main0", macro, &m_ps0); - - if(FAILED(hr)) return false; - - // PSConstantBuffer - - memset(&bd, 0, sizeof(bd)); - - bd.ByteWidth = sizeof(PSConstantBuffer); - bd.Usage = D3D11_USAGE_DEFAULT; - bd.BindFlags = D3D11_BIND_CONSTANT_BUFFER; - - hr = (*dev)->CreateBuffer(&bd, NULL, &m_ps_cb); - - if(FAILED(hr)) return false; - - // - - return true; -} - -void GSRendererCS::ResetDevice() -{ - for(int i = 0; i < countof(m_texture); i++) - { - delete m_texture[i]; - - m_texture[i] = NULL; - } -} - -void GSRendererCS::VSync(int field) -{ - __super::VSync(field); - - //printf("%lld\n", m_perfmon.GetFrame()); -} - -GSTexture* GSRendererCS::GetOutput(int i) -{ - // TODO: create a compute shader which unswizzles the frame from m_vm to the output texture - - const GSRegDISPFB& DISPFB = m_regs->DISP[i].DISPFB; - - int w = DISPFB.FBW * 64; - int h = GetFrameRect(i).bottom; - - // TODO: round up bottom - - if(m_dev->ResizeTexture(&m_texture[i], w, h)) - { - const GSLocalMemory::psm_t& psm = GSLocalMemory::m_psm[DISPFB.PSM]; - - GSVector4i r(0, 0, w, h); - GSVector4i r2 = r.ralign(psm.bs); - - GSOffset* o = m_mem.GetOffset(DISPFB.Block(), DISPFB.FBW, DISPFB.PSM); - - Read(o, r2, false); - - (m_mem.*psm.rtx)(o, r2, m_output, 1024 * 4, m_env.TEXA); - - m_texture[i]->Update(r, m_output, 1024 * 4); - - if(s_dump) - { - if(s_save && s_n >= s_saven) - { - m_texture[i]->Save(format("c:\\temp1\\_%05d_f%lld_fr%d_%05x_%d.bmp", s_n, m_perfmon.GetFrame(), i, (int)DISPFB.Block(), (int)DISPFB.PSM)); - } - - s_n++; - } - } - - return m_texture[i]; -} - -void GSRendererCS::Draw() -{ - GSDrawingEnvironment& env = m_env; - GSDrawingContext* context = m_context; - - GSVector2i rtsize(2048, 2048); - GSVector4i scissor = GSVector4i(context->scissor.in).rintersect(GSVector4i(rtsize).zwxy()); - GSVector4i bbox = GSVector4i(m_vt.m_min.p.floor().xyxy(m_vt.m_max.p.ceil())); - GSVector4i r = bbox.rintersect(scissor); - - uint32 fm = context->FRAME.FBMSK; - uint32 zm = context->ZBUF.ZMSK || context->TEST.ZTE == 0 ? 0xffffffff : 0; - - if(fm != 0xffffffff) - { - Write(context->offset.fb, r); - - // TODO: m_tc->InvalidateVideoMem(context->offset.fb, r, false); - } - - if(zm != 0xffffffff) - { - Write(context->offset.zb, r); - - // TODO: m_tc->InvalidateVideoMem(context->offset.zb, r, false); - } - - // TODO: if(24-bit) fm/zm |= 0xff000000; - - if(PRIM->TME) - { - m_mem.m_clut.Read32(context->TEX0, env.TEXA); - - GSVector4i r; - - GetTextureMinMax(r, context->TEX0, context->CLAMP, m_vt.IsLinear()); - - // TODO: unswizzle pages of r to a texture, check m_vm_valid, bit not set cpu->gpu, set gpu->gpu - - // TODO: Write transfer should directly write to m_vm, then Read/Write syncing won't be necessary, clut must be updated with the gpu also - - // TODO: tex = m_tc->LookupSource(context->TEX0, env.TEXA, r); - - // if(!tex) return; - } - - // - - GSDevice11* dev = (GSDevice11*)m_dev; - - ID3D11DeviceContext* ctx = *dev; - - // - - dev->BeginScene(); - - // SetupOM - - dev->OMSetDepthStencilState(m_dss, 0); - dev->OMSetBlendState(m_bs, 0); - - ID3D11UnorderedAccessView* uavs[] = {m_vm_uav, m_lb_uav, m_sob_uav}; - uint32 counters[] = {1, 0, 0}; - - dev->OMSetRenderTargets(rtsize, countof(uavs), uavs, counters, &scissor); - - // SetupIA - - D3D11_PRIMITIVE_TOPOLOGY topology; - - switch(m_vt.m_primclass) - { - case GS_POINT_CLASS: - topology = D3D11_PRIMITIVE_TOPOLOGY_POINTLIST; - break; - case GS_LINE_CLASS: - case GS_SPRITE_CLASS: - topology = D3D11_PRIMITIVE_TOPOLOGY_LINELIST; - break; - case GS_TRIANGLE_CLASS: - topology = D3D11_PRIMITIVE_TOPOLOGY_TRIANGLELIST; - break; - default: - __assume(0); - } - - GSVector4i r2 = bbox.add32(GSVector4i(-1, -1, 1, 1)).rintersect(scissor); - - m_vertex.buff[m_vertex.next + 0].XYZ.X = context->XYOFFSET.OFX + (r2.left << 4); - m_vertex.buff[m_vertex.next + 0].XYZ.Y = context->XYOFFSET.OFY + (r2.top << 4); - m_vertex.buff[m_vertex.next + 1].XYZ.X = context->XYOFFSET.OFX + (r2.right << 4); - m_vertex.buff[m_vertex.next + 1].XYZ.Y = context->XYOFFSET.OFY + (r2.bottom << 4); - - m_index.buff[m_index.tail + 0] = m_vertex.next + 0; - m_index.buff[m_index.tail + 1] = m_vertex.next + 1; - - dev->IASetVertexBuffer(m_vertex.buff, sizeof(GSVertex), m_vertex.next + 2); - dev->IASetIndexBuffer(m_index.buff, m_index.tail + 2); - - // SetupVS - - VSSelector vs_sel; - - vs_sel.tme = PRIM->TME; - vs_sel.fst = PRIM->FST; - - VSConstantBuffer vs_cb; - - float sx = 2.0f / (rtsize.x << 4); - float sy = 2.0f / (rtsize.y << 4); - //float sx = 1.0f / 16; - //float sy = 1.0f / 16; - float ox = (float)(int)context->XYOFFSET.OFX; - float oy = (float)(int)context->XYOFFSET.OFY; - - vs_cb.VertexScale = GSVector4(sx, -sy, 0.0f, 0.0f); - vs_cb.VertexOffset = GSVector4(ox * sx + 1, -(oy * sy + 1), 0.0f, -1.0f); - //vs_cb.VertexScale = GSVector4(sx, sy, 0.0f, 0.0f); - //vs_cb.VertexOffset = GSVector4(ox * sx, oy * sy, 0.0f, -1.0f); - - { - GSVertexShader11 vs; - - hash_map::const_iterator i = m_vs.find(vs_sel); - - if(i != m_vs.end()) - { - vs = i->second; - } - else - { - string str[2]; - - str[0] = format("%d", vs_sel.tme); - str[1] = format("%d", vs_sel.fst); - - D3D11_SHADER_MACRO macro[] = - { - {"VS_TME", str[0].c_str()}, - {"VS_FST", str[1].c_str()}, - {NULL, NULL}, - }; - - D3D11_INPUT_ELEMENT_DESC layout[] = - { - {"TEXCOORD", 0, DXGI_FORMAT_R32G32_FLOAT, 0, 0, D3D11_INPUT_PER_VERTEX_DATA, 0}, - {"COLOR", 0, DXGI_FORMAT_R8G8B8A8_UNORM, 0, 8, D3D11_INPUT_PER_VERTEX_DATA, 0}, - {"TEXCOORD", 1, DXGI_FORMAT_R32_FLOAT, 0, 12, D3D11_INPUT_PER_VERTEX_DATA, 0}, - {"POSITION", 0, DXGI_FORMAT_R16G16_UINT, 0, 16, D3D11_INPUT_PER_VERTEX_DATA, 0}, - {"POSITION", 1, DXGI_FORMAT_R32_UINT, 0, 20, D3D11_INPUT_PER_VERTEX_DATA, 0}, - {"TEXCOORD", 2, DXGI_FORMAT_R16G16_UINT, 0, 24, D3D11_INPUT_PER_VERTEX_DATA, 0}, - {"COLOR", 1, DXGI_FORMAT_R8G8B8A8_UNORM, 0, 28, D3D11_INPUT_PER_VERTEX_DATA, 0}, - }; - - dev->CompileShader(IDR_CS_FX, "vs_main", macro, &vs.vs, layout, countof(layout), &vs.il); - - m_vs[vs_sel] = vs; - } - - ctx->UpdateSubresource(m_vs_cb, 0, NULL, &vs_cb, 0, 0); // TODO: only update if changed - - dev->VSSetShader(vs.vs, m_vs_cb); - - dev->IASetInputLayout(vs.il); - } - - // SetupGS - - GSSelector gs_sel; - - gs_sel.iip = PRIM->IIP; - - CComPtr gs[2]; - - for(int j = 0; j < 2; j++) - { - gs_sel.prim = j == 0 ? m_vt.m_primclass : GS_SPRITE_CLASS; - - hash_map >::const_iterator i = m_gs.find(gs_sel); - - if(i != m_gs.end()) - { - gs[j] = i->second; - } - else - { - string str[2]; - - str[0] = format("%d", gs_sel.iip); - str[1] = format("%d", j == 0 ? gs_sel.prim : GS_SPRITE_CLASS); - - D3D11_SHADER_MACRO macro[] = - { - {"GS_IIP", str[0].c_str()}, - {"GS_PRIM", str[1].c_str()}, - {NULL, NULL}, - }; - - dev->CompileShader(IDR_CS_FX, "gs_main", macro, &gs[j]); - - m_gs[gs_sel] = gs[j]; - } - } - - // SetupPS - - dev->PSSetSamplerState(m_ss, NULL, NULL); - - PSSelector ps_sel; - - ps_sel.fpsm = context->FRAME.PSM; - ps_sel.zpsm = context->ZBUF.PSM; - - CComPtr ps[2] = {m_ps0, NULL}; - - hash_map >::const_iterator i = m_ps1.find(ps_sel); - - if(i != m_ps1.end()) - { - ps[1] = i->second; - } - else - { - string str[15]; - - str[0] = format("%d", PS_BATCH_SIZE); - str[1] = format("%d", context->FRAME.PSM); - str[2] = format("%d", context->ZBUF.PSM); - - D3D11_SHADER_MACRO macro[] = - { - {"PS_BATCH_SIZE", str[0].c_str()}, - {"PS_FPSM", str[1].c_str()}, - {"PS_ZPSM", str[2].c_str()}, - {NULL, NULL}, - }; - - dev->CompileShader(IDR_CS_FX, "ps_main1", macro, &ps[1]); - - m_ps1[ps_sel] = ps[1]; - } - - PSConstantBuffer ps_cb; - - ps_cb.fm = fm; - ps_cb.zm = zm; - - ctx->UpdateSubresource(m_ps_cb, 0, NULL, &ps_cb, 0, 0); // TODO: only update if changed - - OffsetBuffer* fzbo = NULL; - - GetOffsetBuffer(&fzbo); - - dev->PSSetShaderResourceView(0, fzbo->row_srv); - dev->PSSetShaderResourceView(1, fzbo->col_srv); - // TODO: palette, texture - - int step = PS_BATCH_SIZE * GSUtil::GetVertexCount(PRIM->PRIM); - - for(int i = 0; i < m_index.tail; i += step) - { - dev->IASetPrimitiveTopology(topology); - dev->GSSetShader(gs[0]); - dev->PSSetShader(ps[0], m_ps_cb); - dev->DrawIndexedPrimitive(i, std::min(m_index.tail - i, step)); - - dev->IASetPrimitiveTopology(D3D11_PRIMITIVE_TOPOLOGY_LINELIST); - dev->GSSetShader(gs[1]); - dev->PSSetShader(ps[1], m_ps_cb); - dev->DrawIndexedPrimitive(m_index.tail, 2); - - //printf("%d/%d, %d %d %d %d\n", i, m_index.tail, r2.x, r2.y, r2.z, r2.w); - } - - dev->EndScene(); - - if(0) - { - std::string s; - /* - s = format("c:\\temp1\\_%05d_f%lld_fb0_%05x_%d.bmp", s_n, m_perfmon.GetFrame(), 0, 0); - m_mem.SaveBMP(s, 0, 16, PSM_PSMCT32, 1024, 1024); - Read(m_mem.GetOffset(0, 16, PSM_PSMCT32), GSVector4i(0, 0, 1024, 1024), false); - */ - // - if(fm != 0xffffffff) Read(context->offset.fb, r, false); - // - if(zm != 0xffffffff) Read(context->offset.zb, r, false); - - s = format("c:\\temp1\\_%05d_f%lld_rt1_%05x_%d.bmp", s_n, m_perfmon.GetFrame(), m_context->FRAME.Block(), m_context->FRAME.PSM); - m_mem.SaveBMP(s, m_context->FRAME.Block(), m_context->FRAME.FBW, m_context->FRAME.PSM, GetFrameRect().width(), 512); - - s = format("c:\\temp1\\_%05d_f%lld_zt1_%05x_%d.bmp", s_n, m_perfmon.GetFrame(), m_context->ZBUF.Block(), m_context->ZBUF.PSM); - m_mem.SaveBMP(s, m_context->ZBUF.Block(), m_context->FRAME.FBW, m_context->ZBUF.PSM, GetFrameRect().width(), 512); - - /* - s = format("c:\\temp1\\_%05d_f%lld_fb1_%05x_%d.bmp", s_n, m_perfmon.GetFrame(), 0, 0); - m_mem.SaveBMP(s, 0, 16, PSM_PSMCT32, 1024, 1024); - */ - - s_n++; - } -} - -void GSRendererCS::InvalidateVideoMem(const GIFRegBITBLTBUF& BITBLTBUF, const GSVector4i& r) -{ - GSOffset* o = m_mem.GetOffset(BITBLTBUF.DBP, BITBLTBUF.DBW, BITBLTBUF.DPSM); - - Read(o, r, true); // TODO: fully overwritten pages are not needed to be read, only invalidated (important) - - // TODO: false deps, 8H/4HL/4HH texture sharing pages with 24-bit target - // TODO: invalidate texture cache -} - -void GSRendererCS::InvalidateLocalMem(const GIFRegBITBLTBUF& BITBLTBUF, const GSVector4i& r, bool clut) -{ - GSOffset* o = m_mem.GetOffset(BITBLTBUF.SBP, BITBLTBUF.SBW, BITBLTBUF.SPSM); - - Read(o, r, false); -} - -void GSRendererCS::Write(GSOffset* o, const GSVector4i& r) -{ - GSDevice11* dev = (GSDevice11*)m_dev; - - ID3D11DeviceContext* ctx = *dev; - - D3D11_BOX box; - - memset(&box, 0, sizeof(box)); - - box.right = 1; - box.bottom = 1; - box.back = 1; - - uint32* pages = o->GetPages(r); - - for(size_t i = 0; pages[i] != GSOffset::EOP; i++) - { - uint32 page = pages[i]; - - uint32 row = page >> 5; - uint32 col = 1 << (page & 31); - - if((m_vm_valid[row] & col) == 0) - { - m_vm_valid[row] |= col; - - box.left = page * PAGE_SIZE; - box.right = (page + 1) * PAGE_SIZE; - - ctx->UpdateSubresource(m_vm, 0, &box, m_mem.m_vm8 + page * PAGE_SIZE, 0, 0); -/* - // m_vm texture row is 2k in bytes, one page is 8k => starting row: addr / 4k, number of rows: 8k / 2k = 4 - - box.left = 0; - box.right = PAGE_SIZE; - box.top = page; - box.bottom = box.top + 1; - - ctx->UpdateSubresource(m_vm, 0, &box, m_mem.m_vm8 + page * PAGE_SIZE, 0, 0); -*/ - if(0) - printf("[%lld] write %05x %d %d (%d)\n", __rdtsc(), o->bp, o->bw, o->psm, page); - } - } - - delete [] pages; -} - -void GSRendererCS::Read(GSOffset* o, const GSVector4i& r, bool invalidate) -{ - GSDevice11* dev = (GSDevice11*)m_dev; - - ID3D11DeviceContext* ctx = *dev; - - D3D11_BOX box; - - memset(&box, 0, sizeof(box)); - - box.right = 1; - box.bottom = 1; - box.back = 1; - - uint32* pages = o->GetPages(r); - - for(size_t i = 0; pages[i] != GSOffset::EOP; i++) - { - uint32 page = pages[i]; - - uint32 row = page >> 5; - uint32 col = 1 << (page & 31); - - if(m_vm_valid[row] & col) - { - if(invalidate) - { - m_vm_valid[row] ^= col; - } - - box.left = page * PAGE_SIZE; - box.right = (page + 1) * PAGE_SIZE; - - ctx->CopySubresourceRegion(m_pb, 0, 0, 0, 0, m_vm, 0, &box); -/* - // m_vm texture row is 2k in bytes, one page is 8k => starting row: addr / 4k, number of rows: 8k / 2k = 4 - - box.left = 0; - box.right = PAGE_SIZE; - box.top = page; - box.bottom = box.top + 1; - - ctx->CopySubresourceRegion(m_pb, 0, 0, 0, 0, m_vm, 0, &box); -*/ - D3D11_MAPPED_SUBRESOURCE map; - - if(SUCCEEDED(ctx->Map(m_pb, 0, D3D11_MAP_READ, 0, &map))) - { - memcpy(m_mem.m_vm8 + page * PAGE_SIZE, map.pData, PAGE_SIZE); - - ctx->Unmap(m_pb, 0); - - if(0) - printf("[%lld] read %05x %d %d (%d)\n", __rdtsc(), o->bp, o->bw, o->psm, page); - } - } - } - - delete [] pages; -} - -bool GSRendererCS::GetOffsetBuffer(OffsetBuffer** fzbo) -{ - HRESULT hr; - - GSDevice11* dev = (GSDevice11*)m_dev; - - D3D11_BUFFER_DESC bd; - D3D11_SHADER_RESOURCE_VIEW_DESC srvd; - D3D11_SUBRESOURCE_DATA data; - - hash_map::iterator i = m_offset.find(m_context->offset.fzb->hash); - - if(i == m_offset.end()) - { - OffsetBuffer ob; - - memset(&bd, 0, sizeof(bd)); - - bd.ByteWidth = sizeof(GSVector2i) * 2048; - bd.Usage = D3D11_USAGE_IMMUTABLE; - bd.BindFlags = D3D11_BIND_SHADER_RESOURCE; - - memset(&data, 0, sizeof(data)); - - data.pSysMem = m_context->offset.fzb->row; - - hr = (*dev)->CreateBuffer(&bd, &data, &ob.row); - - if(FAILED(hr)) return false; - - data.pSysMem = m_context->offset.fzb->col; - - hr = (*dev)->CreateBuffer(&bd, &data, &ob.col); - - if(FAILED(hr)) return false; - - memset(&srvd, 0, sizeof(srvd)); - - srvd.Format = DXGI_FORMAT_R32G32_SINT; - srvd.ViewDimension = D3D11_SRV_DIMENSION_BUFFER; - srvd.Buffer.FirstElement = 0; - srvd.Buffer.NumElements = 2048; - - hr = (*dev)->CreateShaderResourceView(ob.row, &srvd, &ob.row_srv); - - if(FAILED(hr)) return false; - - hr = (*dev)->CreateShaderResourceView(ob.col, &srvd, &ob.col_srv); - - if(FAILED(hr)) return false; - - m_offset[m_context->offset.fzb->hash] = ob; - - i = m_offset.find(m_context->offset.fzb->hash); - } - - *fzbo = &i->second; - - return true; -} +/* + * Copyright (C) 2007-2009 Gabest + * http://www.gabest.org + * + * This Program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This Program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU Make; see the file COPYING. If not, write to + * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + * http://www.gnu.org/copyleft/gpl.html + * + */ + +#include "stdafx.h" +#include "GSRendererCS.h" + +#define PS_BATCH_SIZE 512 + +GSRendererCS::GSRendererCS() + : GSRenderer() +{ + m_nativeres = true; + + memset(m_vm_valid, 0, sizeof(m_vm_valid)); + + memset(m_texture, 0, sizeof(m_texture)); + + m_output = (uint8*)_aligned_malloc(1024 * 1024 * sizeof(uint32), 32); +} + +GSRendererCS::~GSRendererCS() +{ + for(int i = 0; i < countof(m_texture); i++) + { + delete m_texture[i]; + } + + _aligned_free(m_output); +} + +bool GSRendererCS::CreateDevice(GSDevice* dev_unk) +{ + if(!__super::CreateDevice(dev_unk)) + return false; + + HRESULT hr; + + D3D11_DEPTH_STENCIL_DESC dsd; + D3D11_BLEND_DESC bsd; + D3D11_SAMPLER_DESC sd; + D3D11_BUFFER_DESC bd; + D3D11_TEXTURE2D_DESC td; + D3D11_UNORDERED_ACCESS_VIEW_DESC uavd; + D3D11_SHADER_RESOURCE_VIEW_DESC srvd; + + D3D_FEATURE_LEVEL level; + + ((GSDeviceDX*)dev_unk)->GetFeatureLevel(level); + + if(level < D3D_FEATURE_LEVEL_11_0) + return false; + + GSDevice11* dev = (GSDevice11*)dev_unk; + + ID3D11DeviceContext* ctx = *dev; + + // empty depth stencil state + + memset(&dsd, 0, sizeof(dsd)); + + dsd.StencilEnable = false; + dsd.DepthEnable = false; + + hr = (*dev)->CreateDepthStencilState(&dsd, &m_dss); + + if(FAILED(hr)) return false; + + // empty blend state + + memset(&bsd, 0, sizeof(bsd)); + + bsd.RenderTarget[0].BlendEnable = false; + + hr = (*dev)->CreateBlendState(&bsd, &m_bs); + + if(FAILED(hr)) return false; + + // point sampler + + memset(&sd, 0, sizeof(sd)); + + sd.Filter = D3D11_FILTER_MIN_MAG_MIP_POINT; + + sd.AddressU = D3D11_TEXTURE_ADDRESS_CLAMP; + sd.AddressV = D3D11_TEXTURE_ADDRESS_CLAMP; + sd.AddressW = D3D11_TEXTURE_ADDRESS_CLAMP; + + sd.MaxLOD = FLT_MAX; + sd.MaxAnisotropy = 16; + sd.ComparisonFunc = D3D11_COMPARISON_NEVER; + + hr = (*dev)->CreateSamplerState(&sd, &m_ss); + + if(FAILED(hr)) return false; + + // link buffer + + memset(&bd, 0, sizeof(bd)); + + bd.ByteWidth = 256 << 20; // 256 MB w00t + bd.StructureByteStride = sizeof(uint32) * 4; // c, z, id, next + bd.Usage = D3D11_USAGE_DEFAULT; + bd.BindFlags = D3D11_BIND_UNORDERED_ACCESS | D3D11_BIND_SHADER_RESOURCE; + bd.MiscFlags = D3D11_RESOURCE_MISC_BUFFER_STRUCTURED; + + hr = (*dev)->CreateBuffer(&bd, NULL, &m_lb); + + { + uint32 data[] = {0, 0, 0xffffffff, 0}; + + D3D11_BOX box; + memset(&box, 0, sizeof(box)); + box.right = sizeof(data); + box.bottom = 1; + box.back = 1; + + ctx->UpdateSubresource(m_lb, 0, &box, data, 0, 0); + } + + if(FAILED(hr)) return false; + + memset(&uavd, 0, sizeof(uavd)); + + uavd.Format = DXGI_FORMAT_UNKNOWN; + uavd.Buffer.NumElements = bd.ByteWidth / bd.StructureByteStride; + uavd.Buffer.Flags = D3D11_BUFFER_UAV_FLAG_COUNTER; + uavd.ViewDimension = D3D11_UAV_DIMENSION_BUFFER; + + hr = (*dev)->CreateUnorderedAccessView(m_lb, &uavd, &m_lb_uav); + + if(FAILED(hr)) return false; + + memset(&srvd, 0, sizeof(srvd)); + + srvd.Format = DXGI_FORMAT_UNKNOWN; + srvd.Buffer.NumElements = bd.ByteWidth / bd.StructureByteStride; + srvd.ViewDimension = D3D11_SRV_DIMENSION_BUFFER; + + hr = (*dev)->CreateShaderResourceView(m_lb, &srvd, &m_lb_srv); + + if(FAILED(hr)) return false; + + // start offset buffer + + memset(&bd, 0, sizeof(bd)); + + bd.ByteWidth = sizeof(uint32) * 2048 * 2048; // index + bd.Usage = D3D11_USAGE_DEFAULT; + bd.BindFlags = D3D11_BIND_UNORDERED_ACCESS | D3D11_BIND_SHADER_RESOURCE; + bd.MiscFlags = D3D11_RESOURCE_MISC_BUFFER_ALLOW_RAW_VIEWS; + + hr = (*dev)->CreateBuffer(&bd, NULL, &m_sob); + + if(FAILED(hr)) return false; + + memset(&uavd, 0, sizeof(uavd)); + + uavd.Format = DXGI_FORMAT_R32_TYPELESS; + uavd.Buffer.NumElements = bd.ByteWidth / sizeof(uint32); + uavd.Buffer.Flags = D3D11_BUFFER_UAV_FLAG_RAW; + uavd.ViewDimension = D3D11_UAV_DIMENSION_BUFFER; + + hr = (*dev)->CreateUnorderedAccessView(m_sob, &uavd, &m_sob_uav); + + if(FAILED(hr)) return false; + + memset(&srvd, 0, sizeof(srvd)); + + srvd.Format = DXGI_FORMAT_R32_TYPELESS; + srvd.BufferEx.NumElements = bd.ByteWidth / sizeof(uint32); + srvd.BufferEx.Flags = D3D11_BUFFEREX_SRV_FLAG_RAW; + srvd.ViewDimension = D3D11_SRV_DIMENSION_BUFFEREX; + + hr = (*dev)->CreateShaderResourceView(m_sob, &srvd, &m_sob_srv); + + if(FAILED(hr)) return false; + + const uint32 tmp = 0; + + ctx->ClearUnorderedAccessViewUint(m_sob_uav, &tmp); // initial clear, next time Draw should restore it in Step 2 + + // video memory (4MB) + + memset(&bd, 0, sizeof(bd)); + + bd.ByteWidth = 4 * 1024 * 1024; + bd.Usage = D3D11_USAGE_DEFAULT; + bd.BindFlags = D3D11_BIND_UNORDERED_ACCESS; + bd.MiscFlags = D3D11_RESOURCE_MISC_BUFFER_ALLOW_RAW_VIEWS; + + hr = (*dev)->CreateBuffer(&bd, NULL, &m_vm); + + if(FAILED(hr)) return false; + + memset(&uavd, 0, sizeof(uavd)); + + uavd.Format = DXGI_FORMAT_R32_TYPELESS; + uavd.Buffer.FirstElement = 0; + uavd.Buffer.NumElements = 1024 * 1024; + uavd.Buffer.Flags = D3D11_BUFFER_UAV_FLAG_RAW; + uavd.ViewDimension = D3D11_UAV_DIMENSION_BUFFER; + + hr = (*dev)->CreateUnorderedAccessView(m_vm, &uavd, &m_vm_uav); + + if(FAILED(hr)) return false; +/* + memset(&td, 0, sizeof(td)); + + td.Width = PAGE_SIZE; + td.Height = MAX_PAGES; + td.Format = DXGI_FORMAT_R8_UINT; + td.MipLevels = 1; + td.ArraySize = 1; + td.SampleDesc.Count = 1; + td.SampleDesc.Quality = 0; + td.Usage = D3D11_USAGE_DEFAULT; + td.BindFlags = D3D11_BIND_UNORDERED_ACCESS; + + hr = (*dev)->CreateTexture2D(&td, NULL, &m_vm); + + if(FAILED(hr)) return false; + + memset(&uavd, 0, sizeof(uavd)); + + uavd.Format = DXGI_FORMAT_R8_UINT; + uavd.ViewDimension = D3D11_UAV_DIMENSION_TEXTURE2D; + + hr = (*dev)->CreateUnorderedAccessView(m_vm, &uavd, &m_vm_uav); + + if(FAILED(hr)) return false; +*/ + // one page, for copying between cpu<->gpu + + memset(&bd, 0, sizeof(bd)); + + bd.ByteWidth = PAGE_SIZE; + bd.Usage = D3D11_USAGE_STAGING; + bd.CPUAccessFlags = D3D11_CPU_ACCESS_READ | D3D11_CPU_ACCESS_WRITE; + + hr = (*dev)->CreateBuffer(&bd, NULL, &m_pb); + + if(FAILED(hr)) return false; +/* + memset(&td, 0, sizeof(td)); + + td.Width = PAGE_SIZE; + td.Height = 1; + td.Format = DXGI_FORMAT_R8_UINT; + td.MipLevels = 1; + td.ArraySize = 1; + td.SampleDesc.Count = 1; + td.SampleDesc.Quality = 0; + td.Usage = D3D11_USAGE_STAGING; + td.CPUAccessFlags = D3D11_CPU_ACCESS_READ | D3D11_CPU_ACCESS_WRITE; + + hr = (*dev)->CreateTexture2D(&td, NULL, &m_pb); + + if(FAILED(hr)) return false; +*/ + // VSConstantBuffer + + memset(&bd, 0, sizeof(bd)); + + bd.ByteWidth = sizeof(VSConstantBuffer); + bd.Usage = D3D11_USAGE_DEFAULT; + bd.BindFlags = D3D11_BIND_CONSTANT_BUFFER; + + hr = (*dev)->CreateBuffer(&bd, NULL, &m_vs_cb); + + if(FAILED(hr)) return false; + + // PS + + D3D11_SHADER_MACRO macro[] = + { + {NULL, NULL}, + }; + + hr = dev->CompileShader(IDR_CS_FX, "ps_main0", macro, &m_ps0); + + if(FAILED(hr)) return false; + + // PSConstantBuffer + + memset(&bd, 0, sizeof(bd)); + + bd.ByteWidth = sizeof(PSConstantBuffer); + bd.Usage = D3D11_USAGE_DEFAULT; + bd.BindFlags = D3D11_BIND_CONSTANT_BUFFER; + + hr = (*dev)->CreateBuffer(&bd, NULL, &m_ps_cb); + + if(FAILED(hr)) return false; + + // + + return true; +} + +void GSRendererCS::ResetDevice() +{ + for(int i = 0; i < countof(m_texture); i++) + { + delete m_texture[i]; + + m_texture[i] = NULL; + } +} + +void GSRendererCS::VSync(int field) +{ + __super::VSync(field); + + //printf("%lld\n", m_perfmon.GetFrame()); +} + +GSTexture* GSRendererCS::GetOutput(int i) +{ + // TODO: create a compute shader which unswizzles the frame from m_vm to the output texture + + const GSRegDISPFB& DISPFB = m_regs->DISP[i].DISPFB; + + int w = DISPFB.FBW * 64; + int h = GetFrameRect(i).bottom; + + // TODO: round up bottom + + if(m_dev->ResizeTexture(&m_texture[i], w, h)) + { + const GSLocalMemory::psm_t& psm = GSLocalMemory::m_psm[DISPFB.PSM]; + + GSVector4i r(0, 0, w, h); + GSVector4i r2 = r.ralign(psm.bs); + + GSOffset* o = m_mem.GetOffset(DISPFB.Block(), DISPFB.FBW, DISPFB.PSM); + + Read(o, r2, false); + + (m_mem.*psm.rtx)(o, r2, m_output, 1024 * 4, m_env.TEXA); + + m_texture[i]->Update(r, m_output, 1024 * 4); + + if(s_dump) + { + if(s_save && s_n >= s_saven) + { + m_texture[i]->Save(format("c:\\temp1\\_%05d_f%lld_fr%d_%05x_%d.bmp", s_n, m_perfmon.GetFrame(), i, (int)DISPFB.Block(), (int)DISPFB.PSM)); + } + + s_n++; + } + } + + return m_texture[i]; +} + +void GSRendererCS::Draw() +{ + GSDrawingEnvironment& env = m_env; + GSDrawingContext* context = m_context; + + GSVector2i rtsize(2048, 2048); + GSVector4i scissor = GSVector4i(context->scissor.in).rintersect(GSVector4i(rtsize).zwxy()); + GSVector4i bbox = GSVector4i(m_vt.m_min.p.floor().xyxy(m_vt.m_max.p.ceil())); + GSVector4i r = bbox.rintersect(scissor); + + uint32 fm = context->FRAME.FBMSK; + uint32 zm = context->ZBUF.ZMSK || context->TEST.ZTE == 0 ? 0xffffffff : 0; + + if(fm != 0xffffffff) + { + Write(context->offset.fb, r); + + // TODO: m_tc->InvalidateVideoMem(context->offset.fb, r, false); + } + + if(zm != 0xffffffff) + { + Write(context->offset.zb, r); + + // TODO: m_tc->InvalidateVideoMem(context->offset.zb, r, false); + } + + // TODO: if(24-bit) fm/zm |= 0xff000000; + + if(PRIM->TME) + { + m_mem.m_clut.Read32(context->TEX0, env.TEXA); + + GSVector4i r; + + GetTextureMinMax(r, context->TEX0, context->CLAMP, m_vt.IsLinear()); + + // TODO: unswizzle pages of r to a texture, check m_vm_valid, bit not set cpu->gpu, set gpu->gpu + + // TODO: Write transfer should directly write to m_vm, then Read/Write syncing won't be necessary, clut must be updated with the gpu also + + // TODO: tex = m_tc->LookupSource(context->TEX0, env.TEXA, r); + + // if(!tex) return; + } + + // + + GSDevice11* dev = (GSDevice11*)m_dev; + + ID3D11DeviceContext* ctx = *dev; + + // + + dev->BeginScene(); + + // SetupOM + + dev->OMSetDepthStencilState(m_dss, 0); + dev->OMSetBlendState(m_bs, 0); + + ID3D11UnorderedAccessView* uavs[] = {m_vm_uav, m_lb_uav, m_sob_uav}; + uint32 counters[] = {1, 0, 0}; + + dev->OMSetRenderTargets(rtsize, countof(uavs), uavs, counters, &scissor); + + // SetupIA + + D3D11_PRIMITIVE_TOPOLOGY topology; + + switch(m_vt.m_primclass) + { + case GS_POINT_CLASS: + topology = D3D11_PRIMITIVE_TOPOLOGY_POINTLIST; + break; + case GS_LINE_CLASS: + case GS_SPRITE_CLASS: + topology = D3D11_PRIMITIVE_TOPOLOGY_LINELIST; + break; + case GS_TRIANGLE_CLASS: + topology = D3D11_PRIMITIVE_TOPOLOGY_TRIANGLELIST; + break; + default: + __assume(0); + } + + GSVector4i r2 = bbox.add32(GSVector4i(-1, -1, 1, 1)).rintersect(scissor); + + m_vertex.buff[m_vertex.next + 0].XYZ.X = context->XYOFFSET.OFX + (r2.left << 4); + m_vertex.buff[m_vertex.next + 0].XYZ.Y = context->XYOFFSET.OFY + (r2.top << 4); + m_vertex.buff[m_vertex.next + 1].XYZ.X = context->XYOFFSET.OFX + (r2.right << 4); + m_vertex.buff[m_vertex.next + 1].XYZ.Y = context->XYOFFSET.OFY + (r2.bottom << 4); + + m_index.buff[m_index.tail + 0] = m_vertex.next + 0; + m_index.buff[m_index.tail + 1] = m_vertex.next + 1; + + dev->IASetVertexBuffer(m_vertex.buff, sizeof(GSVertex), m_vertex.next + 2); + dev->IASetIndexBuffer(m_index.buff, m_index.tail + 2); + + // SetupVS + + VSSelector vs_sel; + + vs_sel.tme = PRIM->TME; + vs_sel.fst = PRIM->FST; + + VSConstantBuffer vs_cb; + + float sx = 2.0f / (rtsize.x << 4); + float sy = 2.0f / (rtsize.y << 4); + //float sx = 1.0f / 16; + //float sy = 1.0f / 16; + float ox = (float)(int)context->XYOFFSET.OFX; + float oy = (float)(int)context->XYOFFSET.OFY; + + vs_cb.VertexScale = GSVector4(sx, -sy, 0.0f, 0.0f); + vs_cb.VertexOffset = GSVector4(ox * sx + 1, -(oy * sy + 1), 0.0f, -1.0f); + //vs_cb.VertexScale = GSVector4(sx, sy, 0.0f, 0.0f); + //vs_cb.VertexOffset = GSVector4(ox * sx, oy * sy, 0.0f, -1.0f); + + { + GSVertexShader11 vs; + + hash_map::const_iterator i = m_vs.find(vs_sel); + + if(i != m_vs.end()) + { + vs = i->second; + } + else + { + string str[2]; + + str[0] = format("%d", vs_sel.tme); + str[1] = format("%d", vs_sel.fst); + + D3D11_SHADER_MACRO macro[] = + { + {"VS_TME", str[0].c_str()}, + {"VS_FST", str[1].c_str()}, + {NULL, NULL}, + }; + + D3D11_INPUT_ELEMENT_DESC layout[] = + { + {"TEXCOORD", 0, DXGI_FORMAT_R32G32_FLOAT, 0, 0, D3D11_INPUT_PER_VERTEX_DATA, 0}, + {"COLOR", 0, DXGI_FORMAT_R8G8B8A8_UNORM, 0, 8, D3D11_INPUT_PER_VERTEX_DATA, 0}, + {"TEXCOORD", 1, DXGI_FORMAT_R32_FLOAT, 0, 12, D3D11_INPUT_PER_VERTEX_DATA, 0}, + {"POSITION", 0, DXGI_FORMAT_R16G16_UINT, 0, 16, D3D11_INPUT_PER_VERTEX_DATA, 0}, + {"POSITION", 1, DXGI_FORMAT_R32_UINT, 0, 20, D3D11_INPUT_PER_VERTEX_DATA, 0}, + {"TEXCOORD", 2, DXGI_FORMAT_R16G16_UINT, 0, 24, D3D11_INPUT_PER_VERTEX_DATA, 0}, + {"COLOR", 1, DXGI_FORMAT_R8G8B8A8_UNORM, 0, 28, D3D11_INPUT_PER_VERTEX_DATA, 0}, + }; + + dev->CompileShader(IDR_CS_FX, "vs_main", macro, &vs.vs, layout, countof(layout), &vs.il); + + m_vs[vs_sel] = vs; + } + + ctx->UpdateSubresource(m_vs_cb, 0, NULL, &vs_cb, 0, 0); // TODO: only update if changed + + dev->VSSetShader(vs.vs, m_vs_cb); + + dev->IASetInputLayout(vs.il); + } + + // SetupGS + + GSSelector gs_sel; + + gs_sel.iip = PRIM->IIP; + + CComPtr gs[2]; + + for(int j = 0; j < 2; j++) + { + gs_sel.prim = j == 0 ? m_vt.m_primclass : GS_SPRITE_CLASS; + + hash_map >::const_iterator i = m_gs.find(gs_sel); + + if(i != m_gs.end()) + { + gs[j] = i->second; + } + else + { + string str[2]; + + str[0] = format("%d", gs_sel.iip); + str[1] = format("%d", j == 0 ? gs_sel.prim : GS_SPRITE_CLASS); + + D3D11_SHADER_MACRO macro[] = + { + {"GS_IIP", str[0].c_str()}, + {"GS_PRIM", str[1].c_str()}, + {NULL, NULL}, + }; + + dev->CompileShader(IDR_CS_FX, "gs_main", macro, &gs[j]); + + m_gs[gs_sel] = gs[j]; + } + } + + // SetupPS + + dev->PSSetSamplerState(m_ss, NULL, NULL); + + PSSelector ps_sel; + + ps_sel.fpsm = context->FRAME.PSM; + ps_sel.zpsm = context->ZBUF.PSM; + + CComPtr ps[2] = {m_ps0, NULL}; + + hash_map >::const_iterator i = m_ps1.find(ps_sel); + + if(i != m_ps1.end()) + { + ps[1] = i->second; + } + else + { + string str[15]; + + str[0] = format("%d", PS_BATCH_SIZE); + str[1] = format("%d", context->FRAME.PSM); + str[2] = format("%d", context->ZBUF.PSM); + + D3D11_SHADER_MACRO macro[] = + { + {"PS_BATCH_SIZE", str[0].c_str()}, + {"PS_FPSM", str[1].c_str()}, + {"PS_ZPSM", str[2].c_str()}, + {NULL, NULL}, + }; + + dev->CompileShader(IDR_CS_FX, "ps_main1", macro, &ps[1]); + + m_ps1[ps_sel] = ps[1]; + } + + PSConstantBuffer ps_cb; + + ps_cb.fm = fm; + ps_cb.zm = zm; + + ctx->UpdateSubresource(m_ps_cb, 0, NULL, &ps_cb, 0, 0); // TODO: only update if changed + + OffsetBuffer* fzbo = NULL; + + GetOffsetBuffer(&fzbo); + + dev->PSSetShaderResourceView(0, fzbo->row_srv); + dev->PSSetShaderResourceView(1, fzbo->col_srv); + // TODO: palette, texture + + int step = PS_BATCH_SIZE * GSUtil::GetVertexCount(PRIM->PRIM); + + for(int i = 0; i < m_index.tail; i += step) + { + dev->IASetPrimitiveTopology(topology); + dev->GSSetShader(gs[0]); + dev->PSSetShader(ps[0], m_ps_cb); + dev->DrawIndexedPrimitive(i, std::min(m_index.tail - i, step)); + + dev->IASetPrimitiveTopology(D3D11_PRIMITIVE_TOPOLOGY_LINELIST); + dev->GSSetShader(gs[1]); + dev->PSSetShader(ps[1], m_ps_cb); + dev->DrawIndexedPrimitive(m_index.tail, 2); + + //printf("%d/%d, %d %d %d %d\n", i, m_index.tail, r2.x, r2.y, r2.z, r2.w); + } + + dev->EndScene(); + + if(0) + { + std::string s; + /* + s = format("c:\\temp1\\_%05d_f%lld_fb0_%05x_%d.bmp", s_n, m_perfmon.GetFrame(), 0, 0); + m_mem.SaveBMP(s, 0, 16, PSM_PSMCT32, 1024, 1024); + Read(m_mem.GetOffset(0, 16, PSM_PSMCT32), GSVector4i(0, 0, 1024, 1024), false); + */ + // + if(fm != 0xffffffff) Read(context->offset.fb, r, false); + // + if(zm != 0xffffffff) Read(context->offset.zb, r, false); + + s = format("c:\\temp1\\_%05d_f%lld_rt1_%05x_%d.bmp", s_n, m_perfmon.GetFrame(), m_context->FRAME.Block(), m_context->FRAME.PSM); + m_mem.SaveBMP(s, m_context->FRAME.Block(), m_context->FRAME.FBW, m_context->FRAME.PSM, GetFrameRect().width(), 512); + + s = format("c:\\temp1\\_%05d_f%lld_zt1_%05x_%d.bmp", s_n, m_perfmon.GetFrame(), m_context->ZBUF.Block(), m_context->ZBUF.PSM); + m_mem.SaveBMP(s, m_context->ZBUF.Block(), m_context->FRAME.FBW, m_context->ZBUF.PSM, GetFrameRect().width(), 512); + + /* + s = format("c:\\temp1\\_%05d_f%lld_fb1_%05x_%d.bmp", s_n, m_perfmon.GetFrame(), 0, 0); + m_mem.SaveBMP(s, 0, 16, PSM_PSMCT32, 1024, 1024); + */ + + s_n++; + } +} + +void GSRendererCS::InvalidateVideoMem(const GIFRegBITBLTBUF& BITBLTBUF, const GSVector4i& r) +{ + GSOffset* o = m_mem.GetOffset(BITBLTBUF.DBP, BITBLTBUF.DBW, BITBLTBUF.DPSM); + + Read(o, r, true); // TODO: fully overwritten pages are not needed to be read, only invalidated (important) + + // TODO: false deps, 8H/4HL/4HH texture sharing pages with 24-bit target + // TODO: invalidate texture cache +} + +void GSRendererCS::InvalidateLocalMem(const GIFRegBITBLTBUF& BITBLTBUF, const GSVector4i& r, bool clut) +{ + GSOffset* o = m_mem.GetOffset(BITBLTBUF.SBP, BITBLTBUF.SBW, BITBLTBUF.SPSM); + + Read(o, r, false); +} + +void GSRendererCS::Write(GSOffset* o, const GSVector4i& r) +{ + GSDevice11* dev = (GSDevice11*)m_dev; + + ID3D11DeviceContext* ctx = *dev; + + D3D11_BOX box; + + memset(&box, 0, sizeof(box)); + + box.right = 1; + box.bottom = 1; + box.back = 1; + + uint32* pages = o->GetPages(r); + + for(size_t i = 0; pages[i] != GSOffset::EOP; i++) + { + uint32 page = pages[i]; + + uint32 row = page >> 5; + uint32 col = 1 << (page & 31); + + if((m_vm_valid[row] & col) == 0) + { + m_vm_valid[row] |= col; + + box.left = page * PAGE_SIZE; + box.right = (page + 1) * PAGE_SIZE; + + ctx->UpdateSubresource(m_vm, 0, &box, m_mem.m_vm8 + page * PAGE_SIZE, 0, 0); +/* + // m_vm texture row is 2k in bytes, one page is 8k => starting row: addr / 4k, number of rows: 8k / 2k = 4 + + box.left = 0; + box.right = PAGE_SIZE; + box.top = page; + box.bottom = box.top + 1; + + ctx->UpdateSubresource(m_vm, 0, &box, m_mem.m_vm8 + page * PAGE_SIZE, 0, 0); +*/ + if(0) + printf("[%lld] write %05x %d %d (%d)\n", __rdtsc(), o->bp, o->bw, o->psm, page); + } + } + + delete [] pages; +} + +void GSRendererCS::Read(GSOffset* o, const GSVector4i& r, bool invalidate) +{ + GSDevice11* dev = (GSDevice11*)m_dev; + + ID3D11DeviceContext* ctx = *dev; + + D3D11_BOX box; + + memset(&box, 0, sizeof(box)); + + box.right = 1; + box.bottom = 1; + box.back = 1; + + uint32* pages = o->GetPages(r); + + for(size_t i = 0; pages[i] != GSOffset::EOP; i++) + { + uint32 page = pages[i]; + + uint32 row = page >> 5; + uint32 col = 1 << (page & 31); + + if(m_vm_valid[row] & col) + { + if(invalidate) + { + m_vm_valid[row] ^= col; + } + + box.left = page * PAGE_SIZE; + box.right = (page + 1) * PAGE_SIZE; + + ctx->CopySubresourceRegion(m_pb, 0, 0, 0, 0, m_vm, 0, &box); +/* + // m_vm texture row is 2k in bytes, one page is 8k => starting row: addr / 4k, number of rows: 8k / 2k = 4 + + box.left = 0; + box.right = PAGE_SIZE; + box.top = page; + box.bottom = box.top + 1; + + ctx->CopySubresourceRegion(m_pb, 0, 0, 0, 0, m_vm, 0, &box); +*/ + D3D11_MAPPED_SUBRESOURCE map; + + if(SUCCEEDED(ctx->Map(m_pb, 0, D3D11_MAP_READ, 0, &map))) + { + memcpy(m_mem.m_vm8 + page * PAGE_SIZE, map.pData, PAGE_SIZE); + + ctx->Unmap(m_pb, 0); + + if(0) + printf("[%lld] read %05x %d %d (%d)\n", __rdtsc(), o->bp, o->bw, o->psm, page); + } + } + } + + delete [] pages; +} + +bool GSRendererCS::GetOffsetBuffer(OffsetBuffer** fzbo) +{ + HRESULT hr; + + GSDevice11* dev = (GSDevice11*)m_dev; + + D3D11_BUFFER_DESC bd; + D3D11_SHADER_RESOURCE_VIEW_DESC srvd; + D3D11_SUBRESOURCE_DATA data; + + hash_map::iterator i = m_offset.find(m_context->offset.fzb->hash); + + if(i == m_offset.end()) + { + OffsetBuffer ob; + + memset(&bd, 0, sizeof(bd)); + + bd.ByteWidth = sizeof(GSVector2i) * 2048; + bd.Usage = D3D11_USAGE_IMMUTABLE; + bd.BindFlags = D3D11_BIND_SHADER_RESOURCE; + + memset(&data, 0, sizeof(data)); + + data.pSysMem = m_context->offset.fzb->row; + + hr = (*dev)->CreateBuffer(&bd, &data, &ob.row); + + if(FAILED(hr)) return false; + + data.pSysMem = m_context->offset.fzb->col; + + hr = (*dev)->CreateBuffer(&bd, &data, &ob.col); + + if(FAILED(hr)) return false; + + memset(&srvd, 0, sizeof(srvd)); + + srvd.Format = DXGI_FORMAT_R32G32_SINT; + srvd.ViewDimension = D3D11_SRV_DIMENSION_BUFFER; + srvd.Buffer.FirstElement = 0; + srvd.Buffer.NumElements = 2048; + + hr = (*dev)->CreateShaderResourceView(ob.row, &srvd, &ob.row_srv); + + if(FAILED(hr)) return false; + + hr = (*dev)->CreateShaderResourceView(ob.col, &srvd, &ob.col_srv); + + if(FAILED(hr)) return false; + + m_offset[m_context->offset.fzb->hash] = ob; + + i = m_offset.find(m_context->offset.fzb->hash); + } + + *fzbo = &i->second; + + return true; +} diff --git a/plugins/GSdx/GSRendererCS.h b/plugins/GSdx/GSRendererCS.h index 8a08e04427..69b25d7959 100644 --- a/plugins/GSdx/GSRendererCS.h +++ b/plugins/GSdx/GSRendererCS.h @@ -1,145 +1,145 @@ -/* - * Copyright (C) 2007-2009 Gabest - * http://www.gabest.org - * - * This Program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2, or (at your option) - * any later version. - * - * This Program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with GNU Make; see the file COPYING. If not, write to - * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. - * http://www.gnu.org/copyleft/gpl.html - * - */ - -#pragma once - -#include "GSRenderer.h" -#include "GSDevice11.h" - -class GSRendererCS : public GSRenderer -{ - struct VSSelector - { - union - { - struct - { - uint32 tme:1; - uint32 fst:1; - }; - - uint32 key; - }; - - operator uint32() {return key & 0x3;} - - VSSelector() : key(0) {} - }; - - __aligned(struct, 32) VSConstantBuffer - { - GSVector4 VertexScale; - GSVector4 VertexOffset; - }; - - struct GSSelector - { - union - { - struct - { - uint32 iip:1; - uint32 prim:2; - }; - - uint32 key; - }; - - operator uint32() {return key & 0x7;} - - GSSelector() : key(0) {} - }; - - struct PSSelector - { - union - { - struct - { - uint32 fpsm:6; - uint32 zpsm:6; - }; - - uint32 key; - }; - - operator uint32() {return key & 0x3ff;} - - PSSelector() : key(0) {} - }; - - __aligned(struct, 32) PSConstantBuffer - { - uint32 fm; - uint32 zm; - }; - - CComPtr m_dss; - CComPtr m_bs; - CComPtr m_ss; - CComPtr m_lb; - CComPtr m_lb_uav; - CComPtr m_lb_srv; - CComPtr m_sob; - CComPtr m_sob_uav; - CComPtr m_sob_srv; - CComPtr m_vm; - //CComPtr m_vm; - CComPtr m_vm_uav; - uint32 m_vm_valid[16]; - CComPtr m_pb; - //CComPtr m_pb; - hash_map m_vs; - CComPtr m_vs_cb; - hash_map > m_gs; - CComPtr m_ps0; - hash_map > m_ps1; - CComPtr m_ps_cb; - - void Write(GSOffset* o, const GSVector4i& r); - void Read(GSOffset* o, const GSVector4i& r, bool invalidate); - - struct OffsetBuffer - { - CComPtr row, col; - CComPtr row_srv, col_srv; - }; - - hash_map m_offset; - - bool GetOffsetBuffer(OffsetBuffer** fzbo); - -protected: - GSTexture* m_texture[2]; - uint8* m_output; - - bool CreateDevice(GSDevice* dev); - void ResetDevice(); - void VSync(int field); - GSTexture* GetOutput(int i); - void Draw(); - void InvalidateVideoMem(const GIFRegBITBLTBUF& BITBLTBUF, const GSVector4i& r); - void InvalidateLocalMem(const GIFRegBITBLTBUF& BITBLTBUF, const GSVector4i& r, bool clut); - -public: - GSRendererCS(); - virtual ~GSRendererCS(); -}; +/* + * Copyright (C) 2007-2009 Gabest + * http://www.gabest.org + * + * This Program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This Program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU Make; see the file COPYING. If not, write to + * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + * http://www.gnu.org/copyleft/gpl.html + * + */ + +#pragma once + +#include "GSRenderer.h" +#include "GSDevice11.h" + +class GSRendererCS : public GSRenderer +{ + struct VSSelector + { + union + { + struct + { + uint32 tme:1; + uint32 fst:1; + }; + + uint32 key; + }; + + operator uint32() {return key & 0x3;} + + VSSelector() : key(0) {} + }; + + __aligned(struct, 32) VSConstantBuffer + { + GSVector4 VertexScale; + GSVector4 VertexOffset; + }; + + struct GSSelector + { + union + { + struct + { + uint32 iip:1; + uint32 prim:2; + }; + + uint32 key; + }; + + operator uint32() {return key & 0x7;} + + GSSelector() : key(0) {} + }; + + struct PSSelector + { + union + { + struct + { + uint32 fpsm:6; + uint32 zpsm:6; + }; + + uint32 key; + }; + + operator uint32() {return key & 0x3ff;} + + PSSelector() : key(0) {} + }; + + __aligned(struct, 32) PSConstantBuffer + { + uint32 fm; + uint32 zm; + }; + + CComPtr m_dss; + CComPtr m_bs; + CComPtr m_ss; + CComPtr m_lb; + CComPtr m_lb_uav; + CComPtr m_lb_srv; + CComPtr m_sob; + CComPtr m_sob_uav; + CComPtr m_sob_srv; + CComPtr m_vm; + //CComPtr m_vm; + CComPtr m_vm_uav; + uint32 m_vm_valid[16]; + CComPtr m_pb; + //CComPtr m_pb; + hash_map m_vs; + CComPtr m_vs_cb; + hash_map > m_gs; + CComPtr m_ps0; + hash_map > m_ps1; + CComPtr m_ps_cb; + + void Write(GSOffset* o, const GSVector4i& r); + void Read(GSOffset* o, const GSVector4i& r, bool invalidate); + + struct OffsetBuffer + { + CComPtr row, col; + CComPtr row_srv, col_srv; + }; + + hash_map m_offset; + + bool GetOffsetBuffer(OffsetBuffer** fzbo); + +protected: + GSTexture* m_texture[2]; + uint8* m_output; + + bool CreateDevice(GSDevice* dev); + void ResetDevice(); + void VSync(int field); + GSTexture* GetOutput(int i); + void Draw(); + void InvalidateVideoMem(const GIFRegBITBLTBUF& BITBLTBUF, const GSVector4i& r); + void InvalidateLocalMem(const GIFRegBITBLTBUF& BITBLTBUF, const GSVector4i& r, bool clut); + +public: + GSRendererCS(); + virtual ~GSRendererCS(); +}; diff --git a/plugins/GSdx/config.h b/plugins/GSdx/config.h index 745bc35822..b082c9bce8 100644 --- a/plugins/GSdx/config.h +++ b/plugins/GSdx/config.h @@ -1,44 +1,44 @@ -/* - * Copyright (C) 2007-2009 Gabest - * http://www.gabest.org - * - * This Program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2, or (at your option) - * any later version. - * - * This Program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with GNU Make; see the file COPYING. If not, write to - * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. - * http://www.gnu.org/copyleft/gpl.html - * - */ - -#pragma once - -//#define ENABLE_VTUNE - -#define ENABLE_JIT_RASTERIZER - -//#define ENABLE_DYNAMIC_CRC_HACK - -#define ENABLE_UPSCALE_HACKS // Hacks intended to fix upscaling / rendering glitches in HW renderers - -//#define DISABLE_HW_TEXTURE_CACHE // Slow but fixes a lot of bugs - -//#define DISABLE_CRC_HACKS // Disable all game specific hacks - -#if defined(DISABLE_HW_TEXTURE_CACHE) && !defined(DISABLE_CRC_HACKS) - #define DISABLE_CRC_HACKS -#endif - -//#define DISABLE_BITMASKING - -//#define DISABLE_COLCLAMP - -//#define DISABLE_DATE +/* + * Copyright (C) 2007-2009 Gabest + * http://www.gabest.org + * + * This Program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This Program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU Make; see the file COPYING. If not, write to + * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + * http://www.gnu.org/copyleft/gpl.html + * + */ + +#pragma once + +//#define ENABLE_VTUNE + +#define ENABLE_JIT_RASTERIZER + +//#define ENABLE_DYNAMIC_CRC_HACK + +#define ENABLE_UPSCALE_HACKS // Hacks intended to fix upscaling / rendering glitches in HW renderers + +//#define DISABLE_HW_TEXTURE_CACHE // Slow but fixes a lot of bugs + +//#define DISABLE_CRC_HACKS // Disable all game specific hacks + +#if defined(DISABLE_HW_TEXTURE_CACHE) && !defined(DISABLE_CRC_HACKS) + #define DISABLE_CRC_HACKS +#endif + +//#define DISABLE_BITMASKING + +//#define DISABLE_COLCLAMP + +//#define DISABLE_DATE diff --git a/plugins/GSdx/res/cs.fx b/plugins/GSdx/res/cs.fx index 594e99e85b..fb63c0b012 100644 --- a/plugins/GSdx/res/cs.fx +++ b/plugins/GSdx/res/cs.fx @@ -1,383 +1,383 @@ -#ifndef VS_TME -#define VS_TME 1 -#define VS_FST 1 -#endif - -#ifndef GS_IIP -#define GS_IIP 0 -#define GS_PRIM 2 -#endif - -#ifndef PS_BATCH_SIZE -#define PS_BATCH_SIZE 2048 -#define PS_FPSM PSM_PSMCT32 -#define PS_ZPSM PSM_PSMZ16 -#endif - -#define PSM_PSMCT32 0 -#define PSM_PSMCT24 1 -#define PSM_PSMCT16 2 -#define PSM_PSMCT16S 10 -#define PSM_PSMT8 19 -#define PSM_PSMT4 20 -#define PSM_PSMT8H 27 -#define PSM_PSMT4HL 36 -#define PSM_PSMT4HH 44 -#define PSM_PSMZ32 48 -#define PSM_PSMZ24 49 -#define PSM_PSMZ16 50 -#define PSM_PSMZ16S 58 - -struct VS_INPUT -{ - float2 st : TEXCOORD0; - float4 c : COLOR0; - float q : TEXCOORD1; - uint2 p : POSITION0; - uint z : POSITION1; - uint2 uv : TEXCOORD2; - float4 f : COLOR1; -}; - -struct VS_OUTPUT -{ - float4 p : SV_Position; - float2 z : TEXCOORD0; - float4 t : TEXCOORD1; - float4 c : COLOR0; -}; - -struct GS_OUTPUT -{ - float4 p : SV_Position; - float2 z : TEXCOORD0; - float4 t : TEXCOORD1; - float4 c : COLOR0; - uint id : SV_PrimitiveID; -}; - -cbuffer VSConstantBuffer : register(c0) -{ - float4 VertexScale; - float4 VertexOffset; -}; - -cbuffer PSConstantBuffer : register(c0) -{ - uint2 WriteMask; -}; - -struct FragmentLinkItem -{ - uint c, z, id, next; -}; - -RWByteAddressBuffer VideoMemory : register(u0); -RWStructuredBuffer FragmentLinkBuffer : register(u1); -RWByteAddressBuffer StartOffsetBuffer : register(u2); -//RWTexture2D VideoMemory : register(u2); // 8192 * 512 R8_UINT - -Buffer FZRowOffset : register(t0); -Buffer FZColOffset : register(t1); -Texture2D Palette : register(t2); -Texture2D Texture : register(t3); - -VS_OUTPUT vs_main(VS_INPUT input) -{ - VS_OUTPUT output; - - output.p = float4(input.p, 0.0f, 0.0f) * VertexScale - VertexOffset; - output.z = float2(input.z & 0xffff, input.z >> 16); // TODO: min(input.z, 0xffffff00) ? - - if(VS_TME) - { - if(VS_FST) - { - output.t.xy = input.uv; - output.t.w = 1.0f; - } - else - { - output.t.xy = input.st; - output.t.w = input.q; - } - } - else - { - output.t.xy = 0; - output.t.w = 1.0f; - } - - output.c = input.c; - output.t.z = input.f.r; - - return output; -} - -#if GS_PRIM == 0 - -[maxvertexcount(1)] -void gs_main(point VS_OUTPUT input[1], inout PointStream stream, uint id : SV_PrimitiveID) -{ - GS_OUTPUT output; - - output.p = input[0].p; - output.z = input[0].z; - output.t = input[0].t; - output.c = input[0].c; - output.id = id; - - stream.Append(output); -} - -#elif GS_PRIM == 1 - -[maxvertexcount(2)] -void gs_main(line VS_OUTPUT input[2], inout LineStream stream, uint id : SV_PrimitiveID) -{ - [unroll] - for(int i = 0; i < 2; i++) - { - GS_OUTPUT output; - - output.p = input[i].p; - output.z = input[i].z; - output.t = input[i].t; - output.c = input[i].c; - output.id = id; - -#if GS_IIP == 0 - if(i != 1) output.c = input[1].c; -#endif - - stream.Append(output); - } -} - -#elif GS_PRIM == 2 - -[maxvertexcount(3)] -void gs_main(triangle VS_OUTPUT input[3], inout TriangleStream stream, uint id : SV_PrimitiveID) -{ - [unroll] - for(int i = 0; i < 3; i++) - { - GS_OUTPUT output; - - output.p = input[i].p; - output.z = input[i].z; - output.t = input[i].t; - output.c = input[i].c; - output.id = id; - -#if GS_IIP == 0 - if(i != 2) output.c = input[2].c; -#endif - - stream.Append(output); - } -} - -#elif GS_PRIM == 3 - -[maxvertexcount(4)] -void gs_main(line VS_OUTPUT input[2], inout TriangleStream stream, uint id : SV_PrimitiveID) -{ - GS_OUTPUT lt, rb, lb, rt; - - lt.p = input[0].p; - lt.z = input[1].z; - lt.t.xy = input[0].t.xy; - lt.t.zw = input[1].t.zw; - lt.c = input[0].c; - lt.id = id; - -#if GS_IIP == 0 - lt.c = input[1].c; -#endif - - rb.p = input[1].p; - rb.z = input[1].z; - rb.t = input[1].t; - rb.c = input[1].c; - rb.id = id; - - lb = lt; - lb.p.y = rb.p.y; - lb.t.y = rb.t.y; - - rt = rb; - rt.p.y = lt.p.y; - rt.t.y = lt.t.y; - - stream.Append(lt); - stream.Append(lb); - stream.Append(rt); - stream.Append(rb); -} - -#endif - -uint CompressColor32(float4 f) -{ - uint4 c = (uint4)(f * 0xff) << uint4(0, 8, 16, 24); - - return c.r | c.g | c.b | c.a; -} - -uint DecompressColor16(uint c) -{ - uint r = (c & 0x001f) << 3; - uint g = (c & 0x03e0) << 6; - uint b = (c & 0x7c00) << 9; - uint a = (c & 0x8000) << 15; - - return r | g | b | a; -} - -uint ReadPixel(uint addr) -{ - return VideoMemory.Load(addr) >> ((addr & 2) << 3); -} - -void WritePixel(uint addr, uint value, uint psm) -{ - uint tmp; - - switch(psm) - { - case PSM_PSMCT32: - case PSM_PSMZ32: - case PSM_PSMCT24: - case PSM_PSMZ24: - VideoMemory.Store(addr, value); - break; - case PSM_PSMCT16: - case PSM_PSMCT16S: - case PSM_PSMZ16: - case PSM_PSMZ16S: - tmp = (addr & 2) << 3; - value = ((value << tmp) ^ VideoMemory.Load(addr)) & (0x0000ffff << tmp); - VideoMemory.InterlockedXor(addr, value, tmp); - break; - } -} - -void ps_main0(GS_OUTPUT input) -{ - uint x = (uint)input.p.x; - uint y = (uint)input.p.y; - - uint tail = FragmentLinkBuffer.IncrementCounter(); - - uint index = (y << 11) + x; - uint next = 0; - - StartOffsetBuffer.InterlockedExchange(index * 4, tail, next); - - FragmentLinkItem item; - - // TODO: preprocess color (tfx, alpha test), z-test - - item.c = CompressColor32(input.c); - item.z = (uint)(input.z.y * 0x10000 + input.z.x); - item.id = input.id; - item.next = next; - - FragmentLinkBuffer[tail] = item; -} - -void ps_main1(GS_OUTPUT input) -{ - uint2 pos = (uint2)input.p.xy; - - // sort fragments - - uint StartOffsetIndex = (pos.y << 11) + pos.x; - - int index[PS_BATCH_SIZE]; - int count = 0; - - uint next = StartOffsetBuffer.Load(StartOffsetIndex * 4); - - StartOffsetBuffer.Store(StartOffsetIndex * 4, 0); - - [allow_uav_condition] - while(next != 0) - { - index[count++] = next; - - next = FragmentLinkBuffer[next].next; - } - - int N2 = 1 << (int)(ceil(log2(count))); - - [allow_uav_condition] - for(int i = count; i < N2; i++) - { - index[i] = 0; - } - - [allow_uav_condition] - for(int k = 2; k <= N2; k = 2 * k) - { - [allow_uav_condition] - for(int j = k >> 1; j > 0 ; j = j >> 1) - { - [allow_uav_condition] - for(int i = 0; i < N2; i++) - { - uint i_id = FragmentLinkBuffer[index[i]].id; - - int ixj = i ^ j; - - if(ixj > i) - { - uint ixj_id = FragmentLinkBuffer[index[ixj]].id; - - if((i & k) == 0 && i_id > ixj_id) - { - int temp = index[i]; - index[i] = index[ixj]; - index[ixj] = temp; - } - - if((i & k) != 0 && i_id < ixj_id) - { - int temp = index[i]; - index[i] = index[ixj]; - index[ixj] = temp; - } - } - } - } - } - - uint2 addr = (uint2)(FZRowOffset[pos.y] + FZColOffset[pos.x]) << 1; - - uint dc = ReadPixel(addr.x); - uint dz = ReadPixel(addr.y); - - uint sc = dc; - uint sz = dz; - - [allow_uav_condition] - while(--count >= 0) - { - FragmentLinkItem f = FragmentLinkBuffer[index[count]]; - - // TODO - - if(sz < f.z) - { - sc = f.c; - sz = f.z; - } - } - - uint c = sc; // (dc & ~WriteMask.x) | (sc & WriteMask.x); - uint z = 0;//sz; //(dz & ~WriteMask.y) | (sz & WriteMask.y); - - WritePixel(addr.x, c, PS_FPSM); - WritePixel(addr.y, z, PS_ZPSM); -} +#ifndef VS_TME +#define VS_TME 1 +#define VS_FST 1 +#endif + +#ifndef GS_IIP +#define GS_IIP 0 +#define GS_PRIM 2 +#endif + +#ifndef PS_BATCH_SIZE +#define PS_BATCH_SIZE 2048 +#define PS_FPSM PSM_PSMCT32 +#define PS_ZPSM PSM_PSMZ16 +#endif + +#define PSM_PSMCT32 0 +#define PSM_PSMCT24 1 +#define PSM_PSMCT16 2 +#define PSM_PSMCT16S 10 +#define PSM_PSMT8 19 +#define PSM_PSMT4 20 +#define PSM_PSMT8H 27 +#define PSM_PSMT4HL 36 +#define PSM_PSMT4HH 44 +#define PSM_PSMZ32 48 +#define PSM_PSMZ24 49 +#define PSM_PSMZ16 50 +#define PSM_PSMZ16S 58 + +struct VS_INPUT +{ + float2 st : TEXCOORD0; + float4 c : COLOR0; + float q : TEXCOORD1; + uint2 p : POSITION0; + uint z : POSITION1; + uint2 uv : TEXCOORD2; + float4 f : COLOR1; +}; + +struct VS_OUTPUT +{ + float4 p : SV_Position; + float2 z : TEXCOORD0; + float4 t : TEXCOORD1; + float4 c : COLOR0; +}; + +struct GS_OUTPUT +{ + float4 p : SV_Position; + float2 z : TEXCOORD0; + float4 t : TEXCOORD1; + float4 c : COLOR0; + uint id : SV_PrimitiveID; +}; + +cbuffer VSConstantBuffer : register(c0) +{ + float4 VertexScale; + float4 VertexOffset; +}; + +cbuffer PSConstantBuffer : register(c0) +{ + uint2 WriteMask; +}; + +struct FragmentLinkItem +{ + uint c, z, id, next; +}; + +RWByteAddressBuffer VideoMemory : register(u0); +RWStructuredBuffer FragmentLinkBuffer : register(u1); +RWByteAddressBuffer StartOffsetBuffer : register(u2); +//RWTexture2D VideoMemory : register(u2); // 8192 * 512 R8_UINT + +Buffer FZRowOffset : register(t0); +Buffer FZColOffset : register(t1); +Texture2D Palette : register(t2); +Texture2D Texture : register(t3); + +VS_OUTPUT vs_main(VS_INPUT input) +{ + VS_OUTPUT output; + + output.p = float4(input.p, 0.0f, 0.0f) * VertexScale - VertexOffset; + output.z = float2(input.z & 0xffff, input.z >> 16); // TODO: min(input.z, 0xffffff00) ? + + if(VS_TME) + { + if(VS_FST) + { + output.t.xy = input.uv; + output.t.w = 1.0f; + } + else + { + output.t.xy = input.st; + output.t.w = input.q; + } + } + else + { + output.t.xy = 0; + output.t.w = 1.0f; + } + + output.c = input.c; + output.t.z = input.f.r; + + return output; +} + +#if GS_PRIM == 0 + +[maxvertexcount(1)] +void gs_main(point VS_OUTPUT input[1], inout PointStream stream, uint id : SV_PrimitiveID) +{ + GS_OUTPUT output; + + output.p = input[0].p; + output.z = input[0].z; + output.t = input[0].t; + output.c = input[0].c; + output.id = id; + + stream.Append(output); +} + +#elif GS_PRIM == 1 + +[maxvertexcount(2)] +void gs_main(line VS_OUTPUT input[2], inout LineStream stream, uint id : SV_PrimitiveID) +{ + [unroll] + for(int i = 0; i < 2; i++) + { + GS_OUTPUT output; + + output.p = input[i].p; + output.z = input[i].z; + output.t = input[i].t; + output.c = input[i].c; + output.id = id; + +#if GS_IIP == 0 + if(i != 1) output.c = input[1].c; +#endif + + stream.Append(output); + } +} + +#elif GS_PRIM == 2 + +[maxvertexcount(3)] +void gs_main(triangle VS_OUTPUT input[3], inout TriangleStream stream, uint id : SV_PrimitiveID) +{ + [unroll] + for(int i = 0; i < 3; i++) + { + GS_OUTPUT output; + + output.p = input[i].p; + output.z = input[i].z; + output.t = input[i].t; + output.c = input[i].c; + output.id = id; + +#if GS_IIP == 0 + if(i != 2) output.c = input[2].c; +#endif + + stream.Append(output); + } +} + +#elif GS_PRIM == 3 + +[maxvertexcount(4)] +void gs_main(line VS_OUTPUT input[2], inout TriangleStream stream, uint id : SV_PrimitiveID) +{ + GS_OUTPUT lt, rb, lb, rt; + + lt.p = input[0].p; + lt.z = input[1].z; + lt.t.xy = input[0].t.xy; + lt.t.zw = input[1].t.zw; + lt.c = input[0].c; + lt.id = id; + +#if GS_IIP == 0 + lt.c = input[1].c; +#endif + + rb.p = input[1].p; + rb.z = input[1].z; + rb.t = input[1].t; + rb.c = input[1].c; + rb.id = id; + + lb = lt; + lb.p.y = rb.p.y; + lb.t.y = rb.t.y; + + rt = rb; + rt.p.y = lt.p.y; + rt.t.y = lt.t.y; + + stream.Append(lt); + stream.Append(lb); + stream.Append(rt); + stream.Append(rb); +} + +#endif + +uint CompressColor32(float4 f) +{ + uint4 c = (uint4)(f * 0xff) << uint4(0, 8, 16, 24); + + return c.r | c.g | c.b | c.a; +} + +uint DecompressColor16(uint c) +{ + uint r = (c & 0x001f) << 3; + uint g = (c & 0x03e0) << 6; + uint b = (c & 0x7c00) << 9; + uint a = (c & 0x8000) << 15; + + return r | g | b | a; +} + +uint ReadPixel(uint addr) +{ + return VideoMemory.Load(addr) >> ((addr & 2) << 3); +} + +void WritePixel(uint addr, uint value, uint psm) +{ + uint tmp; + + switch(psm) + { + case PSM_PSMCT32: + case PSM_PSMZ32: + case PSM_PSMCT24: + case PSM_PSMZ24: + VideoMemory.Store(addr, value); + break; + case PSM_PSMCT16: + case PSM_PSMCT16S: + case PSM_PSMZ16: + case PSM_PSMZ16S: + tmp = (addr & 2) << 3; + value = ((value << tmp) ^ VideoMemory.Load(addr)) & (0x0000ffff << tmp); + VideoMemory.InterlockedXor(addr, value, tmp); + break; + } +} + +void ps_main0(GS_OUTPUT input) +{ + uint x = (uint)input.p.x; + uint y = (uint)input.p.y; + + uint tail = FragmentLinkBuffer.IncrementCounter(); + + uint index = (y << 11) + x; + uint next = 0; + + StartOffsetBuffer.InterlockedExchange(index * 4, tail, next); + + FragmentLinkItem item; + + // TODO: preprocess color (tfx, alpha test), z-test + + item.c = CompressColor32(input.c); + item.z = (uint)(input.z.y * 0x10000 + input.z.x); + item.id = input.id; + item.next = next; + + FragmentLinkBuffer[tail] = item; +} + +void ps_main1(GS_OUTPUT input) +{ + uint2 pos = (uint2)input.p.xy; + + // sort fragments + + uint StartOffsetIndex = (pos.y << 11) + pos.x; + + int index[PS_BATCH_SIZE]; + int count = 0; + + uint next = StartOffsetBuffer.Load(StartOffsetIndex * 4); + + StartOffsetBuffer.Store(StartOffsetIndex * 4, 0); + + [allow_uav_condition] + while(next != 0) + { + index[count++] = next; + + next = FragmentLinkBuffer[next].next; + } + + int N2 = 1 << (int)(ceil(log2(count))); + + [allow_uav_condition] + for(int i = count; i < N2; i++) + { + index[i] = 0; + } + + [allow_uav_condition] + for(int k = 2; k <= N2; k = 2 * k) + { + [allow_uav_condition] + for(int j = k >> 1; j > 0 ; j = j >> 1) + { + [allow_uav_condition] + for(int i = 0; i < N2; i++) + { + uint i_id = FragmentLinkBuffer[index[i]].id; + + int ixj = i ^ j; + + if(ixj > i) + { + uint ixj_id = FragmentLinkBuffer[index[ixj]].id; + + if((i & k) == 0 && i_id > ixj_id) + { + int temp = index[i]; + index[i] = index[ixj]; + index[ixj] = temp; + } + + if((i & k) != 0 && i_id < ixj_id) + { + int temp = index[i]; + index[i] = index[ixj]; + index[ixj] = temp; + } + } + } + } + } + + uint2 addr = (uint2)(FZRowOffset[pos.y] + FZColOffset[pos.x]) << 1; + + uint dc = ReadPixel(addr.x); + uint dz = ReadPixel(addr.y); + + uint sc = dc; + uint sz = dz; + + [allow_uav_condition] + while(--count >= 0) + { + FragmentLinkItem f = FragmentLinkBuffer[index[count]]; + + // TODO + + if(sz < f.z) + { + sc = f.c; + sz = f.z; + } + } + + uint c = sc; // (dc & ~WriteMask.x) | (sc & WriteMask.x); + uint z = 0;//sz; //(dz & ~WriteMask.y) | (sz & WriteMask.y); + + WritePixel(addr.x, c, PS_FPSM); + WritePixel(addr.y, z, PS_ZPSM); +} diff --git a/plugins/GSdx/res/fxaa.fx b/plugins/GSdx/res/fxaa.fx index 095d289197..450aea1d8d 100644 --- a/plugins/GSdx/res/fxaa.fx +++ b/plugins/GSdx/res/fxaa.fx @@ -1,1996 +1,1996 @@ -#ifdef SHADER_MODEL // make safe to include in resource file to enforce dependency - -#define FXAA_PC 1 -#define FXAA_QUALITY__SUBPIX 0.0 - -#if SHADER_MODEL >= 0x400 - -#if SHADER_MODEL >= 0x500 - #define FXAA_HLSL_5 1 -#else - #define FXAA_HLSL_4 1 -#endif - -Texture2D Texture; -SamplerState TextureSampler; - -cbuffer cb0 -{ - float4 _rcpFrame; - float4 _rcpFrameOpt; -}; - -struct PS_INPUT -{ - float4 p : SV_Position; - float2 t : TEXCOORD0; -}; - -struct PS_OUTPUT -{ - float4 c : SV_Target0; -}; - -#elif SHADER_MODEL <= 0x300 - -#define FXAA_HLSL_3 1 - -sampler Texture : register(s0); - -float4 _rcpFrame : register(c0); -float4 _rcpFrameOpt : register(c1); - -struct PS_INPUT -{ -#if SHADER_MODEL < 0x300 - float4 p : TEXCOORD1; -#else - float4 p : VPOS; -#endif - float2 t : TEXCOORD0; -}; - -struct PS_OUTPUT -{ - float4 c : COLOR; -}; - -#endif - -/*============================================================================ - - - NVIDIA FXAA 3.10 by TIMOTHY LOTTES - - ------------------------------------------------------------------------------- -COPYRIGHT (C) 2010, 2011 NVIDIA CORPORATION. ALL RIGHTS RESERVED. ------------------------------------------------------------------------------- -TO THE MAXIMUM EXTENT PERMITTED BY APPLICABLE LAW, THIS SOFTWARE IS PROVIDED -*AS IS* AND NVIDIA AND ITS SUPPLIERS DISCLAIM ALL WARRANTIES, EITHER EXPRESS -OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, IMPLIED WARRANTIES OF -MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. IN NO EVENT SHALL NVIDIA -OR ITS SUPPLIERS BE LIABLE FOR ANY SPECIAL, INCIDENTAL, INDIRECT, OR -CONSEQUENTIAL DAMAGES WHATSOEVER (INCLUDING, WITHOUT LIMITATION, DAMAGES FOR -LOSS OF BUSINESS PROFITS, BUSINESS INTERRUPTION, LOSS OF BUSINESS INFORMATION, -OR ANY OTHER PECUNIARY LOSS) ARISING OUT OF THE USE OF OR INABILITY TO USE -THIS SOFTWARE, EVEN IF NVIDIA HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH -DAMAGES. - ------------------------------------------------------------------------------- - INTEGRATION CHECKLIST ------------------------------------------------------------------------------- -(1.) -In the shader source, -setup defines for the desired configuration. -Example, - - #define FXAA_PC 1 - #define FXAA_HLSL_3 1 - #define FXAA_QUALITY__PRESET 12 - #define FXAA_QUALITY__EDGE_THRESHOLD (1.0/6.0) - #define FXAA_QUALITY__EDGE_THRESHOLD_MIN (1.0/12.0) - -(2.) -Then include this file, - - #include "Fxaa3.h" - -(3.) -Then call the FXAA pixel shader from within your desired shader, - - return FxaaPixelShader(pos, posPos, tex, rcpFrame, rcpFrameOpt); - -(4.) -Insure pass prior to FXAA outputs RGBL. -See next section. - -(5.) -Setup engine to provide "rcpFrame" and "rcpFrameOpt" constants. -Not using constants will result in a performance loss. - - // {x_} = 1.0/screenWidthInPixels - // {_y} = 1.0/screenHeightInPixels - float2 rcpFrame - - // This must be from a constant/uniform. - // {x___} = 2.0/screenWidthInPixels - // {_y__} = 2.0/screenHeightInPixels - // {__z_} = 0.5/screenWidthInPixels - // {___w} = 0.5/screenHeightInPixels - float4 rcpFrameOpt - -(5.a.) -Optionally change to this for sharper FXAA Console, - - // This must be from a constant/uniform. - // {x___} = 2.0/screenWidthInPixels - // {_y__} = 2.0/screenHeightInPixels - // {__z_} = 0.333/screenWidthInPixels - // {___w} = 0.333/screenHeightInPixels - float4 rcpFrameOpt - -(6.) -Have FXAA vertex shader run as a full screen triangle, -and output "pos" and "posPos" such that inputs in the pixel shader provide, - - // {xy} = center of pixel - float2 pos, - - // {xy__} = upper left of pixel - // {__zw} = lower right of pixel - float4 posPos, - -(7.) -Insure the texture sampler used by FXAA is set to bilinear filtering. - - ------------------------------------------------------------------------------- - INTEGRATION - RGBL AND COLORSPACE ------------------------------------------------------------------------------- -FXAA3 requires RGBL as input. - -RGB should be LDR (low dynamic range). -Specifically do FXAA after tonemapping. - -RGB data as returned by a texture fetch can be linear or non-linear. -Note an "sRGB format" texture counts as linear, -because the result of a texture fetch is linear data. -Regular "RGBA8" textures in the sRGB colorspace are non-linear. - -Luma must be stored in the alpha channel prior to running FXAA. -This luma should be in a perceptual space (could be gamma 2.0). -Example pass before FXAA where output is gamma 2.0 encoded, - - color.rgb = ToneMap(color.rgb); // linear color output - color.rgb = sqrt(color.rgb); // gamma 2.0 color output - return color; - -To use FXAA, - - color.rgb = ToneMap(color.rgb); // linear color output - color.rgb = sqrt(color.rgb); // gamma 2.0 color output - color.a = dot(color.rgb, float3(0.299, 0.587, 0.114)); // compute luma - return color; - -Another example where output is linear encoded, -say for instance writing to an sRGB formated render target, -where the render target does the conversion back to sRGB after blending, - - color.rgb = ToneMap(color.rgb); // linear color output - return color; - -To use FXAA, - - color.rgb = ToneMap(color.rgb); // linear color output - color.a = sqrt(dot(color.rgb, float3(0.299, 0.587, 0.114))); // compute luma - return color; - -Getting luma correct is required for the algorithm to work correctly. - - ------------------------------------------------------------------------------- - BEING LINEARLY CORRECT? ------------------------------------------------------------------------------- -Applying FXAA to a framebuffer with linear RGB color will look worse. -This is very counter intuitive, but happends to be true in this case. -The reason is because dithering artifacts will be more visiable -in a linear colorspace. - - ------------------------------------------------------------------------------- - COMPLEX INTEGRATION ------------------------------------------------------------------------------- -Q. What if the engine is blending into RGB before wanting to run FXAA? - -A. In the last opaque pass prior to FXAA, - have the pass write out luma into alpha. - Then blend into RGB only. - FXAA should be able to run ok - assuming the blending pass did not any add aliasing. - This should be the common case for particles and common blending passes. - -============================================================================*/ - -/*============================================================================ - - INTEGRATION KNOBS - -============================================================================*/ -// -// FXAA_PS3 and FXAA_360 choose the console algorithm (FXAA3 CONSOLE). -// FXAA_360_OPT is a prototype for the new optimized 360 version. -// -// 1 = Use API. -// 0 = Don't use API. -// -/*--------------------------------------------------------------------------*/ -#ifndef FXAA_PS3 - #define FXAA_PS3 0 -#endif -/*--------------------------------------------------------------------------*/ -#ifndef FXAA_360 - #define FXAA_360 0 -#endif -/*--------------------------------------------------------------------------*/ -#ifndef FXAA_360_OPT - #define FXAA_360_OPT 0 -#endif -/*==========================================================================*/ -#ifndef FXAA_PC - // - // FXAA Quality - // The high quality PC algorithm. - // - #define FXAA_PC 0 -#endif -/*--------------------------------------------------------------------------*/ -#ifndef FXAA_PC_CONSOLE - // - // The console algorithm for PC is included - // for developers targeting really low spec machines. - // - #define FXAA_PC_CONSOLE 0 -#endif -/*--------------------------------------------------------------------------*/ -#ifndef FXAA_GLSL_120 - #define FXAA_GLSL_120 0 -#endif -/*--------------------------------------------------------------------------*/ -#ifndef FXAA_GLSL_130 - #define FXAA_GLSL_130 0 -#endif -/*--------------------------------------------------------------------------*/ -#ifndef FXAA_HLSL_3 - #define FXAA_HLSL_3 0 -#endif -/*--------------------------------------------------------------------------*/ -#ifndef FXAA_HLSL_4 - #define FXAA_HLSL_4 0 -#endif -/*--------------------------------------------------------------------------*/ -#ifndef FXAA_HLSL_5 - #define FXAA_HLSL_5 0 -#endif -/*==========================================================================*/ -#ifndef FXAA_EARLY_EXIT - // - // Controls algorithm's early exit path. - // On PS3 turning this on adds 2 cycles to the shader. - // On 360 turning this off adds 10ths of a millisecond to the shader. - // Turning this off on console will result in a more blurry image. - // So this defaults to on. - // - // 1 = On. - // 0 = Off. - // - #define FXAA_EARLY_EXIT 1 -#endif -/*--------------------------------------------------------------------------*/ -#ifndef FXAA_DISCARD - // - // Only valid for PC OpenGL currently. - // - // 1 = Use discard on pixels which don't need AA. - // For APIs which enable concurrent TEX+ROP from same surface. - // 0 = Return unchanged color on pixels which don't need AA. - // - #define FXAA_DISCARD 0 -#endif -/*--------------------------------------------------------------------------*/ -#ifndef FXAA_FAST_PIXEL_OFFSET - // - // Used for GLSL 120 only. - // - // 1 = GL API supports fast pixel offsets - // 0 = do not use fast pixel offsets - // - #ifdef GL_EXT_gpu_shader4 - #define FXAA_FAST_PIXEL_OFFSET 1 - #endif - #ifdef GL_NV_gpu_shader5 - #define FXAA_FAST_PIXEL_OFFSET 1 - #endif - #ifdef GL_ARB_gpu_shader5 - #define FXAA_FAST_PIXEL_OFFSET 1 - #endif - #ifndef FXAA_FAST_PIXEL_OFFSET - #define FXAA_FAST_PIXEL_OFFSET 0 - #endif -#endif -/*--------------------------------------------------------------------------*/ -#ifndef FXAA_GATHER4_ALPHA - // - // 1 = API supports gather4 on alpha channel. - // 0 = API does not support gather4 on alpha channel. - // - #if (FXAA_HLSL_5 == 1) - #define FXAA_GATHER4_ALPHA 1 - #endif - #ifdef GL_ARB_gpu_shader5 - #define FXAA_GATHER4_ALPHA 1 - #endif - #ifdef GL_NV_gpu_shader5 - #define FXAA_GATHER4_ALPHA 1 - #endif - #ifndef FXAA_GATHER4_ALPHA - #define FXAA_GATHER4_ALPHA 0 - #endif -#endif - -/*============================================================================ - FXAA CONSOLE - TUNING KNOBS -============================================================================*/ -#ifndef FXAA_CONSOLE__EDGE_SHARPNESS - // - // Consoles the sharpness of edges. - // - // Due to the PS3 being ALU bound, - // there are only two safe values here: 4 and 8. - // These options use the shaders ability to a free *|/ by 4|8. - // - // 8.0 is sharper - // 4.0 is softer - // 2.0 is really soft (good for vector graphics inputs) - // - #if 1 - #define FXAA_CONSOLE__EDGE_SHARPNESS 8.0 - #endif - #if 0 - #define FXAA_CONSOLE__EDGE_SHARPNESS 4.0 - #endif - #if 0 - #define FXAA_CONSOLE__EDGE_SHARPNESS 2.0 - #endif -#endif -/*--------------------------------------------------------------------------*/ -#ifndef FXAA_CONSOLE__EDGE_THRESHOLD - // - // The minimum amount of local contrast required to apply algorithm. - // The console setting has a different mapping than the quality setting. - // - // This only applies when FXAA_EARLY_EXIT is 1. - // - // Due to the PS3 being ALU bound, - // there are only two safe values here: 0.25 and 0.125. - // These options use the shaders ability to a free *|/ by 4|8. - // - // 0.125 leaves less aliasing, but is softer - // 0.25 leaves more aliasing, and is sharper - // - #if 1 - #define FXAA_CONSOLE__EDGE_THRESHOLD 0.125 - #else - #define FXAA_CONSOLE__EDGE_THRESHOLD 0.25 - #endif -#endif -/*--------------------------------------------------------------------------*/ -#ifndef FXAA_CONSOLE__EDGE_THRESHOLD_MIN - // - // Trims the algorithm from processing darks. - // The console setting has a different mapping than the quality setting. - // - // This only applies when FXAA_EARLY_EXIT is 1. - // - // This does not apply to PS3. - // PS3 was simplified to avoid more shader instructions. - // - #define FXAA_CONSOLE__EDGE_THRESHOLD_MIN 0.05 -#endif - -/*============================================================================ - FXAA QUALITY - TUNING KNOBS -============================================================================*/ -#ifndef FXAA_QUALITY__EDGE_THRESHOLD - // - // The minimum amount of local contrast required to apply algorithm. - // - // 1/3 - too little - // 1/4 - low quality - // 1/6 - default - // 1/8 - high quality (default) - // 1/16 - overkill - // - #define FXAA_QUALITY__EDGE_THRESHOLD (1.0/6.0) -#endif -/*--------------------------------------------------------------------------*/ -#ifndef FXAA_QUALITY__EDGE_THRESHOLD_MIN - // - // Trims the algorithm from processing darks. - // - // 1/32 - visible limit - // 1/16 - high quality - // 1/12 - upper limit (default, the start of visible unfiltered edges) - // - #define FXAA_QUALITY__EDGE_THRESHOLD_MIN (1.0/12.0) -#endif -/*--------------------------------------------------------------------------*/ -#ifndef FXAA_QUALITY__SUBPIX - // - // Choose the amount of sub-pixel aliasing removal. - // - // 1 - upper limit (softer) - // 3/4 - default amount of filtering - // 1/2 - lower limit (sharper, less sub-pixel aliasing removal) - // - #define FXAA_QUALITY__SUBPIX (3.0/4.0) -#endif -/*--------------------------------------------------------------------------*/ -#ifndef FXAA_QUALITY__PRESET - // - // Choose the quality preset. - // - // OPTIONS - // ----------------------------------------------------------------------- - // 10 to 15 - default medium dither (10=fastest, 15=highest quality) - // 20 to 29 - less dither, more expensive (20=fastest, 29=highest quality) - // 39 - no dither, very expensive - // - // NOTES - // ----------------------------------------------------------------------- - // 12 = slightly faster then FXAA 3.9 and higher edge quality (default) - // 13 = about same speed as FXAA 3.9 and better than 12 - // 23 = closest to FXAA 3.9 visually and performance wise - // _ = the lowest digit is directly related to performance - // _ = the highest digit is directly related to style - // - #define FXAA_QUALITY__PRESET 12 -#endif - - -/*============================================================================ - - FXAA QUALITY - PRESETS - -============================================================================*/ - -/*============================================================================ - FXAA QUALITY - MEDIUM DITHER PRESETS -============================================================================*/ -#if (FXAA_QUALITY__PRESET == 10) - #define FXAA_QUALITY__PS 3 - #define FXAA_QUALITY__P0 1.5 - #define FXAA_QUALITY__P1 3.0 - #define FXAA_QUALITY__P2 12.0 -#endif -/*--------------------------------------------------------------------------*/ -#if (FXAA_QUALITY__PRESET == 11) - #define FXAA_QUALITY__PS 4 - #define FXAA_QUALITY__P0 1.0 - #define FXAA_QUALITY__P1 1.5 - #define FXAA_QUALITY__P2 3.0 - #define FXAA_QUALITY__P3 12.0 -#endif -/*--------------------------------------------------------------------------*/ -#if (FXAA_QUALITY__PRESET == 12) - #define FXAA_QUALITY__PS 5 - #define FXAA_QUALITY__P0 1.0 - #define FXAA_QUALITY__P1 1.5 - #define FXAA_QUALITY__P2 2.0 - #define FXAA_QUALITY__P3 4.0 - #define FXAA_QUALITY__P4 12.0 -#endif -/*--------------------------------------------------------------------------*/ -#if (FXAA_QUALITY__PRESET == 13) - #define FXAA_QUALITY__PS 6 - #define FXAA_QUALITY__P0 1.0 - #define FXAA_QUALITY__P1 1.5 - #define FXAA_QUALITY__P2 2.0 - #define FXAA_QUALITY__P3 2.0 - #define FXAA_QUALITY__P4 4.0 - #define FXAA_QUALITY__P5 12.0 -#endif -/*--------------------------------------------------------------------------*/ -#if (FXAA_QUALITY__PRESET == 14) - #define FXAA_QUALITY__PS 7 - #define FXAA_QUALITY__P0 1.0 - #define FXAA_QUALITY__P1 1.5 - #define FXAA_QUALITY__P2 2.0 - #define FXAA_QUALITY__P3 2.0 - #define FXAA_QUALITY__P4 2.0 - #define FXAA_QUALITY__P5 4.0 - #define FXAA_QUALITY__P6 12.0 -#endif -/*--------------------------------------------------------------------------*/ -#if (FXAA_QUALITY__PRESET == 15) - #define FXAA_QUALITY__PS 8 - #define FXAA_QUALITY__P0 1.0 - #define FXAA_QUALITY__P1 1.5 - #define FXAA_QUALITY__P2 2.0 - #define FXAA_QUALITY__P3 2.0 - #define FXAA_QUALITY__P4 2.0 - #define FXAA_QUALITY__P5 2.0 - #define FXAA_QUALITY__P6 4.0 - #define FXAA_QUALITY__P7 12.0 -#endif - -/*============================================================================ - FXAA QUALITY - LOW DITHER PRESETS -============================================================================*/ -#if (FXAA_QUALITY__PRESET == 20) - #define FXAA_QUALITY__PS 3 - #define FXAA_QUALITY__P0 1.5 - #define FXAA_QUALITY__P1 2.0 - #define FXAA_QUALITY__P2 8.0 -#endif -/*--------------------------------------------------------------------------*/ -#if (FXAA_QUALITY__PRESET == 21) - #define FXAA_QUALITY__PS 4 - #define FXAA_QUALITY__P0 1.0 - #define FXAA_QUALITY__P1 1.5 - #define FXAA_QUALITY__P2 2.0 - #define FXAA_QUALITY__P3 8.0 -#endif -/*--------------------------------------------------------------------------*/ -#if (FXAA_QUALITY__PRESET == 22) - #define FXAA_QUALITY__PS 5 - #define FXAA_QUALITY__P0 1.0 - #define FXAA_QUALITY__P1 1.5 - #define FXAA_QUALITY__P2 2.0 - #define FXAA_QUALITY__P3 2.0 - #define FXAA_QUALITY__P4 8.0 -#endif -/*--------------------------------------------------------------------------*/ -#if (FXAA_QUALITY__PRESET == 23) - #define FXAA_QUALITY__PS 6 - #define FXAA_QUALITY__P0 1.0 - #define FXAA_QUALITY__P1 1.5 - #define FXAA_QUALITY__P2 2.0 - #define FXAA_QUALITY__P3 2.0 - #define FXAA_QUALITY__P4 2.0 - #define FXAA_QUALITY__P5 8.0 -#endif -/*--------------------------------------------------------------------------*/ -#if (FXAA_QUALITY__PRESET == 24) - #define FXAA_QUALITY__PS 7 - #define FXAA_QUALITY__P0 1.0 - #define FXAA_QUALITY__P1 1.5 - #define FXAA_QUALITY__P2 2.0 - #define FXAA_QUALITY__P3 2.0 - #define FXAA_QUALITY__P4 2.0 - #define FXAA_QUALITY__P5 3.0 - #define FXAA_QUALITY__P6 8.0 -#endif -/*--------------------------------------------------------------------------*/ -#if (FXAA_QUALITY__PRESET == 25) - #define FXAA_QUALITY__PS 8 - #define FXAA_QUALITY__P0 1.0 - #define FXAA_QUALITY__P1 1.5 - #define FXAA_QUALITY__P2 2.0 - #define FXAA_QUALITY__P3 2.0 - #define FXAA_QUALITY__P4 2.0 - #define FXAA_QUALITY__P5 2.0 - #define FXAA_QUALITY__P6 4.0 - #define FXAA_QUALITY__P7 8.0 -#endif -/*--------------------------------------------------------------------------*/ -#if (FXAA_QUALITY__PRESET == 26) - #define FXAA_QUALITY__PS 9 - #define FXAA_QUALITY__P0 1.0 - #define FXAA_QUALITY__P1 1.5 - #define FXAA_QUALITY__P2 2.0 - #define FXAA_QUALITY__P3 2.0 - #define FXAA_QUALITY__P4 2.0 - #define FXAA_QUALITY__P5 2.0 - #define FXAA_QUALITY__P6 2.0 - #define FXAA_QUALITY__P7 4.0 - #define FXAA_QUALITY__P8 8.0 -#endif -/*--------------------------------------------------------------------------*/ -#if (FXAA_QUALITY__PRESET == 27) - #define FXAA_QUALITY__PS 10 - #define FXAA_QUALITY__P0 1.0 - #define FXAA_QUALITY__P1 1.5 - #define FXAA_QUALITY__P2 2.0 - #define FXAA_QUALITY__P3 2.0 - #define FXAA_QUALITY__P4 2.0 - #define FXAA_QUALITY__P5 2.0 - #define FXAA_QUALITY__P6 2.0 - #define FXAA_QUALITY__P7 2.0 - #define FXAA_QUALITY__P8 4.0 - #define FXAA_QUALITY__P9 8.0 -#endif -/*--------------------------------------------------------------------------*/ -#if (FXAA_QUALITY__PRESET == 28) - #define FXAA_QUALITY__PS 11 - #define FXAA_QUALITY__P0 1.0 - #define FXAA_QUALITY__P1 1.5 - #define FXAA_QUALITY__P2 2.0 - #define FXAA_QUALITY__P3 2.0 - #define FXAA_QUALITY__P4 2.0 - #define FXAA_QUALITY__P5 2.0 - #define FXAA_QUALITY__P6 2.0 - #define FXAA_QUALITY__P7 2.0 - #define FXAA_QUALITY__P8 2.0 - #define FXAA_QUALITY__P9 4.0 - #define FXAA_QUALITY__P10 8.0 -#endif -/*--------------------------------------------------------------------------*/ -#if (FXAA_QUALITY__PRESET == 29) - #define FXAA_QUALITY__PS 12 - #define FXAA_QUALITY__P0 1.0 - #define FXAA_QUALITY__P1 1.5 - #define FXAA_QUALITY__P2 2.0 - #define FXAA_QUALITY__P3 2.0 - #define FXAA_QUALITY__P4 2.0 - #define FXAA_QUALITY__P5 2.0 - #define FXAA_QUALITY__P6 2.0 - #define FXAA_QUALITY__P7 2.0 - #define FXAA_QUALITY__P8 2.0 - #define FXAA_QUALITY__P9 2.0 - #define FXAA_QUALITY__P10 4.0 - #define FXAA_QUALITY__P11 8.0 -#endif - -/*============================================================================ - FXAA QUALITY - EXTREME QUALITY -============================================================================*/ -#if (FXAA_QUALITY__PRESET == 39) - #define FXAA_QUALITY__PS 12 - #define FXAA_QUALITY__P0 1.0 - #define FXAA_QUALITY__P1 1.0 - #define FXAA_QUALITY__P2 1.0 - #define FXAA_QUALITY__P3 1.0 - #define FXAA_QUALITY__P4 1.0 - #define FXAA_QUALITY__P5 1.5 - #define FXAA_QUALITY__P6 2.0 - #define FXAA_QUALITY__P7 2.0 - #define FXAA_QUALITY__P8 2.0 - #define FXAA_QUALITY__P9 2.0 - #define FXAA_QUALITY__P10 4.0 - #define FXAA_QUALITY__P11 8.0 -#endif - - - -/*============================================================================ - - API PORTING - -============================================================================*/ -#if FXAA_GLSL_120 - // Requires, - // #version 120 - // And at least, - // #extension GL_EXT_gpu_shader4 : enable - // (or set FXAA_FAST_PIXEL_OFFSET 1 to work like DX9) - #define half float - #define half2 vec2 - #define half3 vec3 - #define half4 vec4 - #define int2 ivec2 - #define float2 vec2 - #define float3 vec3 - #define float4 vec4 - #define FxaaInt2 ivec2 - #define FxaaFloat2 vec2 - #define FxaaFloat3 vec3 - #define FxaaFloat4 vec4 - #define FxaaDiscard discard - #define FxaaDot3(a, b) dot(a, b) - #define FxaaSat(x) clamp(x, 0.0, 1.0) - #define FxaaLerp(x,y,s) mix(x,y,s) - #define FxaaTex sampler2D - #define FxaaTexTop(t, p) texture2DLod(t, p, 0.0) - #if (FXAA_FAST_PIXEL_OFFSET == 1) - #define FxaaTexOff(t, p, o, r) texture2DLodOffset(t, p, 0.0, o) - #else - #define FxaaTexOff(t, p, o, r) texture2DLod(t, p + (o * r), 0.0) - #endif - #if (FXAA_GATHER4_ALPHA == 1) - // use #extension GL_ARB_gpu_shader5 : enable - #define FxaaTexAlpha4(t, p, r) textureGather(t, p, 3) - #define FxaaTexOffAlpha4(t, p, o, r) textureGatherOffset(t, p, o, 3) - #endif -#endif -/*--------------------------------------------------------------------------*/ -#if FXAA_GLSL_130 - // Requires "#version 130" or better - #define half float - #define half2 vec2 - #define half3 vec3 - #define half4 vec4 - #define int2 ivec2 - #define float2 vec2 - #define float3 vec3 - #define float4 vec4 - #define FxaaInt2 ivec2 - #define FxaaFloat2 vec2 - #define FxaaFloat3 vec3 - #define FxaaFloat4 vec4 - #define FxaaDiscard discard - #define FxaaDot3(a, b) dot(a, b) - #define FxaaSat(x) clamp(x, 0.0, 1.0) - #define FxaaLerp(x,y,s) mix(x,y,s) - #define FxaaTex sampler2D - #define FxaaTexTop(t, p) textureLod(t, p, 0.0) - #define FxaaTexOff(t, p, o, r) textureLodOffset(t, p, 0.0, o) - #if (FXAA_GATHER4_ALPHA == 1) - // use #extension GL_ARB_gpu_shader5 : enable - #define FxaaTexAlpha4(t, p, r) textureGather(t, p, 3) - #define FxaaTexOffAlpha4(t, p, o, r) textureGatherOffset(t, p, o, 3) - #endif -#endif -/*--------------------------------------------------------------------------*/ -#if (FXAA_HLSL_3 == 1) || (FXAA_360 == 1) - #define int2 float2 - #define FxaaInt2 float2 - #define FxaaFloat2 float2 - #define FxaaFloat3 float3 - #define FxaaFloat4 float4 - #define FxaaDiscard clip(-1) - #define FxaaDot3(a, b) dot(a, b) - #define FxaaSat(x) saturate(x) - #define FxaaLerp(x,y,s) lerp(x,y,s) - #define FxaaTex sampler2D - #define FxaaTexTop(t, p) tex2Dlod(t, float4(p, 0.0, 0.0)) - #define FxaaTexOff(t, p, o, r) tex2Dlod(t, float4(p + (o * r), 0, 0)) -#endif -/*--------------------------------------------------------------------------*/ -#if FXAA_HLSL_4 - #define FxaaInt2 int2 - #define FxaaFloat2 float2 - #define FxaaFloat3 float3 - #define FxaaFloat4 float4 - #define FxaaDiscard clip(-1) - #define FxaaDot3(a, b) dot(a, b) - #define FxaaSat(x) saturate(x) - #define FxaaLerp(x,y,s) lerp(x,y,s) - struct FxaaTex { SamplerState smpl; Texture2D tex; }; - #define FxaaTexTop(t, p) t.tex.SampleLevel(t.smpl, p, 0.0) - #define FxaaTexOff(t, p, o, r) t.tex.SampleLevel(t.smpl, p, 0.0, o) -#endif -/*--------------------------------------------------------------------------*/ -#if FXAA_HLSL_5 - #define FxaaInt2 int2 - #define FxaaFloat2 float2 - #define FxaaFloat3 float3 - #define FxaaFloat4 float4 - #define FxaaDiscard clip(-1) - #define FxaaDot3(a, b) dot(a, b) - #define FxaaSat(x) saturate(x) - #define FxaaLerp(x,y,s) lerp(x,y,s) - struct FxaaTex { SamplerState smpl; Texture2D tex; }; - #define FxaaTexTop(t, p) t.tex.SampleLevel(t.smpl, p, 0.0) - #define FxaaTexOff(t, p, o, r) t.tex.SampleLevel(t.smpl, p, 0.0, o) - #define FxaaTexAlpha4(t, p, r) t.tex.GatherAlpha(t.smpl, p) - #define FxaaTexOffAlpha4(t, p, o, r) t.tex.GatherAlpha(t.smpl, p, o) -#endif - - - -/*============================================================================ - - FXAA3 CONSOLE - 360 PIXEL SHADER - ------------------------------------------------------------------------------- -Might be some optimizations left here, -as of this latest change didn't have a PIX dump to verify if TEX bound. -============================================================================*/ -#if (FXAA_360 == 1) -/*--------------------------------------------------------------------------*/ -half4 FxaaPixelShader( - // {xy} = center of pixel - float2 pos, - // {xy__} = upper left of pixel - // {__zw} = lower right of pixel - float4 posPos, - // {rgb_} = color in linear or perceptual color space - // {___a} = alpha output is junk value - FxaaTex tex, - // This must be from a constant/uniform. - // {xy} = rcpFrame not used on PC version of FXAA Console - float2 rcpFrame, - // This must be from a constant/uniform. - // {x___} = 2.0/screenWidthInPixels - // {_y__} = 2.0/screenHeightInPixels - // {__z_} = 0.5/screenWidthInPixels - // {___w} = 0.5/screenHeightInPixels - float4 rcpFrameOpt -) { -/*--------------------------------------------------------------------------*/ - half4 lumaNwNeSwSe; - lumaNwNeSwSe.x = FxaaTexTop(tex, posPos.xy).w; - lumaNwNeSwSe.y = FxaaTexTop(tex, posPos.zy).w; - lumaNwNeSwSe.z = FxaaTexTop(tex, posPos.xw).w; - lumaNwNeSwSe.w = FxaaTexTop(tex, posPos.zw).w; -/*--------------------------------------------------------------------------*/ - half4 rgbyM = FxaaTexTop(tex, pos.xy); -/*--------------------------------------------------------------------------*/ - lumaNwNeSwSe.y += 1.0/384.0; -/*--------------------------------------------------------------------------*/ - half2 lumaMinTemp = min(lumaNwNeSwSe.xy, lumaNwNeSwSe.zw); - half2 lumaMaxTemp = max(lumaNwNeSwSe.xy, lumaNwNeSwSe.zw); -/*--------------------------------------------------------------------------*/ - half lumaMin = min(lumaMinTemp.x, lumaMinTemp.y); - half lumaMax = max(lumaMaxTemp.x, lumaMaxTemp.y); -/*--------------------------------------------------------------------------*/ - half lumaMinM = min(lumaMin, rgbyM.w); - half lumaMaxM = max(lumaMax, rgbyM.w); - if((lumaMaxM - lumaMinM) < max(FXAA_CONSOLE__EDGE_THRESHOLD_MIN, lumaMax * FXAA_CONSOLE__EDGE_THRESHOLD)) return rgbyM; -/*--------------------------------------------------------------------------*/ - half2 dir; - dir.x = dot(lumaNwNeSwSe, float4(-1.0, -1.0, 1.0, 1.0)); - dir.y = dot(lumaNwNeSwSe, float4( 1.0, -1.0, 1.0,-1.0)); -/*--------------------------------------------------------------------------*/ - half2 dir1; - dir1 = normalize(dir.xy); -/*--------------------------------------------------------------------------*/ - half dirAbsMinTimesC = min(abs(dir1.x), abs(dir1.y)) * FXAA_CONSOLE__EDGE_SHARPNESS; - half2 dir2; - dir2 = clamp(dir1.xy / dirAbsMinTimesC, -2.0, 2.0); -/*--------------------------------------------------------------------------*/ - half4 rgbyN1 = FxaaTexTop(tex, pos.xy - dir1 * rcpFrameOpt.zw); - half4 rgbyP1 = FxaaTexTop(tex, pos.xy + dir1 * rcpFrameOpt.zw); - half4 rgbyN2 = FxaaTexTop(tex, pos.xy - dir2 * rcpFrameOpt.xy); - half4 rgbyP2 = FxaaTexTop(tex, pos.xy + dir2 * rcpFrameOpt.xy); -/*--------------------------------------------------------------------------*/ - half4 rgbyA = rgbyN1 * 0.5 + rgbyP1 * 0.5; - half4 rgbyB = rgbyN2 * 0.25 + rgbyP2 * 0.25 + rgbyA * 0.5; -/*--------------------------------------------------------------------------*/ - bool twoTap = (rgbyB.w < lumaMin) || (rgbyB.w > lumaMax); - if(twoTap) rgbyB.xyz = rgbyA.xyz; - return rgbyB; } -/*==========================================================================*/ -#endif - - - -/*============================================================================ - - FXAA3 CONSOLE - 360 PIXEL SHADER OPTIMIZED PROTOTYPE - ------------------------------------------------------------------------------- -This prototype optimized version thanks to suggestions from Andy Luedke. -Should be fully tex bound in all cases. -As of the FXAA 3.10 release I have not tested this code, -but at least the missing ";" was fixed. -If it does not work, please let me know so I can fix it. ------------------------------------------------------------------------------- -Extra requirements, -(1.) Different inputs: no posPos. -(2.) Different inputs: alias three samplers with different exp bias settings! -(3.) New constants: setup fxaaConst as described below. -============================================================================*/ -#if (FXAA_360_OPT == 1) -/*--------------------------------------------------------------------------*/ -[reduceTempRegUsage(4)] -float4 FxaaPixelShader( - // {xy} = center of pixel - float2 pos, - // Three samplers, - // texExpBias0 = exponent bias 0 - // texExpBiasNeg1 = exponent bias -1 - // texExpBiasNeg2 = exponent bias -2 - // {rgb_} = color in linear or perceptual color space - // {___a} = alpha output is junk value - uniform sampler2D texExpBias0, - uniform sampler2D texExpBiasNeg1, - uniform sampler2D texExpBiasNeg2, - // These must be in physical constant registers and NOT immedates - // Immedates will result in compiler un-optimizing - // width = screen width in pixels - // height = screen height in pixels - fxaaConstDir, // float4(1.0, -1.0, 0.25, -0.25); - fxaaConstInner, // float4(0.5/width, 0.5/height, -0.5/width, -0.5/height); - fxaaConstOuter // float4(8.0/width, 8.0/height, -4.0/width, -4.0/height); -) { -/*--------------------------------------------------------------------------*/ - float4 lumaNwNeSwSe; - asm { - tfetch2D lumaNwNeSwSe.w___, texExpBias0, pos.xy, OffsetX = -0.5, OffsetY = -0.5, UseComputedLOD=false - tfetch2D lumaNwNeSwSe._w__, texExpBias0, pos.xy, OffsetX = 0.5, OffsetY = -0.5, UseComputedLOD=false - tfetch2D lumaNwNeSwSe.__w_, texExpBias0, pos.xy, OffsetX = -0.5, OffsetY = 0.5, UseComputedLOD=false - tfetch2D lumaNwNeSwSe.___w, texExpBias0, pos.xy, OffsetX = 0.5, OffsetY = 0.5, UseComputedLOD=false - }; -/*--------------------------------------------------------------------------*/ - lumaNwNeSwSe.y += 1.0/384.0; - float2 lumaMinTemp = min(lumaNwNeSwSe.xy, lumaNwNeSwSe.zw); - float2 lumaMaxTemp = max(lumaNwNeSwSe.xy, lumaNwNeSwSe.zw); - float lumaMin = min(lumaMinTemp.x, lumaMinTemp.y); - float lumaMax = max(lumaMaxTemp.x, lumaMaxTemp.y); -/*--------------------------------------------------------------------------*/ - float4 rgbyM = tex2Dlod(texExpBias0, float4(pos.xy, 0.0, 0.0)); - float4 lumaMinM = min(lumaMin, rgbyM.w); - float4 lumaMaxM = max(lumaMax, rgbyM.w); - if((lumaMaxM - lumaMinM) < max(FXAA_CONSOLE__EDGE_THRESHOLD_MIN, lumaMax * FXAA_CONSOLE__EDGE_THRESHOLD)) return rgbyM; -/*--------------------------------------------------------------------------*/ - float2 dir; - dir.x = dot(lumaNwNeSwSe, fxaaConstDir.yyxx); - dir.y = dot(lumaNwNeSwSe, fxaaConstDir.xyxy); - dir = normalize(dir); -/*--------------------------------------------------------------------------*/ - float4 dir1 = dir.xyxy * fxaaConstInner.xyzw; -/*--------------------------------------------------------------------------*/ - float4 dir2; - float dirAbsMinTimesC = min(abs(dir.x), abs(dir.y)); - dir2 = saturate(fxaaConstOuter.zzww * dir.xyxy / FXAA_CONSOLE__EDGE_SHARPNESS / dirAbsMinTimesC + 0.5); - dir2 = dir2 * fxaaConstOuter.xyxy + fxaaConstOuter.zwzw; -/*--------------------------------------------------------------------------*/ - float4 rgbyN1 = tex2Dlod(texExpBiasNeg1, float4(pos.xy + dir1.xy, 0.0, 0.0)); - float4 rgbyP1 = tex2Dlod(texExpBiasNeg1, float4(pos.xy + dir1.zw, 0.0, 0.0)); - float4 rgbyN2 = tex2Dlod(texExpBiasNeg2, float4(pos.xy + dir2.xy, 0.0, 0.0)); - float4 rgbyP2 = tex2Dlod(texExpBiasNeg2, float4(pos.xy + dir2.zw, 0.0, 0.0)); -/*--------------------------------------------------------------------------*/ - half4 rgbyA = rgbyN1 + rgbyP1; - half4 rgbyB = rgbyN2 + rgbyP2 * 0.5 + rgbyA; -/*--------------------------------------------------------------------------*/ - float4 rgbyR = ((rgbyB.w - lumaMax) > 0.0) ? rgbyA : rgbyB; - rgbyR = ((rgbyB.w - lumaMin) > 0.0) ? rgbyR : rgbyA; - return rgbyR; } -/*==========================================================================*/ -#endif - - - - -/*============================================================================ - - FXAA3 CONSOLE - OPTIMIZED PS3 PIXEL SHADER (NO EARLY EXIT) - -============================================================================== -The code below does not exactly match the assembly. -I have a feeling that 12 cycles is possible, but was not able to get there. -Might have to increase register count to get full performance. -Note this shader does not use perspective interpolation. - -Use the following cgc options, - - --fenable-bx2 --fastmath --fastprecision --nofloatbindings - ------------------------------------------------------------------------------- - NVSHADERPERF OUTPUT ------------------------------------------------------------------------------- -For reference and to aid in debug, output of NVShaderPerf should match this, - -Shader to schedule: - 0: texpkb h0.w(TRUE), v5.zyxx, #0 - 2: addh h2.z(TRUE), h0.w, constant(0.001953, 0.000000, 0.000000, 0.000000).x - 4: texpkb h0.w(TRUE), v5.xwxx, #0 - 6: addh h0.z(TRUE), -h2, h0.w - 7: texpkb h1.w(TRUE), v5, #0 - 9: addh h0.x(TRUE), h0.z, -h1.w - 10: addh h3.w(TRUE), h0.z, h1 - 11: texpkb h2.w(TRUE), v5.zwzz, #0 - 13: addh h0.z(TRUE), h3.w, -h2.w - 14: addh h0.x(TRUE), h2.w, h0 - 15: nrmh h1.xz(TRUE), h0_n - 16: minh_m8 h0.x(TRUE), |h1|, |h1.z| - 17: maxh h4.w(TRUE), h0, h1 - 18: divx h2.xy(TRUE), h1_n.xzzw, h0_n - 19: movr r1.zw(TRUE), v4.xxxy - 20: madr r2.xz(TRUE), -h1, constant(cConst5.x, cConst5.y, cConst5.z, cConst5.w).zzww, r1.zzww - 22: minh h5.w(TRUE), h0, h1 - 23: texpkb h0(TRUE), r2.xzxx, #0 - 25: madr r0.zw(TRUE), h1.xzxz, constant(cConst5.x, cConst5.y, cConst5.z, cConst5.w), r1 - 27: maxh h4.x(TRUE), h2.z, h2.w - 28: texpkb h1(TRUE), r0.zwzz, #0 - 30: addh_d2 h1(TRUE), h0, h1 - 31: madr r0.xy(TRUE), -h2, constant(cConst5.x, cConst5.y, cConst5.z, cConst5.w).xyxx, r1.zwzz - 33: texpkb h0(TRUE), r0, #0 - 35: minh h4.z(TRUE), h2, h2.w - 36: fenct TRUE - 37: madr r1.xy(TRUE), h2, constant(cConst5.x, cConst5.y, cConst5.z, cConst5.w).xyxx, r1.zwzz - 39: texpkb h2(TRUE), r1, #0 - 41: addh_d2 h0(TRUE), h0, h2 - 42: maxh h2.w(TRUE), h4, h4.x - 43: minh h2.x(TRUE), h5.w, h4.z - 44: addh_d2 h0(TRUE), h0, h1 - 45: slth h2.x(TRUE), h0.w, h2 - 46: sgth h2.w(TRUE), h0, h2 - 47: movh h0(TRUE), h0 - 48: addx.c0 rc(TRUE), h2, h2.w - 49: movh h0(c0.NE.x), h1 - -IPU0 ------ Simplified schedule: -------- -Pass | Unit | uOp | PC: Op ------+--------+------+------------------------- - 1 | SCT0/1 | mov | 0: TXLr h0.w, g[TEX1].zyxx, const.xxxx, TEX0; - | TEX | txl | 0: TXLr h0.w, g[TEX1].zyxx, const.xxxx, TEX0; - | SCB1 | add | 2: ADDh h2.z, h0.--w-, const.--x-; - | | | - 2 | SCT0/1 | mov | 4: TXLr h0.w, g[TEX1].xwxx, const.xxxx, TEX0; - | TEX | txl | 4: TXLr h0.w, g[TEX1].xwxx, const.xxxx, TEX0; - | SCB1 | add | 6: ADDh h0.z,-h2, h0.--w-; - | | | - 3 | SCT0/1 | mov | 7: TXLr h1.w, g[TEX1], const.xxxx, TEX0; - | TEX | txl | 7: TXLr h1.w, g[TEX1], const.xxxx, TEX0; - | SCB0 | add | 9: ADDh h0.x, h0.z---,-h1.w---; - | SCB1 | add | 10: ADDh h3.w, h0.---z, h1; - | | | - 4 | SCT0/1 | mov | 11: TXLr h2.w, g[TEX1].zwzz, const.xxxx, TEX0; - | TEX | txl | 11: TXLr h2.w, g[TEX1].zwzz, const.xxxx, TEX0; - | SCB0 | add | 14: ADDh h0.x, h2.w---, h0; - | SCB1 | add | 13: ADDh h0.z, h3.--w-,-h2.--w-; - | | | - 5 | SCT1 | mov | 15: NRMh h1.xz, h0; - | SRB | nrm | 15: NRMh h1.xz, h0; - | SCB0 | min | 16: MINh*8 h0.x, |h1|, |h1.z---|; - | SCB1 | max | 17: MAXh h4.w, h0, h1; - | | | - 6 | SCT0 | div | 18: DIVx h2.xy, h1.xz--, h0; - | SCT1 | mov | 19: MOVr r1.zw, g[TEX0].--xy; - | SCB0 | mad | 20: MADr r2.xz,-h1, const.z-w-, r1.z-w-; - | SCB1 | min | 22: MINh h5.w, h0, h1; - | | | - 7 | SCT0/1 | mov | 23: TXLr h0, r2.xzxx, const.xxxx, TEX0; - | TEX | txl | 23: TXLr h0, r2.xzxx, const.xxxx, TEX0; - | SCB0 | max | 27: MAXh h4.x, h2.z---, h2.w---; - | SCB1 | mad | 25: MADr r0.zw, h1.--xz, const, r1; - | | | - 8 | SCT0/1 | mov | 28: TXLr h1, r0.zwzz, const.xxxx, TEX0; - | TEX | txl | 28: TXLr h1, r0.zwzz, const.xxxx, TEX0; - | SCB0/1 | add | 30: ADDh/2 h1, h0, h1; - | | | - 9 | SCT0 | mad | 31: MADr r0.xy,-h2, const.xy--, r1.zw--; - | SCT1 | mov | 33: TXLr h0, r0, const.zzzz, TEX0; - | TEX | txl | 33: TXLr h0, r0, const.zzzz, TEX0; - | SCB1 | min | 35: MINh h4.z, h2, h2.--w-; - | | | - 10 | SCT0 | mad | 37: MADr r1.xy, h2, const.xy--, r1.zw--; - | SCT1 | mov | 39: TXLr h2, r1, const.zzzz, TEX0; - | TEX | txl | 39: TXLr h2, r1, const.zzzz, TEX0; - | SCB0/1 | add | 41: ADDh/2 h0, h0, h2; - | | | - 11 | SCT0 | min | 43: MINh h2.x, h5.w---, h4.z---; - | SCT1 | max | 42: MAXh h2.w, h4, h4.---x; - | SCB0/1 | add | 44: ADDh/2 h0, h0, h1; - | | | - 12 | SCT0 | set | 45: SLTh h2.x, h0.w---, h2; - | SCT1 | set | 46: SGTh h2.w, h0, h2; - | SCB0/1 | mul | 47: MOVh h0, h0; - | | | - 13 | SCT0 | mad | 48: ADDxc0_s rc, h2, h2.w---; - | SCB0/1 | mul | 49: MOVh h0(NE0.xxxx), h1; - -Pass SCT TEX SCB - 1: 0% 100% 25% - 2: 0% 100% 25% - 3: 0% 100% 50% - 4: 0% 100% 50% - 5: 0% 0% 50% - 6: 100% 0% 75% - 7: 0% 100% 75% - 8: 0% 100% 100% - 9: 0% 100% 25% - 10: 0% 100% 100% - 11: 50% 0% 100% - 12: 50% 0% 100% - 13: 25% 0% 100% - -MEAN: 17% 61% 67% - -Pass SCT0 SCT1 TEX SCB0 SCB1 - 1: 0% 0% 100% 0% 100% - 2: 0% 0% 100% 0% 100% - 3: 0% 0% 100% 100% 100% - 4: 0% 0% 100% 100% 100% - 5: 0% 0% 0% 100% 100% - 6: 100% 100% 0% 100% 100% - 7: 0% 0% 100% 100% 100% - 8: 0% 0% 100% 100% 100% - 9: 0% 0% 100% 0% 100% - 10: 0% 0% 100% 100% 100% - 11: 100% 100% 0% 100% 100% - 12: 100% 100% 0% 100% 100% - 13: 100% 0% 0% 100% 100% - -MEAN: 30% 23% 61% 76% 100% -Fragment Performance Setup: Driver RSX Compiler, GPU RSX, Flags 0x5 -Results 13 cycles, 3 r regs, 923,076,923 pixels/s -============================================================================*/ -#if (FXAA_PS3 == 1) && (FXAA_EARLY_EXIT == 0) -/*--------------------------------------------------------------------------*/ -#pragma disablepc all -#pragma option O3 -#pragma option OutColorPrec=fp16 -#pragma texformat default RGBA8 -/*==========================================================================*/ -half4 FxaaPixelShader( - // {xy} = center of pixel - float2 pos, - // {xy__} = upper left of pixel - // {__zw} = lower right of pixel - float4 posPos, - // {rgb_} = color in linear or perceptual color space - // {___a} = luma in perceptual color space (not linear) - sampler2D tex, - // This must be from a constant/uniform. - // {xy} = rcpFrame not used on PS3 - float2 rcpFrame, - // This must be from a constant/uniform. - // {x___} = 2.0/screenWidthInPixels - // {_y__} = 2.0/screenHeightInPixels - // {__z_} = 0.5/screenWidthInPixels - // {___w} = 0.5/screenHeightInPixels - float4 rcpFrameOpt -) { -/*--------------------------------------------------------------------------*/ -// (1) - half4 dir; - half4 lumaNe = h4tex2Dlod(tex, half4(posPos.zy, 0, 0)); - lumaNe.w += half(1.0/512.0); - dir.x = -lumaNe.w; - dir.z = -lumaNe.w; -/*--------------------------------------------------------------------------*/ -// (2) - half4 lumaSw = h4tex2Dlod(tex, half4(posPos.xw, 0, 0)); - dir.x += lumaSw.w; - dir.z += lumaSw.w; -/*--------------------------------------------------------------------------*/ -// (3) - half4 lumaNw = h4tex2Dlod(tex, half4(posPos.xy, 0, 0)); - dir.x -= lumaNw.w; - dir.z += lumaNw.w; -/*--------------------------------------------------------------------------*/ -// (4) - half4 lumaSe = h4tex2Dlod(tex, half4(posPos.zw, 0, 0)); - dir.x += lumaSe.w; - dir.z -= lumaSe.w; -/*--------------------------------------------------------------------------*/ -// (5) - half4 dir1_pos; - dir1_pos.xy = normalize(dir.xyz).xz; - half dirAbsMinTimesC = min(abs(dir1_pos.x), abs(dir1_pos.y)) * half(FXAA_CONSOLE__EDGE_SHARPNESS); -/*--------------------------------------------------------------------------*/ -// (6) - half4 dir2_pos; - dir2_pos.xy = clamp(dir1_pos.xy / dirAbsMinTimesC, half(-2.0), half(2.0)); - dir1_pos.zw = pos.xy; - dir2_pos.zw = pos.xy; - half4 temp1N; - temp1N.xy = dir1_pos.zw - dir1_pos.xy * rcpFrameOpt.zw; -/*--------------------------------------------------------------------------*/ -// (7) - temp1N = h4tex2Dlod(tex, half4(temp1N.xy, 0.0, 0.0)); - half4 rgby1; - rgby1.xy = dir1_pos.zw + dir1_pos.xy * rcpFrameOpt.zw; -/*--------------------------------------------------------------------------*/ -// (8) - rgby1 = h4tex2Dlod(tex, half4(rgby1.xy, 0.0, 0.0)); - rgby1 = (temp1N + rgby1) * 0.5; -/*--------------------------------------------------------------------------*/ -// (9) - half4 temp2N; - temp2N.xy = dir2_pos.zw - dir2_pos.xy * rcpFrameOpt.xy; - temp2N = h4tex2Dlod(tex, half4(temp2N.xy, 0.0, 0.0)); -/*--------------------------------------------------------------------------*/ -// (10) - half4 rgby2; - rgby2.xy = dir2_pos.zw + dir2_pos.xy * rcpFrameOpt.xy; - rgby2 = h4tex2Dlod(tex, half4(rgby2.xy, 0.0, 0.0)); - rgby2 = (temp2N + rgby2) * 0.5; -/*--------------------------------------------------------------------------*/ -// (11) - // compilier moves these scalar ops up to other cycles - half lumaMin = min(min(lumaNw.w, lumaSw.w), min(lumaNe.w, lumaSe.w)); - half lumaMax = max(max(lumaNw.w, lumaSw.w), max(lumaNe.w, lumaSe.w)); - rgby2 = (rgby2 + rgby1) * 0.5; -/*--------------------------------------------------------------------------*/ -// (12) - bool twoTapLt = rgby2.w < lumaMin; - bool twoTapGt = rgby2.w > lumaMax; -/*--------------------------------------------------------------------------*/ -// (13) - if(twoTapLt || twoTapGt) rgby2 = rgby1; -/*--------------------------------------------------------------------------*/ - return rgby2; } -/*==========================================================================*/ -#endif - - - -/*============================================================================ - - FXAA3 CONSOLE - OPTIMIZED PS3 PIXEL SHADER (WITH EARLY EXIT) - -============================================================================== -The code mostly matches the assembly. -I have a feeling that 14 cycles is possible, but was not able to get there. -Might have to increase register count to get full performance. -Note this shader does not use perspective interpolation. - -Use the following cgc options, - - --fenable-bx2 --fastmath --fastprecision --nofloatbindings - ------------------------------------------------------------------------------- - NVSHADERPERF OUTPUT ------------------------------------------------------------------------------- -For reference and to aid in debug, output of NVShaderPerf should match this, - -Shader to schedule: - 0: texpkb h0.w(TRUE), v5.zyxx, #0 - 2: addh h2.y(TRUE), h0.w, constant(0.001953, 0.000000, 0.000000, 0.000000).x - 4: texpkb h1.w(TRUE), v5.xwxx, #0 - 6: addh h0.x(TRUE), h1.w, -h2.y - 7: texpkb h2.w(TRUE), v5.zwzz, #0 - 9: minh h4.w(TRUE), h2.y, h2 - 10: maxh h5.x(TRUE), h2.y, h2.w - 11: texpkb h0.w(TRUE), v5, #0 - 13: addh h3.w(TRUE), -h0, h0.x - 14: addh h0.x(TRUE), h0.w, h0 - 15: addh h0.z(TRUE), -h2.w, h0.x - 16: addh h0.x(TRUE), h2.w, h3.w - 17: minh h5.y(TRUE), h0.w, h1.w - 18: nrmh h2.xz(TRUE), h0_n - 19: minh_m8 h2.w(TRUE), |h2.x|, |h2.z| - 20: divx h4.xy(TRUE), h2_n.xzzw, h2_n.w - 21: movr r1.zw(TRUE), v4.xxxy - 22: maxh h2.w(TRUE), h0, h1 - 23: fenct TRUE - 24: madr r0.xy(TRUE), -h2.xzzw, constant(cConst5.x, cConst5.y, cConst5.z, cConst5.w).zwzz, r1.zwzz - 26: texpkb h0(TRUE), r0, #0 - 28: maxh h5.x(TRUE), h2.w, h5 - 29: minh h5.w(TRUE), h5.y, h4 - 30: madr r1.xy(TRUE), h2.xzzw, constant(cConst5.x, cConst5.y, cConst5.z, cConst5.w).zwzz, r1.zwzz - 32: texpkb h2(TRUE), r1, #0 - 34: addh_d2 h2(TRUE), h0, h2 - 35: texpkb h1(TRUE), v4, #0 - 37: maxh h5.y(TRUE), h5.x, h1.w - 38: minh h4.w(TRUE), h1, h5 - 39: madr r0.xy(TRUE), -h4, constant(cConst5.x, cConst5.y, cConst5.z, cConst5.w).xyxx, r1.zwzz - 41: texpkb h0(TRUE), r0, #0 - 43: addh_m8 h5.z(TRUE), h5.y, -h4.w - 44: madr r2.xy(TRUE), h4, constant(cConst5.x, cConst5.y, cConst5.z, cConst5.w).xyxx, r1.zwzz - 46: texpkb h3(TRUE), r2, #0 - 48: addh_d2 h0(TRUE), h0, h3 - 49: addh_d2 h3(TRUE), h0, h2 - 50: movh h0(TRUE), h3 - 51: slth h3.x(TRUE), h3.w, h5.w - 52: sgth h3.w(TRUE), h3, h5.x - 53: addx.c0 rc(TRUE), h3.x, h3 - 54: slth.c0 rc(TRUE), h5.z, h5 - 55: movh h0(c0.NE.w), h2 - 56: movh h0(c0.NE.x), h1 - -IPU0 ------ Simplified schedule: -------- -Pass | Unit | uOp | PC: Op ------+--------+------+------------------------- - 1 | SCT0/1 | mov | 0: TXLr h0.w, g[TEX1].zyxx, const.xxxx, TEX0; - | TEX | txl | 0: TXLr h0.w, g[TEX1].zyxx, const.xxxx, TEX0; - | SCB0 | add | 2: ADDh h2.y, h0.-w--, const.-x--; - | | | - 2 | SCT0/1 | mov | 4: TXLr h1.w, g[TEX1].xwxx, const.xxxx, TEX0; - | TEX | txl | 4: TXLr h1.w, g[TEX1].xwxx, const.xxxx, TEX0; - | SCB0 | add | 6: ADDh h0.x, h1.w---,-h2.y---; - | | | - 3 | SCT0/1 | mov | 7: TXLr h2.w, g[TEX1].zwzz, const.xxxx, TEX0; - | TEX | txl | 7: TXLr h2.w, g[TEX1].zwzz, const.xxxx, TEX0; - | SCB0 | max | 10: MAXh h5.x, h2.y---, h2.w---; - | SCB1 | min | 9: MINh h4.w, h2.---y, h2; - | | | - 4 | SCT0/1 | mov | 11: TXLr h0.w, g[TEX1], const.xxxx, TEX0; - | TEX | txl | 11: TXLr h0.w, g[TEX1], const.xxxx, TEX0; - | SCB0 | add | 14: ADDh h0.x, h0.w---, h0; - | SCB1 | add | 13: ADDh h3.w,-h0, h0.---x; - | | | - 5 | SCT0 | mad | 16: ADDh h0.x, h2.w---, h3.w---; - | SCT1 | mad | 15: ADDh h0.z,-h2.--w-, h0.--x-; - | SCB0 | min | 17: MINh h5.y, h0.-w--, h1.-w--; - | | | - 6 | SCT1 | mov | 18: NRMh h2.xz, h0; - | SRB | nrm | 18: NRMh h2.xz, h0; - | SCB1 | min | 19: MINh*8 h2.w, |h2.---x|, |h2.---z|; - | | | - 7 | SCT0 | div | 20: DIVx h4.xy, h2.xz--, h2.ww--; - | SCT1 | mov | 21: MOVr r1.zw, g[TEX0].--xy; - | SCB1 | max | 22: MAXh h2.w, h0, h1; - | | | - 8 | SCT0 | mad | 24: MADr r0.xy,-h2.xz--, const.zw--, r1.zw--; - | SCT1 | mov | 26: TXLr h0, r0, const.xxxx, TEX0; - | TEX | txl | 26: TXLr h0, r0, const.xxxx, TEX0; - | SCB0 | max | 28: MAXh h5.x, h2.w---, h5; - | SCB1 | min | 29: MINh h5.w, h5.---y, h4; - | | | - 9 | SCT0 | mad | 30: MADr r1.xy, h2.xz--, const.zw--, r1.zw--; - | SCT1 | mov | 32: TXLr h2, r1, const.xxxx, TEX0; - | TEX | txl | 32: TXLr h2, r1, const.xxxx, TEX0; - | SCB0/1 | add | 34: ADDh/2 h2, h0, h2; - | | | - 10 | SCT0/1 | mov | 35: TXLr h1, g[TEX0], const.xxxx, TEX0; - | TEX | txl | 35: TXLr h1, g[TEX0], const.xxxx, TEX0; - | SCB0 | max | 37: MAXh h5.y, h5.-x--, h1.-w--; - | SCB1 | min | 38: MINh h4.w, h1, h5; - | | | - 11 | SCT0 | mad | 39: MADr r0.xy,-h4, const.xy--, r1.zw--; - | SCT1 | mov | 41: TXLr h0, r0, const.zzzz, TEX0; - | TEX | txl | 41: TXLr h0, r0, const.zzzz, TEX0; - | SCB0 | mad | 44: MADr r2.xy, h4, const.xy--, r1.zw--; - | SCB1 | add | 43: ADDh*8 h5.z, h5.--y-,-h4.--w-; - | | | - 12 | SCT0/1 | mov | 46: TXLr h3, r2, const.xxxx, TEX0; - | TEX | txl | 46: TXLr h3, r2, const.xxxx, TEX0; - | SCB0/1 | add | 48: ADDh/2 h0, h0, h3; - | | | - 13 | SCT0/1 | mad | 49: ADDh/2 h3, h0, h2; - | SCB0/1 | mul | 50: MOVh h0, h3; - | | | - 14 | SCT0 | set | 51: SLTh h3.x, h3.w---, h5.w---; - | SCT1 | set | 52: SGTh h3.w, h3, h5.---x; - | SCB0 | set | 54: SLThc0 rc, h5.z---, h5; - | SCB1 | add | 53: ADDxc0_s rc, h3.---x, h3; - | | | - 15 | SCT0/1 | mul | 55: MOVh h0(NE0.wwww), h2; - | SCB0/1 | mul | 56: MOVh h0(NE0.xxxx), h1; - -Pass SCT TEX SCB - 1: 0% 100% 25% - 2: 0% 100% 25% - 3: 0% 100% 50% - 4: 0% 100% 50% - 5: 50% 0% 25% - 6: 0% 0% 25% - 7: 100% 0% 25% - 8: 0% 100% 50% - 9: 0% 100% 100% - 10: 0% 100% 50% - 11: 0% 100% 75% - 12: 0% 100% 100% - 13: 100% 0% 100% - 14: 50% 0% 50% - 15: 100% 0% 100% - -MEAN: 26% 60% 56% - -Pass SCT0 SCT1 TEX SCB0 SCB1 - 1: 0% 0% 100% 100% 0% - 2: 0% 0% 100% 100% 0% - 3: 0% 0% 100% 100% 100% - 4: 0% 0% 100% 100% 100% - 5: 100% 100% 0% 100% 0% - 6: 0% 0% 0% 0% 100% - 7: 100% 100% 0% 0% 100% - 8: 0% 0% 100% 100% 100% - 9: 0% 0% 100% 100% 100% - 10: 0% 0% 100% 100% 100% - 11: 0% 0% 100% 100% 100% - 12: 0% 0% 100% 100% 100% - 13: 100% 100% 0% 100% 100% - 14: 100% 100% 0% 100% 100% - 15: 100% 100% 0% 100% 100% - -MEAN: 33% 33% 60% 86% 80% -Fragment Performance Setup: Driver RSX Compiler, GPU RSX, Flags 0x5 -Results 15 cycles, 3 r regs, 800,000,000 pixels/s -============================================================================*/ -#if (FXAA_PS3 == 1) && (FXAA_EARLY_EXIT == 1) -/*--------------------------------------------------------------------------*/ -#pragma disablepc all -#pragma option O2 -#pragma option OutColorPrec=fp16 -#pragma texformat default RGBA8 -/*==========================================================================*/ -half4 FxaaPixelShader( - // {xy} = center of pixel - float2 pos, - // {xy__} = upper left of pixel - // {__zw} = lower right of pixel - float4 posPos, - // {rgb_} = color in linear or perceptual color space - // {___a} = luma in perceptual color space (not linear) - sampler2D tex, - // This must be from a constant/uniform. - // {xy} = rcpFrame not used on PS3 - float2 rcpFrame, - // This must be from a constant/uniform. - // {x___} = 2.0/screenWidthInPixels - // {_y__} = 2.0/screenHeightInPixels - // {__z_} = 0.5/screenWidthInPixels - // {___w} = 0.5/screenHeightInPixels - float4 rcpFrameOpt -) { -/*--------------------------------------------------------------------------*/ -// (1) - half4 rgbyNe = h4tex2Dlod(tex, half4(posPos.zy, 0, 0)); - half lumaNe = rgbyNe.w + half(1.0/512.0); -/*--------------------------------------------------------------------------*/ -// (2) - half4 lumaSw = h4tex2Dlod(tex, half4(posPos.xw, 0, 0)); - half lumaSwNegNe = lumaSw.w - lumaNe; -/*--------------------------------------------------------------------------*/ -// (3) - half4 lumaNw = h4tex2Dlod(tex, half4(posPos.xy, 0, 0)); - half lumaMaxNwSw = max(lumaNw.w, lumaSw.w); - half lumaMinNwSw = min(lumaNw.w, lumaSw.w); -/*--------------------------------------------------------------------------*/ -// (4) - half4 lumaSe = h4tex2Dlod(tex, half4(posPos.zw, 0, 0)); - half dirZ = lumaNw.w + lumaSwNegNe; - half dirX = -lumaNw.w + lumaSwNegNe; -/*--------------------------------------------------------------------------*/ -// (5) - half3 dir; - dir.y = 0.0; - dir.x = lumaSe.w + dirX; - dir.z = -lumaSe.w + dirZ; - half lumaMinNeSe = min(lumaNe, lumaSe.w); -/*--------------------------------------------------------------------------*/ -// (6) - half4 dir1_pos; - dir1_pos.xy = normalize(dir).xz; - half dirAbsMinTimes8 = min(abs(dir1_pos.x), abs(dir1_pos.y)) * half(FXAA_CONSOLE__EDGE_SHARPNESS); -/*--------------------------------------------------------------------------*/ -// (7) - half4 dir2_pos; - dir2_pos.xy = clamp(dir1_pos.xy / dirAbsMinTimes8, half(-2.0), half(2.0)); - dir1_pos.zw = pos.xy; - dir2_pos.zw = pos.xy; - half lumaMaxNeSe = max(lumaNe, lumaSe.w); -/*--------------------------------------------------------------------------*/ -// (8) - half4 temp1N; - temp1N.xy = dir1_pos.zw - dir1_pos.xy * rcpFrameOpt.zw; - temp1N = h4tex2Dlod(tex, half4(temp1N.xy, 0.0, 0.0)); - half lumaMax = max(lumaMaxNwSw, lumaMaxNeSe); - half lumaMin = min(lumaMinNwSw, lumaMinNeSe); -/*--------------------------------------------------------------------------*/ -// (9) - half4 rgby1; - rgby1.xy = dir1_pos.zw + dir1_pos.xy * rcpFrameOpt.zw; - rgby1 = h4tex2Dlod(tex, half4(rgby1.xy, 0.0, 0.0)); - rgby1 = (temp1N + rgby1) * 0.5; -/*--------------------------------------------------------------------------*/ -// (10) - half4 rgbyM = h4tex2Dlod(tex, half4(pos.xy, 0.0, 0.0)); - half lumaMaxM = max(lumaMax, rgbyM.w); - half lumaMinM = min(lumaMin, rgbyM.w); -/*--------------------------------------------------------------------------*/ -// (11) - half4 temp2N; - temp2N.xy = dir2_pos.zw - dir2_pos.xy * rcpFrameOpt.xy; - temp2N = h4tex2Dlod(tex, half4(temp2N.xy, 0.0, 0.0)); - half4 rgby2; - rgby2.xy = dir2_pos.zw + dir2_pos.xy * rcpFrameOpt.xy; - half lumaRangeM = (lumaMaxM - lumaMinM) / FXAA_CONSOLE__EDGE_THRESHOLD; -/*--------------------------------------------------------------------------*/ -// (12) - rgby2 = h4tex2Dlod(tex, half4(rgby2.xy, 0.0, 0.0)); - rgby2 = (temp2N + rgby2) * 0.5; -/*--------------------------------------------------------------------------*/ -// (13) - rgby2 = (rgby2 + rgby1) * 0.5; -/*--------------------------------------------------------------------------*/ -// (14) - bool twoTapLt = rgby2.w < lumaMin; - bool twoTapGt = rgby2.w > lumaMax; - bool earlyExit = lumaRangeM < lumaMax; - bool twoTap = twoTapLt || twoTapGt; -/*--------------------------------------------------------------------------*/ -// (15) - if(twoTap) rgby2 = rgby1; - if(earlyExit) rgby2 = rgbyM; -/*--------------------------------------------------------------------------*/ - return rgby2; } -/*==========================================================================*/ -#endif - - - -/*============================================================================ - - FXAA3 CONSOLE - PC PIXEL SHADER - ------------------------------------------------------------------------------- -Using a modified version of the PS3 version here to best target old hardware. -============================================================================*/ -#if (FXAA_PC_CONSOLE == 1) -/*--------------------------------------------------------------------------*/ -half4 FxaaPixelShader( - // {xy} = center of pixel - float2 pos, - // {xy__} = upper left of pixel - // {__zw} = lower right of pixel - float4 posPos, - // {rgb_} = color in linear or perceptual color space - // {___a} = alpha output is junk value - FxaaTex tex, - // This must be from a constant/uniform. - // {xy} = rcpFrame not used on PC version of FXAA Console - float2 rcpFrame, - // This must be from a constant/uniform. - // {x___} = 2.0/screenWidthInPixels - // {_y__} = 2.0/screenHeightInPixels - // {__z_} = 0.5/screenWidthInPixels - // {___w} = 0.5/screenHeightInPixels - float4 rcpFrameOpt -) { -/*--------------------------------------------------------------------------*/ - half4 dir; - dir.y = 0.0; - half4 lumaNe = FxaaTexTop(tex, posPos.zy); - lumaNe.w += half(1.0/384.0); - dir.x = -lumaNe.w; - dir.z = -lumaNe.w; -/*--------------------------------------------------------------------------*/ - half4 lumaSw = FxaaTexTop(tex, posPos.xw); - dir.x += lumaSw.w; - dir.z += lumaSw.w; -/*--------------------------------------------------------------------------*/ - half4 lumaNw = FxaaTexTop(tex, posPos.xy); - dir.x -= lumaNw.w; - dir.z += lumaNw.w; -/*--------------------------------------------------------------------------*/ - half4 lumaSe = FxaaTexTop(tex, posPos.zw); - dir.x += lumaSe.w; - dir.z -= lumaSe.w; -/*==========================================================================*/ - #if (FXAA_EARLY_EXIT == 1) - half4 rgbyM = FxaaTexTop(tex, pos.xy); -/*--------------------------------------------------------------------------*/ - half lumaMin = min(min(lumaNw.w, lumaSw.w), min(lumaNe.w, lumaSe.w)); - half lumaMax = max(max(lumaNw.w, lumaSw.w), max(lumaNe.w, lumaSe.w)); -/*--------------------------------------------------------------------------*/ - half lumaMinM = min(lumaMin, rgbyM.w); - half lumaMaxM = max(lumaMax, rgbyM.w); -/*--------------------------------------------------------------------------*/ - if((lumaMaxM - lumaMinM) < max(FXAA_CONSOLE__EDGE_THRESHOLD_MIN, lumaMax * FXAA_CONSOLE__EDGE_THRESHOLD)) - #if (FXAA_DISCARD == 1) - FxaaDiscard; - #else - return rgbyM; - #endif - #endif -/*==========================================================================*/ - half4 dir1_pos; - dir1_pos.xy = normalize(dir.xyz).xz; - half dirAbsMinTimesC = min(abs(dir1_pos.x), abs(dir1_pos.y)) * half(FXAA_CONSOLE__EDGE_SHARPNESS); -/*--------------------------------------------------------------------------*/ - half4 dir2_pos; - dir2_pos.xy = clamp(dir1_pos.xy / dirAbsMinTimesC, half(-2.0), half(2.0)); - dir1_pos.zw = pos.xy; - dir2_pos.zw = pos.xy; - half4 temp1N; - temp1N.xy = dir1_pos.zw - dir1_pos.xy * rcpFrameOpt.zw; -/*--------------------------------------------------------------------------*/ - temp1N = FxaaTexTop(tex, temp1N.xy); - half4 rgby1; - rgby1.xy = dir1_pos.zw + dir1_pos.xy * rcpFrameOpt.zw; -/*--------------------------------------------------------------------------*/ - rgby1 = FxaaTexTop(tex, rgby1.xy); - rgby1 = (temp1N + rgby1) * 0.5; -/*--------------------------------------------------------------------------*/ - half4 temp2N; - temp2N.xy = dir2_pos.zw - dir2_pos.xy * rcpFrameOpt.xy; - temp2N = FxaaTexTop(tex, temp2N.xy); -/*--------------------------------------------------------------------------*/ - half4 rgby2; - rgby2.xy = dir2_pos.zw + dir2_pos.xy * rcpFrameOpt.xy; - rgby2 = FxaaTexTop(tex, rgby2.xy); - rgby2 = (temp2N + rgby2) * 0.5; -/*--------------------------------------------------------------------------*/ - #if (FXAA_EARLY_EXIT == 0) - half lumaMin = min(min(lumaNw.w, lumaSw.w), min(lumaNe.w, lumaSe.w)); - half lumaMax = max(max(lumaNw.w, lumaSw.w), max(lumaNe.w, lumaSe.w)); - #endif - rgby2 = (rgby2 + rgby1) * 0.5; -/*--------------------------------------------------------------------------*/ - bool twoTapLt = rgby2.w < lumaMin; - bool twoTapGt = rgby2.w > lumaMax; -/*--------------------------------------------------------------------------*/ - if(twoTapLt || twoTapGt) rgby2 = rgby1; -/*--------------------------------------------------------------------------*/ - return rgby2; } -/*==========================================================================*/ -#endif - - - -/*============================================================================ - - FXAA3 QUALITY - PC - -============================================================================*/ -#if (FXAA_PC == 1) -/*--------------------------------------------------------------------------*/ -float4 FxaaPixelShader( - // {xy} = center of pixel - float2 pos, - // {xyzw} = not used on FXAA3 Quality - float4 posPos, - // {rgb_} = color in linear or perceptual color space - // {___a} = luma in perceptual color space (not linear) - FxaaTex tex, - // This must be from a constant/uniform. - // {x_} = 1.0/screenWidthInPixels - // {_y} = 1.0/screenHeightInPixels - float2 rcpFrame, - // {xyzw} = not used on FXAA3 Quality - float4 rcpFrameOpt -) { -/*--------------------------------------------------------------------------*/ - float2 posM; - posM.x = pos.x; - posM.y = pos.y; - #if (FXAA_GATHER4_ALPHA == 1) - #if (FXAA_DISCARD == 0) - float4 rgbyM = FxaaTexTop(tex, posM); - #define lumaM rgbyM.w - #endif - float4 luma4A = FxaaTexAlpha4(tex, posM, rcpFrame.xy); - float4 luma4B = FxaaTexOffAlpha4(tex, posM, FxaaInt2(-1, -1), rcpFrame.xy); - #if (FXAA_DISCARD == 1) - #define lumaM luma4A.w - #endif - #define lumaE luma4A.z - #define lumaS luma4A.x - #define lumaSE luma4A.y - #define lumaNW luma4B.w - #define lumaN luma4B.z - #define lumaW luma4B.x - #else - float4 rgbyM = FxaaTexTop(tex, posM); - #define lumaM rgbyM.w - float lumaS = FxaaTexOff(tex, posM, FxaaInt2( 0, 1), rcpFrame.xy).w; - float lumaE = FxaaTexOff(tex, posM, FxaaInt2( 1, 0), rcpFrame.xy).w; - float lumaN = FxaaTexOff(tex, posM, FxaaInt2( 0,-1), rcpFrame.xy).w; - float lumaW = FxaaTexOff(tex, posM, FxaaInt2(-1, 0), rcpFrame.xy).w; - #endif -/*--------------------------------------------------------------------------*/ - float maxSM = max(lumaS, lumaM); - float minSM = min(lumaS, lumaM); - float maxESM = max(lumaE, maxSM); - float minESM = min(lumaE, minSM); - float maxWN = max(lumaN, lumaW); - float minWN = min(lumaN, lumaW); - float rangeMax = max(maxWN, maxESM); - float rangeMin = min(minWN, minESM); - float rangeMaxScaled = rangeMax * FXAA_QUALITY__EDGE_THRESHOLD; - float range = rangeMax - rangeMin; - float rangeMaxClamped = max(FXAA_QUALITY__EDGE_THRESHOLD_MIN, rangeMaxScaled); - bool earlyExit = range < rangeMaxClamped; -/*--------------------------------------------------------------------------*/ - if(earlyExit) - #if (FXAA_DISCARD == 1) - FxaaDiscard; - #else - return rgbyM; - #endif -/*--------------------------------------------------------------------------*/ - #if (FXAA_GATHER4_ALPHA == 0) - float lumaNW = FxaaTexOff(tex, posM, FxaaInt2(-1,-1), rcpFrame.xy).w; - float lumaSE = FxaaTexOff(tex, posM, FxaaInt2( 1, 1), rcpFrame.xy).w; - float lumaNE = FxaaTexOff(tex, posM, FxaaInt2( 1,-1), rcpFrame.xy).w; - float lumaSW = FxaaTexOff(tex, posM, FxaaInt2(-1, 1), rcpFrame.xy).w; - #else - float lumaNE = FxaaTexOff(tex, posM, FxaaInt2(1, -1), rcpFrame.xy).w; - float lumaSW = FxaaTexOff(tex, posM, FxaaInt2(-1, 1), rcpFrame.xy).w; - #endif -/*--------------------------------------------------------------------------*/ - float lumaNS = lumaN + lumaS; - float lumaWE = lumaW + lumaE; - float subpixRcpRange = 1.0/range; - float subpixNSWE = lumaNS + lumaWE; - float edgeHorz1 = (-2.0 * lumaM) + lumaNS; - float edgeVert1 = (-2.0 * lumaM) + lumaWE; -/*--------------------------------------------------------------------------*/ - float lumaNESE = lumaNE + lumaSE; - float lumaNWNE = lumaNW + lumaNE; - float edgeHorz2 = (-2.0 * lumaE) + lumaNESE; - float edgeVert2 = (-2.0 * lumaN) + lumaNWNE; -/*--------------------------------------------------------------------------*/ - float lumaNWSW = lumaNW + lumaSW; - float lumaSWSE = lumaSW + lumaSE; - float edgeHorz4 = (abs(edgeHorz1) * 2.0) + abs(edgeHorz2); - float edgeVert4 = (abs(edgeVert1) * 2.0) + abs(edgeVert2); - float edgeHorz3 = (-2.0 * lumaW) + lumaNWSW; - float edgeVert3 = (-2.0 * lumaS) + lumaSWSE; - float edgeHorz = abs(edgeHorz3) + edgeHorz4; - float edgeVert = abs(edgeVert3) + edgeVert4; -/*--------------------------------------------------------------------------*/ - float subpixNWSWNESE = lumaNWSW + lumaNESE; - float lengthSign = rcpFrame.x; - bool horzSpan = edgeHorz >= edgeVert; - float subpixA = subpixNSWE * 2.0 + subpixNWSWNESE; -/*--------------------------------------------------------------------------*/ - if(!horzSpan) lumaN = lumaW; - if(!horzSpan) lumaS = lumaE; - if(horzSpan) lengthSign = rcpFrame.y; - float subpixB = (subpixA * (1.0/12.0)) - lumaM; -/*--------------------------------------------------------------------------*/ - float gradientN = lumaN - lumaM; - float gradientS = lumaS - lumaM; - float lumaNN = lumaN + lumaM; - float lumaSS = lumaS + lumaM; - bool pairN = abs(gradientN) >= abs(gradientS); - float gradient = max(abs(gradientN), abs(gradientS)); - if(pairN) lengthSign = -lengthSign; - float subpixC = FxaaSat(abs(subpixB) * subpixRcpRange); -/*--------------------------------------------------------------------------*/ - float2 posB; - posB.x = posM.x; - posB.y = posM.y; - float2 offNP; - offNP.x = (!horzSpan) ? 0.0 : rcpFrame.x; - offNP.y = ( horzSpan) ? 0.0 : rcpFrame.y; - if(!horzSpan) posB.x += lengthSign * 0.5; - if( horzSpan) posB.y += lengthSign * 0.5; -/*--------------------------------------------------------------------------*/ - float2 posN; - posN.x = posB.x - offNP.x * FXAA_QUALITY__P0; - posN.y = posB.y - offNP.y * FXAA_QUALITY__P0; - float2 posP; - posP.x = posB.x + offNP.x * FXAA_QUALITY__P0; - posP.y = posB.y + offNP.y * FXAA_QUALITY__P0; - float subpixD = ((-2.0)*subpixC) + 3.0; - float lumaEndN = FxaaTexTop(tex, posN).w; - float subpixE = subpixC * subpixC; - float lumaEndP = FxaaTexTop(tex, posP).w; -/*--------------------------------------------------------------------------*/ - if(!pairN) lumaNN = lumaSS; - float gradientScaled = gradient * 1.0/4.0; - float lumaMM = lumaM - lumaNN * 0.5; - float subpixF = subpixD * subpixE; - bool lumaMLTZero = lumaMM < 0.0; -/*--------------------------------------------------------------------------*/ - lumaEndN -= lumaNN * 0.5; - lumaEndP -= lumaNN * 0.5; - bool doneN = abs(lumaEndN) >= gradientScaled; - bool doneP = abs(lumaEndP) >= gradientScaled; - if(!doneN) posN.x -= offNP.x * FXAA_QUALITY__P1; - if(!doneN) posN.y -= offNP.y * FXAA_QUALITY__P1; - bool doneNP = (!doneN) || (!doneP); - if(!doneP) posP.x += offNP.x * FXAA_QUALITY__P1; - if(!doneP) posP.y += offNP.y * FXAA_QUALITY__P1; -/*--------------------------------------------------------------------------*/ - if(doneNP) { - if(!doneN) lumaEndN = FxaaTexTop(tex, posN.xy).w; - if(!doneP) lumaEndP = FxaaTexTop(tex, posP.xy).w; - if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5; - if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5; - doneN = abs(lumaEndN) >= gradientScaled; - doneP = abs(lumaEndP) >= gradientScaled; - if(!doneN) posN.x -= offNP.x * FXAA_QUALITY__P2; - if(!doneN) posN.y -= offNP.y * FXAA_QUALITY__P2; - doneNP = (!doneN) || (!doneP); - if(!doneP) posP.x += offNP.x * FXAA_QUALITY__P2; - if(!doneP) posP.y += offNP.y * FXAA_QUALITY__P2; -/*--------------------------------------------------------------------------*/ - #if (FXAA_QUALITY__PS > 3) - if(doneNP) { - if(!doneN) lumaEndN = FxaaTexTop(tex, posN.xy).w; - if(!doneP) lumaEndP = FxaaTexTop(tex, posP.xy).w; - if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5; - if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5; - doneN = abs(lumaEndN) >= gradientScaled; - doneP = abs(lumaEndP) >= gradientScaled; - if(!doneN) posN.x -= offNP.x * FXAA_QUALITY__P3; - if(!doneN) posN.y -= offNP.y * FXAA_QUALITY__P3; - doneNP = (!doneN) || (!doneP); - if(!doneP) posP.x += offNP.x * FXAA_QUALITY__P3; - if(!doneP) posP.y += offNP.y * FXAA_QUALITY__P3; -/*--------------------------------------------------------------------------*/ - #if (FXAA_QUALITY__PS > 4) - if(doneNP) { - if(!doneN) lumaEndN = FxaaTexTop(tex, posN.xy).w; - if(!doneP) lumaEndP = FxaaTexTop(tex, posP.xy).w; - if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5; - if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5; - doneN = abs(lumaEndN) >= gradientScaled; - doneP = abs(lumaEndP) >= gradientScaled; - if(!doneN) posN.x -= offNP.x * FXAA_QUALITY__P4; - if(!doneN) posN.y -= offNP.y * FXAA_QUALITY__P4; - doneNP = (!doneN) || (!doneP); - if(!doneP) posP.x += offNP.x * FXAA_QUALITY__P4; - if(!doneP) posP.y += offNP.y * FXAA_QUALITY__P4; -/*--------------------------------------------------------------------------*/ - #if (FXAA_QUALITY__PS > 5) - if(doneNP) { - if(!doneN) lumaEndN = FxaaTexTop(tex, posN.xy).w; - if(!doneP) lumaEndP = FxaaTexTop(tex, posP.xy).w; - if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5; - if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5; - doneN = abs(lumaEndN) >= gradientScaled; - doneP = abs(lumaEndP) >= gradientScaled; - if(!doneN) posN.x -= offNP.x * FXAA_QUALITY__P5; - if(!doneN) posN.y -= offNP.y * FXAA_QUALITY__P5; - doneNP = (!doneN) || (!doneP); - if(!doneP) posP.x += offNP.x * FXAA_QUALITY__P5; - if(!doneP) posP.y += offNP.y * FXAA_QUALITY__P5; -/*--------------------------------------------------------------------------*/ - #if (FXAA_QUALITY__PS > 6) - if(doneNP) { - if(!doneN) lumaEndN = FxaaTexTop(tex, posN.xy).w; - if(!doneP) lumaEndP = FxaaTexTop(tex, posP.xy).w; - if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5; - if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5; - doneN = abs(lumaEndN) >= gradientScaled; - doneP = abs(lumaEndP) >= gradientScaled; - if(!doneN) posN.x -= offNP.x * FXAA_QUALITY__P6; - if(!doneN) posN.y -= offNP.y * FXAA_QUALITY__P6; - doneNP = (!doneN) || (!doneP); - if(!doneP) posP.x += offNP.x * FXAA_QUALITY__P6; - if(!doneP) posP.y += offNP.y * FXAA_QUALITY__P6; -/*--------------------------------------------------------------------------*/ - #if (FXAA_QUALITY__PS > 7) - if(doneNP) { - if(!doneN) lumaEndN = FxaaTexTop(tex, posN.xy).w; - if(!doneP) lumaEndP = FxaaTexTop(tex, posP.xy).w; - if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5; - if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5; - doneN = abs(lumaEndN) >= gradientScaled; - doneP = abs(lumaEndP) >= gradientScaled; - if(!doneN) posN.x -= offNP.x * FXAA_QUALITY__P7; - if(!doneN) posN.y -= offNP.y * FXAA_QUALITY__P7; - doneNP = (!doneN) || (!doneP); - if(!doneP) posP.x += offNP.x * FXAA_QUALITY__P7; - if(!doneP) posP.y += offNP.y * FXAA_QUALITY__P7; -/*--------------------------------------------------------------------------*/ - #if (FXAA_QUALITY__PS > 8) - if(doneNP) { - if(!doneN) lumaEndN = FxaaTexTop(tex, posN.xy).w; - if(!doneP) lumaEndP = FxaaTexTop(tex, posP.xy).w; - if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5; - if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5; - doneN = abs(lumaEndN) >= gradientScaled; - doneP = abs(lumaEndP) >= gradientScaled; - if(!doneN) posN.x -= offNP.x * FXAA_QUALITY__P8; - if(!doneN) posN.y -= offNP.y * FXAA_QUALITY__P8; - doneNP = (!doneN) || (!doneP); - if(!doneP) posP.x += offNP.x * FXAA_QUALITY__P8; - if(!doneP) posP.y += offNP.y * FXAA_QUALITY__P8; -/*--------------------------------------------------------------------------*/ - #if (FXAA_QUALITY__PS > 9) - if(doneNP) { - if(!doneN) lumaEndN = FxaaTexTop(tex, posN.xy).w; - if(!doneP) lumaEndP = FxaaTexTop(tex, posP.xy).w; - if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5; - if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5; - doneN = abs(lumaEndN) >= gradientScaled; - doneP = abs(lumaEndP) >= gradientScaled; - if(!doneN) posN.x -= offNP.x * FXAA_QUALITY__P9; - if(!doneN) posN.y -= offNP.y * FXAA_QUALITY__P9; - doneNP = (!doneN) || (!doneP); - if(!doneP) posP.x += offNP.x * FXAA_QUALITY__P9; - if(!doneP) posP.y += offNP.y * FXAA_QUALITY__P9; -/*--------------------------------------------------------------------------*/ - #if (FXAA_QUALITY__PS > 10) - if(doneNP) { - if(!doneN) lumaEndN = FxaaTexTop(tex, posN.xy).w; - if(!doneP) lumaEndP = FxaaTexTop(tex, posP.xy).w; - if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5; - if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5; - doneN = abs(lumaEndN) >= gradientScaled; - doneP = abs(lumaEndP) >= gradientScaled; - if(!doneN) posN.x -= offNP.x * FXAA_QUALITY__P10; - if(!doneN) posN.y -= offNP.y * FXAA_QUALITY__P10; - doneNP = (!doneN) || (!doneP); - if(!doneP) posP.x += offNP.x * FXAA_QUALITY__P10; - if(!doneP) posP.y += offNP.y * FXAA_QUALITY__P10; -/*--------------------------------------------------------------------------*/ - #if (FXAA_QUALITY__PS > 11) - if(doneNP) { - if(!doneN) lumaEndN = FxaaTexTop(tex, posN.xy).w; - if(!doneP) lumaEndP = FxaaTexTop(tex, posP.xy).w; - if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5; - if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5; - doneN = abs(lumaEndN) >= gradientScaled; - doneP = abs(lumaEndP) >= gradientScaled; - if(!doneN) posN.x -= offNP.x * FXAA_QUALITY__P11; - if(!doneN) posN.y -= offNP.y * FXAA_QUALITY__P11; - doneNP = (!doneN) || (!doneP); - if(!doneP) posP.x += offNP.x * FXAA_QUALITY__P11; - if(!doneP) posP.y += offNP.y * FXAA_QUALITY__P11; -/*--------------------------------------------------------------------------*/ - #if (FXAA_QUALITY__PS > 12) - if(doneNP) { - if(!doneN) lumaEndN = FxaaTexTop(tex, posN.xy).w; - if(!doneP) lumaEndP = FxaaTexTop(tex, posP.xy).w; - if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5; - if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5; - doneN = abs(lumaEndN) >= gradientScaled; - doneP = abs(lumaEndP) >= gradientScaled; - if(!doneN) posN.x -= offNP.x * FXAA_QUALITY__P12; - if(!doneN) posN.y -= offNP.y * FXAA_QUALITY__P12; - doneNP = (!doneN) || (!doneP); - if(!doneP) posP.x += offNP.x * FXAA_QUALITY__P12; - if(!doneP) posP.y += offNP.y * FXAA_QUALITY__P12; -/*--------------------------------------------------------------------------*/ - } - #endif -/*--------------------------------------------------------------------------*/ - } - #endif -/*--------------------------------------------------------------------------*/ - } - #endif -/*--------------------------------------------------------------------------*/ - } - #endif -/*--------------------------------------------------------------------------*/ - } - #endif -/*--------------------------------------------------------------------------*/ - } - #endif -/*--------------------------------------------------------------------------*/ - } - #endif -/*--------------------------------------------------------------------------*/ - } - #endif -/*--------------------------------------------------------------------------*/ - } - #endif -/*--------------------------------------------------------------------------*/ - } - #endif -/*--------------------------------------------------------------------------*/ - } -/*--------------------------------------------------------------------------*/ - float dstN = posM.x - posN.x; - float dstP = posP.x - posM.x; - if(!horzSpan) dstN = posM.y - posN.y; - if(!horzSpan) dstP = posP.y - posM.y; -/*--------------------------------------------------------------------------*/ - bool goodSpanN = (lumaEndN < 0.0) != lumaMLTZero; - float spanLength = (dstP + dstN); - bool goodSpanP = (lumaEndP < 0.0) != lumaMLTZero; - float spanLengthRcp = 1.0/spanLength; -/*--------------------------------------------------------------------------*/ - bool directionN = dstN < dstP; - float dst = min(dstN, dstP); - bool goodSpan = directionN ? goodSpanN : goodSpanP; - float subpixG = subpixF * subpixF; - float pixelOffset = (dst * (-spanLengthRcp)) + 0.5; - float subpixH = subpixG * FXAA_QUALITY__SUBPIX; -/*--------------------------------------------------------------------------*/ - float pixelOffsetGood = goodSpan ? pixelOffset : 0.0; - float pixelOffsetSubpix = max(pixelOffsetGood, subpixH); - if(!horzSpan) posM.x += pixelOffsetSubpix * lengthSign; - if( horzSpan) posM.y += pixelOffsetSubpix * lengthSign; - #if (FXAA_DISCARD == 1) - return FxaaTexTop(tex, posM); - #else - return float4(FxaaTexTop(tex, posM).xyz, lumaM); - #endif -} -/*==========================================================================*/ -#endif - -PS_OUTPUT ps_main(PS_INPUT input) -{ - PS_OUTPUT output; - - float2 pos = input.t; - float4 posPos = (float4)0; - - FxaaTex tex; - - #if SHADER_MODEL >= 0x400 - - tex.tex = Texture; - tex.smpl = TextureSampler; - - #else - - tex = Texture; - - #endif - - output.c = FxaaPixelShader(pos, posPos, tex, _rcpFrame.xy, _rcpFrameOpt); - - return output; -} - -#endif +#ifdef SHADER_MODEL // make safe to include in resource file to enforce dependency + +#define FXAA_PC 1 +#define FXAA_QUALITY__SUBPIX 0.0 + +#if SHADER_MODEL >= 0x400 + +#if SHADER_MODEL >= 0x500 + #define FXAA_HLSL_5 1 +#else + #define FXAA_HLSL_4 1 +#endif + +Texture2D Texture; +SamplerState TextureSampler; + +cbuffer cb0 +{ + float4 _rcpFrame; + float4 _rcpFrameOpt; +}; + +struct PS_INPUT +{ + float4 p : SV_Position; + float2 t : TEXCOORD0; +}; + +struct PS_OUTPUT +{ + float4 c : SV_Target0; +}; + +#elif SHADER_MODEL <= 0x300 + +#define FXAA_HLSL_3 1 + +sampler Texture : register(s0); + +float4 _rcpFrame : register(c0); +float4 _rcpFrameOpt : register(c1); + +struct PS_INPUT +{ +#if SHADER_MODEL < 0x300 + float4 p : TEXCOORD1; +#else + float4 p : VPOS; +#endif + float2 t : TEXCOORD0; +}; + +struct PS_OUTPUT +{ + float4 c : COLOR; +}; + +#endif + +/*============================================================================ + + + NVIDIA FXAA 3.10 by TIMOTHY LOTTES + + +------------------------------------------------------------------------------ +COPYRIGHT (C) 2010, 2011 NVIDIA CORPORATION. ALL RIGHTS RESERVED. +------------------------------------------------------------------------------ +TO THE MAXIMUM EXTENT PERMITTED BY APPLICABLE LAW, THIS SOFTWARE IS PROVIDED +*AS IS* AND NVIDIA AND ITS SUPPLIERS DISCLAIM ALL WARRANTIES, EITHER EXPRESS +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. IN NO EVENT SHALL NVIDIA +OR ITS SUPPLIERS BE LIABLE FOR ANY SPECIAL, INCIDENTAL, INDIRECT, OR +CONSEQUENTIAL DAMAGES WHATSOEVER (INCLUDING, WITHOUT LIMITATION, DAMAGES FOR +LOSS OF BUSINESS PROFITS, BUSINESS INTERRUPTION, LOSS OF BUSINESS INFORMATION, +OR ANY OTHER PECUNIARY LOSS) ARISING OUT OF THE USE OF OR INABILITY TO USE +THIS SOFTWARE, EVEN IF NVIDIA HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH +DAMAGES. + +------------------------------------------------------------------------------ + INTEGRATION CHECKLIST +------------------------------------------------------------------------------ +(1.) +In the shader source, +setup defines for the desired configuration. +Example, + + #define FXAA_PC 1 + #define FXAA_HLSL_3 1 + #define FXAA_QUALITY__PRESET 12 + #define FXAA_QUALITY__EDGE_THRESHOLD (1.0/6.0) + #define FXAA_QUALITY__EDGE_THRESHOLD_MIN (1.0/12.0) + +(2.) +Then include this file, + + #include "Fxaa3.h" + +(3.) +Then call the FXAA pixel shader from within your desired shader, + + return FxaaPixelShader(pos, posPos, tex, rcpFrame, rcpFrameOpt); + +(4.) +Insure pass prior to FXAA outputs RGBL. +See next section. + +(5.) +Setup engine to provide "rcpFrame" and "rcpFrameOpt" constants. +Not using constants will result in a performance loss. + + // {x_} = 1.0/screenWidthInPixels + // {_y} = 1.0/screenHeightInPixels + float2 rcpFrame + + // This must be from a constant/uniform. + // {x___} = 2.0/screenWidthInPixels + // {_y__} = 2.0/screenHeightInPixels + // {__z_} = 0.5/screenWidthInPixels + // {___w} = 0.5/screenHeightInPixels + float4 rcpFrameOpt + +(5.a.) +Optionally change to this for sharper FXAA Console, + + // This must be from a constant/uniform. + // {x___} = 2.0/screenWidthInPixels + // {_y__} = 2.0/screenHeightInPixels + // {__z_} = 0.333/screenWidthInPixels + // {___w} = 0.333/screenHeightInPixels + float4 rcpFrameOpt + +(6.) +Have FXAA vertex shader run as a full screen triangle, +and output "pos" and "posPos" such that inputs in the pixel shader provide, + + // {xy} = center of pixel + float2 pos, + + // {xy__} = upper left of pixel + // {__zw} = lower right of pixel + float4 posPos, + +(7.) +Insure the texture sampler used by FXAA is set to bilinear filtering. + + +------------------------------------------------------------------------------ + INTEGRATION - RGBL AND COLORSPACE +------------------------------------------------------------------------------ +FXAA3 requires RGBL as input. + +RGB should be LDR (low dynamic range). +Specifically do FXAA after tonemapping. + +RGB data as returned by a texture fetch can be linear or non-linear. +Note an "sRGB format" texture counts as linear, +because the result of a texture fetch is linear data. +Regular "RGBA8" textures in the sRGB colorspace are non-linear. + +Luma must be stored in the alpha channel prior to running FXAA. +This luma should be in a perceptual space (could be gamma 2.0). +Example pass before FXAA where output is gamma 2.0 encoded, + + color.rgb = ToneMap(color.rgb); // linear color output + color.rgb = sqrt(color.rgb); // gamma 2.0 color output + return color; + +To use FXAA, + + color.rgb = ToneMap(color.rgb); // linear color output + color.rgb = sqrt(color.rgb); // gamma 2.0 color output + color.a = dot(color.rgb, float3(0.299, 0.587, 0.114)); // compute luma + return color; + +Another example where output is linear encoded, +say for instance writing to an sRGB formated render target, +where the render target does the conversion back to sRGB after blending, + + color.rgb = ToneMap(color.rgb); // linear color output + return color; + +To use FXAA, + + color.rgb = ToneMap(color.rgb); // linear color output + color.a = sqrt(dot(color.rgb, float3(0.299, 0.587, 0.114))); // compute luma + return color; + +Getting luma correct is required for the algorithm to work correctly. + + +------------------------------------------------------------------------------ + BEING LINEARLY CORRECT? +------------------------------------------------------------------------------ +Applying FXAA to a framebuffer with linear RGB color will look worse. +This is very counter intuitive, but happends to be true in this case. +The reason is because dithering artifacts will be more visiable +in a linear colorspace. + + +------------------------------------------------------------------------------ + COMPLEX INTEGRATION +------------------------------------------------------------------------------ +Q. What if the engine is blending into RGB before wanting to run FXAA? + +A. In the last opaque pass prior to FXAA, + have the pass write out luma into alpha. + Then blend into RGB only. + FXAA should be able to run ok + assuming the blending pass did not any add aliasing. + This should be the common case for particles and common blending passes. + +============================================================================*/ + +/*============================================================================ + + INTEGRATION KNOBS + +============================================================================*/ +// +// FXAA_PS3 and FXAA_360 choose the console algorithm (FXAA3 CONSOLE). +// FXAA_360_OPT is a prototype for the new optimized 360 version. +// +// 1 = Use API. +// 0 = Don't use API. +// +/*--------------------------------------------------------------------------*/ +#ifndef FXAA_PS3 + #define FXAA_PS3 0 +#endif +/*--------------------------------------------------------------------------*/ +#ifndef FXAA_360 + #define FXAA_360 0 +#endif +/*--------------------------------------------------------------------------*/ +#ifndef FXAA_360_OPT + #define FXAA_360_OPT 0 +#endif +/*==========================================================================*/ +#ifndef FXAA_PC + // + // FXAA Quality + // The high quality PC algorithm. + // + #define FXAA_PC 0 +#endif +/*--------------------------------------------------------------------------*/ +#ifndef FXAA_PC_CONSOLE + // + // The console algorithm for PC is included + // for developers targeting really low spec machines. + // + #define FXAA_PC_CONSOLE 0 +#endif +/*--------------------------------------------------------------------------*/ +#ifndef FXAA_GLSL_120 + #define FXAA_GLSL_120 0 +#endif +/*--------------------------------------------------------------------------*/ +#ifndef FXAA_GLSL_130 + #define FXAA_GLSL_130 0 +#endif +/*--------------------------------------------------------------------------*/ +#ifndef FXAA_HLSL_3 + #define FXAA_HLSL_3 0 +#endif +/*--------------------------------------------------------------------------*/ +#ifndef FXAA_HLSL_4 + #define FXAA_HLSL_4 0 +#endif +/*--------------------------------------------------------------------------*/ +#ifndef FXAA_HLSL_5 + #define FXAA_HLSL_5 0 +#endif +/*==========================================================================*/ +#ifndef FXAA_EARLY_EXIT + // + // Controls algorithm's early exit path. + // On PS3 turning this on adds 2 cycles to the shader. + // On 360 turning this off adds 10ths of a millisecond to the shader. + // Turning this off on console will result in a more blurry image. + // So this defaults to on. + // + // 1 = On. + // 0 = Off. + // + #define FXAA_EARLY_EXIT 1 +#endif +/*--------------------------------------------------------------------------*/ +#ifndef FXAA_DISCARD + // + // Only valid for PC OpenGL currently. + // + // 1 = Use discard on pixels which don't need AA. + // For APIs which enable concurrent TEX+ROP from same surface. + // 0 = Return unchanged color on pixels which don't need AA. + // + #define FXAA_DISCARD 0 +#endif +/*--------------------------------------------------------------------------*/ +#ifndef FXAA_FAST_PIXEL_OFFSET + // + // Used for GLSL 120 only. + // + // 1 = GL API supports fast pixel offsets + // 0 = do not use fast pixel offsets + // + #ifdef GL_EXT_gpu_shader4 + #define FXAA_FAST_PIXEL_OFFSET 1 + #endif + #ifdef GL_NV_gpu_shader5 + #define FXAA_FAST_PIXEL_OFFSET 1 + #endif + #ifdef GL_ARB_gpu_shader5 + #define FXAA_FAST_PIXEL_OFFSET 1 + #endif + #ifndef FXAA_FAST_PIXEL_OFFSET + #define FXAA_FAST_PIXEL_OFFSET 0 + #endif +#endif +/*--------------------------------------------------------------------------*/ +#ifndef FXAA_GATHER4_ALPHA + // + // 1 = API supports gather4 on alpha channel. + // 0 = API does not support gather4 on alpha channel. + // + #if (FXAA_HLSL_5 == 1) + #define FXAA_GATHER4_ALPHA 1 + #endif + #ifdef GL_ARB_gpu_shader5 + #define FXAA_GATHER4_ALPHA 1 + #endif + #ifdef GL_NV_gpu_shader5 + #define FXAA_GATHER4_ALPHA 1 + #endif + #ifndef FXAA_GATHER4_ALPHA + #define FXAA_GATHER4_ALPHA 0 + #endif +#endif + +/*============================================================================ + FXAA CONSOLE - TUNING KNOBS +============================================================================*/ +#ifndef FXAA_CONSOLE__EDGE_SHARPNESS + // + // Consoles the sharpness of edges. + // + // Due to the PS3 being ALU bound, + // there are only two safe values here: 4 and 8. + // These options use the shaders ability to a free *|/ by 4|8. + // + // 8.0 is sharper + // 4.0 is softer + // 2.0 is really soft (good for vector graphics inputs) + // + #if 1 + #define FXAA_CONSOLE__EDGE_SHARPNESS 8.0 + #endif + #if 0 + #define FXAA_CONSOLE__EDGE_SHARPNESS 4.0 + #endif + #if 0 + #define FXAA_CONSOLE__EDGE_SHARPNESS 2.0 + #endif +#endif +/*--------------------------------------------------------------------------*/ +#ifndef FXAA_CONSOLE__EDGE_THRESHOLD + // + // The minimum amount of local contrast required to apply algorithm. + // The console setting has a different mapping than the quality setting. + // + // This only applies when FXAA_EARLY_EXIT is 1. + // + // Due to the PS3 being ALU bound, + // there are only two safe values here: 0.25 and 0.125. + // These options use the shaders ability to a free *|/ by 4|8. + // + // 0.125 leaves less aliasing, but is softer + // 0.25 leaves more aliasing, and is sharper + // + #if 1 + #define FXAA_CONSOLE__EDGE_THRESHOLD 0.125 + #else + #define FXAA_CONSOLE__EDGE_THRESHOLD 0.25 + #endif +#endif +/*--------------------------------------------------------------------------*/ +#ifndef FXAA_CONSOLE__EDGE_THRESHOLD_MIN + // + // Trims the algorithm from processing darks. + // The console setting has a different mapping than the quality setting. + // + // This only applies when FXAA_EARLY_EXIT is 1. + // + // This does not apply to PS3. + // PS3 was simplified to avoid more shader instructions. + // + #define FXAA_CONSOLE__EDGE_THRESHOLD_MIN 0.05 +#endif + +/*============================================================================ + FXAA QUALITY - TUNING KNOBS +============================================================================*/ +#ifndef FXAA_QUALITY__EDGE_THRESHOLD + // + // The minimum amount of local contrast required to apply algorithm. + // + // 1/3 - too little + // 1/4 - low quality + // 1/6 - default + // 1/8 - high quality (default) + // 1/16 - overkill + // + #define FXAA_QUALITY__EDGE_THRESHOLD (1.0/6.0) +#endif +/*--------------------------------------------------------------------------*/ +#ifndef FXAA_QUALITY__EDGE_THRESHOLD_MIN + // + // Trims the algorithm from processing darks. + // + // 1/32 - visible limit + // 1/16 - high quality + // 1/12 - upper limit (default, the start of visible unfiltered edges) + // + #define FXAA_QUALITY__EDGE_THRESHOLD_MIN (1.0/12.0) +#endif +/*--------------------------------------------------------------------------*/ +#ifndef FXAA_QUALITY__SUBPIX + // + // Choose the amount of sub-pixel aliasing removal. + // + // 1 - upper limit (softer) + // 3/4 - default amount of filtering + // 1/2 - lower limit (sharper, less sub-pixel aliasing removal) + // + #define FXAA_QUALITY__SUBPIX (3.0/4.0) +#endif +/*--------------------------------------------------------------------------*/ +#ifndef FXAA_QUALITY__PRESET + // + // Choose the quality preset. + // + // OPTIONS + // ----------------------------------------------------------------------- + // 10 to 15 - default medium dither (10=fastest, 15=highest quality) + // 20 to 29 - less dither, more expensive (20=fastest, 29=highest quality) + // 39 - no dither, very expensive + // + // NOTES + // ----------------------------------------------------------------------- + // 12 = slightly faster then FXAA 3.9 and higher edge quality (default) + // 13 = about same speed as FXAA 3.9 and better than 12 + // 23 = closest to FXAA 3.9 visually and performance wise + // _ = the lowest digit is directly related to performance + // _ = the highest digit is directly related to style + // + #define FXAA_QUALITY__PRESET 12 +#endif + + +/*============================================================================ + + FXAA QUALITY - PRESETS + +============================================================================*/ + +/*============================================================================ + FXAA QUALITY - MEDIUM DITHER PRESETS +============================================================================*/ +#if (FXAA_QUALITY__PRESET == 10) + #define FXAA_QUALITY__PS 3 + #define FXAA_QUALITY__P0 1.5 + #define FXAA_QUALITY__P1 3.0 + #define FXAA_QUALITY__P2 12.0 +#endif +/*--------------------------------------------------------------------------*/ +#if (FXAA_QUALITY__PRESET == 11) + #define FXAA_QUALITY__PS 4 + #define FXAA_QUALITY__P0 1.0 + #define FXAA_QUALITY__P1 1.5 + #define FXAA_QUALITY__P2 3.0 + #define FXAA_QUALITY__P3 12.0 +#endif +/*--------------------------------------------------------------------------*/ +#if (FXAA_QUALITY__PRESET == 12) + #define FXAA_QUALITY__PS 5 + #define FXAA_QUALITY__P0 1.0 + #define FXAA_QUALITY__P1 1.5 + #define FXAA_QUALITY__P2 2.0 + #define FXAA_QUALITY__P3 4.0 + #define FXAA_QUALITY__P4 12.0 +#endif +/*--------------------------------------------------------------------------*/ +#if (FXAA_QUALITY__PRESET == 13) + #define FXAA_QUALITY__PS 6 + #define FXAA_QUALITY__P0 1.0 + #define FXAA_QUALITY__P1 1.5 + #define FXAA_QUALITY__P2 2.0 + #define FXAA_QUALITY__P3 2.0 + #define FXAA_QUALITY__P4 4.0 + #define FXAA_QUALITY__P5 12.0 +#endif +/*--------------------------------------------------------------------------*/ +#if (FXAA_QUALITY__PRESET == 14) + #define FXAA_QUALITY__PS 7 + #define FXAA_QUALITY__P0 1.0 + #define FXAA_QUALITY__P1 1.5 + #define FXAA_QUALITY__P2 2.0 + #define FXAA_QUALITY__P3 2.0 + #define FXAA_QUALITY__P4 2.0 + #define FXAA_QUALITY__P5 4.0 + #define FXAA_QUALITY__P6 12.0 +#endif +/*--------------------------------------------------------------------------*/ +#if (FXAA_QUALITY__PRESET == 15) + #define FXAA_QUALITY__PS 8 + #define FXAA_QUALITY__P0 1.0 + #define FXAA_QUALITY__P1 1.5 + #define FXAA_QUALITY__P2 2.0 + #define FXAA_QUALITY__P3 2.0 + #define FXAA_QUALITY__P4 2.0 + #define FXAA_QUALITY__P5 2.0 + #define FXAA_QUALITY__P6 4.0 + #define FXAA_QUALITY__P7 12.0 +#endif + +/*============================================================================ + FXAA QUALITY - LOW DITHER PRESETS +============================================================================*/ +#if (FXAA_QUALITY__PRESET == 20) + #define FXAA_QUALITY__PS 3 + #define FXAA_QUALITY__P0 1.5 + #define FXAA_QUALITY__P1 2.0 + #define FXAA_QUALITY__P2 8.0 +#endif +/*--------------------------------------------------------------------------*/ +#if (FXAA_QUALITY__PRESET == 21) + #define FXAA_QUALITY__PS 4 + #define FXAA_QUALITY__P0 1.0 + #define FXAA_QUALITY__P1 1.5 + #define FXAA_QUALITY__P2 2.0 + #define FXAA_QUALITY__P3 8.0 +#endif +/*--------------------------------------------------------------------------*/ +#if (FXAA_QUALITY__PRESET == 22) + #define FXAA_QUALITY__PS 5 + #define FXAA_QUALITY__P0 1.0 + #define FXAA_QUALITY__P1 1.5 + #define FXAA_QUALITY__P2 2.0 + #define FXAA_QUALITY__P3 2.0 + #define FXAA_QUALITY__P4 8.0 +#endif +/*--------------------------------------------------------------------------*/ +#if (FXAA_QUALITY__PRESET == 23) + #define FXAA_QUALITY__PS 6 + #define FXAA_QUALITY__P0 1.0 + #define FXAA_QUALITY__P1 1.5 + #define FXAA_QUALITY__P2 2.0 + #define FXAA_QUALITY__P3 2.0 + #define FXAA_QUALITY__P4 2.0 + #define FXAA_QUALITY__P5 8.0 +#endif +/*--------------------------------------------------------------------------*/ +#if (FXAA_QUALITY__PRESET == 24) + #define FXAA_QUALITY__PS 7 + #define FXAA_QUALITY__P0 1.0 + #define FXAA_QUALITY__P1 1.5 + #define FXAA_QUALITY__P2 2.0 + #define FXAA_QUALITY__P3 2.0 + #define FXAA_QUALITY__P4 2.0 + #define FXAA_QUALITY__P5 3.0 + #define FXAA_QUALITY__P6 8.0 +#endif +/*--------------------------------------------------------------------------*/ +#if (FXAA_QUALITY__PRESET == 25) + #define FXAA_QUALITY__PS 8 + #define FXAA_QUALITY__P0 1.0 + #define FXAA_QUALITY__P1 1.5 + #define FXAA_QUALITY__P2 2.0 + #define FXAA_QUALITY__P3 2.0 + #define FXAA_QUALITY__P4 2.0 + #define FXAA_QUALITY__P5 2.0 + #define FXAA_QUALITY__P6 4.0 + #define FXAA_QUALITY__P7 8.0 +#endif +/*--------------------------------------------------------------------------*/ +#if (FXAA_QUALITY__PRESET == 26) + #define FXAA_QUALITY__PS 9 + #define FXAA_QUALITY__P0 1.0 + #define FXAA_QUALITY__P1 1.5 + #define FXAA_QUALITY__P2 2.0 + #define FXAA_QUALITY__P3 2.0 + #define FXAA_QUALITY__P4 2.0 + #define FXAA_QUALITY__P5 2.0 + #define FXAA_QUALITY__P6 2.0 + #define FXAA_QUALITY__P7 4.0 + #define FXAA_QUALITY__P8 8.0 +#endif +/*--------------------------------------------------------------------------*/ +#if (FXAA_QUALITY__PRESET == 27) + #define FXAA_QUALITY__PS 10 + #define FXAA_QUALITY__P0 1.0 + #define FXAA_QUALITY__P1 1.5 + #define FXAA_QUALITY__P2 2.0 + #define FXAA_QUALITY__P3 2.0 + #define FXAA_QUALITY__P4 2.0 + #define FXAA_QUALITY__P5 2.0 + #define FXAA_QUALITY__P6 2.0 + #define FXAA_QUALITY__P7 2.0 + #define FXAA_QUALITY__P8 4.0 + #define FXAA_QUALITY__P9 8.0 +#endif +/*--------------------------------------------------------------------------*/ +#if (FXAA_QUALITY__PRESET == 28) + #define FXAA_QUALITY__PS 11 + #define FXAA_QUALITY__P0 1.0 + #define FXAA_QUALITY__P1 1.5 + #define FXAA_QUALITY__P2 2.0 + #define FXAA_QUALITY__P3 2.0 + #define FXAA_QUALITY__P4 2.0 + #define FXAA_QUALITY__P5 2.0 + #define FXAA_QUALITY__P6 2.0 + #define FXAA_QUALITY__P7 2.0 + #define FXAA_QUALITY__P8 2.0 + #define FXAA_QUALITY__P9 4.0 + #define FXAA_QUALITY__P10 8.0 +#endif +/*--------------------------------------------------------------------------*/ +#if (FXAA_QUALITY__PRESET == 29) + #define FXAA_QUALITY__PS 12 + #define FXAA_QUALITY__P0 1.0 + #define FXAA_QUALITY__P1 1.5 + #define FXAA_QUALITY__P2 2.0 + #define FXAA_QUALITY__P3 2.0 + #define FXAA_QUALITY__P4 2.0 + #define FXAA_QUALITY__P5 2.0 + #define FXAA_QUALITY__P6 2.0 + #define FXAA_QUALITY__P7 2.0 + #define FXAA_QUALITY__P8 2.0 + #define FXAA_QUALITY__P9 2.0 + #define FXAA_QUALITY__P10 4.0 + #define FXAA_QUALITY__P11 8.0 +#endif + +/*============================================================================ + FXAA QUALITY - EXTREME QUALITY +============================================================================*/ +#if (FXAA_QUALITY__PRESET == 39) + #define FXAA_QUALITY__PS 12 + #define FXAA_QUALITY__P0 1.0 + #define FXAA_QUALITY__P1 1.0 + #define FXAA_QUALITY__P2 1.0 + #define FXAA_QUALITY__P3 1.0 + #define FXAA_QUALITY__P4 1.0 + #define FXAA_QUALITY__P5 1.5 + #define FXAA_QUALITY__P6 2.0 + #define FXAA_QUALITY__P7 2.0 + #define FXAA_QUALITY__P8 2.0 + #define FXAA_QUALITY__P9 2.0 + #define FXAA_QUALITY__P10 4.0 + #define FXAA_QUALITY__P11 8.0 +#endif + + + +/*============================================================================ + + API PORTING + +============================================================================*/ +#if FXAA_GLSL_120 + // Requires, + // #version 120 + // And at least, + // #extension GL_EXT_gpu_shader4 : enable + // (or set FXAA_FAST_PIXEL_OFFSET 1 to work like DX9) + #define half float + #define half2 vec2 + #define half3 vec3 + #define half4 vec4 + #define int2 ivec2 + #define float2 vec2 + #define float3 vec3 + #define float4 vec4 + #define FxaaInt2 ivec2 + #define FxaaFloat2 vec2 + #define FxaaFloat3 vec3 + #define FxaaFloat4 vec4 + #define FxaaDiscard discard + #define FxaaDot3(a, b) dot(a, b) + #define FxaaSat(x) clamp(x, 0.0, 1.0) + #define FxaaLerp(x,y,s) mix(x,y,s) + #define FxaaTex sampler2D + #define FxaaTexTop(t, p) texture2DLod(t, p, 0.0) + #if (FXAA_FAST_PIXEL_OFFSET == 1) + #define FxaaTexOff(t, p, o, r) texture2DLodOffset(t, p, 0.0, o) + #else + #define FxaaTexOff(t, p, o, r) texture2DLod(t, p + (o * r), 0.0) + #endif + #if (FXAA_GATHER4_ALPHA == 1) + // use #extension GL_ARB_gpu_shader5 : enable + #define FxaaTexAlpha4(t, p, r) textureGather(t, p, 3) + #define FxaaTexOffAlpha4(t, p, o, r) textureGatherOffset(t, p, o, 3) + #endif +#endif +/*--------------------------------------------------------------------------*/ +#if FXAA_GLSL_130 + // Requires "#version 130" or better + #define half float + #define half2 vec2 + #define half3 vec3 + #define half4 vec4 + #define int2 ivec2 + #define float2 vec2 + #define float3 vec3 + #define float4 vec4 + #define FxaaInt2 ivec2 + #define FxaaFloat2 vec2 + #define FxaaFloat3 vec3 + #define FxaaFloat4 vec4 + #define FxaaDiscard discard + #define FxaaDot3(a, b) dot(a, b) + #define FxaaSat(x) clamp(x, 0.0, 1.0) + #define FxaaLerp(x,y,s) mix(x,y,s) + #define FxaaTex sampler2D + #define FxaaTexTop(t, p) textureLod(t, p, 0.0) + #define FxaaTexOff(t, p, o, r) textureLodOffset(t, p, 0.0, o) + #if (FXAA_GATHER4_ALPHA == 1) + // use #extension GL_ARB_gpu_shader5 : enable + #define FxaaTexAlpha4(t, p, r) textureGather(t, p, 3) + #define FxaaTexOffAlpha4(t, p, o, r) textureGatherOffset(t, p, o, 3) + #endif +#endif +/*--------------------------------------------------------------------------*/ +#if (FXAA_HLSL_3 == 1) || (FXAA_360 == 1) + #define int2 float2 + #define FxaaInt2 float2 + #define FxaaFloat2 float2 + #define FxaaFloat3 float3 + #define FxaaFloat4 float4 + #define FxaaDiscard clip(-1) + #define FxaaDot3(a, b) dot(a, b) + #define FxaaSat(x) saturate(x) + #define FxaaLerp(x,y,s) lerp(x,y,s) + #define FxaaTex sampler2D + #define FxaaTexTop(t, p) tex2Dlod(t, float4(p, 0.0, 0.0)) + #define FxaaTexOff(t, p, o, r) tex2Dlod(t, float4(p + (o * r), 0, 0)) +#endif +/*--------------------------------------------------------------------------*/ +#if FXAA_HLSL_4 + #define FxaaInt2 int2 + #define FxaaFloat2 float2 + #define FxaaFloat3 float3 + #define FxaaFloat4 float4 + #define FxaaDiscard clip(-1) + #define FxaaDot3(a, b) dot(a, b) + #define FxaaSat(x) saturate(x) + #define FxaaLerp(x,y,s) lerp(x,y,s) + struct FxaaTex { SamplerState smpl; Texture2D tex; }; + #define FxaaTexTop(t, p) t.tex.SampleLevel(t.smpl, p, 0.0) + #define FxaaTexOff(t, p, o, r) t.tex.SampleLevel(t.smpl, p, 0.0, o) +#endif +/*--------------------------------------------------------------------------*/ +#if FXAA_HLSL_5 + #define FxaaInt2 int2 + #define FxaaFloat2 float2 + #define FxaaFloat3 float3 + #define FxaaFloat4 float4 + #define FxaaDiscard clip(-1) + #define FxaaDot3(a, b) dot(a, b) + #define FxaaSat(x) saturate(x) + #define FxaaLerp(x,y,s) lerp(x,y,s) + struct FxaaTex { SamplerState smpl; Texture2D tex; }; + #define FxaaTexTop(t, p) t.tex.SampleLevel(t.smpl, p, 0.0) + #define FxaaTexOff(t, p, o, r) t.tex.SampleLevel(t.smpl, p, 0.0, o) + #define FxaaTexAlpha4(t, p, r) t.tex.GatherAlpha(t.smpl, p) + #define FxaaTexOffAlpha4(t, p, o, r) t.tex.GatherAlpha(t.smpl, p, o) +#endif + + + +/*============================================================================ + + FXAA3 CONSOLE - 360 PIXEL SHADER + +------------------------------------------------------------------------------ +Might be some optimizations left here, +as of this latest change didn't have a PIX dump to verify if TEX bound. +============================================================================*/ +#if (FXAA_360 == 1) +/*--------------------------------------------------------------------------*/ +half4 FxaaPixelShader( + // {xy} = center of pixel + float2 pos, + // {xy__} = upper left of pixel + // {__zw} = lower right of pixel + float4 posPos, + // {rgb_} = color in linear or perceptual color space + // {___a} = alpha output is junk value + FxaaTex tex, + // This must be from a constant/uniform. + // {xy} = rcpFrame not used on PC version of FXAA Console + float2 rcpFrame, + // This must be from a constant/uniform. + // {x___} = 2.0/screenWidthInPixels + // {_y__} = 2.0/screenHeightInPixels + // {__z_} = 0.5/screenWidthInPixels + // {___w} = 0.5/screenHeightInPixels + float4 rcpFrameOpt +) { +/*--------------------------------------------------------------------------*/ + half4 lumaNwNeSwSe; + lumaNwNeSwSe.x = FxaaTexTop(tex, posPos.xy).w; + lumaNwNeSwSe.y = FxaaTexTop(tex, posPos.zy).w; + lumaNwNeSwSe.z = FxaaTexTop(tex, posPos.xw).w; + lumaNwNeSwSe.w = FxaaTexTop(tex, posPos.zw).w; +/*--------------------------------------------------------------------------*/ + half4 rgbyM = FxaaTexTop(tex, pos.xy); +/*--------------------------------------------------------------------------*/ + lumaNwNeSwSe.y += 1.0/384.0; +/*--------------------------------------------------------------------------*/ + half2 lumaMinTemp = min(lumaNwNeSwSe.xy, lumaNwNeSwSe.zw); + half2 lumaMaxTemp = max(lumaNwNeSwSe.xy, lumaNwNeSwSe.zw); +/*--------------------------------------------------------------------------*/ + half lumaMin = min(lumaMinTemp.x, lumaMinTemp.y); + half lumaMax = max(lumaMaxTemp.x, lumaMaxTemp.y); +/*--------------------------------------------------------------------------*/ + half lumaMinM = min(lumaMin, rgbyM.w); + half lumaMaxM = max(lumaMax, rgbyM.w); + if((lumaMaxM - lumaMinM) < max(FXAA_CONSOLE__EDGE_THRESHOLD_MIN, lumaMax * FXAA_CONSOLE__EDGE_THRESHOLD)) return rgbyM; +/*--------------------------------------------------------------------------*/ + half2 dir; + dir.x = dot(lumaNwNeSwSe, float4(-1.0, -1.0, 1.0, 1.0)); + dir.y = dot(lumaNwNeSwSe, float4( 1.0, -1.0, 1.0,-1.0)); +/*--------------------------------------------------------------------------*/ + half2 dir1; + dir1 = normalize(dir.xy); +/*--------------------------------------------------------------------------*/ + half dirAbsMinTimesC = min(abs(dir1.x), abs(dir1.y)) * FXAA_CONSOLE__EDGE_SHARPNESS; + half2 dir2; + dir2 = clamp(dir1.xy / dirAbsMinTimesC, -2.0, 2.0); +/*--------------------------------------------------------------------------*/ + half4 rgbyN1 = FxaaTexTop(tex, pos.xy - dir1 * rcpFrameOpt.zw); + half4 rgbyP1 = FxaaTexTop(tex, pos.xy + dir1 * rcpFrameOpt.zw); + half4 rgbyN2 = FxaaTexTop(tex, pos.xy - dir2 * rcpFrameOpt.xy); + half4 rgbyP2 = FxaaTexTop(tex, pos.xy + dir2 * rcpFrameOpt.xy); +/*--------------------------------------------------------------------------*/ + half4 rgbyA = rgbyN1 * 0.5 + rgbyP1 * 0.5; + half4 rgbyB = rgbyN2 * 0.25 + rgbyP2 * 0.25 + rgbyA * 0.5; +/*--------------------------------------------------------------------------*/ + bool twoTap = (rgbyB.w < lumaMin) || (rgbyB.w > lumaMax); + if(twoTap) rgbyB.xyz = rgbyA.xyz; + return rgbyB; } +/*==========================================================================*/ +#endif + + + +/*============================================================================ + + FXAA3 CONSOLE - 360 PIXEL SHADER OPTIMIZED PROTOTYPE + +------------------------------------------------------------------------------ +This prototype optimized version thanks to suggestions from Andy Luedke. +Should be fully tex bound in all cases. +As of the FXAA 3.10 release I have not tested this code, +but at least the missing ";" was fixed. +If it does not work, please let me know so I can fix it. +------------------------------------------------------------------------------ +Extra requirements, +(1.) Different inputs: no posPos. +(2.) Different inputs: alias three samplers with different exp bias settings! +(3.) New constants: setup fxaaConst as described below. +============================================================================*/ +#if (FXAA_360_OPT == 1) +/*--------------------------------------------------------------------------*/ +[reduceTempRegUsage(4)] +float4 FxaaPixelShader( + // {xy} = center of pixel + float2 pos, + // Three samplers, + // texExpBias0 = exponent bias 0 + // texExpBiasNeg1 = exponent bias -1 + // texExpBiasNeg2 = exponent bias -2 + // {rgb_} = color in linear or perceptual color space + // {___a} = alpha output is junk value + uniform sampler2D texExpBias0, + uniform sampler2D texExpBiasNeg1, + uniform sampler2D texExpBiasNeg2, + // These must be in physical constant registers and NOT immedates + // Immedates will result in compiler un-optimizing + // width = screen width in pixels + // height = screen height in pixels + fxaaConstDir, // float4(1.0, -1.0, 0.25, -0.25); + fxaaConstInner, // float4(0.5/width, 0.5/height, -0.5/width, -0.5/height); + fxaaConstOuter // float4(8.0/width, 8.0/height, -4.0/width, -4.0/height); +) { +/*--------------------------------------------------------------------------*/ + float4 lumaNwNeSwSe; + asm { + tfetch2D lumaNwNeSwSe.w___, texExpBias0, pos.xy, OffsetX = -0.5, OffsetY = -0.5, UseComputedLOD=false + tfetch2D lumaNwNeSwSe._w__, texExpBias0, pos.xy, OffsetX = 0.5, OffsetY = -0.5, UseComputedLOD=false + tfetch2D lumaNwNeSwSe.__w_, texExpBias0, pos.xy, OffsetX = -0.5, OffsetY = 0.5, UseComputedLOD=false + tfetch2D lumaNwNeSwSe.___w, texExpBias0, pos.xy, OffsetX = 0.5, OffsetY = 0.5, UseComputedLOD=false + }; +/*--------------------------------------------------------------------------*/ + lumaNwNeSwSe.y += 1.0/384.0; + float2 lumaMinTemp = min(lumaNwNeSwSe.xy, lumaNwNeSwSe.zw); + float2 lumaMaxTemp = max(lumaNwNeSwSe.xy, lumaNwNeSwSe.zw); + float lumaMin = min(lumaMinTemp.x, lumaMinTemp.y); + float lumaMax = max(lumaMaxTemp.x, lumaMaxTemp.y); +/*--------------------------------------------------------------------------*/ + float4 rgbyM = tex2Dlod(texExpBias0, float4(pos.xy, 0.0, 0.0)); + float4 lumaMinM = min(lumaMin, rgbyM.w); + float4 lumaMaxM = max(lumaMax, rgbyM.w); + if((lumaMaxM - lumaMinM) < max(FXAA_CONSOLE__EDGE_THRESHOLD_MIN, lumaMax * FXAA_CONSOLE__EDGE_THRESHOLD)) return rgbyM; +/*--------------------------------------------------------------------------*/ + float2 dir; + dir.x = dot(lumaNwNeSwSe, fxaaConstDir.yyxx); + dir.y = dot(lumaNwNeSwSe, fxaaConstDir.xyxy); + dir = normalize(dir); +/*--------------------------------------------------------------------------*/ + float4 dir1 = dir.xyxy * fxaaConstInner.xyzw; +/*--------------------------------------------------------------------------*/ + float4 dir2; + float dirAbsMinTimesC = min(abs(dir.x), abs(dir.y)); + dir2 = saturate(fxaaConstOuter.zzww * dir.xyxy / FXAA_CONSOLE__EDGE_SHARPNESS / dirAbsMinTimesC + 0.5); + dir2 = dir2 * fxaaConstOuter.xyxy + fxaaConstOuter.zwzw; +/*--------------------------------------------------------------------------*/ + float4 rgbyN1 = tex2Dlod(texExpBiasNeg1, float4(pos.xy + dir1.xy, 0.0, 0.0)); + float4 rgbyP1 = tex2Dlod(texExpBiasNeg1, float4(pos.xy + dir1.zw, 0.0, 0.0)); + float4 rgbyN2 = tex2Dlod(texExpBiasNeg2, float4(pos.xy + dir2.xy, 0.0, 0.0)); + float4 rgbyP2 = tex2Dlod(texExpBiasNeg2, float4(pos.xy + dir2.zw, 0.0, 0.0)); +/*--------------------------------------------------------------------------*/ + half4 rgbyA = rgbyN1 + rgbyP1; + half4 rgbyB = rgbyN2 + rgbyP2 * 0.5 + rgbyA; +/*--------------------------------------------------------------------------*/ + float4 rgbyR = ((rgbyB.w - lumaMax) > 0.0) ? rgbyA : rgbyB; + rgbyR = ((rgbyB.w - lumaMin) > 0.0) ? rgbyR : rgbyA; + return rgbyR; } +/*==========================================================================*/ +#endif + + + + +/*============================================================================ + + FXAA3 CONSOLE - OPTIMIZED PS3 PIXEL SHADER (NO EARLY EXIT) + +============================================================================== +The code below does not exactly match the assembly. +I have a feeling that 12 cycles is possible, but was not able to get there. +Might have to increase register count to get full performance. +Note this shader does not use perspective interpolation. + +Use the following cgc options, + + --fenable-bx2 --fastmath --fastprecision --nofloatbindings + +------------------------------------------------------------------------------ + NVSHADERPERF OUTPUT +------------------------------------------------------------------------------ +For reference and to aid in debug, output of NVShaderPerf should match this, + +Shader to schedule: + 0: texpkb h0.w(TRUE), v5.zyxx, #0 + 2: addh h2.z(TRUE), h0.w, constant(0.001953, 0.000000, 0.000000, 0.000000).x + 4: texpkb h0.w(TRUE), v5.xwxx, #0 + 6: addh h0.z(TRUE), -h2, h0.w + 7: texpkb h1.w(TRUE), v5, #0 + 9: addh h0.x(TRUE), h0.z, -h1.w + 10: addh h3.w(TRUE), h0.z, h1 + 11: texpkb h2.w(TRUE), v5.zwzz, #0 + 13: addh h0.z(TRUE), h3.w, -h2.w + 14: addh h0.x(TRUE), h2.w, h0 + 15: nrmh h1.xz(TRUE), h0_n + 16: minh_m8 h0.x(TRUE), |h1|, |h1.z| + 17: maxh h4.w(TRUE), h0, h1 + 18: divx h2.xy(TRUE), h1_n.xzzw, h0_n + 19: movr r1.zw(TRUE), v4.xxxy + 20: madr r2.xz(TRUE), -h1, constant(cConst5.x, cConst5.y, cConst5.z, cConst5.w).zzww, r1.zzww + 22: minh h5.w(TRUE), h0, h1 + 23: texpkb h0(TRUE), r2.xzxx, #0 + 25: madr r0.zw(TRUE), h1.xzxz, constant(cConst5.x, cConst5.y, cConst5.z, cConst5.w), r1 + 27: maxh h4.x(TRUE), h2.z, h2.w + 28: texpkb h1(TRUE), r0.zwzz, #0 + 30: addh_d2 h1(TRUE), h0, h1 + 31: madr r0.xy(TRUE), -h2, constant(cConst5.x, cConst5.y, cConst5.z, cConst5.w).xyxx, r1.zwzz + 33: texpkb h0(TRUE), r0, #0 + 35: minh h4.z(TRUE), h2, h2.w + 36: fenct TRUE + 37: madr r1.xy(TRUE), h2, constant(cConst5.x, cConst5.y, cConst5.z, cConst5.w).xyxx, r1.zwzz + 39: texpkb h2(TRUE), r1, #0 + 41: addh_d2 h0(TRUE), h0, h2 + 42: maxh h2.w(TRUE), h4, h4.x + 43: minh h2.x(TRUE), h5.w, h4.z + 44: addh_d2 h0(TRUE), h0, h1 + 45: slth h2.x(TRUE), h0.w, h2 + 46: sgth h2.w(TRUE), h0, h2 + 47: movh h0(TRUE), h0 + 48: addx.c0 rc(TRUE), h2, h2.w + 49: movh h0(c0.NE.x), h1 + +IPU0 ------ Simplified schedule: -------- +Pass | Unit | uOp | PC: Op +-----+--------+------+------------------------- + 1 | SCT0/1 | mov | 0: TXLr h0.w, g[TEX1].zyxx, const.xxxx, TEX0; + | TEX | txl | 0: TXLr h0.w, g[TEX1].zyxx, const.xxxx, TEX0; + | SCB1 | add | 2: ADDh h2.z, h0.--w-, const.--x-; + | | | + 2 | SCT0/1 | mov | 4: TXLr h0.w, g[TEX1].xwxx, const.xxxx, TEX0; + | TEX | txl | 4: TXLr h0.w, g[TEX1].xwxx, const.xxxx, TEX0; + | SCB1 | add | 6: ADDh h0.z,-h2, h0.--w-; + | | | + 3 | SCT0/1 | mov | 7: TXLr h1.w, g[TEX1], const.xxxx, TEX0; + | TEX | txl | 7: TXLr h1.w, g[TEX1], const.xxxx, TEX0; + | SCB0 | add | 9: ADDh h0.x, h0.z---,-h1.w---; + | SCB1 | add | 10: ADDh h3.w, h0.---z, h1; + | | | + 4 | SCT0/1 | mov | 11: TXLr h2.w, g[TEX1].zwzz, const.xxxx, TEX0; + | TEX | txl | 11: TXLr h2.w, g[TEX1].zwzz, const.xxxx, TEX0; + | SCB0 | add | 14: ADDh h0.x, h2.w---, h0; + | SCB1 | add | 13: ADDh h0.z, h3.--w-,-h2.--w-; + | | | + 5 | SCT1 | mov | 15: NRMh h1.xz, h0; + | SRB | nrm | 15: NRMh h1.xz, h0; + | SCB0 | min | 16: MINh*8 h0.x, |h1|, |h1.z---|; + | SCB1 | max | 17: MAXh h4.w, h0, h1; + | | | + 6 | SCT0 | div | 18: DIVx h2.xy, h1.xz--, h0; + | SCT1 | mov | 19: MOVr r1.zw, g[TEX0].--xy; + | SCB0 | mad | 20: MADr r2.xz,-h1, const.z-w-, r1.z-w-; + | SCB1 | min | 22: MINh h5.w, h0, h1; + | | | + 7 | SCT0/1 | mov | 23: TXLr h0, r2.xzxx, const.xxxx, TEX0; + | TEX | txl | 23: TXLr h0, r2.xzxx, const.xxxx, TEX0; + | SCB0 | max | 27: MAXh h4.x, h2.z---, h2.w---; + | SCB1 | mad | 25: MADr r0.zw, h1.--xz, const, r1; + | | | + 8 | SCT0/1 | mov | 28: TXLr h1, r0.zwzz, const.xxxx, TEX0; + | TEX | txl | 28: TXLr h1, r0.zwzz, const.xxxx, TEX0; + | SCB0/1 | add | 30: ADDh/2 h1, h0, h1; + | | | + 9 | SCT0 | mad | 31: MADr r0.xy,-h2, const.xy--, r1.zw--; + | SCT1 | mov | 33: TXLr h0, r0, const.zzzz, TEX0; + | TEX | txl | 33: TXLr h0, r0, const.zzzz, TEX0; + | SCB1 | min | 35: MINh h4.z, h2, h2.--w-; + | | | + 10 | SCT0 | mad | 37: MADr r1.xy, h2, const.xy--, r1.zw--; + | SCT1 | mov | 39: TXLr h2, r1, const.zzzz, TEX0; + | TEX | txl | 39: TXLr h2, r1, const.zzzz, TEX0; + | SCB0/1 | add | 41: ADDh/2 h0, h0, h2; + | | | + 11 | SCT0 | min | 43: MINh h2.x, h5.w---, h4.z---; + | SCT1 | max | 42: MAXh h2.w, h4, h4.---x; + | SCB0/1 | add | 44: ADDh/2 h0, h0, h1; + | | | + 12 | SCT0 | set | 45: SLTh h2.x, h0.w---, h2; + | SCT1 | set | 46: SGTh h2.w, h0, h2; + | SCB0/1 | mul | 47: MOVh h0, h0; + | | | + 13 | SCT0 | mad | 48: ADDxc0_s rc, h2, h2.w---; + | SCB0/1 | mul | 49: MOVh h0(NE0.xxxx), h1; + +Pass SCT TEX SCB + 1: 0% 100% 25% + 2: 0% 100% 25% + 3: 0% 100% 50% + 4: 0% 100% 50% + 5: 0% 0% 50% + 6: 100% 0% 75% + 7: 0% 100% 75% + 8: 0% 100% 100% + 9: 0% 100% 25% + 10: 0% 100% 100% + 11: 50% 0% 100% + 12: 50% 0% 100% + 13: 25% 0% 100% + +MEAN: 17% 61% 67% + +Pass SCT0 SCT1 TEX SCB0 SCB1 + 1: 0% 0% 100% 0% 100% + 2: 0% 0% 100% 0% 100% + 3: 0% 0% 100% 100% 100% + 4: 0% 0% 100% 100% 100% + 5: 0% 0% 0% 100% 100% + 6: 100% 100% 0% 100% 100% + 7: 0% 0% 100% 100% 100% + 8: 0% 0% 100% 100% 100% + 9: 0% 0% 100% 0% 100% + 10: 0% 0% 100% 100% 100% + 11: 100% 100% 0% 100% 100% + 12: 100% 100% 0% 100% 100% + 13: 100% 0% 0% 100% 100% + +MEAN: 30% 23% 61% 76% 100% +Fragment Performance Setup: Driver RSX Compiler, GPU RSX, Flags 0x5 +Results 13 cycles, 3 r regs, 923,076,923 pixels/s +============================================================================*/ +#if (FXAA_PS3 == 1) && (FXAA_EARLY_EXIT == 0) +/*--------------------------------------------------------------------------*/ +#pragma disablepc all +#pragma option O3 +#pragma option OutColorPrec=fp16 +#pragma texformat default RGBA8 +/*==========================================================================*/ +half4 FxaaPixelShader( + // {xy} = center of pixel + float2 pos, + // {xy__} = upper left of pixel + // {__zw} = lower right of pixel + float4 posPos, + // {rgb_} = color in linear or perceptual color space + // {___a} = luma in perceptual color space (not linear) + sampler2D tex, + // This must be from a constant/uniform. + // {xy} = rcpFrame not used on PS3 + float2 rcpFrame, + // This must be from a constant/uniform. + // {x___} = 2.0/screenWidthInPixels + // {_y__} = 2.0/screenHeightInPixels + // {__z_} = 0.5/screenWidthInPixels + // {___w} = 0.5/screenHeightInPixels + float4 rcpFrameOpt +) { +/*--------------------------------------------------------------------------*/ +// (1) + half4 dir; + half4 lumaNe = h4tex2Dlod(tex, half4(posPos.zy, 0, 0)); + lumaNe.w += half(1.0/512.0); + dir.x = -lumaNe.w; + dir.z = -lumaNe.w; +/*--------------------------------------------------------------------------*/ +// (2) + half4 lumaSw = h4tex2Dlod(tex, half4(posPos.xw, 0, 0)); + dir.x += lumaSw.w; + dir.z += lumaSw.w; +/*--------------------------------------------------------------------------*/ +// (3) + half4 lumaNw = h4tex2Dlod(tex, half4(posPos.xy, 0, 0)); + dir.x -= lumaNw.w; + dir.z += lumaNw.w; +/*--------------------------------------------------------------------------*/ +// (4) + half4 lumaSe = h4tex2Dlod(tex, half4(posPos.zw, 0, 0)); + dir.x += lumaSe.w; + dir.z -= lumaSe.w; +/*--------------------------------------------------------------------------*/ +// (5) + half4 dir1_pos; + dir1_pos.xy = normalize(dir.xyz).xz; + half dirAbsMinTimesC = min(abs(dir1_pos.x), abs(dir1_pos.y)) * half(FXAA_CONSOLE__EDGE_SHARPNESS); +/*--------------------------------------------------------------------------*/ +// (6) + half4 dir2_pos; + dir2_pos.xy = clamp(dir1_pos.xy / dirAbsMinTimesC, half(-2.0), half(2.0)); + dir1_pos.zw = pos.xy; + dir2_pos.zw = pos.xy; + half4 temp1N; + temp1N.xy = dir1_pos.zw - dir1_pos.xy * rcpFrameOpt.zw; +/*--------------------------------------------------------------------------*/ +// (7) + temp1N = h4tex2Dlod(tex, half4(temp1N.xy, 0.0, 0.0)); + half4 rgby1; + rgby1.xy = dir1_pos.zw + dir1_pos.xy * rcpFrameOpt.zw; +/*--------------------------------------------------------------------------*/ +// (8) + rgby1 = h4tex2Dlod(tex, half4(rgby1.xy, 0.0, 0.0)); + rgby1 = (temp1N + rgby1) * 0.5; +/*--------------------------------------------------------------------------*/ +// (9) + half4 temp2N; + temp2N.xy = dir2_pos.zw - dir2_pos.xy * rcpFrameOpt.xy; + temp2N = h4tex2Dlod(tex, half4(temp2N.xy, 0.0, 0.0)); +/*--------------------------------------------------------------------------*/ +// (10) + half4 rgby2; + rgby2.xy = dir2_pos.zw + dir2_pos.xy * rcpFrameOpt.xy; + rgby2 = h4tex2Dlod(tex, half4(rgby2.xy, 0.0, 0.0)); + rgby2 = (temp2N + rgby2) * 0.5; +/*--------------------------------------------------------------------------*/ +// (11) + // compilier moves these scalar ops up to other cycles + half lumaMin = min(min(lumaNw.w, lumaSw.w), min(lumaNe.w, lumaSe.w)); + half lumaMax = max(max(lumaNw.w, lumaSw.w), max(lumaNe.w, lumaSe.w)); + rgby2 = (rgby2 + rgby1) * 0.5; +/*--------------------------------------------------------------------------*/ +// (12) + bool twoTapLt = rgby2.w < lumaMin; + bool twoTapGt = rgby2.w > lumaMax; +/*--------------------------------------------------------------------------*/ +// (13) + if(twoTapLt || twoTapGt) rgby2 = rgby1; +/*--------------------------------------------------------------------------*/ + return rgby2; } +/*==========================================================================*/ +#endif + + + +/*============================================================================ + + FXAA3 CONSOLE - OPTIMIZED PS3 PIXEL SHADER (WITH EARLY EXIT) + +============================================================================== +The code mostly matches the assembly. +I have a feeling that 14 cycles is possible, but was not able to get there. +Might have to increase register count to get full performance. +Note this shader does not use perspective interpolation. + +Use the following cgc options, + + --fenable-bx2 --fastmath --fastprecision --nofloatbindings + +------------------------------------------------------------------------------ + NVSHADERPERF OUTPUT +------------------------------------------------------------------------------ +For reference and to aid in debug, output of NVShaderPerf should match this, + +Shader to schedule: + 0: texpkb h0.w(TRUE), v5.zyxx, #0 + 2: addh h2.y(TRUE), h0.w, constant(0.001953, 0.000000, 0.000000, 0.000000).x + 4: texpkb h1.w(TRUE), v5.xwxx, #0 + 6: addh h0.x(TRUE), h1.w, -h2.y + 7: texpkb h2.w(TRUE), v5.zwzz, #0 + 9: minh h4.w(TRUE), h2.y, h2 + 10: maxh h5.x(TRUE), h2.y, h2.w + 11: texpkb h0.w(TRUE), v5, #0 + 13: addh h3.w(TRUE), -h0, h0.x + 14: addh h0.x(TRUE), h0.w, h0 + 15: addh h0.z(TRUE), -h2.w, h0.x + 16: addh h0.x(TRUE), h2.w, h3.w + 17: minh h5.y(TRUE), h0.w, h1.w + 18: nrmh h2.xz(TRUE), h0_n + 19: minh_m8 h2.w(TRUE), |h2.x|, |h2.z| + 20: divx h4.xy(TRUE), h2_n.xzzw, h2_n.w + 21: movr r1.zw(TRUE), v4.xxxy + 22: maxh h2.w(TRUE), h0, h1 + 23: fenct TRUE + 24: madr r0.xy(TRUE), -h2.xzzw, constant(cConst5.x, cConst5.y, cConst5.z, cConst5.w).zwzz, r1.zwzz + 26: texpkb h0(TRUE), r0, #0 + 28: maxh h5.x(TRUE), h2.w, h5 + 29: minh h5.w(TRUE), h5.y, h4 + 30: madr r1.xy(TRUE), h2.xzzw, constant(cConst5.x, cConst5.y, cConst5.z, cConst5.w).zwzz, r1.zwzz + 32: texpkb h2(TRUE), r1, #0 + 34: addh_d2 h2(TRUE), h0, h2 + 35: texpkb h1(TRUE), v4, #0 + 37: maxh h5.y(TRUE), h5.x, h1.w + 38: minh h4.w(TRUE), h1, h5 + 39: madr r0.xy(TRUE), -h4, constant(cConst5.x, cConst5.y, cConst5.z, cConst5.w).xyxx, r1.zwzz + 41: texpkb h0(TRUE), r0, #0 + 43: addh_m8 h5.z(TRUE), h5.y, -h4.w + 44: madr r2.xy(TRUE), h4, constant(cConst5.x, cConst5.y, cConst5.z, cConst5.w).xyxx, r1.zwzz + 46: texpkb h3(TRUE), r2, #0 + 48: addh_d2 h0(TRUE), h0, h3 + 49: addh_d2 h3(TRUE), h0, h2 + 50: movh h0(TRUE), h3 + 51: slth h3.x(TRUE), h3.w, h5.w + 52: sgth h3.w(TRUE), h3, h5.x + 53: addx.c0 rc(TRUE), h3.x, h3 + 54: slth.c0 rc(TRUE), h5.z, h5 + 55: movh h0(c0.NE.w), h2 + 56: movh h0(c0.NE.x), h1 + +IPU0 ------ Simplified schedule: -------- +Pass | Unit | uOp | PC: Op +-----+--------+------+------------------------- + 1 | SCT0/1 | mov | 0: TXLr h0.w, g[TEX1].zyxx, const.xxxx, TEX0; + | TEX | txl | 0: TXLr h0.w, g[TEX1].zyxx, const.xxxx, TEX0; + | SCB0 | add | 2: ADDh h2.y, h0.-w--, const.-x--; + | | | + 2 | SCT0/1 | mov | 4: TXLr h1.w, g[TEX1].xwxx, const.xxxx, TEX0; + | TEX | txl | 4: TXLr h1.w, g[TEX1].xwxx, const.xxxx, TEX0; + | SCB0 | add | 6: ADDh h0.x, h1.w---,-h2.y---; + | | | + 3 | SCT0/1 | mov | 7: TXLr h2.w, g[TEX1].zwzz, const.xxxx, TEX0; + | TEX | txl | 7: TXLr h2.w, g[TEX1].zwzz, const.xxxx, TEX0; + | SCB0 | max | 10: MAXh h5.x, h2.y---, h2.w---; + | SCB1 | min | 9: MINh h4.w, h2.---y, h2; + | | | + 4 | SCT0/1 | mov | 11: TXLr h0.w, g[TEX1], const.xxxx, TEX0; + | TEX | txl | 11: TXLr h0.w, g[TEX1], const.xxxx, TEX0; + | SCB0 | add | 14: ADDh h0.x, h0.w---, h0; + | SCB1 | add | 13: ADDh h3.w,-h0, h0.---x; + | | | + 5 | SCT0 | mad | 16: ADDh h0.x, h2.w---, h3.w---; + | SCT1 | mad | 15: ADDh h0.z,-h2.--w-, h0.--x-; + | SCB0 | min | 17: MINh h5.y, h0.-w--, h1.-w--; + | | | + 6 | SCT1 | mov | 18: NRMh h2.xz, h0; + | SRB | nrm | 18: NRMh h2.xz, h0; + | SCB1 | min | 19: MINh*8 h2.w, |h2.---x|, |h2.---z|; + | | | + 7 | SCT0 | div | 20: DIVx h4.xy, h2.xz--, h2.ww--; + | SCT1 | mov | 21: MOVr r1.zw, g[TEX0].--xy; + | SCB1 | max | 22: MAXh h2.w, h0, h1; + | | | + 8 | SCT0 | mad | 24: MADr r0.xy,-h2.xz--, const.zw--, r1.zw--; + | SCT1 | mov | 26: TXLr h0, r0, const.xxxx, TEX0; + | TEX | txl | 26: TXLr h0, r0, const.xxxx, TEX0; + | SCB0 | max | 28: MAXh h5.x, h2.w---, h5; + | SCB1 | min | 29: MINh h5.w, h5.---y, h4; + | | | + 9 | SCT0 | mad | 30: MADr r1.xy, h2.xz--, const.zw--, r1.zw--; + | SCT1 | mov | 32: TXLr h2, r1, const.xxxx, TEX0; + | TEX | txl | 32: TXLr h2, r1, const.xxxx, TEX0; + | SCB0/1 | add | 34: ADDh/2 h2, h0, h2; + | | | + 10 | SCT0/1 | mov | 35: TXLr h1, g[TEX0], const.xxxx, TEX0; + | TEX | txl | 35: TXLr h1, g[TEX0], const.xxxx, TEX0; + | SCB0 | max | 37: MAXh h5.y, h5.-x--, h1.-w--; + | SCB1 | min | 38: MINh h4.w, h1, h5; + | | | + 11 | SCT0 | mad | 39: MADr r0.xy,-h4, const.xy--, r1.zw--; + | SCT1 | mov | 41: TXLr h0, r0, const.zzzz, TEX0; + | TEX | txl | 41: TXLr h0, r0, const.zzzz, TEX0; + | SCB0 | mad | 44: MADr r2.xy, h4, const.xy--, r1.zw--; + | SCB1 | add | 43: ADDh*8 h5.z, h5.--y-,-h4.--w-; + | | | + 12 | SCT0/1 | mov | 46: TXLr h3, r2, const.xxxx, TEX0; + | TEX | txl | 46: TXLr h3, r2, const.xxxx, TEX0; + | SCB0/1 | add | 48: ADDh/2 h0, h0, h3; + | | | + 13 | SCT0/1 | mad | 49: ADDh/2 h3, h0, h2; + | SCB0/1 | mul | 50: MOVh h0, h3; + | | | + 14 | SCT0 | set | 51: SLTh h3.x, h3.w---, h5.w---; + | SCT1 | set | 52: SGTh h3.w, h3, h5.---x; + | SCB0 | set | 54: SLThc0 rc, h5.z---, h5; + | SCB1 | add | 53: ADDxc0_s rc, h3.---x, h3; + | | | + 15 | SCT0/1 | mul | 55: MOVh h0(NE0.wwww), h2; + | SCB0/1 | mul | 56: MOVh h0(NE0.xxxx), h1; + +Pass SCT TEX SCB + 1: 0% 100% 25% + 2: 0% 100% 25% + 3: 0% 100% 50% + 4: 0% 100% 50% + 5: 50% 0% 25% + 6: 0% 0% 25% + 7: 100% 0% 25% + 8: 0% 100% 50% + 9: 0% 100% 100% + 10: 0% 100% 50% + 11: 0% 100% 75% + 12: 0% 100% 100% + 13: 100% 0% 100% + 14: 50% 0% 50% + 15: 100% 0% 100% + +MEAN: 26% 60% 56% + +Pass SCT0 SCT1 TEX SCB0 SCB1 + 1: 0% 0% 100% 100% 0% + 2: 0% 0% 100% 100% 0% + 3: 0% 0% 100% 100% 100% + 4: 0% 0% 100% 100% 100% + 5: 100% 100% 0% 100% 0% + 6: 0% 0% 0% 0% 100% + 7: 100% 100% 0% 0% 100% + 8: 0% 0% 100% 100% 100% + 9: 0% 0% 100% 100% 100% + 10: 0% 0% 100% 100% 100% + 11: 0% 0% 100% 100% 100% + 12: 0% 0% 100% 100% 100% + 13: 100% 100% 0% 100% 100% + 14: 100% 100% 0% 100% 100% + 15: 100% 100% 0% 100% 100% + +MEAN: 33% 33% 60% 86% 80% +Fragment Performance Setup: Driver RSX Compiler, GPU RSX, Flags 0x5 +Results 15 cycles, 3 r regs, 800,000,000 pixels/s +============================================================================*/ +#if (FXAA_PS3 == 1) && (FXAA_EARLY_EXIT == 1) +/*--------------------------------------------------------------------------*/ +#pragma disablepc all +#pragma option O2 +#pragma option OutColorPrec=fp16 +#pragma texformat default RGBA8 +/*==========================================================================*/ +half4 FxaaPixelShader( + // {xy} = center of pixel + float2 pos, + // {xy__} = upper left of pixel + // {__zw} = lower right of pixel + float4 posPos, + // {rgb_} = color in linear or perceptual color space + // {___a} = luma in perceptual color space (not linear) + sampler2D tex, + // This must be from a constant/uniform. + // {xy} = rcpFrame not used on PS3 + float2 rcpFrame, + // This must be from a constant/uniform. + // {x___} = 2.0/screenWidthInPixels + // {_y__} = 2.0/screenHeightInPixels + // {__z_} = 0.5/screenWidthInPixels + // {___w} = 0.5/screenHeightInPixels + float4 rcpFrameOpt +) { +/*--------------------------------------------------------------------------*/ +// (1) + half4 rgbyNe = h4tex2Dlod(tex, half4(posPos.zy, 0, 0)); + half lumaNe = rgbyNe.w + half(1.0/512.0); +/*--------------------------------------------------------------------------*/ +// (2) + half4 lumaSw = h4tex2Dlod(tex, half4(posPos.xw, 0, 0)); + half lumaSwNegNe = lumaSw.w - lumaNe; +/*--------------------------------------------------------------------------*/ +// (3) + half4 lumaNw = h4tex2Dlod(tex, half4(posPos.xy, 0, 0)); + half lumaMaxNwSw = max(lumaNw.w, lumaSw.w); + half lumaMinNwSw = min(lumaNw.w, lumaSw.w); +/*--------------------------------------------------------------------------*/ +// (4) + half4 lumaSe = h4tex2Dlod(tex, half4(posPos.zw, 0, 0)); + half dirZ = lumaNw.w + lumaSwNegNe; + half dirX = -lumaNw.w + lumaSwNegNe; +/*--------------------------------------------------------------------------*/ +// (5) + half3 dir; + dir.y = 0.0; + dir.x = lumaSe.w + dirX; + dir.z = -lumaSe.w + dirZ; + half lumaMinNeSe = min(lumaNe, lumaSe.w); +/*--------------------------------------------------------------------------*/ +// (6) + half4 dir1_pos; + dir1_pos.xy = normalize(dir).xz; + half dirAbsMinTimes8 = min(abs(dir1_pos.x), abs(dir1_pos.y)) * half(FXAA_CONSOLE__EDGE_SHARPNESS); +/*--------------------------------------------------------------------------*/ +// (7) + half4 dir2_pos; + dir2_pos.xy = clamp(dir1_pos.xy / dirAbsMinTimes8, half(-2.0), half(2.0)); + dir1_pos.zw = pos.xy; + dir2_pos.zw = pos.xy; + half lumaMaxNeSe = max(lumaNe, lumaSe.w); +/*--------------------------------------------------------------------------*/ +// (8) + half4 temp1N; + temp1N.xy = dir1_pos.zw - dir1_pos.xy * rcpFrameOpt.zw; + temp1N = h4tex2Dlod(tex, half4(temp1N.xy, 0.0, 0.0)); + half lumaMax = max(lumaMaxNwSw, lumaMaxNeSe); + half lumaMin = min(lumaMinNwSw, lumaMinNeSe); +/*--------------------------------------------------------------------------*/ +// (9) + half4 rgby1; + rgby1.xy = dir1_pos.zw + dir1_pos.xy * rcpFrameOpt.zw; + rgby1 = h4tex2Dlod(tex, half4(rgby1.xy, 0.0, 0.0)); + rgby1 = (temp1N + rgby1) * 0.5; +/*--------------------------------------------------------------------------*/ +// (10) + half4 rgbyM = h4tex2Dlod(tex, half4(pos.xy, 0.0, 0.0)); + half lumaMaxM = max(lumaMax, rgbyM.w); + half lumaMinM = min(lumaMin, rgbyM.w); +/*--------------------------------------------------------------------------*/ +// (11) + half4 temp2N; + temp2N.xy = dir2_pos.zw - dir2_pos.xy * rcpFrameOpt.xy; + temp2N = h4tex2Dlod(tex, half4(temp2N.xy, 0.0, 0.0)); + half4 rgby2; + rgby2.xy = dir2_pos.zw + dir2_pos.xy * rcpFrameOpt.xy; + half lumaRangeM = (lumaMaxM - lumaMinM) / FXAA_CONSOLE__EDGE_THRESHOLD; +/*--------------------------------------------------------------------------*/ +// (12) + rgby2 = h4tex2Dlod(tex, half4(rgby2.xy, 0.0, 0.0)); + rgby2 = (temp2N + rgby2) * 0.5; +/*--------------------------------------------------------------------------*/ +// (13) + rgby2 = (rgby2 + rgby1) * 0.5; +/*--------------------------------------------------------------------------*/ +// (14) + bool twoTapLt = rgby2.w < lumaMin; + bool twoTapGt = rgby2.w > lumaMax; + bool earlyExit = lumaRangeM < lumaMax; + bool twoTap = twoTapLt || twoTapGt; +/*--------------------------------------------------------------------------*/ +// (15) + if(twoTap) rgby2 = rgby1; + if(earlyExit) rgby2 = rgbyM; +/*--------------------------------------------------------------------------*/ + return rgby2; } +/*==========================================================================*/ +#endif + + + +/*============================================================================ + + FXAA3 CONSOLE - PC PIXEL SHADER + +------------------------------------------------------------------------------ +Using a modified version of the PS3 version here to best target old hardware. +============================================================================*/ +#if (FXAA_PC_CONSOLE == 1) +/*--------------------------------------------------------------------------*/ +half4 FxaaPixelShader( + // {xy} = center of pixel + float2 pos, + // {xy__} = upper left of pixel + // {__zw} = lower right of pixel + float4 posPos, + // {rgb_} = color in linear or perceptual color space + // {___a} = alpha output is junk value + FxaaTex tex, + // This must be from a constant/uniform. + // {xy} = rcpFrame not used on PC version of FXAA Console + float2 rcpFrame, + // This must be from a constant/uniform. + // {x___} = 2.0/screenWidthInPixels + // {_y__} = 2.0/screenHeightInPixels + // {__z_} = 0.5/screenWidthInPixels + // {___w} = 0.5/screenHeightInPixels + float4 rcpFrameOpt +) { +/*--------------------------------------------------------------------------*/ + half4 dir; + dir.y = 0.0; + half4 lumaNe = FxaaTexTop(tex, posPos.zy); + lumaNe.w += half(1.0/384.0); + dir.x = -lumaNe.w; + dir.z = -lumaNe.w; +/*--------------------------------------------------------------------------*/ + half4 lumaSw = FxaaTexTop(tex, posPos.xw); + dir.x += lumaSw.w; + dir.z += lumaSw.w; +/*--------------------------------------------------------------------------*/ + half4 lumaNw = FxaaTexTop(tex, posPos.xy); + dir.x -= lumaNw.w; + dir.z += lumaNw.w; +/*--------------------------------------------------------------------------*/ + half4 lumaSe = FxaaTexTop(tex, posPos.zw); + dir.x += lumaSe.w; + dir.z -= lumaSe.w; +/*==========================================================================*/ + #if (FXAA_EARLY_EXIT == 1) + half4 rgbyM = FxaaTexTop(tex, pos.xy); +/*--------------------------------------------------------------------------*/ + half lumaMin = min(min(lumaNw.w, lumaSw.w), min(lumaNe.w, lumaSe.w)); + half lumaMax = max(max(lumaNw.w, lumaSw.w), max(lumaNe.w, lumaSe.w)); +/*--------------------------------------------------------------------------*/ + half lumaMinM = min(lumaMin, rgbyM.w); + half lumaMaxM = max(lumaMax, rgbyM.w); +/*--------------------------------------------------------------------------*/ + if((lumaMaxM - lumaMinM) < max(FXAA_CONSOLE__EDGE_THRESHOLD_MIN, lumaMax * FXAA_CONSOLE__EDGE_THRESHOLD)) + #if (FXAA_DISCARD == 1) + FxaaDiscard; + #else + return rgbyM; + #endif + #endif +/*==========================================================================*/ + half4 dir1_pos; + dir1_pos.xy = normalize(dir.xyz).xz; + half dirAbsMinTimesC = min(abs(dir1_pos.x), abs(dir1_pos.y)) * half(FXAA_CONSOLE__EDGE_SHARPNESS); +/*--------------------------------------------------------------------------*/ + half4 dir2_pos; + dir2_pos.xy = clamp(dir1_pos.xy / dirAbsMinTimesC, half(-2.0), half(2.0)); + dir1_pos.zw = pos.xy; + dir2_pos.zw = pos.xy; + half4 temp1N; + temp1N.xy = dir1_pos.zw - dir1_pos.xy * rcpFrameOpt.zw; +/*--------------------------------------------------------------------------*/ + temp1N = FxaaTexTop(tex, temp1N.xy); + half4 rgby1; + rgby1.xy = dir1_pos.zw + dir1_pos.xy * rcpFrameOpt.zw; +/*--------------------------------------------------------------------------*/ + rgby1 = FxaaTexTop(tex, rgby1.xy); + rgby1 = (temp1N + rgby1) * 0.5; +/*--------------------------------------------------------------------------*/ + half4 temp2N; + temp2N.xy = dir2_pos.zw - dir2_pos.xy * rcpFrameOpt.xy; + temp2N = FxaaTexTop(tex, temp2N.xy); +/*--------------------------------------------------------------------------*/ + half4 rgby2; + rgby2.xy = dir2_pos.zw + dir2_pos.xy * rcpFrameOpt.xy; + rgby2 = FxaaTexTop(tex, rgby2.xy); + rgby2 = (temp2N + rgby2) * 0.5; +/*--------------------------------------------------------------------------*/ + #if (FXAA_EARLY_EXIT == 0) + half lumaMin = min(min(lumaNw.w, lumaSw.w), min(lumaNe.w, lumaSe.w)); + half lumaMax = max(max(lumaNw.w, lumaSw.w), max(lumaNe.w, lumaSe.w)); + #endif + rgby2 = (rgby2 + rgby1) * 0.5; +/*--------------------------------------------------------------------------*/ + bool twoTapLt = rgby2.w < lumaMin; + bool twoTapGt = rgby2.w > lumaMax; +/*--------------------------------------------------------------------------*/ + if(twoTapLt || twoTapGt) rgby2 = rgby1; +/*--------------------------------------------------------------------------*/ + return rgby2; } +/*==========================================================================*/ +#endif + + + +/*============================================================================ + + FXAA3 QUALITY - PC + +============================================================================*/ +#if (FXAA_PC == 1) +/*--------------------------------------------------------------------------*/ +float4 FxaaPixelShader( + // {xy} = center of pixel + float2 pos, + // {xyzw} = not used on FXAA3 Quality + float4 posPos, + // {rgb_} = color in linear or perceptual color space + // {___a} = luma in perceptual color space (not linear) + FxaaTex tex, + // This must be from a constant/uniform. + // {x_} = 1.0/screenWidthInPixels + // {_y} = 1.0/screenHeightInPixels + float2 rcpFrame, + // {xyzw} = not used on FXAA3 Quality + float4 rcpFrameOpt +) { +/*--------------------------------------------------------------------------*/ + float2 posM; + posM.x = pos.x; + posM.y = pos.y; + #if (FXAA_GATHER4_ALPHA == 1) + #if (FXAA_DISCARD == 0) + float4 rgbyM = FxaaTexTop(tex, posM); + #define lumaM rgbyM.w + #endif + float4 luma4A = FxaaTexAlpha4(tex, posM, rcpFrame.xy); + float4 luma4B = FxaaTexOffAlpha4(tex, posM, FxaaInt2(-1, -1), rcpFrame.xy); + #if (FXAA_DISCARD == 1) + #define lumaM luma4A.w + #endif + #define lumaE luma4A.z + #define lumaS luma4A.x + #define lumaSE luma4A.y + #define lumaNW luma4B.w + #define lumaN luma4B.z + #define lumaW luma4B.x + #else + float4 rgbyM = FxaaTexTop(tex, posM); + #define lumaM rgbyM.w + float lumaS = FxaaTexOff(tex, posM, FxaaInt2( 0, 1), rcpFrame.xy).w; + float lumaE = FxaaTexOff(tex, posM, FxaaInt2( 1, 0), rcpFrame.xy).w; + float lumaN = FxaaTexOff(tex, posM, FxaaInt2( 0,-1), rcpFrame.xy).w; + float lumaW = FxaaTexOff(tex, posM, FxaaInt2(-1, 0), rcpFrame.xy).w; + #endif +/*--------------------------------------------------------------------------*/ + float maxSM = max(lumaS, lumaM); + float minSM = min(lumaS, lumaM); + float maxESM = max(lumaE, maxSM); + float minESM = min(lumaE, minSM); + float maxWN = max(lumaN, lumaW); + float minWN = min(lumaN, lumaW); + float rangeMax = max(maxWN, maxESM); + float rangeMin = min(minWN, minESM); + float rangeMaxScaled = rangeMax * FXAA_QUALITY__EDGE_THRESHOLD; + float range = rangeMax - rangeMin; + float rangeMaxClamped = max(FXAA_QUALITY__EDGE_THRESHOLD_MIN, rangeMaxScaled); + bool earlyExit = range < rangeMaxClamped; +/*--------------------------------------------------------------------------*/ + if(earlyExit) + #if (FXAA_DISCARD == 1) + FxaaDiscard; + #else + return rgbyM; + #endif +/*--------------------------------------------------------------------------*/ + #if (FXAA_GATHER4_ALPHA == 0) + float lumaNW = FxaaTexOff(tex, posM, FxaaInt2(-1,-1), rcpFrame.xy).w; + float lumaSE = FxaaTexOff(tex, posM, FxaaInt2( 1, 1), rcpFrame.xy).w; + float lumaNE = FxaaTexOff(tex, posM, FxaaInt2( 1,-1), rcpFrame.xy).w; + float lumaSW = FxaaTexOff(tex, posM, FxaaInt2(-1, 1), rcpFrame.xy).w; + #else + float lumaNE = FxaaTexOff(tex, posM, FxaaInt2(1, -1), rcpFrame.xy).w; + float lumaSW = FxaaTexOff(tex, posM, FxaaInt2(-1, 1), rcpFrame.xy).w; + #endif +/*--------------------------------------------------------------------------*/ + float lumaNS = lumaN + lumaS; + float lumaWE = lumaW + lumaE; + float subpixRcpRange = 1.0/range; + float subpixNSWE = lumaNS + lumaWE; + float edgeHorz1 = (-2.0 * lumaM) + lumaNS; + float edgeVert1 = (-2.0 * lumaM) + lumaWE; +/*--------------------------------------------------------------------------*/ + float lumaNESE = lumaNE + lumaSE; + float lumaNWNE = lumaNW + lumaNE; + float edgeHorz2 = (-2.0 * lumaE) + lumaNESE; + float edgeVert2 = (-2.0 * lumaN) + lumaNWNE; +/*--------------------------------------------------------------------------*/ + float lumaNWSW = lumaNW + lumaSW; + float lumaSWSE = lumaSW + lumaSE; + float edgeHorz4 = (abs(edgeHorz1) * 2.0) + abs(edgeHorz2); + float edgeVert4 = (abs(edgeVert1) * 2.0) + abs(edgeVert2); + float edgeHorz3 = (-2.0 * lumaW) + lumaNWSW; + float edgeVert3 = (-2.0 * lumaS) + lumaSWSE; + float edgeHorz = abs(edgeHorz3) + edgeHorz4; + float edgeVert = abs(edgeVert3) + edgeVert4; +/*--------------------------------------------------------------------------*/ + float subpixNWSWNESE = lumaNWSW + lumaNESE; + float lengthSign = rcpFrame.x; + bool horzSpan = edgeHorz >= edgeVert; + float subpixA = subpixNSWE * 2.0 + subpixNWSWNESE; +/*--------------------------------------------------------------------------*/ + if(!horzSpan) lumaN = lumaW; + if(!horzSpan) lumaS = lumaE; + if(horzSpan) lengthSign = rcpFrame.y; + float subpixB = (subpixA * (1.0/12.0)) - lumaM; +/*--------------------------------------------------------------------------*/ + float gradientN = lumaN - lumaM; + float gradientS = lumaS - lumaM; + float lumaNN = lumaN + lumaM; + float lumaSS = lumaS + lumaM; + bool pairN = abs(gradientN) >= abs(gradientS); + float gradient = max(abs(gradientN), abs(gradientS)); + if(pairN) lengthSign = -lengthSign; + float subpixC = FxaaSat(abs(subpixB) * subpixRcpRange); +/*--------------------------------------------------------------------------*/ + float2 posB; + posB.x = posM.x; + posB.y = posM.y; + float2 offNP; + offNP.x = (!horzSpan) ? 0.0 : rcpFrame.x; + offNP.y = ( horzSpan) ? 0.0 : rcpFrame.y; + if(!horzSpan) posB.x += lengthSign * 0.5; + if( horzSpan) posB.y += lengthSign * 0.5; +/*--------------------------------------------------------------------------*/ + float2 posN; + posN.x = posB.x - offNP.x * FXAA_QUALITY__P0; + posN.y = posB.y - offNP.y * FXAA_QUALITY__P0; + float2 posP; + posP.x = posB.x + offNP.x * FXAA_QUALITY__P0; + posP.y = posB.y + offNP.y * FXAA_QUALITY__P0; + float subpixD = ((-2.0)*subpixC) + 3.0; + float lumaEndN = FxaaTexTop(tex, posN).w; + float subpixE = subpixC * subpixC; + float lumaEndP = FxaaTexTop(tex, posP).w; +/*--------------------------------------------------------------------------*/ + if(!pairN) lumaNN = lumaSS; + float gradientScaled = gradient * 1.0/4.0; + float lumaMM = lumaM - lumaNN * 0.5; + float subpixF = subpixD * subpixE; + bool lumaMLTZero = lumaMM < 0.0; +/*--------------------------------------------------------------------------*/ + lumaEndN -= lumaNN * 0.5; + lumaEndP -= lumaNN * 0.5; + bool doneN = abs(lumaEndN) >= gradientScaled; + bool doneP = abs(lumaEndP) >= gradientScaled; + if(!doneN) posN.x -= offNP.x * FXAA_QUALITY__P1; + if(!doneN) posN.y -= offNP.y * FXAA_QUALITY__P1; + bool doneNP = (!doneN) || (!doneP); + if(!doneP) posP.x += offNP.x * FXAA_QUALITY__P1; + if(!doneP) posP.y += offNP.y * FXAA_QUALITY__P1; +/*--------------------------------------------------------------------------*/ + if(doneNP) { + if(!doneN) lumaEndN = FxaaTexTop(tex, posN.xy).w; + if(!doneP) lumaEndP = FxaaTexTop(tex, posP.xy).w; + if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5; + if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5; + doneN = abs(lumaEndN) >= gradientScaled; + doneP = abs(lumaEndP) >= gradientScaled; + if(!doneN) posN.x -= offNP.x * FXAA_QUALITY__P2; + if(!doneN) posN.y -= offNP.y * FXAA_QUALITY__P2; + doneNP = (!doneN) || (!doneP); + if(!doneP) posP.x += offNP.x * FXAA_QUALITY__P2; + if(!doneP) posP.y += offNP.y * FXAA_QUALITY__P2; +/*--------------------------------------------------------------------------*/ + #if (FXAA_QUALITY__PS > 3) + if(doneNP) { + if(!doneN) lumaEndN = FxaaTexTop(tex, posN.xy).w; + if(!doneP) lumaEndP = FxaaTexTop(tex, posP.xy).w; + if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5; + if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5; + doneN = abs(lumaEndN) >= gradientScaled; + doneP = abs(lumaEndP) >= gradientScaled; + if(!doneN) posN.x -= offNP.x * FXAA_QUALITY__P3; + if(!doneN) posN.y -= offNP.y * FXAA_QUALITY__P3; + doneNP = (!doneN) || (!doneP); + if(!doneP) posP.x += offNP.x * FXAA_QUALITY__P3; + if(!doneP) posP.y += offNP.y * FXAA_QUALITY__P3; +/*--------------------------------------------------------------------------*/ + #if (FXAA_QUALITY__PS > 4) + if(doneNP) { + if(!doneN) lumaEndN = FxaaTexTop(tex, posN.xy).w; + if(!doneP) lumaEndP = FxaaTexTop(tex, posP.xy).w; + if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5; + if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5; + doneN = abs(lumaEndN) >= gradientScaled; + doneP = abs(lumaEndP) >= gradientScaled; + if(!doneN) posN.x -= offNP.x * FXAA_QUALITY__P4; + if(!doneN) posN.y -= offNP.y * FXAA_QUALITY__P4; + doneNP = (!doneN) || (!doneP); + if(!doneP) posP.x += offNP.x * FXAA_QUALITY__P4; + if(!doneP) posP.y += offNP.y * FXAA_QUALITY__P4; +/*--------------------------------------------------------------------------*/ + #if (FXAA_QUALITY__PS > 5) + if(doneNP) { + if(!doneN) lumaEndN = FxaaTexTop(tex, posN.xy).w; + if(!doneP) lumaEndP = FxaaTexTop(tex, posP.xy).w; + if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5; + if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5; + doneN = abs(lumaEndN) >= gradientScaled; + doneP = abs(lumaEndP) >= gradientScaled; + if(!doneN) posN.x -= offNP.x * FXAA_QUALITY__P5; + if(!doneN) posN.y -= offNP.y * FXAA_QUALITY__P5; + doneNP = (!doneN) || (!doneP); + if(!doneP) posP.x += offNP.x * FXAA_QUALITY__P5; + if(!doneP) posP.y += offNP.y * FXAA_QUALITY__P5; +/*--------------------------------------------------------------------------*/ + #if (FXAA_QUALITY__PS > 6) + if(doneNP) { + if(!doneN) lumaEndN = FxaaTexTop(tex, posN.xy).w; + if(!doneP) lumaEndP = FxaaTexTop(tex, posP.xy).w; + if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5; + if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5; + doneN = abs(lumaEndN) >= gradientScaled; + doneP = abs(lumaEndP) >= gradientScaled; + if(!doneN) posN.x -= offNP.x * FXAA_QUALITY__P6; + if(!doneN) posN.y -= offNP.y * FXAA_QUALITY__P6; + doneNP = (!doneN) || (!doneP); + if(!doneP) posP.x += offNP.x * FXAA_QUALITY__P6; + if(!doneP) posP.y += offNP.y * FXAA_QUALITY__P6; +/*--------------------------------------------------------------------------*/ + #if (FXAA_QUALITY__PS > 7) + if(doneNP) { + if(!doneN) lumaEndN = FxaaTexTop(tex, posN.xy).w; + if(!doneP) lumaEndP = FxaaTexTop(tex, posP.xy).w; + if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5; + if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5; + doneN = abs(lumaEndN) >= gradientScaled; + doneP = abs(lumaEndP) >= gradientScaled; + if(!doneN) posN.x -= offNP.x * FXAA_QUALITY__P7; + if(!doneN) posN.y -= offNP.y * FXAA_QUALITY__P7; + doneNP = (!doneN) || (!doneP); + if(!doneP) posP.x += offNP.x * FXAA_QUALITY__P7; + if(!doneP) posP.y += offNP.y * FXAA_QUALITY__P7; +/*--------------------------------------------------------------------------*/ + #if (FXAA_QUALITY__PS > 8) + if(doneNP) { + if(!doneN) lumaEndN = FxaaTexTop(tex, posN.xy).w; + if(!doneP) lumaEndP = FxaaTexTop(tex, posP.xy).w; + if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5; + if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5; + doneN = abs(lumaEndN) >= gradientScaled; + doneP = abs(lumaEndP) >= gradientScaled; + if(!doneN) posN.x -= offNP.x * FXAA_QUALITY__P8; + if(!doneN) posN.y -= offNP.y * FXAA_QUALITY__P8; + doneNP = (!doneN) || (!doneP); + if(!doneP) posP.x += offNP.x * FXAA_QUALITY__P8; + if(!doneP) posP.y += offNP.y * FXAA_QUALITY__P8; +/*--------------------------------------------------------------------------*/ + #if (FXAA_QUALITY__PS > 9) + if(doneNP) { + if(!doneN) lumaEndN = FxaaTexTop(tex, posN.xy).w; + if(!doneP) lumaEndP = FxaaTexTop(tex, posP.xy).w; + if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5; + if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5; + doneN = abs(lumaEndN) >= gradientScaled; + doneP = abs(lumaEndP) >= gradientScaled; + if(!doneN) posN.x -= offNP.x * FXAA_QUALITY__P9; + if(!doneN) posN.y -= offNP.y * FXAA_QUALITY__P9; + doneNP = (!doneN) || (!doneP); + if(!doneP) posP.x += offNP.x * FXAA_QUALITY__P9; + if(!doneP) posP.y += offNP.y * FXAA_QUALITY__P9; +/*--------------------------------------------------------------------------*/ + #if (FXAA_QUALITY__PS > 10) + if(doneNP) { + if(!doneN) lumaEndN = FxaaTexTop(tex, posN.xy).w; + if(!doneP) lumaEndP = FxaaTexTop(tex, posP.xy).w; + if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5; + if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5; + doneN = abs(lumaEndN) >= gradientScaled; + doneP = abs(lumaEndP) >= gradientScaled; + if(!doneN) posN.x -= offNP.x * FXAA_QUALITY__P10; + if(!doneN) posN.y -= offNP.y * FXAA_QUALITY__P10; + doneNP = (!doneN) || (!doneP); + if(!doneP) posP.x += offNP.x * FXAA_QUALITY__P10; + if(!doneP) posP.y += offNP.y * FXAA_QUALITY__P10; +/*--------------------------------------------------------------------------*/ + #if (FXAA_QUALITY__PS > 11) + if(doneNP) { + if(!doneN) lumaEndN = FxaaTexTop(tex, posN.xy).w; + if(!doneP) lumaEndP = FxaaTexTop(tex, posP.xy).w; + if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5; + if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5; + doneN = abs(lumaEndN) >= gradientScaled; + doneP = abs(lumaEndP) >= gradientScaled; + if(!doneN) posN.x -= offNP.x * FXAA_QUALITY__P11; + if(!doneN) posN.y -= offNP.y * FXAA_QUALITY__P11; + doneNP = (!doneN) || (!doneP); + if(!doneP) posP.x += offNP.x * FXAA_QUALITY__P11; + if(!doneP) posP.y += offNP.y * FXAA_QUALITY__P11; +/*--------------------------------------------------------------------------*/ + #if (FXAA_QUALITY__PS > 12) + if(doneNP) { + if(!doneN) lumaEndN = FxaaTexTop(tex, posN.xy).w; + if(!doneP) lumaEndP = FxaaTexTop(tex, posP.xy).w; + if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5; + if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5; + doneN = abs(lumaEndN) >= gradientScaled; + doneP = abs(lumaEndP) >= gradientScaled; + if(!doneN) posN.x -= offNP.x * FXAA_QUALITY__P12; + if(!doneN) posN.y -= offNP.y * FXAA_QUALITY__P12; + doneNP = (!doneN) || (!doneP); + if(!doneP) posP.x += offNP.x * FXAA_QUALITY__P12; + if(!doneP) posP.y += offNP.y * FXAA_QUALITY__P12; +/*--------------------------------------------------------------------------*/ + } + #endif +/*--------------------------------------------------------------------------*/ + } + #endif +/*--------------------------------------------------------------------------*/ + } + #endif +/*--------------------------------------------------------------------------*/ + } + #endif +/*--------------------------------------------------------------------------*/ + } + #endif +/*--------------------------------------------------------------------------*/ + } + #endif +/*--------------------------------------------------------------------------*/ + } + #endif +/*--------------------------------------------------------------------------*/ + } + #endif +/*--------------------------------------------------------------------------*/ + } + #endif +/*--------------------------------------------------------------------------*/ + } + #endif +/*--------------------------------------------------------------------------*/ + } +/*--------------------------------------------------------------------------*/ + float dstN = posM.x - posN.x; + float dstP = posP.x - posM.x; + if(!horzSpan) dstN = posM.y - posN.y; + if(!horzSpan) dstP = posP.y - posM.y; +/*--------------------------------------------------------------------------*/ + bool goodSpanN = (lumaEndN < 0.0) != lumaMLTZero; + float spanLength = (dstP + dstN); + bool goodSpanP = (lumaEndP < 0.0) != lumaMLTZero; + float spanLengthRcp = 1.0/spanLength; +/*--------------------------------------------------------------------------*/ + bool directionN = dstN < dstP; + float dst = min(dstN, dstP); + bool goodSpan = directionN ? goodSpanN : goodSpanP; + float subpixG = subpixF * subpixF; + float pixelOffset = (dst * (-spanLengthRcp)) + 0.5; + float subpixH = subpixG * FXAA_QUALITY__SUBPIX; +/*--------------------------------------------------------------------------*/ + float pixelOffsetGood = goodSpan ? pixelOffset : 0.0; + float pixelOffsetSubpix = max(pixelOffsetGood, subpixH); + if(!horzSpan) posM.x += pixelOffsetSubpix * lengthSign; + if( horzSpan) posM.y += pixelOffsetSubpix * lengthSign; + #if (FXAA_DISCARD == 1) + return FxaaTexTop(tex, posM); + #else + return float4(FxaaTexTop(tex, posM).xyz, lumaM); + #endif +} +/*==========================================================================*/ +#endif + +PS_OUTPUT ps_main(PS_INPUT input) +{ + PS_OUTPUT output; + + float2 pos = input.t; + float4 posPos = (float4)0; + + FxaaTex tex; + + #if SHADER_MODEL >= 0x400 + + tex.tex = Texture; + tex.smpl = TextureSampler; + + #else + + tex = Texture; + + #endif + + output.c = FxaaPixelShader(pos, posPos, tex, _rcpFrame.xy, _rcpFrameOpt); + + return output; +} + +#endif diff --git a/plugins/GSdx/res/shadeboost.fx b/plugins/GSdx/res/shadeboost.fx index 929038ade1..5f3c2474a0 100644 --- a/plugins/GSdx/res/shadeboost.fx +++ b/plugins/GSdx/res/shadeboost.fx @@ -1,76 +1,76 @@ -#ifdef SHADER_MODEL // make safe to include in resource file to enforce dependency - -/* -** Contrast, saturation, brightness -** Code of this function is from TGM's shader pack -** http://irrlicht.sourceforge.net/phpBB2/viewtopic.php?t=21057 -*/ - -// For all settings: 1.0 = 100% 0.5=50% 1.5 = 150% -float4 ContrastSaturationBrightness(float4 color) // Ported to HLSL -{ - const float sat = SB_SATURATION / 50.0; - const float brt = SB_BRIGHTNESS / 50.0; - const float con = SB_CONTRAST / 50.0; - - // Increase or decrease theese values to adjust r, g and b color channels seperately - const float AvgLumR = 0.5; - const float AvgLumG = 0.5; - const float AvgLumB = 0.5; - - const float3 LumCoeff = float3(0.2125, 0.7154, 0.0721); - - float3 AvgLumin = float3(AvgLumR, AvgLumG, AvgLumB); - float3 brtColor = color.rgb * brt; - float3 intensity = dot(brtColor, LumCoeff); - float3 satColor = lerp(intensity, brtColor, sat); - float3 conColor = lerp(AvgLumin, satColor, con); - - color.rgb = conColor; - return color; -} - -#if SHADER_MODEL >= 0x400 - -Texture2D Texture; -SamplerState Sampler; - -cbuffer cb0 -{ - float4 BGColor; -}; - -struct PS_INPUT -{ - float4 p : SV_Position; - float2 t : TEXCOORD0; -}; - -float4 ps_main(PS_INPUT input) : SV_Target0 -{ - float4 c = Texture.Sample(Sampler, input.t); - return ContrastSaturationBrightness(c); -} - - -#elif SHADER_MODEL <= 0x300 - -sampler Texture : register(s0); - -float4 g_params[1]; - -#define BGColor (g_params[0]) - -struct PS_INPUT -{ - float2 t : TEXCOORD0; -}; - -float4 ps_main(PS_INPUT input) : COLOR -{ - float4 c = tex2D(Texture, input.t); - return ContrastSaturationBrightness(c); -} - -#endif +#ifdef SHADER_MODEL // make safe to include in resource file to enforce dependency + +/* +** Contrast, saturation, brightness +** Code of this function is from TGM's shader pack +** http://irrlicht.sourceforge.net/phpBB2/viewtopic.php?t=21057 +*/ + +// For all settings: 1.0 = 100% 0.5=50% 1.5 = 150% +float4 ContrastSaturationBrightness(float4 color) // Ported to HLSL +{ + const float sat = SB_SATURATION / 50.0; + const float brt = SB_BRIGHTNESS / 50.0; + const float con = SB_CONTRAST / 50.0; + + // Increase or decrease theese values to adjust r, g and b color channels seperately + const float AvgLumR = 0.5; + const float AvgLumG = 0.5; + const float AvgLumB = 0.5; + + const float3 LumCoeff = float3(0.2125, 0.7154, 0.0721); + + float3 AvgLumin = float3(AvgLumR, AvgLumG, AvgLumB); + float3 brtColor = color.rgb * brt; + float3 intensity = dot(brtColor, LumCoeff); + float3 satColor = lerp(intensity, brtColor, sat); + float3 conColor = lerp(AvgLumin, satColor, con); + + color.rgb = conColor; + return color; +} + +#if SHADER_MODEL >= 0x400 + +Texture2D Texture; +SamplerState Sampler; + +cbuffer cb0 +{ + float4 BGColor; +}; + +struct PS_INPUT +{ + float4 p : SV_Position; + float2 t : TEXCOORD0; +}; + +float4 ps_main(PS_INPUT input) : SV_Target0 +{ + float4 c = Texture.Sample(Sampler, input.t); + return ContrastSaturationBrightness(c); +} + + +#elif SHADER_MODEL <= 0x300 + +sampler Texture : register(s0); + +float4 g_params[1]; + +#define BGColor (g_params[0]) + +struct PS_INPUT +{ + float2 t : TEXCOORD0; +}; + +float4 ps_main(PS_INPUT input) : COLOR +{ + float4 c = tex2D(Texture, input.t); + return ContrastSaturationBrightness(c); +} + +#endif #endif \ No newline at end of file