diff --git a/pcsx2/CMakeLists.txt b/pcsx2/CMakeLists.txt index afa8a25a3d..1eb980b285 100644 --- a/pcsx2/CMakeLists.txt +++ b/pcsx2/CMakeLists.txt @@ -749,6 +749,7 @@ set(pcsx2GSHeaders GS/Renderers/Null/GSRendererNull.h GS/Renderers/Null/GSTextureNull.h GS/Renderers/HW/GSRendererHW.h + GS/Renderers/HW/GSRendererHWMultiISA.cpp GS/Renderers/HW/GSTextureCache.h GS/Renderers/HW/GSTextureReplacements.h GS/Renderers/HW/GSVertexHW.h diff --git a/pcsx2/GS/GS.cpp b/pcsx2/GS/GS.cpp index 882ebeea1f..02c4ae647e 100644 --- a/pcsx2/GS/GS.cpp +++ b/pcsx2/GS/GS.cpp @@ -22,12 +22,12 @@ #include "GSGL.h" #include "GSUtil.h" #include "GSExtra.h" -#include "Renderers/SW/GSRendererSW.h" #include "Renderers/Null/GSRendererNull.h" #include "Renderers/Null/GSDeviceNull.h" #include "Renderers/HW/GSRendererHW.h" #include "Renderers/HW/GSTextureReplacements.h" #include "GSLzma.h" +#include "MultiISA.h" #include "common/Console.h" #include "common/FileSystem.h" @@ -91,7 +91,7 @@ int GSinit() // const type qualifier from all the affected variables. GSinitConfig(); - + GSVertexSW::InitStatic(); GSUtil::Init(); @@ -262,7 +262,7 @@ static bool DoGSOpen(GSRendererType renderer, u8* basemem) else { const int threads = theApp.GetConfigI("extrathreads"); - g_gs_renderer = std::make_unique(threads); + g_gs_renderer = std::unique_ptr(MULTI_ISA_SELECT(makeGSRendererSW)(threads)); } } catch (std::exception& ex) diff --git a/pcsx2/GS/GSAlignedClass.h b/pcsx2/GS/GSAlignedClass.h index 7f2148ccec..b2259fa236 100644 --- a/pcsx2/GS/GSAlignedClass.h +++ b/pcsx2/GS/GSAlignedClass.h @@ -56,3 +56,11 @@ public: _aligned_free(p); } }; + +/// GSAlignedClass with a virtual destructor +template +class GSVirtualAlignedClass : public GSAlignedClass +{ +public: + virtual ~GSVirtualAlignedClass() {} +}; diff --git a/pcsx2/GS/GSDrawingContext.cpp b/pcsx2/GS/GSDrawingContext.cpp index dc17f02c6e..522185836a 100644 --- a/pcsx2/GS/GSDrawingContext.cpp +++ b/pcsx2/GS/GSDrawingContext.cpp @@ -79,7 +79,7 @@ static int extend(int uv, int size) return size; } -GIFRegTEX0 GSDrawingContext::GetSizeFixedTEX0(const GSVector4& st, bool linear, bool mipmap) +GIFRegTEX0 GSDrawingContext::GetSizeFixedTEX0(const GSVector4& st, bool linear, bool mipmap) const { if (mipmap) return TEX0; // no mipmaping allowed diff --git a/pcsx2/GS/GSDrawingContext.h b/pcsx2/GS/GSDrawingContext.h index d2445c96b5..91841abcf6 100644 --- a/pcsx2/GS/GSDrawingContext.h +++ b/pcsx2/GS/GSDrawingContext.h @@ -140,7 +140,7 @@ public: return ZBUF.ZMSK == 0 && TEST.ZTE != 0; // ZTE == 0 is bug on the real hardware, write is blocked then } - GIFRegTEX0 GetSizeFixedTEX0(const GSVector4& st, bool linear, bool mipmap = false); + GIFRegTEX0 GetSizeFixedTEX0(const GSVector4& st, bool linear, bool mipmap = false) const; void ComputeFixedTEX0(const GSVector4& st); bool HasFixedTEX0() const { return m_fixed_tex0; } diff --git a/pcsx2/GS/MultiISA.h b/pcsx2/GS/MultiISA.h index 5b0fd54bbc..ac796c4089 100644 --- a/pcsx2/GS/MultiISA.h +++ b/pcsx2/GS/MultiISA.h @@ -83,3 +83,6 @@ extern const ProcessorFeatures g_cpu; #define MULTI_ISA_FRIEND(klass) friend class isa_native::klass; #define MULTI_ISA_SELECT(fn) (isa_native::fn) #endif + +class GSRenderer; +MULTI_ISA_DEF(GSRenderer* makeGSRendererSW(int threads);) diff --git a/pcsx2/GS/Renderers/HW/GSRendererHW.cpp b/pcsx2/GS/Renderers/HW/GSRendererHW.cpp index ff0dbba51b..dff7bc435a 100644 --- a/pcsx2/GS/Renderers/HW/GSRendererHW.cpp +++ b/pcsx2/GS/Renderers/HW/GSRendererHW.cpp @@ -17,8 +17,6 @@ #include "GSRendererHW.h" #include "GSTextureReplacements.h" #include "GS/GSGL.h" -#include "GS/Renderers/SW/GSTextureCacheSW.h" -#include "GS/Renderers/SW/GSDrawScanline.h" #include "Host.h" #include "common/Align.h" #include "common/StringUtil.h" @@ -35,6 +33,7 @@ GSRendererHW::GSRendererHW() , m_userhacks_tcoffset_y(0) , m_lod(GSVector2i(0, 0)) { + MULTI_ISA_SELECT(GSRendererHWPopulateFunctions)(*this); m_mipmap = (GSConfig.HWMipmap >= HWMipmapLevel::Basic); SetTCOffset(); @@ -1403,7 +1402,7 @@ void GSRendererHW::Draw() const bool single_page = (delta_p.x <= 64.0f) && (delta_p.y <= 64.0f); // We trigger the sw prim render here super early, to avoid creating superfluous render targets. - if (CanUseSwPrimRender(no_rt, no_ds, draw_sprite_tex) && SwPrimRender()) + if (CanUseSwPrimRender(no_rt, no_ds, draw_sprite_tex) && SwPrimRender(*this)) { GL_CACHE("Possible texture decompression, drawn with SwPrimRender()"); return; @@ -1416,7 +1415,7 @@ void GSRendererHW::Draw() m_mem.m_clut.ClearDrawInvalidity(); if (result) { - if (SwPrimRender()) + if (SwPrimRender(*this)) { GL_CACHE("Possible clut draw, drawn with SwPrimRender()"); return; @@ -4118,423 +4117,6 @@ bool GSRendererHW::CanUseSwPrimRender(bool no_rt, bool no_ds, bool draw_sprite_t return true; } -bool GSRendererHW::SwPrimRender() -{ - const GSDrawingContext* context = m_context; - const GSDrawingEnvironment& env = m_env; - const GS_PRIM_CLASS primclass = m_vt.m_primclass; - - GSDrawScanline::SharedData data; - GSScanlineGlobalData& gd = data.global; - - u32 clut_storage[256] = {0}; - GSVector4i dimx_storage[8]; - - m_sw_vertex_buffer.resize(((m_vertex.next + 1) & ~1)); - - data.primclass = m_vt.m_primclass; - data.buff = nullptr; - data.vertex = m_sw_vertex_buffer.data(); - data.vertex_count = m_vertex.next; - data.index = m_index.buff; - data.index_count = m_index.tail; - data.scanmsk_value = m_env.SCANMSK.MSK; - - // Skip per pixel division if q is constant. - // Optimize the division by 1 with a nop. It also means that GS_SPRITE_CLASS must be processed when !m_vt.m_eq.q. - // If you have both GS_SPRITE_CLASS && m_vt.m_eq.q, it will depends on the first part of the 'OR'. - const u32 q_div = ((m_vt.m_eq.q && m_vt.m_min.t.z != 1.0f) || (!m_vt.m_eq.q && m_vt.m_primclass == GS_SPRITE_CLASS)); - GSVertexSW::s_cvb[m_vt.m_primclass][PRIM->TME][PRIM->FST][q_div](m_context, data.vertex, m_vertex.buff, m_vertex.next); - - GSVector4i scissor = GSVector4i(m_context->scissor.in); - GSVector4i bbox = GSVector4i(m_vt.m_min.p.floor().xyxy(m_vt.m_max.p.ceil())); - - // Points and lines may have zero area bbox (single line: 0, 0 - 256, 0) - - if (m_vt.m_primclass == GS_POINT_CLASS || m_vt.m_primclass == GS_LINE_CLASS) - { - if (bbox.x == bbox.z) - bbox.z++; - if (bbox.y == bbox.w) - bbox.w++; - } - - data.scissor = scissor; - data.bbox = bbox; - data.frame = g_perfmon.GetFrame(); - - gd.vm = m_mem.m_vm8; - - gd.fbo = context->offset.fb; - gd.zbo = context->offset.zb; - gd.fzbr = context->offset.fzb4->row; - gd.fzbc = context->offset.fzb4->col; - - gd.sel.key = 0; - - gd.sel.fpsm = 3; - gd.sel.zpsm = 3; - gd.sel.atst = ATST_ALWAYS; - gd.sel.tfx = TFX_NONE; - gd.sel.ababcd = 0xff; - gd.sel.prim = primclass; - - u32 fm = context->FRAME.FBMSK; - u32 zm = context->ZBUF.ZMSK || context->TEST.ZTE == 0 ? 0xffffffff : 0; - const u32 fm_mask = GSLocalMemory::m_psm[m_context->FRAME.PSM].fmsk; - - // When the format is 24bit (Z or C), DATE ceases to function. - // It was believed that in 24bit mode all pixels pass because alpha doesn't exist - // however after testing this on a PS2 it turns out nothing passes, it ignores the draw. - if ((m_context->FRAME.PSM & 0xF) == PSM_PSMCT24 && m_context->TEST.DATE) - { - //DevCon.Warning("DATE on a 24bit format, Frame PSM %x", m_context->FRAME.PSM); - return false; - } - - if (context->TEST.ZTE && context->TEST.ZTST == ZTST_NEVER) - { - fm = 0xffffffff; - zm = 0xffffffff; - } - - if (PRIM->TME) - { - if (GSLocalMemory::m_psm[context->TEX0.PSM].pal > 0) - { - m_mem.m_clut.Read32(context->TEX0, env.TEXA); - } - } - - if (context->TEST.ATE) - { - if (!TryAlphaTest(fm, fm_mask, zm)) - { - gd.sel.atst = context->TEST.ATST; - gd.sel.afail = context->TEST.AFAIL; - - gd.aref = GSVector4i((int)context->TEST.AREF); - - switch (gd.sel.atst) - { - case ATST_LESS: - gd.sel.atst = ATST_LEQUAL; - gd.aref -= GSVector4i::x00000001(); - break; - case ATST_GREATER: - gd.sel.atst = ATST_GEQUAL; - gd.aref += GSVector4i::x00000001(); - break; - } - } - } - - const bool fwrite = (fm & fm_mask) != fm_mask; - const bool ftest = gd.sel.atst != ATST_ALWAYS || context->TEST.DATE && context->FRAME.PSM != PSM_PSMCT24; - - const bool zwrite = zm != 0xffffffff; - const bool ztest = context->TEST.ZTE && context->TEST.ZTST > ZTST_ALWAYS; - if (!fwrite && !zwrite) - return false; - - gd.sel.fwrite = fwrite; - gd.sel.ftest = ftest; - - if (fwrite || ftest) - { - gd.sel.fpsm = GSLocalMemory::m_psm[context->FRAME.PSM].fmt; - - if ((primclass == GS_LINE_CLASS || primclass == GS_TRIANGLE_CLASS) && m_vt.m_eq.rgba != 0xffff) - { - gd.sel.iip = PRIM->IIP; - } - - if (PRIM->TME) - { - gd.sel.tfx = context->TEX0.TFX; - gd.sel.tcc = context->TEX0.TCC; - gd.sel.fst = PRIM->FST; - gd.sel.ltf = m_vt.IsLinear(); - - if (GSLocalMemory::m_psm[context->TEX0.PSM].pal > 0) - { - gd.sel.tlu = 1; - - gd.clut = clut_storage; // FIXME: might address uninitialized data of the texture (0xCD) that is not in 0-15 range for 4-bpp formats - - memcpy(gd.clut, (const u32*)m_mem.m_clut, sizeof(u32) * GSLocalMemory::m_psm[context->TEX0.PSM].pal); - } - - gd.sel.wms = context->CLAMP.WMS; - gd.sel.wmt = context->CLAMP.WMT; - - if (gd.sel.tfx == TFX_MODULATE && gd.sel.tcc && m_vt.m_eq.rgba == 0xffff && m_vt.m_min.c.eq(GSVector4i(128))) - { - // modulate does not do anything when vertex color is 0x80 - - gd.sel.tfx = TFX_DECAL; - } - - GIFRegTEX0 TEX0 = m_context->GetSizeFixedTEX0(m_vt.m_min.t.xyxy(m_vt.m_max.t), m_vt.IsLinear(), false); - - const GSVector4i r = GetTextureMinMax(TEX0, context->CLAMP, gd.sel.ltf).coverage; - - if (!m_sw_texture) - m_sw_texture = std::make_unique(0, TEX0, env.TEXA); - else - m_sw_texture->Reset(0, TEX0, env.TEXA); - - m_sw_texture->Update(r); - gd.tex[0] = m_sw_texture->m_buff; - - gd.sel.tw = m_sw_texture->m_tw - 3; - - { - // skip per pixel division if q is constant. Sprite uses flat - // q, so it's always constant by primitive. - // Note: the 'q' division was done in GSRendererSW::ConvertVertexBuffer - gd.sel.fst |= (m_vt.m_eq.q || primclass == GS_SPRITE_CLASS); - - if (gd.sel.ltf && gd.sel.fst) - { - // if q is constant we can do the half pel shift for bilinear sampling on the vertices - // TODO: but not when mipmapping is used!!! - - GSVertexSW* RESTRICT v = data.vertex; - const GSVector4 half(0x8000, 0x8000); - for (int i = 0, j = data.vertex_count; i < j; i++) - { - const GSVector4 t = v[i].t; - v[i].t = (t - half).xyzw(t); - } - } - } - - u16 tw = 1u << TEX0.TW; - u16 th = 1u << TEX0.TH; - - if (tw > 1024) - tw = 1; - - if (th > 1024) - th = 1; - - switch (context->CLAMP.WMS) - { - case CLAMP_REPEAT: - gd.t.min.U16[0] = gd.t.minmax.U16[0] = tw - 1; - gd.t.max.U16[0] = gd.t.minmax.U16[2] = 0; - gd.t.mask.U32[0] = 0xffffffff; - break; - case CLAMP_CLAMP: - gd.t.min.U16[0] = gd.t.minmax.U16[0] = 0; - gd.t.max.U16[0] = gd.t.minmax.U16[2] = tw - 1; - gd.t.mask.U32[0] = 0; - break; - case CLAMP_REGION_CLAMP: - // REGION_CLAMP ignores the actual texture size - gd.t.min.U16[0] = gd.t.minmax.U16[0] = context->CLAMP.MINU; - gd.t.max.U16[0] = gd.t.minmax.U16[2] = context->CLAMP.MAXU; - gd.t.mask.U32[0] = 0; - break; - case CLAMP_REGION_REPEAT: - // MINU is restricted to MINU or texture size, whichever is smaller, MAXU is an offset in the texture. - gd.t.min.U16[0] = gd.t.minmax.U16[0] = context->CLAMP.MINU & (tw - 1); - gd.t.max.U16[0] = gd.t.minmax.U16[2] = context->CLAMP.MAXU; - gd.t.mask.U32[0] = 0xffffffff; - break; - default: - __assume(0); - } - - switch (context->CLAMP.WMT) - { - case CLAMP_REPEAT: - gd.t.min.U16[4] = gd.t.minmax.U16[1] = th - 1; - gd.t.max.U16[4] = gd.t.minmax.U16[3] = 0; - gd.t.mask.U32[2] = 0xffffffff; - break; - case CLAMP_CLAMP: - gd.t.min.U16[4] = gd.t.minmax.U16[1] = 0; - gd.t.max.U16[4] = gd.t.minmax.U16[3] = th - 1; - gd.t.mask.U32[2] = 0; - break; - case CLAMP_REGION_CLAMP: - // REGION_CLAMP ignores the actual texture size - gd.t.min.U16[4] = gd.t.minmax.U16[1] = context->CLAMP.MINV; - gd.t.max.U16[4] = gd.t.minmax.U16[3] = context->CLAMP.MAXV; // ffx anima summon scene, when the anchor appears (th = 256, maxv > 256) - gd.t.mask.U32[2] = 0; - break; - case CLAMP_REGION_REPEAT: - // MINV is restricted to MINV or texture size, whichever is smaller, MAXV is an offset in the texture. - gd.t.min.U16[4] = gd.t.minmax.U16[1] = context->CLAMP.MINV & (th - 1); // skygunner main menu water texture 64x64, MINV = 127 - gd.t.max.U16[4] = gd.t.minmax.U16[3] = context->CLAMP.MAXV; - gd.t.mask.U32[2] = 0xffffffff; - break; - default: - __assume(0); - } - - gd.t.min = gd.t.min.xxxxlh(); - gd.t.max = gd.t.max.xxxxlh(); - gd.t.mask = gd.t.mask.xxzz(); - gd.t.invmask = ~gd.t.mask; - } - - if (PRIM->FGE) - { - gd.sel.fge = 1; - - gd.frb = env.FOGCOL.U32[0] & 0x00ff00ff; - gd.fga = (env.FOGCOL.U32[0] >> 8) & 0x00ff00ff; - } - - if (context->FRAME.PSM != PSM_PSMCT24) - { - gd.sel.date = context->TEST.DATE; - gd.sel.datm = context->TEST.DATM; - } - - if (!IsOpaque()) - { - gd.sel.abe = PRIM->ABE; - gd.sel.ababcd = context->ALPHA.U32[0]; - - if (env.PABE.PABE) - { - gd.sel.pabe = 1; - } - - if (PRIM->AA1 && (primclass == GS_LINE_CLASS || primclass == GS_TRIANGLE_CLASS)) - { - gd.sel.aa1 = 1; - } - - gd.afix = GSVector4i((int)context->ALPHA.FIX << 7).xxzzlh(); - } - - const u32 masked_fm = fm & fm_mask; - if (gd.sel.date - || gd.sel.aba == 1 || gd.sel.abb == 1 || gd.sel.abc == 1 || gd.sel.abd == 1 - || gd.sel.atst != ATST_ALWAYS && gd.sel.afail == AFAIL_RGB_ONLY - || gd.sel.fpsm == 0 && masked_fm != 0 && masked_fm != fm_mask - || gd.sel.fpsm == 1 && masked_fm != 0 && masked_fm != fm_mask - || gd.sel.fpsm == 2 && masked_fm != 0 && masked_fm != fm_mask) - { - gd.sel.rfb = 1; - } - - gd.sel.colclamp = env.COLCLAMP.CLAMP; - gd.sel.fba = context->FBA.FBA; - - if (env.DTHE.DTHE) - { - gd.sel.dthe = 1; - - gd.dimx = dimx_storage; - - memcpy(gd.dimx, env.dimx, sizeof(env.dimx)); - } - } - - gd.sel.zwrite = zwrite; - gd.sel.ztest = ztest; - - if (zwrite || ztest) - { - const u32 z_max = 0xffffffff >> (GSLocalMemory::m_psm[context->ZBUF.PSM].fmt * 8); - - gd.sel.zpsm = GSLocalMemory::m_psm[context->ZBUF.PSM].fmt; - gd.sel.ztst = ztest ? context->TEST.ZTST : (int)ZTST_ALWAYS; - gd.sel.zequal = !!m_vt.m_eq.z; - gd.sel.zoverflow = (u32)GSVector4i(m_vt.m_max.p).z == 0x80000000U; - gd.sel.zclamp = (u32)GSVector4i(m_vt.m_max.p).z > z_max; - } - -#if _M_SSE >= 0x501 - - gd.fm = fm; - gd.zm = zm; - - if (gd.sel.fpsm == 1) - { - gd.fm |= 0xff000000; - } - else if (gd.sel.fpsm == 2) - { - u32 rb = gd.fm & 0x00f800f8; - u32 ga = gd.fm & 0x8000f800; - - gd.fm = (ga >> 16) | (rb >> 9) | (ga >> 6) | (rb >> 3) | 0xffff0000; - } - - if (gd.sel.zpsm == 1) - { - gd.zm |= 0xff000000; - } - else if (gd.sel.zpsm == 2) - { - gd.zm |= 0xffff0000; - } - -#else - - gd.fm = GSVector4i(fm); - gd.zm = GSVector4i(zm); - - if (gd.sel.fpsm == 1) - { - gd.fm |= GSVector4i::xff000000(); - } - else if (gd.sel.fpsm == 2) - { - GSVector4i rb = gd.fm & 0x00f800f8; - GSVector4i ga = gd.fm & 0x8000f800; - - gd.fm = (ga >> 16) | (rb >> 9) | (ga >> 6) | (rb >> 3) | GSVector4i::xffff0000(); - } - - if (gd.sel.zpsm == 1) - { - gd.zm |= GSVector4i::xff000000(); - } - else if (gd.sel.zpsm == 2) - { - gd.zm |= GSVector4i::xffff0000(); - } - -#endif - - if (gd.sel.prim == GS_SPRITE_CLASS && !gd.sel.ftest && !gd.sel.ztest && data.bbox.eq(data.bbox.rintersect(data.scissor))) // TODO: check scissor horizontally only - { - gd.sel.notest = 1; - - const u32 ofx = context->XYOFFSET.OFX; - - for (int i = 0, j = m_vertex.tail; i < j; i++) - { -#if _M_SSE >= 0x501 - if ((((m_vertex.buff[i].XYZ.X - ofx) + 15) >> 4) & 7) // aligned to 8 -#else - if ((((m_vertex.buff[i].XYZ.X - ofx) + 15) >> 4) & 3) // aligned to 4 -#endif - { - gd.sel.notest = 0; - - break; - } - } - } - - if (!m_sw_rasterizer) - m_sw_rasterizer = std::make_unique(new GSDrawScanline(), 0, 1); - - m_sw_rasterizer->Draw(&data); - - m_tc->InvalidateVideoMem(context->offset.fb, bbox); - return true; -} - // hacks GSRendererHW::Hacks::Hacks() diff --git a/pcsx2/GS/Renderers/HW/GSRendererHW.h b/pcsx2/GS/Renderers/HW/GSRendererHW.h index f8072e49e2..941d43a3bb 100644 --- a/pcsx2/GS/Renderers/HW/GSRendererHW.h +++ b/pcsx2/GS/Renderers/HW/GSRendererHW.h @@ -20,11 +20,15 @@ #include "GS/Renderers/Common/GSRenderer.h" #include "GS/Renderers/SW/GSTextureCacheSW.h" #include "GS/GSState.h" +#include "GS/MultiISA.h" -class GSRasterizer; +class GSRendererHW; +MULTI_ISA_DEF(class GSRendererHWFunctions;) +MULTI_ISA_DEF(void GSRendererHWPopulateFunctions(GSRendererHW& renderer);) class GSRendererHW : public GSRenderer { + MULTI_ISA_FRIEND(GSRendererHWFunctions); public: static constexpr int MAX_FRAMEBUFFER_HEIGHT = 1280; @@ -130,7 +134,7 @@ private: bool PossibleCLUTDraw(); bool PossibleCLUTDrawAggressive(); bool CanUseSwPrimRender(bool no_rt, bool no_ds, bool draw_sprite_tex); - bool SwPrimRender(); + bool (*SwPrimRender)(GSRendererHW&); template void RoundSpriteOffset(); @@ -166,7 +170,7 @@ private: // software sprite renderer state std::vector m_sw_vertex_buffer; std::unique_ptr m_sw_texture; - std::unique_ptr m_sw_rasterizer; + std::unique_ptr> m_sw_rasterizer; public: GSRendererHW(); diff --git a/pcsx2/GS/Renderers/HW/GSRendererHWMultiISA.cpp b/pcsx2/GS/Renderers/HW/GSRendererHWMultiISA.cpp new file mode 100644 index 0000000000..c65a51c8cd --- /dev/null +++ b/pcsx2/GS/Renderers/HW/GSRendererHWMultiISA.cpp @@ -0,0 +1,461 @@ +/* PCSX2 - PS2 Emulator for PCs + * Copyright (C) 2002-2022 PCSX2 Dev Team + * + * PCSX2 is free software: you can redistribute it and/or modify it under the terms + * of the GNU Lesser General Public License as published by the Free Software Found- + * ation, either version 3 of the License, or (at your option) any later version. + * + * PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; + * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR + * PURPOSE. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along with PCSX2. + * If not, see . + */ + +#include "GSRendererHW.h" + +#include "GS/Renderers/SW/GSTextureCacheSW.h" +#include "GS/Renderers/SW/GSDrawScanline.h" + +class CURRENT_ISA::GSRendererHWFunctions +{ +public: + static bool SwPrimRender(GSRendererHW& hw); + + static void Populate(GSRendererHW& renderer) + { + renderer.SwPrimRender = SwPrimRender; + } +}; + +MULTI_ISA_UNSHARED_IMPL; + +void CURRENT_ISA::GSRendererHWPopulateFunctions(GSRendererHW& renderer) +{ + GSRendererHWFunctions::Populate(renderer); +} + +bool GSRendererHWFunctions::SwPrimRender(GSRendererHW& hw) +{ + GSVertexTrace& vt = hw.m_vt; + const GIFRegPRIM* PRIM = hw.PRIM; + const GSDrawingContext* context = hw.m_context; + const GSDrawingEnvironment& env = hw.m_env; + const GS_PRIM_CLASS primclass = vt.m_primclass; + + GSDrawScanline::SharedData data; + GSScanlineGlobalData& gd = data.global; + + u32 clut_storage[256]; + GSVector4i dimx_storage[8]; + + hw.m_sw_vertex_buffer.resize(((hw.m_vertex.next + 1) & ~1)); + + data.primclass = vt.m_primclass; + data.buff = nullptr; + data.vertex = hw.m_sw_vertex_buffer.data(); + data.vertex_count = hw.m_vertex.next; + data.index = hw.m_index.buff; + data.index_count = hw.m_index.tail; + data.scanmsk_value = hw.m_env.SCANMSK.MSK; + + // Skip per pixel division if q is constant. + // Optimize the division by 1 with a nop. It also means that GS_SPRITE_CLASS must be processed when !vt.m_eq.q. + // If you have both GS_SPRITE_CLASS && vt.m_eq.q, it will depends on the first part of the 'OR'. + const u32 q_div = ((vt.m_eq.q && vt.m_min.t.z != 1.0f) || (!vt.m_eq.q && vt.m_primclass == GS_SPRITE_CLASS)); + GSVertexSW::s_cvb[vt.m_primclass][PRIM->TME][PRIM->FST][q_div](context, data.vertex, hw.m_vertex.buff, hw.m_vertex.next); + + GSVector4i scissor = GSVector4i(context->scissor.in); + GSVector4i bbox = GSVector4i(vt.m_min.p.floor().xyxy(vt.m_max.p.ceil())); + + // Points and lines may have zero area bbox (single line: 0, 0 - 256, 0) + + if (vt.m_primclass == GS_POINT_CLASS || vt.m_primclass == GS_LINE_CLASS) + { + if (bbox.x == bbox.z) + bbox.z++; + if (bbox.y == bbox.w) + bbox.w++; + } + + data.scissor = scissor; + data.bbox = bbox; + data.frame = g_perfmon.GetFrame(); + + gd.vm = hw.m_mem.m_vm8; + + gd.fbo = context->offset.fb; + gd.zbo = context->offset.zb; + gd.fzbr = context->offset.fzb4->row; + gd.fzbc = context->offset.fzb4->col; + + gd.sel.key = 0; + + gd.sel.fpsm = 3; + gd.sel.zpsm = 3; + gd.sel.atst = ATST_ALWAYS; + gd.sel.tfx = TFX_NONE; + gd.sel.ababcd = 0xff; + gd.sel.prim = primclass; + + u32 fm = context->FRAME.FBMSK; + u32 zm = context->ZBUF.ZMSK || context->TEST.ZTE == 0 ? 0xffffffff : 0; + const u32 fm_mask = GSLocalMemory::m_psm[context->FRAME.PSM].fmsk; + + // When the format is 24bit (Z or C), DATE ceases to function. + // It was believed that in 24bit mode all pixels pass because alpha doesn't exist + // however after testing this on a PS2 it turns out nothing passes, it ignores the draw. + if ((context->FRAME.PSM & 0xF) == PSM_PSMCT24 && context->TEST.DATE) + { + //DevCon.Warning("DATE on a 24bit format, Frame PSM %x", context->FRAME.PSM); + return false; + } + + if (context->TEST.ZTE && context->TEST.ZTST == ZTST_NEVER) + { + fm = 0xffffffff; + zm = 0xffffffff; + } + + if (PRIM->TME) + { + if (GSLocalMemory::m_psm[context->TEX0.PSM].pal > 0) + { + hw.m_mem.m_clut.Read32(context->TEX0, env.TEXA); + } + } + + if (context->TEST.ATE) + { + if (!hw.TryAlphaTest(fm, fm_mask, zm)) + { + gd.sel.atst = context->TEST.ATST; + gd.sel.afail = context->TEST.AFAIL; + + gd.aref = GSVector4i((int)context->TEST.AREF); + + switch (gd.sel.atst) + { + case ATST_LESS: + gd.sel.atst = ATST_LEQUAL; + gd.aref -= GSVector4i::x00000001(); + break; + case ATST_GREATER: + gd.sel.atst = ATST_GEQUAL; + gd.aref += GSVector4i::x00000001(); + break; + } + } + } + + const bool fwrite = (fm & fm_mask) != fm_mask; + const bool ftest = gd.sel.atst != ATST_ALWAYS || context->TEST.DATE && context->FRAME.PSM != PSM_PSMCT24; + + const bool zwrite = zm != 0xffffffff; + const bool ztest = context->TEST.ZTE && context->TEST.ZTST > ZTST_ALWAYS; + if (!fwrite && !zwrite) + return false; + + gd.sel.fwrite = fwrite; + gd.sel.ftest = ftest; + + if (fwrite || ftest) + { + gd.sel.fpsm = GSLocalMemory::m_psm[context->FRAME.PSM].fmt; + + if ((primclass == GS_LINE_CLASS || primclass == GS_TRIANGLE_CLASS) && vt.m_eq.rgba != 0xffff) + { + gd.sel.iip = PRIM->IIP; + } + + if (PRIM->TME) + { + gd.sel.tfx = context->TEX0.TFX; + gd.sel.tcc = context->TEX0.TCC; + gd.sel.fst = PRIM->FST; + gd.sel.ltf = vt.IsLinear(); + + if (GSLocalMemory::m_psm[context->TEX0.PSM].pal > 0) + { + gd.sel.tlu = 1; + + gd.clut = clut_storage; // FIXME: might address uninitialized data of the texture (0xCD) that is not in 0-15 range for 4-bpp formats + + memcpy(gd.clut, (const u32*)hw.m_mem.m_clut, sizeof(u32) * GSLocalMemory::m_psm[context->TEX0.PSM].pal); + } + + gd.sel.wms = context->CLAMP.WMS; + gd.sel.wmt = context->CLAMP.WMT; + + if (gd.sel.tfx == TFX_MODULATE && gd.sel.tcc && vt.m_eq.rgba == 0xffff && vt.m_min.c.eq(GSVector4i(128))) + { + // modulate does not do anything when vertex color is 0x80 + + gd.sel.tfx = TFX_DECAL; + } + + GIFRegTEX0 TEX0 = context->GetSizeFixedTEX0(vt.m_min.t.xyxy(vt.m_max.t), vt.IsLinear(), false); + + const GSVector4i r = hw.GetTextureMinMax(TEX0, context->CLAMP, gd.sel.ltf).coverage; + + if (!hw.m_sw_texture) + hw.m_sw_texture = std::make_unique(0, TEX0, env.TEXA); + else + hw.m_sw_texture->Reset(0, TEX0, env.TEXA); + + hw.m_sw_texture->Update(r); + gd.tex[0] = hw.m_sw_texture->m_buff; + + gd.sel.tw = hw.m_sw_texture->m_tw - 3; + + { + // skip per pixel division if q is constant. Sprite uses flat + // q, so it's always constant by primitive. + // Note: the 'q' division was done in GSRendererSW::ConvertVertexBuffer + gd.sel.fst |= (vt.m_eq.q || primclass == GS_SPRITE_CLASS); + + if (gd.sel.ltf && gd.sel.fst) + { + // if q is constant we can do the half pel shift for bilinear sampling on the vertices + + // TODO: but not when mipmapping is used!!! + + const GSVector4 half(0x8000, 0x8000); + + GSVertexSW* RESTRICT v = data.vertex; + + for (int i = 0, j = data.vertex_count; i < j; i++) + { + const GSVector4 t = v[i].t; + + v[i].t = (t - half).xyzw(t); + } + } + } + + u16 tw = 1u << TEX0.TW; + u16 th = 1u << TEX0.TH; + + if (tw > 1024) + tw = 1; + + if (th > 1024) + th = 1; + + switch (context->CLAMP.WMS) + { + case CLAMP_REPEAT: + gd.t.min.U16[0] = gd.t.minmax.U16[0] = tw - 1; + gd.t.max.U16[0] = gd.t.minmax.U16[2] = 0; + gd.t.mask.U32[0] = 0xffffffff; + break; + case CLAMP_CLAMP: + gd.t.min.U16[0] = gd.t.minmax.U16[0] = 0; + gd.t.max.U16[0] = gd.t.minmax.U16[2] = tw - 1; + gd.t.mask.U32[0] = 0; + break; + case CLAMP_REGION_CLAMP: + // REGION_CLAMP ignores the actual texture size + gd.t.min.U16[0] = gd.t.minmax.U16[0] = context->CLAMP.MINU; + gd.t.max.U16[0] = gd.t.minmax.U16[2] = context->CLAMP.MAXU; + gd.t.mask.U32[0] = 0; + break; + case CLAMP_REGION_REPEAT: + // MINU is restricted to MINU or texture size, whichever is smaller, MAXU is an offset in the texture. + gd.t.min.U16[0] = gd.t.minmax.U16[0] = context->CLAMP.MINU & (tw - 1); + gd.t.max.U16[0] = gd.t.minmax.U16[2] = context->CLAMP.MAXU; + gd.t.mask.U32[0] = 0xffffffff; + break; + default: + __assume(0); + } + + switch (context->CLAMP.WMT) + { + case CLAMP_REPEAT: + gd.t.min.U16[4] = gd.t.minmax.U16[1] = th - 1; + gd.t.max.U16[4] = gd.t.minmax.U16[3] = 0; + gd.t.mask.U32[2] = 0xffffffff; + break; + case CLAMP_CLAMP: + gd.t.min.U16[4] = gd.t.minmax.U16[1] = 0; + gd.t.max.U16[4] = gd.t.minmax.U16[3] = th - 1; + gd.t.mask.U32[2] = 0; + break; + case CLAMP_REGION_CLAMP: + // REGION_CLAMP ignores the actual texture size + gd.t.min.U16[4] = gd.t.minmax.U16[1] = context->CLAMP.MINV; + gd.t.max.U16[4] = gd.t.minmax.U16[3] = context->CLAMP.MAXV; // ffx anima summon scene, when the anchor appears (th = 256, maxv > 256) + gd.t.mask.U32[2] = 0; + break; + case CLAMP_REGION_REPEAT: + // MINV is restricted to MINV or texture size, whichever is smaller, MAXV is an offset in the texture. + gd.t.min.U16[4] = gd.t.minmax.U16[1] = context->CLAMP.MINV & (th - 1); // skygunner main menu water texture 64x64, MINV = 127 + gd.t.max.U16[4] = gd.t.minmax.U16[3] = context->CLAMP.MAXV; + gd.t.mask.U32[2] = 0xffffffff; + break; + default: + __assume(0); + } + + gd.t.min = gd.t.min.xxxxlh(); + gd.t.max = gd.t.max.xxxxlh(); + gd.t.mask = gd.t.mask.xxzz(); + gd.t.invmask = ~gd.t.mask; + } + + if (PRIM->FGE) + { + gd.sel.fge = 1; + + gd.frb = env.FOGCOL.U32[0] & 0x00ff00ff; + gd.fga = (env.FOGCOL.U32[0] >> 8) & 0x00ff00ff; + } + + if (context->FRAME.PSM != PSM_PSMCT24) + { + gd.sel.date = context->TEST.DATE; + gd.sel.datm = context->TEST.DATM; + } + + if (!hw.IsOpaque()) + { + gd.sel.abe = PRIM->ABE; + gd.sel.ababcd = context->ALPHA.U32[0]; + + if (env.PABE.PABE) + { + gd.sel.pabe = 1; + } + + if (PRIM->AA1 && (primclass == GS_LINE_CLASS || primclass == GS_TRIANGLE_CLASS)) + { + gd.sel.aa1 = 1; + } + + gd.afix = GSVector4i((int)context->ALPHA.FIX << 7).xxzzlh(); + } + + const u32 masked_fm = fm & fm_mask; + if (gd.sel.date + || gd.sel.aba == 1 || gd.sel.abb == 1 || gd.sel.abc == 1 || gd.sel.abd == 1 + || gd.sel.atst != ATST_ALWAYS && gd.sel.afail == AFAIL_RGB_ONLY + || gd.sel.fpsm == 0 && masked_fm != 0 && masked_fm != fm_mask + || gd.sel.fpsm == 1 && masked_fm != 0 && masked_fm != fm_mask + || gd.sel.fpsm == 2 && masked_fm != 0 && masked_fm != fm_mask) + { + gd.sel.rfb = 1; + } + + gd.sel.colclamp = env.COLCLAMP.CLAMP; + gd.sel.fba = context->FBA.FBA; + + if (env.DTHE.DTHE) + { + gd.sel.dthe = 1; + + gd.dimx = dimx_storage; + + memcpy(gd.dimx, env.dimx, sizeof(env.dimx)); + } + } + + gd.sel.zwrite = zwrite; + gd.sel.ztest = ztest; + + if (zwrite || ztest) + { + const u32 z_max = 0xffffffff >> (GSLocalMemory::m_psm[context->ZBUF.PSM].fmt * 8); + + gd.sel.zpsm = GSLocalMemory::m_psm[context->ZBUF.PSM].fmt; + gd.sel.ztst = ztest ? context->TEST.ZTST : (int)ZTST_ALWAYS; + gd.sel.zequal = !!vt.m_eq.z; + gd.sel.zoverflow = (u32)GSVector4i(vt.m_max.p).z == 0x80000000U; + gd.sel.zclamp = (u32)GSVector4i(vt.m_max.p).z > z_max; + } + +#if _M_SSE >= 0x501 + + gd.fm = fm; + gd.zm = zm; + + if (gd.sel.fpsm == 1) + { + gd.fm |= 0xff000000; + } + else if (gd.sel.fpsm == 2) + { + u32 rb = gd.fm & 0x00f800f8; + u32 ga = gd.fm & 0x8000f800; + + gd.fm = (ga >> 16) | (rb >> 9) | (ga >> 6) | (rb >> 3) | 0xffff0000; + } + + if (gd.sel.zpsm == 1) + { + gd.zm |= 0xff000000; + } + else if (gd.sel.zpsm == 2) + { + gd.zm |= 0xffff0000; + } + +#else + + gd.fm = GSVector4i(fm); + gd.zm = GSVector4i(zm); + + if (gd.sel.fpsm == 1) + { + gd.fm |= GSVector4i::xff000000(); + } + else if (gd.sel.fpsm == 2) + { + GSVector4i rb = gd.fm & 0x00f800f8; + GSVector4i ga = gd.fm & 0x8000f800; + + gd.fm = (ga >> 16) | (rb >> 9) | (ga >> 6) | (rb >> 3) | GSVector4i::xffff0000(); + } + + if (gd.sel.zpsm == 1) + { + gd.zm |= GSVector4i::xff000000(); + } + else if (gd.sel.zpsm == 2) + { + gd.zm |= GSVector4i::xffff0000(); + } + +#endif + + if (gd.sel.prim == GS_SPRITE_CLASS && !gd.sel.ftest && !gd.sel.ztest && data.bbox.eq(data.bbox.rintersect(data.scissor))) // TODO: check scissor horizontally only + { + gd.sel.notest = 1; + + const u32 ofx = context->XYOFFSET.OFX; + + for (int i = 0, j = hw.m_vertex.tail; i < j; i++) + { +#if _M_SSE >= 0x501 + if ((((hw.m_vertex.buff[i].XYZ.X - ofx) + 15) >> 4) & 7) // aligned to 8 +#else + if ((((hw.m_vertex.buff[i].XYZ.X - ofx) + 15) >> 4) & 3) // aligned to 4 +#endif + { + gd.sel.notest = 0; + + break; + } + } + } + + + if (!hw.m_sw_rasterizer) + hw.m_sw_rasterizer = std::make_unique(new GSDrawScanline(), 0, 1); + + static_cast(hw.m_sw_rasterizer.get())->Draw(&data); + + hw.m_tc->InvalidateVideoMem(context->offset.fb, bbox); + return true; +} diff --git a/pcsx2/GS/Renderers/SW/GSDrawScanline.cpp b/pcsx2/GS/Renderers/SW/GSDrawScanline.cpp index 85d8674d05..c656b8cca1 100644 --- a/pcsx2/GS/Renderers/SW/GSDrawScanline.cpp +++ b/pcsx2/GS/Renderers/SW/GSDrawScanline.cpp @@ -17,8 +17,12 @@ #include "GSDrawScanline.h" #include "GSTextureCacheSW.h" +#if MULTI_ISA_COMPILE_ONCE // Lack of a better home constexpr GSScanlineConstantData g_const; +#endif + +MULTI_ISA_UNSHARED_IMPL; GSDrawScanline::GSDrawScanline() : m_sp_map("GSSetupPrim", &m_local) diff --git a/pcsx2/GS/Renderers/SW/GSDrawScanline.h b/pcsx2/GS/Renderers/SW/GSDrawScanline.h index 2ecff373c3..819e1f8dcd 100644 --- a/pcsx2/GS/Renderers/SW/GSDrawScanline.h +++ b/pcsx2/GS/Renderers/SW/GSDrawScanline.h @@ -21,6 +21,8 @@ #include "GSSetupPrimCodeGenerator.h" #include "GSDrawScanlineCodeGenerator.h" +MULTI_ISA_UNSHARED_START + class GSDrawScanline : public IDrawScanline { public: @@ -85,3 +87,5 @@ public: m_ds_map.PrintStats(); } }; + +MULTI_ISA_UNSHARED_END diff --git a/pcsx2/GS/Renderers/SW/GSDrawScanlineCodeGenerator.all.cpp b/pcsx2/GS/Renderers/SW/GSDrawScanlineCodeGenerator.all.cpp index 59ac8b95b4..50752da109 100644 --- a/pcsx2/GS/Renderers/SW/GSDrawScanlineCodeGenerator.all.cpp +++ b/pcsx2/GS/Renderers/SW/GSDrawScanlineCodeGenerator.all.cpp @@ -18,6 +18,7 @@ #include "GS/Renderers/Common/GSFunctionMap.h" #include "GSVertexSW.h" +MULTI_ISA_UNSHARED_IMPL; using namespace Xbyak; // Ease the reading of the code diff --git a/pcsx2/GS/Renderers/SW/GSDrawScanlineCodeGenerator.all.h b/pcsx2/GS/Renderers/SW/GSDrawScanlineCodeGenerator.all.h index 8054a31b70..bd19766a7c 100644 --- a/pcsx2/GS/Renderers/SW/GSDrawScanlineCodeGenerator.all.h +++ b/pcsx2/GS/Renderers/SW/GSDrawScanlineCodeGenerator.all.h @@ -17,6 +17,7 @@ #include "GSScanlineEnvironment.h" #include "GSNewCodeGenerator.h" +#include "GS/MultiISA.h" #undef _t // Conflict with wx, hopefully no one needs this @@ -30,6 +31,8 @@ #define DRAW_SCANLINE_USING_YMM 0 #endif +MULTI_ISA_UNSHARED_START + class GSDrawScanlineCodeGenerator2 : public GSNewCodeGenerator { using _parent = GSNewCodeGenerator; @@ -187,3 +190,5 @@ private: int pixels, int mip_offset); void ReadTexelImpl(const Xmm& dst, const Xmm& addr, u8 i, bool texInA3, bool preserveDst); }; + +MULTI_ISA_UNSHARED_END diff --git a/pcsx2/GS/Renderers/SW/GSDrawScanlineCodeGenerator.cpp b/pcsx2/GS/Renderers/SW/GSDrawScanlineCodeGenerator.cpp index 268f30f61d..c531e9a547 100644 --- a/pcsx2/GS/Renderers/SW/GSDrawScanlineCodeGenerator.cpp +++ b/pcsx2/GS/Renderers/SW/GSDrawScanlineCodeGenerator.cpp @@ -21,11 +21,13 @@ #include #include -static std::map s_use_c_draw_scanline; -static std::mutex s_use_c_draw_scanline_mutex; +MULTI_ISA_UNSHARED_IMPL; static bool shouldUseCDrawScanline(u64 key) { + static std::map s_use_c_draw_scanline; + static std::mutex s_use_c_draw_scanline_mutex; + static const char* const fname = getenv("USE_C_DRAW_SCANLINE"); if (!fname) return false; diff --git a/pcsx2/GS/Renderers/SW/GSDrawScanlineCodeGenerator.h b/pcsx2/GS/Renderers/SW/GSDrawScanlineCodeGenerator.h index 60d7fe99ed..bfba670b6f 100644 --- a/pcsx2/GS/Renderers/SW/GSDrawScanlineCodeGenerator.h +++ b/pcsx2/GS/Renderers/SW/GSDrawScanlineCodeGenerator.h @@ -18,6 +18,7 @@ #include "GSScanlineEnvironment.h" #include "GS/Renderers/Common/GSFunctionMap.h" #include "GS/GSUtil.h" +#include "GS/MultiISA.h" #if defined(_M_AMD64) || defined(_WIN64) #define RegLong Xbyak::Reg64 @@ -25,6 +26,8 @@ #define RegLong Xbyak::Reg32 #endif +MULTI_ISA_UNSHARED_START + class GSDrawScanlineCodeGenerator : public GSCodeGenerator { void operator=(const GSDrawScanlineCodeGenerator&); @@ -36,3 +39,5 @@ class GSDrawScanlineCodeGenerator : public GSCodeGenerator public: GSDrawScanlineCodeGenerator(void* param, u64 key, void* code, size_t maxsize); }; + +MULTI_ISA_UNSHARED_END diff --git a/pcsx2/GS/Renderers/SW/GSRasterizer.cpp b/pcsx2/GS/Renderers/SW/GSRasterizer.cpp index bd8eed1bb4..2418be8370 100644 --- a/pcsx2/GS/Renderers/SW/GSRasterizer.cpp +++ b/pcsx2/GS/Renderers/SW/GSRasterizer.cpp @@ -27,6 +27,8 @@ #define ENABLE_DRAW_STATS 0 +MULTI_ISA_UNSHARED_IMPL; + int GSRasterizerData::s_counter = 0; static int compute_best_thread_height(int threads) diff --git a/pcsx2/GS/Renderers/SW/GSRasterizer.h b/pcsx2/GS/Renderers/SW/GSRasterizer.h index 02f7905029..a6e047c588 100644 --- a/pcsx2/GS/Renderers/SW/GSRasterizer.h +++ b/pcsx2/GS/Renderers/SW/GSRasterizer.h @@ -21,6 +21,9 @@ #include "GS/GSPerfMon.h" #include "GS/GSThread_CXX11.h" #include "GS/GSRingHeap.h" +#include "GS/MultiISA.h" + +MULTI_ISA_UNSHARED_START class alignas(32) GSRasterizerData : public GSAlignedClass<32> { @@ -113,7 +116,7 @@ public: __forceinline bool IsSolidRect() const { return m_dr != NULL; } }; -class IRasterizer : public GSAlignedClass<32> +class IRasterizer : public GSVirtualAlignedClass<32> { public: virtual ~IRasterizer() {} @@ -234,3 +237,5 @@ public: int GetPixels(bool reset); void PrintStats() {} }; + +MULTI_ISA_UNSHARED_END diff --git a/pcsx2/GS/Renderers/SW/GSRendererSW.cpp b/pcsx2/GS/Renderers/SW/GSRendererSW.cpp index f1ddcd9ae9..6577ce9637 100644 --- a/pcsx2/GS/Renderers/SW/GSRendererSW.cpp +++ b/pcsx2/GS/Renderers/SW/GSRendererSW.cpp @@ -18,14 +18,18 @@ #include "GS/GSGL.h" #include "common/StringUtil.h" +MULTI_ISA_UNSHARED_IMPL; + +GSRenderer* CURRENT_ISA::makeGSRendererSW(int threads) +{ + return new GSRendererSW(threads); +} + #define LOG 0 static FILE* s_fp = LOG ? fopen("c:\\temp1\\_.txt", "w") : NULL; -CONSTINIT const GSVector4 GSVertexSW::m_pos_scale = GSVector4::cxpr(1.0f / 16, 1.0f / 16, 1.0f, 128.0f); -#if _M_SSE >= 0x501 -CONSTINIT const GSVector8 GSVertexSW::m_pos_scale2 = GSVector8::cxpr(1.0f / 16, 1.0f / 16, 1.0f, 128.0f, 1.0f / 16, 1.0f / 16, 1.0f, 128.0f); -#endif +static constexpr GSVector4 s_pos_scale = GSVector4::cxpr(1.0f / 16, 1.0f / 16, 1.0f, 128.0f); GSRendererSW::GSRendererSW(int threads) : GSRenderer(), m_fzb(NULL) @@ -223,9 +227,20 @@ GSTexture* GSRendererSW::GetFeedbackOutput() return nullptr; } +MULTI_ISA_DEF(void GSVertexSWInitStatic();) + +#if MULTI_ISA_COMPILE_ONCE +GSVertexSW::ConvertVertexBufferPtr GSVertexSW::s_cvb[4][2][2][2]; +void GSVertexSW::InitStatic() +{ + MULTI_ISA_SELECT(GSVertexSWInitStatic)(); +} +#endif + +MULTI_ISA_UNSHARED_START template -void GSVertexSW::ConvertVertexBuffer(GSDrawingContext* RESTRICT ctx, GSVertexSW* RESTRICT dst, const GSVertex* RESTRICT src, size_t count) +void ConvertVertexBuffer(const GSDrawingContext* RESTRICT ctx, GSVertexSW* RESTRICT dst, const GSVertex* RESTRICT src, size_t count) { // FIXME q_div wasn't added to AVX2 code path. @@ -274,7 +289,7 @@ void GSVertexSW::ConvertVertexBuffer(GSDrawingContext* RESTRICT ctx, GSVertexSW* if (primclass == GS_SPRITE_CLASS) { - dst->p = GSVector4(xy).xyyw(GSVector4(xyzuvf)) * m_pos_scale; + dst->p = GSVector4(xy).xyyw(GSVector4(xyzuvf)) * s_pos_scale; xyzuvf = xyzuvf.min_u32(z_max); t = t.insert32<1, 3>(GSVector4::cast(xyzuvf)); @@ -282,7 +297,7 @@ void GSVertexSW::ConvertVertexBuffer(GSDrawingContext* RESTRICT ctx, GSVertexSW* else { double z = static_cast(static_cast(xyzuvf.extract32<1>())); - dst->p = (GSVector4(xy) * m_pos_scale).upld(GSVector4::f64(z, 0.0)); + dst->p = (GSVector4(xy) * s_pos_scale).upld(GSVector4::f64(z, 0.0)); t = t.blend32<8>(GSVector4(xyzuvf << 7)); } @@ -296,22 +311,23 @@ void GSVertexSW::ConvertVertexBuffer(GSDrawingContext* RESTRICT ctx, GSVertexSW* } } -// clang-format off -GSVertexSW::ConvertVertexBufferPtr GSVertexSW::s_cvb[4][2][2][2] = { -#define InitCVB3(P, T, F) { &GSVertexSW::ConvertVertexBuffer, &GSVertexSW::ConvertVertexBuffer } -#define InitCVB2(P, T) { InitCVB3(P, T, 0), InitCVB3(P, T, 1) } -#define InitCVB(P) { InitCVB2(static_cast(P), 0), InitCVB2(static_cast(P), 1) } - - InitCVB(GS_POINT_CLASS), - InitCVB(GS_LINE_CLASS), - InitCVB(GS_TRIANGLE_CLASS), - InitCVB(GS_SPRITE_CLASS) - -#undef InitCVB +void GSVertexSWInitStatic() +{ +#define InitCVB4(P, T, F, Q) GSVertexSW::s_cvb[P][T][F][Q] = ConvertVertexBuffer; +#define InitCVB3(P, T, F) InitCVB4(P, T, F, 0) InitCVB4(P, T, F, 1) +#define InitCVB2(P, T) InitCVB3(P, T, 0) InitCVB3(P, T, 1) +#define InitCVB1(P) InitCVB2(P, 0) InitCVB2(P, 1) + InitCVB1(GS_POINT_CLASS) + InitCVB1(GS_LINE_CLASS) + InitCVB1(GS_TRIANGLE_CLASS) + InitCVB1(GS_SPRITE_CLASS) +#undef InitCVB1 #undef InitCVB2 #undef InitCVB3 -}; -// clang-format on +#undef InitCVB4 +} + +MULTI_ISA_UNSHARED_END void GSRendererSW::Draw() { diff --git a/pcsx2/GS/Renderers/SW/GSRendererSW.h b/pcsx2/GS/Renderers/SW/GSRendererSW.h index 715601aa01..321bfa7c14 100644 --- a/pcsx2/GS/Renderers/SW/GSRendererSW.h +++ b/pcsx2/GS/Renderers/SW/GSRendererSW.h @@ -18,6 +18,9 @@ #include "GSTextureCacheSW.h" #include "GSDrawScanline.h" #include "GS/GSRingHeap.h" +#include "GS/MultiISA.h" + +MULTI_ISA_UNSHARED_START class GSRendererSW final : public GSRenderer { @@ -95,3 +98,5 @@ public: void Destroy() override; }; + +MULTI_ISA_UNSHARED_END diff --git a/pcsx2/GS/Renderers/SW/GSSetupPrimCodeGenerator.all.cpp b/pcsx2/GS/Renderers/SW/GSSetupPrimCodeGenerator.all.cpp index bd49c45abf..8571a7e352 100644 --- a/pcsx2/GS/Renderers/SW/GSSetupPrimCodeGenerator.all.cpp +++ b/pcsx2/GS/Renderers/SW/GSSetupPrimCodeGenerator.all.cpp @@ -17,6 +17,7 @@ #include "GSSetupPrimCodeGenerator.all.h" #include "GSVertexSW.h" +MULTI_ISA_UNSHARED_IMPL; using namespace Xbyak; #define _rip_local(field) ((m_rip) ? ptr[rip + (char*)&m_local.field] : ptr[_m_local + OFFSETOF(GSScanlineLocalData, field)]) diff --git a/pcsx2/GS/Renderers/SW/GSSetupPrimCodeGenerator.all.h b/pcsx2/GS/Renderers/SW/GSSetupPrimCodeGenerator.all.h index 6b1463f919..c37170fc0a 100644 --- a/pcsx2/GS/Renderers/SW/GSSetupPrimCodeGenerator.all.h +++ b/pcsx2/GS/Renderers/SW/GSSetupPrimCodeGenerator.all.h @@ -17,6 +17,7 @@ #include "GSScanlineEnvironment.h" #include "GSNewCodeGenerator.h" +#include "GS/MultiISA.h" #if _M_SSE >= 0x501 #define SETUP_PRIM_VECTOR_REGISTER Xbyak::Ymm @@ -28,6 +29,8 @@ #define SETUP_PRIM_USING_YMM 0 #endif +MULTI_ISA_UNSHARED_START + class GSSetupPrimCodeGenerator2 : public GSNewCodeGenerator { using _parent = GSNewCodeGenerator; @@ -77,3 +80,5 @@ private: void Texture(); void Color(); }; + +MULTI_ISA_UNSHARED_END diff --git a/pcsx2/GS/Renderers/SW/GSSetupPrimCodeGenerator.cpp b/pcsx2/GS/Renderers/SW/GSSetupPrimCodeGenerator.cpp index d28d7ba162..08cf573733 100644 --- a/pcsx2/GS/Renderers/SW/GSSetupPrimCodeGenerator.cpp +++ b/pcsx2/GS/Renderers/SW/GSSetupPrimCodeGenerator.cpp @@ -17,7 +17,7 @@ #include "GSSetupPrimCodeGenerator.h" #include "GSSetupPrimCodeGenerator.all.h" -using namespace Xbyak; +MULTI_ISA_UNSHARED_IMPL; GSSetupPrimCodeGenerator::GSSetupPrimCodeGenerator(void* param, u64 key, void* code, size_t maxsize) : GSCodeGenerator(code, maxsize) diff --git a/pcsx2/GS/Renderers/SW/GSSetupPrimCodeGenerator.h b/pcsx2/GS/Renderers/SW/GSSetupPrimCodeGenerator.h index 17b50e6b60..affbf29676 100644 --- a/pcsx2/GS/Renderers/SW/GSSetupPrimCodeGenerator.h +++ b/pcsx2/GS/Renderers/SW/GSSetupPrimCodeGenerator.h @@ -18,6 +18,9 @@ #include "GSScanlineEnvironment.h" #include "GS/Renderers/Common/GSFunctionMap.h" #include "GS/GSUtil.h" +#include "GS/MultiISA.h" + +MULTI_ISA_UNSHARED_START class GSSetupPrimCodeGenerator : public GSCodeGenerator { @@ -35,3 +38,5 @@ class GSSetupPrimCodeGenerator : public GSCodeGenerator public: GSSetupPrimCodeGenerator(void* param, u64 key, void* code, size_t maxsize); }; + +MULTI_ISA_UNSHARED_END diff --git a/pcsx2/GS/Renderers/SW/GSVertexSW.h b/pcsx2/GS/Renderers/SW/GSVertexSW.h index 028ecbc381..316f6cde61 100644 --- a/pcsx2/GS/Renderers/SW/GSVertexSW.h +++ b/pcsx2/GS/Renderers/SW/GSVertexSW.h @@ -246,17 +246,11 @@ struct alignas(32) GSVertexSW #endif } - typedef void (*ConvertVertexBufferPtr)(GSDrawingContext* RESTRICT ctx, GSVertexSW* RESTRICT dst, const GSVertex* RESTRICT src, size_t count); + typedef void (*ConvertVertexBufferPtr)(const GSDrawingContext* RESTRICT ctx, GSVertexSW* RESTRICT dst, const GSVertex* RESTRICT src, size_t count); static ConvertVertexBufferPtr s_cvb[4][2][2][2]; - template - static void ConvertVertexBuffer(GSDrawingContext* RESTRICT ctx, GSVertexSW* RESTRICT dst, const GSVertex* RESTRICT src, size_t count); - - static const GSVector4 m_pos_scale; -#if _M_SSE >= 0x501 - static const GSVector8 m_pos_scale2; -#endif + static void InitStatic(); }; #if _M_SSE >= 0x501 diff --git a/pcsx2/pcsx2.vcxproj b/pcsx2/pcsx2.vcxproj index 66e8ff908c..f8e9b92b56 100644 --- a/pcsx2/pcsx2.vcxproj +++ b/pcsx2/pcsx2.vcxproj @@ -456,6 +456,7 @@ + diff --git a/pcsx2/pcsx2.vcxproj.filters b/pcsx2/pcsx2.vcxproj.filters index 3c36422c72..8fdceb3c52 100644 --- a/pcsx2/pcsx2.vcxproj.filters +++ b/pcsx2/pcsx2.vcxproj.filters @@ -1607,6 +1607,9 @@ System\Ps2\GS\Renderers\Hardware + + System\Ps2\GS\Renderers\Hardware + System\Ps2\GS\Renderers\Hardware diff --git a/pcsx2/pcsx2core.vcxproj b/pcsx2/pcsx2core.vcxproj index aae45c7b42..6410aabdde 100644 --- a/pcsx2/pcsx2core.vcxproj +++ b/pcsx2/pcsx2core.vcxproj @@ -309,6 +309,7 @@ + diff --git a/pcsx2/pcsx2core.vcxproj.filters b/pcsx2/pcsx2core.vcxproj.filters index dbfb50d0da..977451b52f 100644 --- a/pcsx2/pcsx2core.vcxproj.filters +++ b/pcsx2/pcsx2core.vcxproj.filters @@ -1100,6 +1100,9 @@ System\Ps2\GS\Renderers\Hardware + + System\Ps2\GS\Renderers\Hardware + System\Ps2\GS\Renderers\Hardware