From 59fd815c3d86222768dc7177066e1c874b9bf117 Mon Sep 17 00:00:00 2001 From: TellowKrinkle Date: Tue, 2 Mar 2021 22:32:33 -0600 Subject: [PATCH] GS: Faster GSOffset::PAHelper --- pcsx2/GS/GSClut.cpp | 21 ++- pcsx2/GS/GSLocalMemory.cpp | 198 ++++++++++++----------- pcsx2/GS/GSLocalMemory.h | 97 ++++++++--- pcsx2/GS/GSState.cpp | 56 ++++--- pcsx2/GS/Renderers/HW/GSRendererHW.cpp | 30 ++-- pcsx2/GS/Renderers/SW/GSDrawScanline.cpp | 8 +- 6 files changed, 253 insertions(+), 157 deletions(-) diff --git a/pcsx2/GS/GSClut.cpp b/pcsx2/GS/GSClut.cpp index ceed73402e..1ae56ecbc3 100644 --- a/pcsx2/GS/GSClut.cpp +++ b/pcsx2/GS/GSClut.cpp @@ -197,14 +197,13 @@ template void GSClut::WriteCLUT32_CSM2(const GIFRegTEX0& TEX0, const GIFRegTEXCLUT& TEXCLUT) { GSOffset off = GSOffset::fromKnownPSM(TEX0.CBP, TEXCLUT.CBW, PSM_PSMCT32); - GSOffset::PAHelper pa = off.paMulti(TEXCLUT.COV); + auto pa = off.paMulti(m_mem->m_vm32, TEXCLUT.COU << 4, TEXCLUT.COV); - int x = TEXCLUT.COU << 4; uint16* RESTRICT clut = m_clut + ((TEX0.CSA & 15) << 4); - for (int i = 0; i < n; x++, i++) + for (int i = 0; i < n; i++) { - uint32 c = m_mem->m_vm32[pa.value(x)]; + uint32 c = *pa.value(i); clut[i] = (uint16)(c & 0xffff); clut[i + 256] = (uint16)(c >> 16); @@ -215,14 +214,13 @@ template void GSClut::WriteCLUT16_CSM2(const GIFRegTEX0& TEX0, const GIFRegTEXCLUT& TEXCLUT) { GSOffset off = GSOffset::fromKnownPSM(TEX0.CBP, TEXCLUT.CBW, PSM_PSMCT16); - GSOffset::PAHelper pa = off.paMulti(TEXCLUT.COV); + auto pa = off.paMulti(m_mem->m_vm16, TEXCLUT.COU << 4, TEXCLUT.COV); - int x = TEXCLUT.COU << 4; uint16* RESTRICT clut = m_clut + (TEX0.CSA << 4); - for (int i = 0; i < n; x++, i++) + for (int i = 0; i < n; i++) { - clut[i] = m_mem->m_vm16[pa.value(x)]; + clut[i] = *pa.value(i); } } @@ -230,14 +228,13 @@ template void GSClut::WriteCLUT16S_CSM2(const GIFRegTEX0& TEX0, const GIFRegTEXCLUT& TEXCLUT) { GSOffset off = GSOffset::fromKnownPSM(TEX0.CBP, TEXCLUT.CBW, PSM_PSMCT16S); - GSOffset::PAHelper pa = 
off.paMulti(TEXCLUT.COV); + auto pa = off.paMulti(m_mem->m_vm16, TEXCLUT.COU << 4, TEXCLUT.COV); - int x = TEXCLUT.COU << 4; uint16* RESTRICT clut = m_clut + (TEX0.CSA << 4); - for (int i = 0; i < n; x++, i++) + for (int i = 0; i < n; i++) { - clut[i] = m_mem->m_vm16[pa.value(x)]; + clut[i] = *pa.value(i); } } diff --git a/pcsx2/GS/GSLocalMemory.cpp b/pcsx2/GS/GSLocalMemory.cpp index b021de4f9b..93f3376e33 100644 --- a/pcsx2/GS/GSLocalMemory.cpp +++ b/pcsx2/GS/GSLocalMemory.cpp @@ -1065,6 +1065,63 @@ void GSLocalMemory::WriteImage24Z(int& tx, int& ty, const uint8* src, int len, G } } +/// Helper for WriteImageX and ReadImageX +/// `len` is in pixels, unlike WriteImageX/ReadImageX where it's bytes +/// `xinc` is the amount to increment `x` by per iteration +/// Calls `paGetter` on a starting (x, y) to get some sort of pixel address helper for each line, +/// then `fn` on the helper and an x offset once for every `xinc` pixels along that line +template +static void readWriteHelperImpl(int& tx, int& ty, int len, int xinc, int sx, int w, PAGetter&& paGetter, Fn&& fn) +{ + int y = ty; + int ex = sx + w; + int remX = ex - tx; + + ASSERT(remX >= 0); + + auto pa = paGetter(tx, y); + + while (len > 0) + { + int stop = std::min(remX, len); + len -= stop; + remX -= stop; + + for (int x = 0; x < stop; x += xinc) + fn(pa, x); + + if (remX == 0) + { + y++; + remX = w; + pa = paGetter(sx, y); + } + } + + tx = ex - remX; + ty = y; +} + +/// Helper for WriteImageX and ReadImageX +/// `len` is in pixels, unlike WriteImageX/ReadImageX where it's bytes +/// `xinc` is the amount to increment `x` by per iteration +/// Calls `fn` with a `PAHelper` representing the current line and an int representing the x offset in that line +template +static void readWriteHelper(int& tx, int& ty, int len, int xinc, int sx, int w, const GSOffset& off, Fn&& fn) +{ + readWriteHelperImpl(tx, ty, len, xinc, sx, w, [&](int x, int y){ return off.paMulti(x, y); }, std::forward(fn)); +} + +/// Helper for 
WriteImageX and ReadImageX +/// `len` is in pixels, unlike WriteImageX/ReadImageX where it's bytes +/// `xinc` is the amount to increment `x` by per iteration +/// Calls `fn` with a `PAPtrHelper` representing the current line and an int representing the x offset in that line +template +static void readWriteHelper(VM* vm, int& tx, int& ty, int len, int xinc, int sx, int w, const GSOffset& off, Fn&& fn) +{ + readWriteHelperImpl(tx, ty, len, xinc, sx, w, [&](int x, int y){ return off.paMulti(vm, x, y); }, std::forward(fn)); +} + void GSLocalMemory::WriteImageX(int& tx, int& ty, const uint8* src, int len, GIFRegBITBLTBUF& BITBLTBUF, GIFRegTRXPOS& TRXPOS, GIFRegTRXREG& TRXREG) { if (len <= 0) @@ -1077,27 +1134,8 @@ void GSLocalMemory::WriteImageX(int& tx, int& ty, const uint8* src, int len, GIF uint32 bp = BITBLTBUF.DBP; uint32 bw = BITBLTBUF.DBW; - int x = tx; - int y = ty; - int sx = (int)TRXPOS.DSAX; - int ex = sx + (int)TRXREG.RRW; - - auto copy = [&](int len, const GSOffset& off, auto&& fn) - { - GSOffset::PAHelper pa = off.paMulti(y); - - for (; len > 0; len--) - { - fn(pa); - x++; - if (x >= ex) - { - y++; - x = sx; - pa = off.paMulti(y); - } - } - }; + int sx = TRXPOS.DSAX; + int w = TRXREG.RRW; GSOffset off = GetOffset(bp, bw, BITBLTBUF.DPSM); @@ -1105,16 +1143,16 @@ void GSLocalMemory::WriteImageX(int& tx, int& ty, const uint8* src, int len, GIF { case PSM_PSMCT32: case PSM_PSMZ32: - copy(len / 4, off.assertSizesMatch(swizzle32), [&](GSOffset::PAHelper& pa) + readWriteHelper(m_vm32, tx, ty, len / 4, 1, sx, w, off.assertSizesMatch(swizzle32), [&](auto& pa, int x) { - WritePixel32(pa.value(x), *pd); + *pa.value(x) = *pd; pd++; }); break; case PSM_PSMCT24: case PSM_PSMZ24: - copy(len / 3, off.assertSizesMatch(swizzle32), [&](GSOffset::PAHelper& pa) + readWriteHelper(m_vm32, tx, ty, len / 3, 1, sx, w, off.assertSizesMatch(swizzle32), [&](auto& pa, int x) { WritePixel24(pa.value(x), *(uint32*)pb); pb += 3; @@ -1125,32 +1163,32 @@ void 
GSLocalMemory::WriteImageX(int& tx, int& ty, const uint8* src, int len, GIF case PSM_PSMCT16S: case PSM_PSMZ16: case PSM_PSMZ16S: - copy(len / 2, off.assertSizesMatch(swizzle16), [&](GSOffset::PAHelper& pa) + readWriteHelper(m_vm16, tx, ty, len / 2, 1, sx, w, off.assertSizesMatch(swizzle16), [&](auto& pa, int x) { - WritePixel16(pa.value(x), *pw); + *pa.value(x) = *pw; pw++; }); break; case PSM_PSMT8: - copy(len, GSOffset::fromKnownPSM(bp, bw, PSM_PSMT8), [&](GSOffset::PAHelper& pa) + readWriteHelper(m_vm8, tx, ty, len, 1, sx, w, GSOffset::fromKnownPSM(bp, bw, PSM_PSMT8), [&](auto& pa, int x) { - WritePixel8(pa.value(x), *pb); + *pa.value(x) = *pb; pb++; }); break; case PSM_PSMT4: - copy(len, GSOffset::fromKnownPSM(bp, bw, PSM_PSMT4), [&](GSOffset::PAHelper& pa) + readWriteHelper(tx, ty, len * 2, 2, sx, w, GSOffset::fromKnownPSM(bp, bw, PSM_PSMT4), [&](GSOffset::PAHelper& pa, int x) { - WritePixel4(pa.value(x++), *pb & 0xf); - WritePixel4(pa.value(x), *pb >> 4); + WritePixel4(pa.value(x), *pb & 0xf); + WritePixel4(pa.value(x + 1), *pb >> 4); pb++; }); break; case PSM_PSMT8H: - copy(len, GSOffset::fromKnownPSM(bp, bw, PSM_PSMT8H), [&](GSOffset::PAHelper& pa) + readWriteHelper(m_vm32, tx, ty, len, 1, sx, w, GSOffset::fromKnownPSM(bp, bw, PSM_PSMT8H), [&](auto& pa, int x) { WritePixel8H(pa.value(x), *pb); pb++; @@ -1158,26 +1196,23 @@ void GSLocalMemory::WriteImageX(int& tx, int& ty, const uint8* src, int len, GIF break; case PSM_PSMT4HL: - copy(len, GSOffset::fromKnownPSM(bp, bw, PSM_PSMT4HL), [&](GSOffset::PAHelper& pa) + readWriteHelper(m_vm32, tx, ty, len * 2, 2, sx, w, GSOffset::fromKnownPSM(bp, bw, PSM_PSMT4HL), [&](auto& pa, int x) { - WritePixel4HL(pa.value(x++), *pb & 0xf); - WritePixel4HL(pa.value(x), *pb >> 4); + WritePixel4HL(pa.value(x), *pb & 0xf); + WritePixel4HL(pa.value(x + 1), *pb >> 4); pb++; }); break; case PSM_PSMT4HH: - copy(len, GSOffset::fromKnownPSM(bp, bw, PSM_PSMT4HH), [&](GSOffset::PAHelper& pa) + readWriteHelper(m_vm32, tx, ty, len * 2, 2, 
sx, w, GSOffset::fromKnownPSM(bp, bw, PSM_PSMT4HH), [&](auto& pa, int x) { - WritePixel4HH(pa.value(x++), *pb & 0xf); - WritePixel4HH(pa.value(x), *pb >> 4); + WritePixel4HH(pa.value(x), *pb & 0xf); + WritePixel4HH(pa.value(x + 1), *pb >> 4); pb++; }); break; } - - tx = x; - ty = y; } // @@ -1193,29 +1228,9 @@ void GSLocalMemory::ReadImageX(int& tx, int& ty, uint8* dst, int len, GIFRegBITB uint32 bp = BITBLTBUF.SBP; uint32 bw = BITBLTBUF.SBW; - psm_t* RESTRICT psm = &m_psm[BITBLTBUF.SPSM]; - int x = tx; - int y = ty; - int sx = (int)TRXPOS.SSAX; - int ex = sx + (int)TRXREG.RRW; - - auto copy = [&](int len, const GSOffset& off, auto&& fn) - { - GSOffset::PAHelper pa = off.paMulti(y); - - for (; len > 0; len--) - { - fn(pa); - x++; - if (x >= ex) - { - y++; - x = sx; - pa = off.paMulti(y); - } - } - }; + int sx = TRXPOS.SSAX; + int w = TRXREG.RRW; GSOffset off = GetOffset(bp, bw, BITBLTBUF.SPSM); @@ -1228,50 +1243,57 @@ void GSLocalMemory::ReadImageX(int& tx, int& ty, uint8* dst, int len, GIFRegBITB { // MGS1 intro, fade effect between two scenes (airplane outside-inside transition) + int x = tx; + int y = ty; + int ex = sx + w; + len /= 4; - GSOffset::PAHelper pa = off.assertSizesMatch(swizzle32).paMulti(y); + GSOffset::PAPtrHelper pa = off.assertSizesMatch(swizzle32).paMulti(m_vm32, 0, y); while (len > 0) { for (; len > 0 && x < ex && (x & 7); len--, x++, pd++) { - *pd = m_vm32[pa.value(x)]; + *pd = *pa.value(x); } // aligned to a column for (int ex8 = ex - 8; len >= 8 && x <= ex8; len -= 8, x += 8, pd += 8) { - uint32* ps = m_vm32 + pa.value(x); + uint32* ps = pa.value(x); GSVector4i::store(&pd[0], GSVector4i::load(ps + 0, ps + 4)); GSVector4i::store(&pd[4], GSVector4i::load(ps + 8, ps + 12)); for (int i = 0; i < 8; i++) - ASSERT(pd[i] == m_vm32[pa.value(x + i)]); + ASSERT(pd[i] == *pa.value(x + i)); } for (; len > 0 && x < ex; len--, x++, pd++) { - *pd = m_vm32[pa.value(x)]; + *pd = *pa.value(x); } if (x == ex) { y++; x = sx; - pa = 
off.assertSizesMatch(swizzle32).paMulti(y); + pa = off.assertSizesMatch(swizzle32).paMulti(m_vm32, 0, y); } } + + tx = x; + ty = y; } break; case PSM_PSMCT24: case PSM_PSMZ24: - copy(len / 3, off.assertSizesMatch(swizzle32), [&](GSOffset::PAHelper& pa) + readWriteHelper(m_vm32, tx, ty, len / 3, 1, sx, w, off.assertSizesMatch(swizzle32), [&](auto& pa, int x) { - uint32 c = m_vm32[pa.value(x)]; + uint32 c = *pa.value(x); pb[0] = (uint8)(c); pb[1] = (uint8)(c >> 8); pb[2] = (uint8)(c >> 16); @@ -1283,62 +1305,59 @@ void GSLocalMemory::ReadImageX(int& tx, int& ty, uint8* dst, int len, GIFRegBITB case PSM_PSMCT16S: case PSM_PSMZ16: case PSM_PSMZ16S: - copy(len / 2, off.assertSizesMatch(swizzle16), [&](GSOffset::PAHelper& pa) + readWriteHelper(m_vm16, tx, ty, len / 2, 1, sx, w, off.assertSizesMatch(swizzle16), [&](auto& pa, int x) { - *pw = m_vm16[pa.value(x)]; + *pw = *pa.value(x); pw++; }); break; case PSM_PSMT8: - copy(len, GSOffset::fromKnownPSM(bp, bw, PSM_PSMT8), [&](GSOffset::PAHelper& pa) + readWriteHelper(m_vm8, tx, ty, len, 1, sx, w, GSOffset::fromKnownPSM(bp, bw, PSM_PSMT8), [&](auto& pa, int x) { - *pb = m_vm8[pa.value(x)]; + *pb = *pa.value(x); pb++; }); break; case PSM_PSMT4: - copy(len, GSOffset::fromKnownPSM(bp, bw, PSM_PSMT4), [&](GSOffset::PAHelper& pa) + readWriteHelper(tx, ty, len * 2, 2, sx, w, GSOffset::fromKnownPSM(bp, bw, PSM_PSMT4), [&](GSOffset::PAHelper& pa, int x) { - uint8 low = ReadPixel4(pa.value(x++)); - uint8 high = ReadPixel4(pa.value(x)); + uint8 low = ReadPixel4(pa.value(x)); + uint8 high = ReadPixel4(pa.value(x + 1)); *pb = low | (high << 4); pb++; }); break; case PSM_PSMT8H: - copy(len, GSOffset::fromKnownPSM(bp, bw, PSM_PSMT8H), [&](GSOffset::PAHelper& pa) + readWriteHelper(m_vm32, tx, ty, len, 1, sx, w, GSOffset::fromKnownPSM(bp, bw, PSM_PSMT8H), [&](auto& pa, int x) { - *pb = (uint8)(m_vm32[pa.value(x)] >> 24); + *pb = (uint8)(*pa.value(x) >> 24); pb++; }); break; case PSM_PSMT4HL: - copy(len, GSOffset::fromKnownPSM(bp, bw, 
PSM_PSMT4HL), [&](GSOffset::PAHelper& pa) + readWriteHelper(m_vm32, tx, ty, len * 2, 2, sx, w, GSOffset::fromKnownPSM(bp, bw, PSM_PSMT4HL), [&](auto& pa, int x) { - uint32 c0 = m_vm32[pa.value(x++)] >> 24 & 0x0f; - uint32 c1 = m_vm32[pa.value(x)] >> 20 & 0xf0; + uint32 c0 = *pa.value(x) >> 24 & 0x0f; + uint32 c1 = *pa.value(x + 1) >> 20 & 0xf0; *pb = (uint8)(c0 | c1); pb++; }); break; case PSM_PSMT4HH: - copy(len, GSOffset::fromKnownPSM(bp, bw, PSM_PSMT4HH), [&](GSOffset::PAHelper& pa) + readWriteHelper(m_vm32, tx, ty, len * 2, 2, sx, w, GSOffset::fromKnownPSM(bp, bw, PSM_PSMT4HH), [&](auto& pa, int x) { - uint32 c0 = m_vm32[pa.value(x++)] >> 28 & 0x0f; - uint32 c1 = m_vm32[pa.value(x)] >> 24 & 0xf0; + uint32 c0 = *pa.value(x) >> 28 & 0x0f; + uint32 c1 = *pa.value(x + 1) >> 24 & 0xf0; *pb = (uint8)(c0 | c1); pb++; }); break; } - - tx = x; - ty = y; } /////////////////// diff --git a/pcsx2/GS/GSLocalMemory.h b/pcsx2/GS/GSLocalMemory.h index 04d393a462..80a3b5ea28 100644 --- a/pcsx2/GS/GSLocalMemory.h +++ b/pcsx2/GS/GSLocalMemory.h @@ -206,41 +206,80 @@ public: fn(bn.value()); } + /// Calculate the pixel address at the given y position with x of 0 + int pixelAddressZeroX(int y) const + { + int base = m_bp << (m_pageShiftX + m_pageShiftY - 5); // Offset from base pointer + base += ((y & ~m_pageMask.y) * m_bwPg) << m_pageShiftX; // Offset from pages in y direction + // TODO: Old GSOffset masked here but is that useful? Probably should mask at end or not at all... 
+ base &= (MAX_PAGES << (m_pageShiftX + m_pageShiftY)) - 1; // Mask + base += m_pixelSwizzleCol[y & m_pageMask.y]; // Add offset from y within page + return base; + } + /// Helper class for efficiently getting the addresses of multiple pixels in a line (along the x axis) class PAHelper { /// Pixel swizzle array - const GSPixelRowOffsetTable* m_pixelSwizzleRow; + const int* m_pixelSwizzleRow; int m_base; public: PAHelper() = default; - PAHelper(const GSOffset& off, int y) + PAHelper(const GSOffset& off, int x, int y) { - m_pixelSwizzleRow = off.m_pixelSwizzleRow[y & off.m_pixelRowMask]; - m_base = off.m_bp << (off.m_pageShiftX + off.m_pageShiftY - 5); - m_base += ((y & ~off.m_pageMask.y) * off.m_bwPg) << off.m_pageShiftX; - m_base &= (MAX_PAGES << (off.m_pageShiftX + off.m_pageShiftY)) - 1; - m_base += off.m_pixelSwizzleCol[y & off.m_pageMask.y]; + m_pixelSwizzleRow = off.m_pixelSwizzleRow[y & off.m_pixelRowMask]->value + x; + m_base = off.pixelAddressZeroX(y); } - /// Get current pixel address - uint32 value(size_t x) const + /// Get the pixel address at the given x offset from the x used to create the PAHelper + uint32 value(int x) const { - return m_base + (*m_pixelSwizzleRow)[x]; + return m_base + m_pixelSwizzleRow[x]; + } + }; + + /// Helper class for efficiently getting the addresses of multiple pixels in a line (along the x axis) + /// Slightly more efficient than PAHelper by pre-adding the base offset to the VM pointer + template + class PAPtrHelper + { + /// Pixel swizzle array + const int* m_pixelSwizzleRow; + VM* m_base; + + public: + PAPtrHelper() = default; + PAPtrHelper(const GSOffset& off, VM* vm, int x, int y) + { + m_pixelSwizzleRow = off.m_pixelSwizzleRow[y & off.m_pixelRowMask]->value + x; + m_base = &vm[off.pixelAddressZeroX(y)]; + } + + /// Get a pointer to the pixel at the given x offset from the x used to create the PAPtrHelper + VM* value(int x) const + { + return m_base + m_pixelSwizzleRow[x]; } }; /// Get the address of the given pixel uint32 
pa(int x, int y) const { - return PAHelper(*this, y).value(x); + return PAHelper(*this, 0, y).value(x); } /// Get a helper class for efficiently calculating multiple pixel addresses in a line (along the x axis) - PAHelper paMulti(int y) const + PAHelper paMulti(int x, int y) const { - return PAHelper(*this, y); + return PAHelper(*this, x, y); + } + + /// Get a helper class for efficiently calculating multiple pixel addresses in a line (along the x axis) + template + PAPtrHelper paMulti(VM* vm, int x, int y) const + { + return PAPtrHelper(*this, vm, x, y); } /// Loop over the pixels in the given rectangle @@ -252,10 +291,10 @@ public: for (int y = r.top; y < r.bottom; y++, px = reinterpret_cast(reinterpret_cast(px) + pitch)) { - PAHelper pa = paMulti(y); + PAPtrHelper pa = paMulti(vm, 0, y); for (int x = r.left; x < r.right; x++) { - fn(vm + pa.value(x), px + x); + fn(pa.value(x), px + x); } } } @@ -739,9 +778,14 @@ public: m_vm32[addr] = c; } + __forceinline static void WritePixel24(uint32* addr, uint32 c) + { + *addr = (*addr & 0xff000000) | (c & 0x00ffffff); + } + __forceinline void WritePixel24(uint32 addr, uint32 c) { - m_vm32[addr] = (m_vm32[addr] & 0xff000000) | (c & 0x00ffffff); + WritePixel24(m_vm32 + addr, c); } __forceinline void WritePixel16(uint32 addr, uint32 c) @@ -762,19 +806,34 @@ public: m_vm8[addr] = (uint8)((m_vm8[addr] & (0xf0 >> shift)) | ((c & 0x0f) << shift)); } + __forceinline static void WritePixel8H(uint32* addr, uint32 c) + { + *addr = (*addr & 0x00ffffff) | (c << 24); + } + __forceinline void WritePixel8H(uint32 addr, uint32 c) { - m_vm32[addr] = (m_vm32[addr] & 0x00ffffff) | (c << 24); + WritePixel8H(m_vm32 + addr, c); + } + + __forceinline static void WritePixel4HL(uint32* addr, uint32 c) + { + *addr = (*addr & 0xf0ffffff) | ((c & 0x0f) << 24); } __forceinline void WritePixel4HL(uint32 addr, uint32 c) { - m_vm32[addr] = (m_vm32[addr] & 0xf0ffffff) | ((c & 0x0f) << 24); + WritePixel4HL(m_vm32 + addr, c); + } + + __forceinline static 
void WritePixel4HH(uint32* addr, uint32 c) + { + *addr = (*addr & 0x0fffffff) | ((c & 0x0f) << 28); } __forceinline void WritePixel4HH(uint32 addr, uint32 c) { - m_vm32[addr] = (m_vm32[addr] & 0x0fffffff) | ((c & 0x0f) << 28); + WritePixel4HH(m_vm32 + addr, c); } __forceinline void WriteFrame16(uint32 addr, uint32 c) diff --git a/pcsx2/GS/GSState.cpp b/pcsx2/GS/GSState.cpp index 1387066469..0f7ca88b57 100644 --- a/pcsx2/GS/GSState.cpp +++ b/pcsx2/GS/GSState.cpp @@ -1657,66 +1657,86 @@ void GSState::Move() GSOffset spo = m_mem.GetOffset(sbp, sbw, m_env.BITBLTBUF.SPSM); GSOffset dpo = m_mem.GetOffset(dbp, dbw, m_env.BITBLTBUF.DPSM); - auto copy = [&](const GSOffset& dpo, const GSOffset& spo, auto&& pxCopyFn) + auto genericCopy = [=](const GSOffset& dpo, const GSOffset& spo, auto&& getPAHelper, auto&& pxCopyFn) { + int _sy = sy, _dy = dy; // Faster with local copied variables, compiler optimizations are dumb if (xinc > 0) { - for (int y = 0; y < h; y++, sy += yinc, dy += yinc) + for (int y = 0; y < h; y++, _sy += yinc, _dy += yinc) { - GSOffset::PAHelper s = spo.paMulti(sy); - GSOffset::PAHelper d = dpo.paMulti(dy); + auto s = getPAHelper(spo, sx, _sy); + auto d = getPAHelper(dpo, dx, _dy); for (int x = 0; x < w; x++) { - pxCopyFn(d.value(dx + x), s.value(sx + x)); + pxCopyFn(d, s, x); } } } else { - for (int y = 0; y < h; y++, sy += yinc, dy += yinc) + for (int y = 0; y < h; y++, _sy += yinc, _dy += yinc) { - GSOffset::PAHelper s = spo.paMulti(sy); - GSOffset::PAHelper d = dpo.paMulti(dy); + auto s = getPAHelper(spo, sx, _sy); + auto d = getPAHelper(dpo, dx, _dy); for (int x = 0; x < w; x++) { - pxCopyFn(d.value(dx - x), s.value(sx - x)); + pxCopyFn(d, s, -x); } } } }; + auto copy = [=](const GSOffset& dpo, const GSOffset& spo, auto&& pxCopyFn) + { + genericCopy(dpo, spo, + [](const GSOffset& o, int x, int y) { return o.paMulti(x, y); }, + [=](const GSOffset::PAHelper& d, const GSOffset::PAHelper& s, int x) + { + return pxCopyFn(d.value(x), s.value(x)); + }); + }; + 
+ auto copyFast = [=](auto* vm, const GSOffset& dpo, const GSOffset& spo, auto&& pxCopyFn) + { + genericCopy(dpo, spo, + [=](const GSOffset& o, int x, int y) { return o.paMulti(vm, x, y); }, + [=](const auto& d, const auto& s, int x) + { + return pxCopyFn(d.value(x), s.value(x)); + }); + }; + if (spsm.trbpp == dpsm.trbpp && spsm.trbpp >= 16) { if (spsm.trbpp == 32) { - copy(dpo.assertSizesMatch(GSLocalMemory::swizzle32), spo.assertSizesMatch(GSLocalMemory::swizzle32), [&](uint32 doff, uint32 soff) + copyFast(m_mem.m_vm32, dpo.assertSizesMatch(GSLocalMemory::swizzle32), spo.assertSizesMatch(GSLocalMemory::swizzle32), [](uint32* d, uint32* s) { - m_mem.m_vm32[doff] = m_mem.m_vm32[soff]; + *d = *s; }); } else if (spsm.trbpp == 24) { - copy(dpo.assertSizesMatch(GSLocalMemory::swizzle32), spo.assertSizesMatch(GSLocalMemory::swizzle32), [&](uint32 doff, uint32 soff) + copyFast(m_mem.m_vm32, dpo.assertSizesMatch(GSLocalMemory::swizzle32), spo.assertSizesMatch(GSLocalMemory::swizzle32), [](uint32* d, uint32* s) { - uint32& d = m_mem.m_vm32[doff]; - d = (d & 0xff000000) | (m_mem.m_vm32[soff] & 0x00ffffff); + *d = (*d & 0xff000000) | (*s & 0x00ffffff); }); } else // if(spsm.trbpp == 16) { - copy(dpo.assertSizesMatch(GSLocalMemory::swizzle16), spo.assertSizesMatch(GSLocalMemory::swizzle16), [&](uint32 doff, uint32 soff) + copyFast(m_mem.m_vm16, dpo.assertSizesMatch(GSLocalMemory::swizzle16), spo.assertSizesMatch(GSLocalMemory::swizzle16), [](uint16* d, uint16* s) { - m_mem.m_vm16[doff] = m_mem.m_vm16[soff]; + *d = *s; }); } } else if (m_env.BITBLTBUF.SPSM == PSM_PSMT8 && m_env.BITBLTBUF.DPSM == PSM_PSMT8) { - copy(GSOffset::fromKnownPSM(dbp, dbw, PSM_PSMT8), GSOffset::fromKnownPSM(sbp, sbw, PSM_PSMT8), [&](uint32 doff, uint32 soff) + copyFast(m_mem.m_vm8, GSOffset::fromKnownPSM(dbp, dbw, PSM_PSMT8), GSOffset::fromKnownPSM(sbp, sbw, PSM_PSMT8), [](uint8* d, uint8* s) { - m_mem.m_vm8[doff] = m_mem.m_vm8[soff]; + *d = *s; }); } else if (m_env.BITBLTBUF.SPSM == PSM_PSMT4 && 
m_env.BITBLTBUF.DPSM == PSM_PSMT4) diff --git a/pcsx2/GS/Renderers/HW/GSRendererHW.cpp b/pcsx2/GS/Renderers/HW/GSRendererHW.cpp index 9c40a7cc83..884418e46f 100644 --- a/pcsx2/GS/Renderers/HW/GSRendererHW.cpp +++ b/pcsx2/GS/Renderers/HW/GSRendererHW.cpp @@ -907,23 +907,23 @@ void GSRendererHW::SwSpriteRender() for (int y = 0; y < h; y++, ++sy, ++dy) { - GSOffset::PAHelper spa = texture_mapping_enabled ? spo.paMulti(sy) : GSOffset::PAHelper(); - GSOffset::PAHelper dpa = dpo.paMulti(dy); + auto spa = texture_mapping_enabled ? spo.paMulti(m_mem.m_vm32, sx, sy) : GSOffset::PAPtrHelper(); + auto dpa = dpo.paMulti(m_mem.m_vm32, dx, dy); ASSERT(w % 2 == 0); for (int x = 0; x < w; x += 2) { - uint32 di = dpa.value(dx + x); - ASSERT(di + 1 == dpa.value(dx + x + 1)); // Destination pixel pair is adjacent in memory + uint32* di = dpa.value(x); + ASSERT(di + 1 == dpa.value(x + 1)); // Destination pixel pair is adjacent in memory GSVector4i sc; if (texture_mapping_enabled) { - uint32 si = spa.value(sx + x); + uint32* si = spa.value(x); // Read 2 source pixel colors - ASSERT((si + 1) == spa.value(sx + x + 1)); // Source pixel pair is adjacent in memory - sc = GSVector4i::loadl(&m_mem.m_vm32[si]).u8to16(); // 0x00AA00BB00GG00RR00aa00bb00gg00rr + ASSERT((si + 1) == spa.value(x + 1)); // Source pixel pair is adjacent in memory + sc = GSVector4i::loadl(si).u8to16(); // 0x00AA00BB00GG00RR00aa00bb00gg00rr // Apply TFX ASSERT(tex0_tfx == 0 || tex0_tfx == 1); @@ -944,7 +944,7 @@ void GSRendererHW::SwSpriteRender() if (alpha_blending_enabled || fb_mask_enabled) { // Read 2 destination pixel colors - dc0 = GSVector4i::loadl(&m_mem.m_vm32[di]).u8to16(); // 0x00AA00BB00GG00RR00aa00bb00gg00rr + dc0 = GSVector4i::loadl(di).u8to16(); // 0x00AA00BB00GG00RR00aa00bb00gg00rr } if (alpha_blending_enabled) @@ -999,7 +999,7 @@ void GSRendererHW::SwSpriteRender() // Store 2 pixel colors dc = dc.pu16(GSVector4i::zero()); // 0x0000000000000000AABBGGRRaabbggrr - GSVector4i::storel(&m_mem.m_vm32[di], 
dc); + GSVector4i::storel(di, dc); } } } @@ -1823,11 +1823,11 @@ void GSRendererHW::OI_GsMemClear() // Based on WritePixel32 for (int y = r.top; y < r.bottom; y++) { - GSOffset::PAHelper pa = off.assertSizesMatch(GSLocalMemory::swizzle32).paMulti(y); + auto pa = off.assertSizesMatch(GSLocalMemory::swizzle32).paMulti(m_mem.m_vm32, 0, y); for (int x = r.left; x < r.right; x++) { - m_mem.m_vm32[pa.value(x)] = 0; // Here the constant color + *pa.value(x) = 0; // Here the constant color } } } @@ -1836,11 +1836,11 @@ void GSRendererHW::OI_GsMemClear() // Based on WritePixel24 for (int y = r.top; y < r.bottom; y++) { - GSOffset::PAHelper pa = off.assertSizesMatch(GSLocalMemory::swizzle32).paMulti(y); + auto pa = off.assertSizesMatch(GSLocalMemory::swizzle32).paMulti(m_mem.m_vm32, 0, y); for (int x = r.left; x < r.right; x++) { - m_mem.m_vm32[pa.value(x)] &= 0xff000000; // Clear the color + *pa.value(x) &= 0xff000000; // Clear the color } } } @@ -1851,11 +1851,11 @@ void GSRendererHW::OI_GsMemClear() // Based on WritePixel16 for(int y = r.top; y < r.bottom; y++) { - GSOffset::PAHelper pa = off.assertSizesMatch(GSLocalMemory::swizzle16).paMulti(y); + auto pa = off.assertSizesMatch(GSLocalMemory::swizzle16).paMulti(m_mem.m_vm16, 0, y); for(int x = r.left; x < r.right; x++) { - m_mem.m_vm16[pa.value(x)] = 0; // Here the constant color + *pa.value(x) = 0; // Here the constant color } } #endif diff --git a/pcsx2/GS/Renderers/SW/GSDrawScanline.cpp b/pcsx2/GS/Renderers/SW/GSDrawScanline.cpp index c25fa4dddc..8386f48732 100644 --- a/pcsx2/GS/Renderers/SW/GSDrawScanline.cpp +++ b/pcsx2/GS/Renderers/SW/GSDrawScanline.cpp @@ -2942,11 +2942,11 @@ void GSDrawScanline::FillRect(const GSOffset& off, const GSVector4i& r, uint32 c for (int y = r.y; y < r.w; y++) { - GSOffset::PAHelper pa = off.paMulti(y); + auto pa = off.paMulti(vm, 0, y); for (int x = r.x; x < r.z; x++) { - T& d = vm[pa.value(x)]; + T& d = *pa.value(x); d = (T)(!masked ? 
c : (c | (d & m))); } } @@ -2992,9 +2992,11 @@ void GSDrawScanline::FillBlock(const GSOffset& off, const GSVector4i& r, const G for (int y = r.y; y < r.w; y += 8) { + auto pa = off.paMulti(vm, 0, y); + for (int x = r.x; x < r.z; x += 8 * 4 / sizeof(T)) { - GSVector4i* RESTRICT p = (GSVector4i*)&vm[off.pa(x, y)]; + GSVector4i* RESTRICT p = (GSVector4i*)pa.value(x); for (int i = 0; i < 16; i += 4) {