pcsx2/plugins/zzogl-pg/opengl/ZZoglMem.h

791 lines
29 KiB
C
Raw Normal View History

/* ZeroGS KOSMOS
* Copyright (C) 2005-2006 zerofrog@gmail.com
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
*/
#ifndef __ZZOGL_MEM_H__
#define __ZZOGL_MEM_H__
#include <assert.h>
#include <vector>
#include "GS.h"
#include "Util.h"
#include "Mem.h"
#ifndef ZZNORMAL_MEMORY
// Swizzle lookup tables; only declared here, the definitions live elsewhere.
// Block tables, one per storage-format family (32, 32Z, 16, 16S, 16Z, 16SZ, 8, 4).
extern u32 g_blockTable32[4][8];
extern u32 g_blockTable32Z[4][8];
extern u32 g_blockTable16[8][4];
extern u32 g_blockTable16S[8][4];
extern u32 g_blockTable16Z[8][4];
extern u32 g_blockTable16SZ[8][4];
extern u32 g_blockTable8[4][8];
extern u32 g_blockTable4[8][4];
// Column tables for the 32, 16, 8 and 4 bit families.
extern u32 g_columnTable32[8][8];
extern u32 g_columnTable16[8][16];
extern u32 g_columnTable8[16][16];
extern u32 g_columnTable4[16][32];
//--
// Page tables, indexed [y within page][x within page].
extern u32 g_pageTable32[32][64];
extern u32 g_pageTable32Z[32][64];
extern u32 g_pageTable16[64][64];
extern u32 g_pageTable16S[64][64];
extern u32 g_pageTable16Z[64][64];
extern u32 g_pageTable16SZ[64][64];
extern u32 g_pageTable8[64][128];
extern u32 g_pageTable4[128][128];
// The maximum PSM value is 58, so the per-PSM arrays have 58 + 1 = 59 elements
// (presumably MAX_PSM == 59 -- it is defined outside this header).
// This table is used for fast access to memory storage data.
extern u32 ZZ_DT[MAX_PSM][TABLE_WIDTH];
// The maximum PSM value is 58, so the per-PSM arrays have 58 + 1 = 59 elements.
// Per-PSM table pointers; the accessors below index them as table[psm][y][x].
extern u32** g_pageTable[MAX_PSM];
extern u32** g_blockTable[MAX_PSM];
extern u32** g_columnTable[MAX_PSM];
extern u32 g_pageTable2[MAX_PSM][127][127];
extern u32** g_pageTableNew[MAX_PSM];
// rest not visible externally
// rest not visible externally
// Per-format descriptor: shader constants, page geometry, swizzle tables and
// host<->local transfer callbacks.
struct BLOCK
{
// Clears every byte of the object, so all members below start out zeroed.
BLOCK() { memset(this, 0, sizeof(BLOCK)); }
// shader constants for this block
float4 vTexBlock;
float4 vTexDims;
int width, height; // dims of one page in pixels
int bpp;
int colwidth, colheight;
u32** pageTable; // offset inside each page
u32** blockTable;
u32** columnTable;
// Nobody uses these any more, so they are disabled and are candidates for removal.
// u32 (*getPixelAddress)(int x, int y, u32 bp, u32 bw);
// u32 (*getPixelAddress_0)(int x, int y, u32 bw);
// void (*writePixel)(void* pmem, int x, int y, u32 pixel, u32 bp, u32 bw);
// void (*writePixel_0)(void* pmem, int x, int y, u32 pixel, u32 bw);
// u32 (*readPixel)(const void* pmem, int x, int y, u32 bp, u32 bw);
// u32 (*readPixel_0)(const void* pmem, int x, int y, u32 bw);
// Transfer callbacks; both receive a data pointer and a size named nQWordSize
// (presumably counted in quadwords -- confirm against the implementations).
int (*TransferHostLocal)(const void* pbyMem, u32 nQWordSize);
void (*TransferLocalHost)(void* pbyMem, u32 nQWordSize);
// texture must be of dims BLOCK_TEXWIDTH and BLOCK_TEXHEIGHT
static void FillBlocks(std::vector<char>& vBlockData, std::vector<char>& vBilinearData, int floatfmt);
};
// Table management entry points (implemented outside this header).
void FillBlockTables();
void DestroyBlockTables();
void FillNewPageTable();
// Array of per-format BLOCK descriptors (defined outside this header).
extern BLOCK m_Blocks[];
// The g_blockTable*, g_columnTable*, g_pageTable*, ZZ_DT and g_pageTableNew
// tables used to be re-declared here. Those were exact duplicates of the
// extern declarations that open this #ifndef ZZNORMAL_MEMORY section, so the
// redundant copies were dropped to keep a single point of declaration.
// Stores pixel into *dst, touching only the bits selected by mask.
//   dst   - word to update.
//   pixel - new bits, already positioned inside the word.
//   mask  - 1-bits are taken from pixel, 0-bits keep the old *dst content.
static __forceinline void MaskedOR(u32* dst, u32 pixel, u32 mask = 0xffffffff) {
	// A full mask is a plain store; the old value does not have to be read.
	if (mask == 0xffffffff) {
		*dst = pixel;
		return;
	}
	const u32 preserved = *dst & (~mask);
	const u32 incoming = pixel & mask;
	*dst = preserved | incoming;
}
// These two defines look like idiotic code, but they serve one very important purpose: they turn the
// psm variable (and psm2 in the second case) into a constant, so the optimiser can pick the proper templated function.
// Runs statement X once with `psmC` bound to a constant equal to the runtime
// variable `psm`, which must be in scope at the expansion site.
// Values of psm that are not listed run nothing.
#define PSM_SWITCHCASE(X) { \
	switch (psm) { \
		case PSMCT32:  { const int psmC = PSMCT32;  X; } break; \
		case PSMT32Z:  { const int psmC = PSMT32Z;  X; } break; \
		case PSMCT24:  { const int psmC = PSMCT24;  X; } break; \
		case PSMT24Z:  { const int psmC = PSMT24Z;  X; } break; \
		case PSMCT16:  { const int psmC = PSMCT16;  X; } break; \
		case PSMCT16S: { const int psmC = PSMCT16S; X; } break; \
		case PSMT16Z:  { const int psmC = PSMT16Z;  X; } break; \
		case PSMT16SZ: { const int psmC = PSMT16SZ; X; } break; \
		case PSMT8:    { const int psmC = PSMT8;    X; } break; \
		case PSMT8H:   { const int psmC = PSMT8H;   X; } break; \
		case PSMT4HH:  { const int psmC = PSMT4HH;  X; } break; \
		case PSMT4HL:  { const int psmC = PSMT4HL;  X; } break; \
		case PSMT4:    { const int psmC = PSMT4;    X; } break; \
	} \
}
// Two-format variant of PSM_SWITCHCASE: runs statement X with `psmC` bound to
// a constant equal to `psm` and `psmC1` bound to a constant matching `psm2`.
// Both runtime variables must be in scope at the expansion site.
// Only pairs from the same bit-depth family are dispatched. For the two-member
// families (32, 24, 8) any psm2 other than the first member falls back to the
// second member; for the 16 and 4 bit families an unlisted psm2 runs nothing.
// Fix: the PSMT32Z and PSMT8H branches used to bind psmC1 to the *other*
// member of the family, so psmC1 never matched psm2 for those formats.
#define PSM_SWITCHCASE_2(X) { \
	switch (psm) { \
		case PSMCT32: \
			if( psm2 == PSMCT32 ) { const int psmC = PSMCT32, psmC1 = PSMCT32; X; } \
			else { const int psmC = PSMCT32, psmC1 = PSMT32Z; X; } \
			break; \
		case PSMCT24: \
			if( psm2 == PSMCT24 ) { const int psmC = PSMCT24, psmC1 = PSMCT24; X; } \
			else { const int psmC = PSMCT24, psmC1 = PSMT24Z; X; } \
			break; \
		case PSMT32Z: \
			if( psm2 == PSMCT32 ) { const int psmC = PSMT32Z, psmC1 = PSMCT32; X; } \
			else { const int psmC = PSMT32Z, psmC1 = PSMT32Z; X; } \
			break; \
		case PSMT24Z: \
			if( psm2 == PSMCT24 ) { const int psmC = PSMT24Z, psmC1 = PSMCT24; X; } \
			else { const int psmC = PSMT24Z, psmC1 = PSMT24Z; X; } \
			break; \
		case PSMCT16: \
			switch(psm2) { \
				case PSMCT16: { const int psmC = PSMCT16, psmC1 = PSMCT16; X; } break; \
				case PSMCT16S: { const int psmC = PSMCT16, psmC1 = PSMCT16S; X; } break; \
				case PSMT16Z: { const int psmC = PSMCT16, psmC1 = PSMT16Z; X; } break; \
				case PSMT16SZ: { const int psmC = PSMCT16, psmC1 = PSMT16SZ; X; } break; \
			} \
			break; \
		case PSMCT16S: \
			switch(psm2) { \
				case PSMCT16: { const int psmC = PSMCT16S, psmC1 = PSMCT16; X; } break; \
				case PSMCT16S: { const int psmC = PSMCT16S, psmC1 = PSMCT16S; X; } break; \
				case PSMT16Z: { const int psmC = PSMCT16S, psmC1 = PSMT16Z; X; } break; \
				case PSMT16SZ: { const int psmC = PSMCT16S, psmC1 = PSMT16SZ; X; } break; \
			} \
			break; \
		case PSMT16Z: \
			switch(psm2) { \
				case PSMCT16: { const int psmC = PSMT16Z, psmC1 = PSMCT16; X; } break; \
				case PSMCT16S: { const int psmC = PSMT16Z, psmC1 = PSMCT16S; X; } break; \
				case PSMT16Z: { const int psmC = PSMT16Z, psmC1 = PSMT16Z; X; } break; \
				case PSMT16SZ: { const int psmC = PSMT16Z, psmC1 = PSMT16SZ; X; } break; \
			} \
			break; \
		case PSMT16SZ: \
			switch(psm2) { \
				case PSMCT16: { const int psmC = PSMT16SZ, psmC1 = PSMCT16; X; } break; \
				case PSMCT16S: { const int psmC = PSMT16SZ, psmC1 = PSMCT16S; X; } break; \
				case PSMT16Z: { const int psmC = PSMT16SZ, psmC1 = PSMT16Z; X; } break; \
				case PSMT16SZ: { const int psmC = PSMT16SZ, psmC1 = PSMT16SZ; X; } break; \
			} \
			break; \
		case PSMT8: \
			if( psm2 == PSMT8 ) { const int psmC = PSMT8, psmC1 = PSMT8; X; } \
			else { const int psmC = PSMT8, psmC1 = PSMT8H; X; } \
			break; \
		case PSMT8H: \
			if( psm2 == PSMT8 ) { const int psmC = PSMT8H, psmC1 = PSMT8; X; } \
			else { const int psmC = PSMT8H, psmC1 = PSMT8H; X; } \
			break; \
		case PSMT4: \
			switch(psm2) { \
				case PSMT4: { const int psmC = PSMT4, psmC1 = PSMT4; X; } break; \
				case PSMT4HL: { const int psmC = PSMT4, psmC1 = PSMT4HL; X; } break; \
				case PSMT4HH: { const int psmC = PSMT4, psmC1 = PSMT4HH; X; } break; \
			} \
			break; \
		case PSMT4HL: \
			switch(psm2) { \
				case PSMT4: { const int psmC = PSMT4HL, psmC1 = PSMT4; X; } break; \
				case PSMT4HL: { const int psmC = PSMT4HL, psmC1 = PSMT4HL; X; } break; \
				case PSMT4HH: { const int psmC = PSMT4HL, psmC1 = PSMT4HH; X; } break; \
			} \
			break; \
		case PSMT4HH: \
			switch(psm2) { \
				case PSMT4: { const int psmC = PSMT4HH, psmC1 = PSMT4; X; } break; \
				case PSMT4HL: { const int psmC = PSMT4HH, psmC1 = PSMT4HL; X; } break; \
				case PSMT4HH: { const int psmC = PSMT4HH, psmC1 = PSMT4HH; X; } break; \
			} \
			break; \
	} \
}
// Fills the per-format layout constants used by the address helpers below.
// Going by how those helpers use them:
//   A, B - right-shifts applied to y and x to get the page row / column.
//   C    - left-shift applied to a page base address (and 3 - C to a table entry).
//   D, E - masks applied to y and x before indexing g_pageTable[psm].
//   F    - bit offset added to the pixel's shift inside its 32-bit word.
//   G    - unshifted pixel bit mask.
//   H    - not read by any code visible in this header (TODO confirm meaning).
// For a psm outside the list, every output is left untouched.
template <int psm>
static __forceinline void setPsmtConstantsX(u8& A, u8& B, u8& C, u8& D, u8& E, u8& F, u32& G, u8& H) {
	const bool is32  = (psm == PSMCT32) || (psm == PSMT32Z);
	const bool is24  = (psm == PSMCT24) || (psm == PSMT24Z);
	const bool is16  = (psm == PSMCT16) || (psm == PSMT16Z) || (psm == PSMCT16S) || (psm == PSMT16SZ);
	const bool isHi  = (psm == PSMT8H) || (psm == PSMT4HH) || (psm == PSMT4HL);

	if (is32 || is24 || isHi) {
		// All of these share the same page geometry.
		A = 5; B = 6; C = 0; D = 31; E = 63;
		if (is32) {
			F = 0; H = 1; G = 0xffffffff;
		}
		else if (is24) {
			F = 0; H = 1; G = 0xffffff;
		}
		else if (psm == PSMT8H) {
			F = 24; H = 4; G = 0xff;
		}
		else if (psm == PSMT4HH) {
			F = 28; H = 8; G = 0xf;
		}
		else { // PSMT4HL
			F = 24; H = 8; G = 0xf;
		}
	}
	else if (is16) {
		A = 6; B = 6; C = 1; D = 63; E = 63; F = 0; H = 2; G = 0xffff;
	}
	else if (psm == PSMT8) {
		A = 6; B = 7; C = 2; D = 63; E = 127; F = 0; H = 4; G = 0xff;
	}
	else if (psm == PSMT4) {
		A = 7; B = 7; C = 3; D = 127; E = 127; F = 0; H = 8; G = 0xf;
	}
}
// This is where the NEW_CODE define used to be.
// ------------------------------------------ get Address functions ------------------------------------
// Yes, only one function for all cases!
// Warning! bw and bp are swapped compared to the old signatures, so that bp can have a default value.
// This is C++, not C, so default arguments are available.
// Returns the address of pixel (x, y), in pixel units of format psm.
//   bw - buffer width, as expected by the page-row computation (bw >> B).
//   bp - base pointer; scaled by 64 before being added (default 0).
template <int psm>
static __forceinline u32 getPixelAddress(int x, int y, u32 bw, u32 bp = 0) {
	u8 A = 0, B = 0, C = 0, D = 0, E = 0, F = 0, H = 0;
	u32 G = 0;
	setPsmtConstantsX<psm>(A, B, C, D, E, F, G, H);

	// Index of the page that contains the pixel.
	const u32 pageIndex = ((y >> A) * (bw >> B)) + (x >> B);
	// Start of that page, converted to pixel units by the shift.
	const u32 pageStart = (bp * 64 + pageIndex * 2048) << C;
	// The page table supplies the offset of the pixel inside its page.
	return pageStart + g_pageTable[psm][y & D][x & E];
}
// It's Zerofrog's function. I need to eliminate them all! All access should be 32-bit aligned.
// Runtime-psm wrapper: dispatches to getPixelAddress<psm> through PSM_SWITCHCASE.
// Returns 0 when psm is not one of the formats handled by the macro.
static __forceinline u32 getPixelAddress(int psm, int x, int y, u32 bw, u32 bp = 0) {
PSM_SWITCHCASE(return getPixelAddress<psmC>(x, y, bw, bp) ;)
return 0; // reached only when no case of the switch matched
}
// Compatibility code, kept for reference.
// Def_getPixelAddress(psmT, psmX) generates two legacy-style wrappers around
// getPixelAddress<psmX>:
//   getPixelAddress<psmT>(x, y, bp, bw)  - note the legacy (bp, bw) order.
//   getPixelAddress<psmT>_0(x, y, bw)    - same, with bp == 0.
// Fix: the last line of the macro used to end in a backslash, which spliced
// the first instantiation below into the macro body. getPixelAddress32 was
// therefore never defined, and every expansion carried a stray, unexpanded
// Def_getPixelAddress(32, PSMCT32).
#define Def_getPixelAddress(psmT, psmX) \
	static __forceinline u32 getPixelAddress##psmT(int x, int y, u32 bp, u32 bw) { \
		return getPixelAddress<psmX>(x, y, bw, bp); } \
	static __forceinline u32 getPixelAddress##psmT##_0(int x, int y, u32 bw) { \
		return getPixelAddress<psmX>(x, y, bw); }

Def_getPixelAddress(32, PSMCT32)
Def_getPixelAddress(16, PSMCT16)
Def_getPixelAddress(16S, PSMCT16S)
Def_getPixelAddress(8, PSMT8)
Def_getPixelAddress(4, PSMT4)
Def_getPixelAddress(32Z, PSMT32Z)
Def_getPixelAddress(16Z, PSMT16Z)
Def_getPixelAddress(16SZ, PSMT16SZ)
// The remaining formats reuse the 32-bit (or 32Z) address functions.
#define getPixelAddress24 getPixelAddress32
#define getPixelAddress24_0 getPixelAddress32_0
#define getPixelAddress8H getPixelAddress32
#define getPixelAddress8H_0 getPixelAddress32_0
#define getPixelAddress4HL getPixelAddress32
#define getPixelAddress4HL_0 getPixelAddress32_0
#define getPixelAddress4HH getPixelAddress32
#define getPixelAddress4HH_0 getPixelAddress32_0
#define getPixelAddress24Z getPixelAddress32Z
#define getPixelAddress24Z_0 getPixelAddress32Z_0
// Check FFX-1 (very beginning) for PSMT8
// Check Tekken menu for PSMT4
// ZZ_DT[7] is needed only for PSMT8H, PSMT4HL and PSMT4HH -- in these cases the pixel data does not start at the beginning of the word.
// This function returns an offset to a 32-bit aligned address and a shift -- the position of the pixel inside that u32,
// so ((u32*)mem + getPixelAddress_Aligned32) is the exact location of the u32 where our pixel data is stored.
// Just to remember:
// PSMCT32, 24, 32Z, 24Z, 8H, 4HL and 4HH have ZZ_DT[psm] == 3, so the shift is always 0.
// PSMCT16, 16S, 16SZ, 16Z have ZZ_DT[psm] == 2, so the shift is 0 or 16.
// PSMT8 has ZZ_DT[psm] == 1, the shift is 0, 8, 16 or 24.
// PSMT4 has ZZ_DT[psm] == 0, the shift is 0, 4, 8, 12, 16, 20, 24 or 28.
// This allows fast access to pixels in the same basepage: if x % N == 0 (N = 1, 2, 4, 8, .. 64)
// then we can guarantee that all pixels from x to x + N - 1 are in the same basepage.
// Returns a pointer to the first 32-bit word of the page that holds pixel (x, y).
//   pmem - start of the memory being addressed.
//   bw   - buffer width, as expected by the page-row computation (bw >> B).
//   bp   - base pointer; scaled by 64 words before being added (default 0).
template <int psm>
static __forceinline u32* getPixelBasepage(const void* pmem, int x, int y, u32 bw, u32 bp = 0) {
	u8 A = 0, B = 0, C = 0, D = 0, E = 0, F = 0, H = 0;
	u32 G = 0;
	setPsmtConstantsX<psm>(A, B, C, D, E, F, G, H);

	const u32 pageIndex = ((y >> A) * (bw >> B)) + (x >> B);
	u32* const words = (u32*)pmem;
	// Each page advances the pointer by 2048 words.
	return words + (bp * 64 + pageIndex * 2048);
}
// And this is offset for this pixels.
// Locates pixel (x, y) inside a page whose first word is pmem.
//   mask  - in/out: narrowed to the bits the pixel occupies in its word.
//   shift - out: bit position of the pixel inside the returned word.
// Returns a pointer to the 32-bit word that contains the pixel.
template <int psm>
static __forceinline u32* getPixelOffset(u32& mask, u32& shift, const void* pmem, int x, int y) {
	u8 A = 0, B = 0, C = 0, D = 0, E = 0, F = 0, H = 0;
	u32 G = 0;
	setPsmtConstantsX<psm>(A, B, C, D, E, F, G, H);

	// In-page offset scaled to 4-bit units: the low three bits select the
	// nibble inside the word, the remaining bits select the word.
	const u32 nibbleIndex = g_pageTable[psm][y & D][x & E] << (3 - C);
	shift = ((nibbleIndex & 0x7) << 2) + F;
	mask &= G << shift;
	return (u32*)pmem + (nibbleIndex >> 3);
}
// Full lookup: finds the page for (x, y) and then the pixel's word inside it.
// mask and shift are updated exactly as getPixelOffset documents.
template <int psm>
static __forceinline u32* getPixelAddress_A32(u32& mask, u32& shift, const void* pmem, int x, int y, u32 bw, u32 bp = 0) {
	u32* const page = getPixelBasepage<psm>(pmem, x, y, bw, bp);
	return getPixelOffset<psm>(mask, shift, page, x, y);
}
// Same lookup as getPixelAddress_A32, but returns only the pointer to the
// containing 32-bit word; no mask or shift is produced.
template <int psm>
static __forceinline u32* getPixelBaseAddress_A32(const void* pmem, int x, int y, u32 bw, u32 bp = 0) {
	u8 A = 0, B = 0, C = 0, D = 0, E = 0, F = 0, H = 0;
	u32 G = 0;
	setPsmtConstantsX<psm>(A, B, C, D, E, F, G, H);

	const u32 nibbleIndex = g_pageTable[psm][y & D][x & E] << (3 - C);
	u32* const page = (u32*)getPixelBasepage<psm>(pmem, x, y, bw, bp);
	return page + (nibbleIndex >> 3);
}
// Wrapper for cases, where psm is not constant, should be avoided inside cycles
// Runtime-psm wrapper: dispatches to getPixelAddress_A32<psm> through PSM_SWITCHCASE.
// Returns a null pointer (and leaves mask/shift untouched) when psm is not
// one of the formats handled by the macro.
static __forceinline u32* getPixelAddress_A32(u32& mask, u32& shift, int psm, const void* pmem, int x, int y, u32 bw, u32 bp = 0) {
PSM_SWITCHCASE( return getPixelAddress_A32<psmC>(mask, shift, pmem, x, y, bw, bp) );
return 0; // reached only when no case of the switch matched
}
// Returns the address of the CLUT entry selected by tex0.csa inside pmem.
// Entries are 64 bytes apart and indexed by the low four bits of csa. When
// PSMT_ISHALF(tex0.cpsm) holds and csa >= 16, the address is moved 2 bytes
// further.
static __forceinline u32* getClutAddress(u8* pmem, const tex0Info& tex0) {
	u8* entry = pmem + 64 * (tex0.csa & 15);
	if (PSMT_ISHALF(tex0.cpsm) && tex0.csa >= 16)
		entry += 2;
	return (u32*)entry;
}
//--------------------------------------------- Write Pixel -----------------------------------------------------------
// Set proper mask for transfering multiple bytes per word.
// Expands a per-pixel write mask into a mask for a whole 32-bit word.
//   Writemask - caller's mask; only its low PSM_BITS_PER_PIXEL<psm>() bits are used.
// Returns:
//   - for PSMT8H / PSMT4HL / PSMT4HH, the fixed bit range where those formats
//     keep their pixel (Writemask is ignored);
//   - otherwise the per-pixel mask replicated across every pixel slot of the word.
// Fix: the per-pixel mask was built with (1 << G) on a signed int, which is
// undefined when G is 32; the full-width case is now handled explicitly and
// the shift is done on an unsigned value.
template <int psm>
inline u32 HandleWritemask(u32 Writemask) {
	const u8 G = PSM_BITS_PER_PIXEL<psm>();
	const u32 pixelBits = (G >= 32) ? 0xffffffffu : ((1u << G) - 1);
	const u32 dmask = Writemask & pixelBits; // drop all writemask bits that cannot be used
	u32 mask;
	switch (psm) {
		case PSMT8H: // modes with a non-zero start bit are handled differently
			return 0xff000000;
		case PSMT4HL:
			return 0x0f000000;
		case PSMT4HH:
			return 0xf0000000;
		default:
			mask = dmask; // 32 targets and lower
			if (G < 24) {
				mask |= dmask << G; // 16 targets and lower
				if (G < 16) {
					mask |= dmask << (2 * G); // 8 targets and lower
					mask |= dmask << (3 * G);
					if (G < 8) {
						mask |= dmask << (4 * G); // 4 targets
						mask |= dmask << (5 * G);
						mask |= dmask << (6 * G);
						mask |= dmask << (7 * G);
					}
				}
			}
			return mask;
	}
}
//push pixel data at position x,y, according psm storage format. pixel do not need to be properly masked, wrong bit's would not be used
//mask should be made according PSM.
// Writes one pixel at (x, y) in storage format psm.
//   pixel - value in the low bits; it is shifted into place here, and bits
//           outside the pixel's range are discarded by the mask.
//   mask  - caller's write mask; narrowed to the pixel's bit range by the lookup.
template <int psm>
static __forceinline void writePixel(void* pmem, int x, int y, u32 pixel, u32 bw, u32 bp = 0, u32 mask = 0xffffffff) {
	u32 shift;
	u32* const word = getPixelAddress_A32<psm>(mask, shift, pmem, x, y, bw, bp);
	const u32 positioned = pixel << shift;
	MaskedOR(word, positioned, mask);
}
// Runtime-psm wrapper: dispatches to writePixel<psm> through PSM_SWITCHCASE.
// Does nothing when psm is not one of the formats handled by the macro.
static __forceinline void writePixel(int psm, void* pmem, int x, int y, u32 pixel, u32 bw, u32 bp = 0, u32 mask = 0xffffffff) {
PSM_SWITCHCASE(writePixel<psmC>(pmem, x, y, pixel, bw, bp, mask));
}
// Put pixel data from memory. Pixel is p, memory start from pixel, and we should count pmove words and shift resulting word to shift
// 24 targets could be outside of 32-bit borders.
// Stores pixel data taken from linear memory into the word at p.
//   p     - destination word.
//   pixel - start of the linear source data.
//   pmove - index of the source word to read, relative to pixel.
//   shift - positive: shift the source word left; zero or negative: shift it right.
//   mask  - destination bits that may be modified.
// 24-bit formats can straddle a 32-bit boundary, so they assemble the value
// from two consecutive source words.
// Fix: the format test used `||`, which is true for every psm, so the 24-bit
// path was unreachable and straddling pixels lost their upper bits.
template <int psm>
static __forceinline void pushPixelMem(u32* p, u32* pixel, int pmove, int shift, u32 mask = 0xffffffff) {
	if (psm != PSMCT24 && psm != PSMT24Z) {
		if (shift > 0)
			MaskedOR(p, (*(pixel + pmove)) << (shift), mask);
		else
			MaskedOR(p, (*(pixel + pmove)) >> (-shift), mask);
	}
	else {
		// For 24 and 24Z the data may not be aligned to 32 bits.
		const u32* src = pixel + pmove;
		if (shift > 0) {
			// Not expected for 24-bit targets, but handled like the generic path.
			MaskedOR(p, src[0] << shift, mask);
		}
		else {
			u64 pixel64 = src[0];
			// The following word contributes only when bits are shifted in
			// from it; skipping the read otherwise avoids touching memory
			// past the source data.
			if (shift < 0)
				pixel64 |= ((u64)src[1]) << 32;
			// Drop the upper part; after the shift the data sits in the low 32 bits.
			MaskedOR(p, (u32)(pixel64 >> (-shift)), mask);
		}
	}
}
// use it if pixel already shifted by needed number of bytes.
// offseted mean that we should skip basepage calculation, pmem is link to basepage'ed memory. Just a little quicker.
// Writes pixel number `count` of a linear source buffer to (x, y).
//   offseted - non-zero: pmem already points at the pixel's page, so only
//              the in-page lookup is done; zero: the full lookup is done.
//   pixel    - start of the linear source data.
//   count    - index of the source pixel; converted here to a word index
//              and a bit offset inside that word.
template <int psm, int offseted>
static __forceinline void writePixelMem(const void* pmem, int x, int y, u32* pixel, int count, u32 bw, u32 bp = 0, u32 mask = 0xffffffff) {
	u32 shift;
	u32* const target = offseted
		? getPixelOffset<psm>(mask, shift, pmem, x, y)
		: getPixelAddress_A32<psm>(mask, shift, pmem, x, y, bw, bp);

	const int bitsPerPixel = PSM_BITS_PER_PIXEL<psm>();
	const int bitOffset = count * bitsPerPixel;
	const int srcWord = bitOffset >> 5;
	// If the offset leaves the word, the caller wants the next pixel data.
	const int srcBit = bitOffset & 31;
	pushPixelMem<psm>(target, pixel, srcWord, (int)shift - srcBit, mask);
}
// This function push several pixels. Note, that for 32, 24, 8HH, 4HL, 4HH it's simply write (and pixel should not be properly masked), 16 do push 2 pixels (and x should be even).
// 8 push 4 pixels: 0,0; 0,1; 1,0 and 1,1. 4 push 8: 0,0; 0,1; 1,0; 1,1; 2,0, 2,1; 3,0; 3,1.
// Writes a pre-assembled word to the 32-bit word that contains pixel (x, y).
//   pixel - word value, already positioned; it is not shifted here.
//   mask  - applied exactly as passed in. The narrowed mask produced by the
//           address lookup is deliberately not used.
template <int psm>
static __forceinline void writePixelWord(const void* pmem, int x, int y, u32 pixel, u32 bw, u32 bp = 0, u32 mask = 0xffffffff) {
	u32 shift;
	u32 lookupMask = mask; // scratch copy: the lookup narrows it, the result is discarded
	u32* const word = getPixelAddress_A32<psm>(lookupMask, shift, pmem, x, y, bw, bp);
	MaskedOR(word, pixel, mask);
}
// ------------------------------------- Read Pixel ---------------------------------------
// Reads the pixel at (x, y) in storage format psm.
// Returns the value moved down to bit 0; mask is first narrowed to the
// pixel's bit range by the address lookup.
template <int psm>
static __forceinline u32 readPixel(const void* pmem, int x, int y, u32 bw, u32 bp = 0, u32 mask = 0xffffffff) {
	u32 shift;
	const u32* const word = getPixelAddress_A32<psm>(mask, shift, pmem, x, y, bw, bp);
	const u32 selected = *word & mask;
	return selected >> shift;
}
// Runtime-psm wrapper: dispatches to readPixel<psm> through PSM_SWITCHCASE.
// Returns 0 when psm is not one of the formats handled by the macro.
static __forceinline u32 readPixel(int psm, const void* pmem, int x, int y, u32 bw, u32 bp = 0, u32 mask = 0xffffffff) {
PSM_SWITCHCASE(return readPixel<psmC>(pmem, x, y, bw, bp, mask););
return 0; // reached only when no case of the switch matched
}
// Reads the whole 32-bit word that contains pixel (x, y), without shifting.
//   mask - bits to keep. When PSM_NON_FULL_WORD<psm>() holds, it is first
//          narrowed to the pixel's bit range by the address lookup; otherwise
//          it is applied as passed in.
// Fix: the lookup call (which modifies mask) and the read of mask used to sit
// in one expression, so whether the narrowed or the original mask was applied
// was unspecified. The two steps are now separate statements.
template <int psm>
static __forceinline u32 readPixelWord(const void* pmem, int x, int y, u32 bw, u32 bp = 0, u32 mask = 0xffffffff) {
	u32 shift;
	if (PSM_NON_FULL_WORD<psm>()) {
		const u32* const word = getPixelAddress_A32<psm>(mask, shift, pmem, x, y, bw, bp);
		return *word & mask; // mask has been narrowed by the lookup above
	}
	u32 scratchMask = 0xffffffff; // the lookup needs an lvalue; the result is discarded
	const u32* const word = getPixelAddress_A32<psm>(scratchMask, shift, pmem, x, y, bw, bp);
	return *word & mask;
}
// Reads one 32-bit word's worth of consecutive pixels, starting at (x, y), with
// readPixel<psm> and stores them packed into the linear buffer dst.
//   dst   - linear destination buffer.
//   pmem  - source memory, addressed through readPixel<psm>.
//   count - in/out pixel counter: selects the destination position and is
//           advanced by PSM_PIXELS_STORED_PER_WORD<psm>() on return.
//   mask  - forwarded unchanged to every readPixel call.
template <int psm>
static __forceinline void fillMemoryFromPixels(u32* dst, const void* pmem, int& count, int x, int y, u32 bw, u32 bp = 0, u32 mask = 0xffffffff) {
u32 pixel;
u8 I = PSM_BITS_PER_PIXEL<psm>();
int K = count / PSM_PIXELS_STORED_PER_WORD<psm>(); // offset for pmem, count for 32, count / 2 for 16, etc.
pixel = readPixel<psm>(pmem, x, y, bw, bp, mask); // Deliberately unrolled instead of a for loop: the loop was considered slow
// Each further pixel is placed I bits above the previous one.
if (I < 32) {
pixel += readPixel<psm>(pmem, x + 1, y, bw, bp, mask) << I;
if (I < 16) { // 8 and 4 targets
pixel += readPixel<psm>(pmem, x + 2, y, bw, bp, mask) << (2 * I);
pixel += readPixel<psm>(pmem, x + 3, y, bw, bp, mask) << (3 * I);
if (I < 8) { // This is for 4, 4HH and 4HL
pixel += readPixel<psm>(pmem, x + 4, y, bw, bp, mask) << (4 * I);
pixel += readPixel<psm>(pmem, x + 5, y, bw, bp, mask) << (5 * I);
pixel += readPixel<psm>(pmem, x + 6, y, bw, bp, mask) << (6 * I);
pixel += readPixel<psm>(pmem, x + 7, y, bw, bp, mask) << (7 * I);
}}}
if (I != 24) {
// Whole word: store at word index K.
*(dst + K) = pixel;
}
else { // 24-bit formats need special care.
// ERROR_LOG("special care %d\n", count);
// Pixels are 3 bytes apart in dst, so the store happens at byte offset
// 3 * count and touches only the low 24 bits of that (unaligned) word.
MaskedOR((u32*)((u8*)dst + 3 * count), pixel, 0xffffff);
}
count += PSM_PIXELS_STORED_PER_WORD<psm>();
}
// Fill count pixels form continues memory region, starting from pmem, First pixel to read have number shift in this region.
// Read no more than count pixels. We could assert, that all this pixels would be place in the same basepage
// Shift is automaticaly increased by count (or decreased if count < 0)
// Writes up to eight consecutive pixels of a row, starting at (x, y), from
// the linear buffer pmem.
//   offseted - true: the page is looked up once here and the per-pixel writes
//              do only the in-page lookup; false: dst is passed through and
//              every write does the full lookup.
//   count    - template constant: number of pixels to write. The first pixel
//              is always written, and at most eight are.
//   shift    - in/out: index of the first source pixel; advanced by count.
template <int psm, bool offseted, int count>
static __forceinline void writePixelsFromMemory(void* dst, const void* pmem, int& shift, int x, int y, u32 bw, u32 bp = 0, u32 mask = 0xffffffff) {
	const void* base = offseted
		? (const void*)getPixelBasepage<psm>(dst, x, y, bw, bp)
		: (const void*)dst;

	const int first = shift;
	shift += count;

	// The first pixel is written unconditionally.
	writePixelMem<psm, offseted>(base, x, y, (u32*)pmem, first, bw, bp, mask);
	// count is a compile-time constant, so the compiler can unroll this loop.
	for (int i = 1; i < count && i < 8; ++i)
		writePixelMem<psm, offseted>(base, x + i, y, (u32*)pmem, first + i, bw, bp, mask);
}
// Use it if we don't know that starting pixel is aligned for multiple-pixel write
// Writes the `div` leading pixels that precede an aligned multi-pixel write.
//   div - number of pixels to write, 1..7. A value of 0 means the start is
//         already aligned; 0 and any other value write nothing.
// The remaining arguments are forwarded to writePixelsFromMemory.
template <int psm, bool offseted>
static __forceinline void writeUnalignedPixelsFromMemory(void* dst, int div, const void* pmem, int& shift, int x, int y, u32 bw, u32 bp = 0, u32 mask = 0xffffffff) {
	// div selects a template instantiation, so each count needs its own call.
	switch (div) {
		case 1: writePixelsFromMemory<psm, offseted, 1>(dst, pmem, shift, x, y, bw, bp, mask); break;
		case 2: writePixelsFromMemory<psm, offseted, 2>(dst, pmem, shift, x, y, bw, bp, mask); break;
		case 3: writePixelsFromMemory<psm, offseted, 3>(dst, pmem, shift, x, y, bw, bp, mask); break;
		case 4: writePixelsFromMemory<psm, offseted, 4>(dst, pmem, shift, x, y, bw, bp, mask); break;
		case 5: writePixelsFromMemory<psm, offseted, 5>(dst, pmem, shift, x, y, bw, bp, mask); break;
		case 6: writePixelsFromMemory<psm, offseted, 6>(dst, pmem, shift, x, y, bw, bp, mask); break;
		case 7: writePixelsFromMemory<psm, offseted, 7>(dst, pmem, shift, x, y, bw, bp, mask); break;
		default: break; // 0: pixels are aligned, so we can move on
	}
}
// This little swizzle function used to convert data form memory. z is first byte in destination block, and y is number of word, in which we look look for data.
// s is shift by number of pixels, that should be used in masking
// Extracts one pixel from linear source memory and returns it positioned for
// the destination word.
//   psm  - storage format.
//   y    - extra word offset added to the source address.
//   z    - bit position the pixel gets in the returned value.
//   pmem - linear source memory.
//   x    - pixel index inside the source row.
// When PSM_BITMODE<psm>() == 1 (the 24-bit formats, per the comment below), a
// u32 is read from byte offset x * 3 + 4 * y and returned unmasked and unshifted.
// NOTE(review): J is built with (1 << I); if PSM_BITS_PER_PIXEL<psm>() can be
// 32 on the first path, that shift is undefined -- confirm.
template <int psm, int y, int z>
static __forceinline u32 BitmaskinPSM(u32* pmem, u8 x) {
u8 H = PSM_BITCOUNT<psm>();
u8 I = PSM_BITS_PER_PIXEL<psm>() ; // length of bitmask in bits.
if (PSM_BITMODE<psm>() != 1) { // PSMCT24 and 24Z are handled separately, as a pixel can cross 32-bit storage.
u8 k = (x & (H - 1)) * I; // bit position of the pixel in the source word: the word index is x / H and k is the shift inside it
// on the PS2 side pixels occupy bit positions counted from 0 in steps of I.
u32 J = ((1 << I) - 1) << k; // bitmask of length I, moved to position k
// gcc complains repeatedly about this always being false. To be investigated.
if (z > k)
return ((*(pmem + x/H + y)) & J) << (z - k); // take the source word, isolate the pixel and move it up to z
else // Same, but the pixel has to move down (or stay).
return ((*(pmem + x/H + y)) & J) >> (k - z);
}
else { // only 24 targets
u8* mem = ((u8*)pmem + (x * 3) + 4 * y); // The pixel is not aligned to 32 bits, so address it through u8*.
return *(u32*)mem; // Mask would be handled later
}
}
// We use this function to limit number of memory R/W. This function fill all pixels for data with coordindates x, y. inside block data.
// Only rule is x, y should be < 8 (it automatically fill all needed pixels, that lie in blockdata, but have coords more than 8).
// Assembles one destination word from linear source memory with BitmaskinPSM
// and writes it at (x, y) of dst with writePixelWord.
//   dst   - destination memory, addressed through writePixelWord<psm>.
//   pmem  - linear source memory for the current row.
//   pitch - source row stride in u32 words; the row at pmem + 2 * pitch is
//           also read for formats with G == 3 or G == 4.
//   mask  - write mask; expanded by HandleWritemask<psm> before the write.
template <int psm>
static __forceinline void fillPixelsFromMemory(u32* dst, u32* pmem, int x, int y, int pitch, u32 bw, u32 bp = 0, u32 mask = 0xffffffff) {
u32 pixel = 0;
const u8 H = PSM_PIXELS_PER_WORD<psm>();
if (PSM_PIXEL_SHIFT<psm>() == 0) // Computed constants cannot be used as template parameters, hence the explicit branches.
pixel = BitmaskinPSM<psm, 0, 0>(pmem, x); // First pixel x,y is the common part of all psm paths
else {
if (PSM_PIXEL_SHIFT<psm>() == 24) // 8H and 4HL have 1 pixel, but shifted to 24 bits. 4HH -- 28 bits.
pixel = BitmaskinPSM<psm, 0, 24>(pmem, x);
else
pixel = BitmaskinPSM<psm, 0, 28>(pmem, x);
}
// More than one pixel per word: merge in the remaining pixels.
if (H > 1) {
const u8 G = psm & 0x7; // Bitmode, we use it for better chance of switch optimization
int div = ( x < 4 ) ? 4 : -4; // secondary row have shift by +4 or -4 pixels
switch (G) {
case 2:
// Two pixels per word: second pixel goes to bits 16 and up.
pixel |= BitmaskinPSM<psm, 4, 16>(pmem, x);
break;
case 3:
// Four pixels per word: two from this row, two from the row 2 * pitch further.
pixel |= BitmaskinPSM<psm, 2, 16>(pmem, x);
pixel |= BitmaskinPSM<psm, 0, 8>(pmem + 2 * pitch, x + div);
pixel |= BitmaskinPSM<psm, 2, 24>(pmem + 2 * pitch, x + div);
break;
case 4:
// Eight pixels per word: four from this row, four from the row 2 * pitch further.
pixel |= BitmaskinPSM<psm, 1, 8>(pmem, x);
pixel |= BitmaskinPSM<psm, 2, 16>(pmem, x);
pixel |= BitmaskinPSM<psm, 3, 24>(pmem, x);
pixel |= BitmaskinPSM<psm, 0, 4>(pmem + 2 * pitch, x + div);
pixel |= BitmaskinPSM<psm, 1, 12>(pmem + 2 * pitch, x + div);
pixel |= BitmaskinPSM<psm, 2, 20>(pmem + 2 * pitch, x + div);
pixel |= BitmaskinPSM<psm, 3, 28>(pmem + 2 * pitch, x + div);
break;
}
}
writePixelWord<psm>(dst, x, y, pixel, bw, bp, HandleWritemask<psm>(mask)); // the expanded mask matters for 32, 24, 8H, 4HL and 4HH
}
// Stores a word at pmem. Formats that use only part of a 32-bit word (4HH,
// 8H, 4HL, 24, 24Z) go through a masked write; every other format
// overwrites the whole word and ignores mask.
template <int psm>
void writeWordPixel(u32* pmem, u32 pixel, u32 mask) {
	const bool partialWord =
		(psm == PSMT4HH) || (psm == PSMT8H) || (psm == PSMT4HL) ||
		(psm == PSMCT24) || (psm == PSMT24Z);
	if (!partialWord) {
		*pmem = pixel;
		return;
	}
	MaskedOR(pmem, pixel, mask);
}
// Get pixel from src and put in in src. We assume, that psm of both buffers are the same and (sx-dx) & E == (sy - dy) & D == 0;
// Also in this case we could transfer the whole word
template <int psm>
void transferPixelFast(void* dst, void* src, int dx, int dy, int sx, int sy, u32 dbw, u32 sbw ) {
u32 Dbasepage, Sbasepage;
u32 word, mask = 0xffffffff;
u8 A = 0, B = 0, C = 0 , D = 0, E = 0, F = 0; u32 G = 0; u8 H = 0;
setPsmtConstantsX<psm> (A, B, C, D, E, F, G, H);
assert ( ((sx-dx) & E == (sy - dy) & D) && ((sy - dy) & D == 0) );
Dbasepage = ((dy>>A) * (dbw>>B)) + (dx>>B);
Sbasepage = ((sy>>A) * (sbw>>B)) + (sx>>B);
word = (g_pageTable[psm][sy&D][sx&E] >> C);
u32* dstp = (u32*)dst + Dbasepage * 2048 + word;
u32* srcp = (u32*)src + Sbasepage * 2048 + word;
writeWordPixel<psm>(dstp, *srcp, G << F);
}
// if we could not guarantee, that buffer suize shared same page Table address
template <int psm>
void transferPixel(void* dst, void* src, int dx, int dy, int sx, int sy, u32 dbw, u32 sbw ) {
u32 mask = 0xffffffff, shift;
u32* dstp = getPixelAddress_A32<psm>(mask, shift, dst, dx, dy, dbw);
u32* srcp = getPixelAddress_A32<psm>(mask, shift, src, sx, sy, sbw);
writeWordPixel<psm>(dstp, *srcp, mask); // write whole word
}
// Def_getReadWrite(psmT, psmX) generates the legacy-style pixel accessors for
// one storage format, as thin wrappers around writePixel<psmX> / readPixel<psmX>:
//   writePixel<psmT>(pmem, x, y, pixel, bp, bw) / readPixel<psmT>(pmem, x, y, bp, bw)
//       - note the legacy (bp, bw) argument order.
//   writePixel<psmT>_0(pmem, x, y, pixel, bw)   / readPixel<psmT>_0(pmem, x, y, bw)
//       - same, with bp == 0.
// Fix: the 16S, 32Z, 24Z, 16Z and 16SZ accessors used to be instantiated with
// PSMCT16 / PSMCT32 / PSMCT24, so they went through the page tables of the
// wrong format. They now use their own format, matching Def_getPixelAddress.
#define Def_getReadWrite(psmT, psmX) \
	static __forceinline void writePixel##psmT(void* pmem, int x, int y, u32 pixel, u32 bp, u32 bw) { \
		writePixel<psmX>(pmem, x, y, pixel, bw, bp); } \
	static __forceinline u32 readPixel##psmT(const void* pmem, int x, int y, u32 bp, u32 bw) { \
		return readPixel<psmX>(pmem, x, y, bw, bp); } \
	static __forceinline void writePixel##psmT##_0(void* pmem, int x, int y, u32 pixel, u32 bw) { \
		writePixel<psmX>(pmem, x, y, pixel, bw); } \
	static __forceinline u32 readPixel##psmT##_0(const void* pmem, int x, int y, u32 bw) { \
		return readPixel<psmX>(pmem, x, y, bw); }
Def_getReadWrite(32, PSMCT32);
Def_getReadWrite(24, PSMCT24);
Def_getReadWrite(16, PSMCT16);
Def_getReadWrite(16S, PSMCT16S);
Def_getReadWrite(8, PSMT8);
Def_getReadWrite(8H, PSMT8H);
Def_getReadWrite(4, PSMT4);
Def_getReadWrite(4HH, PSMT4HH);
Def_getReadWrite(4HL, PSMT4HL);
Def_getReadWrite(32Z, PSMT32Z);
Def_getReadWrite(24Z, PSMT24Z);
Def_getReadWrite(16Z, PSMT16Z);
Def_getReadWrite(16SZ, PSMT16SZ);
#endif // Zeydlitz's code
#endif /* __ZZOGL_MEM_H__ */