GregMiscellaneous: zzogl-pg: Use _aligned_malloc in GetMemoryTarget.

git-svn-id: http://pcsx2.googlecode.com/svn/branches/GregMiscellaneous@3800 96395faa-99c1-11dd-bbfe-3dabce05a288
This commit is contained in:
arcum42 2010-09-18 19:27:11 +00:00
parent 7afdf9e7c7
commit 4d5f4ad81b
3 changed files with 23 additions and 52 deletions

View File

@ -78,13 +78,12 @@
static vector<u8> s_vTempBuffer, s_vTransferCache;
static int gs_imageEnd = 0;
// From the start of monster labs. In all 3 cases, psm == 0.
// ZZogl-PG: GetRectMemAddress(0x3f4000, 0x404000, 0x0, 0x0, 0x0, 0x100, 0x40, 0x3f40, 0x100);
// ZZogl-PG: GetRectMemAddress(0x3f8000, 0x408000, 0x0, 0x0, 0x0, 0x100, 0x40, 0x3f80, 0x100);
// ZZogl-PG: GetRectMemAddress(0x3fc000, 0x40c000, 0x0, 0x0, 0x0, 0x100, 0x40, 0x3fc0, 0x100);
void GetRectMemAddress(int& start, int& end, int psm, int x, int y, int w, int h, int bp, int bw)
{
FUNCLOG
@ -114,7 +113,7 @@
bits = PSMT_BITS_NUM(psm);
start = getPixelFun[psm](x, y, bp, bw);
end = getPixelFun[psm](x + w - 1, y + h - 1, bp, bw) + 1;
if (bits > 0)
{
start *= bits;

View File

@ -120,9 +120,9 @@ u32 g_columnTable32[8][8] =
u32 g_columnTable16[8][16] =
{
{ 0, 2, 8, 10, 16, 18, 24, 26,
1, 3, 9, 11, 17, 19, 25, 27 },
1, 3, 9, 11, 17, 19, 25, 27 },
{ 4, 6, 12, 14, 20, 22, 28, 30,
5, 7, 13, 15, 21, 23, 29, 31 },
5, 7, 13, 15, 21, 23, 29, 31 },
{ 32, 34, 40, 42, 48, 50, 56, 58,
33, 35, 41, 43, 49, 51, 57, 59 },
{ 36, 38, 44, 46, 52, 54, 60, 62,
@ -139,15 +139,15 @@ u32 g_columnTable16[8][16] =
u32 g_columnTable8[16][16] =
{
{ 0, 4, 16, 20, 32, 36, 48, 52, // column 0
2, 6, 18, 22, 34, 38, 50, 54 },
{ 0, 4, 16, 20, 32, 36, 48, 52, // column 0
2, 6, 18, 22, 34, 38, 50, 54 },
{ 8, 12, 24, 28, 40, 44, 56, 60,
10, 14, 26, 30, 42, 46, 58, 62 },
10, 14, 26, 30, 42, 46, 58, 62 },
{ 33, 37, 49, 53, 1, 5, 17, 21,
35, 39, 51, 55, 3, 7, 19, 23 },
{ 41, 45, 57, 61, 9, 13, 25, 29,
43, 47, 59, 63, 11, 15, 27, 31 },
{ 96, 100, 112, 116, 64, 68, 80, 84, // column 1
{ 96, 100, 112, 116, 64, 68, 80, 84, // column 1
98, 102, 114, 118, 66, 70, 82, 86 },
{ 104, 108, 120, 124, 72, 76, 88, 92,
106, 110, 122, 126, 74, 78, 90, 94 },
@ -155,7 +155,7 @@ u32 g_columnTable8[16][16] =
67, 71, 83, 87, 99, 103, 115, 119 },
{ 73, 77, 89, 93, 105, 109, 121, 125,
75, 79, 91, 95, 107, 111, 123, 127 },
{ 128, 132, 144, 148, 160, 164, 176, 180, // column 2
{ 128, 132, 144, 148, 160, 164, 176, 180, // column 2
130, 134, 146, 150, 162, 166, 178, 182 },
{ 136, 140, 152, 156, 168, 172, 184, 188,
138, 142, 154, 158, 170, 174, 186, 190 },
@ -163,7 +163,7 @@ u32 g_columnTable8[16][16] =
163, 167, 179, 183, 131, 135, 147, 151 },
{ 169, 173, 185, 189, 137, 141, 153, 157,
171, 175, 187, 191, 139, 143, 155, 159 },
{ 224, 228, 240, 244, 192, 196, 208, 212, // column 3
{ 224, 228, 240, 244, 192, 196, 208, 212, // column 3
226, 230, 242, 246, 194, 198, 210, 214 },
{ 232, 236, 248, 252, 200, 204, 216, 220,
234, 238, 250, 254, 202, 206, 218, 222 },
@ -175,10 +175,10 @@ u32 g_columnTable8[16][16] =
u32 g_columnTable4[16][32] =
{
{ 0, 8, 32, 40, 64, 72, 96, 104, // column 0
2, 10, 34, 42, 66, 74, 98, 106,
4, 12, 36, 44, 68, 76, 100, 108,
6, 14, 38, 46, 70, 78, 102, 110 },
{ 0, 8, 32, 40, 64, 72, 96, 104, // column 0
2, 10, 34, 42, 66, 74, 98, 106,
4, 12, 36, 44, 68, 76, 100, 108,
6, 14, 38, 46, 70, 78, 102, 110 },
{ 16, 24, 48, 56, 80, 88, 112, 120,
18, 26, 50, 58, 82, 90, 114, 122,
20, 28, 52, 60, 84, 92, 116, 124,
@ -191,7 +191,7 @@ u32 g_columnTable4[16][32] =
83, 91, 115, 123, 19, 27, 51, 59,
85, 93, 117, 125, 21, 29, 53, 61,
87, 95, 119, 127, 23, 31, 55, 63 },
{ 192, 200, 224, 232, 128, 136, 160, 168, // column 1
{ 192, 200, 224, 232, 128, 136, 160, 168, // column 1
194, 202, 226, 234, 130, 138, 162, 170,
196, 204, 228, 236, 132, 140, 164, 172,
198, 206, 230, 238, 134, 142, 166, 174 },
@ -207,7 +207,7 @@ u32 g_columnTable4[16][32] =
147, 155, 179, 187, 211, 219, 243, 251,
149, 157, 181, 189, 213, 221, 245, 253,
151, 159, 183, 191, 215, 223, 247, 255 },
{ 256, 264, 288, 296, 320, 328, 352, 360, // column 2
{ 256, 264, 288, 296, 320, 328, 352, 360, // column 2
258, 266, 290, 298, 322, 330, 354, 362,
260, 268, 292, 300, 324, 332, 356, 364,
262, 270, 294, 302, 326, 334, 358, 366 },
@ -223,7 +223,7 @@ u32 g_columnTable4[16][32] =
339, 347, 371, 379, 275, 283, 307, 315,
341, 349, 373, 381, 277, 285, 309, 317,
343, 351, 375, 383, 279, 287, 311, 319 },
{ 448, 456, 480, 488, 384, 392, 416, 424, // column 3
{ 448, 456, 480, 488, 384, 392, 416, 424, // column 3
450, 458, 482, 490, 386, 394, 418, 426,
452, 460, 484, 492, 388, 396, 420, 428,
454, 462, 486, 494, 390, 398, 422, 430 },

View File

@ -2202,13 +2202,12 @@ ZeroGS::CMemoryTarget* ZeroGS::CMemoryTargetMngr::GetMemoryTarget(const tex0Info
memcpy_amd(targ->ptex->memptr, g_pbyGSMemory + 4 * GPU_TEXWIDTH * targ->realy, 4 * GPU_TEXWIDTH * targ->height);
vector<u8> texdata;
u8* ptexdata = NULL;
if (PSMT_ISCLUT(tex0.psm))
{
texdata.resize(((tex0.cpsm <= 1) ? 4 : 2) * texW * texH);
ptexdata = &texdata[0];
u32 tex_size = ((tex0.cpsm <= 1) ? 4 : 2) * texW * texH;
ptexdata = (u8*)_aligned_malloc(tex_size, 16);
u8* psrc = (u8*)(g_pbyGSMemory + 4 * GPU_TEXWIDTH * targ->realy);
@ -2231,43 +2230,16 @@ ZeroGS::CMemoryTarget* ZeroGS::CMemoryTargetMngr::GetMemoryTarget(const tex0Info
{
if (tex0.psm == PSMT16Z || tex0.psm == PSMT16SZ)
{
texdata.resize(4 * texW * texH
#if defined(ZEROGS_SSE2)
+ 15 // reserve additional elements for alignment if SSE2 used.
// better do it now, so less resizing would be needed
#endif
);
ptexdata = &texdata[0];
ptexdata = (u8*)_aligned_malloc(4 * texW * texH, 16);
// needs to be 8 bit, use xmm for unpacking
u16* dst = (u16*)ptexdata;
u16* src = (u16*)(g_pbyGSMemory + 4 * GPU_TEXWIDTH * targ->realy);
#if defined(ZEROGS_SSE2)
if (((u32)(uptr)dst) % 16 != 0)
{
// This is not unusual situation, when vector<u8> does not 16bit alignment, that is destructive for SSE2
// instruction movdqa [%eax], xmm0
// The idea would be resize vector to 15 elements, that set ptxedata to aligned position.
// Later we would move eax by 16, so only we should verify is first element align
// FIXME. As I see, texdata used only once here, it does not have any impact on other code.
// Probably, usage of _aligned_maloc() would be preferable.
// Note: this often happens when changing AA.
int disalignment = 16 - ((u32)(uptr)dst) % 16; // This is value of shift. It could be 0 < disalignment <= 15
ptexdata = &texdata[disalignment]; // Set pointer to aligned element
dst = (u16*)ptexdata;
ZZLog::GS_Log("Made alignment for texdata, 0x%x", dst);
assert(((u32)(uptr)dst) % 16 == 0); // Assert, because at future could be vectors with uncontigious spaces
}
int iters = targ->height * GPU_TEXWIDTH / 16;
SSE2_UnswizzleZ16Target(dst, src, iters) ;
assert(((u32)(uptr)dst) % 16 == 0);
SSE2_UnswizzleZ16Target(dst, src, targ->height * GPU_TEXWIDTH / 16);
#else // ZEROGS_SSE2
for (int i = 0; i < targ->height; ++i)
{
for (int j = 0; j < GPU_TEXWIDTH; ++j)